From bc472242a30bd8c74b97d46dc83c05f742113de7 Mon Sep 17 00:00:00 2001 From: Feiyue Chen Date: Mon, 1 Apr 2024 07:39:12 +0000 Subject: [PATCH] Update internal ovxlib to rel1.2.6 Internal ovxlib commit hash: c5d3e69356579fc7b595a7c0939fc7e4e0aaab5a Type: Code Improvement Signed-off-by: Feiyue Chen --- VERSION | 2 +- src/tim/vx/internal/include/interface/ops.def | 2 + .../internal/include/kernel/vsi_nn_kernel.h | 6 +- .../include/ops/vsi_nn_op_resize_internal.h | 1 + .../internal/include/ops/vsi_nn_op_rmsnorm.h | 54 + .../vx/internal/include/ops/vsi_nn_op_shape.h | 47 + .../include/utils/vsi_nn_dtype_util_prv.h | 72 +- src/tim/vx/internal/include/vsi_nn_context.h | 2 - .../vx/internal/include/vsi_nn_node_type.h | 4 + src/tim/vx/internal/include/vsi_nn_version.h | 2 +- ..._tiny_yolov4_postprocess_confidence_evis.c | 2 +- .../custom/ops/vsi_nn_op_custom_warp_affine.c | 11 +- .../src/kernel/cl/bilinear_grid_sample_cl.c | 147 +- .../src/kernel/cl/gather_elements_cl.c | 3 + .../src/kernel/cl/lstmunit_activation_cl.c | 11 +- .../src/kernel/cl/nearest_grid_sample_cl.c | 17 +- .../vx/internal/src/kernel/cl/one_hot_cl.c | 9 +- src/tim/vx/internal/src/kernel/cl/topk_cl.c | 44 +- .../kernel/evis/bilinear_grid_sample_evis.c | 215 +- .../src/kernel/evis/crop_and_resize_evis.c | 2 + .../kernel/evis/lstmunit_activation_evis.c | 11 +- .../kernel/evis/nearest_grid_sample_evis.c | 17 +- .../evis/pre_process_rgb888_planar_evis.c | 2 +- .../pre_process_rgb888_planar_nhwc_evis.c | 2 +- .../src/kernel/evis/resize_cubic_evis.c | 20 +- .../vx/internal/src/kernel/vsi_nn_kernel.c | 5 + .../internal/src/kernel/vsi_nn_kernel_util.c | 6 +- .../vx/internal/src/kernel/vx/rms_norm_vx.c | 94 + .../ops/cl/bilinear_grid_sample_reflect.cl | 169 ++ .../internal/src/libnnext/ops/cl/maxpool.cl | 10 +- .../vx/internal/src/libnnext/ops/cl/topk.cl | 10 +- .../vx/internal/src/libnnext/ops/cl/topk2.cl | 368 +++ .../src/libnnext/ops/cl/topk_odd_even_sort.cl | 125 +- .../libnnext/ops/cl/topk_odd_even_sort2.cl | 81 +- ...linear_grid_sample_reflect_BF16_to_BF16.vx | 171 ++ ...bilinear_grid_sample_reflect_F16_to_F16.vx | 217 ++ .../bilinear_grid_sample_reflect_F16_to_U8.vx | 224 ++ ...bilinear_grid_sample_reflect_I16_to_I16.vx | 160 ++ .../bilinear_grid_sample_reflect_I8_to_I8.vx | 160 ++ .../bilinear_grid_sample_reflect_U8_to_U8.vx | 224 ++ .../src/libnnext/vsi_nn_libnnext_resource.c | 2045 +++++++++++++++-- src/tim/vx/internal/src/ops/vsi_nn_op_clip.c | 10 +- .../src/ops/vsi_nn_op_conv2d_lstm_cell.c | 3 + .../src/ops/vsi_nn_op_deconvolution.c | 7 +- .../internal/src/ops/vsi_nn_op_extra_ending.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_gather.c | 19 + .../src/ops/vsi_nn_op_gather_elements.c | 3 + .../internal/src/ops/vsi_nn_op_grid_sample.c | 28 +- .../src/ops/vsi_nn_op_instancenormalize.c | 28 + .../src/ops/vsi_nn_op_layernormalize.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_reduce.c | 97 +- .../vx/internal/src/ops/vsi_nn_op_reducel2.c | 2 +- .../internal/src/ops/vsi_nn_op_relu_keras.c | 10 +- .../vx/internal/src/ops/vsi_nn_op_reshape.c | 6 +- .../vx/internal/src/ops/vsi_nn_op_reshape2.c | 5 + .../src/ops/vsi_nn_op_resize_internal.c | 3 + .../vx/internal/src/ops/vsi_nn_op_rmsnorm.c | 202 ++ src/tim/vx/internal/src/ops/vsi_nn_op_shape.c | 196 ++ .../src/ops/vsi_nn_op_strided_slice.c | 9 + src/tim/vx/internal/src/ops/vsi_nn_op_topk.c | 7 +- .../src/ops/vsi_nn_op_upsamplescale.c | 11 +- .../src/utils/vsi_nn_code_generator.c | 2 + src/tim/vx/internal/src/utils/vsi_nn_util.c | 22 +- src/tim/vx/internal/src/vsi_nn_context.c | 2 +-
src/tim/vx/internal/src/vsi_nn_graph.c | 8 +- .../vx/internal/src/vsi_nn_internal_node.c | 6 + src/tim/vx/internal/src/vsi_nn_log.c | 2 +- .../vx/internal/src/vsi_nn_pre_post_process.c | 1 + src/tim/vx/internal/src/vsi_nn_tensor.c | 1 + 69 files changed, 4878 insertions(+), 590 deletions(-) mode change 100755 => 100644 src/tim/vx/internal/include/interface/ops.def create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_rmsnorm.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_shape.h create mode 100644 src/tim/vx/internal/src/kernel/vx/rms_norm_vx.c create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/bilinear_grid_sample_reflect.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_BF16_to_BF16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_F16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_U8.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I16_to_I16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I8_to_I8.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_U8_to_U8.vx create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_shape.c diff --git a/VERSION b/VERSION index da44c7f34..3c43790f5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.1.50 +1.2.6 diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def old mode 100755 new mode 100644 index 6c879e9c9..fe42d4533 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -197,3 +197,5 @@ DEF_OP(RESIZE_3D) DEF_OP(REDUCEL2) DEF_OP(CROP_AND_RESIZE) DEF_OP(TAN) +DEF_OP(RMSNORM) +DEF_OP(SHAPE) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h index 5150b0e4a..d81bd8408 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -26,6 +26,8 @@ #define _VSI_NN_KERNEL_H #include +#include + #include "vsi_nn_log.h" #include "vsi_nn_ops.h" #include "vsi_nn_graph.h" @@ -81,6 +83,7 @@ typedef enum U4, FP8_E4M3, FP8_E5M2, + INVALID_DTYPE, } VSI_PUBLIC_TYPE vsi_nn_kernel_dtype_e; typedef enum @@ -532,9 +535,8 @@ static VSI_INLINE_API vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype return FP8_E5M2; default: VSILOGE("error data type %d", dtype); - break; + return INVALID_DTYPE; } - return I8; } /* vsi_nn_kernel_map_dtype() */ static VSI_INLINE_API vsi_nn_type_e vsi_nn_dtype_map_kernel diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h index 6adc8964d..ba9891925 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h @@ -43,6 +43,7 @@ typedef struct _vsi_nn_resize_internal_param vsi_bool half_pixel_centers; float factor; vsi_enum layout; + vsi_enum type; } vsi_nn_resize_internal_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_rmsnorm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_rmsnorm.h new file mode 100644 index 000000000..12c8113df --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_rmsnorm.h @@ -0,0 +1,54 
@@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_RMSNORM_H +#define _VSI_NN_OP_RMSNORM_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +typedef struct _rmsnorm_local_data_t { + int32_t placeholder; +} rmsnorm_local_data_t; + +typedef struct _vsi_nn_rmsnorm_param +{ + struct _rmsnorm_local_data_t* local; + float eps; + int32_t axis; +} vsi_nn_rmsnorm_param; + +_compiler_assert(offsetof(vsi_nn_rmsnorm_param, local) == 0, \ + vsi_nn_rmsnorm_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_shape.h b/src/tim/vx/internal/include/ops/vsi_nn_op_shape.h new file mode 100644 index 000000000..1ae2a5bb3 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_shape.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_SHAPE_H +#define _VSI_NN_OP_SHAPE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_shape_param +{ + struct _shape_local_data_t* local; + // Add parameters here +} vsi_nn_shape_param; +_compiler_assert(offsetof(vsi_nn_shape_param, local) == 0, \ + vsi_nn_shape_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index 9d32dfb7c..ed7857159 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -33,6 +33,14 @@ extern "C" { #endif +/* + * A helper union for fp32 bit casting. + */ +typedef union { + float val; + uint32_t data; +} fp32_bit_cast_t; + static VSI_INLINE_API vsi_bool type_is_integer ( const vsi_nn_type_e type @@ -203,9 +211,11 @@ static VSI_INLINE_API vsi_bool fp32_is_inf float val ) { - uint32_t u_value = *(uint32_t*)&val; + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = val; + uint32_t fp32_data = fp32_bit_cast.data; - if ((u_value & (uint32_t)VSI_NN_INT32_MAX) == (uint32_t)VSI_NN_FLOAT32_INF) + if ((fp32_data & (uint32_t)VSI_NN_INT32_MAX) == (uint32_t)VSI_NN_FLOAT32_INF) { return TRUE; } @@ -232,7 +242,9 @@ static VSI_INLINE_API int32_t fp32_to_affine if (fp32_is_inf(in) != 0) { - uint32_t sign = (*(uint32_t*)&in) >> 31; + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = in; + uint32_t sign = fp32_bit_cast.data >> 31; data = sign == 1 ? (int32_t)min_range : (int32_t)max_range; } @@ -277,7 +289,9 @@ static VSI_INLINE_API int32_t fp32_to_dfp if (fp32_is_inf(in) != 0) { - uint32_t sign = (*(uint32_t*)&in) >> 31; + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = in; + uint32_t sign = fp32_bit_cast.data >> 31; data = sign == 1 ? (int32_t)min_range : (int32_t) max_range; } @@ -373,8 +387,9 @@ static VSI_INLINE_API float bfp16_to_fp32 int16_t in ) { - int32_t t1, t2, t3; + uint32_t t1, t2, t3; float out; + fp32_bit_cast_t fp32_bit_cast; t1 = in & 0x00FF; // Mantissa t2 = in & 0xFF00; // Sign bit + Exponent @@ -384,9 +399,10 @@ static VSI_INLINE_API float bfp16_to_fp32 t2 <<= 16; // Shift (sign + Exponent) bit into position t1 |= t2; // Re-insert (sign + Exponent) bit - *((uint32_t*)&out) = t1; + fp32_bit_cast.data = t1; + out = fp32_bit_cast.val; - return t3 == 0 ? 0 : out; + return t3 == 0 ? 0.0f : out; } /* bfp16_to_fp32() */ static VSI_INLINE_API uint16_t fp32_to_fp16 @@ -394,10 +410,12 @@ static VSI_INLINE_API uint16_t fp32_to_fp16 float in ) { - uint32_t fp32 = *((uint32_t *) &in); - uint32_t t1 = (fp32 & 0x80000000u) >> 16; /* sign bit. */ - uint32_t t2 = (fp32 & 0x7F800000u) >> 13; /* Exponent bits */ - uint32_t t3 = (fp32 & 0x007FE000u) >> 13; /* Mantissa bits, no rounding */ + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = in; + uint32_t fp32_data = fp32_bit_cast.data; + uint32_t t1 = (fp32_data & 0x80000000u) >> 16; /* sign bit. 
*/ + uint32_t t2 = (fp32_data & 0x7F800000u) >> 13; /* Exponent bits */ + uint32_t t3 = (fp32_data & 0x007FE000u) >> 13; /* Mantissa bits, no rounding */ uint32_t fp16 = 0u; if( t2 >= 0x023c00u ) { @@ -420,8 +438,10 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16 float in ) { - uint32_t fp32 = *((unsigned int *) &in); - uint32_t t1 = fp32 >> 16; + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = in; + uint32_t fp32_data = fp32_bit_cast.data; + uint32_t t1 = fp32_data >> 16; return (uint16_t) t1; } /* fp32_to_bfp16() */ @@ -435,10 +455,12 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne Convert a float point to bfloat16, with round-nearest-to-even as rounding method. */ - uint32_t fp32 = *((unsigned int *) &in); + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = in; + uint32_t fp32_data = fp32_bit_cast.data; uint16_t out; - uint32_t lsb = (fp32 >> 16) & 1; /* Least significant bit of resulting bfloat. */ + uint32_t lsb = (fp32_data >> 16) & 1; /* Least significant bit of resulting bfloat. */ uint32_t rounding_bias = 0x7fff + lsb; if ( VSI_NN_FLOAT32_NAN == in ) @@ -447,8 +469,8 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne } else { - fp32 += rounding_bias; - out = (uint16_t) (fp32 >> 16); + fp32_data += rounding_bias; + out = (uint16_t) (fp32_data >> 16); } return out; @@ -466,7 +488,9 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) { float fp8_f32 = in / scale; - int32_t in_val = *((int32_t*)&fp8_f32); + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = fp8_f32; + uint32_t in_val = fp32_bit_cast.data; uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */ uint32_t in_exp = (in_val >> FLOAT_MANTISSA_SIZE) & 0xFF; /* bit[30: 24] is exp */ @@ -512,7 +536,9 @@ static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) { static VSI_INLINE_API uint8_t fp32_to_fp8_e5m2(float in, const float scale) { float fp8_f32 = in / scale; - int32_t in_val = *((int32_t*)&fp8_f32); + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = fp8_f32; + uint32_t in_val = fp32_bit_cast.data; uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */ uint32_t in_exp = (in_val >> FLOAT_MANTISSA_SIZE) & 0xFF; /* bit[30: 24] is exp */ uint32_t in_man = (in_val & 0x7FFFFF); /* low 23 bits is man */ @@ -561,6 +587,7 @@ static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) { uint32_t exponentOut = 0; uint32_t mantissaOut = 0; uint32_t out_u = 0; + fp32_bit_cast_t fp32_bit_cast; { uint32_t signIn; @@ -610,7 +637,8 @@ static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) { } final: out_u = signOut << 31 | exponentOut << 23 | mantissaOut; - val_fp32 = *((float*)&out_u); + fp32_bit_cast.data = out_u; + val_fp32 = fp32_bit_cast.val; return val_fp32 * scale; } /* fp8_e4m3_to_fp32() */ @@ -621,6 +649,7 @@ static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) { uint32_t exponentOut = 0; uint32_t mantissaOut = 0; uint32_t out_u = 0; + fp32_bit_cast_t fp32_bit_cast; { uint32_t signIn; @@ -670,7 +699,8 @@ static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) { } final: out_u = signOut << 31 | exponentOut << 23 | mantissaOut; - val_fp32 = *((float*)&out_u); + fp32_bit_cast.data = out_u; + val_fp32 = fp32_bit_cast.val; return val_fp32 * scale; } /* fp8_e5m2_to_fp32() */ diff --git a/src/tim/vx/internal/include/vsi_nn_context.h 
b/src/tim/vx/internal/include/vsi_nn_context.h index 4ac9f6113..477cb19f4 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -60,9 +60,7 @@ typedef struct _vsi_nn_hw_config_t { char target_name[VSI_NN_MAX_TARGET_NAME]; vsi_nn_hw_evis_t evis; -#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT uint32_t subGroupSize; -#endif uint32_t use_40bits_va; uint32_t support_stream_processor; uint32_t sp_exec_count; diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 173be9409..a18e89494 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -210,6 +210,8 @@ #include "ops/vsi_nn_op_resize_3d.h" #include "ops/vsi_nn_op_reducel2.h" #include "ops/vsi_nn_op_crop_and_resize.h" +#include "ops/vsi_nn_op_rmsnorm.h" +#include "ops/vsi_nn_op_shape.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" #include "ops/vsi_nn_op_inverse_sigmoid.h" @@ -408,6 +410,8 @@ typedef union _vsi_nn_nn_param vsi_nn_resize_3d_param resize_3d; vsi_nn_reducel2_param reducel2; vsi_nn_crop_and_resize_param crop_and_resize; + vsi_nn_rmsnorm_param rmsnorm; + vsi_nn_shape_param shape; void* client_param; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 2b7e1bd04..0fafc241d 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 2 -#define VSI_NN_VERSION_PATCH 2 +#define VSI_NN_VERSION_PATCH 6 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c index b36ec6b14..5f34aaa41 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c @@ -267,7 +267,7 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t node_params[_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM]; vsi_nn_kernel_node_t node = NULL; - vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; VSI_UNREFERENCED(params); diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c index 94a2d704a..631c2edea 100644 --- a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c +++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c @@ -55,8 +55,17 @@ static vsi_status op_compute vsi_nn_kernel_param_t * param = NULL; vsi_nn_custom_warp_affine_param * p; p = &(self->nn_param.custom_warp_affine); - + float matrix_shape[6] = { 1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f}; param = vsi_nn_kernel_param_create(); + //Unlike OpenCV, we use the coordinate of dst and matrix to calculate the coordinate of src in custom_warp_affine. + //Therefore, matrix M_ovx in custom_warp_affine is different from matrix M_cv in OpenCV. + //We get M_ovx by transposing the inverse of M_cv. 
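+//For example (illustrative values only, not taken from the driver): for a pure translation
+//M_cv = [[1, 0, tx], [0, 1, ty]], the inverse is [[1, 0, -tx], [0, 1, -ty]], so after the
+//transpose M_ovx = [[1, 0], [0, 1], [-tx, -ty]].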
+ //inv_M = cv2.invertAffineTransform(M_cv); M_ovx=inv_M.transpose(1,0) + if (p->matrix == NULL) + { + p->matrix = matrix_shape; + } + vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 6 ); vsi_nn_kernel_param_add_int32( param, "type", p->type); vsi_nn_kernel_param_add_int32( param, "rgb_type", p->rgb_type); diff --git a/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c b/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c index 84811fd82..095ed2418 100644 --- a/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c @@ -47,18 +47,26 @@ typedef enum } _internal_kernel_e; #define _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() "bilinear_grid_sample" +#define _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE() "bilinear_grid_sample_reflect" #define STR(a) #a // Add kernel hashtable here -#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ - ((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE)) +#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, REFLECT) \ + ((IN1_DTYPE << 24) | (IN0_DTYPE << 16) | (OUT_DTYPE << 8) | (REFLECT)) #define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ { \ - BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ - CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ - _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() \ + BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() \ + } + +#define PACK_REFLECT_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { \ + BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_reflect_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE() \ } typedef struct @@ -73,6 +81,8 @@ static const _kernel_map_type _bilinear_grid_sample_kernel_map[] = // Register kernel here PACK_KERNEL_MAP(F32, F32, F32 ), PACK_KERNEL_MAP(U8, U8, U8), + PACK_REFLECT_KERNEL_MAP(F32, F32, F32), + PACK_REFLECT_KERNEL_MAP(U8, U8, U8), }; @@ -95,23 +105,24 @@ static vx_param_description_t _bilinear_grid_sample_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _BILINEAR_GRID_SAMPLE_PARAM_NUM 8 -#define _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM \ +#define _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM 14 +#define _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_NUM 12 +#define _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM \ _cnt_of_array(_bilinear_grid_sample_kernel_param_def) + #define SCALAR_HALF_INPUT0_W (3) #define SCALAR_HALF_INPUT0_H (4) #define SCALAR_ADD_VALUE_W (5) #define SCALAR_ADD_VALUE_H (6) #define SCALAR_DEPTH (7) -#define SCALAR_INPUT0_SCALE (8) -#define SCALAR_INPUT0_TAIL (9) -#define SCALAR_INPUT1_SCALE (10) -#define SCALAR_INPUT1_TAIL (11) -#define SCALAR_OUTPUT_SCALE (12) -#define SCALAR_OUTPUT_TAIL (13) /* * Kernel initializer @@ -170,7 +181,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const 
inputs, vsi_nn_tensor_t * const * const outputs, - vsi_bool* is_use_u8_kernel + vsi_bool* is_use_u8_kernel, + int32_t is_reflect_mode ) { vsi_status status = VSI_FAILURE; @@ -199,14 +211,29 @@ static vsi_status _query_kernel out_dtype = F32; } if ((U8 == in0_dtype) || (U8 == out_dtype)) { - param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM; + if (is_reflect_mode) + { + param_def_size = _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM; + } + else{ + param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM; + } + *is_use_u8_kernel = TRUE; } else { - param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_NUM; + if (is_reflect_mode) + { + param_def_size = _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_NUM; + } + else + { + param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_NUM; + } + *is_use_u8_kernel = FALSE; } - key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype); + key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, is_reflect_mode); for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { @@ -245,7 +272,7 @@ static vsi_nn_kernel_node_t _setup { vsi_nn_kernel_node_t node = NULL; vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM]; + vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM]; vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; uint32_t final_in1_rank = 0; vsi_nn_tensor_t* rs_tensors = NULL; @@ -263,11 +290,14 @@ static vsi_nn_kernel_node_t _setup vsi_bool is_use_u8_kernel = FALSE; int32_t align_corners = vsi_nn_kernel_param_get_int32(params, "align_corners"); + int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode"); uint32_t pad_val = 0; int32_t depth = 0; vsi_nn_kernel_dtype_e in0_dtype; float half_input0_w, half_input0_h, add_float_value_w, add_float_value_h; + int32_t is_reflect_mode = 0; + float min_val_w, span_w, min_val_h, span_h; // Check if gpu can support the size if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size, @@ -280,6 +310,11 @@ static vsi_nn_kernel_node_t _setup return NULL; } + if (pad_mode == VSI_NN_PAD_MODE_REFLECT) + { + is_reflect_mode = 1; + } + final_tensors[0] = inputs[0]; if (inputs[1]->attr.dim_num >= 3) { @@ -313,12 +348,35 @@ static vsi_nn_kernel_node_t _setup add_float_value_h = half_input0_h - 0.5f; } + if (is_reflect_mode) + { + float low_w, low_h, high_w, high_h; + if (align_corners) + { + low_w = 0; + low_h = 0; + high_w = 2 * (float)(in0_width - 1); + high_h = 2 * (float)(in0_height - 1); + } + else + { + low_w = -1; + low_h = -1; + high_w = 2 * (float)in0_width - 1; + high_h = 2 * (float)in0_height - 1; + } + min_val_w = low_w / 2; + span_w = (high_w - low_w) / 2; + min_val_h = low_h / 2; + span_h = (high_h - low_h) / 2; + } + depth = (int32_t)inputs[0]->attr.size[2]; in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); if (U8 == in0_dtype) { pad_val = inputs[0]->attr.dtype.zero_point; } - status = _query_kernel(kernel, inputs, outputs, &is_use_u8_kernel); + status = _query_kernel(kernel, inputs, outputs, &is_use_u8_kernel, is_reflect_mode); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -326,7 +384,7 @@ static vsi_nn_kernel_node_t _setup { size_t node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_NUM; /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM, + vsi_nn_kernel_node_pack_io( node_params, _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM, final_tensors, input_num, &final_tensors[2], output_num ); 
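/* Illustration (assumed values, for clarity only): with align_corners == 0 and
 * in0_width == 4, low_w = -1 and high_w = 7, so min_val_w = -0.5 and span_w = 4;
 * the reflect kernel folds every x coordinate back into
 * [min_val_w, min_val_w + span_w) = [-0.5, 3.5) before sampling. The four reflect
 * scalars are appended below only when is_reflect_mode is set. */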
node_params[SCALAR_HALF_INPUT0_W] = vsi_nn_kernel_scalar_create( graph, F32, &half_input0_w ); node_params[SCALAR_HALF_INPUT0_H] = vsi_nn_kernel_scalar_create( graph, F32, &half_input0_h ); @@ -335,13 +393,19 @@ static vsi_nn_kernel_node_t _setup node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth ); if (is_use_u8_kernel) { - node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale ); - node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input0_tail ); - node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1_scale ); - node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input1_tail ); - node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); - node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); - node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM; + node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale ); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input0_tail ); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input1_scale ); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input1_tail ); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + } + if (is_reflect_mode) + { + node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &min_val_w); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &span_w); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &min_val_h); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &span_h); } /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); @@ -351,19 +415,34 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_W]); vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_H]); vsi_nn_kernel_scalar_release(&node_params[SCALAR_DEPTH]); + node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_NUM; if (is_use_u8_kernel) { - vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_SCALE]); - vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_TAIL]); - vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_SCALE]); - vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_TAIL]); - vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_SCALE]); - vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_TAIL]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + } + if (is_reflect_mode) + { + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); } { // Set default border mode. 
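/* For constant padding the border is filled with pad_val (the input zero point for
 * U8 inputs); for the other padding modes VX_BORDER_REPLICATE is used, so
 * out-of-range reads clamp to the nearest edge pixel. */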
vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U32 = pad_val; + if (pad_mode == VSI_NN_PAD_MODE_CONSTANT) + { + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = pad_val; + } + else + { + border.mode = VX_BORDER_REPLICATE; + } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border)); CHECK_STATUS(status); diff --git a/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c index 82838648c..abb4ed4e3 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c @@ -244,7 +244,10 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer) final: #define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(input_attr0); + SAFE_FREE_TENSOR_ATTR(input_attr1); SAFE_FREE_TENSOR_ATTR(output_attr); + return status; } /* _gather_elements_initializer() */ diff --git a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c index aeb8b4d6c..67d4d932b 100644 --- a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c @@ -34,6 +34,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" #include "kernel/vsi_nn_kernel.h" __BEGIN_DECLS @@ -1489,8 +1490,8 @@ static vsi_nn_kernel_node_t _setup float twoLogE = 2 * logE; uint32_t uint_min = 0xFBFFFFFF; uint32_t uint_max = 0x7BFFFFFF; - float float_min = *(vx_float32 *)&uint_min; - float float_max = *(vx_float32 *)&uint_max; + float float_min = 0.0f; + float float_max = 0.0f; float scale_val[9] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; float tail_val[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; vsi_bool is_u8_type = FALSE; @@ -1499,6 +1500,12 @@ static vsi_nn_kernel_node_t _setup size_t lstm_activation_in_out_num = 0; uint32_t i; + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.data = uint_min; + float_min = fp32_bit_cast.val; + fp32_bit_cast.data = uint_max; + float_max = fp32_bit_cast.val; + _is_ln = vsi_nn_kernel_param_get_int32( params, "_is_ln" ); _is_cifg = vsi_nn_kernel_param_get_int32( params, "_is_cifg" ); _is_proj = vsi_nn_kernel_param_get_int32( params, "_is_proj" ); diff --git a/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c b/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c index cc6d53800..8eee2c474 100644 --- a/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c @@ -266,6 +266,7 @@ static vsi_nn_kernel_node_t _setup vsi_bool is_use_u8_kernel = FALSE; int32_t align_corners = vsi_nn_kernel_param_get_int32(params, "align_corners"); + int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode"); uint32_t pad_val = 0; int32_t depth = 0; vsi_nn_kernel_dtype_e in0_dtype; @@ -282,6 +283,11 @@ static vsi_nn_kernel_node_t _setup return NULL; } + if (pad_mode == VSI_NN_PAD_MODE_REFLECT) + { + return NULL; + } + final_tensors[0] = inputs[0]; if (inputs[1]->attr.dim_num >= 3) { final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0]; @@ -382,8 +388,15 @@ static vsi_nn_kernel_node_t _setup { // Set default border mode. 
vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U32 = pad_val; + if (pad_mode == VSI_NN_PAD_MODE_CONSTANT) + { + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = pad_val; + } + else + { + border.mode = VX_BORDER_REPLICATE; + } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border)); CHECK_STATUS(status); diff --git a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c index a66b89b3e..eb0e556fb 100644 --- a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c @@ -36,6 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" __BEGIN_DECLS @@ -242,6 +243,7 @@ static vsi_nn_kernel_node_t _setup vsi_size_t suffix_dim_size = 0; int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" ); vsi_nn_kernel_dtype_e out_dtype; + fp32_bit_cast_t fp32_bit_cast; uint32_t data[2] = {0}; float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" ); float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" ); @@ -258,8 +260,11 @@ static vsi_nn_kernel_node_t _setup } else { - data[0] = *(uint32_t*)&on_value; - data[1] = *(uint32_t*)&off_value; + fp32_bit_cast.val = on_value; + data[0] = fp32_bit_cast.data; + + fp32_bit_cast.val = off_value; + data[1] = fp32_bit_cast.data; } axis = axis == -1 ? (int32_t)inputs[0]->attr.dim_num : (int32_t)inputs[0]->attr.dim_num - axis; diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c index b8cdfd086..a21d29093 100644 --- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -34,20 +34,24 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS #define _TOPK_KERNEL_SOURCE "topk" #define STR(a) #a // Add kernel hashtable here -#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \ - ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) ) +#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, SECTION ) \ + ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) | (SECTION << 26)) #define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \ - { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \ + { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, 0 ), \ CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ _TOPK_KERNEL_SOURCE } +#define PACK_MERGE_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, 1 ), \ + CVIVANTE_NAMESPACE("cl.topk_stage_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ + "topk2" } + #define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) ) #define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ @@ -111,6 +115,9 @@ static const _kernel_map_type _topk_kernel_map[] = PACK_KERNEL_MAP( F32, I32, 4 ), PACK_KERNEL_MAP( F32, I32, 5 ), PACK_KERNEL_MAP( F32, I32, 6 ), + + PACK_MERGE_KERNEL_MAP(U32, U32), + PACK_MERGE_KERNEL_MAP(I32, I32), }; static const _kernel_map_type _topk_odd_even_sort_kernel_map[] = @@ -254,7 +261,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, - int32_t num_stages + int32_t num_stages, + vsi_bool is_bitnoic_segment ) { vsi_status status = VSI_FAILURE; @@ -272,21 +280,23 @@ static vsi_status 
_query_kernel in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + num_stages = is_bitnoic_segment ? 0 : num_stages; + switch (_PACK_SELECT_KEY(in_dtype, out_dtype)) { case _PACK_SELECT_KEY(F32, F32): case _PACK_SELECT_KEY(F16, F16): - key = TOPK_HASH_KEY( F32, F32, num_stages ); + key = TOPK_HASH_KEY( F32, F32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(U32, U32): case _PACK_SELECT_KEY(U16, U16): case _PACK_SELECT_KEY(U8, U8): - key = TOPK_HASH_KEY( U32, U32, num_stages ); + key = TOPK_HASH_KEY( U32, U32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(I32, I32): case _PACK_SELECT_KEY(I16, I16): case _PACK_SELECT_KEY(I8, I8): - key = TOPK_HASH_KEY( I32, I32, num_stages ); + key = TOPK_HASH_KEY( I32, I32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(F32, U32): case _PACK_SELECT_KEY(F16, U32): @@ -294,7 +304,7 @@ static vsi_status _query_kernel case _PACK_SELECT_KEY(F16, U16): case _PACK_SELECT_KEY(F32, U8): case _PACK_SELECT_KEY(F16, U8): - key = TOPK_HASH_KEY( F32, U32, num_stages ); + key = TOPK_HASH_KEY( F32, U32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(F32, I32): case _PACK_SELECT_KEY(F16, I32): @@ -302,7 +312,7 @@ static vsi_status _query_kernel case _PACK_SELECT_KEY(F16, I16): case _PACK_SELECT_KEY(F32, I8): case _PACK_SELECT_KEY(F16, I8): - key = TOPK_HASH_KEY( F32, I32, num_stages ); + key = TOPK_HASH_KEY( F32, I32, num_stages, is_bitnoic_segment ); break; default: break; @@ -440,7 +450,12 @@ static vsi_nn_kernel_node_t _setup int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k"); int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0); vsi_bool is_odd_even_sort = FALSE; + vsi_bool is_bitnoic_segment = FALSE; size_t param_num = _TOPK_PARAM_NUM; + int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2); + vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); float outputScale = vsi_nn_get_tensor_scale(outputs[0]); @@ -471,9 +486,14 @@ static vsi_nn_kernel_node_t _setup rs_tensors[0] = vsi_nn_reshape_tensor( graph, inputs[0], shape[0], 2 ); - if (num_stages < 7) + is_bitnoic_segment = (num_stages >= 9) && (top_k <= 512 && max_stages > 9) && + type0 == type1 && (type0 == U8 || type0 == I8 || type0 == I16 || type0 == U16 || type0 == I32 || type0 == U32); + num_stages = is_bitnoic_segment ? 9 : num_stages; + max_stages = is_bitnoic_segment ? 
max_stages : 7; + + if (num_stages < max_stages || is_bitnoic_segment) { - status = _query_kernel( kernel, inputs, outputs, num_stages ); + status = _query_kernel( kernel, inputs, outputs, num_stages, is_bitnoic_segment ); rs_tensors[1] = vsi_nn_reshape_tensor( graph, outputs[0], shape[1], 2 ); diff --git a/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c index a94c93c8c..ed1e295fb 100644 --- a/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c @@ -52,15 +52,26 @@ typedef enum #define _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(_input_type, _output_type) \ "bilinear_grid_sample_" #_input_type "_to_" #_output_type +#define _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE(_input_type, _output_type) \ + "bilinear_grid_sample_reflect_" #_input_type "_to_" #_output_type + // Add kernel hashtable here -#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ - ((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE)) +#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, REFLECT) \ + ((IN1_DTYPE << 24) | (IN0_DTYPE << 16) | (OUT_DTYPE << 8) | (REFLECT)) + #define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ - { \ - BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ - CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ - _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \ - } + { \ + BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \ + } + +#define PACK_REFLECT_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { \ + BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_reflect_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \ + } typedef struct { @@ -83,6 +94,18 @@ static const _kernel_map_type _bilinear_grid_sample_kernel_map[] = PACK_KERNEL_MAP(I16, I16, I16), PACK_KERNEL_MAP(I8, I8, I8), PACK_KERNEL_MAP(BF16, BF16, BF16), + PACK_REFLECT_KERNEL_MAP(F16, F32, F16), + PACK_REFLECT_KERNEL_MAP(F16, U8, F16), + PACK_REFLECT_KERNEL_MAP(F16, F16, F16), + PACK_REFLECT_KERNEL_MAP(F16, F32, U8), + PACK_REFLECT_KERNEL_MAP(F16, F16, U8), + PACK_REFLECT_KERNEL_MAP(F16, U8, U8), + PACK_REFLECT_KERNEL_MAP(U8, U8, U8), + PACK_REFLECT_KERNEL_MAP(U8, F16, U8), + PACK_REFLECT_KERNEL_MAP(U8, F32, U8), + PACK_REFLECT_KERNEL_MAP(I16, I16, I16), + PACK_REFLECT_KERNEL_MAP(I8, I8, I8), + PACK_REFLECT_KERNEL_MAP(BF16, BF16, BF16), }; @@ -96,18 +119,20 @@ static vx_param_description_t _bilinear_grid_sample_kernel_param_def[] = {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; -#define _BILINEAR_GRID_SAMPLE_PARAM_NUM _cnt_of_array( _bilinear_grid_sample_kernel_param_def ) +#define _BILINEAR_GRID_SAMPLE_PARAM_NUM \ +_cnt_of_array( _bilinear_grid_sample_kernel_param_def ) #define SCALAR_ALIGN_CORNERS (3) /* * Kernel initializer */ -DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) +static vsi_status _bilinear_grid_sample_initializer_base ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, - size_t param_size + size_t param_size, + vsi_bool 
is_reflect_mode ) { vsi_status status = VSI_FAILURE; @@ -135,6 +160,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) int32_t input1ZP = 0; float output_scale = 1.0; int32_t outputZP = 0; + float min_val_wh[4] = { 0 }; + float span_wh[4] = { 0 }; VSI_UNREFERENCED(param_size); @@ -156,6 +183,7 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) (vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); CHECK_STATUS_FAIL_GOTO(status, final); + out_shape = output_attr->shape; in0_shape = input_attr[0]->shape; input0_dtype = input_attr[0]->dtype; @@ -193,6 +221,35 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "add_float_value", add_float_value); status |= vsi_nn_kernel_gpu_add_param(node, "depth", &depth); + if (is_reflect_mode) + { + float low_w, low_h, high_w, high_h; + if (align_corners) + { + low_w = 0; + low_h = 0; + high_w = 2 * (float)(in0_width - 1); + high_h = 2 * (float)(in0_height - 1); + } + else + { + low_w = -1; + low_h = -1; + high_w = 2 * (float)in0_width - 1; + high_h = 2 * (float)in0_height - 1; + } + min_val_wh[0] = low_w / 2; + span_wh[0] = (high_w - low_w) / 2; + min_val_wh[1] = low_h / 2; + span_wh[1] = (high_h - low_h) / 2; + min_val_wh[2] = min_val_wh[0]; + min_val_wh[3] = min_val_wh[1]; + span_wh[2] = span_wh[0]; + span_wh[3] = span_wh[1]; + status |= vsi_nn_kernel_gpu_add_param(node, "span_wh", span_wh); + status |= vsi_nn_kernel_gpu_add_param(node, "min_val_wh", min_val_wh); + } + { gpu_dp_inst_t uniFp16toFp32_part0_4x4 = { { @@ -538,6 +595,28 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) +DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) +( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t* param, + size_t param_size + ) +{ + return _bilinear_grid_sample_initializer_base( + node, param, param_size, vx_false_e); +} + +DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_reflect_initializer) +( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t* param, + size_t param_size + ) +{ + return _bilinear_grid_sample_initializer_base( + node, param, param_size, vx_true_e); +} + /* * Query kernel */ @@ -545,7 +624,8 @@ static vsi_status _query_kernel ( vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs + vsi_nn_tensor_t * const * const outputs, + int32_t is_reflect_mode ) { vsi_status status = VSI_FAILURE; @@ -563,7 +643,16 @@ static vsi_status _query_kernel in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype); + key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, is_reflect_mode); + + if (is_reflect_mode) + { + initializer = _bilinear_grid_sample_reflect_initializer; + } + else + { + initializer = _bilinear_grid_sample_initializer; + } for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { @@ -605,13 +694,21 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_PARAM_NUM]; vsi_nn_kernel_node_t node = NULL; vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; - uint32_t final_in1_rank = 0; + vsi_size_t final_out_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; + uint32_t final_in1_rank = 0, final_out_rank = 0; vsi_nn_tensor_t* rs_tensors = NULL; + vsi_nn_tensor_t* rs_out_tensors = NULL; vsi_nn_tensor_t* final_tensors[3] = {NULL}; vsi_nn_kernel_dtype_e in0_dtype; uint32_t 
pad_val = 0; int32_t align_corners = vsi_nn_kernel_param_get_int32(params, "align_corners"); + int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode"); + int32_t is_reflect_mode = 0; + vsi_size_t in_size_x = inputs[1]->attr.size[1]; + vsi_size_t in_size_y = inputs[1]->attr.dim_num >= 3 ? inputs[1]->attr.size[2] : 1; + vsi_size_t new_size_x = in_size_x, new_size_y = in_size_y; + vsi_bool is_reshape_out = vx_false_e; // Check if gpu can support the size if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size, @@ -624,12 +721,63 @@ static vsi_nn_kernel_node_t _setup return NULL; } + if (pad_mode == VSI_NN_PAD_MODE_REFLECT) + { + is_reflect_mode = 1; + } + final_tensors[0] = inputs[0]; + is_reshape_out = vx_false_e; if (inputs[1]->attr.dim_num >= 3) { + vsi_size_t shape_x[2]; + vsi_size_t out_shape_x[2]; + vsi_size_t out_rank_x; + shape_x[0] = in_size_x; + shape_x[1] = in_size_y; + vsi_nn_kernel_optimize_element_shape(shape_x, 2, out_shape_x, &out_rank_x); + if (out_rank_x == 2) + { + new_size_x = out_shape_x[0]; + new_size_y = out_shape_x[1]; + } - final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0]; - final_shape[1] = inputs[1]->attr.size[2]; + if ((new_size_x == in_size_x) && (new_size_y == in_size_y)) + { + is_reshape_out = vx_false_e; + } + else if ((new_size_x * 2) >= GPU_TENSOR_MAX_WIDTH) + { + is_reshape_out = vx_false_e; + } + else + { + is_reshape_out = vx_true_e; + } + + if (is_reshape_out == vx_false_e) + { + new_size_x = in_size_x; + new_size_y = in_size_y; + if ((new_size_x < new_size_y) && ((new_size_y * 2) < GPU_TENSOR_MAX_WIDTH)) + { + vsi_size_t tmp = new_size_x; + new_size_x = new_size_y; + new_size_y = tmp; + is_reshape_out = vx_true_e; + } + } + + } + + if (((new_size_x * 2) >= GPU_TENSOR_MAX_WIDTH) || (new_size_y >= GPU_TENSOR_MAX_WIDTH)) + { + return NULL; + } + + if (inputs[1]->attr.dim_num >= 3) { + final_shape[0] = new_size_x * inputs[1]->attr.size[0]; + final_shape[1] = new_size_y; final_shape[2] = 1; final_shape[3] = inputs[1]->attr.dim_num > 3 ? inputs[1]->attr.size[3] : 1; final_in1_rank = @@ -643,14 +791,32 @@ static vsi_nn_kernel_node_t _setup } else { final_tensors[1] = inputs[1]; } - final_tensors[2] = outputs[0]; + + if (is_reshape_out) + { + final_out_shape[0] = new_size_x; + final_out_shape[1] = new_size_y; + final_out_shape[2] = outputs[0]->attr.dim_num > 2 ? outputs[0]->attr.size[2] : 1; + final_out_shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; + final_out_rank = outputs[0]->attr.dim_num; + if (!vsi_nn_kernel_gpu_check_shape(final_out_shape, final_out_rank)) { + return NULL; + } + + rs_out_tensors = vsi_nn_reshape_tensor(graph, outputs[0], final_out_shape, final_out_rank); + final_tensors[2] = rs_out_tensors; + } + else + { + final_tensors[2] = outputs[0]; + } in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); if (U8 == in0_dtype) { pad_val = inputs[0]->attr.dtype.zero_point; } - status = _query_kernel( kernel, inputs, outputs ); + status = _query_kernel( kernel, inputs, outputs, is_reflect_mode); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -662,14 +828,22 @@ static vsi_nn_kernel_node_t _setup node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create(graph, I32, &align_corners); /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _BILINEAR_GRID_SAMPLE_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( + node, node_params, _BILINEAR_GRID_SAMPLE_PARAM_NUM ); VSI_ASSERT(status == VSI_SUCCESS); vsi_nn_kernel_scalar_release(&node_params[SCALAR_ALIGN_CORNERS]); { // Set default border mode. vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U32 = pad_val; + if (pad_mode == VSI_NN_PAD_MODE_CONSTANT) + { + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = pad_val; + } + else + { + border.mode = VX_BORDER_REPLICATE; + } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border)); CHECK_STATUS(status); @@ -678,6 +852,7 @@ static vsi_nn_kernel_node_t _setup } vsi_safe_release_tensor(rs_tensors); + vsi_safe_release_tensor(rs_out_tensors); return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c b/src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c index 012c0408d..9b465badd 100644 --- a/src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c @@ -513,6 +513,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[4] ); vsi_nn_kernel_scalar_release( &node_params[5] ); } + + if ( node ) { // Set default border mode. vx_border_t border; diff --git a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c index 2ec1b1aa6..a393c4559 100644 --- a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c @@ -34,6 +34,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" #include "kernel/vsi_nn_kernel.h" #include "libnnext/vx_lib_nnext.h" @@ -1002,8 +1003,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer) float twoLogE = 2 * logE; uint32_t uint_min = 0xFBFFFFFF; uint32_t uint_max = 0x7BFFFFFF; - float float_min = *(float *)&uint_min; - float float_max = *(float *)&uint_max; + float float_min = 0.0f; + float float_max = 0.0f; float clip_Min_F[4] = {0}; float clip_Max_F[4] = {0}; uint32_t i = 0; @@ -1017,6 +1018,12 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer) vsi_nn_kernel_tensor_attr_t* input_attr[9] = {NULL}; vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL}; + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.data = uint_min; + float_min = fp32_bit_cast.val; + fp32_bit_cast.data = uint_max; + float_max = fp32_bit_cast.val; + status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 5], &_is_ln ); CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 4], &_is_cifg ); diff --git a/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c b/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c index 28ff2d1ae..6554c74a9 100644 --- a/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c @@ -539,6 +539,7 @@ static vsi_nn_kernel_node_t _setup uint32_t pad_val = 0; int32_t align_corners = vsi_nn_kernel_param_get_int32(params, "align_corners"); + int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode"); // Check if gpu can support the size if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size, @@ -551,6 +552,11 @@ static 
vsi_nn_kernel_node_t _setup return NULL; } + if (pad_mode == VSI_NN_PAD_MODE_REFLECT) + { + return NULL; + } + final_tensors[0] = inputs[0]; if (inputs[1]->attr.dim_num >= 3) { @@ -596,8 +602,15 @@ static vsi_nn_kernel_node_t _setup { // Set default border mode. vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U32 = pad_val; + if (pad_mode == VSI_NN_PAD_MODE_CONSTANT) + { + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = pad_val; + } + else + { + border.mode = VX_BORDER_REPLICATE; + } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border)); CHECK_STATUS(status); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c index d9f96b236..167db3e9a 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c @@ -754,7 +754,7 @@ static vsi_nn_kernel_node_t _setup if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size, outputs[0]->attr.dim_num ) ) { - return NULL; + goto final; } if ( width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] && diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c index 0504dff72..17f3bc52e 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c @@ -915,7 +915,7 @@ static vsi_nn_kernel_node_t _setup if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size, outputs[0]->attr.dim_num ) ) { - return NULL; + goto final; } if ( width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] && diff --git a/src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c index 618b33f52..e7b4d445a 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c @@ -290,13 +290,14 @@ static vsi_nn_tensor_t* _create_scale_tensor vsi_nn_tensor_t* scale = NULL; vsi_size_t i = 0; float *scale_data_ptr = NULL; - int *index_data_ptr = NULL; + int32_t *index_data_ptr = NULL; float scale_value = 0; vsi_ssize_t data = 0; - int idx = 0; + int32_t idx = 0; float delta_v = 0; float cubic_coeff_a = -0.5f; vsi_size_t item_count = 4 * output_size; + scale_data_ptr = (float *)malloc(item_count * sizeof(float)); if (scale_data_ptr == NULL) { @@ -316,7 +317,7 @@ static vsi_nn_tensor_t* _create_scale_tensor scale_value = ((float)i + half_pixel_value) * scale_factor - half_pixel_value; data = (vsi_ssize_t)scale_value; delta_v = scale_value - (float)data; - idx = (int)data - 1; + idx = (int32_t)data - 1; index_data_ptr[i] = idx; scale_data_ptr[i * 4 + 0] = cubic_coeff_a * (((delta_v - 4) * (delta_v + 1) + 8) * (delta_v + 1) - 4); @@ -331,11 +332,6 @@ static vsi_nn_tensor_t* _create_scale_tensor attr.vtl = FALSE; scale = vsi_nn_CreateTensorFromData(graph, (uint8_t *)scale_data_ptr, &attr); - if (scale_data_ptr) - { - free (scale_data_ptr); - scale_data_ptr = NULL; - } attr.size[0] = output_size; attr.dim_num = 1; @@ -343,13 +339,11 @@ static vsi_nn_tensor_t* _create_scale_tensor attr.vtl = FALSE; *index = vsi_nn_CreateTensorFromData(graph, (uint8_t *)index_data_ptr, &attr); - if (index_data_ptr) - { - free (index_data_ptr); - 
index_data_ptr = NULL; - } OnError: + vsi_nn_safe_free(scale_data_ptr); + vsi_nn_safe_free(index_data_ptr); + return scale; } diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index 83334269c..331f26298 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -1218,6 +1218,11 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector status = backend->select( graph, inputs, input_num, outputs, output_num, params, &selector ); VSI_ASSERT( status == VSI_SUCCESS ); + + if ( status != VSI_SUCCESS ) { + VSILOGW("Failed to select kernel \"%s\"", kernel_name); + return NULL; + } } else { diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c index d74d6a1c8..0e0b21411 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c @@ -171,7 +171,9 @@ vsi_status vsi_nn_kernel_copy_tensor_patch vsi_nn_kernel_tensor_attr_get_stride( attr, stride ); memset(start, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); - for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + memset(end, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); + + for (i = 0; i < (uint32_t)attr->shape->size; i++) { end[i] = attr->shape->data[i]; if ( attr->dtype != I4 && attr->dtype != U4 ) @@ -490,7 +492,7 @@ vsi_status vsi_nn_kernel_scalar_get_dtype ( vsi_nn_kernel_scalar_t scalar, DTYPE * ptr ) \ { \ vsi_status status; \ - vsi_nn_kernel_dtype_e dtype; \ + vsi_nn_kernel_dtype_e dtype = INVALID_DTYPE; \ if( !ptr ) \ { \ VSILOGE("Pointer to store scalar is null"); \ diff --git a/src/tim/vx/internal/src/kernel/vx/rms_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/rms_norm_vx.c new file mode 100644 index 000000000..df6c7a72a --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/rms_norm_vx.c @@ -0,0 +1,94 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + + +#define REGISTER_RMS_NORM_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_RMS_NORM_OPENVX_KERNEL(rms_norm) +{ + vx_node node = NULL; + +#if (VX_RMS_NORM_VX_SUPPORT) + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + vx_tensor inputs_tensor[2] = {NULL}; + vx_tensor output_tensor = NULL; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + + inputs_tensor[0] = inputs[0]->t; + inputs_tensor[1] = inputs[1]->t; + output_tensor = outputs[0]->t; + + node = vxRMSNormalizationLayer( + graph->g, + eps, + axis, + inputs_tensor, + (uint32_t)input_num, + output_tensor + ); +#else + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); +#endif + + return (vsi_nn_kernel_node_t)node; +} /* rms_norm() */ diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/bilinear_grid_sample_reflect.cl b/src/tim/vx/internal/src/libnnext/ops/cl/bilinear_grid_sample_reflect.cl new file mode 100644 index 000000000..5aed95999 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/bilinear_grid_sample_reflect.cl @@ -0,0 +1,169 @@ +__kernel void bilinear_grid_sample_reflect_F32_F32toF32( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float half_input0_w, + float half_input0_h, + float add_float_value_w, + float add_float_value_h, + int depth, + float min_val_w, + float span_w, + float min_val_h, + float span_h + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1)); + int2 coord_add = (int2)(-1, 1); + + float fx = read_imagef(input1, coord_in1).x; + coord_in1.x = coord_in1.x + 1; + float fy = read_imagef(input1, coord_in1).x; + + fx = fx * half_input0_w + add_float_value_w; + fy = fy * half_input0_h + add_float_value_h; + + if (span_w > 0) + { + fx = fabs(fx - min_val_w); + int flips_x = (int)(fx / span_w); + float extra_x = fx - flips_x * span_w; + fx = (flips_x & 0x01) ? min_val_w + (span_w - extra_x) : min_val_w + extra_x ; + } + else + { + fx = 0; + } + + if (span_h > 0) + { + fy = fabs(fy - min_val_h); + int flips_y = (int)(fy / span_h); + float extra_y = fy - flips_y * span_h; + fy = (flips_y & 0x01) ? 
min_val_h + (span_h - extra_y) : min_val_h + extra_y ; + } + else + { + fy = 0; + } + + float x_f = floor(fx); + float y_f = floor(fy); + float x_lerp = fx - x_f; + float y_lerp = fy - y_f; + int x_index = convert_int(x_f); + int y_index = convert_int(y_f); + int4 coord_in = (int4)(x_index, y_index, 0, 0); + + float4 top_l, top_r, bottom_l, bottom_r, top, bottom, dst; + + while (coord_in.z < depth){ + top_l = read_imagef(input0, coord_in); + coord_in.y++; + bottom_l = read_imagef(input0, coord_in); + coord_in.x++; + bottom_r = read_imagef(input0, coord_in); + coord_in.y--; + top_r = read_imagef(input0, coord_in); + top_r = top_r - top_l; + top = top_l + x_lerp * top_r; + bottom_r = bottom_r - bottom_l; + bottom = bottom_l + x_lerp * bottom_r; + bottom = bottom - top; + dst = top + y_lerp * bottom; + write_imagef(output, coord_out, dst); + coord_in.xz = coord_in.xz + coord_add; + coord_out.z++; + } +} + + +__kernel void bilinear_grid_sample_reflect_U8_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float half_input0_w, + float half_input0_h, + float add_float_value_w, + float add_float_value_h, + int depth, + float in0_scale, + float in0_tail, + float in1_scale, + float in1_tail, + float out_scale, + float out_tail, + float min_val_w, + float span_w, + float min_val_h, + float span_h + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1)); + int2 coord_add = (int2)(-1, 1); + + float fx = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail; + coord_in1.x = coord_in1.x + 1; + float fy = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail; + + fx = fx * half_input0_w + add_float_value_w; + fy = fy * half_input0_h + add_float_value_h; + + if (span_w > 0) + { + fx = fabs(fx - min_val_w); + int flips_x = (int)(fx / span_w); + float extra_x = fx - flips_x * span_w; + fx = (flips_x & 0x01) ? min_val_w + (span_w - extra_x) : min_val_w + extra_x ; + } + else + { + fx = 0; + } + + if (span_h > 0) + { + fy = fabs(fy - min_val_h); + int flips_y = (int)(fy / span_h); + float extra_y = fy - flips_y * span_h; + fy = (flips_y & 0x01) ? 
min_val_h + (span_h - extra_y) : min_val_h + extra_y ; + } + else + { + fy = 0; + } + + float x_f = floor(fx); + float y_f = floor(fy); + float x_lerp = fx - x_f; + float y_lerp = fy - y_f; + int x_index = convert_int(x_f); + int y_index = convert_int(y_f); + int4 coord_in = (int4)(x_index, y_index, 0, 0); + + float4 top_l, top_r, bottom_l, bottom_r, top, bottom; + uint4 dst; + + while (coord_in.z < depth){ + top_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail; + coord_in.y++; + bottom_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail; + coord_in.x++; + bottom_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail; + coord_in.y--; + top_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail; + top_r = top_r - top_l; + top = top_l + x_lerp * top_r; + bottom_r = bottom_r - bottom_l; + bottom = bottom_l + x_lerp * bottom_r; + bottom = bottom - top; + top = top + y_lerp * bottom; + dst = convert_uint4_rte(top * out_scale + out_tail); + write_imageui(output, coord_out, dst); + coord_in.xz = coord_in.xz + coord_add; + coord_out.z++; + } + +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/maxpool.cl b/src/tim/vx/internal/src/libnnext/ops/cl/maxpool.cl index f87e9e449..bd26572ba 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/maxpool.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/maxpool.cl @@ -1,4 +1,4 @@ -#define VSI_FLOAT32_MIN (1.175494351e-38F) +#define VSI_FLOAT32_MIN (-3.40E+38) #define MAXPOOL_QINT(in_name, out_name, src_type, dst_type, max_val, read_func, write_func, conv_func) \ __kernel void maxpool_##in_name##to##out_name( \ @@ -45,7 +45,7 @@ __kernel void maxpool_##in_name##to##out_name( \ { \ src0 = read_func(input, coord_in); \ coord_in.x += dilation_x; \ - maxVal = max(src0, maxVal); \ + maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x; \ } \ } \ \ @@ -101,7 +101,7 @@ __kernel void maxpool_F32toF32( { src0 = read_imagef(input, coord_in); coord_in.x += dilation_x; - maxVal = max(src0, maxVal); + maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x; } } @@ -152,7 +152,7 @@ __kernel void maxpool_U32toF32( { src0 = read_imageui(input, coord_in); coord_in.x += dilation_x; - maxVal = max(src0, maxVal); + maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x; } } @@ -206,7 +206,7 @@ __kernel void maxpool_F32toU32( { src0 = read_imagef(input, coord_in); coord_in.x += dilation_x; - maxVal = max(src0, maxVal); + maxVal.x = src0.x > maxVal.x ? 
src0.x : maxVal.x; } } diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl index 0e6166c4d..dc2038920 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl @@ -51,7 +51,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \ float right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -139,7 +139,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag uint left_elem = local_data[left_id]; \ uint right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -227,7 +227,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag int left_elem = local_data[left_id]; \ int right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -315,7 +315,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \ float right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -403,7 +403,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \ float right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl new file mode 100644 index 000000000..0eae5ab2f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl @@ -0,0 +1,368 @@ + +#define BITONIC_STEP(dtype) \ +void bitonic_step_##dtype(uint num_stages, int lx, \ + __local dtype *local_data, __local int *local_indices) \ +{ \ + for (uint stage = 0; stage < num_stages + 1; ++stage) \ + { \ + uint signo = (lx >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + int left_idx = local_indices[left_id]; \ + int right_idx = local_indices[right_id]; \ + \ + dtype left_elem = local_data[left_id]; \ + dtype right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + 
barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ +} +BITONIC_STEP(int) +BITONIC_STEP(uint) + +#define BITONIC_STEP_ASCEND(dtype) \ +void bitonic_step_ascend_##dtype(uint num_stages, int lx, \ + __local dtype *p_share_k, __local int *p_share_v) \ +{ \ + for (uint stage = 0; stage < num_stages + 1; ++stage) \ + { \ + uint signo = (lx >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + int left_idx = p_share_v[left_id]; \ + int right_idx = p_share_v[right_id]; \ + \ + dtype left_elem = p_share_k[left_id]; \ + dtype right_elem = p_share_k[right_id]; \ + \ + if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \ + { \ + p_share_k[left_id] = right_elem; \ + p_share_k[right_id] = left_elem; \ + \ + p_share_v[left_id] = right_idx; \ + p_share_v[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ +} +BITONIC_STEP_ASCEND(int) +BITONIC_STEP_ASCEND(uint) + +#define BITONIC_MERGE(dtype) \ +void bitonic_merge_##dtype(uint num_stages, int lx, \ + __local dtype *local_data, __local int *local_indices) \ +{ \ + uint stage = num_stages; \ + uint signo = (lx >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + int left_idx = local_indices[left_id]; \ + int right_idx = local_indices[right_id]; \ + \ + dtype left_elem = local_data[left_id]; \ + dtype right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ +} +BITONIC_MERGE(int) +BITONIC_MERGE(uint) + +#define BLOCK_SIZE (512) + +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32 +( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t indices, + float input_scale, + float input_tail, + float output_scale, + float output_tail, + int _num_stages, + int width + ) + { + uint lx = get_local_id(0); + const int init_k = -2147483647; + const int init_v = -2147483647; + const int num_stages = 9; + const int threads_per_block = BLOCK_SIZE; + const int index_minus_1 = threads_per_block * 2 - 1; + uint offset = 0; + uint lx1 = lx + threads_per_block; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + __local int local_data[1536]; + __local int local_indices[1536]; + + int left = read_imagei(input, coord.xy).x; + coord.z += threads_per_block; + int right = read_imagei(input, coord.zy).x; + + local_data[lx] = left; + local_indices[lx] = coord.x; + local_data[lx1] = right; + local_indices[lx1] = coord.z; + + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_int(num_stages, lx, local_data, local_indices); + + int min_data = local_data[511]; + + int *p_share_k = local_data + threads_per_block; + int *p_share_v = local_indices + threads_per_block; + + int limit = (width >> 
10) << 10; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + barrier(CLK_LOCAL_MEM_FENCE); + + for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2) + { + int2 data; + coord.z = coord.x + threads_per_block; + data.x = read_imagei(input, coord.xy).x; + data.y = read_imagei(input, coord.zy).x; + + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = data.y; + p_share_v[lx1] = coord.z; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v); + + if (p_share_k[index_minus_1] < min_data) + { + continue; + } + + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_merge_int(num_stages, lx, local_data, local_indices); + + min_data = local_data[511]; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + } + + if (width > limit) + { + if (coord.x < width) + { + int2 data; + data.x = read_imagei(input, coord.xy).x; + coord.z = coord.x + threads_per_block; + data.y = read_imagei(input, coord.zy).x; + + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = coord.z < width ? data.y : init_k; + p_share_v[lx1] = coord.z < width ? coord.z : init_v; + } + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v); + + if (p_share_k[index_minus_1] >= min_data) + { + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + bitonic_merge_int(num_stages, lx, local_data, local_indices); + } + } + + int4 dst; + dst.x = local_data[lx]; + + coord.x = lx; + write_imagei(output, coord.xy, dst.xxxx); + + int4 index; + index.x = local_indices[lx]; + + write_imagei(indices, coord.xy, index.xxxx); +} + +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32 +( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t indices, + float input_scale, + float input_tail, + float output_scale, + float output_tail, + int _num_stages, + int width + ) + { + uint lx = get_local_id(0); + const uint init_k = 0; + const int init_v = -2147483647; + const int num_stages = 9; + const int threads_per_block = BLOCK_SIZE; + const int index_minus_1 = threads_per_block * 2 - 1; + uint offset = 0; + uint lx1 = lx + threads_per_block; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + __local uint local_data[1536]; + __local int local_indices[1536]; + + uint left = read_imageui(input, coord.xy).x; + coord.z += threads_per_block; + uint right = read_imageui(input, coord.zy).x; + + local_data[lx] = left; + local_indices[lx] = coord.x; + local_data[lx1] = right; + local_indices[lx1] = coord.z; + + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_uint(num_stages, lx, local_data, local_indices); + + uint min_data = local_data[511]; + + uint *p_share_k = local_data + threads_per_block; + int *p_share_v = local_indices + threads_per_block; + + int limit = (width >> 10) << 10; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + barrier(CLK_LOCAL_MEM_FENCE); + + for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2) + { + uint2 data; + coord.z = coord.x + threads_per_block; + data.x = read_imageui(input, coord.xy).x; + data.y = read_imageui(input, coord.zy).x; 
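+        /* Stage the next 1024 candidates into the upper half of local memory.
+         * The ascending bitonic pass below orders them so the whole block can be
+         * skipped when even its largest key is below min_data (the current
+         * 512th-best value); otherwise its top half is merged into the kept results. */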
+ + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = data.y; + p_share_v[lx1] = coord.z; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v); + + if (p_share_k[index_minus_1] < min_data) + { + continue; + } + + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_merge_uint(num_stages, lx, local_data, local_indices); + + min_data = local_data[511]; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + } + + if (width > limit) + { + if (coord.x < width) + { + uint2 data; + data.x = read_imageui(input, coord.xy).x; + coord.z = coord.x + threads_per_block; + data.y = read_imageui(input, coord.zy).x; + + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = coord.z < width ? data.y : init_k; + p_share_v[lx1] = coord.z < width ? coord.z : init_v; + } + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v); + + if (p_share_k[index_minus_1] >= min_data) + { + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + bitonic_merge_uint(num_stages, lx, local_data, local_indices); + } + } + + uint4 dst; + dst.x = local_data[lx]; + + coord.x = lx; + write_imageui(output, coord.xy, dst.xxxx); + + int4 index; + index.x = local_indices[lx]; + + write_imagei(indices, coord.xy, index.xxxx); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl index beaaccbc8..d6eb98f1f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl @@ -28,12 +28,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd } __local int sorted[1]; - int width_minus_one = width - 1; - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); - - int x_start = lid * num_pixels_per_thread; - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); sorted[0] = 0; @@ -44,20 +38,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0; } int swapped = 0; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); // odd-even - coord.x = x_start; - coord.z = x_start + 1; - for (; coord.x < x_end; ) + coord.x = lid * 2; + coord.z = lid * 2 + 1; + for (; coord.z < width; ) { float4 left = read_imagef(input_t, coord.xy); float4 right = read_imagef(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imagef(input_t, coord.xy, right); @@ -67,21 +62,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } + barrier(CLK_GLOBAL_MEM_FENCE); // even-odd - coord.x = x_start + 1; - coord.z = x_start + 2; - for (; coord.x < x_end; ) + coord.x = lid * 2 + 1; + coord.z = lid * 2 + 2; + for (; coord.z < width; ) { float4 left = read_imagef(input_t, coord.xy); float4 right = 
read_imagef(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imagef(input_t, coord.xy, right); @@ -91,11 +88,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } atomic_add(sorted, swapped); - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); if (*sorted == 0) break; @@ -141,13 +138,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd } __local int sorted[1]; - int width_minus_one = width - 1; - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); - - int x_start = lid * num_pixels_per_thread; - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); - sorted[0] = 0; while (1) @@ -157,20 +147,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0; } int swapped = 0; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); // odd-even - coord.x = x_start; - coord.z = x_start + 1; - for (; coord.x < x_end; ) + coord.x = lid * 2; + coord.z = lid * 2 + 1; + for (; coord.z < width; ) { uint4 left = read_imageui(input_t, coord.xy); uint4 right = read_imageui(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imageui(input_t, coord.xy, right); @@ -180,21 +171,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } + barrier(CLK_GLOBAL_MEM_FENCE); // even-odd - coord.x = x_start + 1; - coord.z = x_start + 2; - for (; coord.x < x_end; ) + coord.x = lid * 2 + 1; + coord.z = lid * 2 + 2; + for (; coord.z < width; ) { uint4 left = read_imageui(input_t, coord.xy); uint4 right = read_imageui(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imageui(input_t, coord.xy, right); @@ -204,11 +197,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } atomic_add(sorted, swapped); - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); if (*sorted == 0) break; @@ -254,13 +247,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd } __local int sorted[1]; - int width_minus_one = width - 1; - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); - - int x_start = 
lid * num_pixels_per_thread; - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); - sorted[0] = 0; while (1) @@ -270,20 +256,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0; } int swapped = 0; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); // odd-even - coord.x = x_start; - coord.z = x_start + 1; - for (; coord.x < x_end; ) + coord.x = lid * 2; + coord.z = lid * 2 + 1; + for (; coord.z < width; ) { int4 left = read_imagei(input_t, coord.xy); int4 right = read_imagei(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imagei(input_t, coord.xy, right); @@ -293,21 +280,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } + barrier(CLK_GLOBAL_MEM_FENCE); // even-odd - coord.x = x_start + 1; - coord.z = x_start + 2; - for (; coord.x < x_end; ) + coord.x = lid * 2 + 1; + coord.z = lid * 2 + 2; + for (; coord.z < width; ) { int4 left = read_imagei(input_t, coord.xy); int4 right = read_imagei(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imagei(input_t, coord.xy, right); @@ -317,11 +306,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } atomic_add(sorted, swapped); - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); if (*sorted == 0) break; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort2.cl index 976da2063..27b0633b5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort2.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort2.cl @@ -28,12 +28,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd } __local int sorted[1]; - int width_minus_one = width - 1; - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); - - int x_start = lid * num_pixels_per_thread; - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); sorted[0] = 0; @@ -44,20 +38,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0; } int swapped = 0; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); // odd-even - coord.x = x_start; - coord.z = x_start + 1; - for (; coord.x < x_end; ) + coord.x = lid * 2; + coord.z = lid * 2 + 1; + for (; coord.z < width; ) { float4 left = read_imagef(input_t, coord.xy); float4 right = read_imagef(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == 
right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imagef(input_t, coord.xy, right); @@ -67,21 +62,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } + barrier(CLK_GLOBAL_MEM_FENCE); // even-odd - coord.x = x_start + 1; - coord.z = x_start + 2; - for (; coord.x < x_end; ) + coord.x = lid * 2 + 1; + coord.z = lid * 2 + 2; + for (; coord.z < width; ) { float4 left = read_imagef(input_t, coord.xy); float4 right = read_imagef(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imagef(input_t, coord.xy, right); @@ -91,11 +88,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } atomic_add(sorted, swapped); - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); if (*sorted == 0) break; @@ -143,13 +140,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd } __local int sorted[1]; - int width_minus_one = width - 1; - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); - - int x_start = lid * num_pixels_per_thread; - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); - sorted[0] = 0; while (1) @@ -159,20 +149,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0; } int swapped = 0; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); // odd-even - coord.x = x_start; - coord.z = x_start + 1; - for (; coord.x < x_end; ) + coord.x = lid * 2; + coord.z = lid * 2 + 1; + for (; coord.z < width; ) { float4 left = read_imagef(input_t, coord.xy); float4 right = read_imagef(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imagef(input_t, coord.xy, right); @@ -182,18 +173,22 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } + barrier(CLK_GLOBAL_MEM_FENCE); // even-odd - coord.x = x_start + 1; - coord.z = x_start + 2; - for (; coord.x < x_end; ) + coord.x = lid * 2 + 1; + coord.z = lid * 2 + 2; + for (; coord.z < width; ) { float4 left = read_imagef(input_t, coord.xy); float4 right = read_imagef(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { int4 l_index = read_imagei(indices_t, coord.xy); int4 r_index = read_imagei(indices_t, coord.zy); @@ -206,11 +201,11 @@ __kernel 
__attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } atomic_add(sorted, swapped); - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); if (*sorted == 0) break; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_BF16_to_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_BF16_to_BF16.vx new file mode 100644 index 000000000..666950162 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_BF16_to_BF16.vx @@ -0,0 +1,171 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniConvBF16toF32_even_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_odd_2x8; + +_viv_uniform VXC_512Bits uniBF16toFp32_part0_2x8; +_viv_uniform VXC_512Bits uniBF16toFp32_part1_2x8; +_viv_uniform float4 span_wh; +_viv_uniform float4 min_val_wh; + +#define GRID_SAMPLE_BF16_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy0 = fabs(fxy0 - min_val_wh); \ + fxy1 = fabs(fxy1 - min_val_wh); \ + float4 flips_xy0 = floor(fxy0 / span_wh); \ + float4 flips_xy1 = floor(fxy1 / span_wh); \ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \ + int4 flips_int_xy0 = convert_int4(flips_xy0); \ + int4 flips_int_xy1 = convert_int4(flips_xy1); \ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \ + fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 top; \ + vxc_short8 bottom; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + float4 left4; \ + float4 right4; 
\ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + vxc_ushort8 tmp, dst; \ + while (coord_in.z < loop) \ + { \ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \ + _viv_asm(COPY, right4, src, 16); \ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \ + _viv_asm(COPY, left4, src, 16); \ + right4 -= left4; \ + top4 = right4 * x_lerp + left4; \ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \ + _viv_asm(COPY, right4, src, 16); \ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \ + _viv_asm(COPY, left4, src, 16); \ + right4 -= left4; \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + _viv_asm(COPY, tmp, dst4, 16); \ + dst.s0123 = tmp.s1357; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \ + _viv_asm(COPY, right4, src, 16); \ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \ + _viv_asm(COPY, left4, src, 16); \ + right4 -= left4; \ + top4 = right4 * x_lerp + left4; \ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \ + _viv_asm(COPY, right4, src, 16); \ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \ + _viv_asm(COPY, left4, src, 16); \ + right4 -= left4; \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + _viv_asm(COPY, tmp, dst4, 16); \ + dst.s0123 = tmp.s1357; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + + +__kernel void 
bilinear_grid_sample_reflect_BF16_BF16toBF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + + float4 fxy0; + float4 fxy1; + + vxc_short8 src; + VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part0_2x8); + _viv_asm(COPY, fxy0, src, 16); + VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part1_2x8); + _viv_asm(COPY, fxy1, src, 16); + + + + GRID_SAMPLE_BF16_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_F16.vx new file mode 100644 index 000000000..db6ff35e8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_F16.vx @@ -0,0 +1,217 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4; +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4; +_viv_uniform VXC_512Bits uniExtactHalf8_2x8; +_viv_uniform float4 span_wh; +_viv_uniform float4 min_val_wh; + +#define GRID_SAMPLE_F16_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy0 = fabs(fxy0 - min_val_wh); \ + fxy1 = fabs(fxy1 - min_val_wh); \ + float4 flips_xy0 = floor(fxy0 / span_wh); \ + float4 flips_xy1 = floor(fxy1 / span_wh); \ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \ + int4 flips_int_xy0 = convert_int4(flips_xy0); \ + int4 flips_int_xy1 = convert_int4(flips_xy1); \ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \ + fxy1 = ((flips_int_xy1 % 2) == 0) ? 
min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 t0; \ + vxc_short8 b0; \ + vxc_half8 top; \ + vxc_half8 bottom; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, top, t0, 16); \ + _viv_asm(COPY, bottom, b0, 16); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + half4 tmp; \ + _viv_asm(CONV, tmp, dst4); \ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \ + vxc_short4 result; \ + _viv_asm(COPY, result, top, 8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + 
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, top, t0, 16); \ + _viv_asm(COPY, bottom, b0, 16); \ + } \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + half4 tmp; \ + _viv_asm(CONV, tmp, dst4); \ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \ + vxc_short4 result; \ + _viv_asm(COPY, result, top, 8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + +__kernel void bilinear_grid_sample_reflect_F16_F32toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + coord_in1.z = coord_in1.z + 4; + + float4 fxy0 = read_imagef(input1, coord_in1.xy); + float4 fxy1 = read_imagef(input1, coord_in1.zw); + + GRID_SAMPLE_F16_PROCESS(); + +} + +_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; + +__kernel void bilinear_grid_sample_reflect_F16_U8toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + coord_in1.xz = coord_in1.xz * 2; + vxc_uchar16 read_coord; + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 fxy0; + float4 fxy1; + unsigned char input1ZP; + _viv_asm(COPY, input1ZP, input1_ZP, 4); + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + fxy0 = fxy0 * input1Scale; + fxy1 = fxy1 * input1Scale; + + GRID_SAMPLE_F16_PROCESS(); + +} + + +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; + +__kernel void bilinear_grid_sample_reflect_F16_F16toF16( + 
__read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_half8 read_coord; + + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, read_coord, read_val, 16); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + GRID_SAMPLE_F16_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_U8.vx new file mode 100644 index 000000000..46f31ac75 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_U8.vx @@ -0,0 +1,224 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4; +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; +_viv_uniform float4 span_wh; +_viv_uniform float4 min_val_wh; + +#define GRID_SAMPLE_F16_to_U8_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy0 = fabs(fxy0 - min_val_wh); \ + fxy1 = fabs(fxy1 - min_val_wh); \ + float4 flips_xy0 = floor(fxy0 / span_wh); \ + float4 flips_xy1 = floor(fxy1 / span_wh); \ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \ + int4 flips_int_xy0 = convert_int4(flips_xy0); \ + int4 flips_int_xy1 = convert_int4(flips_xy1); \ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \ + fxy1 = ((flips_int_xy1 % 2) == 0) ? 
min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 t0; \ + vxc_short8 b0; \ + vxc_uchar16 result; \ + vxc_half8 top; \ + vxc_half8 bottom; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, top, t0, 16); \ + _viv_asm(COPY, bottom, b0, 16); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * uint8Scale + output_ZP; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + 
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, top, t0, 16); \ + _viv_asm(COPY, bottom, b0, 16); \ + } \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * uint8Scale + output_ZP; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + +__kernel void bilinear_grid_sample_reflect_F16_F32toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + coord_in1.z = coord_in1.z + 4; + + float4 fxy0 = read_imagef(input1, coord_in1.xy); + float4 fxy1 = read_imagef(input1, coord_in1.zw); + GRID_SAMPLE_F16_to_U8_PROCESS(); + +} + +_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; + + +__kernel void bilinear_grid_sample_reflect_F16_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_uchar16 read_coord; + + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + unsigned char input1ZP; + _viv_asm(COPY, input1ZP, input1_ZP, 4); + + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1Scale; + fxy1 = fxy1 * input1Scale; + + GRID_SAMPLE_F16_to_U8_PROCESS(); + +} + +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; + +__kernel void 
bilinear_grid_sample_reflect_F16_F16toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_half8 read_coord; + + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, read_coord, read_val, 16); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + GRID_SAMPLE_F16_to_U8_PROCESS(); + +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I16_to_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I16_to_I16.vx new file mode 100644 index 000000000..fc92c2cf6 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I16_to_I16.vx @@ -0,0 +1,160 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniRightSubLeft_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4; +_viv_uniform float input1_scale; +_viv_uniform float dfpScale; +_viv_uniform float4 span_wh; +_viv_uniform float4 min_val_wh; + +#define GRID_SAMPLE_I16_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy0 = fabs(fxy0 - min_val_wh); \ + fxy1 = fabs(fxy1 - min_val_wh); \ + float4 flips_xy0 = floor(fxy0 / span_wh); \ + float4 flips_xy1 = floor(fxy1 / span_wh); \ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \ + int4 flips_int_xy0 = convert_int4(flips_xy0); \ + int4 flips_int_xy1 = convert_int4(flips_xy1); \ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \ + fxy1 = ((flips_int_xy1 % 2) == 0) ? 
min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 top; \ + vxc_short8 bottom; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * dfpScale; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 
0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * dfpScale; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + +__kernel void bilinear_grid_sample_reflect_I16_I16toI16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + vxc_short8 read_coord; + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1_scale; + fxy1 = fxy1 * input1_scale; + + GRID_SAMPLE_I16_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I8_to_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I8_to_I8.vx new file mode 100644 index 000000000..999ff9018 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I8_to_I8.vx @@ -0,0 +1,160 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniRightSubLeft_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4; +_viv_uniform float input1_scale; +_viv_uniform float dfpScale; +_viv_uniform float4 span_wh; +_viv_uniform float4 min_val_wh; + +#define GRID_SAMPLE_I8_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy0 = fabs(fxy0 - min_val_wh); \ + fxy1 = fabs(fxy1 - min_val_wh); \ + float4 flips_xy0 = floor(fxy0 / 
span_wh); \ + float4 flips_xy1 = floor(fxy1 / span_wh); \ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \ + int4 flips_int_xy0 = convert_int4(flips_xy0); \ + int4 flips_int_xy1 = convert_int4(flips_xy1); \ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \ + fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_char16 top; \ + vxc_char16 bottom; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * dfpScale; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, top, 
input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * dfpScale; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + +__kernel void bilinear_grid_sample_reflect_I8_I8toI8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + vxc_char16 read_coord; + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1_scale; + fxy1 = fxy1 * input1_scale; + + GRID_SAMPLE_I8_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_U8_to_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_U8_to_U8.vx new file mode 100644 index 000000000..00c343722 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_U8_to_U8.vx @@ -0,0 +1,224 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; + +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform int input_ZP; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; 
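+
+// GRID_SAMPLE_U8_PROCESS below implements reflection padding: each grid coordinate is
+// first scaled into input space (half_input0_wh / add_float_value), then folded back
+// into [min_val_wh, min_val_wh + span_wh] by mirroring:
+//   d     = fabs(coord - min_val_wh)
+//   flips = floor(d / span_wh), extra = d - flips * span_wh
+//   coord = (flips even) ? min_val_wh + extra : min_val_wh + (span_wh - extra)
+// The four bilinear neighbours are then gathered per output pixel, interpolated in fp32,
+// and requantized with uint8Scale / output_ZP.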
+_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; +_viv_uniform float4 span_wh; +_viv_uniform float4 min_val_wh; + +#define GRID_SAMPLE_U8_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy0 = fabs(fxy0 - min_val_wh); \ + fxy1 = fabs(fxy1 - min_val_wh); \ + float4 flips_xy0 = floor(fxy0 / span_wh); \ + float4 flips_xy1 = floor(fxy1 / span_wh); \ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \ + int4 flips_int_xy0 = convert_int4(flips_xy0); \ + int4 flips_int_xy1 = convert_int4(flips_xy1); \ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \ + fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_uchar16 top; \ + vxc_uchar16 bottom; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + unsigned char inputZP; \ + _viv_asm(COPY, inputZP, input_ZP, 4); \ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniU8RightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * uint8Scale + output_ZP; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + unsigned char inputZP; \ + _viv_asm(COPY, inputZP, input_ZP, 4); \ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * uint8Scale + output_ZP; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + +__kernel void bilinear_grid_sample_reflect_U8_F32toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + coord_in1.z = coord_in1.z + 4; + + float4 fxy0 = read_imagef(input1, coord_in1.xy); + float4 fxy1 = read_imagef(input1, coord_in1.zw); + GRID_SAMPLE_U8_PROCESS(); + +} + + +__kernel void bilinear_grid_sample_reflect_U8_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + 
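+    // The grid input here is a quantized U8 tensor: the coordinates are dequantized with
+    // input1_ZP / input1Scale below before the reflect mapping in GRID_SAMPLE_U8_PROCESS.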
+ vxc_uchar16 read_coord; + + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + unsigned char input1ZP; + _viv_asm(COPY, input1ZP, input1_ZP, 4); + + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1Scale; + fxy1 = fxy1 * input1Scale; + + GRID_SAMPLE_U8_PROCESS(); + +} + +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; + +__kernel void bilinear_grid_sample_reflect_U8_F16toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_half8 read_coord; + + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, read_coord, read_val, 16); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + GRID_SAMPLE_U8_PROCESS(); + +} + diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 68763cc7e..d19b5191b 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -3236,6 +3236,1174 @@ __kernel void bilinear_grid_sample_U8_F16toU8(\n\ \n\ "; /* end of bilinear_grid_sample_U8_to_U8_vx*/ +static const char bilinear_grid_sample_reflect_BF16_to_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_even_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_odd_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniBF16toFp32_part0_2x8;\n\ +_viv_uniform VXC_512Bits uniBF16toFp32_part1_2x8;\n\ +_viv_uniform float4 span_wh;\n\ +_viv_uniform float4 min_val_wh;\n\ +\n\ +#define GRID_SAMPLE_BF16_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy0 = fabs(fxy0 - min_val_wh); \\\n\ + fxy1 = fabs(fxy1 - min_val_wh); \\\n\ + float4 flips_xy0 = floor(fxy0 / span_wh); \\\n\ + float4 flips_xy1 = floor(fxy1 / span_wh); \\\n\ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \\\n\ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \\\n\ + int4 flips_int_xy0 = convert_int4(flips_xy0); \\\n\ + int4 flips_int_xy1 = convert_int4(flips_xy1); \\\n\ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \\\n\ + fxy1 = ((flips_int_xy1 % 2) == 0) ? 
min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 top; \\\n\ + vxc_short8 bottom; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + vxc_ushort8 tmp, dst; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \\\n\ + _viv_asm(COPY, right4, src, 16); \\\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \\\n\ + _viv_asm(COPY, left4, src, 16); \\\n\ + right4 -= left4; \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \\\n\ + _viv_asm(COPY, right4, src, 16); \\\n\ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \\\n\ + _viv_asm(COPY, left4, src, 16); \\\n\ + right4 -= left4; \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + _viv_asm(COPY, tmp, dst4, 16); \\\n\ + dst.s0123 = tmp.s1357; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, top, input0, 
coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \\\n\ + _viv_asm(COPY, right4, src, 16); \\\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \\\n\ + _viv_asm(COPY, left4, src, 16); \\\n\ + right4 -= left4; \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \\\n\ + _viv_asm(COPY, right4, src, 16); \\\n\ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \\\n\ + _viv_asm(COPY, left4, src, 16); \\\n\ + right4 -= left4; \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + _viv_asm(COPY, tmp, dst4, 16); \\\n\ + dst.s0123 = tmp.s1357; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_BF16_BF16toBF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + vxc_short8 src;\n\ + VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part0_2x8);\n\ + _viv_asm(COPY, fxy0, src, 16);\n\ + VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part1_2x8);\n\ + _viv_asm(COPY, fxy1, src, 16);\n\ +\n\ +\n\ +\n\ + GRID_SAMPLE_BF16_PROCESS();\n\ +\n\ +}\n\ +"; /* end of bilinear_grid_sample_reflect_BF16_to_BF16_vx*/ + +static const char bilinear_grid_sample_reflect_F16_to_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform 
VXC_512Bits uniEvenBintoFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;\n\ +_viv_uniform VXC_512Bits uniExtactHalf8_2x8;\n\ +_viv_uniform float4 span_wh;\n\ +_viv_uniform float4 min_val_wh;\n\ +\n\ +#define GRID_SAMPLE_F16_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy0 = fabs(fxy0 - min_val_wh); \\\n\ + fxy1 = fabs(fxy1 - min_val_wh); \\\n\ + float4 flips_xy0 = floor(fxy0 / span_wh); \\\n\ + float4 flips_xy1 = floor(fxy1 / span_wh); \\\n\ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \\\n\ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \\\n\ + int4 flips_int_xy0 = convert_int4(flips_xy0); \\\n\ + int4 flips_int_xy1 = convert_int4(flips_xy1); \\\n\ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \\\n\ + fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 t0; \\\n\ + vxc_short8 b0; \\\n\ + vxc_half8 top; \\\n\ + vxc_half8 bottom; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, top, t0, 16); \\\n\ + _viv_asm(COPY, bottom, b0, 16); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + 
top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + half4 tmp; \\\n\ + _viv_asm(CONV, tmp, dst4); \\\n\ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \\\n\ + vxc_short4 result; \\\n\ + _viv_asm(COPY, result, top, 8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, top, t0, 16); \\\n\ + _viv_asm(COPY, bottom, b0, 16); \\\n\ + } \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + half4 tmp; \\\n\ + _viv_asm(CONV, tmp, dst4); \\\n\ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \\\n\ + vxc_short4 result; \\\n\ + _viv_asm(COPY, result, top, 8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_F16_F32toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + coord_in1.z = 
coord_in1.z + 4;\n\ +\n\ + float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\ + float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\ +\n\ + GRID_SAMPLE_F16_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_F16_U8toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + vxc_uchar16 read_coord;\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ + unsigned char input1ZP;\n\ + _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\ + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ + fxy0 = fxy0 * input1Scale;\n\ + fxy1 = fxy1 * input1Scale;\n\ +\n\ + GRID_SAMPLE_F16_PROCESS();\n\ +\n\ +}\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_F16_F16toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_half8 read_coord;\n\ +\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, read_coord, read_val, 16);\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + GRID_SAMPLE_F16_PROCESS();\n\ +\n\ +}\n\ +"; /* end of bilinear_grid_sample_reflect_F16_to_F16_vx*/ + +static const char bilinear_grid_sample_reflect_F16_to_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float4 span_wh;\n\ +_viv_uniform float4 min_val_wh;\n\ +\n\ +#define GRID_SAMPLE_F16_to_U8_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy0 = fabs(fxy0 - min_val_wh); \\\n\ + fxy1 = fabs(fxy1 - min_val_wh); \\\n\ + float4 flips_xy0 = floor(fxy0 / span_wh); \\\n\ + float4 flips_xy1 = floor(fxy1 / span_wh); \\\n\ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \\\n\ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \\\n\ + int4 flips_int_xy0 = convert_int4(flips_xy0); \\\n\ + int4 flips_int_xy1 = convert_int4(flips_xy1); \\\n\ + fxy0 = 
((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \\\n\ + fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 t0; \\\n\ + vxc_short8 b0; \\\n\ + vxc_uchar16 result; \\\n\ + vxc_half8 top; \\\n\ + vxc_half8 bottom; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, top, t0, 16); \\\n\ + _viv_asm(COPY, bottom, b0, 16); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + 
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, top, t0, 16); \\\n\ + _viv_asm(COPY, bottom, b0, 16); \\\n\ + } \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_F16_F32toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + coord_in1.z = coord_in1.z + 4;\n\ +\n\ + float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\ + float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\ + GRID_SAMPLE_F16_to_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_F16_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_uchar16 read_coord;\n\ +\n\ + 
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + unsigned char input1ZP;\n\ + _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1Scale;\n\ + fxy1 = fxy1 * input1Scale;\n\ +\n\ + GRID_SAMPLE_F16_to_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_F16_F16toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_half8 read_coord;\n\ +\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, read_coord, read_val, 16);\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + GRID_SAMPLE_F16_to_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +"; /* end of bilinear_grid_sample_reflect_F16_to_U8_vx*/ + +static const char bilinear_grid_sample_reflect_I16_to_I16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform float dfpScale;\n\ +_viv_uniform float4 span_wh;\n\ +_viv_uniform float4 min_val_wh;\n\ +\n\ +#define GRID_SAMPLE_I16_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy0 = fabs(fxy0 - min_val_wh); \\\n\ + fxy1 = fabs(fxy1 - min_val_wh); \\\n\ + float4 flips_xy0 = floor(fxy0 / span_wh); \\\n\ + float4 flips_xy1 = floor(fxy1 / span_wh); \\\n\ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \\\n\ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \\\n\ + int4 flips_int_xy0 = convert_int4(flips_xy0); \\\n\ + int4 flips_int_xy1 = convert_int4(flips_xy1); \\\n\ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \\\n\ + fxy1 = ((flips_int_xy1 % 2) == 0) ? 
min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 top; \\\n\ + vxc_short8 bottom; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * dfpScale; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, 
bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * dfpScale; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_I16_I16toI16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + vxc_short8 read_coord;\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1_scale;\n\ + fxy1 = fxy1 * input1_scale;\n\ +\n\ + GRID_SAMPLE_I16_PROCESS();\n\ +\n\ +}\n\ +"; /* end of bilinear_grid_sample_reflect_I16_to_I16_vx*/ + +static const char bilinear_grid_sample_reflect_I8_to_I8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform float dfpScale;\n\ 
+_viv_uniform float4 span_wh;\n\ +_viv_uniform float4 min_val_wh;\n\ +\n\ +#define GRID_SAMPLE_I8_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy0 = fabs(fxy0 - min_val_wh); \\\n\ + fxy1 = fabs(fxy1 - min_val_wh); \\\n\ + float4 flips_xy0 = floor(fxy0 / span_wh); \\\n\ + float4 flips_xy1 = floor(fxy1 / span_wh); \\\n\ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \\\n\ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \\\n\ + int4 flips_int_xy0 = convert_int4(flips_xy0); \\\n\ + int4 flips_int_xy1 = convert_int4(flips_xy1); \\\n\ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \\\n\ + fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_char16 top; \\\n\ + vxc_char16 bottom; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * dfpScale; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * dfpScale; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_I8_I8toI8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + vxc_char16 read_coord;\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1_scale;\n\ + fxy1 = fxy1 * input1_scale;\n\ +\n\ + GRID_SAMPLE_I8_PROCESS();\n\ +\n\ +}\n\ +"; /* end of bilinear_grid_sample_reflect_I8_to_I8_vx*/ + +static const char bilinear_grid_sample_reflect_U8_to_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform int input_ZP;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +_viv_uniform float4 span_wh;\n\ +_viv_uniform float4 min_val_wh;\n\ +\n\ +#define GRID_SAMPLE_U8_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy0 = fabs(fxy0 - min_val_wh); \\\n\ + fxy1 = fabs(fxy1 - min_val_wh); \\\n\ + float4 flips_xy0 = floor(fxy0 / span_wh); \\\n\ + float4 flips_xy1 = floor(fxy1 / span_wh); \\\n\ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \\\n\ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \\\n\ + int4 flips_int_xy0 = convert_int4(flips_xy0); \\\n\ + int4 flips_int_xy1 = convert_int4(flips_xy1); \\\n\ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \\\n\ + fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_uchar16 top; \\\n\ + vxc_uchar16 bottom; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, 
\\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + unsigned char inputZP; \\\n\ + _viv_asm(COPY, inputZP, input_ZP, 4); \\\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + unsigned char inputZP; \\\n\ + _viv_asm(COPY, inputZP, input_ZP, 4); \\\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + 
bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_U8_F32toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + coord_in1.z = coord_in1.z + 4;\n\ +\n\ + float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\ + float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\ + GRID_SAMPLE_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_U8_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_uchar16 read_coord;\n\ +\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + unsigned char input1ZP;\n\ + _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1Scale;\n\ + fxy1 = fxy1 * input1Scale;\n\ +\n\ + GRID_SAMPLE_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_U8_F16toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_half8 read_coord;\n\ +\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, read_coord, read_val, 16);\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + GRID_SAMPLE_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +"; /* end of bilinear_grid_sample_reflect_U8_to_U8_vx*/ + static const char bucketize_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniDataConvert_0_4x4;\n\ @@ -58353,6 +59521,176 @@ __kernel void bilinear_grid_sample_U8_U8toU8(\n\ \n\ }"; /* end of bilinear_grid_sample_cl*/ +static const char bilinear_grid_sample_reflect_cl[] = "__kernel void bilinear_grid_sample_reflect_F32_F32toF32(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float 
half_input0_w,\n\ + float half_input0_h,\n\ + float add_float_value_w,\n\ + float add_float_value_h,\n\ + int depth,\n\ + float min_val_w,\n\ + float span_w,\n\ + float min_val_h,\n\ + float span_h\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));\n\ + int2 coord_add = (int2)(-1, 1);\n\ +\n\ + float fx = read_imagef(input1, coord_in1).x;\n\ + coord_in1.x = coord_in1.x + 1;\n\ + float fy = read_imagef(input1, coord_in1).x;\n\ +\n\ + fx = fx * half_input0_w + add_float_value_w;\n\ + fy = fy * half_input0_h + add_float_value_h;\n\ +\n\ + if (span_w > 0)\n\ + {\n\ + fx = fabs(fx - min_val_w);\n\ + int flips_x = (int)(fx / span_w);\n\ + float extra_x = fx - flips_x * span_w;\n\ + fx = (flips_x & 0x01) ? min_val_w + (span_w - extra_x) : min_val_w + extra_x ;\n\ + }\n\ + else\n\ + {\n\ + fx = 0;\n\ + }\n\ +\n\ + if (span_h > 0)\n\ + {\n\ + fy = fabs(fy - min_val_h);\n\ + int flips_y = (int)(fy / span_h);\n\ + float extra_y = fy - flips_y * span_h;\n\ + fy = (flips_y & 0x01) ? min_val_h + (span_h - extra_y) : min_val_h + extra_y ;\n\ + }\n\ + else\n\ + {\n\ + fy = 0;\n\ + }\n\ +\n\ + float x_f = floor(fx);\n\ + float y_f = floor(fy);\n\ + float x_lerp = fx - x_f;\n\ + float y_lerp = fy - y_f;\n\ + int x_index = convert_int(x_f);\n\ + int y_index = convert_int(y_f);\n\ + int4 coord_in = (int4)(x_index, y_index, 0, 0);\n\ +\n\ + float4 top_l, top_r, bottom_l, bottom_r, top, bottom, dst;\n\ +\n\ + while (coord_in.z < depth){\n\ + top_l = read_imagef(input0, coord_in);\n\ + coord_in.y++;\n\ + bottom_l = read_imagef(input0, coord_in);\n\ + coord_in.x++;\n\ + bottom_r = read_imagef(input0, coord_in);\n\ + coord_in.y--;\n\ + top_r = read_imagef(input0, coord_in);\n\ + top_r = top_r - top_l;\n\ + top = top_l + x_lerp * top_r;\n\ + bottom_r = bottom_r - bottom_l;\n\ + bottom = bottom_l + x_lerp * bottom_r;\n\ + bottom = bottom - top;\n\ + dst = top + y_lerp * bottom;\n\ + write_imagef(output, coord_out, dst);\n\ + coord_in.xz = coord_in.xz + coord_add;\n\ + coord_out.z++;\n\ + }\n\ +}\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_U8_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float half_input0_w,\n\ + float half_input0_h,\n\ + float add_float_value_w,\n\ + float add_float_value_h,\n\ + int depth,\n\ + float in0_scale,\n\ + float in0_tail,\n\ + float in1_scale,\n\ + float in1_tail,\n\ + float out_scale,\n\ + float out_tail,\n\ + float min_val_w,\n\ + float span_w,\n\ + float min_val_h,\n\ + float span_h\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));\n\ + int2 coord_add = (int2)(-1, 1);\n\ +\n\ + float fx = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;\n\ + coord_in1.x = coord_in1.x + 1;\n\ + float fy = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;\n\ +\n\ + fx = fx * half_input0_w + add_float_value_w;\n\ + fy = fy * half_input0_h + add_float_value_h;\n\ +\n\ + if (span_w > 0)\n\ + {\n\ + fx = fabs(fx - min_val_w);\n\ + int flips_x = (int)(fx / span_w);\n\ + float extra_x = fx - flips_x * span_w;\n\ + fx = (flips_x & 0x01) ? 
min_val_w + (span_w - extra_x) : min_val_w + extra_x ;\n\ + }\n\ + else\n\ + {\n\ + fx = 0;\n\ + }\n\ +\n\ + if (span_h > 0)\n\ + {\n\ + fy = fabs(fy - min_val_h);\n\ + int flips_y = (int)(fy / span_h);\n\ + float extra_y = fy - flips_y * span_h;\n\ + fy = (flips_y & 0x01) ? min_val_h + (span_h - extra_y) : min_val_h + extra_y ;\n\ + }\n\ + else\n\ + {\n\ + fy = 0;\n\ + }\n\ +\n\ + float x_f = floor(fx);\n\ + float y_f = floor(fy);\n\ + float x_lerp = fx - x_f;\n\ + float y_lerp = fy - y_f;\n\ + int x_index = convert_int(x_f);\n\ + int y_index = convert_int(y_f);\n\ + int4 coord_in = (int4)(x_index, y_index, 0, 0);\n\ +\n\ + float4 top_l, top_r, bottom_l, bottom_r, top, bottom;\n\ + uint4 dst;\n\ +\n\ + while (coord_in.z < depth){\n\ + top_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;\n\ + coord_in.y++;\n\ + bottom_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;\n\ + coord_in.x++;\n\ + bottom_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;\n\ + coord_in.y--;\n\ + top_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;\n\ + top_r = top_r - top_l;\n\ + top = top_l + x_lerp * top_r;\n\ + bottom_r = bottom_r - bottom_l;\n\ + bottom = bottom_l + x_lerp * bottom_r;\n\ + bottom = bottom - top;\n\ + top = top + y_lerp * bottom;\n\ + dst = convert_uint4_rte(top * out_scale + out_tail);\n\ + write_imageui(output, coord_out, dst);\n\ + coord_in.xz = coord_in.xz + coord_add;\n\ + coord_out.z++;\n\ + }\n\ +\n\ +}"; /* end of bilinear_grid_sample_reflect_cl*/ + static const char bucketize_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ \n\ #define BUCKETIZE_F32_2D_SH_IMPL(name, comp_op) \\\n\ @@ -60238,108 +61576,6 @@ __kernel void depth2space_crd_F32toF32(\n\ }\n\ "; /* end of depth2space_crd_cl*/ -static const char detect_post_box_cl[] = "float exp_(float x, float logE)\n\ -{\n\ - x *= logE;\n\ - x = exp2(x);\n\ - return x;\n\ -}\n\ -\n\ -__kernel void detect_post_box_F32_F32toF32(\n\ - __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_array_t output,\n\ - float inv_scale_y,\n\ - float inv_scale_x,\n\ - float inv_scale_h,\n\ - float inv_scale_w,\n\ - float logE)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ - float4 src0;\n\ - float4 src1;\n\ - float4 dst;\n\ - float4 tmp0, tmp1;\n\ - src0.x = read_imagef(input0, coord).x;\n\ - src1.x = read_imagef(input1, coord.xy).x;\n\ - coord.x++;\n\ - src0.y = read_imagef(input0, coord).x;\n\ - src1.y = read_imagef(input1, coord.xy).x;\n\ - coord.x++;\n\ - src0.z = read_imagef(input0, coord).x;\n\ - src1.z = read_imagef(input1, coord.xy).x;\n\ - coord.x++;\n\ - src0.w = read_imagef(input0, coord).x;\n\ - src1.w = read_imagef(input1, coord.xy).x;\n\ -\n\ - tmp0.x = src1.x + src1.z * src0.x * inv_scale_y;\n\ - tmp0.y = src1.y + src1.w * src0.y * inv_scale_x;\n\ - tmp1.x = src1.z * exp_(src0.z * inv_scale_h, logE) * 0.5f;\n\ - tmp1.y = src1.w * exp_(src0.w * inv_scale_w, logE) * 0.5f;\n\ - dst.xy = tmp0.xy - tmp1.xy;\n\ - dst.zw = tmp0.xy + tmp1.xy;\n\ - coord.x = 0;\n\ - write_imagef(output, coord, dst.xxxx);\n\ - coord.x++;\n\ - write_imagef(output, coord, dst.yyyy);\n\ - coord.x++;\n\ - write_imagef(output, coord, dst.zzzz);\n\ - coord.x++;\n\ - write_imagef(output, coord, dst.wwww);\n\ -}\n\ -\n\ -\n\ -__kernel void detect_post_box_U8_U8toF32(\n\ - __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_array_t output,\n\ - float 
inv_scale_y,\n\ - float inv_scale_x,\n\ - float inv_scale_h,\n\ - float inv_scale_w,\n\ - float logE,\n\ - float input0Tail,\n\ - float input1Tail,\n\ - float input0Scale,\n\ - float input1Scale)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ - uint4 in0, in1;\n\ - float4 src0;\n\ - float4 src1;\n\ - float4 dst;\n\ - float4 tmp0, tmp1;\n\ - in0.x = read_imageui(input0, coord).x;\n\ - in1.x = read_imageui(input1, coord.xy).x;\n\ - coord.x++;\n\ - in0.y = read_imageui(input0, coord).x;\n\ - in1.y = read_imageui(input1, coord.xy).x;\n\ - coord.x++;\n\ - in0.z = read_imageui(input0, coord).x;\n\ - in1.z = read_imageui(input1, coord.xy).x;\n\ - coord.x++;\n\ - in0.w = read_imageui(input0, coord).x;\n\ - in1.w = read_imageui(input1, coord.xy).x;\n\ -\n\ - src0 = convert_float4(in0) * input0Scale + input0Tail;\n\ - src1 = convert_float4(in1) * input1Scale + input1Tail;\n\ -\n\ - tmp0.x = src1.x + src1.z * src0.x * inv_scale_y;\n\ - tmp0.y = src1.y + src1.w * src0.y * inv_scale_x;\n\ - tmp1.x = src1.z * exp_(src0.z * inv_scale_h, logE) * 0.5f;\n\ - tmp1.y = src1.w * exp_(src0.w * inv_scale_w, logE) * 0.5f;\n\ - dst.xy = tmp0.xy - tmp1.xy;\n\ - dst.zw = tmp0.xy + tmp1.xy;\n\ - coord.x = 0;\n\ - write_imagef(output, coord, dst.xxxx);\n\ - coord.x++;\n\ - write_imagef(output, coord, dst.yyyy);\n\ - coord.x++;\n\ - write_imagef(output, coord, dst.zzzz);\n\ - coord.x++;\n\ - write_imagef(output, coord, dst.wwww);\n\ -}"; /* end of detect_post_box_cl*/ - static const char eltwise_ops_helper_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ #pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ \n\ @@ -70818,7 +72054,7 @@ __kernel void maximum_I32I32toI32_2D\n\ }\n\ "; /* end of maximum_cl*/ -static const char maxpool_cl[] = "#define VSI_FLOAT32_MIN (1.175494351e-38F)\n\ +static const char maxpool_cl[] = "#define VSI_FLOAT32_MIN (-3.40E+38)\n\ \n\ #define MAXPOOL_QINT(in_name, out_name, src_type, dst_type, max_val, read_func, write_func, conv_func) \\\n\ __kernel void maxpool_##in_name##to##out_name( \\\n\ @@ -70865,7 +72101,7 @@ __kernel void maxpool_##in_name##to##out_name( \\\n\ { \\\n\ src0 = read_func(input, coord_in); \\\n\ coord_in.x += dilation_x; \\\n\ - maxVal = max(src0, maxVal); \\\n\ + maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x; \\\n\ } \\\n\ } \\\n\ \\\n\ @@ -70921,7 +72157,7 @@ __kernel void maxpool_F32toF32(\n\ {\n\ src0 = read_imagef(input, coord_in);\n\ coord_in.x += dilation_x;\n\ - maxVal = max(src0, maxVal);\n\ + maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x;\n\ }\n\ }\n\ \n\ @@ -70972,7 +72208,7 @@ __kernel void maxpool_U32toF32(\n\ {\n\ src0 = read_imageui(input, coord_in);\n\ coord_in.x += dilation_x;\n\ - maxVal = max(src0, maxVal);\n\ + maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x;\n\ }\n\ }\n\ \n\ @@ -71026,7 +72262,7 @@ __kernel void maxpool_F32toU32(\n\ {\n\ src0 = read_imagef(input, coord_in);\n\ coord_in.x += dilation_x;\n\ - maxVal = max(src0, maxVal);\n\ + maxVal.x = src0.x > maxVal.x ? 
src0.x : maxVal.x;\n\ }\n\ }\n\ \n\ @@ -79273,7 +80509,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \\\n\ float right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -79361,7 +80597,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag uint left_elem = local_data[left_id]; \\\n\ uint right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -79449,7 +80685,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag int left_elem = local_data[left_id]; \\\n\ int right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -79537,7 +80773,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \\\n\ float right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -79625,7 +80861,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \\\n\ float right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -79660,6 +80896,376 @@ TOPK_F32toI32(1 << 4, 4)\n\ TOPK_F32toI32(1 << 5, 5)\n\ TOPK_F32toI32(1 << 6, 6)"; /* end of topk_cl*/ +static const char topk2_cl[] = "\n\ +#define BITONIC_STEP(dtype) \\\n\ +void bitonic_step_##dtype(uint num_stages, int lx, \\\n\ + __local dtype *local_data, __local int *local_indices) \\\n\ +{ \\\n\ + for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + uint signo = (lx >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + int left_idx = local_indices[left_id]; \\\n\ + int right_idx = local_indices[right_id]; \\\n\ + \\\n\ + dtype left_elem = local_data[left_id]; \\\n\ + dtype right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } 
\\\n\ + } \\\n\ +}\n\ +BITONIC_STEP(int)\n\ +BITONIC_STEP(uint)\n\ +\n\ +#define BITONIC_STEP_ASCEND(dtype) \\\n\ +void bitonic_step_ascend_##dtype(uint num_stages, int lx, \\\n\ + __local dtype *p_share_k, __local int *p_share_v) \\\n\ +{ \\\n\ + for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + uint signo = (lx >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + int left_idx = p_share_v[left_id]; \\\n\ + int right_idx = p_share_v[right_id]; \\\n\ + \\\n\ + dtype left_elem = p_share_k[left_id]; \\\n\ + dtype right_elem = p_share_k[right_id]; \\\n\ + \\\n\ + if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \\\n\ + { \\\n\ + p_share_k[left_id] = right_elem; \\\n\ + p_share_k[right_id] = left_elem; \\\n\ + \\\n\ + p_share_v[left_id] = right_idx; \\\n\ + p_share_v[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +BITONIC_STEP_ASCEND(int)\n\ +BITONIC_STEP_ASCEND(uint)\n\ +\n\ +#define BITONIC_MERGE(dtype) \\\n\ +void bitonic_merge_##dtype(uint num_stages, int lx, \\\n\ + __local dtype *local_data, __local int *local_indices) \\\n\ +{ \\\n\ + uint stage = num_stages; \\\n\ + uint signo = (lx >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + int left_idx = local_indices[left_id]; \\\n\ + int right_idx = local_indices[right_id]; \\\n\ + \\\n\ + dtype left_elem = local_data[left_id]; \\\n\ + dtype right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ +}\n\ +BITONIC_MERGE(int)\n\ +BITONIC_MERGE(uint)\n\ +\n\ +#define BLOCK_SIZE (512)\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32\n\ +(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_tail,\n\ + int _num_stages,\n\ + int width\n\ + )\n\ + {\n\ + uint lx = get_local_id(0);\n\ + const int init_k = -2147483647;\n\ + const int init_v = -2147483647;\n\ + const int num_stages = 9;\n\ + const int threads_per_block = BLOCK_SIZE;\n\ + const int index_minus_1 = threads_per_block * 2 - 1;\n\ + uint offset = 0;\n\ + uint lx1 = lx + threads_per_block;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + __local int local_data[1536];\n\ + __local int local_indices[1536];\n\ +\n\ + int left = read_imagei(input, coord.xy).x;\n\ + coord.z += threads_per_block;\n\ + int right = read_imagei(input, coord.zy).x;\n\ +\n\ + 
local_data[lx] = left;\n\ + local_indices[lx] = coord.x;\n\ + local_data[lx1] = right;\n\ + local_indices[lx1] = coord.z;\n\ +\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_int(num_stages, lx, local_data, local_indices);\n\ +\n\ + int min_data = local_data[511];\n\ +\n\ + int *p_share_k = local_data + threads_per_block;\n\ + int *p_share_v = local_indices + threads_per_block;\n\ +\n\ + int limit = (width >> 10) << 10;\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ +\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\ + {\n\ + int2 data;\n\ + coord.z = coord.x + threads_per_block;\n\ + data.x = read_imagei(input, coord.xy).x;\n\ + data.y = read_imagei(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = data.y;\n\ + p_share_v[lx1] = coord.z;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] < min_data)\n\ + {\n\ + continue;\n\ + }\n\ +\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\ +\n\ + min_data = local_data[511];\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + }\n\ +\n\ + if (width > limit)\n\ + {\n\ + if (coord.x < width)\n\ + {\n\ + int2 data;\n\ + data.x = read_imagei(input, coord.xy).x;\n\ + coord.z = coord.x + threads_per_block;\n\ + data.y = read_imagei(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\ + p_share_v[lx1] = coord.z < width ? 
coord.z : init_v;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] >= min_data)\n\ + {\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\ + }\n\ + }\n\ +\n\ + int4 dst;\n\ + dst.x = local_data[lx];\n\ +\n\ + coord.x = lx;\n\ + write_imagei(output, coord.xy, dst.xxxx);\n\ +\n\ + int4 index;\n\ + index.x = local_indices[lx];\n\ +\n\ + write_imagei(indices, coord.xy, index.xxxx);\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32\n\ +(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_tail,\n\ + int _num_stages,\n\ + int width\n\ + )\n\ + {\n\ + uint lx = get_local_id(0);\n\ + const uint init_k = 0;\n\ + const int init_v = -2147483647;\n\ + const int num_stages = 9;\n\ + const int threads_per_block = BLOCK_SIZE;\n\ + const int index_minus_1 = threads_per_block * 2 - 1;\n\ + uint offset = 0;\n\ + uint lx1 = lx + threads_per_block;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + __local uint local_data[1536];\n\ + __local int local_indices[1536];\n\ +\n\ + uint left = read_imageui(input, coord.xy).x;\n\ + coord.z += threads_per_block;\n\ + uint right = read_imageui(input, coord.zy).x;\n\ +\n\ + local_data[lx] = left;\n\ + local_indices[lx] = coord.x;\n\ + local_data[lx1] = right;\n\ + local_indices[lx1] = coord.z;\n\ +\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_uint(num_stages, lx, local_data, local_indices);\n\ +\n\ + uint min_data = local_data[511];\n\ +\n\ + uint *p_share_k = local_data + threads_per_block;\n\ + int *p_share_v = local_indices + threads_per_block;\n\ +\n\ + int limit = (width >> 10) << 10;\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ +\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\ + {\n\ + uint2 data;\n\ + coord.z = coord.x + threads_per_block;\n\ + data.x = read_imageui(input, coord.xy).x;\n\ + data.y = read_imageui(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = data.y;\n\ + p_share_v[lx1] = coord.z;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] < min_data)\n\ + {\n\ + continue;\n\ + }\n\ +\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\ +\n\ + min_data = local_data[511];\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + }\n\ +\n\ + if (width > limit)\n\ + {\n\ + if (coord.x < width)\n\ + {\n\ + uint2 data;\n\ + data.x = read_imageui(input, coord.xy).x;\n\ + coord.z = coord.x + threads_per_block;\n\ + data.y = read_imageui(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\ + p_share_v[lx1] = coord.z < width ? 
coord.z : init_v;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] >= min_data)\n\ + {\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\ + }\n\ + }\n\ +\n\ + uint4 dst;\n\ + dst.x = local_data[lx];\n\ +\n\ + coord.x = lx;\n\ + write_imageui(output, coord.xy, dst.xxxx);\n\ +\n\ + int4 index;\n\ + index.x = local_indices[lx];\n\ +\n\ + write_imagei(indices, coord.xy, index.xxxx);\n\ +}\n\ +"; /* end of topk2_cl*/ + static const char topk_odd_even_sort_cl[] = "#define LOCAL_SIZE_X (32)\n\ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toF32_I32\n\ (\n\ @@ -79690,12 +81296,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd }\n\ \n\ __local int sorted[1];\n\ - int width_minus_one = width - 1;\n\ - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ -\n\ - int x_start = lid * num_pixels_per_thread;\n\ - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ \n\ sorted[0] = 0;\n\ \n\ @@ -79706,20 +81306,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0;\n\ }\n\ int swapped = 0;\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ // odd-even\n\ - coord.x = x_start;\n\ - coord.z = x_start + 1;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2;\n\ + coord.z = lid * 2 + 1;\n\ + for (; coord.z < width; )\n\ {\n\ float4 left = read_imagef(input_t, coord.xy);\n\ float4 right = read_imagef(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imagef(input_t, coord.xy, right);\n\ @@ -79729,21 +81330,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ // even-odd\n\ - coord.x = x_start + 1;\n\ - coord.z = x_start + 2;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2 + 1;\n\ + coord.z = lid * 2 + 2;\n\ + for (; coord.z < width; )\n\ {\n\ float4 left = read_imagef(input_t, coord.xy);\n\ float4 right = read_imagef(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imagef(input_t, coord.xy, right);\n\ @@ -79753,11 +81356,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ atomic_add(sorted, swapped);\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + 
barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ if (*sorted == 0)\n\ break;\n\ @@ -79803,13 +81406,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd }\n\ \n\ __local int sorted[1];\n\ - int width_minus_one = width - 1;\n\ - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ -\n\ - int x_start = lid * num_pixels_per_thread;\n\ - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ -\n\ sorted[0] = 0;\n\ \n\ while (1)\n\ @@ -79819,20 +81415,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0;\n\ }\n\ int swapped = 0;\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ // odd-even\n\ - coord.x = x_start;\n\ - coord.z = x_start + 1;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2;\n\ + coord.z = lid * 2 + 1;\n\ + for (; coord.z < width; )\n\ {\n\ uint4 left = read_imageui(input_t, coord.xy);\n\ uint4 right = read_imageui(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imageui(input_t, coord.xy, right);\n\ @@ -79842,21 +81439,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ // even-odd\n\ - coord.x = x_start + 1;\n\ - coord.z = x_start + 2;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2 + 1;\n\ + coord.z = lid * 2 + 2;\n\ + for (; coord.z < width; )\n\ {\n\ uint4 left = read_imageui(input_t, coord.xy);\n\ uint4 right = read_imageui(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imageui(input_t, coord.xy, right);\n\ @@ -79866,11 +81465,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ atomic_add(sorted, swapped);\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ if (*sorted == 0)\n\ break;\n\ @@ -79916,13 +81515,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd }\n\ \n\ __local int sorted[1];\n\ - int width_minus_one = width - 1;\n\ - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ -\n\ - int x_start = lid * num_pixels_per_thread;\n\ - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ -\n\ sorted[0] = 0;\n\ \n\ while (1)\n\ @@ -79932,20 +81524,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0;\n\ }\n\ int swapped = 0;\n\ - 
barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ // odd-even\n\ - coord.x = x_start;\n\ - coord.z = x_start + 1;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2;\n\ + coord.z = lid * 2 + 1;\n\ + for (; coord.z < width; )\n\ {\n\ int4 left = read_imagei(input_t, coord.xy);\n\ int4 right = read_imagei(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imagei(input_t, coord.xy, right);\n\ @@ -79955,21 +81548,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ // even-odd\n\ - coord.x = x_start + 1;\n\ - coord.z = x_start + 2;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2 + 1;\n\ + coord.z = lid * 2 + 2;\n\ + for (; coord.z < width; )\n\ {\n\ int4 left = read_imagei(input_t, coord.xy);\n\ int4 right = read_imagei(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imagei(input_t, coord.xy, right);\n\ @@ -79979,11 +81574,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ atomic_add(sorted, swapped);\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ if (*sorted == 0)\n\ break;\n\ @@ -80031,12 +81626,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd }\n\ \n\ __local int sorted[1];\n\ - int width_minus_one = width - 1;\n\ - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ -\n\ - int x_start = lid * num_pixels_per_thread;\n\ - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ \n\ sorted[0] = 0;\n\ \n\ @@ -80047,20 +81636,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0;\n\ }\n\ int swapped = 0;\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ // odd-even\n\ - coord.x = x_start;\n\ - coord.z = x_start + 1;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2;\n\ + coord.z = lid * 2 + 1;\n\ + for (; coord.z < width; )\n\ {\n\ float4 left = read_imagef(input_t, coord.xy);\n\ float4 right = read_imagef(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imagef(input_t, coord.xy, right);\n\ @@ -80070,21 +81660,23 @@ __kernel 
__attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ // even-odd\n\ - coord.x = x_start + 1;\n\ - coord.z = x_start + 2;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2 + 1;\n\ + coord.z = lid * 2 + 2;\n\ + for (; coord.z < width; )\n\ {\n\ float4 left = read_imagef(input_t, coord.xy);\n\ float4 right = read_imagef(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imagef(input_t, coord.xy, right);\n\ @@ -80094,11 +81686,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ atomic_add(sorted, swapped);\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ if (*sorted == 0)\n\ break;\n\ @@ -80146,13 +81738,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd }\n\ \n\ __local int sorted[1];\n\ - int width_minus_one = width - 1;\n\ - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ -\n\ - int x_start = lid * num_pixels_per_thread;\n\ - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ -\n\ sorted[0] = 0;\n\ \n\ while (1)\n\ @@ -80162,20 +81747,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0;\n\ }\n\ int swapped = 0;\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ // odd-even\n\ - coord.x = x_start;\n\ - coord.z = x_start + 1;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2;\n\ + coord.z = lid * 2 + 1;\n\ + for (; coord.z < width; )\n\ {\n\ float4 left = read_imagef(input_t, coord.xy);\n\ float4 right = read_imagef(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imagef(input_t, coord.xy, right);\n\ @@ -80185,18 +81771,22 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ // even-odd\n\ - coord.x = x_start + 1;\n\ - coord.z = x_start + 2;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2 + 1;\n\ + coord.z = lid * 2 + 2;\n\ + for (; coord.z < width; )\n\ {\n\ float4 left = read_imagef(input_t, coord.xy);\n\ float4 right = read_imagef(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ int4 l_index = 
read_imagei(indices_t, coord.xy);\n\ int4 r_index = read_imagei(indices_t, coord.zy);\n\ @@ -80209,11 +81799,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ atomic_add(sorted, swapped);\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ if (*sorted == 0)\n\ break;\n\ @@ -80464,6 +82054,12 @@ static const source_map_t evis_resource[] = {"bilinear_grid_sample_I16_to_I16_vx", bilinear_grid_sample_I16_to_I16_vx}, {"bilinear_grid_sample_I8_to_I8_vx", bilinear_grid_sample_I8_to_I8_vx}, {"bilinear_grid_sample_U8_to_U8_vx", bilinear_grid_sample_U8_to_U8_vx}, + {"bilinear_grid_sample_reflect_BF16_to_BF16_vx", bilinear_grid_sample_reflect_BF16_to_BF16_vx}, + {"bilinear_grid_sample_reflect_F16_to_F16_vx", bilinear_grid_sample_reflect_F16_to_F16_vx}, + {"bilinear_grid_sample_reflect_F16_to_U8_vx", bilinear_grid_sample_reflect_F16_to_U8_vx}, + {"bilinear_grid_sample_reflect_I16_to_I16_vx", bilinear_grid_sample_reflect_I16_to_I16_vx}, + {"bilinear_grid_sample_reflect_I8_to_I8_vx", bilinear_grid_sample_reflect_I8_to_I8_vx}, + {"bilinear_grid_sample_reflect_U8_to_U8_vx", bilinear_grid_sample_reflect_U8_to_U8_vx}, {"bucketize_vx", bucketize_vx}, {"cast_vx", cast_vx}, {"clip_F16_vx", clip_F16_vx}, @@ -80753,6 +82349,7 @@ static const source_map_t cl_resource[] = {"avg_pool3d_cl", avg_pool3d_cl}, {"batchnorm_single_cl", batchnorm_single_cl}, {"bilinear_grid_sample_cl", bilinear_grid_sample_cl}, + {"bilinear_grid_sample_reflect_cl", bilinear_grid_sample_reflect_cl}, {"bucketize_cl", bucketize_cl}, {"cast_cl", cast_cl}, {"clip_BF16_cl", clip_BF16_cl}, @@ -80764,7 +82361,6 @@ static const source_map_t cl_resource[] = {"cumsum_cl", cumsum_cl}, {"cumsum_2d_cl", cumsum_2d_cl}, {"depth2space_crd_cl", depth2space_crd_cl}, - {"detect_post_box_cl", detect_post_box_cl}, {"eltwise_ops_helper_cl", eltwise_ops_helper_cl}, {"eltwise_unary_0_cl", eltwise_unary_0_cl}, {"eltwise_unary_1_cl", eltwise_unary_1_cl}, @@ -80888,6 +82484,7 @@ static const source_map_t cl_resource[] = {"swish_cl", swish_cl}, {"tile_cl", tile_cl}, {"topk_cl", topk_cl}, + {"topk2_cl", topk2_cl}, {"topk_odd_even_sort_cl", topk_odd_even_sort_cl}, {"topk_odd_even_sort2_cl", topk_odd_even_sort2_cl}, {"upsample_cl", upsample_cl}, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c index bade3f959..377bba4d8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c @@ -39,6 +39,7 @@ #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" #include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" #include "vsi_nn_error.h" #define _INPUT_NUM (1) @@ -199,14 +200,11 @@ static vsi_bool op_setup vsi_nn_internal_node_t* curr = NULL; float min = self->nn_param.clip.min; float max = self->nn_param.clip.max; - uint32_t infinity = VSI_NN_FLOAT32_INF; - float neg_infinity = -*(float*)&infinity; - int32_t max_float = *(int32_t*)&max; if ( (min == -1.0f && max == 1.0f) || (min == 0.0f && max == 6.0f) - || (min == 0.0f && max_float == VSI_NN_FLOAT32_INF) - || (min == neg_infinity && max_float == VSI_NN_FLOAT32_INF)) + || (min == 0.0f && fp32_is_inf(max)) + || (fp32_is_inf(-min) && fp32_is_inf(max))) { vsi_nn_internal_init_node_wksp(self); if (min == -1.0f && max == 1.0f) @@ -217,7 +215,7 @@ static vsi_bool op_setup { curr = 
vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0); } - else if (min == 0.0f && max_float == VSI_NN_FLOAT32_INF) + else if (min == 0.0f && fp32_is_inf(max)) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU, 0, 0); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c index 235ab87bb..ffac02898 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c @@ -430,10 +430,13 @@ static vsi_bool op_setup // create activation output/hstate_output/cstate_output vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.dtype, TRUE); act_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(act_out, "Create internal tensor failed", final); vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_H_STATE]->attr.dtype, TRUE); act_h_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(act_h_out, "Create internal tensor failed", final); vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_C_STATE]->attr.dtype, TRUE); act_c_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(act_c_out, "Create internal tensor failed", final); curr->outputs[LSTMUNIT_ACT_OUTPUT] = act_out->t; curr->outputs[LSTMUNIT_ACT_HSTATE_OUT] = act_h_out->t; curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = act_c_out->t; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c index ce79a92d9..7020b570d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c @@ -89,7 +89,7 @@ static vsi_status op_grouped_compute if (NULL == LOCAL()->weight_tensor_group) { VSILOGE("Malloc fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__); - return VSI_FAILURE; + goto final; } memset(LOCAL()->weight_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *)); res = vsi_nn_CreateTensorGroup(self->graph, inputs[1], 2, @@ -325,8 +325,8 @@ static vsi_status op_compute #endif // param.a_x = self->nn_param.deconv.dilation; // param.a_y = self->nn_param.deconv.dilation; - param.ext.khr.a_x = 1; - param.ext.khr.a_y = 1; + param.ext.khr.a_x = (size_t)self->nn_param.deconv.output_padding[0]; + param.ext.khr.a_y = (size_t)self->nn_param.deconv.output_padding[1]; param.ext.khr.padding_x = (size_t)self->nn_param.deconv.pad[0]; param.ext.khr.padding_y = (size_t)self->nn_param.deconv.pad[2]; param.ext.khr.overflow_policy = self->vx_param.overflow_policy; @@ -336,6 +336,7 @@ static vsi_status op_compute param.ext.channel_group = self->nn_param.deconv.group; param.stride_x = self->nn_param.deconv.stride[0]; param.stride_y = self->nn_param.deconv.stride[1]; + param.down_scale_size_rounding = self->vx_param.down_scale_size_rounding; //param.border_mode; //param.border_const; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c index 958b06b10..903ca2d2f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c @@ -66,7 +66,7 @@ static vsi_status op_compute input_tensor[1] = tmp_tensor; self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - "signal_frame", + "extra_ending", input_tensor, 2, outputs, 1, NULL ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index 
e7d935843..a8875911a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -65,9 +65,15 @@ static vsi_status op_compute } else { +#define _TENSOR_LEN 64 vsi_nn_tensor_attr_t attr; vsi_nn_tensor_t* temp_tensors = NULL; + char gather_tensor_name[_TENSOR_LEN]; + char copy_tensor_name[_TENSOR_LEN]; + memset(gather_tensor_name, 0, sizeof(gather_tensor_name)); + memset(copy_tensor_name, 0, sizeof(copy_tensor_name)); + VSILOGW("gather is no_range_change operation! \ Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!"); @@ -78,7 +84,20 @@ static vsi_status op_compute temp_tensors = vsi_nn_CreateTensor( self->graph, &attr ); vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, &temp_tensors, 1, param ); + snprintf(gather_tensor_name, sizeof(gather_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0); + if(vxSetReferenceName((vx_reference)temp_tensors->t, gather_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u gather node output name fail", self->uid); + return VSI_FAILURE; + } + n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t); + snprintf(copy_tensor_name, sizeof(copy_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1); + if(vxSetReferenceName((vx_reference)outputs[0]->t, copy_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u copy node output name fail", self->uid); + return VSI_FAILURE; + } vsi_safe_release_tensor(temp_tensors); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c index b77a39db3..d5be535bf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c @@ -38,6 +38,7 @@ #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" typedef struct _gather_elements_local_data_t { int32_t placeholder; @@ -101,6 +102,7 @@ static vsi_status op_compute attr.is_const = FALSE; attr.vtl = TRUE; temp_tensors = vsi_nn_CreateTensor( self->graph, &attr ); + CHECK_PTR_FAIL_GOTO( temp_tensors, "Create tensor fail.", final ); } else { @@ -148,6 +150,7 @@ static vsi_status op_compute vsi_safe_release_tensor(temp_tensors); } +final: vsi_nn_kernel_param_release( ¶m ); if ( self->n ) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c index 86f15f81d..d035ddaed 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c @@ -53,11 +53,13 @@ static vsi_status op_compute vsi_nn_kernel_param_t* param = NULL; int32_t align_corners = self->nn_param.gridsample.align_corners; + int32_t pad_mode = (int32_t)self->nn_param.gridsample.padding_mode; vsi_nn_kernel_node_t n; char kernel_name[128]; param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners); + vsi_nn_kernel_param_add_int32(param, "padding_mode", pad_mode); switch (self->nn_param.gridsample.mode) { case VSI_NN_INTERPOLATION_BILINEAR: @@ -103,13 +105,20 @@ static vsi_bool op_check return FALSE; } - if (!((VSI_NN_PAD_MODE_CONSTANT == + if ((VSI_NN_PAD_MODE_CONSTANT == self->nn_param.gridsample.padding_mode) && - (0 == self->nn_param.gridsample.const_val))) { + (0 != self->nn_param.gridsample.const_val)) { VSILOGE("Only support padding const 0 now!"); return FALSE; } + + if 
(VSI_NN_PAD_MODE_SYMMETRIC == + self->nn_param.gridsample.padding_mode) { + VSILOGE("Can't support VSI_NN_PAD_MODE_SYMMETRIC now!"); + return FALSE; + } + return TRUE; } /* op_check() */ @@ -124,6 +133,11 @@ static vsi_bool op_setup return FALSE; } + if (2 != inputs[1]->attr.size[0]) + { + return FALSE; + } + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; outputs[0]->attr.size[0] = inputs[1]->attr.size[1]; @@ -133,6 +147,16 @@ static vsi_bool op_setup outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; } } + else + { + if ((outputs[0]->attr.dim_num != inputs[0]->attr.dim_num) + || (outputs[0]->attr.size[0] != inputs[1]->attr.size[1]) + || (outputs[0]->attr.size[1] != inputs[1]->attr.size[2])) + { + return FALSE; + } + + } return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index 487e89c26..89c3b844a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -121,6 +121,7 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { +#define _TENSOR_LEN 64 vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; @@ -129,15 +130,36 @@ static vsi_status op_compute vsi_size_t new_rank = 0; vsi_nn_tensor_t * tmp_tensors[4] = {NULL}; + char reshape0_tensor_name[_TENSOR_LEN]; + char reshape1_tensor_name[_TENSOR_LEN]; + char instance_norm_tensor_name[_TENSOR_LEN]; + + memset(reshape0_tensor_name, 0, sizeof(reshape0_tensor_name)); + memset(reshape1_tensor_name, 0, sizeof(reshape1_tensor_name)); + memset(instance_norm_tensor_name, 0, sizeof(instance_norm_tensor_name)); + vsi_nn_optimize_instance_norm_shape(inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank); tmp_tensors[0] = vsi_nn_kernel_insert_reshape_node( self->graph, inputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_BACKWARD ); + snprintf(reshape0_tensor_name, sizeof(reshape0_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0); + if(vxSetReferenceName((vx_reference)tmp_tensors[0]->t, reshape0_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u reshape 0 node output name fail", self->uid); + return VSI_FAILURE; + } tmp_tensors[1] = inputs[1]; tmp_tensors[2] = inputs[2]; tmp_tensors[3] = vsi_nn_kernel_insert_reshape_node( self->graph, outputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_FORWARD ); + snprintf(reshape1_tensor_name, sizeof(reshape1_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1); + if(vxSetReferenceName((vx_reference)outputs[0]->t, reshape1_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u reshap 1 node output name fail", self->uid); + return VSI_FAILURE; + } + status = _try_set_high_presision_tensor(tmp_tensors); if (status != VSI_SUCCESS) { @@ -155,6 +177,12 @@ static vsi_status op_compute self->n = (vx_node)n; status = VSI_SUCCESS; } + snprintf(instance_norm_tensor_name, sizeof(instance_norm_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 2); + if(vxSetReferenceName((vx_reference)tmp_tensors[3]->t, instance_norm_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u instance_norm node output name fail", self->uid); + return VSI_FAILURE; + } if (param != NULL) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index fe227816a..46a389a0f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -259,9 +259,9 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - vsi_nn_safe_free(self->nn_param.layernorm.local); #if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) + vsi_nn_safe_free(self->nn_param.layernorm.local); vsi_nn_internal_deinit_node_wksp( self ); #endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index aa2b231e1..2051c4533 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -161,14 +161,6 @@ static vsi_bool _check_is_sp_supported_type { int32_t * axes = self->nn_param.reduce.local2->axes; int32_t axes_num = self->nn_param.reduce.local2->axes_num; - vsi_size_t shapes[4][VSI_NN_MAX_DIM_NUM] = { {0} }; - int32_t axis_in[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t i = 0; - uint32_t axis_size = 0; - uint32_t rank_in = 0; - uint32_t rank_out = 0; - vsi_bool ret = FALSE; if ( !self->graph->ctx->config.support_stream_processor || (type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN && type != VSI_NN_REDUCE_MAX) ) @@ -191,22 +183,15 @@ static vsi_bool _check_is_sp_supported_type return FALSE; } - for (i = 0; i < axes_num; i++) + if ( (axes_num == 1 && (axes[0] == 0 || axes[0] == 2)) || + (axes_num == 2 && ((axes[0] < 2 && axes[1] < 2) || (axes[0] == 1 && axes[1] == 2))) ) { - shapes[0][i] = input->attr.size[axes[i]]; - shapes[1][i] = 1; - axis_in[i] = i; + return TRUE; } - ret = vsi_nn_kernel_optimize_reduce_shape( - shapes[0], axes_num, - axis_in, axes_num, - shapes[1], axes_num, - shapes[2], &rank_in, shapes[3], &rank_out, - new_axis, &axis_size); - - return ret && axis_size < 3; + return FALSE; } + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -839,82 +824,28 @@ static vsi_bool op_set_sp_reduce_internal vsi_enum type_name ) { - vsi_nn_tensor_attr_t attr; - vsi_nn_internal_tensor_t* tensor1 = NULL; - vsi_nn_tensor_t* new_output = NULL; - uint32_t* permute_in_perm = NULL; - int32_t * new_axis = NULL; - vsi_size_t shapes[VSI_NN_MAX_DIM_NUM] = {1}; - int32_t use_virtual_tensor = TRUE; vsi_nn_internal_node_t* tmp_inode = NULL; int32_t * axes = self->nn_param.reduce.local2->axes; int32_t axes_num = self->nn_param.reduce.local2->axes_num; - int32_t i = 0, j = 0, index = 0; vsi_size_t reduce_size = 1; vsi_bool ret = FALSE; + int32_t i = 0; vsi_nn_internal_init_node_wksp( self ); - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - memcpy(&attr.dtype, &inputs[0]->attr.dtype, sizeof(vsi_nn_dtype_t)); - attr.dim_num = VSI_NN_DIM_AUTO; - attr.vtl = use_virtual_tensor; - attr.is_const = FALSE; - tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); - - tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 ); + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 ); CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); - permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, - inputs[0]->attr.dim_num * sizeof(uint32_t)); - CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, tmp_inode, "Create buffer failed", final); - - for ( i = 0; i < axes_num; i++) - { - shapes[index] = outputs[0]->attr.size[axes[i]]; - permute_in_perm[index ++] = axes[i]; - reduce_size *= inputs[0]->attr.size[axes[i]]; - } - - for ( j = 0; j < (int32_t)inputs[0]->attr.dim_num; j++) - { - for (i = 0; i < axes_num; i++) - { - if (j == 
axes[i]) - { - break; - } - } - if (i == axes_num) - { - shapes[index] = outputs[0]->attr.size[j]; - permute_in_perm[index ++] = j; - } - } - tmp_inode->node->nn_param.permute.perm = permute_in_perm; - tmp_inode->node->nn_param.permute.dim_num = inputs[0]->attr.dim_num; tmp_inode->inputs[0] = inputs[0]; - tmp_inode->outputs[0] = tensor1->t; - vsi_nn_internal_setup_node(self, tmp_inode); - - new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes, outputs[0]->attr.dim_num); - CHECK_PTR_FAIL_GOTO(new_output, "Create tensor failed", final); - self->nn_param.reduce.local2->reshaped_output = new_output; + tmp_inode->outputs[0] = outputs[0]; + tmp_inode->node->nn_param.reduce_mean_internal.axis = axes; + tmp_inode->node->nn_param.reduce_mean_internal.axis_num = axes_num; + tmp_inode->node->nn_param.reduce_mean_internal.type = type_name; - tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 ); - CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); - new_axis = (int32_t *)vsi_nn_internal_new_node_param(tmp_inode, - axes_num * sizeof(int32_t)); - CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(new_axis, tmp_inode, "Create buffer failed", final); for (i = 0; i < axes_num; i++) { - new_axis[i] = i; + reduce_size *= inputs[0]->attr.size[axes[i]]; } - tmp_inode->inputs[0] = tensor1->t; - tmp_inode->outputs[0] = new_output; - tmp_inode->node->nn_param.reduce_mean_internal.axis = new_axis; - tmp_inode->node->nn_param.reduce_mean_internal.axis_num = axes_num; - tmp_inode->node->nn_param.reduce_mean_internal.type = type_name; + if (type_name == VSI_NN_REDUCE_SUM) { tmp_inode->node->nn_param.reduce_mean_internal.scale = 1.0f; @@ -1147,6 +1078,7 @@ static vsi_bool op_set_reduce_internal re_sizes[axes[2]] = 1; new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], re_sizes, dim_num); } + self->nn_param.reduce.local2->reshaped_output = new_output; curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); @@ -1161,7 +1093,6 @@ static vsi_bool op_set_reduce_internal curr->inputs[0] = tmp_output_tensor[1]->t; } curr->outputs[0] = new_output; - self->nn_param.reduce.local2->reshaped_output = new_output; vsi_nn_internal_setup_node(self, curr); } else diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reducel2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reducel2.c index 3b92359c1..f560e8198 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reducel2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reducel2.c @@ -136,7 +136,7 @@ static vsi_bool op_setup attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; reducesum_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - CHECK_PTR_FAIL_GOTO(square_tensor, "Create internal tensor failed", final); + CHECK_PTR_FAIL_GOTO(reducesum_tensor, "Create internal tensor failed", final); reducesum_node = vsi_nn_internal_new_node( self, VSI_NN_OP_REDUCE, 0, 0); CHECK_PTR_FAIL_GOTO(reducesum_node, "Create internal node failed", final); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c index 6ec9d19af..7221928d3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c @@ -37,6 +37,7 @@ #include "vsi_nn_error.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" static vsi_status op_compute @@ -88,7 +89,6 @@ static vsi_bool op_setup float alpha = 0; float max_value = 0; float threshold = 0; - 
uint32_t max_raw = 0; vsi_bool ret = FALSE; if ( NULL == self ) @@ -101,11 +101,9 @@ static vsi_bool op_setup max_value = p->max_value; threshold = p->threshold; - max_raw = *(uint32_t*)&max_value; - vsi_nn_internal_init_node_wksp(self); - if (alpha == 0 && max_raw == VSI_NN_FLOAT32_INF && threshold == 0) + if (alpha == 0.0f && fp32_is_inf(max_value) && threshold == 0.0f) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU, 0, 0); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); @@ -119,14 +117,14 @@ static vsi_bool op_setup curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; } - else if (alpha == 0 && max_value == 6.0f && threshold == 0) + else if (alpha == 0.0f && max_value == 6.0f && threshold == 0.0f) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; } - else if (alpha == 0.1 && max_value == VSI_NN_FLOAT32_INF && threshold == 0) + else if (alpha == 0.1f && max_value == VSI_NN_FLOAT32_INF && threshold == 0.0f) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LEAKY_RELU, 0, 0); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c index 418c6a0e6..662fa967f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -70,7 +70,11 @@ static vsi_status op_compute self->graph, (uint8_t *)self->nn_param.reshape.size, &attr); - + if (NULL == dims_tensor) + { + VSILOGE( "Create tensor fail." ); + return VSI_FAILURE; + } reshape_param.dims = REQUIRED_IO(dims_tensor); self->n = vxTensorReshapeNode(self->graph->g, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c index 6e1c3138d..93d269dfc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c @@ -75,6 +75,11 @@ static vsi_status op_compute self->graph, (uint8_t *)dims_data, &attr); + if (NULL == dims_tensor) + { + VSILOGE( "Create tensor fail." 
); + return VSI_FAILURE; + } reshape_param.dims = REQUIRED_IO(dims_tensor); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c index efa21d605..c1f559f4a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c @@ -52,6 +52,7 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; int32_t align_corners = self->nn_param.resize_internal.align_corners; int32_t half_pixel_centers = self->nn_param.resize_internal.half_pixel_centers; + int32_t type = self->nn_param.resize_internal.type; vsi_enum layout = self->nn_param.resize_internal.layout; vsi_nn_kernel_param_t * param = NULL; @@ -59,6 +60,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners ); vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers ); + vsi_nn_kernel_param_add_int32( param, "type", type ); if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) { @@ -186,6 +188,7 @@ static vsi_status op_init vsi_status status = VSI_SUCCESS; self->nn_param.resize_internal.layout = VSI_NN_RESIZE_LAYOUT_NCHW; + self->nn_param.resize_internal.type = VSI_NN_INTERPOLATION_BILINEAR; return status; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c new file mode 100644 index 000000000..84387d7fd --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c @@ -0,0 +1,202 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util_prv.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param = NULL; + vsi_nn_kernel_node_t n = NULL; + float eps = self->nn_param.rmsnorm.eps; + int32_t axis = self->nn_param.rmsnorm.axis; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_float32(param, "eps", eps); + vsi_nn_kernel_param_add_int32(param, "axis", axis); + n = vsi_nn_kernel_selector(self->graph, "rms_norm", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param); + if (n != NULL) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if (param != NULL) + { + vsi_nn_kernel_param_release(¶m); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num); + + if (!ret) + { + BEGIN_IO_TYPE_DECL(RMS_NORM, 2, 1) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP) + IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP) + IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM) + IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM) + IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP) + IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP) + IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM) + IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM) + IO_TYPE(D_BF16, D_F32, D_BF16) + IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM) + IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP) + IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM) + IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM) + IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16) + IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16) + IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP) + IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM) + IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM) + IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16) + IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16) + IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM) + IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP) + IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM) + IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM) + IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16) + IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16) + IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP) + IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM) + IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM) + IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16) + IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16) + END_IO_TYPE_DECL(RMS_NORM) + if (!VALIDATE_OP_IO_TYPES(RMS_NORM, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + + if (NULL == self) + { + return FALSE; + } + + ret = 
vsi_nn_op_common_setup(self, inputs, outputs); + + return ret; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + self->nn_param.rmsnorm.axis = 0; + self->nn_param.rmsnorm.eps = 1e-8f; + return VSI_SUCCESS; +} /* op_init() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RMSNORM, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_shape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_shape.c new file mode 100644 index 000000000..db6df273a --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_shape.c @@ -0,0 +1,196 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_test.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" + +typedef struct _shape_local_data_t { + vsi_nn_tensor_t *shape_tensor; +} shape_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + + if (outputs[0]->attr.dtype.vx_type != VSI_NN_TYPE_INT32) + { + VSILOGD("Outputs data type not support"); + + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_attr_t attr; + vx_int32 shapes[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t i = 0; + + for ( i = 0; i < inputs[0]->attr.dim_num; i++ ) + { + shapes[i] = (int32_t)inputs[0]->attr.size[i]; + } + + memset(&attr, 0, sizeof(attr)); + attr.size[0] = inputs[0]->attr.dim_num; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + self->nn_param.shape.local->shape_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)shapes, + &attr); + if ( NULL == self->nn_param.shape.local->shape_tensor ) + { + VSILOGE("Create shape_tensor fail.(shape)"); + goto final; + } + + vsi_nn_internal_init_node_wksp(self); + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); + if (NULL == curr) + { + return ret; + } + curr->inputs[0] = self->nn_param.shape.local->shape_tensor; + curr->outputs[0] = outputs[0]; + + ret = vsi_nn_internal_setup_node(self, curr); + +final: + + return ret; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + self->nn_param.shape.local = + (shape_local_data_t *)malloc(sizeof(shape_local_data_t)); + if (NULL == self->nn_param.shape.local) + { + return VSI_FAILURE; + } + memset( self->nn_param.shape.local, 0, sizeof(shape_local_data_t) ); + + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.shape.local) + { + vsi_safe_release_tensor(self->nn_param.shape.local->shape_tensor); + } + vsi_nn_safe_free(self->nn_param.shape.local); + vsi_nn_internal_deinit_node_wksp(self); + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SHAPE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index 84c2dd75d..95dc76ab7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -735,6 +735,15 @@ static vsi_bool op_setup outputs[0]->attr.dim_num++; } + + /*output dim_num is 0, the tensor should be scalar!*/ + if 
(outputs[0]->attr.dim_num == 0) + { + outputs[0]->attr.dim_num = 1; + outputs[0]->attr.size[0] = 1; + + vsi_nn_SetTensorIsScalar(outputs[0], TRUE); + } } _get_stride_slice_start_stop_stride(self, inputs, outputs); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c index 0be22cd64..7e8ae3485 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c @@ -124,9 +124,6 @@ static vsi_status op_compute outputs[0]->attr.size, outputs[0]->attr.dim_num, axis, shapes[1], &rank_out, &new_axis1); - param = vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32( param, "top_k", top_k ); - if (ret) { uint32_t perm_in[VSI_NN_MAX_DIM_NUM] = {0}; @@ -195,10 +192,14 @@ static vsi_status op_compute outputs_tensor[1] = reshape_tensors[2]; } + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "top_k", top_k ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "topk", &input_tensor, _INPUT_NUM, outputs_tensor, _OUTPUT_NUM, param ); + vsi_nn_kernel_param_release( ¶m ); if (axis != 0) { _create_permute_node(self, outputs_tensor[0], reshape_tensors[1], perm_out, rank_in, TRUE); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c index 4b7dd3f61..90479bb81 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c @@ -25,6 +25,7 @@ #include #include +#include #include "vsi_nn_types.h" #include "vsi_nn_log.h" @@ -47,8 +48,6 @@ typedef struct _upsamplescale_local_data_t { #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) -#define _EPSILON 1e-8 - static vsi_status op_compute ( vsi_nn_node_t * self, @@ -69,7 +68,7 @@ static vsi_status op_compute stride = self->nn_param.upsamplescale.stride; scale = self->nn_param.upsamplescale.scale; - if (stride == 1 || vsi_nn_abs(scale - 1.0f) == _EPSILON) + if (stride == 1 || vsi_nn_abs(scale - 1.0f) < FLT_EPSILON) { return vsi_nn_internal_compute_node( self ); } @@ -148,7 +147,7 @@ static vsi_status op_optimize VSI_UNREFERENCED(inputs); VSI_UNREFERENCED(outputs); - if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON) + if (stride == 1 && vsi_nn_abs(scale - 1.0f) < FLT_EPSILON) { return vsi_nn_internal_optimize_node( self, direction ); } @@ -174,7 +173,7 @@ static vsi_bool op_setup vsi_nn_internal_init_node_wksp(self); - if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON) + if (stride == 1 && vsi_nn_abs(scale - 1.0f) < FLT_EPSILON) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); @@ -194,7 +193,7 @@ static vsi_bool op_setup ret = vsi_nn_internal_setup_node(self, curr); } - else if (vsi_nn_abs(scale - 1.0f) == _EPSILON) + else if (vsi_nn_abs(scale - 1.0f) < FLT_EPSILON) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESIZE, 0, 0); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index eb02639c2..4b2aa7aeb 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -469,6 +469,8 @@ static _op_param_gen_t s_op_gen[] = /* REDUCEL2 */ NULL, /* CROP_AND_RESIZE */ NULL, /* TAN */ NULL, + /* RMSNORM */ NULL, + /* SHAPE */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, 
vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index 6f91f9933..e1d9b8198 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -46,6 +46,11 @@ #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" + +#if (defined(__ANDROID__)) && (__ANDROID_API__ > 21) +#include +#endif typedef struct _vx_status_desc_t { @@ -387,10 +392,11 @@ float vsi_nn_DataAsFloat32 ) { float val; - uint32_t *p = (uint32_t*)(&val); + fp32_bit_cast_t fp32_bit_cast; int16_t fp16; - *p = 0xFFFFFFFF; + fp32_bit_cast.data = 0xFFFFFFFF; + val = fp32_bit_cast.val; switch( type ) { case VSI_NN_TYPE_BOOL8: @@ -1462,11 +1468,15 @@ void vsi_nn_get_tensor_clamp_min_max } else { - uint32_t f32_min = 0xff800000; - uint32_t f32_max = 0x7f800000; + fp32_bit_cast_t fp32_bit_cast; + float pos_infinity; + float neg_infinity; + fp32_bit_cast.data = VSI_NN_FLOAT32_INF; + pos_infinity = fp32_bit_cast.val; + neg_infinity = -pos_infinity; - *clampMin = *(float*)&f32_min; - *clampMax = *(float*)&f32_max; + *clampMin = neg_infinity; + *clampMax = pos_infinity; } } diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index fa5804540..7c7ed61d7 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -93,7 +93,7 @@ static vsi_status query_hardware_caps return status; } -#if (defined(__ANDROID__)) && (ANDROID_SDK_VERSION >= 30) +#if (defined(__ANDROID__)) && ((ANDROID_SDK_VERSION >= 30) || (__ANDROID_API__ >= 30)) static const char* ENV_ENABLE_SHADER = "vendor.VIV_VX_ENABLE_SHADER"; static const char* ENV_ENABLE_OPCHECK = "vendor.VSI_NN_ENABLE_OPCHECK"; static const char* ENV_ENABLE_CONCAT_OPTIMIZE = "vendor.VSI_NN_ENABLE_CONCAT_OPTIMIZE"; diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index ded183541..3242621b2 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -799,6 +799,7 @@ static vsi_status batchInference_graph original_inputs_attr = (vsi_nn_tensor_attr_t*)malloc(sizeof(vsi_nn_tensor_attr_t) * graph->max_node_io); original_outputs_attr = (vsi_nn_tensor_attr_t*)malloc(sizeof(vsi_nn_tensor_attr_t) * graph->max_node_io); approximateConstTensor = (vsi_nn_tensor_id_t*)malloc(sizeof(vsi_nn_tensor_id_t) * graph->tensor_num); + CHECK_PTR_FAIL_GOTO(approximateConstTensor, "Malloc fail.", final); memset(approximateConstTensor, -1, sizeof(vsi_nn_tensor_id_t) * graph->tensor_num); if (NULL == inputs || NULL == outputs || NULL == original_inputs_attr || NULL == original_outputs_attr) @@ -878,6 +879,7 @@ static vsi_status batchInference_graph vsi_size_t iterator_list_index = 0; vsi_size_t list_index = 0; vsi_size_t* iterator_list = (vsi_size_t*)malloc(sizeof(vsi_size_t) * (batchNum + 1)); + CHECK_PTR_FAIL_GOTO(iterator_list, "Malloc fail.", final); memset(iterator_list, 0, sizeof(uint32_t) * (batchNum + 1)); if (((vsi_nn_node_prv_t*)node)->split_num > 0) @@ -885,6 +887,7 @@ static vsi_status batchInference_graph iterator_list[iterator_list_index++] = ((vsi_nn_node_prv_t*)node)->split_num; if (((vsi_nn_node_prv_t*)node)->split_num == 1) {/*if user set split_num = 1, there is no need to batch split.*/ + vsi_nn_safe_free(iterator_list); continue; } } @@ -1015,6 +1018,7 @@ static vsi_status batchInference_graph } } + vsi_nn_safe_free(iterator_list); 
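For reference, a minimal standalone C sketch (not part of the patch) of the leak-safe allocation pattern the batchInference_graph hunk above adopts: every per-iteration allocation is checked right after malloc, and it is released on the early-skip path as well as on the normal path. The function and variable names below are illustrative stand-ins, not ovxlib APIs; only malloc/free/memset are real.

#include <stdlib.h>
#include <string.h>

static int process_all_nodes(size_t node_num, size_t batch_num)
{
    int status = 0;
    size_t i;
    for (i = 0; i < node_num; i++)
    {
        size_t *iterator_list = (size_t *)malloc(sizeof(size_t) * (batch_num + 1));
        if (NULL == iterator_list)        /* mirrors CHECK_PTR_FAIL_GOTO(..., final) */
        {
            status = -1;
            goto final;
        }
        memset(iterator_list, 0, sizeof(size_t) * (batch_num + 1));

        if (batch_num == 1)               /* early-skip path still releases the buffer */
        {
            free(iterator_list);
            continue;
        }

        /* ... per-node batch-splitting work would go here ... */

        free(iterator_list);              /* normal path releases it as well */
    }
final:
    return status;
}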
/*restore node input batch number*/ num_of_node_inputs = node->input.num; for (k = 0; k < num_of_node_inputs; k++) @@ -1053,7 +1057,7 @@ static vsi_status batchInference_graph } } - final: +final: for (i = 0; i < graph->node_num; i++) { node_id = nodes_list[i]; @@ -1067,7 +1071,7 @@ static vsi_status batchInference_graph node->input.num, inputs); vsi_nn_GetTensors(graph, node->output.tensors, node->output.num, outputs); - for (j = 0; j < node->output.num; j++) + for (j = 0; outputs && j < node->output.num; j++) { if (outputs[j] == NULL) { diff --git a/src/tim/vx/internal/src/vsi_nn_internal_node.c b/src/tim/vx/internal/src/vsi_nn_internal_node.c index ff5b1cce0..c240d3bea 100644 --- a/src/tim/vx/internal/src/vsi_nn_internal_node.c +++ b/src/tim/vx/internal/src/vsi_nn_internal_node.c @@ -716,6 +716,12 @@ vsi_status vsi_nn_internal_optimize_node for ( i = n - 1; i >= 0; i-- ) { curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)WKSP(node), i); + if ( NULL == curr ) + { + VSILOGE("get point fail"); + status = VSI_FAILURE; + break; + } VSILOGD("Optimize backward for node uid[%u] sub_uid[%u] op[%s]", node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op)); diff --git a/src/tim/vx/internal/src/vsi_nn_log.c b/src/tim/vx/internal/src/vsi_nn_log.c index f617359d0..8449f80d9 100644 --- a/src/tim/vx/internal/src/vsi_nn_log.c +++ b/src/tim/vx/internal/src/vsi_nn_log.c @@ -29,7 +29,7 @@ #include "vsi_nn_log.h" #include "vsi_nn_types.h" -#if (defined(__ANDROID__)) && (ANDROID_SDK_VERSION >= 30) +#if (defined(__ANDROID__)) && ((ANDROID_SDK_VERSION >= 30) || (__ANDROID_API__ >= 30)) static const char* ENV_LOG_LEVEL = "vendor.VSI_NN_LOG_LEVEL"; #else static const char* ENV_LOG_LEVEL = "VSI_NN_LOG_LEVEL"; diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index c6e9daa44..c30d03106 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -1013,6 +1013,7 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParamForCropOnly sizeof(numParams)); if (VSI_SUCCESS != status) { + vsi_nn_safe_free(nodes); goto final; } for (p = 0; p < numParams; p++) diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index d44ecf8eb..4d1022250 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -3258,6 +3258,7 @@ static vsi_bool _init_dummy_tensor #endif // This is a hack that driver doesn't support const scales scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim); + CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); memcpy(scales, tensor->attr.dtype.scales, tensor->attr.dtype.scale_dim * sizeof(float));
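For reference, a minimal standalone C sketch (not part of the patch) of the union-based float bit cast that the clip, relu_keras, and vsi_nn_util hunks switch to in place of pointer type-punning such as *(uint32_t*)&max. fp32_bit_cast_t and the infinity check below are modelled on the ovxlib helpers fp32_is_inf() and VSI_NN_FLOAT32_INF; the actual definitions live in utils/vsi_nn_dtype_util_prv.h and may differ.

#include <stdint.h>
#include <stdio.h>

typedef union
{
    float    val;
    uint32_t data;
} fp32_bit_cast_t;

static int fp32_is_inf_sketch(float x)
{
    fp32_bit_cast_t cast;
    cast.val = x;
    /* +/- infinity: exponent bits all ones, mantissa zero */
    return (cast.data & 0x7FFFFFFFu) == 0x7F800000u;
}

int main(void)
{
    fp32_bit_cast_t inf;
    inf.data = 0x7F800000u;   /* bit pattern of +inf; the patch stores this as VSI_NN_FLOAT32_INF */
    printf("%d %d\n", fp32_is_inf_sketch(inf.val), fp32_is_inf_sketch(1.0f));
    return 0;
}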