From bc472242a30bd8c74b97d46dc83c05f742113de7 Mon Sep 17 00:00:00 2001 From: Feiyue Chen Date: Mon, 1 Apr 2024 07:39:12 +0000 Subject: [PATCH] Update internal ovxlib to rel1.2.6 Internal ovxlib commit hash: c5d3e69356579fc7b595a7c0939fc7e4e0aaab5a Type: Code Improvement Signed-off-by: Feiyue Chen --- VERSION | 2 +- src/tim/vx/internal/include/interface/ops.def | 2 + .../internal/include/kernel/vsi_nn_kernel.h | 6 +- .../include/ops/vsi_nn_op_resize_internal.h | 1 + .../internal/include/ops/vsi_nn_op_rmsnorm.h | 54 + .../vx/internal/include/ops/vsi_nn_op_shape.h | 47 + .../include/utils/vsi_nn_dtype_util_prv.h | 72 +- src/tim/vx/internal/include/vsi_nn_context.h | 2 - .../vx/internal/include/vsi_nn_node_type.h | 4 + src/tim/vx/internal/include/vsi_nn_version.h | 2 +- ..._tiny_yolov4_postprocess_confidence_evis.c | 2 +- .../custom/ops/vsi_nn_op_custom_warp_affine.c | 11 +- .../src/kernel/cl/bilinear_grid_sample_cl.c | 147 +- .../src/kernel/cl/gather_elements_cl.c | 3 + .../src/kernel/cl/lstmunit_activation_cl.c | 11 +- .../src/kernel/cl/nearest_grid_sample_cl.c | 17 +- .../vx/internal/src/kernel/cl/one_hot_cl.c | 9 +- src/tim/vx/internal/src/kernel/cl/topk_cl.c | 44 +- .../kernel/evis/bilinear_grid_sample_evis.c | 215 +- .../src/kernel/evis/crop_and_resize_evis.c | 2 + .../kernel/evis/lstmunit_activation_evis.c | 11 +- .../kernel/evis/nearest_grid_sample_evis.c | 17 +- .../evis/pre_process_rgb888_planar_evis.c | 2 +- .../pre_process_rgb888_planar_nhwc_evis.c | 2 +- .../src/kernel/evis/resize_cubic_evis.c | 20 +- .../vx/internal/src/kernel/vsi_nn_kernel.c | 5 + .../internal/src/kernel/vsi_nn_kernel_util.c | 6 +- .../vx/internal/src/kernel/vx/rms_norm_vx.c | 94 + .../ops/cl/bilinear_grid_sample_reflect.cl | 169 ++ .../internal/src/libnnext/ops/cl/maxpool.cl | 10 +- .../vx/internal/src/libnnext/ops/cl/topk.cl | 10 +- .../vx/internal/src/libnnext/ops/cl/topk2.cl | 368 +++ .../src/libnnext/ops/cl/topk_odd_even_sort.cl | 125 +- .../libnnext/ops/cl/topk_odd_even_sort2.cl | 81 +- ...linear_grid_sample_reflect_BF16_to_BF16.vx | 171 ++ ...bilinear_grid_sample_reflect_F16_to_F16.vx | 217 ++ .../bilinear_grid_sample_reflect_F16_to_U8.vx | 224 ++ ...bilinear_grid_sample_reflect_I16_to_I16.vx | 160 ++ .../bilinear_grid_sample_reflect_I8_to_I8.vx | 160 ++ .../bilinear_grid_sample_reflect_U8_to_U8.vx | 224 ++ .../src/libnnext/vsi_nn_libnnext_resource.c | 2045 +++++++++++++++-- src/tim/vx/internal/src/ops/vsi_nn_op_clip.c | 10 +- .../src/ops/vsi_nn_op_conv2d_lstm_cell.c | 3 + .../src/ops/vsi_nn_op_deconvolution.c | 7 +- .../internal/src/ops/vsi_nn_op_extra_ending.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_gather.c | 19 + .../src/ops/vsi_nn_op_gather_elements.c | 3 + .../internal/src/ops/vsi_nn_op_grid_sample.c | 28 +- .../src/ops/vsi_nn_op_instancenormalize.c | 28 + .../src/ops/vsi_nn_op_layernormalize.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_reduce.c | 97 +- .../vx/internal/src/ops/vsi_nn_op_reducel2.c | 2 +- .../internal/src/ops/vsi_nn_op_relu_keras.c | 10 +- .../vx/internal/src/ops/vsi_nn_op_reshape.c | 6 +- .../vx/internal/src/ops/vsi_nn_op_reshape2.c | 5 + .../src/ops/vsi_nn_op_resize_internal.c | 3 + .../vx/internal/src/ops/vsi_nn_op_rmsnorm.c | 202 ++ src/tim/vx/internal/src/ops/vsi_nn_op_shape.c | 196 ++ .../src/ops/vsi_nn_op_strided_slice.c | 9 + src/tim/vx/internal/src/ops/vsi_nn_op_topk.c | 7 +- .../src/ops/vsi_nn_op_upsamplescale.c | 11 +- .../src/utils/vsi_nn_code_generator.c | 2 + src/tim/vx/internal/src/utils/vsi_nn_util.c | 22 +- src/tim/vx/internal/src/vsi_nn_context.c | 2 +-
src/tim/vx/internal/src/vsi_nn_graph.c | 8 +- .../vx/internal/src/vsi_nn_internal_node.c | 6 + src/tim/vx/internal/src/vsi_nn_log.c | 2 +- .../vx/internal/src/vsi_nn_pre_post_process.c | 1 + src/tim/vx/internal/src/vsi_nn_tensor.c | 1 + 69 files changed, 4878 insertions(+), 590 deletions(-) mode change 100755 => 100644 src/tim/vx/internal/include/interface/ops.def create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_rmsnorm.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_shape.h create mode 100644 src/tim/vx/internal/src/kernel/vx/rms_norm_vx.c create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/bilinear_grid_sample_reflect.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_BF16_to_BF16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_F16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_U8.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I16_to_I16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I8_to_I8.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_U8_to_U8.vx create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_shape.c diff --git a/VERSION b/VERSION index da44c7f34..3c43790f5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.1.50 +1.2.6 diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def old mode 100755 new mode 100644 index 6c879e9c9..fe42d4533 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -197,3 +197,5 @@ DEF_OP(RESIZE_3D) DEF_OP(REDUCEL2) DEF_OP(CROP_AND_RESIZE) DEF_OP(TAN) +DEF_OP(RMSNORM) +DEF_OP(SHAPE) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h index 5150b0e4a..d81bd8408 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -26,6 +26,8 @@ #define _VSI_NN_KERNEL_H #include +#include + #include "vsi_nn_log.h" #include "vsi_nn_ops.h" #include "vsi_nn_graph.h" @@ -81,6 +83,7 @@ typedef enum U4, FP8_E4M3, FP8_E5M2, + INVALID_DTYPE, } VSI_PUBLIC_TYPE vsi_nn_kernel_dtype_e; typedef enum @@ -532,9 +535,8 @@ static VSI_INLINE_API vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype return FP8_E5M2; default: VSILOGE("error data type %d", dtype); - break; + return INVALID_DTYPE; } - return I8; } /* vsi_nn_kernel_map_dtype() */ static VSI_INLINE_API vsi_nn_type_e vsi_nn_dtype_map_kernel diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h index 6adc8964d..ba9891925 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h @@ -43,6 +43,7 @@ typedef struct _vsi_nn_resize_internal_param vsi_bool half_pixel_centers; float factor; vsi_enum layout; + vsi_enum type; } vsi_nn_resize_internal_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_rmsnorm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_rmsnorm.h new file mode 100644 index 000000000..12c8113df --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_rmsnorm.h @@ -0,0 +1,54 
@@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_RMSNORM_H +#define _VSI_NN_OP_RMSNORM_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +typedef struct _rmsnorm_local_data_t { + int32_t placeholder; +} rmsnorm_local_data_t; + +typedef struct _vsi_nn_rmsnorm_param +{ + struct _rmsnorm_local_data_t* local; + float eps; + int32_t axis; +} vsi_nn_rmsnorm_param; + +_compiler_assert(offsetof(vsi_nn_rmsnorm_param, local) == 0, \ + vsi_nn_rmsnorm_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_shape.h b/src/tim/vx/internal/include/ops/vsi_nn_op_shape.h new file mode 100644 index 000000000..1ae2a5bb3 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_shape.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_SHAPE_H +#define _VSI_NN_OP_SHAPE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_shape_param +{ + struct _shape_local_data_t* local; + // Add parameters here +} vsi_nn_shape_param; +_compiler_assert(offsetof(vsi_nn_shape_param, local) == 0, \ + vsi_nn_shape_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index 9d32dfb7c..ed7857159 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -33,6 +33,14 @@ extern "C" { #endif +/* + * A helper union for fp32 bit casting. + */ +typedef union { + float val; + uint32_t data; +} fp32_bit_cast_t; + static VSI_INLINE_API vsi_bool type_is_integer ( const vsi_nn_type_e type @@ -203,9 +211,11 @@ static VSI_INLINE_API vsi_bool fp32_is_inf float val ) { - uint32_t u_value = *(uint32_t*)&val; + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = val; + uint32_t fp32_data = fp32_bit_cast.data; - if ((u_value & (uint32_t)VSI_NN_INT32_MAX) == (uint32_t)VSI_NN_FLOAT32_INF) + if ((fp32_data & (uint32_t)VSI_NN_INT32_MAX) == (uint32_t)VSI_NN_FLOAT32_INF) { return TRUE; } @@ -232,7 +242,9 @@ static VSI_INLINE_API int32_t fp32_to_affine if (fp32_is_inf(in) != 0) { - uint32_t sign = (*(uint32_t*)&in) >> 31; + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = in; + uint32_t sign = fp32_bit_cast.data >> 31; data = sign == 1 ? (int32_t)min_range : (int32_t)max_range; } @@ -277,7 +289,9 @@ static VSI_INLINE_API int32_t fp32_to_dfp if (fp32_is_inf(in) != 0) { - uint32_t sign = (*(uint32_t*)&in) >> 31; + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = in; + uint32_t sign = fp32_bit_cast.data >> 31; data = sign == 1 ? (int32_t)min_range : (int32_t) max_range; } @@ -373,8 +387,9 @@ static VSI_INLINE_API float bfp16_to_fp32 int16_t in ) { - int32_t t1, t2, t3; + uint32_t t1, t2, t3; float out; + fp32_bit_cast_t fp32_bit_cast; t1 = in & 0x00FF; // Mantissa t2 = in & 0xFF00; // Sign bit + Exponent @@ -384,9 +399,10 @@ static VSI_INLINE_API float bfp16_to_fp32 t2 <<= 16; // Shift (sign + Exponent) bit into position t1 |= t2; // Re-insert (sign + Exponent) bit - *((uint32_t*)&out) = t1; + fp32_bit_cast.data = t1; + out = fp32_bit_cast.val; - return t3 == 0 ? 0 : out; + return t3 == 0 ? 0.0f : out; } /* bfp16_to_fp32() */ static VSI_INLINE_API uint16_t fp32_to_fp16 @@ -394,10 +410,12 @@ static VSI_INLINE_API uint16_t fp32_to_fp16 float in ) { - uint32_t fp32 = *((uint32_t *) &in); - uint32_t t1 = (fp32 & 0x80000000u) >> 16; /* sign bit. */ - uint32_t t2 = (fp32 & 0x7F800000u) >> 13; /* Exponent bits */ - uint32_t t3 = (fp32 & 0x007FE000u) >> 13; /* Mantissa bits, no rounding */ + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = in; + uint32_t fp32_data = fp32_bit_cast.data; + uint32_t t1 = (fp32_data & 0x80000000u) >> 16; /* sign bit. 
*/ + uint32_t t2 = (fp32_data & 0x7F800000u) >> 13; /* Exponent bits */ + uint32_t t3 = (fp32_data & 0x007FE000u) >> 13; /* Mantissa bits, no rounding */ uint32_t fp16 = 0u; if( t2 >= 0x023c00u ) { @@ -420,8 +438,10 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16 float in ) { - uint32_t fp32 = *((unsigned int *) &in); - uint32_t t1 = fp32 >> 16; + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = in; + uint32_t fp32_data = fp32_bit_cast.data; + uint32_t t1 = fp32_data >> 16; return (uint16_t) t1; } /* fp32_to_bfp16() */ @@ -435,10 +455,12 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne Convert a float point to bfloat16, with round-nearest-to-even as rounding method. */ - uint32_t fp32 = *((unsigned int *) &in); + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = in; + uint32_t fp32_data = fp32_bit_cast.data; uint16_t out; - uint32_t lsb = (fp32 >> 16) & 1; /* Least significant bit of resulting bfloat. */ + uint32_t lsb = (fp32_data >> 16) & 1; /* Least significant bit of resulting bfloat. */ uint32_t rounding_bias = 0x7fff + lsb; if ( VSI_NN_FLOAT32_NAN == in ) @@ -447,8 +469,8 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne } else { - fp32 += rounding_bias; - out = (uint16_t) (fp32 >> 16); + fp32_data += rounding_bias; + out = (uint16_t) (fp32_data >> 16); } return out; @@ -466,7 +488,9 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) { float fp8_f32 = in / scale; - int32_t in_val = *((int32_t*)&fp8_f32); + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = fp8_f32; + uint32_t in_val = fp32_bit_cast.data; uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */ uint32_t in_exp = (in_val >> FLOAT_MANTISSA_SIZE) & 0xFF; /* bit[30: 24] is exp */ @@ -512,7 +536,9 @@ static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) { static VSI_INLINE_API uint8_t fp32_to_fp8_e5m2(float in, const float scale) { float fp8_f32 = in / scale; - int32_t in_val = *((int32_t*)&fp8_f32); + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.val = fp8_f32; + uint32_t in_val = fp32_bit_cast.data; uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */ uint32_t in_exp = (in_val >> FLOAT_MANTISSA_SIZE) & 0xFF; /* bit[30: 24] is exp */ uint32_t in_man = (in_val & 0x7FFFFF); /* low 23 bits is man */ @@ -561,6 +587,7 @@ static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) { uint32_t exponentOut = 0; uint32_t mantissaOut = 0; uint32_t out_u = 0; + fp32_bit_cast_t fp32_bit_cast; { uint32_t signIn; @@ -610,7 +637,8 @@ static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) { } final: out_u = signOut << 31 | exponentOut << 23 | mantissaOut; - val_fp32 = *((float*)&out_u); + fp32_bit_cast.data = out_u; + val_fp32 = fp32_bit_cast.val; return val_fp32 * scale; } /* fp8_e4m3_to_fp32() */ @@ -621,6 +649,7 @@ static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) { uint32_t exponentOut = 0; uint32_t mantissaOut = 0; uint32_t out_u = 0; + fp32_bit_cast_t fp32_bit_cast; { uint32_t signIn; @@ -670,7 +699,8 @@ static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) { } final: out_u = signOut << 31 | exponentOut << 23 | mantissaOut; - val_fp32 = *((float*)&out_u); + fp32_bit_cast.data = out_u; + val_fp32 = fp32_bit_cast.val; return val_fp32 * scale; } /* fp8_e5m2_to_fp32() */ diff --git a/src/tim/vx/internal/include/vsi_nn_context.h 
b/src/tim/vx/internal/include/vsi_nn_context.h index 4ac9f6113..477cb19f4 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -60,9 +60,7 @@ typedef struct _vsi_nn_hw_config_t { char target_name[VSI_NN_MAX_TARGET_NAME]; vsi_nn_hw_evis_t evis; -#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT uint32_t subGroupSize; -#endif uint32_t use_40bits_va; uint32_t support_stream_processor; uint32_t sp_exec_count; diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 173be9409..a18e89494 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -210,6 +210,8 @@ #include "ops/vsi_nn_op_resize_3d.h" #include "ops/vsi_nn_op_reducel2.h" #include "ops/vsi_nn_op_crop_and_resize.h" +#include "ops/vsi_nn_op_rmsnorm.h" +#include "ops/vsi_nn_op_shape.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" #include "ops/vsi_nn_op_inverse_sigmoid.h" @@ -408,6 +410,8 @@ typedef union _vsi_nn_nn_param vsi_nn_resize_3d_param resize_3d; vsi_nn_reducel2_param reducel2; vsi_nn_crop_and_resize_param crop_and_resize; + vsi_nn_rmsnorm_param rmsnorm; + vsi_nn_shape_param shape; void* client_param; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 2b7e1bd04..0fafc241d 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 2 -#define VSI_NN_VERSION_PATCH 2 +#define VSI_NN_VERSION_PATCH 6 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c index b36ec6b14..5f34aaa41 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c @@ -267,7 +267,7 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t node_params[_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM]; vsi_nn_kernel_node_t node = NULL; - vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; VSI_UNREFERENCED(params); diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c index 94a2d704a..631c2edea 100644 --- a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c +++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c @@ -55,8 +55,17 @@ static vsi_status op_compute vsi_nn_kernel_param_t * param = NULL; vsi_nn_custom_warp_affine_param * p; p = &(self->nn_param.custom_warp_affine); - + float matrix_shape[6] = { 1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f}; param = vsi_nn_kernel_param_create(); + //Unlike OpenCV, we use the coordinate of dst and matrix to calculate the coordinate of src in custom_warp_affine. + //Therefore, matrix M_ovx in custom_warp_affine is different from matrix M_cv in OpenCV. + //We get M_ovx by transposing the inverse of M_cv. 
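+//For example (illustrative values only, not taken from the driver): for a pure translation
+//M_cv = [[1, 0, tx], [0, 1, ty]], the inverse is [[1, 0, -tx], [0, 1, -ty]], so after the
+//transpose M_ovx = [[1, 0], [0, 1], [-tx, -ty]].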
+ //inv_M = cv2.invertAffineTransform(M_cv); M_ovx=inv_M.transpose(1,0) + if (p->matrix == NULL) + { + p->matrix = matrix_shape; + } + vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 6 ); vsi_nn_kernel_param_add_int32( param, "type", p->type); vsi_nn_kernel_param_add_int32( param, "rgb_type", p->rgb_type); diff --git a/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c b/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c index 84811fd82..095ed2418 100644 --- a/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c @@ -47,18 +47,26 @@ typedef enum } _internal_kernel_e; #define _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() "bilinear_grid_sample" +#define _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE() "bilinear_grid_sample_reflect" #define STR(a) #a // Add kernel hashtable here -#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ - ((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE)) +#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, REFLECT) \ + ((IN1_DTYPE << 24) | (IN0_DTYPE << 16) | (OUT_DTYPE << 8) | (REFLECT)) #define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ { \ - BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ - CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ - _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() \ + BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() \ + } + +#define PACK_REFLECT_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { \ + BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_reflect_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE() \ } typedef struct @@ -73,6 +81,8 @@ static const _kernel_map_type _bilinear_grid_sample_kernel_map[] = // Register kernel here PACK_KERNEL_MAP(F32, F32, F32 ), PACK_KERNEL_MAP(U8, U8, U8), + PACK_REFLECT_KERNEL_MAP(F32, F32, F32), + PACK_REFLECT_KERNEL_MAP(U8, U8, U8), }; @@ -95,23 +105,24 @@ static vx_param_description_t _bilinear_grid_sample_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _BILINEAR_GRID_SAMPLE_PARAM_NUM 8 -#define _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM \ +#define _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM 14 +#define _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_NUM 12 +#define _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM \ _cnt_of_array(_bilinear_grid_sample_kernel_param_def) + #define SCALAR_HALF_INPUT0_W (3) #define SCALAR_HALF_INPUT0_H (4) #define SCALAR_ADD_VALUE_W (5) #define SCALAR_ADD_VALUE_H (6) #define SCALAR_DEPTH (7) -#define SCALAR_INPUT0_SCALE (8) -#define SCALAR_INPUT0_TAIL (9) -#define SCALAR_INPUT1_SCALE (10) -#define SCALAR_INPUT1_TAIL (11) -#define SCALAR_OUTPUT_SCALE (12) -#define SCALAR_OUTPUT_TAIL (13) /* * Kernel initializer @@ -170,7 +181,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const 
inputs, vsi_nn_tensor_t * const * const outputs, - vsi_bool* is_use_u8_kernel + vsi_bool* is_use_u8_kernel, + int32_t is_reflect_mode ) { vsi_status status = VSI_FAILURE; @@ -199,14 +211,29 @@ static vsi_status _query_kernel out_dtype = F32; } if ((U8 == in0_dtype) || (U8 == out_dtype)) { - param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM; + if (is_reflect_mode) + { + param_def_size = _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM; + } + else{ + param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM; + } + *is_use_u8_kernel = TRUE; } else { - param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_NUM; + if (is_reflect_mode) + { + param_def_size = _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_NUM; + } + else + { + param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_NUM; + } + *is_use_u8_kernel = FALSE; } - key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype); + key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, is_reflect_mode); for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { @@ -245,7 +272,7 @@ static vsi_nn_kernel_node_t _setup { vsi_nn_kernel_node_t node = NULL; vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM]; + vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM]; vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; uint32_t final_in1_rank = 0; vsi_nn_tensor_t* rs_tensors = NULL; @@ -263,11 +290,14 @@ static vsi_nn_kernel_node_t _setup vsi_bool is_use_u8_kernel = FALSE; int32_t align_corners = vsi_nn_kernel_param_get_int32(params, "align_corners"); + int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode"); uint32_t pad_val = 0; int32_t depth = 0; vsi_nn_kernel_dtype_e in0_dtype; float half_input0_w, half_input0_h, add_float_value_w, add_float_value_h; + int32_t is_reflect_mode = 0; + float min_val_w, span_w, min_val_h, span_h; // Check if gpu can support the size if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size, @@ -280,6 +310,11 @@ static vsi_nn_kernel_node_t _setup return NULL; } + if (pad_mode == VSI_NN_PAD_MODE_REFLECT) + { + is_reflect_mode = 1; + } + final_tensors[0] = inputs[0]; if (inputs[1]->attr.dim_num >= 3) { @@ -313,12 +348,35 @@ static vsi_nn_kernel_node_t _setup add_float_value_h = half_input0_h - 0.5f; } + if (is_reflect_mode) + { + float low_w, low_h, high_w, high_h; + if (align_corners) + { + low_w = 0; + low_h = 0; + high_w = 2 * (float)(in0_width - 1); + high_h = 2 * (float)(in0_height - 1); + } + else + { + low_w = -1; + low_h = -1; + high_w = 2 * (float)in0_width - 1; + high_h = 2 * (float)in0_height - 1; + } + min_val_w = low_w / 2; + span_w = (high_w - low_w) / 2; + min_val_h = low_h / 2; + span_h = (high_h - low_h) / 2; + } + depth = (int32_t)inputs[0]->attr.size[2]; in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); if (U8 == in0_dtype) { pad_val = inputs[0]->attr.dtype.zero_point; } - status = _query_kernel(kernel, inputs, outputs, &is_use_u8_kernel); + status = _query_kernel(kernel, inputs, outputs, &is_use_u8_kernel, is_reflect_mode); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -326,7 +384,7 @@ static vsi_nn_kernel_node_t _setup { size_t node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_NUM; /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM, + vsi_nn_kernel_node_pack_io( node_params, _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM, final_tensors, input_num, &final_tensors[2], output_num ); 
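/* Illustration (assumed values, for clarity only): with align_corners == 0 and
 * in0_width == 4, low_w = -1 and high_w = 7, so min_val_w = -0.5 and span_w = 4;
 * the reflect kernel folds every x coordinate back into
 * [min_val_w, min_val_w + span_w) = [-0.5, 3.5) before sampling. The four reflect
 * scalars are appended below only when is_reflect_mode is set. */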
node_params[SCALAR_HALF_INPUT0_W] = vsi_nn_kernel_scalar_create( graph, F32, &half_input0_w ); node_params[SCALAR_HALF_INPUT0_H] = vsi_nn_kernel_scalar_create( graph, F32, &half_input0_h ); @@ -335,13 +393,19 @@ static vsi_nn_kernel_node_t _setup node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth ); if (is_use_u8_kernel) { - node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale ); - node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input0_tail ); - node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1_scale ); - node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input1_tail ); - node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); - node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); - node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM; + node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale ); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input0_tail ); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input1_scale ); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input1_tail ); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + } + if (is_reflect_mode) + { + node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &min_val_w); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &span_w); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &min_val_h); + node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &span_h); } /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); @@ -351,19 +415,34 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_W]); vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_H]); vsi_nn_kernel_scalar_release(&node_params[SCALAR_DEPTH]); + node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_NUM; if (is_use_u8_kernel) { - vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_SCALE]); - vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_TAIL]); - vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_SCALE]); - vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_TAIL]); - vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_SCALE]); - vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_TAIL]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + } + if (is_reflect_mode) + { + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); + vsi_nn_kernel_scalar_release(&node_params[node_params_num++]); } { // Set default border mode. 
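/* For constant padding the border is filled with pad_val (the input zero point for
 * U8 inputs); for the other padding modes VX_BORDER_REPLICATE is used, so
 * out-of-range reads clamp to the nearest edge pixel. */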
vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U32 = pad_val; + if (pad_mode == VSI_NN_PAD_MODE_CONSTANT) + { + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = pad_val; + } + else + { + border.mode = VX_BORDER_REPLICATE; + } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border)); CHECK_STATUS(status); diff --git a/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c index 82838648c..abb4ed4e3 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c @@ -244,7 +244,10 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer) final: #define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(input_attr0); + SAFE_FREE_TENSOR_ATTR(input_attr1); SAFE_FREE_TENSOR_ATTR(output_attr); + return status; } /* _gather_elements_initializer() */ diff --git a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c index aeb8b4d6c..67d4d932b 100644 --- a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c @@ -34,6 +34,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" #include "kernel/vsi_nn_kernel.h" __BEGIN_DECLS @@ -1489,8 +1490,8 @@ static vsi_nn_kernel_node_t _setup float twoLogE = 2 * logE; uint32_t uint_min = 0xFBFFFFFF; uint32_t uint_max = 0x7BFFFFFF; - float float_min = *(vx_float32 *)&uint_min; - float float_max = *(vx_float32 *)&uint_max; + float float_min = 0.0f; + float float_max = 0.0f; float scale_val[9] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; float tail_val[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; vsi_bool is_u8_type = FALSE; @@ -1499,6 +1500,12 @@ static vsi_nn_kernel_node_t _setup size_t lstm_activation_in_out_num = 0; uint32_t i; + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.data = uint_min; + float_min = fp32_bit_cast.val; + fp32_bit_cast.data = uint_max; + float_max = fp32_bit_cast.val; + _is_ln = vsi_nn_kernel_param_get_int32( params, "_is_ln" ); _is_cifg = vsi_nn_kernel_param_get_int32( params, "_is_cifg" ); _is_proj = vsi_nn_kernel_param_get_int32( params, "_is_proj" ); diff --git a/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c b/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c index cc6d53800..8eee2c474 100644 --- a/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c @@ -266,6 +266,7 @@ static vsi_nn_kernel_node_t _setup vsi_bool is_use_u8_kernel = FALSE; int32_t align_corners = vsi_nn_kernel_param_get_int32(params, "align_corners"); + int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode"); uint32_t pad_val = 0; int32_t depth = 0; vsi_nn_kernel_dtype_e in0_dtype; @@ -282,6 +283,11 @@ static vsi_nn_kernel_node_t _setup return NULL; } + if (pad_mode == VSI_NN_PAD_MODE_REFLECT) + { + return NULL; + } + final_tensors[0] = inputs[0]; if (inputs[1]->attr.dim_num >= 3) { final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0]; @@ -382,8 +388,15 @@ static vsi_nn_kernel_node_t _setup { // Set default border mode. 
vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U32 = pad_val; + if (pad_mode == VSI_NN_PAD_MODE_CONSTANT) + { + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = pad_val; + } + else + { + border.mode = VX_BORDER_REPLICATE; + } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border)); CHECK_STATUS(status); diff --git a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c index a66b89b3e..eb0e556fb 100644 --- a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c @@ -36,6 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" __BEGIN_DECLS @@ -242,6 +243,7 @@ static vsi_nn_kernel_node_t _setup vsi_size_t suffix_dim_size = 0; int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" ); vsi_nn_kernel_dtype_e out_dtype; + fp32_bit_cast_t fp32_bit_cast; uint32_t data[2] = {0}; float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" ); float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" ); @@ -258,8 +260,11 @@ static vsi_nn_kernel_node_t _setup } else { - data[0] = *(uint32_t*)&on_value; - data[1] = *(uint32_t*)&off_value; + fp32_bit_cast.val = on_value; + data[0] = fp32_bit_cast.data; + + fp32_bit_cast.val = off_value; + data[1] = fp32_bit_cast.data; } axis = axis == -1 ? (int32_t)inputs[0]->attr.dim_num : (int32_t)inputs[0]->attr.dim_num - axis; diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c index b8cdfd086..a21d29093 100644 --- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -34,20 +34,24 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS #define _TOPK_KERNEL_SOURCE "topk" #define STR(a) #a // Add kernel hashtable here -#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \ - ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) ) +#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, SECTION ) \ + ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) | (SECTION << 26)) #define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \ - { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \ + { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, 0 ), \ CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ _TOPK_KERNEL_SOURCE } +#define PACK_MERGE_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, 1 ), \ + CVIVANTE_NAMESPACE("cl.topk_stage_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ + "topk2" } + #define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) ) #define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ @@ -111,6 +115,9 @@ static const _kernel_map_type _topk_kernel_map[] = PACK_KERNEL_MAP( F32, I32, 4 ), PACK_KERNEL_MAP( F32, I32, 5 ), PACK_KERNEL_MAP( F32, I32, 6 ), + + PACK_MERGE_KERNEL_MAP(U32, U32), + PACK_MERGE_KERNEL_MAP(I32, I32), }; static const _kernel_map_type _topk_odd_even_sort_kernel_map[] = @@ -254,7 +261,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, - int32_t num_stages + int32_t num_stages, + vsi_bool is_bitnoic_segment ) { vsi_status status = VSI_FAILURE; @@ -272,21 +280,23 @@ static vsi_status 
_query_kernel in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + num_stages = is_bitnoic_segment ? 0 : num_stages; + switch (_PACK_SELECT_KEY(in_dtype, out_dtype)) { case _PACK_SELECT_KEY(F32, F32): case _PACK_SELECT_KEY(F16, F16): - key = TOPK_HASH_KEY( F32, F32, num_stages ); + key = TOPK_HASH_KEY( F32, F32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(U32, U32): case _PACK_SELECT_KEY(U16, U16): case _PACK_SELECT_KEY(U8, U8): - key = TOPK_HASH_KEY( U32, U32, num_stages ); + key = TOPK_HASH_KEY( U32, U32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(I32, I32): case _PACK_SELECT_KEY(I16, I16): case _PACK_SELECT_KEY(I8, I8): - key = TOPK_HASH_KEY( I32, I32, num_stages ); + key = TOPK_HASH_KEY( I32, I32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(F32, U32): case _PACK_SELECT_KEY(F16, U32): @@ -294,7 +304,7 @@ static vsi_status _query_kernel case _PACK_SELECT_KEY(F16, U16): case _PACK_SELECT_KEY(F32, U8): case _PACK_SELECT_KEY(F16, U8): - key = TOPK_HASH_KEY( F32, U32, num_stages ); + key = TOPK_HASH_KEY( F32, U32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(F32, I32): case _PACK_SELECT_KEY(F16, I32): @@ -302,7 +312,7 @@ static vsi_status _query_kernel case _PACK_SELECT_KEY(F16, I16): case _PACK_SELECT_KEY(F32, I8): case _PACK_SELECT_KEY(F16, I8): - key = TOPK_HASH_KEY( F32, I32, num_stages ); + key = TOPK_HASH_KEY( F32, I32, num_stages, is_bitnoic_segment ); break; default: break; @@ -440,7 +450,12 @@ static vsi_nn_kernel_node_t _setup int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k"); int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0); vsi_bool is_odd_even_sort = FALSE; + vsi_bool is_bitnoic_segment = FALSE; size_t param_num = _TOPK_PARAM_NUM; + int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2); + vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); float outputScale = vsi_nn_get_tensor_scale(outputs[0]); @@ -471,9 +486,14 @@ static vsi_nn_kernel_node_t _setup rs_tensors[0] = vsi_nn_reshape_tensor( graph, inputs[0], shape[0], 2 ); - if (num_stages < 7) + is_bitnoic_segment = (num_stages >= 9) && (top_k <= 512 && max_stages > 9) && + type0 == type1 && (type0 == U8 || type0 == I8 || type0 == I16 || type0 == U16 || type0 == I32 || type0 == U32); + num_stages = is_bitnoic_segment ? 9 : num_stages; + max_stages = is_bitnoic_segment ? 
max_stages : 7; + + if (num_stages < max_stages || is_bitnoic_segment) { - status = _query_kernel( kernel, inputs, outputs, num_stages ); + status = _query_kernel( kernel, inputs, outputs, num_stages, is_bitnoic_segment ); rs_tensors[1] = vsi_nn_reshape_tensor( graph, outputs[0], shape[1], 2 ); diff --git a/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c index a94c93c8c..ed1e295fb 100644 --- a/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c @@ -52,15 +52,26 @@ typedef enum #define _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(_input_type, _output_type) \ "bilinear_grid_sample_" #_input_type "_to_" #_output_type +#define _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE(_input_type, _output_type) \ + "bilinear_grid_sample_reflect_" #_input_type "_to_" #_output_type + // Add kernel hashtable here -#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ - ((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE)) +#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, REFLECT) \ + ((IN1_DTYPE << 24) | (IN0_DTYPE << 16) | (OUT_DTYPE << 8) | (REFLECT)) + #define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ - { \ - BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ - CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ - _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \ - } + { \ + BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \ + } + +#define PACK_REFLECT_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { \ + BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_reflect_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \ + } typedef struct { @@ -83,6 +94,18 @@ static const _kernel_map_type _bilinear_grid_sample_kernel_map[] = PACK_KERNEL_MAP(I16, I16, I16), PACK_KERNEL_MAP(I8, I8, I8), PACK_KERNEL_MAP(BF16, BF16, BF16), + PACK_REFLECT_KERNEL_MAP(F16, F32, F16), + PACK_REFLECT_KERNEL_MAP(F16, U8, F16), + PACK_REFLECT_KERNEL_MAP(F16, F16, F16), + PACK_REFLECT_KERNEL_MAP(F16, F32, U8), + PACK_REFLECT_KERNEL_MAP(F16, F16, U8), + PACK_REFLECT_KERNEL_MAP(F16, U8, U8), + PACK_REFLECT_KERNEL_MAP(U8, U8, U8), + PACK_REFLECT_KERNEL_MAP(U8, F16, U8), + PACK_REFLECT_KERNEL_MAP(U8, F32, U8), + PACK_REFLECT_KERNEL_MAP(I16, I16, I16), + PACK_REFLECT_KERNEL_MAP(I8, I8, I8), + PACK_REFLECT_KERNEL_MAP(BF16, BF16, BF16), }; @@ -96,18 +119,20 @@ static vx_param_description_t _bilinear_grid_sample_kernel_param_def[] = {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; -#define _BILINEAR_GRID_SAMPLE_PARAM_NUM _cnt_of_array( _bilinear_grid_sample_kernel_param_def ) +#define _BILINEAR_GRID_SAMPLE_PARAM_NUM \ +_cnt_of_array( _bilinear_grid_sample_kernel_param_def ) #define SCALAR_ALIGN_CORNERS (3) /* * Kernel initializer */ -DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) +static vsi_status _bilinear_grid_sample_initializer_base ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, - size_t param_size + size_t param_size, + vsi_bool 
is_reflect_mode ) { vsi_status status = VSI_FAILURE; @@ -135,6 +160,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) int32_t input1ZP = 0; float output_scale = 1.0; int32_t outputZP = 0; + float min_val_wh[4] = { 0 }; + float span_wh[4] = { 0 }; VSI_UNREFERENCED(param_size); @@ -156,6 +183,7 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) (vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); CHECK_STATUS_FAIL_GOTO(status, final); + out_shape = output_attr->shape; in0_shape = input_attr[0]->shape; input0_dtype = input_attr[0]->dtype; @@ -193,6 +221,35 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "add_float_value", add_float_value); status |= vsi_nn_kernel_gpu_add_param(node, "depth", &depth); + if (is_reflect_mode) + { + float low_w, low_h, high_w, high_h; + if (align_corners) + { + low_w = 0; + low_h = 0; + high_w = 2 * (float)(in0_width - 1); + high_h = 2 * (float)(in0_height - 1); + } + else + { + low_w = -1; + low_h = -1; + high_w = 2 * (float)in0_width - 1; + high_h = 2 * (float)in0_height - 1; + } + min_val_wh[0] = low_w / 2; + span_wh[0] = (high_w - low_w) / 2; + min_val_wh[1] = low_h / 2; + span_wh[1] = (high_h - low_h) / 2; + min_val_wh[2] = min_val_wh[0]; + min_val_wh[3] = min_val_wh[1]; + span_wh[2] = span_wh[0]; + span_wh[3] = span_wh[1]; + status |= vsi_nn_kernel_gpu_add_param(node, "span_wh", span_wh); + status |= vsi_nn_kernel_gpu_add_param(node, "min_val_wh", min_val_wh); + } + { gpu_dp_inst_t uniFp16toFp32_part0_4x4 = { { @@ -538,6 +595,28 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) +DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) +( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t* param, + size_t param_size + ) +{ + return _bilinear_grid_sample_initializer_base( + node, param, param_size, vx_false_e); +} + +DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_reflect_initializer) +( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t* param, + size_t param_size + ) +{ + return _bilinear_grid_sample_initializer_base( + node, param, param_size, vx_true_e); +} + /* * Query kernel */ @@ -545,7 +624,8 @@ static vsi_status _query_kernel ( vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs + vsi_nn_tensor_t * const * const outputs, + int32_t is_reflect_mode ) { vsi_status status = VSI_FAILURE; @@ -563,7 +643,16 @@ static vsi_status _query_kernel in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype); + key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, is_reflect_mode); + + if (is_reflect_mode) + { + initializer = _bilinear_grid_sample_reflect_initializer; + } + else + { + initializer = _bilinear_grid_sample_initializer; + } for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { @@ -605,13 +694,21 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_PARAM_NUM]; vsi_nn_kernel_node_t node = NULL; vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; - uint32_t final_in1_rank = 0; + vsi_size_t final_out_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; + uint32_t final_in1_rank = 0, final_out_rank = 0; vsi_nn_tensor_t* rs_tensors = NULL; + vsi_nn_tensor_t* rs_out_tensors = NULL; vsi_nn_tensor_t* final_tensors[3] = {NULL}; vsi_nn_kernel_dtype_e in0_dtype; uint32_t 
pad_val = 0; int32_t align_corners = vsi_nn_kernel_param_get_int32(params, "align_corners"); + int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode"); + int32_t is_reflect_mode = 0; + vsi_size_t in_size_x = inputs[1]->attr.size[1]; + vsi_size_t in_size_y = inputs[1]->attr.dim_num >= 3 ? inputs[1]->attr.size[2] : 1; + vsi_size_t new_size_x = in_size_x, new_size_y = in_size_y; + vsi_bool is_reshape_out = vx_false_e; // Check if gpu can support the size if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size, @@ -624,12 +721,63 @@ static vsi_nn_kernel_node_t _setup return NULL; } + if (pad_mode == VSI_NN_PAD_MODE_REFLECT) + { + is_reflect_mode = 1; + } + final_tensors[0] = inputs[0]; + is_reshape_out = vx_false_e; if (inputs[1]->attr.dim_num >= 3) { + vsi_size_t shape_x[2]; + vsi_size_t out_shape_x[2]; + vsi_size_t out_rank_x; + shape_x[0] = in_size_x; + shape_x[1] = in_size_y; + vsi_nn_kernel_optimize_element_shape(shape_x, 2, out_shape_x, &out_rank_x); + if (out_rank_x == 2) + { + new_size_x = out_shape_x[0]; + new_size_y = out_shape_x[1]; + } - final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0]; - final_shape[1] = inputs[1]->attr.size[2]; + if ((new_size_x == in_size_x) && (new_size_y == in_size_y)) + { + is_reshape_out = vx_false_e; + } + else if ((new_size_x * 2) >= GPU_TENSOR_MAX_WIDTH) + { + is_reshape_out = vx_false_e; + } + else + { + is_reshape_out = vx_true_e; + } + + if (is_reshape_out == vx_false_e) + { + new_size_x = in_size_x; + new_size_y = in_size_y; + if ((new_size_x < new_size_y) && ((new_size_y * 2) < GPU_TENSOR_MAX_WIDTH)) + { + vsi_size_t tmp = new_size_x; + new_size_x = new_size_y; + new_size_y = tmp; + is_reshape_out = vx_true_e; + } + } + + } + + if (((new_size_x * 2) >= GPU_TENSOR_MAX_WIDTH) || (new_size_y >= GPU_TENSOR_MAX_WIDTH)) + { + return NULL; + } + + if (inputs[1]->attr.dim_num >= 3) { + final_shape[0] = new_size_x * inputs[1]->attr.size[0]; + final_shape[1] = new_size_y; final_shape[2] = 1; final_shape[3] = inputs[1]->attr.dim_num > 3 ? inputs[1]->attr.size[3] : 1; final_in1_rank = @@ -643,14 +791,32 @@ static vsi_nn_kernel_node_t _setup } else { final_tensors[1] = inputs[1]; } - final_tensors[2] = outputs[0]; + + if (is_reshape_out) + { + final_out_shape[0] = new_size_x; + final_out_shape[1] = new_size_y; + final_out_shape[2] = outputs[0]->attr.dim_num > 2 ? outputs[0]->attr.size[2] : 1; + final_out_shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; + final_out_rank = outputs[0]->attr.dim_num; + if (!vsi_nn_kernel_gpu_check_shape(final_out_shape, final_out_rank)) { + return NULL; + } + + rs_out_tensors = vsi_nn_reshape_tensor(graph, outputs[0], final_out_shape, final_out_rank); + final_tensors[2] = rs_out_tensors; + } + else + { + final_tensors[2] = outputs[0]; + } in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); if (U8 == in0_dtype) { pad_val = inputs[0]->attr.dtype.zero_point; } - status = _query_kernel( kernel, inputs, outputs ); + status = _query_kernel( kernel, inputs, outputs, is_reflect_mode); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -662,14 +828,22 @@ static vsi_nn_kernel_node_t _setup node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create(graph, I32, &align_corners); /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _BILINEAR_GRID_SAMPLE_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( + node, node_params, _BILINEAR_GRID_SAMPLE_PARAM_NUM ); VSI_ASSERT(status == VSI_SUCCESS); vsi_nn_kernel_scalar_release(&node_params[SCALAR_ALIGN_CORNERS]); { // Set default border mode. vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U32 = pad_val; + if (pad_mode == VSI_NN_PAD_MODE_CONSTANT) + { + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = pad_val; + } + else + { + border.mode = VX_BORDER_REPLICATE; + } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border)); CHECK_STATUS(status); @@ -678,6 +852,7 @@ static vsi_nn_kernel_node_t _setup } vsi_safe_release_tensor(rs_tensors); + vsi_safe_release_tensor(rs_out_tensors); return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c b/src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c index 012c0408d..9b465badd 100644 --- a/src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c @@ -513,6 +513,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[4] ); vsi_nn_kernel_scalar_release( &node_params[5] ); } + + if ( node ) { // Set default border mode. vx_border_t border; diff --git a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c index 2ec1b1aa6..a393c4559 100644 --- a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c @@ -34,6 +34,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" #include "kernel/vsi_nn_kernel.h" #include "libnnext/vx_lib_nnext.h" @@ -1002,8 +1003,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer) float twoLogE = 2 * logE; uint32_t uint_min = 0xFBFFFFFF; uint32_t uint_max = 0x7BFFFFFF; - float float_min = *(float *)&uint_min; - float float_max = *(float *)&uint_max; + float float_min = 0.0f; + float float_max = 0.0f; float clip_Min_F[4] = {0}; float clip_Max_F[4] = {0}; uint32_t i = 0; @@ -1017,6 +1018,12 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer) vsi_nn_kernel_tensor_attr_t* input_attr[9] = {NULL}; vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL}; + fp32_bit_cast_t fp32_bit_cast; + fp32_bit_cast.data = uint_min; + float_min = fp32_bit_cast.val; + fp32_bit_cast.data = uint_max; + float_max = fp32_bit_cast.val; + status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 5], &_is_ln ); CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 4], &_is_cifg ); diff --git a/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c b/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c index 28ff2d1ae..6554c74a9 100644 --- a/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c @@ -539,6 +539,7 @@ static vsi_nn_kernel_node_t _setup uint32_t pad_val = 0; int32_t align_corners = vsi_nn_kernel_param_get_int32(params, "align_corners"); + int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode"); // Check if gpu can support the size if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size, @@ -551,6 +552,11 @@ static 
vsi_nn_kernel_node_t _setup return NULL; } + if (pad_mode == VSI_NN_PAD_MODE_REFLECT) + { + return NULL; + } + final_tensors[0] = inputs[0]; if (inputs[1]->attr.dim_num >= 3) { @@ -596,8 +602,15 @@ static vsi_nn_kernel_node_t _setup { // Set default border mode. vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U32 = pad_val; + if (pad_mode == VSI_NN_PAD_MODE_CONSTANT) + { + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = pad_val; + } + else + { + border.mode = VX_BORDER_REPLICATE; + } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border)); CHECK_STATUS(status); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c index d9f96b236..167db3e9a 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c @@ -754,7 +754,7 @@ static vsi_nn_kernel_node_t _setup if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size, outputs[0]->attr.dim_num ) ) { - return NULL; + goto final; } if ( width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] && diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c index 0504dff72..17f3bc52e 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c @@ -915,7 +915,7 @@ static vsi_nn_kernel_node_t _setup if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size, outputs[0]->attr.dim_num ) ) { - return NULL; + goto final; } if ( width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] && diff --git a/src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c index 618b33f52..e7b4d445a 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c @@ -290,13 +290,14 @@ static vsi_nn_tensor_t* _create_scale_tensor vsi_nn_tensor_t* scale = NULL; vsi_size_t i = 0; float *scale_data_ptr = NULL; - int *index_data_ptr = NULL; + int32_t *index_data_ptr = NULL; float scale_value = 0; vsi_ssize_t data = 0; - int idx = 0; + int32_t idx = 0; float delta_v = 0; float cubic_coeff_a = -0.5f; vsi_size_t item_count = 4 * output_size; + scale_data_ptr = (float *)malloc(item_count * sizeof(float)); if (scale_data_ptr == NULL) { @@ -316,7 +317,7 @@ static vsi_nn_tensor_t* _create_scale_tensor scale_value = ((float)i + half_pixel_value) * scale_factor - half_pixel_value; data = (vsi_ssize_t)scale_value; delta_v = scale_value - (float)data; - idx = (int)data - 1; + idx = (int32_t)data - 1; index_data_ptr[i] = idx; scale_data_ptr[i * 4 + 0] = cubic_coeff_a * (((delta_v - 4) * (delta_v + 1) + 8) * (delta_v + 1) - 4); @@ -331,11 +332,6 @@ static vsi_nn_tensor_t* _create_scale_tensor attr.vtl = FALSE; scale = vsi_nn_CreateTensorFromData(graph, (uint8_t *)scale_data_ptr, &attr); - if (scale_data_ptr) - { - free (scale_data_ptr); - scale_data_ptr = NULL; - } attr.size[0] = output_size; attr.dim_num = 1; @@ -343,13 +339,11 @@ static vsi_nn_tensor_t* _create_scale_tensor attr.vtl = FALSE; *index = vsi_nn_CreateTensorFromData(graph, (uint8_t *)index_data_ptr, &attr); - if (index_data_ptr) - { - free (index_data_ptr); - 
index_data_ptr = NULL; - } OnError: + vsi_nn_safe_free(scale_data_ptr); + vsi_nn_safe_free(index_data_ptr); + return scale; } diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index 83334269c..331f26298 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -1218,6 +1218,11 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector status = backend->select( graph, inputs, input_num, outputs, output_num, params, &selector ); VSI_ASSERT( status == VSI_SUCCESS ); + + if ( status != VSI_SUCCESS ) { + VSILOGW("Failed to select kernel \"%s\"", kernel_name); + return NULL; + } } else { diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c index d74d6a1c8..0e0b21411 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c @@ -171,7 +171,9 @@ vsi_status vsi_nn_kernel_copy_tensor_patch vsi_nn_kernel_tensor_attr_get_stride( attr, stride ); memset(start, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); - for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + memset(end, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); + + for (i = 0; i < (uint32_t)attr->shape->size; i++) { end[i] = attr->shape->data[i]; if ( attr->dtype != I4 && attr->dtype != U4 ) @@ -490,7 +492,7 @@ vsi_status vsi_nn_kernel_scalar_get_dtype ( vsi_nn_kernel_scalar_t scalar, DTYPE * ptr ) \ { \ vsi_status status; \ - vsi_nn_kernel_dtype_e dtype; \ + vsi_nn_kernel_dtype_e dtype = INVALID_DTYPE; \ if( !ptr ) \ { \ VSILOGE("Pointer to store scalar is null"); \ diff --git a/src/tim/vx/internal/src/kernel/vx/rms_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/rms_norm_vx.c new file mode 100644 index 000000000..df6c7a72a --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/rms_norm_vx.c @@ -0,0 +1,94 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + + +#define REGISTER_RMS_NORM_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_RMS_NORM_OPENVX_KERNEL(rms_norm) +{ + vx_node node = NULL; + +#if (VX_RMS_NORM_VX_SUPPORT) + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + vx_tensor inputs_tensor[2] = {NULL}; + vx_tensor output_tensor = NULL; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + + inputs_tensor[0] = inputs[0]->t; + inputs_tensor[1] = inputs[1]->t; + output_tensor = outputs[0]->t; + + node = vxRMSNormalizationLayer( + graph->g, + eps, + axis, + inputs_tensor, + (uint32_t)input_num, + output_tensor + ); +#else + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); +#endif + + return (vsi_nn_kernel_node_t)node; +} /* rms_norm() */ diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/bilinear_grid_sample_reflect.cl b/src/tim/vx/internal/src/libnnext/ops/cl/bilinear_grid_sample_reflect.cl new file mode 100644 index 000000000..5aed95999 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/bilinear_grid_sample_reflect.cl @@ -0,0 +1,169 @@ +__kernel void bilinear_grid_sample_reflect_F32_F32toF32( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float half_input0_w, + float half_input0_h, + float add_float_value_w, + float add_float_value_h, + int depth, + float min_val_w, + float span_w, + float min_val_h, + float span_h + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1)); + int2 coord_add = (int2)(-1, 1); + + float fx = read_imagef(input1, coord_in1).x; + coord_in1.x = coord_in1.x + 1; + float fy = read_imagef(input1, coord_in1).x; + + fx = fx * half_input0_w + add_float_value_w; + fy = fy * half_input0_h + add_float_value_h; + + if (span_w > 0) + { + fx = fabs(fx - min_val_w); + int flips_x = (int)(fx / span_w); + float extra_x = fx - flips_x * span_w; + fx = (flips_x & 0x01) ? min_val_w + (span_w - extra_x) : min_val_w + extra_x ; + } + else + { + fx = 0; + } + + if (span_h > 0) + { + fy = fabs(fy - min_val_h); + int flips_y = (int)(fy / span_h); + float extra_y = fy - flips_y * span_h; + fy = (flips_y & 0x01) ? 
min_val_h + (span_h - extra_y) : min_val_h + extra_y ; + } + else + { + fy = 0; + } + + float x_f = floor(fx); + float y_f = floor(fy); + float x_lerp = fx - x_f; + float y_lerp = fy - y_f; + int x_index = convert_int(x_f); + int y_index = convert_int(y_f); + int4 coord_in = (int4)(x_index, y_index, 0, 0); + + float4 top_l, top_r, bottom_l, bottom_r, top, bottom, dst; + + while (coord_in.z < depth){ + top_l = read_imagef(input0, coord_in); + coord_in.y++; + bottom_l = read_imagef(input0, coord_in); + coord_in.x++; + bottom_r = read_imagef(input0, coord_in); + coord_in.y--; + top_r = read_imagef(input0, coord_in); + top_r = top_r - top_l; + top = top_l + x_lerp * top_r; + bottom_r = bottom_r - bottom_l; + bottom = bottom_l + x_lerp * bottom_r; + bottom = bottom - top; + dst = top + y_lerp * bottom; + write_imagef(output, coord_out, dst); + coord_in.xz = coord_in.xz + coord_add; + coord_out.z++; + } +} + + +__kernel void bilinear_grid_sample_reflect_U8_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float half_input0_w, + float half_input0_h, + float add_float_value_w, + float add_float_value_h, + int depth, + float in0_scale, + float in0_tail, + float in1_scale, + float in1_tail, + float out_scale, + float out_tail, + float min_val_w, + float span_w, + float min_val_h, + float span_h + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1)); + int2 coord_add = (int2)(-1, 1); + + float fx = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail; + coord_in1.x = coord_in1.x + 1; + float fy = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail; + + fx = fx * half_input0_w + add_float_value_w; + fy = fy * half_input0_h + add_float_value_h; + + if (span_w > 0) + { + fx = fabs(fx - min_val_w); + int flips_x = (int)(fx / span_w); + float extra_x = fx - flips_x * span_w; + fx = (flips_x & 0x01) ? min_val_w + (span_w - extra_x) : min_val_w + extra_x ; + } + else + { + fx = 0; + } + + if (span_h > 0) + { + fy = fabs(fy - min_val_h); + int flips_y = (int)(fy / span_h); + float extra_y = fy - flips_y * span_h; + fy = (flips_y & 0x01) ? 
min_val_h + (span_h - extra_y) : min_val_h + extra_y ; + } + else + { + fy = 0; + } + + float x_f = floor(fx); + float y_f = floor(fy); + float x_lerp = fx - x_f; + float y_lerp = fy - y_f; + int x_index = convert_int(x_f); + int y_index = convert_int(y_f); + int4 coord_in = (int4)(x_index, y_index, 0, 0); + + float4 top_l, top_r, bottom_l, bottom_r, top, bottom; + uint4 dst; + + while (coord_in.z < depth){ + top_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail; + coord_in.y++; + bottom_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail; + coord_in.x++; + bottom_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail; + coord_in.y--; + top_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail; + top_r = top_r - top_l; + top = top_l + x_lerp * top_r; + bottom_r = bottom_r - bottom_l; + bottom = bottom_l + x_lerp * bottom_r; + bottom = bottom - top; + top = top + y_lerp * bottom; + dst = convert_uint4_rte(top * out_scale + out_tail); + write_imageui(output, coord_out, dst); + coord_in.xz = coord_in.xz + coord_add; + coord_out.z++; + } + +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/maxpool.cl b/src/tim/vx/internal/src/libnnext/ops/cl/maxpool.cl index f87e9e449..bd26572ba 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/maxpool.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/maxpool.cl @@ -1,4 +1,4 @@ -#define VSI_FLOAT32_MIN (1.175494351e-38F) +#define VSI_FLOAT32_MIN (-3.40E+38) #define MAXPOOL_QINT(in_name, out_name, src_type, dst_type, max_val, read_func, write_func, conv_func) \ __kernel void maxpool_##in_name##to##out_name( \ @@ -45,7 +45,7 @@ __kernel void maxpool_##in_name##to##out_name( \ { \ src0 = read_func(input, coord_in); \ coord_in.x += dilation_x; \ - maxVal = max(src0, maxVal); \ + maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x; \ } \ } \ \ @@ -101,7 +101,7 @@ __kernel void maxpool_F32toF32( { src0 = read_imagef(input, coord_in); coord_in.x += dilation_x; - maxVal = max(src0, maxVal); + maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x; } } @@ -152,7 +152,7 @@ __kernel void maxpool_U32toF32( { src0 = read_imageui(input, coord_in); coord_in.x += dilation_x; - maxVal = max(src0, maxVal); + maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x; } } @@ -206,7 +206,7 @@ __kernel void maxpool_F32toU32( { src0 = read_imagef(input, coord_in); coord_in.x += dilation_x; - maxVal = max(src0, maxVal); + maxVal.x = src0.x > maxVal.x ? 
src0.x : maxVal.x; } } diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl index 0e6166c4d..dc2038920 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl @@ -51,7 +51,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \ float right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -139,7 +139,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag uint left_elem = local_data[left_id]; \ uint right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -227,7 +227,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag int left_elem = local_data[left_id]; \ int right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -315,7 +315,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \ float right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -403,7 +403,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \ float right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl new file mode 100644 index 000000000..0eae5ab2f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl @@ -0,0 +1,368 @@ + +#define BITONIC_STEP(dtype) \ +void bitonic_step_##dtype(uint num_stages, int lx, \ + __local dtype *local_data, __local int *local_indices) \ +{ \ + for (uint stage = 0; stage < num_stages + 1; ++stage) \ + { \ + uint signo = (lx >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + int left_idx = local_indices[left_id]; \ + int right_idx = local_indices[right_id]; \ + \ + dtype left_elem = local_data[left_id]; \ + dtype right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + 
barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ +} +BITONIC_STEP(int) +BITONIC_STEP(uint) + +#define BITONIC_STEP_ASCEND(dtype) \ +void bitonic_step_ascend_##dtype(uint num_stages, int lx, \ + __local dtype *p_share_k, __local int *p_share_v) \ +{ \ + for (uint stage = 0; stage < num_stages + 1; ++stage) \ + { \ + uint signo = (lx >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + int left_idx = p_share_v[left_id]; \ + int right_idx = p_share_v[right_id]; \ + \ + dtype left_elem = p_share_k[left_id]; \ + dtype right_elem = p_share_k[right_id]; \ + \ + if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \ + { \ + p_share_k[left_id] = right_elem; \ + p_share_k[right_id] = left_elem; \ + \ + p_share_v[left_id] = right_idx; \ + p_share_v[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ +} +BITONIC_STEP_ASCEND(int) +BITONIC_STEP_ASCEND(uint) + +#define BITONIC_MERGE(dtype) \ +void bitonic_merge_##dtype(uint num_stages, int lx, \ + __local dtype *local_data, __local int *local_indices) \ +{ \ + uint stage = num_stages; \ + uint signo = (lx >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + int left_idx = local_indices[left_id]; \ + int right_idx = local_indices[right_id]; \ + \ + dtype left_elem = local_data[left_id]; \ + dtype right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ +} +BITONIC_MERGE(int) +BITONIC_MERGE(uint) + +#define BLOCK_SIZE (512) + +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32 +( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t indices, + float input_scale, + float input_tail, + float output_scale, + float output_tail, + int _num_stages, + int width + ) + { + uint lx = get_local_id(0); + const int init_k = -2147483647; + const int init_v = -2147483647; + const int num_stages = 9; + const int threads_per_block = BLOCK_SIZE; + const int index_minus_1 = threads_per_block * 2 - 1; + uint offset = 0; + uint lx1 = lx + threads_per_block; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + __local int local_data[1536]; + __local int local_indices[1536]; + + int left = read_imagei(input, coord.xy).x; + coord.z += threads_per_block; + int right = read_imagei(input, coord.zy).x; + + local_data[lx] = left; + local_indices[lx] = coord.x; + local_data[lx1] = right; + local_indices[lx1] = coord.z; + + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_int(num_stages, lx, local_data, local_indices); + + int min_data = local_data[511]; + + int *p_share_k = local_data + threads_per_block; + int *p_share_v = local_indices + threads_per_block; + + int limit = (width >> 
10) << 10; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + barrier(CLK_LOCAL_MEM_FENCE); + + for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2) + { + int2 data; + coord.z = coord.x + threads_per_block; + data.x = read_imagei(input, coord.xy).x; + data.y = read_imagei(input, coord.zy).x; + + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = data.y; + p_share_v[lx1] = coord.z; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v); + + if (p_share_k[index_minus_1] < min_data) + { + continue; + } + + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_merge_int(num_stages, lx, local_data, local_indices); + + min_data = local_data[511]; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + } + + if (width > limit) + { + if (coord.x < width) + { + int2 data; + data.x = read_imagei(input, coord.xy).x; + coord.z = coord.x + threads_per_block; + data.y = read_imagei(input, coord.zy).x; + + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = coord.z < width ? data.y : init_k; + p_share_v[lx1] = coord.z < width ? coord.z : init_v; + } + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v); + + if (p_share_k[index_minus_1] >= min_data) + { + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + bitonic_merge_int(num_stages, lx, local_data, local_indices); + } + } + + int4 dst; + dst.x = local_data[lx]; + + coord.x = lx; + write_imagei(output, coord.xy, dst.xxxx); + + int4 index; + index.x = local_indices[lx]; + + write_imagei(indices, coord.xy, index.xxxx); +} + +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32 +( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t indices, + float input_scale, + float input_tail, + float output_scale, + float output_tail, + int _num_stages, + int width + ) + { + uint lx = get_local_id(0); + const uint init_k = 0; + const int init_v = -2147483647; + const int num_stages = 9; + const int threads_per_block = BLOCK_SIZE; + const int index_minus_1 = threads_per_block * 2 - 1; + uint offset = 0; + uint lx1 = lx + threads_per_block; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + __local uint local_data[1536]; + __local int local_indices[1536]; + + uint left = read_imageui(input, coord.xy).x; + coord.z += threads_per_block; + uint right = read_imageui(input, coord.zy).x; + + local_data[lx] = left; + local_indices[lx] = coord.x; + local_data[lx1] = right; + local_indices[lx1] = coord.z; + + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_uint(num_stages, lx, local_data, local_indices); + + uint min_data = local_data[511]; + + uint *p_share_k = local_data + threads_per_block; + int *p_share_v = local_indices + threads_per_block; + + int limit = (width >> 10) << 10; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + barrier(CLK_LOCAL_MEM_FENCE); + + for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2) + { + uint2 data; + coord.z = coord.x + threads_per_block; + data.x = read_imageui(input, coord.xy).x; + data.y = read_imageui(input, coord.zy).x; 
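+        /* Stage the next 1024 candidates into the upper half of local memory.
+         * The ascending bitonic pass below orders them so the whole block can be
+         * skipped when even its largest key is below min_data (the current
+         * 512th-best value); otherwise its top half is merged into the kept results. */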
+ + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = data.y; + p_share_v[lx1] = coord.z; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v); + + if (p_share_k[index_minus_1] < min_data) + { + continue; + } + + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_merge_uint(num_stages, lx, local_data, local_indices); + + min_data = local_data[511]; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + } + + if (width > limit) + { + if (coord.x < width) + { + uint2 data; + data.x = read_imageui(input, coord.xy).x; + coord.z = coord.x + threads_per_block; + data.y = read_imageui(input, coord.zy).x; + + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = coord.z < width ? data.y : init_k; + p_share_v[lx1] = coord.z < width ? coord.z : init_v; + } + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v); + + if (p_share_k[index_minus_1] >= min_data) + { + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + bitonic_merge_uint(num_stages, lx, local_data, local_indices); + } + } + + uint4 dst; + dst.x = local_data[lx]; + + coord.x = lx; + write_imageui(output, coord.xy, dst.xxxx); + + int4 index; + index.x = local_indices[lx]; + + write_imagei(indices, coord.xy, index.xxxx); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl index beaaccbc8..d6eb98f1f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl @@ -28,12 +28,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd } __local int sorted[1]; - int width_minus_one = width - 1; - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); - - int x_start = lid * num_pixels_per_thread; - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); sorted[0] = 0; @@ -44,20 +38,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0; } int swapped = 0; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); // odd-even - coord.x = x_start; - coord.z = x_start + 1; - for (; coord.x < x_end; ) + coord.x = lid * 2; + coord.z = lid * 2 + 1; + for (; coord.z < width; ) { float4 left = read_imagef(input_t, coord.xy); float4 right = read_imagef(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imagef(input_t, coord.xy, right); @@ -67,21 +62,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } + barrier(CLK_GLOBAL_MEM_FENCE); // even-odd - coord.x = x_start + 1; - coord.z = x_start + 2; - for (; coord.x < x_end; ) + coord.x = lid * 2 + 1; + coord.z = lid * 2 + 2; + for (; coord.z < width; ) { float4 left = read_imagef(input_t, coord.xy); float4 right = 
read_imagef(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imagef(input_t, coord.xy, right); @@ -91,11 +88,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } atomic_add(sorted, swapped); - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); if (*sorted == 0) break; @@ -141,13 +138,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd } __local int sorted[1]; - int width_minus_one = width - 1; - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); - - int x_start = lid * num_pixels_per_thread; - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); - sorted[0] = 0; while (1) @@ -157,20 +147,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0; } int swapped = 0; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); // odd-even - coord.x = x_start; - coord.z = x_start + 1; - for (; coord.x < x_end; ) + coord.x = lid * 2; + coord.z = lid * 2 + 1; + for (; coord.z < width; ) { uint4 left = read_imageui(input_t, coord.xy); uint4 right = read_imageui(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imageui(input_t, coord.xy, right); @@ -180,21 +171,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } + barrier(CLK_GLOBAL_MEM_FENCE); // even-odd - coord.x = x_start + 1; - coord.z = x_start + 2; - for (; coord.x < x_end; ) + coord.x = lid * 2 + 1; + coord.z = lid * 2 + 2; + for (; coord.z < width; ) { uint4 left = read_imageui(input_t, coord.xy); uint4 right = read_imageui(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imageui(input_t, coord.xy, right); @@ -204,11 +197,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } atomic_add(sorted, swapped); - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); if (*sorted == 0) break; @@ -254,13 +247,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd } __local int sorted[1]; - int width_minus_one = width - 1; - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); - - int x_start = 
lid * num_pixels_per_thread; - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); - sorted[0] = 0; while (1) @@ -270,20 +256,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0; } int swapped = 0; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); // odd-even - coord.x = x_start; - coord.z = x_start + 1; - for (; coord.x < x_end; ) + coord.x = lid * 2; + coord.z = lid * 2 + 1; + for (; coord.z < width; ) { int4 left = read_imagei(input_t, coord.xy); int4 right = read_imagei(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imagei(input_t, coord.xy, right); @@ -293,21 +280,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } + barrier(CLK_GLOBAL_MEM_FENCE); // even-odd - coord.x = x_start + 1; - coord.z = x_start + 2; - for (; coord.x < x_end; ) + coord.x = lid * 2 + 1; + coord.z = lid * 2 + 2; + for (; coord.z < width; ) { int4 left = read_imagei(input_t, coord.xy); int4 right = read_imagei(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imagei(input_t, coord.xy, right); @@ -317,11 +306,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } atomic_add(sorted, swapped); - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); if (*sorted == 0) break; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort2.cl index 976da2063..27b0633b5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort2.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort2.cl @@ -28,12 +28,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd } __local int sorted[1]; - int width_minus_one = width - 1; - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); - - int x_start = lid * num_pixels_per_thread; - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); sorted[0] = 0; @@ -44,20 +38,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0; } int swapped = 0; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); // odd-even - coord.x = x_start; - coord.z = x_start + 1; - for (; coord.x < x_end; ) + coord.x = lid * 2; + coord.z = lid * 2 + 1; + for (; coord.z < width; ) { float4 left = read_imagef(input_t, coord.xy); float4 right = read_imagef(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == 
right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imagef(input_t, coord.xy, right); @@ -67,21 +62,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } + barrier(CLK_GLOBAL_MEM_FENCE); // even-odd - coord.x = x_start + 1; - coord.z = x_start + 2; - for (; coord.x < x_end; ) + coord.x = lid * 2 + 1; + coord.z = lid * 2 + 2; + for (; coord.z < width; ) { float4 left = read_imagef(input_t, coord.xy); float4 right = read_imagef(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imagef(input_t, coord.xy, right); @@ -91,11 +88,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } atomic_add(sorted, swapped); - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); if (*sorted == 0) break; @@ -143,13 +140,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd } __local int sorted[1]; - int width_minus_one = width - 1; - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); - - int x_start = lid * num_pixels_per_thread; - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); - sorted[0] = 0; while (1) @@ -159,20 +149,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0; } int swapped = 0; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); // odd-even - coord.x = x_start; - coord.z = x_start + 1; - for (; coord.x < x_end; ) + coord.x = lid * 2; + coord.z = lid * 2 + 1; + for (; coord.z < width; ) { float4 left = read_imagef(input_t, coord.xy); float4 right = read_imagef(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { - int4 l_index = read_imagei(indices_t, coord.xy); - int4 r_index = read_imagei(indices_t, coord.zy); swapped = 1; write_imagef(input_t, coord.xy, right); @@ -182,18 +173,22 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } + barrier(CLK_GLOBAL_MEM_FENCE); // even-odd - coord.x = x_start + 1; - coord.z = x_start + 2; - for (; coord.x < x_end; ) + coord.x = lid * 2 + 1; + coord.z = lid * 2 + 2; + for (; coord.z < width; ) { float4 left = read_imagef(input_t, coord.xy); float4 right = read_imagef(input_t, coord.zy); + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); - if (left.x < right.x) + if ( (left.x < right.x) || + (left.x == right.x && l_index.x < r_index.x) ) { int4 l_index = read_imagei(indices_t, coord.xy); int4 r_index = read_imagei(indices_t, coord.zy); @@ -206,11 +201,11 @@ __kernel 
__attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index); } - coord.xz = coord.xz + 2; + coord.xz += 2 * LOCAL_SIZE_X; } atomic_add(sorted, swapped); - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); if (*sorted == 0) break; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_BF16_to_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_BF16_to_BF16.vx new file mode 100644 index 000000000..666950162 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_BF16_to_BF16.vx @@ -0,0 +1,171 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniConvBF16toF32_even_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_odd_2x8; + +_viv_uniform VXC_512Bits uniBF16toFp32_part0_2x8; +_viv_uniform VXC_512Bits uniBF16toFp32_part1_2x8; +_viv_uniform float4 span_wh; +_viv_uniform float4 min_val_wh; + +#define GRID_SAMPLE_BF16_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy0 = fabs(fxy0 - min_val_wh); \ + fxy1 = fabs(fxy1 - min_val_wh); \ + float4 flips_xy0 = floor(fxy0 / span_wh); \ + float4 flips_xy1 = floor(fxy1 / span_wh); \ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \ + int4 flips_int_xy0 = convert_int4(flips_xy0); \ + int4 flips_int_xy1 = convert_int4(flips_xy1); \ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \ + fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 top; \ + vxc_short8 bottom; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + float4 left4; \ + float4 right4; 
\ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + vxc_ushort8 tmp, dst; \ + while (coord_in.z < loop) \ + { \ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \ + _viv_asm(COPY, right4, src, 16); \ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \ + _viv_asm(COPY, left4, src, 16); \ + right4 -= left4; \ + top4 = right4 * x_lerp + left4; \ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \ + _viv_asm(COPY, right4, src, 16); \ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \ + _viv_asm(COPY, left4, src, 16); \ + right4 -= left4; \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + _viv_asm(COPY, tmp, dst4, 16); \ + dst.s0123 = tmp.s1357; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \ + _viv_asm(COPY, right4, src, 16); \ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \ + _viv_asm(COPY, left4, src, 16); \ + right4 -= left4; \ + top4 = right4 * x_lerp + left4; \ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \ + _viv_asm(COPY, right4, src, 16); \ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \ + _viv_asm(COPY, left4, src, 16); \ + right4 -= left4; \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + _viv_asm(COPY, tmp, dst4, 16); \ + dst.s0123 = tmp.s1357; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + + +__kernel void 
bilinear_grid_sample_reflect_BF16_BF16toBF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + + float4 fxy0; + float4 fxy1; + + vxc_short8 src; + VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part0_2x8); + _viv_asm(COPY, fxy0, src, 16); + VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part1_2x8); + _viv_asm(COPY, fxy1, src, 16); + + + + GRID_SAMPLE_BF16_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_F16.vx new file mode 100644 index 000000000..db6ff35e8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_F16.vx @@ -0,0 +1,217 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4; +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4; +_viv_uniform VXC_512Bits uniExtactHalf8_2x8; +_viv_uniform float4 span_wh; +_viv_uniform float4 min_val_wh; + +#define GRID_SAMPLE_F16_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy0 = fabs(fxy0 - min_val_wh); \ + fxy1 = fabs(fxy1 - min_val_wh); \ + float4 flips_xy0 = floor(fxy0 / span_wh); \ + float4 flips_xy1 = floor(fxy1 / span_wh); \ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \ + int4 flips_int_xy0 = convert_int4(flips_xy0); \ + int4 flips_int_xy1 = convert_int4(flips_xy1); \ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \ + fxy1 = ((flips_int_xy1 % 2) == 0) ? 
min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 t0; \ + vxc_short8 b0; \ + vxc_half8 top; \ + vxc_half8 bottom; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, top, t0, 16); \ + _viv_asm(COPY, bottom, b0, 16); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + half4 tmp; \ + _viv_asm(CONV, tmp, dst4); \ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \ + vxc_short4 result; \ + _viv_asm(COPY, result, top, 8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + 
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, top, t0, 16); \ + _viv_asm(COPY, bottom, b0, 16); \ + } \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + half4 tmp; \ + _viv_asm(CONV, tmp, dst4); \ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \ + vxc_short4 result; \ + _viv_asm(COPY, result, top, 8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + +__kernel void bilinear_grid_sample_reflect_F16_F32toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + coord_in1.z = coord_in1.z + 4; + + float4 fxy0 = read_imagef(input1, coord_in1.xy); + float4 fxy1 = read_imagef(input1, coord_in1.zw); + + GRID_SAMPLE_F16_PROCESS(); + +} + +_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; + +__kernel void bilinear_grid_sample_reflect_F16_U8toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + coord_in1.xz = coord_in1.xz * 2; + vxc_uchar16 read_coord; + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 fxy0; + float4 fxy1; + unsigned char input1ZP; + _viv_asm(COPY, input1ZP, input1_ZP, 4); + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + fxy0 = fxy0 * input1Scale; + fxy1 = fxy1 * input1Scale; + + GRID_SAMPLE_F16_PROCESS(); + +} + + +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; + +__kernel void bilinear_grid_sample_reflect_F16_F16toF16( + 
__read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_half8 read_coord; + + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, read_coord, read_val, 16); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + GRID_SAMPLE_F16_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_U8.vx new file mode 100644 index 000000000..46f31ac75 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_F16_to_U8.vx @@ -0,0 +1,224 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4; +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; +_viv_uniform float4 span_wh; +_viv_uniform float4 min_val_wh; + +#define GRID_SAMPLE_F16_to_U8_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy0 = fabs(fxy0 - min_val_wh); \ + fxy1 = fabs(fxy1 - min_val_wh); \ + float4 flips_xy0 = floor(fxy0 / span_wh); \ + float4 flips_xy1 = floor(fxy1 / span_wh); \ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \ + int4 flips_int_xy0 = convert_int4(flips_xy0); \ + int4 flips_int_xy1 = convert_int4(flips_xy1); \ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \ + fxy1 = ((flips_int_xy1 % 2) == 0) ? 
min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 t0; \ + vxc_short8 b0; \ + vxc_uchar16 result; \ + vxc_half8 top; \ + vxc_half8 bottom; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, top, t0, 16); \ + _viv_asm(COPY, bottom, b0, 16); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * uint8Scale + output_ZP; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + 
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, top, t0, 16); \ + _viv_asm(COPY, bottom, b0, 16); \ + } \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * uint8Scale + output_ZP; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + +__kernel void bilinear_grid_sample_reflect_F16_F32toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + coord_in1.z = coord_in1.z + 4; + + float4 fxy0 = read_imagef(input1, coord_in1.xy); + float4 fxy1 = read_imagef(input1, coord_in1.zw); + GRID_SAMPLE_F16_to_U8_PROCESS(); + +} + +_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; + + +__kernel void bilinear_grid_sample_reflect_F16_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_uchar16 read_coord; + + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + unsigned char input1ZP; + _viv_asm(COPY, input1ZP, input1_ZP, 4); + + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1Scale; + fxy1 = fxy1 * input1Scale; + + GRID_SAMPLE_F16_to_U8_PROCESS(); + +} + +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; + +__kernel void 
bilinear_grid_sample_reflect_F16_F16toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_half8 read_coord; + + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, read_coord, read_val, 16); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + GRID_SAMPLE_F16_to_U8_PROCESS(); + +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I16_to_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I16_to_I16.vx new file mode 100644 index 000000000..fc92c2cf6 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I16_to_I16.vx @@ -0,0 +1,160 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniRightSubLeft_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4; +_viv_uniform float input1_scale; +_viv_uniform float dfpScale; +_viv_uniform float4 span_wh; +_viv_uniform float4 min_val_wh; + +#define GRID_SAMPLE_I16_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy0 = fabs(fxy0 - min_val_wh); \ + fxy1 = fabs(fxy1 - min_val_wh); \ + float4 flips_xy0 = floor(fxy0 / span_wh); \ + float4 flips_xy1 = floor(fxy1 / span_wh); \ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \ + int4 flips_int_xy0 = convert_int4(flips_xy0); \ + int4 flips_int_xy1 = convert_int4(flips_xy1); \ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \ + fxy1 = ((flips_int_xy1 % 2) == 0) ? 
min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 top; \ + vxc_short8 bottom; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * dfpScale; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 
0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * dfpScale; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + +__kernel void bilinear_grid_sample_reflect_I16_I16toI16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + vxc_short8 read_coord; + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1_scale; + fxy1 = fxy1 * input1_scale; + + GRID_SAMPLE_I16_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I8_to_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I8_to_I8.vx new file mode 100644 index 000000000..999ff9018 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_I8_to_I8.vx @@ -0,0 +1,160 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniRightSubLeft_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4; +_viv_uniform float input1_scale; +_viv_uniform float dfpScale; +_viv_uniform float4 span_wh; +_viv_uniform float4 min_val_wh; + +#define GRID_SAMPLE_I8_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy0 = fabs(fxy0 - min_val_wh); \ + fxy1 = fabs(fxy1 - min_val_wh); \ + float4 flips_xy0 = floor(fxy0 / 
span_wh); \ + float4 flips_xy1 = floor(fxy1 / span_wh); \ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \ + int4 flips_int_xy0 = convert_int4(flips_xy0); \ + int4 flips_int_xy1 = convert_int4(flips_xy1); \ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \ + fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_char16 top; \ + vxc_char16 bottom; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * dfpScale; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, top, 
input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * dfpScale; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + +__kernel void bilinear_grid_sample_reflect_I8_I8toI8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + vxc_char16 read_coord; + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1_scale; + fxy1 = fxy1 * input1_scale; + + GRID_SAMPLE_I8_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_U8_to_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_U8_to_U8.vx new file mode 100644 index 000000000..00c343722 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_reflect_U8_to_U8.vx @@ -0,0 +1,224 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; + +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform int input_ZP; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; 
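+
+// GRID_SAMPLE_U8_PROCESS below implements reflection padding: each grid coordinate is
+// first scaled into input space (half_input0_wh / add_float_value), then folded back
+// into [min_val_wh, min_val_wh + span_wh] by mirroring:
+//   d     = fabs(coord - min_val_wh)
+//   flips = floor(d / span_wh), extra = d - flips * span_wh
+//   coord = (flips even) ? min_val_wh + extra : min_val_wh + (span_wh - extra)
+// The four bilinear neighbours are then gathered per output pixel, interpolated in fp32,
+// and requantized with uint8Scale / output_ZP.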
+_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; +_viv_uniform float4 span_wh; +_viv_uniform float4 min_val_wh; + +#define GRID_SAMPLE_U8_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy0 = fabs(fxy0 - min_val_wh); \ + fxy1 = fabs(fxy1 - min_val_wh); \ + float4 flips_xy0 = floor(fxy0 / span_wh); \ + float4 flips_xy1 = floor(fxy1 / span_wh); \ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \ + int4 flips_int_xy0 = convert_int4(flips_xy0); \ + int4 flips_int_xy1 = convert_int4(flips_xy1); \ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \ + fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_uchar16 top; \ + vxc_uchar16 bottom; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + unsigned char inputZP; \ + _viv_asm(COPY, inputZP, input_ZP, 4); \ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniU8RightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * uint8Scale + output_ZP; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + unsigned char inputZP; \ + _viv_asm(COPY, inputZP, input_ZP, 4); \ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * uint8Scale + output_ZP; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + +__kernel void bilinear_grid_sample_reflect_U8_F32toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + coord_in1.z = coord_in1.z + 4; + + float4 fxy0 = read_imagef(input1, coord_in1.xy); + float4 fxy1 = read_imagef(input1, coord_in1.zw); + GRID_SAMPLE_U8_PROCESS(); + +} + + +__kernel void bilinear_grid_sample_reflect_U8_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + 
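+    // The grid input here is a quantized U8 tensor: the coordinates are dequantized with
+    // input1_ZP / input1Scale below before the reflect mapping in GRID_SAMPLE_U8_PROCESS.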
+ vxc_uchar16 read_coord; + + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + unsigned char input1ZP; + _viv_asm(COPY, input1ZP, input1_ZP, 4); + + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1Scale; + fxy1 = fxy1 * input1Scale; + + GRID_SAMPLE_U8_PROCESS(); + +} + +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; + +__kernel void bilinear_grid_sample_reflect_U8_F16toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_half8 read_coord; + + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, read_coord, read_val, 16); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + GRID_SAMPLE_U8_PROCESS(); + +} + diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 68763cc7e..d19b5191b 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -3236,6 +3236,1174 @@ __kernel void bilinear_grid_sample_U8_F16toU8(\n\ \n\ "; /* end of bilinear_grid_sample_U8_to_U8_vx*/ +static const char bilinear_grid_sample_reflect_BF16_to_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_even_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_odd_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniBF16toFp32_part0_2x8;\n\ +_viv_uniform VXC_512Bits uniBF16toFp32_part1_2x8;\n\ +_viv_uniform float4 span_wh;\n\ +_viv_uniform float4 min_val_wh;\n\ +\n\ +#define GRID_SAMPLE_BF16_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy0 = fabs(fxy0 - min_val_wh); \\\n\ + fxy1 = fabs(fxy1 - min_val_wh); \\\n\ + float4 flips_xy0 = floor(fxy0 / span_wh); \\\n\ + float4 flips_xy1 = floor(fxy1 / span_wh); \\\n\ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \\\n\ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \\\n\ + int4 flips_int_xy0 = convert_int4(flips_xy0); \\\n\ + int4 flips_int_xy1 = convert_int4(flips_xy1); \\\n\ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \\\n\ + fxy1 = ((flips_int_xy1 % 2) == 0) ? 
min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 top; \\\n\ + vxc_short8 bottom; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + vxc_ushort8 tmp, dst; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \\\n\ + _viv_asm(COPY, right4, src, 16); \\\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \\\n\ + _viv_asm(COPY, left4, src, 16); \\\n\ + right4 -= left4; \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \\\n\ + _viv_asm(COPY, right4, src, 16); \\\n\ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \\\n\ + _viv_asm(COPY, left4, src, 16); \\\n\ + right4 -= left4; \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + _viv_asm(COPY, tmp, dst4, 16); \\\n\ + dst.s0123 = tmp.s1357; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, top, input0, 
coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \\\n\ + _viv_asm(COPY, right4, src, 16); \\\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \\\n\ + _viv_asm(COPY, left4, src, 16); \\\n\ + right4 -= left4; \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \\\n\ + _viv_asm(COPY, right4, src, 16); \\\n\ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \\\n\ + _viv_asm(COPY, left4, src, 16); \\\n\ + right4 -= left4; \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + _viv_asm(COPY, tmp, dst4, 16); \\\n\ + dst.s0123 = tmp.s1357; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_BF16_BF16toBF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + vxc_short8 src;\n\ + VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part0_2x8);\n\ + _viv_asm(COPY, fxy0, src, 16);\n\ + VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part1_2x8);\n\ + _viv_asm(COPY, fxy1, src, 16);\n\ +\n\ +\n\ +\n\ + GRID_SAMPLE_BF16_PROCESS();\n\ +\n\ +}\n\ +"; /* end of bilinear_grid_sample_reflect_BF16_to_BF16_vx*/ + +static const char bilinear_grid_sample_reflect_F16_to_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform 
VXC_512Bits uniEvenBintoFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;\n\ +_viv_uniform VXC_512Bits uniExtactHalf8_2x8;\n\ +_viv_uniform float4 span_wh;\n\ +_viv_uniform float4 min_val_wh;\n\ +\n\ +#define GRID_SAMPLE_F16_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy0 = fabs(fxy0 - min_val_wh); \\\n\ + fxy1 = fabs(fxy1 - min_val_wh); \\\n\ + float4 flips_xy0 = floor(fxy0 / span_wh); \\\n\ + float4 flips_xy1 = floor(fxy1 / span_wh); \\\n\ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \\\n\ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \\\n\ + int4 flips_int_xy0 = convert_int4(flips_xy0); \\\n\ + int4 flips_int_xy1 = convert_int4(flips_xy1); \\\n\ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \\\n\ + fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 t0; \\\n\ + vxc_short8 b0; \\\n\ + vxc_half8 top; \\\n\ + vxc_half8 bottom; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, top, t0, 16); \\\n\ + _viv_asm(COPY, bottom, b0, 16); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + 
top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + half4 tmp; \\\n\ + _viv_asm(CONV, tmp, dst4); \\\n\ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \\\n\ + vxc_short4 result; \\\n\ + _viv_asm(COPY, result, top, 8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, top, t0, 16); \\\n\ + _viv_asm(COPY, bottom, b0, 16); \\\n\ + } \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + half4 tmp; \\\n\ + _viv_asm(CONV, tmp, dst4); \\\n\ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \\\n\ + vxc_short4 result; \\\n\ + _viv_asm(COPY, result, top, 8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_F16_F32toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + coord_in1.z = 
coord_in1.z + 4;\n\ +\n\ + float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\ + float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\ +\n\ + GRID_SAMPLE_F16_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_F16_U8toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + vxc_uchar16 read_coord;\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ + unsigned char input1ZP;\n\ + _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\ + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ + fxy0 = fxy0 * input1Scale;\n\ + fxy1 = fxy1 * input1Scale;\n\ +\n\ + GRID_SAMPLE_F16_PROCESS();\n\ +\n\ +}\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_F16_F16toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_half8 read_coord;\n\ +\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, read_coord, read_val, 16);\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + GRID_SAMPLE_F16_PROCESS();\n\ +\n\ +}\n\ +"; /* end of bilinear_grid_sample_reflect_F16_to_F16_vx*/ + +static const char bilinear_grid_sample_reflect_F16_to_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float4 span_wh;\n\ +_viv_uniform float4 min_val_wh;\n\ +\n\ +#define GRID_SAMPLE_F16_to_U8_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy0 = fabs(fxy0 - min_val_wh); \\\n\ + fxy1 = fabs(fxy1 - min_val_wh); \\\n\ + float4 flips_xy0 = floor(fxy0 / span_wh); \\\n\ + float4 flips_xy1 = floor(fxy1 / span_wh); \\\n\ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \\\n\ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \\\n\ + int4 flips_int_xy0 = convert_int4(flips_xy0); \\\n\ + int4 flips_int_xy1 = convert_int4(flips_xy1); \\\n\ + fxy0 = 
((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \\\n\ + fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 t0; \\\n\ + vxc_short8 b0; \\\n\ + vxc_uchar16 result; \\\n\ + vxc_half8 top; \\\n\ + vxc_half8 bottom; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, top, t0, 16); \\\n\ + _viv_asm(COPY, bottom, b0, 16); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + 
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, top, t0, 16); \\\n\ + _viv_asm(COPY, bottom, b0, 16); \\\n\ + } \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_F16_F32toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + coord_in1.z = coord_in1.z + 4;\n\ +\n\ + float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\ + float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\ + GRID_SAMPLE_F16_to_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_F16_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_uchar16 read_coord;\n\ +\n\ + 
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + unsigned char input1ZP;\n\ + _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1Scale;\n\ + fxy1 = fxy1 * input1Scale;\n\ +\n\ + GRID_SAMPLE_F16_to_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_F16_F16toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_half8 read_coord;\n\ +\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, read_coord, read_val, 16);\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + GRID_SAMPLE_F16_to_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +"; /* end of bilinear_grid_sample_reflect_F16_to_U8_vx*/ + +static const char bilinear_grid_sample_reflect_I16_to_I16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform float dfpScale;\n\ +_viv_uniform float4 span_wh;\n\ +_viv_uniform float4 min_val_wh;\n\ +\n\ +#define GRID_SAMPLE_I16_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy0 = fabs(fxy0 - min_val_wh); \\\n\ + fxy1 = fabs(fxy1 - min_val_wh); \\\n\ + float4 flips_xy0 = floor(fxy0 / span_wh); \\\n\ + float4 flips_xy1 = floor(fxy1 / span_wh); \\\n\ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \\\n\ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \\\n\ + int4 flips_int_xy0 = convert_int4(flips_xy0); \\\n\ + int4 flips_int_xy1 = convert_int4(flips_xy1); \\\n\ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \\\n\ + fxy1 = ((flips_int_xy1 % 2) == 0) ? 
min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 top; \\\n\ + vxc_short8 bottom; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * dfpScale; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, 
bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * dfpScale; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_I16_I16toI16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + vxc_short8 read_coord;\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1_scale;\n\ + fxy1 = fxy1 * input1_scale;\n\ +\n\ + GRID_SAMPLE_I16_PROCESS();\n\ +\n\ +}\n\ +"; /* end of bilinear_grid_sample_reflect_I16_to_I16_vx*/ + +static const char bilinear_grid_sample_reflect_I8_to_I8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform float dfpScale;\n\ 
+_viv_uniform float4 span_wh;\n\ +_viv_uniform float4 min_val_wh;\n\ +\n\ +#define GRID_SAMPLE_I8_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy0 = fabs(fxy0 - min_val_wh); \\\n\ + fxy1 = fabs(fxy1 - min_val_wh); \\\n\ + float4 flips_xy0 = floor(fxy0 / span_wh); \\\n\ + float4 flips_xy1 = floor(fxy1 / span_wh); \\\n\ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \\\n\ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \\\n\ + int4 flips_int_xy0 = convert_int4(flips_xy0); \\\n\ + int4 flips_int_xy1 = convert_int4(flips_xy1); \\\n\ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \\\n\ + fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_char16 top; \\\n\ + vxc_char16 bottom; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * dfpScale; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * dfpScale; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_I8_I8toI8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + vxc_char16 read_coord;\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1_scale;\n\ + fxy1 = fxy1 * input1_scale;\n\ +\n\ + GRID_SAMPLE_I8_PROCESS();\n\ +\n\ +}\n\ +"; /* end of bilinear_grid_sample_reflect_I8_to_I8_vx*/ + +static const char bilinear_grid_sample_reflect_U8_to_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform int input_ZP;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +_viv_uniform float4 span_wh;\n\ +_viv_uniform float4 min_val_wh;\n\ +\n\ +#define GRID_SAMPLE_U8_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy0 = fabs(fxy0 - min_val_wh); \\\n\ + fxy1 = fabs(fxy1 - min_val_wh); \\\n\ + float4 flips_xy0 = floor(fxy0 / span_wh); \\\n\ + float4 flips_xy1 = floor(fxy1 / span_wh); \\\n\ + float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \\\n\ + float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \\\n\ + int4 flips_int_xy0 = convert_int4(flips_xy0); \\\n\ + int4 flips_int_xy1 = convert_int4(flips_xy1); \\\n\ + fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \\\n\ + fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_uchar16 top; \\\n\ + vxc_uchar16 bottom; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, 
\\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + unsigned char inputZP; \\\n\ + _viv_asm(COPY, inputZP, input_ZP, 4); \\\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + unsigned char inputZP; \\\n\ + _viv_asm(COPY, inputZP, input_ZP, 4); \\\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + 
bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_U8_F32toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + coord_in1.z = coord_in1.z + 4;\n\ +\n\ + float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\ + float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\ + GRID_SAMPLE_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_U8_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_uchar16 read_coord;\n\ +\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + unsigned char input1ZP;\n\ + _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1Scale;\n\ + fxy1 = fxy1 * input1Scale;\n\ +\n\ + GRID_SAMPLE_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_U8_F16toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_half8 read_coord;\n\ +\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, read_coord, read_val, 16);\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + GRID_SAMPLE_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +"; /* end of bilinear_grid_sample_reflect_U8_to_U8_vx*/ + static const char bucketize_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniDataConvert_0_4x4;\n\ @@ -58353,6 +59521,176 @@ __kernel void bilinear_grid_sample_U8_U8toU8(\n\ \n\ }"; /* end of bilinear_grid_sample_cl*/ +static const char bilinear_grid_sample_reflect_cl[] = "__kernel void bilinear_grid_sample_reflect_F32_F32toF32(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float 
half_input0_w,\n\ + float half_input0_h,\n\ + float add_float_value_w,\n\ + float add_float_value_h,\n\ + int depth,\n\ + float min_val_w,\n\ + float span_w,\n\ + float min_val_h,\n\ + float span_h\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));\n\ + int2 coord_add = (int2)(-1, 1);\n\ +\n\ + float fx = read_imagef(input1, coord_in1).x;\n\ + coord_in1.x = coord_in1.x + 1;\n\ + float fy = read_imagef(input1, coord_in1).x;\n\ +\n\ + fx = fx * half_input0_w + add_float_value_w;\n\ + fy = fy * half_input0_h + add_float_value_h;\n\ +\n\ + if (span_w > 0)\n\ + {\n\ + fx = fabs(fx - min_val_w);\n\ + int flips_x = (int)(fx / span_w);\n\ + float extra_x = fx - flips_x * span_w;\n\ + fx = (flips_x & 0x01) ? min_val_w + (span_w - extra_x) : min_val_w + extra_x ;\n\ + }\n\ + else\n\ + {\n\ + fx = 0;\n\ + }\n\ +\n\ + if (span_h > 0)\n\ + {\n\ + fy = fabs(fy - min_val_h);\n\ + int flips_y = (int)(fy / span_h);\n\ + float extra_y = fy - flips_y * span_h;\n\ + fy = (flips_y & 0x01) ? min_val_h + (span_h - extra_y) : min_val_h + extra_y ;\n\ + }\n\ + else\n\ + {\n\ + fy = 0;\n\ + }\n\ +\n\ + float x_f = floor(fx);\n\ + float y_f = floor(fy);\n\ + float x_lerp = fx - x_f;\n\ + float y_lerp = fy - y_f;\n\ + int x_index = convert_int(x_f);\n\ + int y_index = convert_int(y_f);\n\ + int4 coord_in = (int4)(x_index, y_index, 0, 0);\n\ +\n\ + float4 top_l, top_r, bottom_l, bottom_r, top, bottom, dst;\n\ +\n\ + while (coord_in.z < depth){\n\ + top_l = read_imagef(input0, coord_in);\n\ + coord_in.y++;\n\ + bottom_l = read_imagef(input0, coord_in);\n\ + coord_in.x++;\n\ + bottom_r = read_imagef(input0, coord_in);\n\ + coord_in.y--;\n\ + top_r = read_imagef(input0, coord_in);\n\ + top_r = top_r - top_l;\n\ + top = top_l + x_lerp * top_r;\n\ + bottom_r = bottom_r - bottom_l;\n\ + bottom = bottom_l + x_lerp * bottom_r;\n\ + bottom = bottom - top;\n\ + dst = top + y_lerp * bottom;\n\ + write_imagef(output, coord_out, dst);\n\ + coord_in.xz = coord_in.xz + coord_add;\n\ + coord_out.z++;\n\ + }\n\ +}\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_reflect_U8_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float half_input0_w,\n\ + float half_input0_h,\n\ + float add_float_value_w,\n\ + float add_float_value_h,\n\ + int depth,\n\ + float in0_scale,\n\ + float in0_tail,\n\ + float in1_scale,\n\ + float in1_tail,\n\ + float out_scale,\n\ + float out_tail,\n\ + float min_val_w,\n\ + float span_w,\n\ + float min_val_h,\n\ + float span_h\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));\n\ + int2 coord_add = (int2)(-1, 1);\n\ +\n\ + float fx = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;\n\ + coord_in1.x = coord_in1.x + 1;\n\ + float fy = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;\n\ +\n\ + fx = fx * half_input0_w + add_float_value_w;\n\ + fy = fy * half_input0_h + add_float_value_h;\n\ +\n\ + if (span_w > 0)\n\ + {\n\ + fx = fabs(fx - min_val_w);\n\ + int flips_x = (int)(fx / span_w);\n\ + float extra_x = fx - flips_x * span_w;\n\ + fx = (flips_x & 0x01) ? 
min_val_w + (span_w - extra_x) : min_val_w + extra_x ;\n\ + }\n\ + else\n\ + {\n\ + fx = 0;\n\ + }\n\ +\n\ + if (span_h > 0)\n\ + {\n\ + fy = fabs(fy - min_val_h);\n\ + int flips_y = (int)(fy / span_h);\n\ + float extra_y = fy - flips_y * span_h;\n\ + fy = (flips_y & 0x01) ? min_val_h + (span_h - extra_y) : min_val_h + extra_y ;\n\ + }\n\ + else\n\ + {\n\ + fy = 0;\n\ + }\n\ +\n\ + float x_f = floor(fx);\n\ + float y_f = floor(fy);\n\ + float x_lerp = fx - x_f;\n\ + float y_lerp = fy - y_f;\n\ + int x_index = convert_int(x_f);\n\ + int y_index = convert_int(y_f);\n\ + int4 coord_in = (int4)(x_index, y_index, 0, 0);\n\ +\n\ + float4 top_l, top_r, bottom_l, bottom_r, top, bottom;\n\ + uint4 dst;\n\ +\n\ + while (coord_in.z < depth){\n\ + top_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;\n\ + coord_in.y++;\n\ + bottom_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;\n\ + coord_in.x++;\n\ + bottom_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;\n\ + coord_in.y--;\n\ + top_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;\n\ + top_r = top_r - top_l;\n\ + top = top_l + x_lerp * top_r;\n\ + bottom_r = bottom_r - bottom_l;\n\ + bottom = bottom_l + x_lerp * bottom_r;\n\ + bottom = bottom - top;\n\ + top = top + y_lerp * bottom;\n\ + dst = convert_uint4_rte(top * out_scale + out_tail);\n\ + write_imageui(output, coord_out, dst);\n\ + coord_in.xz = coord_in.xz + coord_add;\n\ + coord_out.z++;\n\ + }\n\ +\n\ +}"; /* end of bilinear_grid_sample_reflect_cl*/ + static const char bucketize_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ \n\ #define BUCKETIZE_F32_2D_SH_IMPL(name, comp_op) \\\n\ @@ -60238,108 +61576,6 @@ __kernel void depth2space_crd_F32toF32(\n\ }\n\ "; /* end of depth2space_crd_cl*/ -static const char detect_post_box_cl[] = "float exp_(float x, float logE)\n\ -{\n\ - x *= logE;\n\ - x = exp2(x);\n\ - return x;\n\ -}\n\ -\n\ -__kernel void detect_post_box_F32_F32toF32(\n\ - __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_array_t output,\n\ - float inv_scale_y,\n\ - float inv_scale_x,\n\ - float inv_scale_h,\n\ - float inv_scale_w,\n\ - float logE)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ - float4 src0;\n\ - float4 src1;\n\ - float4 dst;\n\ - float4 tmp0, tmp1;\n\ - src0.x = read_imagef(input0, coord).x;\n\ - src1.x = read_imagef(input1, coord.xy).x;\n\ - coord.x++;\n\ - src0.y = read_imagef(input0, coord).x;\n\ - src1.y = read_imagef(input1, coord.xy).x;\n\ - coord.x++;\n\ - src0.z = read_imagef(input0, coord).x;\n\ - src1.z = read_imagef(input1, coord.xy).x;\n\ - coord.x++;\n\ - src0.w = read_imagef(input0, coord).x;\n\ - src1.w = read_imagef(input1, coord.xy).x;\n\ -\n\ - tmp0.x = src1.x + src1.z * src0.x * inv_scale_y;\n\ - tmp0.y = src1.y + src1.w * src0.y * inv_scale_x;\n\ - tmp1.x = src1.z * exp_(src0.z * inv_scale_h, logE) * 0.5f;\n\ - tmp1.y = src1.w * exp_(src0.w * inv_scale_w, logE) * 0.5f;\n\ - dst.xy = tmp0.xy - tmp1.xy;\n\ - dst.zw = tmp0.xy + tmp1.xy;\n\ - coord.x = 0;\n\ - write_imagef(output, coord, dst.xxxx);\n\ - coord.x++;\n\ - write_imagef(output, coord, dst.yyyy);\n\ - coord.x++;\n\ - write_imagef(output, coord, dst.zzzz);\n\ - coord.x++;\n\ - write_imagef(output, coord, dst.wwww);\n\ -}\n\ -\n\ -\n\ -__kernel void detect_post_box_U8_U8toF32(\n\ - __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_array_t output,\n\ - float 
inv_scale_y,\n\ - float inv_scale_x,\n\ - float inv_scale_h,\n\ - float inv_scale_w,\n\ - float logE,\n\ - float input0Tail,\n\ - float input1Tail,\n\ - float input0Scale,\n\ - float input1Scale)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ - uint4 in0, in1;\n\ - float4 src0;\n\ - float4 src1;\n\ - float4 dst;\n\ - float4 tmp0, tmp1;\n\ - in0.x = read_imageui(input0, coord).x;\n\ - in1.x = read_imageui(input1, coord.xy).x;\n\ - coord.x++;\n\ - in0.y = read_imageui(input0, coord).x;\n\ - in1.y = read_imageui(input1, coord.xy).x;\n\ - coord.x++;\n\ - in0.z = read_imageui(input0, coord).x;\n\ - in1.z = read_imageui(input1, coord.xy).x;\n\ - coord.x++;\n\ - in0.w = read_imageui(input0, coord).x;\n\ - in1.w = read_imageui(input1, coord.xy).x;\n\ -\n\ - src0 = convert_float4(in0) * input0Scale + input0Tail;\n\ - src1 = convert_float4(in1) * input1Scale + input1Tail;\n\ -\n\ - tmp0.x = src1.x + src1.z * src0.x * inv_scale_y;\n\ - tmp0.y = src1.y + src1.w * src0.y * inv_scale_x;\n\ - tmp1.x = src1.z * exp_(src0.z * inv_scale_h, logE) * 0.5f;\n\ - tmp1.y = src1.w * exp_(src0.w * inv_scale_w, logE) * 0.5f;\n\ - dst.xy = tmp0.xy - tmp1.xy;\n\ - dst.zw = tmp0.xy + tmp1.xy;\n\ - coord.x = 0;\n\ - write_imagef(output, coord, dst.xxxx);\n\ - coord.x++;\n\ - write_imagef(output, coord, dst.yyyy);\n\ - coord.x++;\n\ - write_imagef(output, coord, dst.zzzz);\n\ - coord.x++;\n\ - write_imagef(output, coord, dst.wwww);\n\ -}"; /* end of detect_post_box_cl*/ - static const char eltwise_ops_helper_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ #pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ \n\ @@ -70818,7 +72054,7 @@ __kernel void maximum_I32I32toI32_2D\n\ }\n\ "; /* end of maximum_cl*/ -static const char maxpool_cl[] = "#define VSI_FLOAT32_MIN (1.175494351e-38F)\n\ +static const char maxpool_cl[] = "#define VSI_FLOAT32_MIN (-3.40E+38)\n\ \n\ #define MAXPOOL_QINT(in_name, out_name, src_type, dst_type, max_val, read_func, write_func, conv_func) \\\n\ __kernel void maxpool_##in_name##to##out_name( \\\n\ @@ -70865,7 +72101,7 @@ __kernel void maxpool_##in_name##to##out_name( \\\n\ { \\\n\ src0 = read_func(input, coord_in); \\\n\ coord_in.x += dilation_x; \\\n\ - maxVal = max(src0, maxVal); \\\n\ + maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x; \\\n\ } \\\n\ } \\\n\ \\\n\ @@ -70921,7 +72157,7 @@ __kernel void maxpool_F32toF32(\n\ {\n\ src0 = read_imagef(input, coord_in);\n\ coord_in.x += dilation_x;\n\ - maxVal = max(src0, maxVal);\n\ + maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x;\n\ }\n\ }\n\ \n\ @@ -70972,7 +72208,7 @@ __kernel void maxpool_U32toF32(\n\ {\n\ src0 = read_imageui(input, coord_in);\n\ coord_in.x += dilation_x;\n\ - maxVal = max(src0, maxVal);\n\ + maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x;\n\ }\n\ }\n\ \n\ @@ -71026,7 +72262,7 @@ __kernel void maxpool_F32toU32(\n\ {\n\ src0 = read_imagef(input, coord_in);\n\ coord_in.x += dilation_x;\n\ - maxVal = max(src0, maxVal);\n\ + maxVal.x = src0.x > maxVal.x ? 
src0.x : maxVal.x;\n\ }\n\ }\n\ \n\ @@ -79273,7 +80509,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \\\n\ float right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -79361,7 +80597,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag uint left_elem = local_data[left_id]; \\\n\ uint right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -79449,7 +80685,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag int left_elem = local_data[left_id]; \\\n\ int right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -79537,7 +80773,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \\\n\ float right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -79625,7 +80861,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \\\n\ float right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -79660,6 +80896,376 @@ TOPK_F32toI32(1 << 4, 4)\n\ TOPK_F32toI32(1 << 5, 5)\n\ TOPK_F32toI32(1 << 6, 6)"; /* end of topk_cl*/ +static const char topk2_cl[] = "\n\ +#define BITONIC_STEP(dtype) \\\n\ +void bitonic_step_##dtype(uint num_stages, int lx, \\\n\ + __local dtype *local_data, __local int *local_indices) \\\n\ +{ \\\n\ + for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + uint signo = (lx >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + int left_idx = local_indices[left_id]; \\\n\ + int right_idx = local_indices[right_id]; \\\n\ + \\\n\ + dtype left_elem = local_data[left_id]; \\\n\ + dtype right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } 
\\\n\ + } \\\n\ +}\n\ +BITONIC_STEP(int)\n\ +BITONIC_STEP(uint)\n\ +\n\ +#define BITONIC_STEP_ASCEND(dtype) \\\n\ +void bitonic_step_ascend_##dtype(uint num_stages, int lx, \\\n\ + __local dtype *p_share_k, __local int *p_share_v) \\\n\ +{ \\\n\ + for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + uint signo = (lx >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + int left_idx = p_share_v[left_id]; \\\n\ + int right_idx = p_share_v[right_id]; \\\n\ + \\\n\ + dtype left_elem = p_share_k[left_id]; \\\n\ + dtype right_elem = p_share_k[right_id]; \\\n\ + \\\n\ + if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \\\n\ + { \\\n\ + p_share_k[left_id] = right_elem; \\\n\ + p_share_k[right_id] = left_elem; \\\n\ + \\\n\ + p_share_v[left_id] = right_idx; \\\n\ + p_share_v[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +BITONIC_STEP_ASCEND(int)\n\ +BITONIC_STEP_ASCEND(uint)\n\ +\n\ +#define BITONIC_MERGE(dtype) \\\n\ +void bitonic_merge_##dtype(uint num_stages, int lx, \\\n\ + __local dtype *local_data, __local int *local_indices) \\\n\ +{ \\\n\ + uint stage = num_stages; \\\n\ + uint signo = (lx >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + int left_idx = local_indices[left_id]; \\\n\ + int right_idx = local_indices[right_id]; \\\n\ + \\\n\ + dtype left_elem = local_data[left_id]; \\\n\ + dtype right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ +}\n\ +BITONIC_MERGE(int)\n\ +BITONIC_MERGE(uint)\n\ +\n\ +#define BLOCK_SIZE (512)\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32\n\ +(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_tail,\n\ + int _num_stages,\n\ + int width\n\ + )\n\ + {\n\ + uint lx = get_local_id(0);\n\ + const int init_k = -2147483647;\n\ + const int init_v = -2147483647;\n\ + const int num_stages = 9;\n\ + const int threads_per_block = BLOCK_SIZE;\n\ + const int index_minus_1 = threads_per_block * 2 - 1;\n\ + uint offset = 0;\n\ + uint lx1 = lx + threads_per_block;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + __local int local_data[1536];\n\ + __local int local_indices[1536];\n\ +\n\ + int left = read_imagei(input, coord.xy).x;\n\ + coord.z += threads_per_block;\n\ + int right = read_imagei(input, coord.zy).x;\n\ +\n\ + 
local_data[lx] = left;\n\ + local_indices[lx] = coord.x;\n\ + local_data[lx1] = right;\n\ + local_indices[lx1] = coord.z;\n\ +\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_int(num_stages, lx, local_data, local_indices);\n\ +\n\ + int min_data = local_data[511];\n\ +\n\ + int *p_share_k = local_data + threads_per_block;\n\ + int *p_share_v = local_indices + threads_per_block;\n\ +\n\ + int limit = (width >> 10) << 10;\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ +\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\ + {\n\ + int2 data;\n\ + coord.z = coord.x + threads_per_block;\n\ + data.x = read_imagei(input, coord.xy).x;\n\ + data.y = read_imagei(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = data.y;\n\ + p_share_v[lx1] = coord.z;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] < min_data)\n\ + {\n\ + continue;\n\ + }\n\ +\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\ +\n\ + min_data = local_data[511];\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + }\n\ +\n\ + if (width > limit)\n\ + {\n\ + if (coord.x < width)\n\ + {\n\ + int2 data;\n\ + data.x = read_imagei(input, coord.xy).x;\n\ + coord.z = coord.x + threads_per_block;\n\ + data.y = read_imagei(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\ + p_share_v[lx1] = coord.z < width ? 
coord.z : init_v;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] >= min_data)\n\ + {\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\ + }\n\ + }\n\ +\n\ + int4 dst;\n\ + dst.x = local_data[lx];\n\ +\n\ + coord.x = lx;\n\ + write_imagei(output, coord.xy, dst.xxxx);\n\ +\n\ + int4 index;\n\ + index.x = local_indices[lx];\n\ +\n\ + write_imagei(indices, coord.xy, index.xxxx);\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32\n\ +(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_tail,\n\ + int _num_stages,\n\ + int width\n\ + )\n\ + {\n\ + uint lx = get_local_id(0);\n\ + const uint init_k = 0;\n\ + const int init_v = -2147483647;\n\ + const int num_stages = 9;\n\ + const int threads_per_block = BLOCK_SIZE;\n\ + const int index_minus_1 = threads_per_block * 2 - 1;\n\ + uint offset = 0;\n\ + uint lx1 = lx + threads_per_block;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + __local uint local_data[1536];\n\ + __local int local_indices[1536];\n\ +\n\ + uint left = read_imageui(input, coord.xy).x;\n\ + coord.z += threads_per_block;\n\ + uint right = read_imageui(input, coord.zy).x;\n\ +\n\ + local_data[lx] = left;\n\ + local_indices[lx] = coord.x;\n\ + local_data[lx1] = right;\n\ + local_indices[lx1] = coord.z;\n\ +\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_uint(num_stages, lx, local_data, local_indices);\n\ +\n\ + uint min_data = local_data[511];\n\ +\n\ + uint *p_share_k = local_data + threads_per_block;\n\ + int *p_share_v = local_indices + threads_per_block;\n\ +\n\ + int limit = (width >> 10) << 10;\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ +\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\ + {\n\ + uint2 data;\n\ + coord.z = coord.x + threads_per_block;\n\ + data.x = read_imageui(input, coord.xy).x;\n\ + data.y = read_imageui(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = data.y;\n\ + p_share_v[lx1] = coord.z;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] < min_data)\n\ + {\n\ + continue;\n\ + }\n\ +\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\ +\n\ + min_data = local_data[511];\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + }\n\ +\n\ + if (width > limit)\n\ + {\n\ + if (coord.x < width)\n\ + {\n\ + uint2 data;\n\ + data.x = read_imageui(input, coord.xy).x;\n\ + coord.z = coord.x + threads_per_block;\n\ + data.y = read_imageui(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\ + p_share_v[lx1] = coord.z < width ? 
coord.z : init_v;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] >= min_data)\n\ + {\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\ + }\n\ + }\n\ +\n\ + uint4 dst;\n\ + dst.x = local_data[lx];\n\ +\n\ + coord.x = lx;\n\ + write_imageui(output, coord.xy, dst.xxxx);\n\ +\n\ + int4 index;\n\ + index.x = local_indices[lx];\n\ +\n\ + write_imagei(indices, coord.xy, index.xxxx);\n\ +}\n\ +"; /* end of topk2_cl*/ + static const char topk_odd_even_sort_cl[] = "#define LOCAL_SIZE_X (32)\n\ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toF32_I32\n\ (\n\ @@ -79690,12 +81296,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd }\n\ \n\ __local int sorted[1];\n\ - int width_minus_one = width - 1;\n\ - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ -\n\ - int x_start = lid * num_pixels_per_thread;\n\ - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ \n\ sorted[0] = 0;\n\ \n\ @@ -79706,20 +81306,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0;\n\ }\n\ int swapped = 0;\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ // odd-even\n\ - coord.x = x_start;\n\ - coord.z = x_start + 1;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2;\n\ + coord.z = lid * 2 + 1;\n\ + for (; coord.z < width; )\n\ {\n\ float4 left = read_imagef(input_t, coord.xy);\n\ float4 right = read_imagef(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imagef(input_t, coord.xy, right);\n\ @@ -79729,21 +81330,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ // even-odd\n\ - coord.x = x_start + 1;\n\ - coord.z = x_start + 2;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2 + 1;\n\ + coord.z = lid * 2 + 2;\n\ + for (; coord.z < width; )\n\ {\n\ float4 left = read_imagef(input_t, coord.xy);\n\ float4 right = read_imagef(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imagef(input_t, coord.xy, right);\n\ @@ -79753,11 +81356,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ atomic_add(sorted, swapped);\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + 
barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ if (*sorted == 0)\n\ break;\n\ @@ -79803,13 +81406,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd }\n\ \n\ __local int sorted[1];\n\ - int width_minus_one = width - 1;\n\ - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ -\n\ - int x_start = lid * num_pixels_per_thread;\n\ - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ -\n\ sorted[0] = 0;\n\ \n\ while (1)\n\ @@ -79819,20 +81415,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0;\n\ }\n\ int swapped = 0;\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ // odd-even\n\ - coord.x = x_start;\n\ - coord.z = x_start + 1;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2;\n\ + coord.z = lid * 2 + 1;\n\ + for (; coord.z < width; )\n\ {\n\ uint4 left = read_imageui(input_t, coord.xy);\n\ uint4 right = read_imageui(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imageui(input_t, coord.xy, right);\n\ @@ -79842,21 +81439,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ // even-odd\n\ - coord.x = x_start + 1;\n\ - coord.z = x_start + 2;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2 + 1;\n\ + coord.z = lid * 2 + 2;\n\ + for (; coord.z < width; )\n\ {\n\ uint4 left = read_imageui(input_t, coord.xy);\n\ uint4 right = read_imageui(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imageui(input_t, coord.xy, right);\n\ @@ -79866,11 +81465,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ atomic_add(sorted, swapped);\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ if (*sorted == 0)\n\ break;\n\ @@ -79916,13 +81515,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd }\n\ \n\ __local int sorted[1];\n\ - int width_minus_one = width - 1;\n\ - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ -\n\ - int x_start = lid * num_pixels_per_thread;\n\ - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ -\n\ sorted[0] = 0;\n\ \n\ while (1)\n\ @@ -79932,20 +81524,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0;\n\ }\n\ int swapped = 0;\n\ - 
barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ // odd-even\n\ - coord.x = x_start;\n\ - coord.z = x_start + 1;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2;\n\ + coord.z = lid * 2 + 1;\n\ + for (; coord.z < width; )\n\ {\n\ int4 left = read_imagei(input_t, coord.xy);\n\ int4 right = read_imagei(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imagei(input_t, coord.xy, right);\n\ @@ -79955,21 +81548,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ // even-odd\n\ - coord.x = x_start + 1;\n\ - coord.z = x_start + 2;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2 + 1;\n\ + coord.z = lid * 2 + 2;\n\ + for (; coord.z < width; )\n\ {\n\ int4 left = read_imagei(input_t, coord.xy);\n\ int4 right = read_imagei(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imagei(input_t, coord.xy, right);\n\ @@ -79979,11 +81574,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ atomic_add(sorted, swapped);\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ if (*sorted == 0)\n\ break;\n\ @@ -80031,12 +81626,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd }\n\ \n\ __local int sorted[1];\n\ - int width_minus_one = width - 1;\n\ - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ -\n\ - int x_start = lid * num_pixels_per_thread;\n\ - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ \n\ sorted[0] = 0;\n\ \n\ @@ -80047,20 +81636,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0;\n\ }\n\ int swapped = 0;\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ // odd-even\n\ - coord.x = x_start;\n\ - coord.z = x_start + 1;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2;\n\ + coord.z = lid * 2 + 1;\n\ + for (; coord.z < width; )\n\ {\n\ float4 left = read_imagef(input_t, coord.xy);\n\ float4 right = read_imagef(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imagef(input_t, coord.xy, right);\n\ @@ -80070,21 +81660,23 @@ __kernel 
__attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ // even-odd\n\ - coord.x = x_start + 1;\n\ - coord.z = x_start + 2;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2 + 1;\n\ + coord.z = lid * 2 + 2;\n\ + for (; coord.z < width; )\n\ {\n\ float4 left = read_imagef(input_t, coord.xy);\n\ float4 right = read_imagef(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imagef(input_t, coord.xy, right);\n\ @@ -80094,11 +81686,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ atomic_add(sorted, swapped);\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ if (*sorted == 0)\n\ break;\n\ @@ -80146,13 +81738,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd }\n\ \n\ __local int sorted[1];\n\ - int width_minus_one = width - 1;\n\ - int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ - num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ -\n\ - int x_start = lid * num_pixels_per_thread;\n\ - int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ -\n\ sorted[0] = 0;\n\ \n\ while (1)\n\ @@ -80162,20 +81747,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd *sorted = 0;\n\ }\n\ int swapped = 0;\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ // odd-even\n\ - coord.x = x_start;\n\ - coord.z = x_start + 1;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2;\n\ + coord.z = lid * 2 + 1;\n\ + for (; coord.z < width; )\n\ {\n\ float4 left = read_imagef(input_t, coord.xy);\n\ float4 right = read_imagef(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ - int4 l_index = read_imagei(indices_t, coord.xy);\n\ - int4 r_index = read_imagei(indices_t, coord.zy);\n\ swapped = 1;\n\ \n\ write_imagef(input_t, coord.xy, right);\n\ @@ -80185,18 +81771,22 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ // even-odd\n\ - coord.x = x_start + 1;\n\ - coord.z = x_start + 2;\n\ - for (; coord.x < x_end; )\n\ + coord.x = lid * 2 + 1;\n\ + coord.z = lid * 2 + 2;\n\ + for (; coord.z < width; )\n\ {\n\ float4 left = read_imagef(input_t, coord.xy);\n\ float4 right = read_imagef(input_t, coord.zy);\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ \n\ - if (left.x < right.x)\n\ + if ( (left.x < right.x) ||\n\ + (left.x == right.x && l_index.x < r_index.x) )\n\ {\n\ int4 l_index = 
read_imagei(indices_t, coord.xy);\n\ int4 r_index = read_imagei(indices_t, coord.zy);\n\ @@ -80209,11 +81799,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(indices_t, coord.zy, l_index);\n\ }\n\ \n\ - coord.xz = coord.xz + 2;\n\ + coord.xz += 2 * LOCAL_SIZE_X;\n\ }\n\ \n\ atomic_add(sorted, swapped);\n\ - barrier(CLK_GLOBAL_MEM_FENCE);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ if (*sorted == 0)\n\ break;\n\ @@ -80464,6 +82054,12 @@ static const source_map_t evis_resource[] = {"bilinear_grid_sample_I16_to_I16_vx", bilinear_grid_sample_I16_to_I16_vx}, {"bilinear_grid_sample_I8_to_I8_vx", bilinear_grid_sample_I8_to_I8_vx}, {"bilinear_grid_sample_U8_to_U8_vx", bilinear_grid_sample_U8_to_U8_vx}, + {"bilinear_grid_sample_reflect_BF16_to_BF16_vx", bilinear_grid_sample_reflect_BF16_to_BF16_vx}, + {"bilinear_grid_sample_reflect_F16_to_F16_vx", bilinear_grid_sample_reflect_F16_to_F16_vx}, + {"bilinear_grid_sample_reflect_F16_to_U8_vx", bilinear_grid_sample_reflect_F16_to_U8_vx}, + {"bilinear_grid_sample_reflect_I16_to_I16_vx", bilinear_grid_sample_reflect_I16_to_I16_vx}, + {"bilinear_grid_sample_reflect_I8_to_I8_vx", bilinear_grid_sample_reflect_I8_to_I8_vx}, + {"bilinear_grid_sample_reflect_U8_to_U8_vx", bilinear_grid_sample_reflect_U8_to_U8_vx}, {"bucketize_vx", bucketize_vx}, {"cast_vx", cast_vx}, {"clip_F16_vx", clip_F16_vx}, @@ -80753,6 +82349,7 @@ static const source_map_t cl_resource[] = {"avg_pool3d_cl", avg_pool3d_cl}, {"batchnorm_single_cl", batchnorm_single_cl}, {"bilinear_grid_sample_cl", bilinear_grid_sample_cl}, + {"bilinear_grid_sample_reflect_cl", bilinear_grid_sample_reflect_cl}, {"bucketize_cl", bucketize_cl}, {"cast_cl", cast_cl}, {"clip_BF16_cl", clip_BF16_cl}, @@ -80764,7 +82361,6 @@ static const source_map_t cl_resource[] = {"cumsum_cl", cumsum_cl}, {"cumsum_2d_cl", cumsum_2d_cl}, {"depth2space_crd_cl", depth2space_crd_cl}, - {"detect_post_box_cl", detect_post_box_cl}, {"eltwise_ops_helper_cl", eltwise_ops_helper_cl}, {"eltwise_unary_0_cl", eltwise_unary_0_cl}, {"eltwise_unary_1_cl", eltwise_unary_1_cl}, @@ -80888,6 +82484,7 @@ static const source_map_t cl_resource[] = {"swish_cl", swish_cl}, {"tile_cl", tile_cl}, {"topk_cl", topk_cl}, + {"topk2_cl", topk2_cl}, {"topk_odd_even_sort_cl", topk_odd_even_sort_cl}, {"topk_odd_even_sort2_cl", topk_odd_even_sort2_cl}, {"upsample_cl", upsample_cl}, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c index bade3f959..377bba4d8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c @@ -39,6 +39,7 @@ #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" #include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" #include "vsi_nn_error.h" #define _INPUT_NUM (1) @@ -199,14 +200,11 @@ static vsi_bool op_setup vsi_nn_internal_node_t* curr = NULL; float min = self->nn_param.clip.min; float max = self->nn_param.clip.max; - uint32_t infinity = VSI_NN_FLOAT32_INF; - float neg_infinity = -*(float*)&infinity; - int32_t max_float = *(int32_t*)&max; if ( (min == -1.0f && max == 1.0f) || (min == 0.0f && max == 6.0f) - || (min == 0.0f && max_float == VSI_NN_FLOAT32_INF) - || (min == neg_infinity && max_float == VSI_NN_FLOAT32_INF)) + || (min == 0.0f && fp32_is_inf(max)) + || (fp32_is_inf(-min) && fp32_is_inf(max))) { vsi_nn_internal_init_node_wksp(self); if (min == -1.0f && max == 1.0f) @@ -217,7 +215,7 @@ static vsi_bool op_setup { curr = 
vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0); } - else if (min == 0.0f && max_float == VSI_NN_FLOAT32_INF) + else if (min == 0.0f && fp32_is_inf(max)) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU, 0, 0); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c index 235ab87bb..ffac02898 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c @@ -430,10 +430,13 @@ static vsi_bool op_setup // create activation output/hstate_output/cstate_output vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.dtype, TRUE); act_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(act_out, "Create internal tensor failed", final); vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_H_STATE]->attr.dtype, TRUE); act_h_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(act_h_out, "Create internal tensor failed", final); vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_C_STATE]->attr.dtype, TRUE); act_c_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(act_c_out, "Create internal tensor failed", final); curr->outputs[LSTMUNIT_ACT_OUTPUT] = act_out->t; curr->outputs[LSTMUNIT_ACT_HSTATE_OUT] = act_h_out->t; curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = act_c_out->t; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c index ce79a92d9..7020b570d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c @@ -89,7 +89,7 @@ static vsi_status op_grouped_compute if (NULL == LOCAL()->weight_tensor_group) { VSILOGE("Malloc fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__); - return VSI_FAILURE; + goto final; } memset(LOCAL()->weight_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *)); res = vsi_nn_CreateTensorGroup(self->graph, inputs[1], 2, @@ -325,8 +325,8 @@ static vsi_status op_compute #endif // param.a_x = self->nn_param.deconv.dilation; // param.a_y = self->nn_param.deconv.dilation; - param.ext.khr.a_x = 1; - param.ext.khr.a_y = 1; + param.ext.khr.a_x = (size_t)self->nn_param.deconv.output_padding[0]; + param.ext.khr.a_y = (size_t)self->nn_param.deconv.output_padding[1]; param.ext.khr.padding_x = (size_t)self->nn_param.deconv.pad[0]; param.ext.khr.padding_y = (size_t)self->nn_param.deconv.pad[2]; param.ext.khr.overflow_policy = self->vx_param.overflow_policy; @@ -336,6 +336,7 @@ static vsi_status op_compute param.ext.channel_group = self->nn_param.deconv.group; param.stride_x = self->nn_param.deconv.stride[0]; param.stride_y = self->nn_param.deconv.stride[1]; + param.down_scale_size_rounding = self->vx_param.down_scale_size_rounding; //param.border_mode; //param.border_const; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c index 958b06b10..903ca2d2f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c @@ -66,7 +66,7 @@ static vsi_status op_compute input_tensor[1] = tmp_tensor; self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - "signal_frame", + "extra_ending", input_tensor, 2, outputs, 1, NULL ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index 
e7d935843..a8875911a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -65,9 +65,15 @@ static vsi_status op_compute } else { +#define _TENSOR_LEN 64 vsi_nn_tensor_attr_t attr; vsi_nn_tensor_t* temp_tensors = NULL; + char gather_tensor_name[_TENSOR_LEN]; + char copy_tensor_name[_TENSOR_LEN]; + memset(gather_tensor_name, 0, sizeof(gather_tensor_name)); + memset(copy_tensor_name, 0, sizeof(copy_tensor_name)); + VSILOGW("gather is no_range_change operation! \ Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!"); @@ -78,7 +84,20 @@ static vsi_status op_compute temp_tensors = vsi_nn_CreateTensor( self->graph, &attr ); vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, &temp_tensors, 1, param ); + snprintf(gather_tensor_name, sizeof(gather_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0); + if(vxSetReferenceName((vx_reference)temp_tensors->t, gather_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u gather node output name fail", self->uid); + return VSI_FAILURE; + } + n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t); + snprintf(copy_tensor_name, sizeof(copy_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1); + if(vxSetReferenceName((vx_reference)outputs[0]->t, copy_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u copy node output name fail", self->uid); + return VSI_FAILURE; + } vsi_safe_release_tensor(temp_tensors); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c index b77a39db3..d5be535bf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c @@ -38,6 +38,7 @@ #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" typedef struct _gather_elements_local_data_t { int32_t placeholder; @@ -101,6 +102,7 @@ static vsi_status op_compute attr.is_const = FALSE; attr.vtl = TRUE; temp_tensors = vsi_nn_CreateTensor( self->graph, &attr ); + CHECK_PTR_FAIL_GOTO( temp_tensors, "Create tensor fail.", final ); } else { @@ -148,6 +150,7 @@ static vsi_status op_compute vsi_safe_release_tensor(temp_tensors); } +final: vsi_nn_kernel_param_release( ¶m ); if ( self->n ) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c index 86f15f81d..d035ddaed 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c @@ -53,11 +53,13 @@ static vsi_status op_compute vsi_nn_kernel_param_t* param = NULL; int32_t align_corners = self->nn_param.gridsample.align_corners; + int32_t pad_mode = (int32_t)self->nn_param.gridsample.padding_mode; vsi_nn_kernel_node_t n; char kernel_name[128]; param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners); + vsi_nn_kernel_param_add_int32(param, "padding_mode", pad_mode); switch (self->nn_param.gridsample.mode) { case VSI_NN_INTERPOLATION_BILINEAR: @@ -103,13 +105,20 @@ static vsi_bool op_check return FALSE; } - if (!((VSI_NN_PAD_MODE_CONSTANT == + if ((VSI_NN_PAD_MODE_CONSTANT == self->nn_param.gridsample.padding_mode) && - (0 == self->nn_param.gridsample.const_val))) { + (0 != self->nn_param.gridsample.const_val)) { VSILOGE("Only support padding const 0 now!"); return FALSE; } + + if 
(VSI_NN_PAD_MODE_SYMMETRIC == + self->nn_param.gridsample.padding_mode) { + VSILOGE("Can't support VSI_NN_PAD_MODE_SYMMETRIC now!"); + return FALSE; + } + return TRUE; } /* op_check() */ @@ -124,6 +133,11 @@ static vsi_bool op_setup return FALSE; } + if (2 != inputs[1]->attr.size[0]) + { + return FALSE; + } + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; outputs[0]->attr.size[0] = inputs[1]->attr.size[1]; @@ -133,6 +147,16 @@ static vsi_bool op_setup outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; } } + else + { + if ((outputs[0]->attr.dim_num != inputs[0]->attr.dim_num) + || (outputs[0]->attr.size[0] != inputs[1]->attr.size[1]) + || (outputs[0]->attr.size[1] != inputs[1]->attr.size[2])) + { + return FALSE; + } + + } return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index 487e89c26..89c3b844a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -121,6 +121,7 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { +#define _TENSOR_LEN 64 vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; @@ -129,15 +130,36 @@ static vsi_status op_compute vsi_size_t new_rank = 0; vsi_nn_tensor_t * tmp_tensors[4] = {NULL}; + char reshape0_tensor_name[_TENSOR_LEN]; + char reshape1_tensor_name[_TENSOR_LEN]; + char instance_norm_tensor_name[_TENSOR_LEN]; + + memset(reshape0_tensor_name, 0, sizeof(reshape0_tensor_name)); + memset(reshape1_tensor_name, 0, sizeof(reshape1_tensor_name)); + memset(instance_norm_tensor_name, 0, sizeof(instance_norm_tensor_name)); + vsi_nn_optimize_instance_norm_shape(inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank); tmp_tensors[0] = vsi_nn_kernel_insert_reshape_node( self->graph, inputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_BACKWARD ); + snprintf(reshape0_tensor_name, sizeof(reshape0_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0); + if(vxSetReferenceName((vx_reference)tmp_tensors[0]->t, reshape0_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u reshape 0 node output name fail", self->uid); + return VSI_FAILURE; + } tmp_tensors[1] = inputs[1]; tmp_tensors[2] = inputs[2]; tmp_tensors[3] = vsi_nn_kernel_insert_reshape_node( self->graph, outputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_FORWARD ); + snprintf(reshape1_tensor_name, sizeof(reshape1_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1); + if(vxSetReferenceName((vx_reference)outputs[0]->t, reshape1_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u reshap 1 node output name fail", self->uid); + return VSI_FAILURE; + } + status = _try_set_high_presision_tensor(tmp_tensors); if (status != VSI_SUCCESS) { @@ -155,6 +177,12 @@ static vsi_status op_compute self->n = (vx_node)n; status = VSI_SUCCESS; } + snprintf(instance_norm_tensor_name, sizeof(instance_norm_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 2); + if(vxSetReferenceName((vx_reference)tmp_tensors[3]->t, instance_norm_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u instance_norm node output name fail", self->uid); + return VSI_FAILURE; + } if (param != NULL) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index fe227816a..46a389a0f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -259,9 +259,9 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - vsi_nn_safe_free(self->nn_param.layernorm.local); #if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) + vsi_nn_safe_free(self->nn_param.layernorm.local); vsi_nn_internal_deinit_node_wksp( self ); #endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index aa2b231e1..2051c4533 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -161,14 +161,6 @@ static vsi_bool _check_is_sp_supported_type { int32_t * axes = self->nn_param.reduce.local2->axes; int32_t axes_num = self->nn_param.reduce.local2->axes_num; - vsi_size_t shapes[4][VSI_NN_MAX_DIM_NUM] = { {0} }; - int32_t axis_in[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t i = 0; - uint32_t axis_size = 0; - uint32_t rank_in = 0; - uint32_t rank_out = 0; - vsi_bool ret = FALSE; if ( !self->graph->ctx->config.support_stream_processor || (type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN && type != VSI_NN_REDUCE_MAX) ) @@ -191,22 +183,15 @@ static vsi_bool _check_is_sp_supported_type return FALSE; } - for (i = 0; i < axes_num; i++) + if ( (axes_num == 1 && (axes[0] == 0 || axes[0] == 2)) || + (axes_num == 2 && ((axes[0] < 2 && axes[1] < 2) || (axes[0] == 1 && axes[1] == 2))) ) { - shapes[0][i] = input->attr.size[axes[i]]; - shapes[1][i] = 1; - axis_in[i] = i; + return TRUE; } - ret = vsi_nn_kernel_optimize_reduce_shape( - shapes[0], axes_num, - axis_in, axes_num, - shapes[1], axes_num, - shapes[2], &rank_in, shapes[3], &rank_out, - new_axis, &axis_size); - - return ret && axis_size < 3; + return FALSE; } + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -839,82 +824,28 @@ static vsi_bool op_set_sp_reduce_internal vsi_enum type_name ) { - vsi_nn_tensor_attr_t attr; - vsi_nn_internal_tensor_t* tensor1 = NULL; - vsi_nn_tensor_t* new_output = NULL; - uint32_t* permute_in_perm = NULL; - int32_t * new_axis = NULL; - vsi_size_t shapes[VSI_NN_MAX_DIM_NUM] = {1}; - int32_t use_virtual_tensor = TRUE; vsi_nn_internal_node_t* tmp_inode = NULL; int32_t * axes = self->nn_param.reduce.local2->axes; int32_t axes_num = self->nn_param.reduce.local2->axes_num; - int32_t i = 0, j = 0, index = 0; vsi_size_t reduce_size = 1; vsi_bool ret = FALSE; + int32_t i = 0; vsi_nn_internal_init_node_wksp( self ); - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - memcpy(&attr.dtype, &inputs[0]->attr.dtype, sizeof(vsi_nn_dtype_t)); - attr.dim_num = VSI_NN_DIM_AUTO; - attr.vtl = use_virtual_tensor; - attr.is_const = FALSE; - tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); - - tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 ); + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 ); CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); - permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, - inputs[0]->attr.dim_num * sizeof(uint32_t)); - CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, tmp_inode, "Create buffer failed", final); - - for ( i = 0; i < axes_num; i++) - { - shapes[index] = outputs[0]->attr.size[axes[i]]; - permute_in_perm[index ++] = axes[i]; - reduce_size *= inputs[0]->attr.size[axes[i]]; - } - - for ( j = 0; j < (int32_t)inputs[0]->attr.dim_num; j++) - { - for (i = 0; i < axes_num; i++) - { - if (j == 
axes[i]) - { - break; - } - } - if (i == axes_num) - { - shapes[index] = outputs[0]->attr.size[j]; - permute_in_perm[index ++] = j; - } - } - tmp_inode->node->nn_param.permute.perm = permute_in_perm; - tmp_inode->node->nn_param.permute.dim_num = inputs[0]->attr.dim_num; tmp_inode->inputs[0] = inputs[0]; - tmp_inode->outputs[0] = tensor1->t; - vsi_nn_internal_setup_node(self, tmp_inode); - - new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes, outputs[0]->attr.dim_num); - CHECK_PTR_FAIL_GOTO(new_output, "Create tensor failed", final); - self->nn_param.reduce.local2->reshaped_output = new_output; + tmp_inode->outputs[0] = outputs[0]; + tmp_inode->node->nn_param.reduce_mean_internal.axis = axes; + tmp_inode->node->nn_param.reduce_mean_internal.axis_num = axes_num; + tmp_inode->node->nn_param.reduce_mean_internal.type = type_name; - tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 ); - CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); - new_axis = (int32_t *)vsi_nn_internal_new_node_param(tmp_inode, - axes_num * sizeof(int32_t)); - CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(new_axis, tmp_inode, "Create buffer failed", final); for (i = 0; i < axes_num; i++) { - new_axis[i] = i; + reduce_size *= inputs[0]->attr.size[axes[i]]; } - tmp_inode->inputs[0] = tensor1->t; - tmp_inode->outputs[0] = new_output; - tmp_inode->node->nn_param.reduce_mean_internal.axis = new_axis; - tmp_inode->node->nn_param.reduce_mean_internal.axis_num = axes_num; - tmp_inode->node->nn_param.reduce_mean_internal.type = type_name; + if (type_name == VSI_NN_REDUCE_SUM) { tmp_inode->node->nn_param.reduce_mean_internal.scale = 1.0f; @@ -1147,6 +1078,7 @@ static vsi_bool op_set_reduce_internal re_sizes[axes[2]] = 1; new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], re_sizes, dim_num); } + self->nn_param.reduce.local2->reshaped_output = new_output; curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); @@ -1161,7 +1093,6 @@ static vsi_bool op_set_reduce_internal curr->inputs[0] = tmp_output_tensor[1]->t; } curr->outputs[0] = new_output; - self->nn_param.reduce.local2->reshaped_output = new_output; vsi_nn_internal_setup_node(self, curr); } else diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reducel2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reducel2.c index 3b92359c1..f560e8198 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reducel2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reducel2.c @@ -136,7 +136,7 @@ static vsi_bool op_setup attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; reducesum_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - CHECK_PTR_FAIL_GOTO(square_tensor, "Create internal tensor failed", final); + CHECK_PTR_FAIL_GOTO(reducesum_tensor, "Create internal tensor failed", final); reducesum_node = vsi_nn_internal_new_node( self, VSI_NN_OP_REDUCE, 0, 0); CHECK_PTR_FAIL_GOTO(reducesum_node, "Create internal node failed", final); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c index 6ec9d19af..7221928d3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c @@ -37,6 +37,7 @@ #include "vsi_nn_error.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" static vsi_status op_compute @@ -88,7 +89,6 @@ static vsi_bool op_setup float alpha = 0; float max_value = 0; float threshold = 0; - 
uint32_t max_raw = 0; vsi_bool ret = FALSE; if ( NULL == self ) @@ -101,11 +101,9 @@ static vsi_bool op_setup max_value = p->max_value; threshold = p->threshold; - max_raw = *(uint32_t*)&max_value; - vsi_nn_internal_init_node_wksp(self); - if (alpha == 0 && max_raw == VSI_NN_FLOAT32_INF && threshold == 0) + if (alpha == 0.0f && fp32_is_inf(max_value) && threshold == 0.0f) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU, 0, 0); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); @@ -119,14 +117,14 @@ static vsi_bool op_setup curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; } - else if (alpha == 0 && max_value == 6.0f && threshold == 0) + else if (alpha == 0.0f && max_value == 6.0f && threshold == 0.0f) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; } - else if (alpha == 0.1 && max_value == VSI_NN_FLOAT32_INF && threshold == 0) + else if (alpha == 0.1f && max_value == VSI_NN_FLOAT32_INF && threshold == 0.0f) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LEAKY_RELU, 0, 0); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c index 418c6a0e6..662fa967f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -70,7 +70,11 @@ static vsi_status op_compute self->graph, (uint8_t *)self->nn_param.reshape.size, &attr); - + if (NULL == dims_tensor) + { + VSILOGE( "Create tensor fail." ); + return VSI_FAILURE; + } reshape_param.dims = REQUIRED_IO(dims_tensor); self->n = vxTensorReshapeNode(self->graph->g, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c index 6e1c3138d..93d269dfc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c @@ -75,6 +75,11 @@ static vsi_status op_compute self->graph, (uint8_t *)dims_data, &attr); + if (NULL == dims_tensor) + { + VSILOGE( "Create tensor fail." 
); + return VSI_FAILURE; + } reshape_param.dims = REQUIRED_IO(dims_tensor); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c index efa21d605..c1f559f4a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c @@ -52,6 +52,7 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; int32_t align_corners = self->nn_param.resize_internal.align_corners; int32_t half_pixel_centers = self->nn_param.resize_internal.half_pixel_centers; + int32_t type = self->nn_param.resize_internal.type; vsi_enum layout = self->nn_param.resize_internal.layout; vsi_nn_kernel_param_t * param = NULL; @@ -59,6 +60,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners ); vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers ); + vsi_nn_kernel_param_add_int32( param, "type", type ); if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) { @@ -186,6 +188,7 @@ static vsi_status op_init vsi_status status = VSI_SUCCESS; self->nn_param.resize_internal.layout = VSI_NN_RESIZE_LAYOUT_NCHW; + self->nn_param.resize_internal.type = VSI_NN_INTERPOLATION_BILINEAR; return status; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c new file mode 100644 index 000000000..84387d7fd --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c @@ -0,0 +1,202 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util_prv.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param = NULL; + vsi_nn_kernel_node_t n = NULL; + float eps = self->nn_param.rmsnorm.eps; + int32_t axis = self->nn_param.rmsnorm.axis; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_float32(param, "eps", eps); + vsi_nn_kernel_param_add_int32(param, "axis", axis); + n = vsi_nn_kernel_selector(self->graph, "rms_norm", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param); + if (n != NULL) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if (param != NULL) + { + vsi_nn_kernel_param_release(¶m); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num); + + if (!ret) + { + BEGIN_IO_TYPE_DECL(RMS_NORM, 2, 1) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP) + IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP) + IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM) + IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM) + IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP) + IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP) + IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM) + IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM) + IO_TYPE(D_BF16, D_F32, D_BF16) + IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM) + IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP) + IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM) + IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM) + IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16) + IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16) + IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP) + IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM) + IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM) + IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16) + IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16) + IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM) + IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP) + IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM) + IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM) + IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16) + IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16) + IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP) + IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM) + IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM) + IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16) + IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16) + END_IO_TYPE_DECL(RMS_NORM) + if (!VALIDATE_OP_IO_TYPES(RMS_NORM, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + + if (NULL == self) + { + return FALSE; + } + + ret = 
vsi_nn_op_common_setup(self, inputs, outputs); + + return ret; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + self->nn_param.rmsnorm.axis = 0; + self->nn_param.rmsnorm.eps = 1e-8f; + return VSI_SUCCESS; +} /* op_init() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RMSNORM, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_shape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_shape.c new file mode 100644 index 000000000..db6df273a --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_shape.c @@ -0,0 +1,196 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_test.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" + +typedef struct _shape_local_data_t { + vsi_nn_tensor_t *shape_tensor; +} shape_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + + if (outputs[0]->attr.dtype.vx_type != VSI_NN_TYPE_INT32) + { + VSILOGD("Outputs data type not support"); + + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_attr_t attr; + vx_int32 shapes[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t i = 0; + + for ( i = 0; i < inputs[0]->attr.dim_num; i++ ) + { + shapes[i] = (int32_t)inputs[0]->attr.size[i]; + } + + memset(&attr, 0, sizeof(attr)); + attr.size[0] = inputs[0]->attr.dim_num; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + self->nn_param.shape.local->shape_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)shapes, + &attr); + if ( NULL == self->nn_param.shape.local->shape_tensor ) + { + VSILOGE("Create shape_tensor fail.(shape)"); + goto final; + } + + vsi_nn_internal_init_node_wksp(self); + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); + if (NULL == curr) + { + return ret; + } + curr->inputs[0] = self->nn_param.shape.local->shape_tensor; + curr->outputs[0] = outputs[0]; + + ret = vsi_nn_internal_setup_node(self, curr); + +final: + + return ret; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + self->nn_param.shape.local = + (shape_local_data_t *)malloc(sizeof(shape_local_data_t)); + if (NULL == self->nn_param.shape.local) + { + return VSI_FAILURE; + } + memset( self->nn_param.shape.local, 0, sizeof(shape_local_data_t) ); + + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.shape.local) + { + vsi_safe_release_tensor(self->nn_param.shape.local->shape_tensor); + } + vsi_nn_safe_free(self->nn_param.shape.local); + vsi_nn_internal_deinit_node_wksp(self); + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SHAPE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index 84c2dd75d..95dc76ab7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -735,6 +735,15 @@ static vsi_bool op_setup outputs[0]->attr.dim_num++; } + + /*output dim_num is 0, the tensor should be scalar!*/ + if 
(outputs[0]->attr.dim_num == 0) + { + outputs[0]->attr.dim_num = 1; + outputs[0]->attr.size[0] = 1; + + vsi_nn_SetTensorIsScalar(outputs[0], TRUE); + } } _get_stride_slice_start_stop_stride(self, inputs, outputs); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c index 0be22cd64..7e8ae3485 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c @@ -124,9 +124,6 @@ static vsi_status op_compute outputs[0]->attr.size, outputs[0]->attr.dim_num, axis, shapes[1], &rank_out, &new_axis1); - param = vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32( param, "top_k", top_k ); - if (ret) { uint32_t perm_in[VSI_NN_MAX_DIM_NUM] = {0}; @@ -195,10 +192,14 @@ static vsi_status op_compute outputs_tensor[1] = reshape_tensors[2]; } + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "top_k", top_k ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "topk", &input_tensor, _INPUT_NUM, outputs_tensor, _OUTPUT_NUM, param ); + vsi_nn_kernel_param_release( ¶m ); if (axis != 0) { _create_permute_node(self, outputs_tensor[0], reshape_tensors[1], perm_out, rank_in, TRUE); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c index 4b7dd3f61..90479bb81 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c @@ -25,6 +25,7 @@ #include #include +#include #include "vsi_nn_types.h" #include "vsi_nn_log.h" @@ -47,8 +48,6 @@ typedef struct _upsamplescale_local_data_t { #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) -#define _EPSILON 1e-8 - static vsi_status op_compute ( vsi_nn_node_t * self, @@ -69,7 +68,7 @@ static vsi_status op_compute stride = self->nn_param.upsamplescale.stride; scale = self->nn_param.upsamplescale.scale; - if (stride == 1 || vsi_nn_abs(scale - 1.0f) == _EPSILON) + if (stride == 1 || vsi_nn_abs(scale - 1.0f) < FLT_EPSILON) { return vsi_nn_internal_compute_node( self ); } @@ -148,7 +147,7 @@ static vsi_status op_optimize VSI_UNREFERENCED(inputs); VSI_UNREFERENCED(outputs); - if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON) + if (stride == 1 && vsi_nn_abs(scale - 1.0f) < FLT_EPSILON) { return vsi_nn_internal_optimize_node( self, direction ); } @@ -174,7 +173,7 @@ static vsi_bool op_setup vsi_nn_internal_init_node_wksp(self); - if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON) + if (stride == 1 && vsi_nn_abs(scale - 1.0f) < FLT_EPSILON) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); @@ -194,7 +193,7 @@ static vsi_bool op_setup ret = vsi_nn_internal_setup_node(self, curr); } - else if (vsi_nn_abs(scale - 1.0f) == _EPSILON) + else if (vsi_nn_abs(scale - 1.0f) < FLT_EPSILON) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESIZE, 0, 0); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index eb02639c2..4b2aa7aeb 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -469,6 +469,8 @@ static _op_param_gen_t s_op_gen[] = /* REDUCEL2 */ NULL, /* CROP_AND_RESIZE */ NULL, /* TAN */ NULL, + /* RMSNORM */ NULL, + /* SHAPE */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, 
vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index 6f91f9933..e1d9b8198 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -46,6 +46,11 @@ #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" + +#if (defined(__ANDROID__)) && (__ANDROID_API__ > 21) +#include +#endif typedef struct _vx_status_desc_t { @@ -387,10 +392,11 @@ float vsi_nn_DataAsFloat32 ) { float val; - uint32_t *p = (uint32_t*)(&val); + fp32_bit_cast_t fp32_bit_cast; int16_t fp16; - *p = 0xFFFFFFFF; + fp32_bit_cast.data = 0xFFFFFFFF; + val = fp32_bit_cast.val; switch( type ) { case VSI_NN_TYPE_BOOL8: @@ -1462,11 +1468,15 @@ void vsi_nn_get_tensor_clamp_min_max } else { - uint32_t f32_min = 0xff800000; - uint32_t f32_max = 0x7f800000; + fp32_bit_cast_t fp32_bit_cast; + float pos_infinity; + float neg_infinity; + fp32_bit_cast.data = VSI_NN_FLOAT32_INF; + pos_infinity = fp32_bit_cast.val; + neg_infinity = -pos_infinity; - *clampMin = *(float*)&f32_min; - *clampMax = *(float*)&f32_max; + *clampMin = neg_infinity; + *clampMax = pos_infinity; } } diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index fa5804540..7c7ed61d7 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -93,7 +93,7 @@ static vsi_status query_hardware_caps return status; } -#if (defined(__ANDROID__)) && (ANDROID_SDK_VERSION >= 30) +#if (defined(__ANDROID__)) && ((ANDROID_SDK_VERSION >= 30) || (__ANDROID_API__ >= 30)) static const char* ENV_ENABLE_SHADER = "vendor.VIV_VX_ENABLE_SHADER"; static const char* ENV_ENABLE_OPCHECK = "vendor.VSI_NN_ENABLE_OPCHECK"; static const char* ENV_ENABLE_CONCAT_OPTIMIZE = "vendor.VSI_NN_ENABLE_CONCAT_OPTIMIZE"; diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index ded183541..3242621b2 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -799,6 +799,7 @@ static vsi_status batchInference_graph original_inputs_attr = (vsi_nn_tensor_attr_t*)malloc(sizeof(vsi_nn_tensor_attr_t) * graph->max_node_io); original_outputs_attr = (vsi_nn_tensor_attr_t*)malloc(sizeof(vsi_nn_tensor_attr_t) * graph->max_node_io); approximateConstTensor = (vsi_nn_tensor_id_t*)malloc(sizeof(vsi_nn_tensor_id_t) * graph->tensor_num); + CHECK_PTR_FAIL_GOTO(approximateConstTensor, "Malloc fail.", final); memset(approximateConstTensor, -1, sizeof(vsi_nn_tensor_id_t) * graph->tensor_num); if (NULL == inputs || NULL == outputs || NULL == original_inputs_attr || NULL == original_outputs_attr) @@ -878,6 +879,7 @@ static vsi_status batchInference_graph vsi_size_t iterator_list_index = 0; vsi_size_t list_index = 0; vsi_size_t* iterator_list = (vsi_size_t*)malloc(sizeof(vsi_size_t) * (batchNum + 1)); + CHECK_PTR_FAIL_GOTO(iterator_list, "Malloc fail.", final); memset(iterator_list, 0, sizeof(uint32_t) * (batchNum + 1)); if (((vsi_nn_node_prv_t*)node)->split_num > 0) @@ -885,6 +887,7 @@ static vsi_status batchInference_graph iterator_list[iterator_list_index++] = ((vsi_nn_node_prv_t*)node)->split_num; if (((vsi_nn_node_prv_t*)node)->split_num == 1) {/*if user set split_num = 1, there is no need to batch split.*/ + vsi_nn_safe_free(iterator_list); continue; } } @@ -1015,6 +1018,7 @@ static vsi_status batchInference_graph } } + vsi_nn_safe_free(iterator_list); 
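For reference, a minimal standalone C sketch (not part of the patch) of the leak-safe allocation pattern the batchInference_graph hunk above adopts: every per-iteration allocation is checked right after malloc, and it is released on the early-skip path as well as on the normal path. The function and variable names below are illustrative stand-ins, not ovxlib APIs; only malloc/free/memset are real.

#include <stdlib.h>
#include <string.h>

static int process_all_nodes(size_t node_num, size_t batch_num)
{
    int status = 0;
    size_t i;
    for (i = 0; i < node_num; i++)
    {
        size_t *iterator_list = (size_t *)malloc(sizeof(size_t) * (batch_num + 1));
        if (NULL == iterator_list)        /* mirrors CHECK_PTR_FAIL_GOTO(..., final) */
        {
            status = -1;
            goto final;
        }
        memset(iterator_list, 0, sizeof(size_t) * (batch_num + 1));

        if (batch_num == 1)               /* early-skip path still releases the buffer */
        {
            free(iterator_list);
            continue;
        }

        /* ... per-node batch-splitting work would go here ... */

        free(iterator_list);              /* normal path releases it as well */
    }
final:
    return status;
}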
/*restore node input batch number*/ num_of_node_inputs = node->input.num; for (k = 0; k < num_of_node_inputs; k++) @@ -1053,7 +1057,7 @@ static vsi_status batchInference_graph } } - final: +final: for (i = 0; i < graph->node_num; i++) { node_id = nodes_list[i]; @@ -1067,7 +1071,7 @@ static vsi_status batchInference_graph node->input.num, inputs); vsi_nn_GetTensors(graph, node->output.tensors, node->output.num, outputs); - for (j = 0; j < node->output.num; j++) + for (j = 0; outputs && j < node->output.num; j++) { if (outputs[j] == NULL) { diff --git a/src/tim/vx/internal/src/vsi_nn_internal_node.c b/src/tim/vx/internal/src/vsi_nn_internal_node.c index ff5b1cce0..c240d3bea 100644 --- a/src/tim/vx/internal/src/vsi_nn_internal_node.c +++ b/src/tim/vx/internal/src/vsi_nn_internal_node.c @@ -716,6 +716,12 @@ vsi_status vsi_nn_internal_optimize_node for ( i = n - 1; i >= 0; i-- ) { curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)WKSP(node), i); + if ( NULL == curr ) + { + VSILOGE("get point fail"); + status = VSI_FAILURE; + break; + } VSILOGD("Optimize backward for node uid[%u] sub_uid[%u] op[%s]", node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op)); diff --git a/src/tim/vx/internal/src/vsi_nn_log.c b/src/tim/vx/internal/src/vsi_nn_log.c index f617359d0..8449f80d9 100644 --- a/src/tim/vx/internal/src/vsi_nn_log.c +++ b/src/tim/vx/internal/src/vsi_nn_log.c @@ -29,7 +29,7 @@ #include "vsi_nn_log.h" #include "vsi_nn_types.h" -#if (defined(__ANDROID__)) && (ANDROID_SDK_VERSION >= 30) +#if (defined(__ANDROID__)) && ((ANDROID_SDK_VERSION >= 30) || (__ANDROID_API__ >= 30)) static const char* ENV_LOG_LEVEL = "vendor.VSI_NN_LOG_LEVEL"; #else static const char* ENV_LOG_LEVEL = "VSI_NN_LOG_LEVEL"; diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index c6e9daa44..c30d03106 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -1013,6 +1013,7 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParamForCropOnly sizeof(numParams)); if (VSI_SUCCESS != status) { + vsi_nn_safe_free(nodes); goto final; } for (p = 0; p < numParams; p++) diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index d44ecf8eb..4d1022250 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -3258,6 +3258,7 @@ static vsi_bool _init_dummy_tensor #endif // This is a hack that driver doesn't support const scales scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim); + CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); memcpy(scales, tensor->attr.dtype.scales, tensor->attr.dtype.scale_dim * sizeof(float));
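For reference, a minimal standalone C sketch (not part of the patch) of the union-based float bit cast that the clip, relu_keras, and vsi_nn_util hunks switch to in place of pointer type-punning such as *(uint32_t*)&max. fp32_bit_cast_t and the infinity check below are modelled on the ovxlib helpers fp32_is_inf() and VSI_NN_FLOAT32_INF; the actual definitions live in utils/vsi_nn_dtype_util_prv.h and may differ.

#include <stdint.h>
#include <stdio.h>

typedef union
{
    float    val;
    uint32_t data;
} fp32_bit_cast_t;

static int fp32_is_inf_sketch(float x)
{
    fp32_bit_cast_t cast;
    cast.val = x;
    /* +/- infinity: exponent bits all ones, mantissa zero */
    return (cast.data & 0x7FFFFFFFu) == 0x7F800000u;
}

int main(void)
{
    fp32_bit_cast_t inf;
    inf.data = 0x7F800000u;   /* bit pattern of +inf; the patch stores this as VSI_NN_FLOAT32_INF */
    printf("%d %d\n", fp32_is_inf_sketch(inf.val), fp32_is_inf_sketch(1.0f));
    return 0;
}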