diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu index 627ef44e6fd75..c10ab7277e788 100644 --- a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu @@ -38,10 +38,12 @@ __global__ void ElementwiseMask(const T* a, const T* b, T* res, int num_elements) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) auto tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= num_elements) return; const T zero = 0; res[tid] = b[tid] >= zero ? a[tid] : zero; +#endif } template @@ -121,6 +123,7 @@ __global__ void ReduceSum2( template <> __global__ void ReduceSum2( const half* src, half* dst, int bsz, int nb_head, int max_seq_len) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) int tid = threadIdx.x; int bid = blockIdx.x; int num_blocks_per_head = ((max_seq_len / blockDim.x) * max_seq_len); @@ -152,6 +155,7 @@ __global__ void ReduceSum2( static_cast(bsz * max_seq_len), static_cast(res_half[0])); } +#endif } template