Add support for widening_mul in wasm (#5849)
* Add support for widening_mul in wasm
steven-johnson committed Mar 25, 2021
1 parent 9a8ddf7 commit bc42da9
Showing 4 changed files with 99 additions and 16 deletions.
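
For context, this change lets Halide's wasm backend lower widening multiplies to the SIMD extmul instructions. A minimal sketch of a pipeline that should now hit this path (the AOT flow, names, and target string here are illustrative assumptions, not part of this commit):

#include "Halide.h"
using namespace Halide;

int main() {
    // Two 8-bit inputs whose product needs 16 bits.
    ImageParam a(Int(8), 1), b(Int(8), 1);
    Func f("widen");
    Var x("x");
    // Both operands are widened before the multiply, which Halide
    // recognizes as the widening_mul intrinsic.
    f(x) = cast<int16_t>(a(x)) * cast<int16_t>(b(x));
    // 16 lanes of i16 span two 128-bit registers, exercising both
    // the low and high extmul halves.
    f.vectorize(x, 16);
    // With wasm_simd128 enabled, this should compile to
    // i16x8.extmul_low_i8x16_s / i16x8.extmul_high_i8x16_s.
    f.compile_to_static_library("widen", {a, b}, "widen",
                                Target("wasm-32-wasmrt-wasm_simd128"));
    return 0;
}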
5 changes: 4 additions & 1 deletion dependencies/wasm/CMakeLists.txt
@@ -71,7 +71,10 @@ if (WITH_WASM_SHELL)
# We want to deliberately choose a stable version (rather than top-of-tree);
# this might be a canary version (if needed to get the updates to v8 that we need)
# but should be carefully tested before landing.
set(WASM_SHELL_VERSION 8.8.97)
#
# Note that V8 8.9.238 is the first version claiming to implement the final simd spec;
# see https://github.com/WebAssembly/simd/blob/master/proposals/simd/ImplementationStatus.md
set(WASM_SHELL_VERSION 8.9.238)
set(WASM_SHELL_URL "https://storage.googleapis.com/chromium-v8/official/canary/v8-${WASM_SHELL_PLATFORM}-rel-${WASM_SHELL_VERSION}.zip")
message(STATUS "Fetching WASM_SHELL ${WASM_SHELL_URL}...")
FetchContent_Declare(wasm_shell URL "${WASM_SHELL_URL}")
9 changes: 9 additions & 0 deletions src/CodeGen_WebAssembly.cpp
@@ -66,6 +66,15 @@ const WasmIntrinsic intrinsic_defs[] = {
{"llvm.wasm.avgr.unsigned.v16i8", UInt(8, 16), "rounding_halving_add", {UInt(8, 16), UInt(8, 16)}, Target::WasmSimd128},
{"llvm.wasm.avgr.unsigned.v8i16", UInt(16, 8), "rounding_halving_add", {UInt(16, 8), UInt(16, 8)}, Target::WasmSimd128},

#if LLVM_VERSION >= 130
{"widening_mul_i8x16", Int(16, 16), "widening_mul", {Int(8, 16), Int(8, 16)}, Target::WasmSimd128},
{"widening_mul_i16x8", Int(32, 8), "widening_mul", {Int(16, 8), Int(16, 8)}, Target::WasmSimd128},
{"widening_mul_i32x4", Int(64, 4), "widening_mul", {Int(32, 4), Int(32, 4)}, Target::WasmSimd128},
{"widening_mul_u8x16", UInt(16, 16), "widening_mul", {UInt(8, 16), UInt(8, 16)}, Target::WasmSimd128},
{"widening_mul_u16x8", UInt(32, 8), "widening_mul", {UInt(16, 8), UInt(16, 8)}, Target::WasmSimd128},
{"widening_mul_u32x4", UInt(64, 4), "widening_mul", {UInt(32, 4), UInt(32, 4)}, Target::WasmSimd128},
#endif

// TODO: LLVM should support this directly, but doesn't yet.
// To make this work, we need to be able to call the intrinsics with two vecs.
// @abadams sez: "The way I've had to do this in the past is with force-inlined implementations
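For readers without the full file: each intrinsic_defs entry maps a Halide op name to an LLVM-level symbol, either a native llvm.wasm.* intrinsic or one of the force-inlined wrappers added to wasm_math.ll below, with the new entries gated on LLVM 13+. Judging only from the initializer shape above, the entry type looks roughly like this sketch (field names are assumptions, not verbatim from CodeGen_WebAssembly.cpp):

// Hypothetical reconstruction from the initializers above; the real
// struct in Halide may use different names.
struct WasmIntrinsic {
    const char *llvm_name;    // LLVM intrinsic or wasm_math.ll wrapper
    Type ret_type;            // result type of the Halide op
    const char *halide_name;  // op name, e.g. "widening_mul"
    Type arg_types[4];        // operand types
    Target::Feature feature;  // required feature, e.g. Target::WasmSimd128
};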
66 changes: 66 additions & 0 deletions src/runtime/wasm_math.ll
@@ -2,6 +2,7 @@ declare float @llvm.sqrt.f32(float);
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>);
declare <2 x float> @llvm.sqrt.v2f32(<2 x float>);


; fast_inverse

define weak_odr float @fast_inverse_f32(float %x) nounwind alwaysinline {
@@ -39,3 +40,68 @@ define weak_odr <4 x float> @fast_inverse_sqrt_f32x4(<4 x float> %x) nounwind alwaysinline {
ret <4 x float> %z
}

; widening_mul
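; Each wrapper below builds a double-width product from two 128-bit
; inputs: the extmul low/high intrinsics compute the two halves of the
; result, and a shufflevector concatenates them into one wide vector,
; which LLVM later legalizes back onto 128-bit wasm registers.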

declare <2 x i64> @llvm.wasm.extmul.high.signed.v2i64(<4 x i32>, <4 x i32>);
declare <2 x i64> @llvm.wasm.extmul.low.signed.v2i64(<4 x i32>, <4 x i32>);
declare <4 x i32> @llvm.wasm.extmul.high.signed.v4i32(<8 x i16>, <8 x i16>);
declare <4 x i32> @llvm.wasm.extmul.low.signed.v4i32(<8 x i16>, <8 x i16>);
declare <8 x i16> @llvm.wasm.extmul.high.signed.v8i16(<16 x i8>, <16 x i8>);
declare <8 x i16> @llvm.wasm.extmul.low.signed.v8i16(<16 x i8>, <16 x i8>);

; i8 -> i16
define weak_odr <16 x i16> @widening_mul_i8x16(<16 x i8> %x, <16 x i8> %y) nounwind alwaysinline {
%1 = tail call <8 x i16> @llvm.wasm.extmul.low.signed.v8i16(<16 x i8> %x, <16 x i8> %y)
%2 = tail call <8 x i16> @llvm.wasm.extmul.high.signed.v8i16(<16 x i8> %x, <16 x i8> %y)
%3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %3
}

; i16 -> i32
define weak_odr <8 x i32> @widening_mul_i16x8(<8 x i16> %x, <8 x i16> %y) nounwind alwaysinline {
%1 = tail call <4 x i32> @llvm.wasm.extmul.low.signed.v4i32(<8 x i16> %x, <8 x i16> %y)
%2 = tail call <4 x i32> @llvm.wasm.extmul.high.signed.v4i32(<8 x i16> %x, <8 x i16> %y)
%3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %3
}

; i32 -> i64
define weak_odr <4 x i64> @widening_mul_i32x4(<4 x i32> %x, <4 x i32> %y) nounwind alwaysinline {
%1 = tail call <2 x i64> @llvm.wasm.extmul.low.signed.v2i64(<4 x i32> %x, <4 x i32> %y)
%2 = tail call <2 x i64> @llvm.wasm.extmul.high.signed.v2i64(<4 x i32> %x, <4 x i32> %y)
%3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i64> %3
}

declare <2 x i64> @llvm.wasm.extmul.high.unsigned.v2i64(<4 x i32>, <4 x i32>);
declare <2 x i64> @llvm.wasm.extmul.low.unsigned.v2i64(<4 x i32>, <4 x i32>);
declare <4 x i32> @llvm.wasm.extmul.high.unsigned.v4i32(<8 x i16>, <8 x i16>);
declare <4 x i32> @llvm.wasm.extmul.low.unsigned.v4i32(<8 x i16>, <8 x i16>);
declare <8 x i16> @llvm.wasm.extmul.high.unsigned.v8i16(<16 x i8>, <16 x i8>);
declare <8 x i16> @llvm.wasm.extmul.low.unsigned.v8i16(<16 x i8>, <16 x i8>);

; u8 -> u16
define weak_odr <16 x i16> @widening_mul_u8x16(<16 x i8> %x, <16 x i8> %y) nounwind alwaysinline {
%1 = tail call <8 x i16> @llvm.wasm.extmul.low.unsigned.v8i16(<16 x i8> %x, <16 x i8> %y)
%2 = tail call <8 x i16> @llvm.wasm.extmul.high.unsigned.v8i16(<16 x i8> %x, <16 x i8> %y)
%3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %3
}

; u16 -> u32
define weak_odr <8 x i32> @widening_mul_u16x8(<8 x i16> %x, <8 x i16> %y) nounwind alwaysinline {
%1 = tail call <4 x i32> @llvm.wasm.extmul.low.unsigned.v4i32(<8 x i16> %x, <8 x i16> %y)
%2 = tail call <4 x i32> @llvm.wasm.extmul.high.unsigned.v4i32(<8 x i16> %x, <8 x i16> %y)
%3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %3
}

; u32 -> u64
define weak_odr <4 x i64> @widening_mul_u32x4(<4 x i32> %x, <4 x i32> %y) nounwind alwaysinline {
%1 = tail call <2 x i64> @llvm.wasm.extmul.low.unsigned.v2i64(<4 x i32> %x, <4 x i32> %y)
%2 = tail call <2 x i64> @llvm.wasm.extmul.high.unsigned.v2i64(<4 x i32> %x, <4 x i32> %y)
%3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i64> %3
}


35 changes: 20 additions & 15 deletions test/correctness/simd_op_check.cpp
@@ -1657,7 +1657,7 @@ class SimdOpCheck : public SimdOpCheckTest {
// check("i32.extend8_s", 1, i32(i8(x) ^ 1));
// check("i32.extend16_s", 1, i32(i16(x) ^ 1));
// check("i64.extend8_s", 1, i64(i8(x) ^ 1));
// check("i64.extend16_s", 1, i32(i16(x) ^ 1));
// check("i64.extend16_s", 1, i64(i16(x) ^ 1));
// check("i64.extend32_s", 1, i64(i32(x) ^ 1));
}

@@ -1712,6 +1712,9 @@ class SimdOpCheck : public SimdOpCheckTest {
// (This fails to generate, but that's not entirely surprising -- I don't
// think we ever attempt to emit the most general-purpose swizzles in Halide
// code, so this may or may not be a defect.)
//
// TODO: this currently emits a bunch of extract_lane / replace_lane ops,
// so we should definitely try to do better.
// check("v8x16.swizzle", 16*w, in_u8(in_u8(x+32)));

// Integer addition
@@ -1746,20 +1749,22 @@ class SimdOpCheck : public SimdOpCheckTest {
check("i32x4.neg", 4 * w, -i32_1);
check("i64x2.neg", 2 * w, -i64_1);

// Extended integer multiplication
// TODO(https://github.com/halide/Halide/issues/5130): NOT BEING GENERATED AT TRUNK
// check("i16x8.extmul_low_i8x16_s", ???, ???);
// check("i16x8.extmul_high_i8x16_s", ???, ???);
// check("i16x8.extmul_low_i8x16_u", ???, ???);
// check("i16x8.extmul_high_i8x16_u", ???, ???);
// check("i32x4.extmul_low_i16x8_s", ???, ???);
// check("i32x4.extmul_high_i16x8_s", ???, ???);
// check("i32x4.extmul_low_i16x8_u", ???, ???);
// check("i32x4.extmul_high_i16x8_u", ???, ???);
// check("i64x2.extmul_low_i32x4_s", ???, ???);
// check("i64x2.extmul_high_i32x4_s", ???, ???);
// check("i64x2.extmul_low_i32x4_u", ???, ???);
// check("i64x2.extmul_high_i32x4_u", ???, ???);
// Extended (widening) integer multiplication
check("i16x8.extmul_low_i8x16_s", 8 * w, i16(i8_1) * i8_2);
check("i32x4.extmul_low_i16x8_s", 4 * w, i32(i16_1) * i16_2);
check("i64x2.extmul_low_i32x4_s", 2 * w, i64(i32_1) * i32_2);
check("i16x8.extmul_low_i8x16_u", 8 * w, u16(u8_1) * u8_2);
check("i32x4.extmul_low_i16x8_u", 4 * w, u32(u16_1) * u16_2);
check("i64x2.extmul_low_i32x4_u", 2 * w, u64(u32_1) * u32_2);
if (w > 1) {
// Need a register wider than 128 bits for us to generate these
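// (With w == 1 the narrow inputs fit in the low half of a single
// 128-bit vector, so only the extmul_low forms are needed; the high
// forms show up once the virtual vector spans multiple registers.)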
check("i16x8.extmul_high_i8x16_s", 8 * w, i16(i8_1) * i8_2);
check("i32x4.extmul_high_i16x8_s", 4 * w, i32(i16_1) * i16_2);
check("i64x2.extmul_high_i32x4_s", 2 * w, i64(i32_1) * i32_2);
check("i16x8.extmul_high_i8x16_u", 8 * w, u16(u8_1) * u8_2);
check("i32x4.extmul_high_i16x8_u", 4 * w, u32(u16_1) * u16_2);
check("i64x2.extmul_high_i32x4_u", 2 * w, u64(u32_1) * u32_2);
}

// Extended pairwise integer addition
// TODO(https://github.com/halide/Halide/issues/5130): NOT BEING GENERATED AT TRUNK
