diff --git a/dependencies/wasm/CMakeLists.txt b/dependencies/wasm/CMakeLists.txt
index a959fc17fff5..f2c5937faba9 100644
--- a/dependencies/wasm/CMakeLists.txt
+++ b/dependencies/wasm/CMakeLists.txt
@@ -71,7 +71,10 @@ if (WITH_WASM_SHELL)
     # We want to deliberately choose a stable version (rather than top-of-tree);
     # this might be a canary version (if needed to get the updates to v8 that we need)
     # but should be carefully tested before landing.
-    set(WASM_SHELL_VERSION 8.8.97)
+    #
+    # Note that V8 8.9.238 is the first version claiming to implement the final simd spec;
+    # see https://github.com/WebAssembly/simd/blob/master/proposals/simd/ImplementationStatus.md
+    set(WASM_SHELL_VERSION 8.9.238)
     set(WASM_SHELL_URL "https://storage.googleapis.com/chromium-v8/official/canary/v8-${WASM_SHELL_PLATFORM}-rel-${WASM_SHELL_VERSION}.zip")
     message(STATUS "Fetching WASM_SHELL ${WASM_SHELL_URL}...")
     FetchContent_Declare(wasm_shell URL "${WASM_SHELL_URL}")
diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp
index a94e7c78b988..72871733393c 100644
--- a/src/CodeGen_WebAssembly.cpp
+++ b/src/CodeGen_WebAssembly.cpp
@@ -66,6 +66,15 @@ const WasmIntrinsic intrinsic_defs[] = {
     {"llvm.wasm.avgr.unsigned.v16i8", UInt(8, 16), "rounding_halving_add", {UInt(8, 16), UInt(8, 16)}, Target::WasmSimd128},
     {"llvm.wasm.avgr.unsigned.v8i16", UInt(16, 8), "rounding_halving_add", {UInt(16, 8), UInt(16, 8)}, Target::WasmSimd128},
 
+#if LLVM_VERSION >= 130
+    {"widening_mul_i8x16", Int(16, 16), "widening_mul", {Int(8, 16), Int(8, 16)}, Target::WasmSimd128},
+    {"widening_mul_i16x8", Int(32, 8), "widening_mul", {Int(16, 8), Int(16, 8)}, Target::WasmSimd128},
+    {"widening_mul_i32x4", Int(64, 4), "widening_mul", {Int(32, 4), Int(32, 4)}, Target::WasmSimd128},
+    {"widening_mul_u8x16", UInt(16, 16), "widening_mul", {UInt(8, 16), UInt(8, 16)}, Target::WasmSimd128},
+    {"widening_mul_u16x8", UInt(32, 8), "widening_mul", {UInt(16, 8), UInt(16, 8)}, Target::WasmSimd128},
+    {"widening_mul_u32x4", UInt(64, 4), "widening_mul", {UInt(32, 4), UInt(32, 4)}, Target::WasmSimd128},
+#endif
+
     // TODO: LLVM should support this directly, but doesn't yet.
     // To make this work, we need to be able to call the intrinsics with two vecs.
    // @abadams sez: "The way I've had to do this in the past is with force-inlined implementations
diff --git a/src/runtime/wasm_math.ll b/src/runtime/wasm_math.ll
index 609a6ec8677d..93b0c14cf1d8 100644
--- a/src/runtime/wasm_math.ll
+++ b/src/runtime/wasm_math.ll
@@ -2,6 +2,7 @@
 declare float @llvm.sqrt.f32(float);
 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>);
 declare <2 x float> @llvm.sqrt.v2f32(<2 x float>);
+
 ; fast_inverse
 
 define weak_odr float @fast_inverse_f32(float %x) nounwind alwaysinline {
@@ -39,3 +40,68 @@ define weak_odr <4 x float> @fast_inverse_sqrt_f32x4(<4 x float> %x) nounwind alwaysinline {
  ret <4 x float> %z
 }
+; widening_mul
+
+declare <2 x i64> @llvm.wasm.extmul.high.signed.v2i64(<4 x i32>, <4 x i32>);
+declare <2 x i64> @llvm.wasm.extmul.low.signed.v2i64(<4 x i32>, <4 x i32>);
+declare <4 x i32> @llvm.wasm.extmul.high.signed.v4i32(<8 x i16>, <8 x i16>);
+declare <4 x i32> @llvm.wasm.extmul.low.signed.v4i32(<8 x i16>, <8 x i16>);
+declare <8 x i16> @llvm.wasm.extmul.high.signed.v8i16(<16 x i8>, <16 x i8>);
+declare <8 x i16> @llvm.wasm.extmul.low.signed.v8i16(<16 x i8>, <16 x i8>);
+
+; i8 -> i16
+define weak_odr <16 x i16> @widening_mul_i8x16(<16 x i8> %x, <16 x i8> %y) nounwind alwaysinline {
+ %1 = tail call <8 x i16> @llvm.wasm.extmul.low.signed.v8i16(<16 x i8> %x, <16 x i8> %y)
+ %2 = tail call <8 x i16> @llvm.wasm.extmul.high.signed.v8i16(<16 x i8> %x, <16 x i8> %y)
+ %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %3
+}
+
+; i16 -> i32
+define weak_odr <8 x i32> @widening_mul_i16x8(<8 x i16> %x, <8 x i16> %y) nounwind alwaysinline {
+ %1 = tail call <4 x i32> @llvm.wasm.extmul.low.signed.v4i32(<8 x i16> %x, <8 x i16> %y)
+ %2 = tail call <4 x i32> @llvm.wasm.extmul.high.signed.v4i32(<8 x i16> %x, <8 x i16> %y)
+ %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %3
+}
+
+; i32 -> i64
+define weak_odr <4 x i64> @widening_mul_i32x4(<4 x i32> %x, <4 x i32> %y) nounwind alwaysinline {
+ %1 = tail call <2 x i64> @llvm.wasm.extmul.low.signed.v2i64(<4 x i32> %x, <4 x i32> %y)
+ %2 = tail call <2 x i64> @llvm.wasm.extmul.high.signed.v2i64(<4 x i32> %x, <4 x i32> %y)
+ %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i64> %3
+}
+
+declare <2 x i64> @llvm.wasm.extmul.high.unsigned.v2i64(<4 x i32>, <4 x i32>);
+declare <2 x i64> @llvm.wasm.extmul.low.unsigned.v2i64(<4 x i32>, <4 x i32>);
+declare <4 x i32> @llvm.wasm.extmul.high.unsigned.v4i32(<8 x i16>, <8 x i16>);
+declare <4 x i32> @llvm.wasm.extmul.low.unsigned.v4i32(<8 x i16>, <8 x i16>);
+declare <8 x i16> @llvm.wasm.extmul.high.unsigned.v8i16(<16 x i8>, <16 x i8>);
+declare <8 x i16> @llvm.wasm.extmul.low.unsigned.v8i16(<16 x i8>, <16 x i8>);
+
+; u8 -> u16
+define weak_odr <16 x i16> @widening_mul_u8x16(<16 x i8> %x, <16 x i8> %y) nounwind alwaysinline {
+ %1 = tail call <8 x i16> @llvm.wasm.extmul.low.unsigned.v8i16(<16 x i8> %x, <16 x i8> %y)
+ %2 = tail call <8 x i16> @llvm.wasm.extmul.high.unsigned.v8i16(<16 x i8> %x, <16 x i8> %y)
+ %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %3
+}
+
+; u16 -> u32
+define weak_odr <8 x i32> @widening_mul_u16x8(<8 x i16> %x, <8 x i16> %y) nounwind alwaysinline {
+ %1 = tail call <4 x i32> @llvm.wasm.extmul.low.unsigned.v4i32(<8 x i16> %x, <8 x i16> %y)
+ %2 = tail call <4 x i32> @llvm.wasm.extmul.high.unsigned.v4i32(<8 x i16> %x, <8 x i16> %y)
+ %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %3
+}
+
+; u32 -> u64
+define weak_odr <4 x i64> @widening_mul_u32x4(<4 x i32> %x, <4 x i32> %y) nounwind alwaysinline {
+ %1 = tail call <2 x i64> @llvm.wasm.extmul.low.unsigned.v2i64(<4 x i32> %x, <4 x i32> %y)
+ %2 = tail call <2 x i64> @llvm.wasm.extmul.high.unsigned.v2i64(<4 x i32> %x, <4 x i32> %y)
+ %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i64> %3
+}
+
+
diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp
index 0a8b5c739600..3cb6afa0a359 100644
--- a/test/correctness/simd_op_check.cpp
+++ b/test/correctness/simd_op_check.cpp
@@ -1657,7 +1657,7 @@ class SimdOpCheck : public SimdOpCheckTest {
             // check("i32.extend8_s", 1, i32(i8(x) ^ 1));
             // check("i32.extend16_s", 1, i32(i16(x) ^ 1));
             // check("i64.extend8_s", 1, i64(i8(x) ^ 1));
-            // check("i64.extend16_s", 1, i32(i16(x) ^ 1));
+            // check("i64.extend16_s", 1, i64(i16(x) ^ 1));
             // check("i64.extend32_s", 1, i64(i32(x) ^ 1));
         }
 
@@ -1712,6 +1712,9 @@ class SimdOpCheck : public SimdOpCheckTest {
             // (This fails to generate, but that's not entirely surprising -- I don't
             // think we ever attempt to emit the most general-purpose swizzles in Halide
            // code, so this may or may not be a defect.)
+            //
+            // TODO: this currently emits a bunch of extract_lane / replace_lane ops,
+            // so we should definitely try to do better.
             // check("v8x16.swizzle", 16*w, in_u8(in_u8(x+32)));
 
             // Integer addition
@@ -1746,20 +1749,22 @@ class SimdOpCheck : public SimdOpCheckTest {
             check("i32x4.neg", 4 * w, -i32_1);
             check("i64x2.neg", 2 * w, -i64_1);
 
-            // Extended integer multiplication
-            // TODO(https://github.com/halide/Halide/issues/5130): NOT BEING GENERATED AT TRUNK
-            // check("i16x8.extmul_low_i8x16_s", ???, ???);
-            // check("i16x8.extmul_high_i8x16_s", ???, ???);
-            // check("i16x8.extmul_low_i8x16_u", ???, ???);
-            // check("i16x8.extmul_high_i8x16_u", ???, ???);
-            // check("i32x4.extmul_low_i16x8_s", ???, ???);
-            // check("i32x4.extmul_high_i16x8_s", ???, ???);
-            // check("i32x4.extmul_low_i16x8_u", ???, ???);
-            // check("i32x4.extmul_high_i16x8_u", ???, ???);
-            // check("i64x2.extmul_low_i32x4_s", ???, ???);
-            // check("i64x2.extmul_high_i32x4_s", ???, ???);
-            // check("i64x2.extmul_low_i32x4_u", ???, ???);
-            // check("i64x2.extmul_high_i32x4_u", ???, ???);
+            // Extended (widening) integer multiplication
+            check("i16x8.extmul_low_i8x16_s", 8 * w, i16(i8_1) * i8_2);
+            check("i32x4.extmul_low_i16x8_s", 4 * w, i32(i16_1) * i16_2);
+            check("i64x2.extmul_low_i32x4_s", 2 * w, i64(i32_1) * i32_2);
+            check("i16x8.extmul_low_i8x16_u", 8 * w, u16(u8_1) * u8_2);
+            check("i32x4.extmul_low_i16x8_u", 4 * w, u32(u16_1) * u16_2);
+            check("i64x2.extmul_low_i32x4_u", 2 * w, u64(u32_1) * u32_2);
+            if (w > 1) {
+                // Need a register wider than 128 bits for us to generate these
+                check("i16x8.extmul_high_i8x16_s", 8 * w, i16(i8_1) * i8_2);
+                check("i32x4.extmul_high_i16x8_s", 4 * w, i32(i16_1) * i16_2);
+                check("i64x2.extmul_high_i32x4_s", 2 * w, i64(i32_1) * i32_2);
+                check("i16x8.extmul_high_i8x16_u", 8 * w, u16(u8_1) * u8_2);
+                check("i32x4.extmul_high_i16x8_u", 4 * w, u32(u16_1) * u16_2);
+                check("i64x2.extmul_high_i32x4_u", 2 * w, u64(u32_1) * u32_2);
+            }
 
             // Extended pairwise integer addition
             // TODO(https://github.com/halide/Halide/issues/5130): NOT BEING GENERATED AT TRUNK
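
For reference (not part of the diff): a minimal sketch of the Halide pattern that the new widening_mul support targets, written in the same style as the simd_op_check expressions above. It assumes a Halide build that includes these changes plus an LLVM with the WebAssembly backend; the function names, output filename, and exact target string below are illustrative, not taken from the PR.

#include "Halide.h"
using namespace Halide;

int main() {
    ImageParam a(Int(8), 1), b(Int(8), 1);
    Var x("x");

    Func f("widening_mul_example");
    // i16 = i8 * i8: both operands are widened before the multiply, which is the
    // pattern Halide recognizes as widening_mul and maps to the intrinsics above.
    f(x) = cast<int16_t>(a(x)) * cast<int16_t>(b(x));

    // 16 lanes of i16 is wider than one 128-bit register, so both the
    // extmul_low and extmul_high forms should appear in the output.
    f.vectorize(x, 16);

    Target t("wasm-32-wasmrt-wasm_simd128");  // illustrative target string
    f.compile_to_assembly("widening_mul_example.s", {a, b}, t);
    return 0;
}

Inspecting the generated assembly (as simd_op_check does) should show i16x8.extmul_low_i8x16_s and i16x8.extmul_high_i8x16_s rather than a scalarized multiply.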