diff --git a/dependencies/wasm/CMakeLists.txt b/dependencies/wasm/CMakeLists.txt
index a959fc17fff5..f2c5937faba9 100644
--- a/dependencies/wasm/CMakeLists.txt
+++ b/dependencies/wasm/CMakeLists.txt
@@ -71,7 +71,10 @@ if (WITH_WASM_SHELL)
     # We want to deliberately choose a stable version (rather than top-of-tree);
     # this might be a canary version (if needed to get the updates to v8 that we need)
     # but should be carefully tested before landing.
-    set(WASM_SHELL_VERSION 8.8.97)
+    #
+    # Note that V8 8.9.238 is the first version claiming to implement the final simd spec;
+    # see https://github.com/WebAssembly/simd/blob/master/proposals/simd/ImplementationStatus.md
+    set(WASM_SHELL_VERSION 8.9.238)
     set(WASM_SHELL_URL "https://storage.googleapis.com/chromium-v8/official/canary/v8-${WASM_SHELL_PLATFORM}-rel-${WASM_SHELL_VERSION}.zip")
     message(STATUS "Fetching WASM_SHELL ${WASM_SHELL_URL}...")
     FetchContent_Declare(wasm_shell URL "${WASM_SHELL_URL}")
diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp
index a94e7c78b988..72871733393c 100644
--- a/src/CodeGen_WebAssembly.cpp
+++ b/src/CodeGen_WebAssembly.cpp
@@ -66,6 +66,15 @@ const WasmIntrinsic intrinsic_defs[] = {
     {"llvm.wasm.avgr.unsigned.v16i8", UInt(8, 16), "rounding_halving_add", {UInt(8, 16), UInt(8, 16)}, Target::WasmSimd128},
     {"llvm.wasm.avgr.unsigned.v8i16", UInt(16, 8), "rounding_halving_add", {UInt(16, 8), UInt(16, 8)}, Target::WasmSimd128},
 
+#if LLVM_VERSION >= 130
+    {"widening_mul_i8x16", Int(16, 16), "widening_mul", {Int(8, 16), Int(8, 16)}, Target::WasmSimd128},
+    {"widening_mul_i16x8", Int(32, 8), "widening_mul", {Int(16, 8), Int(16, 8)}, Target::WasmSimd128},
+    {"widening_mul_i32x4", Int(64, 4), "widening_mul", {Int(32, 4), Int(32, 4)}, Target::WasmSimd128},
+    {"widening_mul_u8x16", UInt(16, 16), "widening_mul", {UInt(8, 16), UInt(8, 16)}, Target::WasmSimd128},
+    {"widening_mul_u16x8", UInt(32, 8), "widening_mul", {UInt(16, 8), UInt(16, 8)}, Target::WasmSimd128},
+    {"widening_mul_u32x4", UInt(64, 4), "widening_mul", {UInt(32, 4), UInt(32, 4)}, Target::WasmSimd128},
+#endif
+
     // TODO: LLVM should support this directly, but doesn't yet.
     // To make this work, we need to be able to call the intrinsics with two vecs.
    // @abadams sez: "The way I've had to do this in the past is with force-inlined implementations
diff --git a/src/runtime/wasm_math.ll b/src/runtime/wasm_math.ll
index 609a6ec8677d..93b0c14cf1d8 100644
--- a/src/runtime/wasm_math.ll
+++ b/src/runtime/wasm_math.ll
@@ -2,6 +2,7 @@
 declare float @llvm.sqrt.f32(float);
 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>);
 declare <2 x float> @llvm.sqrt.v2f32(<2 x float>);
+
 ; fast_inverse
 
 define weak_odr float @fast_inverse_f32(float %x) nounwind alwaysinline {
@@ -39,3 +40,68 @@ define weak_odr <4 x float> @fast_inverse_sqrt_f32x4(<4 x float> %x) nounwind alwaysinline {
  ret <4 x float> %z
 }
+; widening_mul
+
+declare <2 x i64> @llvm.wasm.extmul.high.signed.v2i64(<4 x i32>, <4 x i32>);
+declare <2 x i64> @llvm.wasm.extmul.low.signed.v2i64(<4 x i32>, <4 x i32>);
+declare <4 x i32> @llvm.wasm.extmul.high.signed.v4i32(<8 x i16>, <8 x i16>);
+declare <4 x i32> @llvm.wasm.extmul.low.signed.v4i32(<8 x i16>, <8 x i16>);
+declare <8 x i16> @llvm.wasm.extmul.high.signed.v8i16(<16 x i8>, <16 x i8>);
+declare <8 x i16> @llvm.wasm.extmul.low.signed.v8i16(<16 x i8>, <16 x i8>);
+
+; i8 -> i16
+define weak_odr <16 x i16> @widening_mul_i8x16(<16 x i8> %x, <16 x i8> %y) nounwind alwaysinline {
+ %1 = tail call <8 x i16> @llvm.wasm.extmul.low.signed.v8i16(<16 x i8> %x, <16 x i8> %y)
+ %2 = tail call <8 x i16> @llvm.wasm.extmul.high.signed.v8i16(<16 x i8> %x, <16 x i8> %y)
+ %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %3
+}
+
+; i16 -> i32
+define weak_odr <8 x i32> @widening_mul_i16x8(<8 x i16> %x, <8 x i16> %y) nounwind alwaysinline {
+ %1 = tail call <4 x i32> @llvm.wasm.extmul.low.signed.v4i32(<8 x i16> %x, <8 x i16> %y)
+ %2 = tail call <4 x i32> @llvm.wasm.extmul.high.signed.v4i32(<8 x i16> %x, <8 x i16> %y)
+ %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %3
+}
+
+; i32 -> i64
+define weak_odr <4 x i64> @widening_mul_i32x4(<4 x i32> %x, <4 x i32> %y) nounwind alwaysinline {
+ %1 = tail call <2 x i64> @llvm.wasm.extmul.low.signed.v2i64(<4 x i32> %x, <4 x i32> %y)
+ %2 = tail call <2 x i64> @llvm.wasm.extmul.high.signed.v2i64(<4 x i32> %x, <4 x i32> %y)
+ %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i64> %3
+}
+
+declare <2 x i64> @llvm.wasm.extmul.high.unsigned.v2i64(<4 x i32>, <4 x i32>);
+declare <2 x i64> @llvm.wasm.extmul.low.unsigned.v2i64(<4 x i32>, <4 x i32>);
+declare <4 x i32> @llvm.wasm.extmul.high.unsigned.v4i32(<8 x i16>, <8 x i16>);
+declare <4 x i32> @llvm.wasm.extmul.low.unsigned.v4i32(<8 x i16>, <8 x i16>);
+declare <8 x i16> @llvm.wasm.extmul.high.unsigned.v8i16(<16 x i8>, <16 x i8>);
+declare <8 x i16> @llvm.wasm.extmul.low.unsigned.v8i16(<16 x i8>, <16 x i8>);
+
+; u8 -> u16
+define weak_odr <16 x i16> @widening_mul_u8x16(<16 x i8> %x, <16 x i8> %y) nounwind alwaysinline {
+ %1 = tail call <8 x i16> @llvm.wasm.extmul.low.unsigned.v8i16(<16 x i8> %x, <16 x i8> %y)
+ %2 = tail call <8 x i16> @llvm.wasm.extmul.high.unsigned.v8i16(<16 x i8> %x, <16 x i8> %y)
+ %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %3
+}
+
+; u16 -> u32
+define weak_odr <8 x i32> @widening_mul_u16x8(<8 x i16> %x, <8 x i16> %y) nounwind alwaysinline {
+ %1 = tail call <4 x i32> @llvm.wasm.extmul.low.unsigned.v4i32(<8 x i16> %x, <8 x i16> %y)
+ %2 = tail call <4 x i32> @llvm.wasm.extmul.high.unsigned.v4i32(<8 x i16> %x, <8 x i16> %y)
+ %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %3
+}
+
+; u32 -> u64
+define weak_odr <4 x i64> @widening_mul_u32x4(<4 x i32> %x, <4 x i32> %y) nounwind alwaysinline {
+ %1 = tail call <2 x i64> @llvm.wasm.extmul.low.unsigned.v2i64(<4 x i32> %x, <4 x i32> %y)
+ %2 = tail call <2 x i64> @llvm.wasm.extmul.high.unsigned.v2i64(<4 x i32> %x, <4 x i32> %y)
+ %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i64> %3
+}
+
+
diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp
index 0a8b5c739600..3cb6afa0a359 100644
--- a/test/correctness/simd_op_check.cpp
+++ b/test/correctness/simd_op_check.cpp
@@ -1657,7 +1657,7 @@ class SimdOpCheck : public SimdOpCheckTest {
             // check("i32.extend8_s", 1, i32(i8(x) ^ 1));
             // check("i32.extend16_s", 1, i32(i16(x) ^ 1));
             // check("i64.extend8_s", 1, i64(i8(x) ^ 1));
-            // check("i64.extend16_s", 1, i32(i16(x) ^ 1));
+            // check("i64.extend16_s", 1, i64(i16(x) ^ 1));
             // check("i64.extend32_s", 1, i64(i32(x) ^ 1));
         }
 
@@ -1712,6 +1712,9 @@ class SimdOpCheck : public SimdOpCheckTest {
             // (This fails to generate, but that's not entirely surprising -- I don't
             // think we ever attempt to emit the most general-purpose swizzles in Halide
            // code, so this may or may not be a defect.)
+            //
+            // TODO: this currently emits a bunch of extract_lane / replace_lane ops,
+            // so we should definitely try to do better.
             // check("v8x16.swizzle", 16*w, in_u8(in_u8(x+32)));
 
             // Integer addition
@@ -1746,20 +1749,22 @@ class SimdOpCheck : public SimdOpCheckTest {
             check("i32x4.neg", 4 * w, -i32_1);
             check("i64x2.neg", 2 * w, -i64_1);
 
-            // Extended integer multiplication
-            // TODO(https://github.com/halide/Halide/issues/5130): NOT BEING GENERATED AT TRUNK
-            // check("i16x8.extmul_low_i8x16_s", ???, ???);
-            // check("i16x8.extmul_high_i8x16_s", ???, ???);
-            // check("i16x8.extmul_low_i8x16_u", ???, ???);
-            // check("i16x8.extmul_high_i8x16_u", ???, ???);
-            // check("i32x4.extmul_low_i16x8_s", ???, ???);
-            // check("i32x4.extmul_high_i16x8_s", ???, ???);
-            // check("i32x4.extmul_low_i16x8_u", ???, ???);
-            // check("i32x4.extmul_high_i16x8_u", ???, ???);
-            // check("i64x2.extmul_low_i32x4_s", ???, ???);
-            // check("i64x2.extmul_high_i32x4_s", ???, ???);
-            // check("i64x2.extmul_low_i32x4_u", ???, ???);
-            // check("i64x2.extmul_high_i32x4_u", ???, ???);
+            // Extended (widening) integer multiplication
+            check("i16x8.extmul_low_i8x16_s", 8 * w, i16(i8_1) * i8_2);
+            check("i32x4.extmul_low_i16x8_s", 4 * w, i32(i16_1) * i16_2);
+            check("i64x2.extmul_low_i32x4_s", 2 * w, i64(i32_1) * i32_2);
+            check("i16x8.extmul_low_i8x16_u", 8 * w, u16(u8_1) * u8_2);
+            check("i32x4.extmul_low_i16x8_u", 4 * w, u32(u16_1) * u16_2);
+            check("i64x2.extmul_low_i32x4_u", 2 * w, u64(u32_1) * u32_2);
+            if (w > 1) {
+                // Need a register wider than 128 bits for us to generate these
+                check("i16x8.extmul_high_i8x16_s", 8 * w, i16(i8_1) * i8_2);
+                check("i32x4.extmul_high_i16x8_s", 4 * w, i32(i16_1) * i16_2);
+                check("i64x2.extmul_high_i32x4_s", 2 * w, i64(i32_1) * i32_2);
+                check("i16x8.extmul_high_i8x16_u", 8 * w, u16(u8_1) * u8_2);
+                check("i32x4.extmul_high_i16x8_u", 4 * w, u32(u16_1) * u16_2);
+                check("i64x2.extmul_high_i32x4_u", 2 * w, u64(u32_1) * u32_2);
+            }
 
             // Extended pairwise integer addition
             // TODO(https://github.com/halide/Halide/issues/5130): NOT BEING GENERATED AT TRUNK
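
For reference (not part of the diff): a minimal sketch of the Halide pattern that the new widening_mul support targets, written in the same style as the simd_op_check expressions above. It assumes a Halide build that includes these changes plus an LLVM with the WebAssembly backend; the function names, output filename, and exact target string below are illustrative, not taken from the PR.

#include "Halide.h"
using namespace Halide;

int main() {
    ImageParam a(Int(8), 1), b(Int(8), 1);
    Var x("x");

    Func f("widening_mul_example");
    // i16 = i8 * i8: both operands are widened before the multiply, which is the
    // pattern Halide recognizes as widening_mul and maps to the intrinsics above.
    f(x) = cast<int16_t>(a(x)) * cast<int16_t>(b(x));

    // 16 lanes of i16 is wider than one 128-bit register, so both the
    // extmul_low and extmul_high forms should appear in the output.
    f.vectorize(x, 16);

    Target t("wasm-32-wasmrt-wasm_simd128");  // illustrative target string
    f.compile_to_assembly("widening_mul_example.s", {a, b}, t);
    return 0;
}

Inspecting the generated assembly (as simd_op_check does) should show i16x8.extmul_low_i8x16_s and i16x8.extmul_high_i8x16_s rather than a scalarized multiply.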