halide · steven-johnson · Mar 30, 2021 · Mar 24, 2021 · Mar 25, 2021 · Mar 25, 2021
diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp
@@ -1,15 +1,22 @@
-#include "CodeGen_Posix.h"
+#include <functional>
+#include <sstream>
 
+#include "CodeGen_Posix.h"
+#include "ConciseCasts.h"
+#include "IRMatch.h"
+#include "IROperator.h"
 #include "LLVM_Headers.h"
-#include <sstream>
 
 namespace Halide {
 namespace Internal {
 
 using std::string;
+using std::vector;
 
 #if defined(WITH_WEBASSEMBLY)
 
+using namespace Halide::ConciseCasts;
+
 namespace {
 
 /** A code generator that emits WebAssembly code from a given Halide stmt. */
@@ -27,6 +34,7 @@ class CodeGen_WebAssembly : public CodeGen_Posix {
     bool use_soft_float_abi() const override;
     int native_vector_bits() const override;
     bool use_pic() const override;
+    void codegen_vector_reduce(const VectorReduce *, const Expr &) override;
 };
 
 CodeGen_WebAssembly::CodeGen_WebAssembly(const Target &t)
@@ -67,12 +75,24 @@ const WasmIntrinsic intrinsic_defs[] = {
     {"llvm.wasm.avgr.unsigned.v8i16", UInt(16, 8), "rounding_halving_add", {UInt(16, 8), UInt(16, 8)}, Target::WasmSimd128},
 
 #if LLVM_VERSION >= 130
+    // With some work, some of these could possibly be adapted to work under earlier versions of LLVM.
     {"widening_mul_i8x16", Int(16, 16), "widening_mul", {Int(8, 16), Int(8, 16)}, Target::WasmSimd128},
     {"widening_mul_i16x8", Int(32, 8), "widening_mul", {Int(16, 8), Int(16, 8)}, Target::WasmSimd128},
     {"widening_mul_i32x4", Int(64, 4), "widening_mul", {Int(32, 4), Int(32, 4)}, Target::WasmSimd128},
     {"widening_mul_u8x16", UInt(16, 16), "widening_mul", {UInt(8, 16), UInt(8, 16)}, Target::WasmSimd128},
     {"widening_mul_u16x8", UInt(32, 8), "widening_mul", {UInt(16, 8), UInt(16, 8)}, Target::WasmSimd128},
     {"widening_mul_u32x4", UInt(64, 4), "widening_mul", {UInt(32, 4), UInt(32, 4)}, Target::WasmSimd128},
+
+    {"llvm.wasm.extadd.pairwise.signed.v8i16", Int(16, 8), "pairwise_widening_add", {Int(8, 16)}, Target::WasmSimd128},
+    {"llvm.wasm.extadd.pairwise.unsigned.v8i16", UInt(16, 8), "pairwise_widening_add", {UInt(8, 16)}, Target::WasmSimd128},
+    {"llvm.wasm.extadd.pairwise.signed.v4i32", Int(32, 4), "pairwise_widening_add", {Int(16, 8)}, Target::WasmSimd128},
+    {"llvm.wasm.extadd.pairwise.unsigned.v4i32", UInt(32, 4), "pairwise_widening_add", {UInt(16, 8)}, Target::WasmSimd128},
+    // There isn't an op for u8x16 -> i16x8, but we can just use the u8x16 -> u16x8 op and treat the result as i16x8,
+    // since the result will be the same for our purposes here
+    {"llvm.wasm.extadd.pairwise.unsigned.v8i16", Int(16, 8), "pairwise_widening_add", {UInt(8, 16)}, Target::WasmSimd128},
+    {"llvm.wasm.extadd.pairwise.unsigned.v4i32", Int(32, 4), "pairwise_widening_add", {UInt(16, 8)}, Target::WasmSimd128},
+
+    {"llvm.wasm.dot", Int(32, 4), "dot_product", {Int(16, 8), Int(16, 8)}, Target::WasmSimd128},
 #endif
 
     // TODO: LLVM should support this directly, but doesn't yet.
@@ -95,7 +115,7 @@ void CodeGen_WebAssembly::init_module() {
         }
 
         Type ret_type = i.ret_type;
-        std::vector<Type> arg_types;
+        vector<Type> arg_types;
         arg_types.reserve(max_intrinsic_args);
         for (halide_type_t i : i.arg_types) {
             if (i.bits == 0) {
@@ -110,6 +130,81 @@ void CodeGen_WebAssembly::init_module() {
     }
 }
 
+void CodeGen_WebAssembly::codegen_vector_reduce(const VectorReduce *op, const Expr &init) {  // Try an intrinsic for matching reductions; else defer to CodeGen_Posix.
+#if LLVM_VERSION >= 130
+    struct Pattern {  // One table row: a reduction shape to match and the intrinsic name to call for it.
+        VectorReduce::Operator reduce_op;  // reduction operator this row applies to
+        int factor;  // reduction factor the row handles directly (input lanes per output lane)
+        Expr pattern;  // matched against op->value with expr_match
+        const char *intrin;  // name handed to call_overloaded_intrin
+        Target::Feature required_feature;  // row is skipped unless the target has this feature
+    };
+    // clang-format off
+    static const Pattern patterns[] = {
+        {VectorReduce::Add, 2, i16(wild_i8x_), "pairwise_widening_add", Target::WasmSimd128},
+        {VectorReduce::Add, 2, u16(wild_u8x_), "pairwise_widening_add", Target::WasmSimd128},
+        {VectorReduce::Add, 2, i16(wild_u8x_), "pairwise_widening_add", Target::WasmSimd128},
+
+        {VectorReduce::Add, 2, i32(wild_i16x_), "pairwise_widening_add", Target::WasmSimd128},
+        {VectorReduce::Add, 2, u32(wild_u16x_), "pairwise_widening_add", Target::WasmSimd128},
+        {VectorReduce::Add, 2, i32(wild_u16x_), "pairwise_widening_add", Target::WasmSimd128},
+
+        {VectorReduce::Add, 2, i32(widening_mul(wild_i16x_, wild_i16x_)), "dot_product", Target::WasmSimd128},
+    };
+    // clang-format on
+
+    // binop folds the intrinsic result into `init` below. Only Add is handled so far; more ops are planned, hence the switch.
+    using ValuePtr = llvm::Value *;
+    std::function<ValuePtr(ValuePtr, ValuePtr)> binop = nullptr;
+    switch (op->op) {
+    case VectorReduce::Add:
+        binop = [this](ValuePtr x, ValuePtr y) -> ValuePtr { return this->builder->CreateAdd(x, y); };
+        break;
+    default:
+        break;
+    }
+
+    const int factor = op->value.type().lanes() / op->type.lanes();  // input lanes consumed per output lane
+    vector<Expr> matches;
+    for (const Pattern &p : patterns) {
+        if (op->op != p.reduce_op || (factor % p.factor) != 0) {
+            continue;
+        }
+        if (!target.has_feature(p.required_feature)) {
+            continue;
+        }
+        if (expr_match(p.pattern, op->value, matches)) {
+            if (factor != p.factor) {  // larger multiple: reduce by p.factor first, then reduce the rest, and recurse
+                Expr equiv = VectorReduce::make(op->op, op->value, op->value.type().lanes() / p.factor);
+                equiv = VectorReduce::make(op->op, equiv, op->type.lanes());
+                codegen_vector_reduce(equiv.as<VectorReduce>(), init);
+                return;
+            }
+
+            if (const Shuffle *s = matches[0].as<Shuffle>()) {
+                if (s->is_broadcast() && matches.size() == 2) {
+                    // Move a broadcast first operand into the second position. NOTE(review): the original
+                    // rationale cited udot/sdot, which are not wasm ops; confirm this is needed for dot_product.
+                    std::swap(matches[0], matches[1]);
+                }
+            }
+            value = call_overloaded_intrin(op->type, p.intrin, matches);
+            if (value) {  // a null value means no intrinsic was produced; the loop then tries the next row
+                if (init.defined()) {
+                    internal_assert(binop != nullptr) << "unsupported op";
+                    ValuePtr x = value;
+                    ValuePtr y = codegen(init);
+                    value = binop(x, y);
+                }
+                return;
+            }
+        }
+    }
+#endif  // LLVM_VERSION >= 130
+
+    CodeGen_Posix::codegen_vector_reduce(op, init);
+}
+
 string CodeGen_WebAssembly::mcpu() const {
     return "";
 }

diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp
@@ -1737,41 +1737,64 @@ class SimdOpCheck : public SimdOpCheckTest {
                 check("i64x2.mul", 2 * w, i64_1 * i64_2);
 
                 // Integer dot product (16 -> 32)
-                // TODO(https://github.com/halide/Halide/issues/5130): NOT BEING GENERATED AT TRUNK
-                // {
-                //     RDom r(0, 4);
-                //     check("i32x4.dot_i16x8_s", 2 * w, sum(i32(in_i16(x * 4 + r)) * in_i16(x * 4 + r + 32)));
-                // }
+                for (int f : {2, 4, 8}) {
+                    RDom r(0, f);
+                    for (int v : {1, 2, 4}) {
+                        check("i32x4.dot_i16x8_s", w * v, sum(i32(in_i16(f * x + r)) * in_i16(f * x + r + 32)));
+                    }
+                }
 
                 // Integer negation
                 check("i8x16.neg", 16 * w, -i8_1);
                 check("i16x8.neg", 8 * w, -i16_1);
                 check("i32x4.neg", 4 * w, -i32_1);
                 check("i64x2.neg", 2 * w, -i64_1);
 
-                // Extended (widening) integer multiplication
-                check("i16x8.extmul_low_i8x16_s", 8 * w, i16(i8_1) * i8_2);
-                check("i32x4.extmul_low_i16x8_s", 4 * w, i32(i16_1) * i16_2);
-                check("i64x2.extmul_low_i32x4_s", 2 * w, i64(i32_1) * i32_2);
-                check("i16x8.extmul_low_i8x16_u", 8 * w, u16(u8_1) * u8_2);
-                check("i32x4.extmul_low_i16x8_u", 4 * w, u32(u16_1) * u16_2);
-                check("i64x2.extmul_low_i32x4_u", 2 * w, u64(u32_1) * u32_2);
-                if (w > 1) {
-                    // Need a register wider than 128 bits for us to generate these
-                    check("i16x8.extmul_high_i8x16_s", 8 * w, i16(i8_1) * i8_2);
-                    check("i32x4.extmul_high_i16x8_s", 4 * w, i32(i16_1) * i16_2);
-                    check("i64x2.extmul_high_i32x4_s", 2 * w, i64(i32_1) * i32_2);
-                    check("i16x8.extmul_high_i8x16_u", 8 * w, u16(u8_1) * u8_2);
-                    check("i32x4.extmul_high_i16x8_u", 4 * w, u32(u16_1) * u16_2);
-                    check("i64x2.extmul_high_i32x4_u", 2 * w, u64(u32_1) * u32_2);
-                }
+                if (Halide::Internal::get_llvm_version() >= 130) {
+                    // At present, we only attempt to generate these for LLVM >= 13.
+
+                    // Extended (widening) integer multiplication
+                    check("i16x8.extmul_low_i8x16_s", 8 * w, i16(i8_1) * i8_2);
+                    check("i32x4.extmul_low_i16x8_s", 4 * w, i32(i16_1) * i16_2);
+                    check("i64x2.extmul_low_i32x4_s", 2 * w, i64(i32_1) * i32_2);
+                    check("i16x8.extmul_low_i8x16_u", 8 * w, u16(u8_1) * u8_2);
+                    check("i32x4.extmul_low_i16x8_u", 4 * w, u32(u16_1) * u16_2);
+                    check("i64x2.extmul_low_i32x4_u", 2 * w, u64(u32_1) * u32_2);
+                    if (w > 1) {
+                        // Need a register wider than 128 bits for us to generate these
+                        check("i16x8.extmul_high_i8x16_s", 8 * w, i16(i8_1) * i8_2);
+                        check("i32x4.extmul_high_i16x8_s", 4 * w, i32(i16_1) * i16_2);
+                        check("i64x2.extmul_high_i32x4_s", 2 * w, i64(i32_1) * i32_2);
+                        check("i16x8.extmul_high_i8x16_u", 8 * w, u16(u8_1) * u8_2);
+                        check("i32x4.extmul_high_i16x8_u", 4 * w, u32(u16_1) * u16_2);
+                        check("i64x2.extmul_high_i32x4_u", 2 * w, u64(u32_1) * u32_2);
+                    }
 
-                // Extended pairwise integer addition
-                // TODO(https://github.com/halide/Halide/issues/5130): NOT BEING GENERATED AT TRUNK
-                // check("i16x8.extadd_pairwise_i8x16_s", ???, ???);
-                // check("i16x8.extadd_pairwise_i8x16_u", ???, ???);
-                // check("i32x4.extadd_pairwise_i16x8_s", ???, ???);
-                // check("i32x4.extadd_pairwise_i16x8_u", ???, ???);
+                    // Extended pairwise integer addition
+                    for (int f : {2, 4}) {
+                        RDom r(0, f);
+
+                        // A summation reduction that starts at something
+                        // non-trivial, to avoid llvm simplifying accumulating
+                        // widening summations into just widening summations.
+                        auto sum_ = [&](Expr e) {
+                            Func f;
+                            f(x) = cast(e.type(), 123);
+                            f(x) += e;
+                            return f(x);
+                        };
+
+                        check("i16x8.extadd_pairwise_i8x16_s", 8 * w, sum_(i16(in_i8(f * x + r))));
+                        check("i16x8.extadd_pairwise_i8x16_u", 8 * w, sum_(u16(in_u8(f * x + r))));
+                        // The u8->i16 op uses the unsigned variant
+                        check("i16x8.extadd_pairwise_i8x16_u", 8 * w, sum_(i16(in_u8(f * x + r))));
+
+                        check("i32x4.extadd_pairwise_i16x8_s", 8 * w, sum_(i32(in_i16(f * x + r))));
+                        check("i32x4.extadd_pairwise_i16x8_u", 8 * w, sum_(u32(in_u16(f * x + r))));
+                        // The u16->i32 op uses the unsigned variant
+                        check("i32x4.extadd_pairwise_i16x8_u", 8 * w, sum_(i32(in_u16(f * x + r))));
+                    }
+                }
 
                 // Saturating integer addition
                 std::string sat = Halide::Internal::get_llvm_version() >= 130 ? "sat" : "saturate";