From 7bbe2fdb24c86a2b21aa6a44dc3175cb1c4d10e7 Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Tue, 30 Mar 2021 16:00:23 -0700
Subject: [PATCH] Add wasm support for int32->f64 and f32->f64 simd ops (#5863)

* Add wasm support for int32->f64 and f32->f64 simd ops

At top-of-tree LLVM, the wasm backend never seems to emit the vector version of these ops; pattern-match to target them specifically.
---
 src/CodeGen_WebAssembly.cpp        |  7 +++++++
 src/runtime/wasm_math.ll           | 33 ++++++++++++++++++++++++++++++
 test/correctness/simd_op_check.cpp |  8 +++-----
 3 files changed, 43 insertions(+), 5 deletions(-)
diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp
index 61eb4aa28888..e13431830f6c 100644
--- a/src/CodeGen_WebAssembly.cpp
+++ b/src/CodeGen_WebAssembly.cpp
@@ -94,6 +94,10 @@ const WasmIntrinsic intrinsic_defs[] = {
     {"llvm.wasm.extadd.pairwise.unsigned.v8i16", Int(16, 8), "pairwise_widening_add", {UInt(8, 16)}, Target::WasmSimd128},
     {"llvm.wasm.extadd.pairwise.unsigned.v4i32", Int(32, 4), "pairwise_widening_add", {UInt(16, 8)}, Target::WasmSimd128},
 
+    {"i32_to_double_s", Float(64, 4), "int_to_double", {Int(32, 4)}, Target::WasmSimd128},
+    {"i32_to_double_u", Float(64, 4), "int_to_double", {UInt(32, 4)}, Target::WasmSimd128},
+    {"float_to_double", Float(64, 4), "float_to_double", {Float(32, 4)}, Target::WasmSimd128},
+
     // Basically like ARM's SQRDMULH
     {"llvm.wasm.q15mulr.sat.signed", Int(16, 8), "q15mulr_sat_s", {Int(16, 8), Int(16, 8)}, Target::WasmSimd128},
 
@@ -147,6 +151,9 @@ void CodeGen_WebAssembly::visit(const Cast *op) {
         {"saturating_narrow", u8_sat(wild_i16x_), Target::WasmSimd128},
         {"saturating_narrow", i16_sat(wild_i32x_), Target::WasmSimd128},
         {"saturating_narrow", u16_sat(wild_i32x_), Target::WasmSimd128},
+        {"int_to_double", f64(wild_i32x_), Target::WasmSimd128},
+        {"int_to_double", f64(wild_u32x_), Target::WasmSimd128},
+        {"float_to_double", f64(wild_f32x_), Target::WasmSimd128},
     };
     // clang-format on
 
diff --git a/src/runtime/wasm_math.ll b/src/runtime/wasm_math.ll
index 6ea43e90fed2..1748a841da7c 100644
--- a/src/runtime/wasm_math.ll
+++ b/src/runtime/wasm_math.ll
@@ -138,3 +138,36 @@ define weak_odr <8 x i16> @saturating_narrow_i32x8_to_u16x8(<8 x i32> %x) nounwi
   %3 = tail call <8 x i16> @llvm.wasm.narrow.unsigned.v8i16.v4i32(<4 x i32> %1, <4 x i32> %2)
   ret <8 x i16> %3
 }
+
+; Integer to double-precision floating point
+
+declare <2 x double> @llvm.wasm.convert.low.signed(<4 x i32>)
+declare <2 x double> @llvm.wasm.convert.low.unsigned(<4 x i32>)
+
+define weak_odr <4 x double> @i32_to_double_s(<4 x i32> %x) nounwind alwaysinline { ; signed i32x4 -> f64x4, built from two <2 x double> halves
+  %lo = tail call <2 x double> @llvm.wasm.convert.low.signed(<4 x i32> %x) ; each call yields only two doubles, so lanes 0-1 go first
+  %x_hi = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> ; move lanes 2-3 into positions 0-1; the upper lanes are don't-care
+  %hi = tail call <2 x double> @llvm.wasm.convert.low.signed(<4 x i32> %x_hi) ; same intrinsic applied to the shuffled-down upper half
+  %result = shufflevector <2 x double> %lo, <2 x double> %hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; concatenate lo then hi, preserving lane order
+  ret <4 x double> %result
+}
+
+define weak_odr <4 x double> @i32_to_double_u(<4 x i32> %x) nounwind alwaysinline { ; unsigned i32x4 -> f64x4, built from two <2 x double> halves
+  %lo = tail call <2 x double> @llvm.wasm.convert.low.unsigned(<4 x i32> %x) ; each call yields only two doubles, so lanes 0-1 go first
+  %x_hi = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> ; move lanes 2-3 into positions 0-1; the upper lanes are don't-care
+  %hi = tail call <2 x double> @llvm.wasm.convert.low.unsigned(<4 x i32> %x_hi) ; same intrinsic applied to the shuffled-down upper half
+  %result = shufflevector <2 x double> %lo, <2 x double> %hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; concatenate lo then hi, preserving lane order
+  ret <4 x double> %result
+}
+
+; Single-precision to double-precision floating point
+
+declare <2 x double> @llvm.wasm.promote.low(<4 x float>)
+
+define weak_odr <4 x double> @float_to_double(<4 x float> %x) nounwind alwaysinline { ; f32x4 -> f64x4, built from two <2 x double> halves
+  %lo = tail call <2 x double> @llvm.wasm.promote.low(<4 x float> %x) ; each call yields only two doubles, so lanes 0-1 go first
+  %x_hi = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> ; move lanes 2-3 into positions 0-1; the upper lanes are don't-care
+  %hi = tail call <2 x double> @llvm.wasm.promote.low(<4 x float> %x_hi) ; same intrinsic applied to the shuffled-down upper half
+  %result = shufflevector <2 x double> %lo, <2 x double> %hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; concatenate lo then hi, preserving lane order
+  ret <4 x double> %result
+}
diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp
index 2e910f66fb8f..db4f24f407d3 100644
--- a/test/correctness/simd_op_check.cpp
+++ b/test/correctness/simd_op_check.cpp
@@ -2137,9 +2137,8 @@ class SimdOpCheck : public SimdOpCheckTest {
                 check("f32x4.convert_i32x4_u", 8 * w, cast<float>(u32_1));
 
                 // Integer to double-precision floating point
-                // TODO(https://github.com/halide/Halide/issues/5130): NOT BEING GENERATED AT TRUNK
-                // check("f64x2.convert_low_i32x4_s", 4 * w, cast<double>(i32_1));
-                // check("f64x2.convert_low_i32x4_u", 4 * w, cast<double>(u32_1));
+                check("f64x2.convert_low_i32x4_s", 2 * w, cast<double>(i32_1));
+                check("f64x2.convert_low_i32x4_u", 2 * w, cast<double>(u32_1));
 
                 // Single-precision floating point to integer with saturation
                 check("i32x4.trunc_sat_f32x4_s", 4 * w, cast<int32_t>(f32_1));
@@ -2155,8 +2154,7 @@ class SimdOpCheck : public SimdOpCheckTest {
                 // check("f32x4.demote_f64x2_zero", 4 * w, ???);
 
                 // Single-precision floating point to double-precision
-                // TODO(https://github.com/halide/Halide/issues/5130): NOT BEING GENERATED AT TRUNK
-                // check("f64x2.promote_low_f32x4", 4 * w, ???);
+                check("f64x2.promote_low_f32x4", 2 * w, cast<double>(f32_1));
 
                 // Integer to integer narrowing
                 check("i8x16.narrow_i16x8_s", 16 * w, i8_sat(i16_1));