From fa5f2a055c77ceeea4b734e1f39174fbf9da844b Mon Sep 17 00:00:00 2001
From: Andrew Adams <andrew.b.adams@gmail.com>
Date: Fri, 13 Sep 2024 12:41:44 -0700
Subject: [PATCH 1/2] Reschedule the matrix multiply performance app

---
 test/performance/matrix_multiplication.cpp | 74 ++++++++++++++++------
 1 file changed, 54 insertions(+), 20 deletions(-)
diff --git a/test/performance/matrix_multiplication.cpp b/test/performance/matrix_multiplication.cpp
index 873c2f06cb7a..85497e89dd40 100644
--- a/test/performance/matrix_multiplication.cpp
+++ b/test/performance/matrix_multiplication.cpp
@@ -30,44 +30,78 @@ int main(int argc, char **argv) {
     ImageParam A(type_of<float>(), 2);
     ImageParam B(type_of<float>(), 2);
 
-    Var x("x"), xi("xi"), xo("xo"), y("y"), yo("yo"), yi("yi"), yii("yii"), xii("xii");
-    Func matrix_mul("matrix_mul");
-
+    Var x("x"), y("y");
     RDom k(0, matrix_size);
-    RVar ki;
+
+    Func matrix_mul("matrix_mul");
 
     matrix_mul(x, y) += A(k, y) * B(x, k);
 
     Func out;
     out(x, y) = matrix_mul(x, y);
 
-    Var xy;
+    // Now the schedule. Single-threaded, it hits 155 GFlops on Skylake-X
+    // i9-9960x with AVX-512 (80% of peak)), and 87 GFlops with AVX2 (90% of
+    // peak).
+    //
+    // Using 16 threads (and no hyperthreading), hits 2080 GFlops (67% of peak)
+    // and 1310 GFlops (85% of peak) respectively.
 
-    out.tile(x, y, xi, yi, 24, 32)
-        .fuse(x, y, xy)
-        .parallel(xy)
-        .split(yi, yi, yii, 4)
-        .vectorize(xi, 8)
+    const int vec = target.natural_vector_size<float>();
+
+    // Size the inner loop tiles to fit into the number of registers available
+    // on the target, using either 12 accumulator registers or 24.
+    const int inner_tile_x = 3 * vec;
+    const int inner_tile_y = (target.has_feature(Target::AVX512) || target.arch != Target::X86) ? 8 : 4;
+
+    // The shape of the outer tiling
+    const int tile_y = matrix_size / 4;
+    const int tile_k = matrix_size / 16;
+
+    Var xy("xy"), xi("xi"), yi("yi"), yii("yii");
+
+    out.tile(x, y, xi, yi, inner_tile_x, tile_y)
+        .split(yi, yi, yii, inner_tile_y)
+        .vectorize(xi, vec)
         .unroll(xi)
-        .unroll(yii);
+        .unroll(yii)
+        .fuse(x, y, xy)
+        .parallel(xy);
+
+    RVar ko("ko"), ki("ki");
+    Var z("z");
+    matrix_mul.update().split(k, ko, ki, tile_k);
+
+    // Factor the reduction so that we can do outer blocking over the reduction
+    // dimension.
+    Func intm = matrix_mul.update().rfactor(ko, z);
 
-    matrix_mul.compute_at(out, yi)
-        .vectorize(x, 8)
+    intm.compute_at(matrix_mul, y)
+        .vectorize(x, vec)
+        .unroll(x)
         .unroll(y);
 
-    matrix_mul.update(0)
-        .reorder(x, y, k)
-        .vectorize(x, 8)
+    intm.update(0)
+        .reorder(x, y, ki)
+        .vectorize(x, vec)
         .unroll(x)
-        .unroll(y)
-        .unroll(k, 2);
+        .unroll(y);
+
+    matrix_mul.compute_at(out, xy)
+        .vectorize(x, vec)
+        .unroll(x);
+
+    matrix_mul.update()
+        .split(y, y, yi, inner_tile_y)
+        .reorder(x, yi, y, ko)
+        .vectorize(x, vec)
+        .unroll(x)
+        .unroll(yi);
 
     out
         .bound(x, 0, matrix_size)
         .bound(y, 0, matrix_size);
 
-    out.compile_jit();
-
     Buffer<float> mat_A(matrix_size, matrix_size);
     Buffer<float> mat_B(matrix_size, matrix_size);
     Buffer<float> output(matrix_size, matrix_size);

From 1c0954b87c18f2869c90eed83edd67b5c396c66d Mon Sep 17 00:00:00 2001
From: Andrew Adams <andrew.b.adams@gmail.com>
Date: Fri, 13 Sep 2024 12:43:45 -0700
Subject: [PATCH 2/2] Remove stray paren

---
 test/performance/matrix_multiplication.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/performance/matrix_multiplication.cpp b/test/performance/matrix_multiplication.cpp
index 85497e89dd40..8b37b001f7df 100644
--- a/test/performance/matrix_multiplication.cpp
+++ b/test/performance/matrix_multiplication.cpp
@@ -41,7 +41,7 @@ int main(int argc, char **argv) {
     out(x, y) = matrix_mul(x, y);
 
     // Now the schedule. Single-threaded, it hits 155 GFlops on Skylake-X
-    // i9-9960x with AVX-512 (80% of peak)), and 87 GFlops with AVX2 (90% of
+    // i9-9960x with AVX-512 (80% of peak), and 87 GFlops with AVX2 (90% of
     // peak).
     //
     // Using 16 threads (and no hyperthreading), hits 2080 GFlops (67% of peak)