Merge pull request #92 from ranocha/hr/LoopVectorization

LoopVectorization
ranocha · May 25, 2021 · 93a88f8 · 93a88f8
2 parents 7b86cfe + ec12426
commit 93a88f8
Show file tree

Hide file tree

Showing 7 changed files with 61 additions and 76 deletions.
diff --git a/.github/workflows/Documenter.yml b/.github/workflows/Documenter.yml
@@ -32,6 +32,7 @@ jobs:
       - uses: julia-actions/setup-julia@latest
         with:
           version: '1.6'
+      - uses: julia-actions/julia-buildpkg@v1
       - name: Install dependencies
         run: julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
       - name: Build and deploy

diff --git a/Project.toml b/Project.toml
@@ -9,6 +9,7 @@ DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e"
 DiffEqCallbacks = "459566f4-90b8-5000-8ac3-15dfb0a30def"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 PolynomialBases = "c74db56a-226d-5e98-8bb0-a6049094aeea"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
@@ -22,6 +23,7 @@ ArgCheck = "1, 2.0"
 DiffEqBase = "6"
 DiffEqCallbacks = "2.6"
 FFTW = "1"
+LoopVectorization = "0.12.22"
 PolynomialBases = "0.4.5"
 Reexport = "0.2, 1.0"
 Requires = "0.5.2, 1"

diff --git a/src/SBP_operators.jl b/src/SBP_operators.jl
@@ -212,25 +212,21 @@ end
         ex = :( $ex + upper_coef[$j]*u[i+$j] )
     end
 
-    quote
-        @inbounds for i in (left_boundary_width+1):(length(dest)-right_boundary_width)
-            dest[i] = β*dest[i] + α*$ex
-        end
-    end
-end
-
-function convolve_interior_coefficients!(dest::AbstractVector{T}, lower_coef::SVector{LowerOffset}, central_coef, upper_coef::SVector{UpperOffset}, u::AbstractVector, α, β, left_boundary_width, right_boundary_width, parallel::Val{:threads}) where {T, LowerOffset, UpperOffset}
-    Threads.@threads for i in (left_boundary_width+1):(length(dest)-right_boundary_width) @inbounds begin
-        tmp = zero(T)
-        for j in Base.OneTo(LowerOffset)
-            tmp += lower_coef[j]*u[i-j]
+    if parallel <: Val{:threads}
+        quote
+            Base.@_inline_meta
+            @tturbo for i in (left_boundary_width+1):(length(dest)-right_boundary_width)
+                dest[i] = β*dest[i] + α*$ex
+            end
         end
-        tmp += central_coef*u[i]
-        for j in Base.OneTo(UpperOffset)
-            tmp += upper_coef[j]*u[i+j]
+    else
+        quote
+            Base.@_inline_meta
+            @turbo for i in (left_boundary_width+1):(length(dest)-right_boundary_width)
+                dest[i] = β*dest[i] + α*$ex
+            end
         end
-        dest[i] = β*dest[i] + α*tmp
-    end end
+    end
 end
 
 @generated function convolve_interior_coefficients!(dest::AbstractVector, lower_coef::SVector{LowerOffset}, central_coef, upper_coef::SVector{UpperOffset}, u::AbstractVector, α, left_boundary_width, right_boundary_width, parallel) where {LowerOffset, UpperOffset}
@@ -247,25 +243,21 @@ end
         ex = :( $ex + upper_coef[$j]*u[i+$j] )
     end
 
-    quote
-        @inbounds for i in (left_boundary_width+1):(length(dest)-right_boundary_width)
-            dest[i] = α*$ex
-        end
-    end
-end
-
-function convolve_interior_coefficients!(dest::AbstractVector{T}, lower_coef::SVector{LowerOffset}, central_coef, upper_coef::SVector{UpperOffset}, u::AbstractVector, α, left_boundary_width, right_boundary_width, parallel::Val{:threads}) where {T, LowerOffset, UpperOffset}
-    Threads.@threads for i in (left_boundary_width+1):(length(dest)-right_boundary_width) @inbounds begin
-        tmp = zero(T)
-        for j in Base.OneTo(LowerOffset)
-            tmp += lower_coef[j]*u[i-j]
+    if parallel <: Val{:threads}
+        quote
+            Base.@_inline_meta
+            @tturbo for i in (left_boundary_width+1):(length(dest)-right_boundary_width)
+                dest[i] = α*$ex
+            end
         end
-        tmp += central_coef*u[i]
-        for j in Base.OneTo(UpperOffset)
-            tmp += upper_coef[j]*u[i+j]
+    else
+        quote
+            Base.@_inline_meta
+            @turbo for i in (left_boundary_width+1):(length(dest)-right_boundary_width)
+                dest[i] = α*$ex
+            end
         end
-        dest[i] = α*tmp
-    end end
+    end
 end
 
 
@@ -380,7 +372,7 @@ Thus, the coefficients `α, β` have the same meaning as in [`mul!`](@ref).
 end
 
 @inline function mul_transpose_derivative_left!(u::AbstractVector, D::DerivativeOperator, der_order::Val{0}, α=true, β=false)
-    u[begin] = α * u[begin] + β * u[begin]
+    @inbounds u[begin] = α * u[begin] + β * u[begin]
     return nothing
 end
 
@@ -401,8 +393,7 @@ Thus, the coefficients `α, β` have the same meaning as in [`mul!`](@ref).
 end
 
 @inline function mul_transpose_derivative_right!(u::AbstractVector, D::DerivativeOperator, der_order::Val{0}, α=true, β=false)
-    factor = α
-    u[end] += factor * u[end]
+    @inbounds u[end] = α * u[end] + β * u[end]
     return nothing
 end
 

diff --git a/src/SummationByPartsOperators.jl b/src/SummationByPartsOperators.jl
@@ -40,6 +40,7 @@ using SparseArrays
 
 using ArgCheck: @argcheck
 using FFTW
+using LoopVectorization: @turbo, @tturbo
 using Reexport: @reexport
 using Requires
 using StaticArrays

diff --git a/src/periodic_operators.jl b/src/periodic_operators.jl
@@ -167,26 +167,21 @@ end
         ex = :( $ex + upper_coef[$j]*u[i+$j] )
     end
 
-    quote
-        @inbounds for i in $(LowerOffset+1):(length(dest)-$UpperOffset)
-            dest[i] = β*dest[i] + α*$ex
-        end
-    end
-end
-
-function convolve_interior_coefficients!(dest::AbstractVector{T}, lower_coef::SVector{LowerOffset}, central_coef, upper_coef::SVector{UpperOffset},
-                                         u::AbstractVector, α, β, parallel::Val{:threads}) where {T, LowerOffset, UpperOffset}
-    Threads.@threads for i in (LowerOffset+1):(length(dest)-UpperOffset) @inbounds begin
-        tmp = zero(T)
-        for j in Base.OneTo(LowerOffset)
-            tmp += lower_coef[j]*u[i-j]
+    if parallel <: Val{:threads}
+        quote
+            Base.@_inline_meta
+            @tturbo for i in $(LowerOffset+1):(length(dest)-$UpperOffset)
+                dest[i] = β*dest[i] + α*$ex
+            end
         end
-        tmp += central_coef*u[i]
-        for j in Base.OneTo(UpperOffset)
-            tmp += upper_coef[j]*u[i+j]
+    else
+        quote
+            Base.@_inline_meta
+            @turbo for i in $(LowerOffset+1):(length(dest)-$UpperOffset)
+                dest[i] = β*dest[i] + α*$ex
+            end
         end
-        dest[i] = β*dest[i] + α*tmp
-    end end
+    end
 end
 
 @generated function convolve_interior_coefficients!(dest::AbstractVector, lower_coef::SVector{LowerOffset}, central_coef, upper_coef::SVector{UpperOffset},
@@ -204,26 +199,21 @@ end
         ex = :( $ex + upper_coef[$j]*u[i+$j] )
     end
 
-    quote
-        @inbounds for i in $(LowerOffset+1):(length(dest)-$UpperOffset)
-            dest[i] = α*$ex
-        end
-    end
-end
-
-function convolve_interior_coefficients!(dest::AbstractVector{T}, lower_coef::SVector{LowerOffset}, central_coef, upper_coef::SVector{UpperOffset},
-                                         u::AbstractVector, α, parallel::Val{:threads}) where {T, LowerOffset, UpperOffset}
-    Threads.@threads for i in (LowerOffset+1):(length(dest)-UpperOffset) @inbounds begin
-        tmp = zero(T)
-        for j in Base.OneTo(LowerOffset)
-            tmp += lower_coef[j]*u[i-j]
+    if parallel <: Val{:threads}
+        quote
+            Base.@_inline_meta
+            @tturbo for i in $(LowerOffset+1):(length(dest)-$UpperOffset)
+                dest[i] = α*$ex
+            end
         end
-        tmp += central_coef*u[i]
-        for j in Base.OneTo(UpperOffset)
-            tmp += upper_coef[j]*u[i+j]
+    else
+        quote
+            Base.@_inline_meta
+            @turbo for i in $(LowerOffset+1):(length(dest)-$UpperOffset)
+                dest[i] = α*$ex
+            end
         end
-        dest[i] = α*tmp
-    end end
+    end
 end
 
 

diff --git a/test/SBP_operators_test.jl b/test/SBP_operators_test.jl
@@ -559,7 +559,7 @@ for source in D_test_list, T in (Float32,Float64)
         @test all(i->isapprox(res[i], 6*x0[i], rtol=5_000_000*eps(T)), eachindex(res))
         # only interior
         mul!(res, D, x3)
-        @test all(i->isapprox(res[i], 6*x0[i], rtol=100_000*eps(T)), inner_indices)
+        @test all(i->isapprox(res[i], 6*x0[i], rtol=150_000*eps(T)), inner_indices)
         mul!(res, D, x4)
         @test all(i->isapprox(res[i], 24*x1[i], rtol=50_000*eps(T)), inner_indices)
         mul!(res, D, x5)
@@ -684,7 +684,7 @@ for source in D_test_list, T in (Float32,Float64)
         mul!(res, D, x2)
         @test all(i->abs(res[i]) < 50_000_000*eps(T), inner_indices)
         mul!(res, D, x3)
-        @test all(i->abs(res[i]) < 100_000_000*eps(T), inner_indices)
+        @test all(i->abs(res[i]) < 120_000_000*eps(T), inner_indices)
         mul!(res, D, x4)
         @test all(i->isapprox(res[i], 24*x0[i], rtol=50_000_000*eps(T)), inner_indices)
         # boundary: first derivative

diff --git a/test/periodic_operators_test.jl b/test/periodic_operators_test.jl
@@ -305,7 +305,7 @@ let T = Float64
     mul!(res, D, x1); @test norm(res[accuracy_order:end-accuracy_order], Inf) < 5000*eps(T)
     mul!(res, D, x2); @test norm((res - 2 .* x0)[accuracy_order:end-accuracy_order], Inf) < 10020*eps(T)
     mul!(res, D, x3); @test norm((res - 6 .* x1)[accuracy_order:end-accuracy_order], Inf) < 30000*eps(T)
-    mul!(res, D, x4); @test norm((res - 12 .* x2)[accuracy_order:end-accuracy_order], Inf) < 28000*eps(T)
+    mul!(res, D, x4); @test norm((res - 12 .* x2)[accuracy_order:end-accuracy_order], Inf) < 40000*eps(T)
     mul!(res, D, x5); @test norm((res - 20 .* x3)[accuracy_order:end-accuracy_order], Inf) < 100000*eps(T)
     mul!(res, D, x6); @test norm((res - 30 .* x4)[accuracy_order:end-accuracy_order], Inf) < 220000*eps(T)
     mul!(res, D, x7); @test norm((res - 42 .* x5)[accuracy_order:end-accuracy_order], Inf) > 220000*eps(T)