Skip to content

Commit

Permalink
Merge pull request #92 from ranocha/hr/LoopVectorization
Browse files Browse the repository at this point in the history
LoopVectorization
  • Loading branch information
ranocha committed May 25, 2021
2 parents 7b86cfe + ec12426 commit 93a88f8
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 76 deletions.
1 change: 1 addition & 0 deletions .github/workflows/Documenter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ jobs:
- uses: julia-actions/setup-julia@latest
with:
version: '1.6'
- uses: julia-actions/julia-buildpkg@v1
- name: Install dependencies
run: julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
- name: Build and deploy
Expand Down
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e"
DiffEqCallbacks = "459566f4-90b8-5000-8ac3-15dfb0a30def"
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
PolynomialBases = "c74db56a-226d-5e98-8bb0-a6049094aeea"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
Expand All @@ -22,6 +23,7 @@ ArgCheck = "1, 2.0"
DiffEqBase = "6"
DiffEqCallbacks = "2.6"
FFTW = "1"
LoopVectorization = "0.12.22"
PolynomialBases = "0.4.5"
Reexport = "0.2, 1.0"
Requires = "0.5.2, 1"
Expand Down
65 changes: 28 additions & 37 deletions src/SBP_operators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -212,25 +212,21 @@ end
ex = :( $ex + upper_coef[$j]*u[i+$j] )
end

quote
@inbounds for i in (left_boundary_width+1):(length(dest)-right_boundary_width)
dest[i] = β*dest[i] + α*$ex
end
end
end

function convolve_interior_coefficients!(dest::AbstractVector{T}, lower_coef::SVector{LowerOffset}, central_coef, upper_coef::SVector{UpperOffset}, u::AbstractVector, α, β, left_boundary_width, right_boundary_width, parallel::Val{:threads}) where {T, LowerOffset, UpperOffset}
Threads.@threads for i in (left_boundary_width+1):(length(dest)-right_boundary_width) @inbounds begin
tmp = zero(T)
for j in Base.OneTo(LowerOffset)
tmp += lower_coef[j]*u[i-j]
if parallel <: Val{:threads}
quote
Base.@_inline_meta
@tturbo for i in (left_boundary_width+1):(length(dest)-right_boundary_width)
dest[i] = β*dest[i] + α*$ex
end
end
tmp += central_coef*u[i]
for j in Base.OneTo(UpperOffset)
tmp += upper_coef[j]*u[i+j]
else
quote
Base.@_inline_meta
@turbo for i in (left_boundary_width+1):(length(dest)-right_boundary_width)
dest[i] = β*dest[i] + α*$ex
end
end
dest[i] = β*dest[i] + α*tmp
end end
end
end

@generated function convolve_interior_coefficients!(dest::AbstractVector, lower_coef::SVector{LowerOffset}, central_coef, upper_coef::SVector{UpperOffset}, u::AbstractVector, α, left_boundary_width, right_boundary_width, parallel) where {LowerOffset, UpperOffset}
Expand All @@ -247,25 +243,21 @@ end
ex = :( $ex + upper_coef[$j]*u[i+$j] )
end

quote
@inbounds for i in (left_boundary_width+1):(length(dest)-right_boundary_width)
dest[i] = α*$ex
end
end
end

function convolve_interior_coefficients!(dest::AbstractVector{T}, lower_coef::SVector{LowerOffset}, central_coef, upper_coef::SVector{UpperOffset}, u::AbstractVector, α, left_boundary_width, right_boundary_width, parallel::Val{:threads}) where {T, LowerOffset, UpperOffset}
Threads.@threads for i in (left_boundary_width+1):(length(dest)-right_boundary_width) @inbounds begin
tmp = zero(T)
for j in Base.OneTo(LowerOffset)
tmp += lower_coef[j]*u[i-j]
if parallel <: Val{:threads}
quote
Base.@_inline_meta
@tturbo for i in (left_boundary_width+1):(length(dest)-right_boundary_width)
dest[i] = α*$ex
end
end
tmp += central_coef*u[i]
for j in Base.OneTo(UpperOffset)
tmp += upper_coef[j]*u[i+j]
else
quote
Base.@_inline_meta
@turbo for i in (left_boundary_width+1):(length(dest)-right_boundary_width)
dest[i] = α*$ex
end
end
dest[i] = α*tmp
end end
end
end


Expand Down Expand Up @@ -380,7 +372,7 @@ Thus, the coefficients `α, β` have the same meaning as in [`mul!`](@ref).
end

@inline function mul_transpose_derivative_left!(u::AbstractVector, D::DerivativeOperator, der_order::Val{0}, α=true, β=false)
u[begin] = α * u[begin] + β * u[begin]
@inbounds u[begin] = α * u[begin] + β * u[begin]
return nothing
end

Expand All @@ -401,8 +393,7 @@ Thus, the coefficients `α, β` have the same meaning as in [`mul!`](@ref).
end

@inline function mul_transpose_derivative_right!(u::AbstractVector, D::DerivativeOperator, der_order::Val{0}, α=true, β=false)
factor = α
u[end] += factor * u[end]
@inbounds u[end] = α * u[end] + β * u[end]
return nothing
end

Expand Down
1 change: 1 addition & 0 deletions src/SummationByPartsOperators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ using SparseArrays

using ArgCheck: @argcheck
using FFTW
using LoopVectorization: @turbo, @tturbo
using Reexport: @reexport
using Requires
using StaticArrays
Expand Down
62 changes: 26 additions & 36 deletions src/periodic_operators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -167,26 +167,21 @@ end
ex = :( $ex + upper_coef[$j]*u[i+$j] )
end

quote
@inbounds for i in $(LowerOffset+1):(length(dest)-$UpperOffset)
dest[i] = β*dest[i] + α*$ex
end
end
end

function convolve_interior_coefficients!(dest::AbstractVector{T}, lower_coef::SVector{LowerOffset}, central_coef, upper_coef::SVector{UpperOffset},
u::AbstractVector, α, β, parallel::Val{:threads}) where {T, LowerOffset, UpperOffset}
Threads.@threads for i in (LowerOffset+1):(length(dest)-UpperOffset) @inbounds begin
tmp = zero(T)
for j in Base.OneTo(LowerOffset)
tmp += lower_coef[j]*u[i-j]
if parallel <: Val{:threads}
quote
Base.@_inline_meta
@tturbo for i in $(LowerOffset+1):(length(dest)-$UpperOffset)
dest[i] = β*dest[i] + α*$ex
end
end
tmp += central_coef*u[i]
for j in Base.OneTo(UpperOffset)
tmp += upper_coef[j]*u[i+j]
else
quote
Base.@_inline_meta
@turbo for i in $(LowerOffset+1):(length(dest)-$UpperOffset)
dest[i] = β*dest[i] + α*$ex
end
end
dest[i] = β*dest[i] + α*tmp
end end
end
end

@generated function convolve_interior_coefficients!(dest::AbstractVector, lower_coef::SVector{LowerOffset}, central_coef, upper_coef::SVector{UpperOffset},
Expand All @@ -204,26 +199,21 @@ end
ex = :( $ex + upper_coef[$j]*u[i+$j] )
end

quote
@inbounds for i in $(LowerOffset+1):(length(dest)-$UpperOffset)
dest[i] = α*$ex
end
end
end

function convolve_interior_coefficients!(dest::AbstractVector{T}, lower_coef::SVector{LowerOffset}, central_coef, upper_coef::SVector{UpperOffset},
u::AbstractVector, α, parallel::Val{:threads}) where {T, LowerOffset, UpperOffset}
Threads.@threads for i in (LowerOffset+1):(length(dest)-UpperOffset) @inbounds begin
tmp = zero(T)
for j in Base.OneTo(LowerOffset)
tmp += lower_coef[j]*u[i-j]
if parallel <: Val{:threads}
quote
Base.@_inline_meta
@tturbo for i in $(LowerOffset+1):(length(dest)-$UpperOffset)
dest[i] = α*$ex
end
end
tmp += central_coef*u[i]
for j in Base.OneTo(UpperOffset)
tmp += upper_coef[j]*u[i+j]
else
quote
Base.@_inline_meta
@turbo for i in $(LowerOffset+1):(length(dest)-$UpperOffset)
dest[i] = α*$ex
end
end
dest[i] = α*tmp
end end
end
end


Expand Down
4 changes: 2 additions & 2 deletions test/SBP_operators_test.jl
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,7 @@ for source in D_test_list, T in (Float32,Float64)
@test all(i->isapprox(res[i], 6*x0[i], rtol=5_000_000*eps(T)), eachindex(res))
# only interior
mul!(res, D, x3)
@test all(i->isapprox(res[i], 6*x0[i], rtol=100_000*eps(T)), inner_indices)
@test all(i->isapprox(res[i], 6*x0[i], rtol=150_000*eps(T)), inner_indices)
mul!(res, D, x4)
@test all(i->isapprox(res[i], 24*x1[i], rtol=50_000*eps(T)), inner_indices)
mul!(res, D, x5)
Expand Down Expand Up @@ -684,7 +684,7 @@ for source in D_test_list, T in (Float32,Float64)
mul!(res, D, x2)
@test all(i->abs(res[i]) < 50_000_000*eps(T), inner_indices)
mul!(res, D, x3)
@test all(i->abs(res[i]) < 100_000_000*eps(T), inner_indices)
@test all(i->abs(res[i]) < 120_000_000*eps(T), inner_indices)
mul!(res, D, x4)
@test all(i->isapprox(res[i], 24*x0[i], rtol=50_000_000*eps(T)), inner_indices)
# boundary: first derivative
Expand Down
2 changes: 1 addition & 1 deletion test/periodic_operators_test.jl
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ let T = Float64
mul!(res, D, x1); @test norm(res[accuracy_order:end-accuracy_order], Inf) < 5000*eps(T)
mul!(res, D, x2); @test norm((res - 2 .* x0)[accuracy_order:end-accuracy_order], Inf) < 10020*eps(T)
mul!(res, D, x3); @test norm((res - 6 .* x1)[accuracy_order:end-accuracy_order], Inf) < 30000*eps(T)
mul!(res, D, x4); @test norm((res - 12 .* x2)[accuracy_order:end-accuracy_order], Inf) < 28000*eps(T)
mul!(res, D, x4); @test norm((res - 12 .* x2)[accuracy_order:end-accuracy_order], Inf) < 40000*eps(T)
mul!(res, D, x5); @test norm((res - 20 .* x3)[accuracy_order:end-accuracy_order], Inf) < 100000*eps(T)
mul!(res, D, x6); @test norm((res - 30 .* x4)[accuracy_order:end-accuracy_order], Inf) < 220000*eps(T)
mul!(res, D, x7); @test norm((res - 42 .* x5)[accuracy_order:end-accuracy_order], Inf) > 220000*eps(T)
Expand Down

0 comments on commit 93a88f8

Please sign in to comment.