JuliaSIMD · chriselrod · Feb 5, 2021 · Feb 4, 2021 · Feb 4, 2021 · Feb 5, 2021
diff --git a/.github/workflows/ci-julia-nightly-noavx2.yml b/.github/workflows/ci-julia-nightly-noavx2.yml
@@ -0,0 +1,44 @@
+name: CI (Julia nightly)  # NOTE(review): added as ci-julia-nightly-noavx2.yml; confirm the name does not clash with another workflow
+on:
+  pull_request:
+    branches:
+      - master  # only pull requests targeting master
+  push:
+    branches:
+      - master
+    tags: '*'  # tag pushes matching this pattern also trigger the workflow
+jobs:
+  test-julia-nightly:
+    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false  # let the remaining matrix jobs finish when one fails
+      matrix:
+        version:
+          - 'nightly'
+        os:
+          - macOS-latest  # the only OS in this matrix
+        arch:
+          - x64
+    steps:
+      - uses: actions/checkout@v2
+      - uses: julia-actions/setup-julia@v1
+        with:
+          version: ${{ matrix.version }}
+          arch: ${{ matrix.arch }}
+      - uses: actions/cache@v1  # cache ~/.julia/artifacts between runs
+        env:
+          cache-name: cache-artifacts
+        with:
+          path: ~/.julia/artifacts
+          key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}  # changes whenever any Project.toml changes
+          restore-keys: |
+            ${{ runner.os }}-test-${{ env.cache-name }}-
+            ${{ runner.os }}-test-
+            ${{ runner.os }}-
+      - uses: julia-actions/julia-buildpkg@v1
+      - uses: julia-actions/julia-runtest@v1
+      - uses: julia-actions/julia-processcoverage@v1
+      - uses: codecov/codecov-action@v1
+        with:
+          file: lcov.info  # coverage file handed to the Codecov action
diff --git a/.github/workflows/ci-noavx2.yml b/.github/workflows/ci-noavx2.yml
@@ -0,0 +1,96 @@
+name: CI  # NOTE(review): added as ci-noavx2.yml; confirm the name does not clash with another workflow
+on:
+  pull_request:
+    branches:
+      - master
+    paths-ignore:  # changes limited to these files do not trigger the workflow
+      - 'LICENSE.md'
+      - 'README.md'
+      - '.github/workflows/TagBot.yml'
+  push:
+    branches:
+      - master
+    paths-ignore:  # same ignore list as for pull requests
+      - 'LICENSE.md'
+      - 'README.md'
+      - '.github/workflows/TagBot.yml'
+    tags: '*'  # tag pushes matching this pattern also trigger the workflow
+jobs:
+  test:
+    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false  # let the remaining matrix jobs finish when one fails
+      matrix:
+        version:
+          - '1.5'  # quoted so YAML keeps it a string
+          - '1' # automatically expands to the latest stable 1.x release of Julia.
+        os:
+          - macOS-latest  # the only OS in this matrix
+        arch:
+          - x64
+    steps:
+      - uses: actions/checkout@v2
+      - uses: julia-actions/setup-julia@v1
+        with:
+          version: ${{ matrix.version }}
+          arch: ${{ matrix.arch }}
+      - uses: actions/cache@v1  # cache ~/.julia/artifacts between runs
+        env:
+          cache-name: cache-artifacts
+        with:
+          path: ~/.julia/artifacts
+          key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}  # changes whenever any Project.toml changes
+          restore-keys: |
+            ${{ runner.os }}-test-${{ env.cache-name }}-
+            ${{ runner.os }}-test-
+            ${{ runner.os }}-
+      - uses: julia-actions/julia-buildpkg@v1
+      - uses: julia-actions/julia-runtest@v1
+      - uses: julia-actions/julia-processcoverage@v1
+      - uses: codecov/codecov-action@v1
+        with:
+          file: lcov.info  # coverage file handed to the Codecov action
+  docs:
+    name: Documentation
+    runs-on: ubuntu-latest  # NOTE(review): not part of the macOS matrix; if another workflow already builds the docs this job duplicates it — confirm
+    steps:
+      - uses: actions/checkout@v2
+      - uses: julia-actions/setup-julia@v1
+        with:
+          version: '1'
+      - run: julia --color=yes -e 'using Pkg; VERSION >= v"1.5-" && !isdir(joinpath(DEPOT_PATH[1], "registries", "General")) && Pkg.Registry.add("General")'
+        shell: bash
+        env:
+          JULIA_PKG_SERVER: ""  # set to the empty string for this step only; presumably bypasses the Pkg server — confirm
+      - run: |
+          julia --project=docs -e '
+            using Pkg
+            Pkg.develop(PackageSpec(path=pwd()))
+            Pkg.instantiate()'
+      - run: julia --project=docs docs/make.jl
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}  # both secrets are exposed to docs/make.jl only
+          DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
+  doctests:
+    name: Doctests
+    runs-on: ubuntu-latest  # NOTE(review): not part of the macOS matrix; confirm this job is wanted in the noavx2 workflow
+    steps:
+      - uses: actions/checkout@v2
+      - uses: julia-actions/setup-julia@v1
+        with:
+          version: 'nightly'  # doctests run on nightly, while the docs job uses '1'
+      - run: julia --color=yes -e 'using Pkg; VERSION >= v"1.5-" && !isdir(joinpath(DEPOT_PATH[1], "registries", "General")) && Pkg.Registry.add("General")'
+        shell: bash
+        env:
+          JULIA_PKG_SERVER: ""  # set to the empty string for this step only; presumably bypasses the Pkg server — confirm
+      - run: |
+          julia --project=docs -e '
+            using Pkg
+            Pkg.develop(PackageSpec(path=pwd()))
+            Pkg.instantiate()'
+      - run: |
+          julia --project=docs -e '
+            using Documenter: doctest
+            using SLEEFPirates
+            doctest(SLEEFPirates)'
diff --git a/Project.toml b/Project.toml
@@ -10,7 +10,7 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
 [compat]
 IfElse = "0.1"
-VectorizationBase = "0.18"
+VectorizationBase = "0.18.7"
 julia = "1.5"
 
 [extras]

diff --git a/src/double.jl b/src/double.jl
@@ -62,12 +62,48 @@ end
 @inline trunclo(x::Float32) = reinterpret(Float32, reinterpret(UInt32, x) & 0xffff_f000) # clear lowest 12 bits (leave upper 12 bits)
 
 # @inline trunclo(x::VecProduct) = trunclo(Vec(data(x)))
-@inline function trunclo(x::Vec{N,Float64}) where {N}
-    reinterpret(Vec{N,Float64}, reinterpret(Vec{N,UInt64}, x) & vbroadcast(Val{N}(), 0xffff_ffff_f800_0000)) # clear lower 27 bits (leave upper 26 bits)
+@inline function trunclo(x::AbstractSIMD{N,Float64}) where {N} # NOTE(review): accepts any AbstractSIMD but reinterprets as Vec{N,...}; confirm non-Vec inputs work
+    reinterpret(Vec{N,Float64}, reinterpret(Vec{N,UInt64}, x) & convert(Vec{N,UInt64}, 0xffff_ffff_f800_0000)) # clear lower 27 bits (leave upper 26 bits)
+end
+@inline function trunclo(x::AbstractSIMD{N,Float32}) where {N} # NOTE(review): accepts any AbstractSIMD but reinterprets as Vec{N,...}; confirm non-Vec inputs work
+    reinterpret(Vec{N,Float32}, reinterpret(Vec{N,UInt32}, x) & convert(Vec{N,UInt32}, 0xffff_f000)) # clear lowest 12 bits (leave upper 12 bits)
+end
+for (op,f,ff) ∈ [("fadd",:add_ieee,:(+)),("fsub",:sub_ieee,:(-)),("fmul",:mul_ieee,:(*)),("fdiv",:fdiv_ieee,:(/)),("frem",:rem_ieee,:(%))] # (op string for binary_op, name of the function to define, Base operator)
+    @eval begin
+        @generated $f(v1::Vec{W,T}, v2::Vec{W,T}) where {W,T<:Union{Float32,Float64}} = VectorizationBase.binary_op($op, W, T) # Vec with Vec: body generated by VectorizationBase.binary_op from the op string
+        @inline $f(s1::T, s2::T) where {T<:Union{Float32,Float64}} = $ff(s1,s2) # two scalars of the same float type: the plain Base operator
+        @inline $f(args::Vararg{Any,K}) where {K} = $f(promote(args...)...) # fallback: promote and re-dispatch. NOTE(review): calls itself again if promote yields no match for a more specific method
+        @inline $f(a::VecUnroll, b::VecUnroll) = VecUnroll(VectorizationBase.fmap($f, VectorizationBase.data(a), VectorizationBase.data(b))) # VecUnroll: map the op over the wrapped data
+    end
 end
-@inline function trunclo(x::Vec{N,Float32}) where {N}
-    reinterpret(Vec{N,Float32}, reinterpret(Vec{N,UInt32}, x) & vbroadcast(Val{N}(), 0xffff_f000)) # clear lowest 12 bits (leave upper 12 bits)
+@inline add_ieee(a, b, c) = add_ieee(add_ieee(a, b), c) # three terms: summed left to right
+@inline add_ieee(a, b, c, d::Vararg{Any,K}) where {K} = add_ieee(add_ieee(a, b, c), d...) # 4+ terms: keep folding left to right, the order Base uses for `a + b + c + d`, so @ieee does not reassociate the written sum
+function sub_ieee!(ex) # rewrite +, -, *, /, % calls inside `ex`, in place, to the SLEEFPirates *_ieee functions
+    ex isa Expr || return
+    if ex.head === :call
+        f = ex.args[1]
+        # Unary calls such as `-x` are left alone: a one-argument *_ieee call only matches the promote fallback, which would call itself forever.
+        if f isa Symbol && length(ex.args) > 2
+            if f === :(+)
+                ex.args[1] = :(SLEEFPirates.add_ieee)
+            elseif f === :(-)
+                ex.args[1] = :(SLEEFPirates.sub_ieee)
+            elseif f === :(*)
+                ex.args[1] = :(SLEEFPirates.mul_ieee)
+            elseif f === :(/)
+                ex.args[1] = :(SLEEFPirates.fdiv_ieee)
+            elseif f === :(%)
+                ex.args[1] = :(SLEEFPirates.rem_ieee)
+            end
+        end
+    end
+    foreach(sub_ieee!, ex.args) # recurse into every sub-expression
+    esc(ex)
 end
+macro ieee(ex)
+    sub_ieee!(ex); esc(ex) # escape here so a non-Expr argument (bare symbol or literal) is returned unchanged instead of `nothing`
+end
+
 
 @inline function splitprec(x::vIEEEFloat)
     hx = trunclo(x)
@@ -200,8 +236,10 @@ end
 @inline function dmul(x::vIEEEFloat, y::vIEEEFloat, ::False)
     hx, lx = splitprec(x)
     hy, ly = splitprec(y)
-    z = (x * y)
-    Double(z, ((hx * hy - z) + lx * hy + hx * ly) + lx * ly)
+    @ieee begin # arithmetic operators in this block are rewritten to the *_ieee functions
+        z = x * y # high part: the rounded product
+        Double(z, (((hx * hy - z) + lx * hy + hx * ly) + lx * ly)) # low part: assembled from the split halves of x and y
+    end
 end
 @inline function dmul(x::Double{<:vIEEEFloat}, y::vIEEEFloat, ::True)
     z = (x.hi * y)
@@ -210,8 +248,10 @@ end
 @inline function dmul(x::Double{<:vIEEEFloat}, y::vIEEEFloat, ::False)
     hx, lx = splitprec(x.hi)
     hy, ly = splitprec(y)
-    z = x.hi * y
-    Double(z, (hx * hy - z) + lx * hy + hx * ly + lx * ly + x.lo * y)
+    @ieee begin # arithmetic operators in this block are rewritten to the *_ieee functions
+        z = x.hi * y # high part: rounded product of x.hi and y
+        Double(z, (hx * hy - z) + lx * hy + hx * ly + lx * ly + x.lo * y) # five-term sum, so it goes through the variadic add_ieee method
+    end
 end
 @inline function dmul(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat}, ::True)
     z = x.hi * y.hi
@@ -220,8 +260,10 @@ end
 @inline function dmul(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat}, ::False)
     hx, lx = splitprec(x.hi)
     hy, ly = splitprec(y.hi)
-    z = x.hi * y.hi
-    Double(z, (((hx * hy - z) + lx * hy + hx * ly) + lx * ly) + x.hi * y.lo + x.lo * y.hi)
+    @ieee begin # arithmetic operators in this block are rewritten to the *_ieee functions
+        z = x.hi * y.hi # high part: rounded product of the two high parts
+        Double(z, (((hx * hy - z) + lx * hy + hx * ly) + lx * ly) + x.hi * y.lo + x.lo * y.hi) # low part: split-product terms plus the hi*lo cross terms
+    end
 end
 @inline dmul(x::vIEEEFloat, y::Double{<:vIEEEFloat}) = dmul(y, x)
 @inline dmul(x, y) = dmul(x, y, fma_fast())
@@ -232,17 +274,21 @@ end
 end
 @inline function dsqu(x::T, ::False) where {T<:vIEEEFloat}
     hx, lx = splitprec(x)
-    z = x * x
-    Double(z, (hx * hx - z) + lx * (hx + hx) + lx * lx)
+    @ieee begin # arithmetic operators in this block are rewritten to the *_ieee functions
+        z = x * x # high part: the rounded square
+        Double(z, (hx * hx - z) + lx * (hx + hx) + lx * lx) # low part: assembled from the split halves of x
+    end
 end
 @inline function dsqu(x::Double{T}, ::True) where {T<:vIEEEFloat}
     z = x.hi * x.hi
     Double(z, vfmsub(x.hi, x.hi, z) + (x.hi * (x.lo + x.lo)))
 end
 @inline function dsqu(x::Double{T}, ::False) where {T<:vIEEEFloat}
     hx, lx = splitprec(x.hi)
-    z = x.hi * x.hi
-    Double(z, (hx * hx - z) + lx * (hx + hx) + lx * lx + x.hi * (x.lo + x.lo))
+    @ieee begin # arithmetic operators in this block are rewritten to the *_ieee functions
+        z = x.hi * x.hi # high part: rounded square of x.hi
+        Double(z, (hx * hx - z) + lx * (hx + hx) + lx * lx + x.hi * (x.lo + x.lo)) # four-term sum, so it goes through the variadic add_ieee method
+    end
 end
 @inline dsqu(x) = dsqu(x, fma_fast())
 
@@ -253,8 +299,8 @@ end
 end
 @inline function dsqrt(x::Double{T}, ::False) where {T<:vIEEEFloat}
     c = _sqrt(x.hi)
-    u = dsqu(c)
-    Double(c, (x.hi - u.hi - u.lo + x.lo) / (c + c))
+    u = dsqu(c, False()) # call the ::False dsqu method directly instead of dispatching on fma_fast()
+    @ieee Double(c, (x.hi - u.hi - u.lo + x.lo) / (c + c)) # correction term (x - c^2) / (2c), evaluated with the *_ieee functions
 end
 @inline dsqrt(x) = dsqrt(x, fma_fast())
 
@@ -265,22 +311,26 @@ end
     Double(zhi, ((vfnmadd(zhi, y.hi, x.hi) + vfnmadd(zhi, y.lo, x.lo)) * invy))
 end
 @inline function ddiv(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat}, ::False)
-    invy = 1 / y.hi
-    c = x.hi * invy
-    u = dmul(c, y.hi)
-    Double(c, ((((x.hi - u.hi) - u.lo) + x.lo) - c * y.lo) * invy)
+    @ieee begin # arithmetic operators in this block are rewritten to the *_ieee functions
+        invy = one(y.hi) / y.hi # reciprocal of the divisor's high part
+        c = x.hi * invy # high part: first approximation of the quotient
+        u = dmul(c, y.hi, False()) # call the ::False dmul method directly instead of dispatching on fma_fast()
+        Double(c, ((((x.hi - u.hi) - u.lo) + x.lo) - c * y.lo) * invy) # low part: remaining residual scaled by invy
+    end
 end
 @inline function ddiv(x::vIEEEFloat, y::vIEEEFloat, ::True)
     ry = inv(y)
     r = (x * ry)
     Double(r, (vfnmadd(r, y, x) * ry))
 end
 @inline function ddiv(x::vIEEEFloat, y::vIEEEFloat, ::False)
-    ry = 1 / y
-    r = x * ry
-    hx, lx = splitprec(r)
-    hy, ly = splitprec(y)
-    Double(r, (((-hx * hy + r * y) - lx * hy - hx * ly) - lx * ly) * ry)
+    @ieee begin # arithmetic operators in this block are rewritten to the *_ieee functions; the body uses no unary minus, so every rewritten call has two operands
+        ry = one(y) / y # reciprocal of the divisor
+        r = x * ry # high part: first approximation of x / y
+        hx, lx = splitprec(r) # note: hx/lx are the halves of r, not of x
+        hy, ly = splitprec(y)
+        Double(r, (((r * y - hx * hy) - lx * hy - hx * ly) - lx * ly) * ry) # `r * y - hx * hy` gives the same value as `-hx * hy + r * y`. NOTE(review): unlike the ::True method (vfnmadd(r, y, x)) this term never references x — confirm
+    end
 end
 @inline ddiv(x, y) = ddiv(x, y, fma_fast())
     # 1/x
@@ -289,19 +339,22 @@ end
     Double(zhi, (vfnmadd(zhi, x, one(eltype(x))) * zhi))
 end
 @inline function drec(x::vIEEEFloat, ::False)
-    c = 1 / x
-    u = dmul(c, x)
-    Double(c, (one(T) - u.hi - u.lo) * c)
+    @ieee begin # arithmetic operators in this block are rewritten to the *_ieee functions
+        c = one(x) / x # high part: first approximation of 1/x
+        u = dmul(c, x, False()) # call the ::False dmul method directly instead of dispatching on fma_fast()
+        Double(c, (one(eltype(u.hi)) - u.hi - u.lo) * c) # low part: residual 1 - c*x scaled by c; no T is bound in this signature, hence one(eltype(u.hi))
+    end
 end
 
 @inline function drec(x::Double{<:vIEEEFloat}, ::True)
-        zhi = inv(x.hi)
-        Double(zhi, ((vfnmadd(zhi, x.hi, one(eltype(x))) - (zhi * x.lo)) * zhi))
+    zhi = inv(x.hi) # high part: reciprocal of x.hi
+    Double(zhi, ((vfnmadd(zhi, x.hi, one(eltype(x))) - (zhi * x.lo)) * zhi)) # low part: residual from vfnmadd minus zhi * x.lo, scaled by zhi
 end
 @inline function drec(x::Double{<:vIEEEFloat}, ::False)
-    c = 1 / x.hi
-    u = dmul(c, x.hi)
-    Double(c, (one(T) - u.hi - u.lo - c * x.lo) * c)
+    @ieee begin # arithmetic operators in this block are rewritten to the *_ieee functions
+        c = one(x.hi) / x.hi # written as a division (not `inv`) so @ieee rewrites it, matching the other ::False methods
+        u = dmul(c, x.hi, False()) # call the ::False dmul method directly instead of dispatching on fma_fast()
+        Double(c, (one(eltype(u.hi)) - u.hi - u.lo - c * x.lo) * c) # low part: residual 1 - c*x scaled by c
+    end
 end
 @inline drec(x) = drec(x, fma_fast())
-
diff --git a/src/exp.jl b/src/exp.jl
@@ -77,6 +77,12 @@ const J_TABLE= Float64[2.0^(big(j-1)/256) for j in 1:256];
 @inline target_trunc(v, ::VectorizationBase.False) = v % UInt32
 @inline target_trunc(v) = target_trunc(v, VectorizationBase.has_feature(Val(:x86_64_avx512dq)))
 
+@inline fast_fma(a, b, c, ::True) = fma(a, b, c) # chosen when the caller passes True (the call sites pass fma_fast())
+@inline function fast_fma(a, b, c, ::False) # fallback: evaluate a*b + c in Double arithmetic
+    d = dadd(dmul(Double(a),Double(b)),Double(c))
+    add_ieee(d.hi, d.lo) # collapse the hi/lo pair into a single value
+end
+
 for (func, base) in (:exp2=>Val(2), :exp=>Val(ℯ), :exp10=>Val(10))
     Ndef1 = :(target_trunc(reinterpret(UInt64, N_float)))
     func_fast = Symbol(func, :_fast)
@@ -85,8 +91,8 @@ for (func, base) in (:exp2=>Val(2), :exp=>Val(ℯ), :exp10=>Val(10))
             N_float = muladd(x, LogBo256INV($base, Float64), MAGIC_ROUND_CONST(Float64))
             N = $Ndef1
             N_float = N_float - MAGIC_ROUND_CONST(Float64)
-            r = vfmadd(N_float, LogBo256U($base, Float64), x)
-            r = vfmadd(N_float, LogBo256L($base, Float64), r)
+            r = fast_fma(N_float, LogBo256U($base, Float64), x, fma_fast())
+            r = fast_fma(N_float, LogBo256L($base, Float64), r, fma_fast())
             js = vload(VectorizationBase.zero_offsets(stridedpointer(J_TABLE)), (N & 0x000000ff,))
             k = N >>> 0x00000008
 
@@ -108,8 +114,8 @@ for (func, base) in (:exp2=>Val(2), :exp=>Val(ℯ), :exp10=>Val(10))
             N = reinterpret(UInt32, N_float)
             N_float = (N_float - MAGIC_ROUND_CONST(Float32))
 
-            r = vfmadd(N_float, LogBU($base, Float32), x)
-            r = vfmadd(N_float, LogBL($base, Float32), r)
+            r = fast_fma(N_float, LogBU($base, Float32), x, fma_fast())
+            r = fast_fma(N_float, LogBL($base, Float32), r, fma_fast())
 
             small_part = reinterpret(UInt32, expb_kernel($base, r))
             twopk = N << 0x00000017

diff --git a/src/trig.jl b/src/trig.jl
@@ -818,7 +818,8 @@ end
 end
 @inline function atan_fast_q(x::Vec{W}) where {W}
     I = fpinttype(eltype(x))
-    ifelse(signbit(x), vbroadcast(Val{W}(), 2 % I), vzero(Val{W}(), I))
+    q = convert(Vec{W,I}, 2 % I) # the value 2 in integer type I, converted to a W-lane vector
+    ifelse(signbit(x), q, zero(q)) # 2 where the sign bit of x is set, 0 elsewhere
 end
 """
     atan_fast(x)