From a974ff555e15692573f08222e7c79739bf03e0c2 Mon Sep 17 00:00:00 2001
From: Chris Elrod <elrodc@gmail.com>
Date: Sat, 30 Jan 2021 23:26:11 -0500
Subject: [PATCH 1/3] Updates for VectorizatoinBase preferences

---
 Project.toml |  6 +++---
 src/exp.jl   | 23 +++++++----------------
 2 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/Project.toml b/Project.toml
index 47148bb..67c77da 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "SLEEFPirates"
 uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa"
 authors = ["chriselrod <elrodc@gmail.com>"]
-version = "0.6.8"
+version = "0.6.9"
 
 [deps]
 IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
@@ -10,8 +10,8 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
 [compat]
 IfElse = "0.1"
-VectorizationBase = "0.16, 0.17"
-julia = "1.5"
+VectorizationBase = "0.18"
+julia = "1.6"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/src/exp.jl b/src/exp.jl
index 1530780..3470b4e 100644
--- a/src/exp.jl
+++ b/src/exp.jl
@@ -73,14 +73,9 @@ end
 
 const J_TABLE= Float64[2.0^(big(j-1)/256) for j in 1:256];
 
-@generated function targetspecific_truncate(v)
-    ex = if VectorizationBase.has_feature("x86_64_avx512dq")
-        :v
-    else
-        :(v % UInt32)
-    end
-    Expr(:block, Expr(:meta,:inline), ex)
-end
+@inline target_trunc(v, ::VectorizationBase.True) = v
+@inline target_trunc(v, ::VectorizationBase.False) = v % UInt32
+@inline target_trunc(v) = target_trunc(v, VectorizationBase.has_feature(Val(:x86_64_avx512dq)))
 
 for (func, base) in (:exp2=>Val(2), :exp=>Val(ℯ), :exp10=>Val(10))
     Ndef1 = :(targetspecific_truncate(reinterpret(UInt64, N_float)))
@@ -179,14 +174,10 @@ end
     return exthorner(x, (1.0f0, 0.5f0, hi_order))
 end
 
-@generated function inttype(::Type{Float64})
-    if VectorizationBase.has_feature("x86_64_avx512dq")
-        :Int64
-    else
-        :Int32
-    end
-end
-inttype(::Type{Float32}) = Int32
+@inline widest_supported_integer(::VectorizationBase.True) = Int64
+@inline widest_supported_integer(::VectorizationBase.False) = Int32
+@inline inttype(::Type{Float64}) = widest_supported_integer(VectorizationBase.has_feature(Val(:x86_64_avx512dq)))
+@inline inttype(::Type{Float32}) = Int32
 
 @inline function expm1_fast(x::FloatType)
     T = eltype(x)

From f1b3e5744410b4fcfee15feb85c22b7ef5771c51 Mon Sep 17 00:00:00 2001
From: Chris Elrod <elrodc@gmail.com>
Date: Sun, 31 Jan 2021 03:41:23 -0500
Subject: [PATCH 2/3] Updates for VectorizationBase 0.18

---
 src/SLEEFPirates.jl |   2 +-
 src/double.jl       | 202 +++++++++++++++++++++-----------------------
 src/exp.jl          |  14 +--
 test/testsetup.jl   |   4 +-
 4 files changed, 108 insertions(+), 114 deletions(-)

diff --git a/src/SLEEFPirates.jl b/src/SLEEFPirates.jl
index d8e8a47..091174e 100644
--- a/src/SLEEFPirates.jl
+++ b/src/SLEEFPirates.jl
@@ -6,7 +6,7 @@ using Base.Math: uinttype, exponent_bias, exponent_mask, significand_bits, IEEEF
 using Libdl, VectorizationBase
 
 using VectorizationBase: vzero, AbstractSIMD, _Vec, fma_fast, data, VecUnroll, NativeTypes, FloatingTypes,
-    vfmadd, vfnmadd, vfmsub, vfnmsub
+    vfmadd, vfnmadd, vfmsub, vfnmsub, True, False
 
 import IfElse: ifelse
 
diff --git a/src/double.jl b/src/double.jl
index be0818e..6417e73 100644
--- a/src/double.jl
+++ b/src/double.jl
@@ -193,125 +193,115 @@ end
 end
 
 # two-prod-fma
-@inline function dmul(x::vIEEEFloat, y::vIEEEFloat)
-    if fma_fast()
-        z = (x * y)
-        Double(z, vfmsub(x, y, z))
-    else
-        hx, lx = splitprec(x)
-        hy, ly = splitprec(y)
-        z = (x * y)
-        Double(z, ((hx * hy - z) + lx * hy + hx * ly) + lx * ly)
-    end
-end
-
-@inline function dmul(x::Double{<:vIEEEFloat}, y::vIEEEFloat)
-    if fma_fast()
-        z = (x.hi * y)
-        Double(z, vfmsub(x.hi, y, z) + x.lo * y)
-    else
-        hx, lx = splitprec(x.hi)
-        hy, ly = splitprec(y)
-        z = x.hi * y
-        Double(z, (hx * hy - z) + lx * hy + hx * ly + lx * ly + x.lo * y)
-    end
+@inline function dmul(x::vIEEEFloat, y::vIEEEFloat, ::True)
+    z = (x * y)
+    Double(z, vfmsub(x, y, z))
+end
+@inline function dmul(x::vIEEEFloat, y::vIEEEFloat, ::False)
+    hx, lx = splitprec(x)
+    hy, ly = splitprec(y)
+    z = (x * y)
+    Double(z, ((hx * hy - z) + lx * hy + hx * ly) + lx * ly)
+end
+@inline function dmul(x::Double{<:vIEEEFloat}, y::vIEEEFloat, ::True)
+    z = (x.hi * y)
+    Double(z, vfmsub(x.hi, y, z) + x.lo * y)
+end
+@inline function dmul(x::Double{<:vIEEEFloat}, y::vIEEEFloat, ::False)
+    hx, lx = splitprec(x.hi)
+    hy, ly = splitprec(y)
+    z = x.hi * y
+    Double(z, (hx * hy - z) + lx * hy + hx * ly + lx * ly + x.lo * y)
+end
+@inline function dmul(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat}, ::True)
+    z = x.hi * y.hi
+    Double(z, vfmsub(x.hi, y.hi, z) + x.hi * y.lo + x.lo * y.hi)
+end
+@inline function dmul(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat}, ::False)
+    hx, lx = splitprec(x.hi)
+    hy, ly = splitprec(y.hi)
+    z = x.hi * y.hi
+    Double(z, (((hx * hy - z) + lx * hy + hx * ly) + lx * ly) + x.hi * y.lo + x.lo * y.hi)
 end
 @inline dmul(x::vIEEEFloat, y::Double{<:vIEEEFloat}) = dmul(y, x)
-
-@inline function dmul(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat})
-    if fma_fast()
-        z = x.hi * y.hi
-        Double(z, vfmsub(x.hi, y.hi, z) + x.hi * y.lo + x.lo * y.hi)
-    else
-        hx, lx = splitprec(x.hi)
-        hy, ly = splitprec(y.hi)
-        z = x.hi * y.hi
-        Double(z, (((hx * hy - z) + lx * hy + hx * ly) + lx * ly) + x.hi * y.lo + x.lo * y.hi)
-    end
-end
+@inline dmul(x, y) = dmul(x, y, fma_fast())
     # x^2
-@inline function dsqu(x::T) where {T<:vIEEEFloat}
-    if fma_fast()
-        z = x * x
-        Double(z, vfmsub(x, x, z))
-    else
-        hx, lx = splitprec(x)
-        z = x * x
-        Double(z, (hx * hx - z) + lx * (hx + hx) + lx * lx)
-    end
+@inline function dsqu(x::T, ::True) where {T<:vIEEEFloat}
+    z = x * x
+    Double(z, vfmsub(x, x, z))
 end
-
-@inline function dsqu(x::Double{T}) where {T<:vIEEEFloat}
-    if fma_fast()
-        z = x.hi * x.hi
-        Double(z, vfmsub(x.hi, x.hi, z) + (x.hi * (x.lo + x.lo)))
-    else
-        hx, lx = splitprec(x.hi)
-        z = x.hi * x.hi
-        Double(z, (hx * hx - z) + lx * (hx + hx) + lx * lx + x.hi * (x.lo + x.lo))
-    end
+@inline function dsqu(x::T, ::False) where {T<:vIEEEFloat}
+    hx, lx = splitprec(x)
+    z = x * x
+    Double(z, (hx * hx - z) + lx * (hx + hx) + lx * lx)
 end
-
-    # sqrt(x)
-@inline function dsqrt(x::Double{T}) where {T<:vIEEEFloat}
-    if fma_fast()
-        zhi = _sqrt(x.hi)
-        Double(zhi, (x.lo + vfnmadd(zhi, zhi, x.hi)) / (zhi + zhi))
-    else
-        c = _sqrt(x.hi)
-        u = dsqu(c)
-        Double(c, (x.hi - u.hi - u.lo + x.lo) / (c + c))
-    end
+@inline function dsqu(x::Double{T}, ::True) where {T<:vIEEEFloat}
+    z = x.hi * x.hi
+    Double(z, vfmsub(x.hi, x.hi, z) + (x.hi * (x.lo + x.lo)))
 end
-
-    # x/y
-@inline function ddiv(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat})
-    if fma_fast()
-        invy = inv(y.hi)
-        zhi = (x.hi * invy)
-        Double(zhi, ((vfnmadd(zhi, y.hi, x.hi) + vfnmadd(zhi, y.lo, x.lo)) * invy))
-    else
-        invy = 1 / y.hi
-        c = x.hi * invy
-        u = dmul(c, y.hi)
-        Double(c, ((((x.hi - u.hi) - u.lo) + x.lo) - c * y.lo) * invy)
-    end
+@inline function dsqu(x::Double{T}, ::False) where {T<:vIEEEFloat}
+    hx, lx = splitprec(x.hi)
+    z = x.hi * x.hi
+    Double(z, (hx * hx - z) + lx * (hx + hx) + lx * lx + x.hi * (x.lo + x.lo))
 end
+@inline dsqu(x) = dsqu(x, fma_fast())
 
-@inline function ddiv(x::vIEEEFloat, y::vIEEEFloat)
-    if fma_fast()
-        ry = inv(y)
-        r = (x * ry)
-        Double(r, (vfnmadd(r, y, x) * ry))
-    else
-        ry = 1 / y
-        r = x * ry
-        hx, lx = splitprec(r)
-        hy, ly = splitprec(y)
-        Double(r, (((-hx * hy + r * y) - lx * hy - hx * ly) - lx * ly) * ry)
-    end
+    # sqrt(x)
+@inline function dsqrt(x::Double{T}, ::True) where {T<:vIEEEFloat}
+    zhi = _sqrt(x.hi)
+    Double(zhi, (x.lo + vfnmadd(zhi, zhi, x.hi)) / (zhi + zhi))
+end
+@inline function dsqrt(x::Double{T}, ::False) where {T<:vIEEEFloat}
+    c = _sqrt(x.hi)
+    u = dsqu(c)
+    Double(c, (x.hi - u.hi - u.lo + x.lo) / (c + c))
 end
+@inline dsqrt(x) = dsqrt(x, fma_fast())
 
+    # x/y
+@inline function ddiv(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat}, ::True)
+    invy = inv(y.hi)
+    zhi = (x.hi * invy)
+    Double(zhi, ((vfnmadd(zhi, y.hi, x.hi) + vfnmadd(zhi, y.lo, x.lo)) * invy))
+end
+@inline function ddiv(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat}, ::False)
+    invy = 1 / y.hi
+    c = x.hi * invy
+    u = dmul(c, y.hi)
+    Double(c, ((((x.hi - u.hi) - u.lo) + x.lo) - c * y.lo) * invy)
+end
+@inline function ddiv(x::vIEEEFloat, y::vIEEEFloat, ::True)
+    ry = inv(y)
+    r = (x * ry)
+    Double(r, (vfnmadd(r, y, x) * ry))
+end
+@inline function ddiv(x::vIEEEFloat, y::vIEEEFloat, ::False)
+    ry = 1 / y
+    r = x * ry
+    hx, lx = splitprec(r)
+    hy, ly = splitprec(y)
+    Double(r, (((-hx * hy + r * y) - lx * hy - hx * ly) - lx * ly) * ry)
+end
+@inline ddiv(x, y) = ddiv(x, y, fma_fast())
     # 1/x
-@inline function drec(x::vIEEEFloat)
-    if fma_fast()
-        zhi = inv(x)
-        Double(zhi, (vfnmadd(zhi, x, one(eltype(x))) * zhi))
-    else
-        c = 1 / x
-        u = dmul(c, x)
-        Double(c, (one(T) - u.hi - u.lo) * c)
-    end
+@inline function drec(x::vIEEEFloat, ::True)
+    zhi = inv(x)
+    Double(zhi, (vfnmadd(zhi, x, one(eltype(x))) * zhi))
+end
+@inline function drec(x::vIEEEFloat, ::False)
+    c = 1 / x
+    u = dmul(c, x)
+    Double(c, (one(T) - u.hi - u.lo) * c)
 end
 
-@inline function drec(x::Double{<:vIEEEFloat})
-    if fma_fast()
+@inline function drec(x::Double{<:vIEEEFloat}, ::True)
         zhi = inv(x.hi)
         Double(zhi, ((vfnmadd(zhi, x.hi, one(eltype(x))) - (zhi * x.lo)) * zhi))
-        
-    else
-        c = 1 / x.hi
-        u = dmul(c, x.hi)
-        Double(c, (one(T) - u.hi - u.lo - c * x.lo) * c)
-    end
 end
+@inline function drec(x::Double{<:vIEEEFloat}, ::False)
+    c = 1 / x.hi
+    u = dmul(c, x.hi)
+    Double(c, (one(T) - u.hi - u.lo - c * x.lo) * c)
+end
+@inline drec(x) = drec(x, fma_fast())
+
diff --git a/src/exp.jl b/src/exp.jl
index 3470b4e..c2bc690 100644
--- a/src/exp.jl
+++ b/src/exp.jl
@@ -78,7 +78,7 @@ const J_TABLE= Float64[2.0^(big(j-1)/256) for j in 1:256];
 @inline target_trunc(v) = target_trunc(v, VectorizationBase.has_feature(Val(:x86_64_avx512dq)))
 
 for (func, base) in (:exp2=>Val(2), :exp=>Val(ℯ), :exp10=>Val(10))
-    Ndef1 = :(targetspecific_truncate(reinterpret(UInt64, N_float)))
+    Ndef1 = :(target_trunc(reinterpret(UInt64, N_float)))
     func_fast = Symbol(func, :_fast)
     @eval begin
         @inline function $func_fast(x::FloatType64)
@@ -174,10 +174,14 @@ end
     return exthorner(x, (1.0f0, 0.5f0, hi_order))
 end
 
-@inline widest_supported_integer(::VectorizationBase.True) = Int64
-@inline widest_supported_integer(::VectorizationBase.False) = Int32
-@inline inttype(::Type{Float64}) = widest_supported_integer(VectorizationBase.has_feature(Val(:x86_64_avx512dq)))
-@inline inttype(::Type{Float32}) = Int32
+if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686)
+    @inline widest_supported_integer(::VectorizationBase.True) = Int64
+    @inline widest_supported_integer(::VectorizationBase.False) = Int32
+    @inline inttype(::Type{Float64}) = widest_supported_integer(VectorizationBase.has_feature(Val(:x86_64_avx512dq)))
+    @inline inttype(::Type{Float32}) = Int32
+else
+    @inline inttype(_) = Int
+end
 
 @inline function expm1_fast(x::FloatType)
     T = eltype(x)
diff --git a/test/testsetup.jl b/test/testsetup.jl
index 233c2f4..b3216b2 100644
--- a/test/testsetup.jl
+++ b/test/testsetup.jl
@@ -210,9 +210,9 @@ function test_acc(T, fun_table, xx, tol; debug = false, tol_debug = 5)
         # Vector test is mostly to make sure that they do not error
         # Results should either be the same as scalar
         # Or they're from another library (e.g., GLIBC), and may differ slighlty
-        test_vector(xfun, fun, VectorizationBase.pick_vector_width_val(T), first(xx), last(xx), tol)
-        test_vector(xfun, fun, Val(2), first(xx), last(xx), tol)
         W = VectorizationBase.pick_vector_width(T)
+        test_vector(xfun, fun, W, first(xx), last(xx), tol)
+        test_vector(xfun, fun, Val(2), first(xx), last(xx), tol)
         if W ≥ 4
             test_vector(xfun, fun, Val(4), first(xx), last(xx), tol)
         end

From 9946a8802cd5b49aa2f5f8951c2d5e75e72c7d8e Mon Sep 17 00:00:00 2001
From: Chris Elrod <elrodc@gmail.com>
Date: Mon, 1 Feb 2021 00:34:11 -0500
Subject: [PATCH 3/3] VectorizationBase 0.18 now allows Julia 1.5

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 67c77da..40da2e6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,7 +11,7 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 [compat]
 IfElse = "0.1"
 VectorizationBase = "0.18"
-julia = "1.6"
+julia = "1.5"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"