diff --git a/docs/make.jl b/docs/make.jl index 2e73f638a9..ecfaaa256c 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,62 +1,61 @@ -using Documenter, Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, - ChainRulesCore +using Documenter, + Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, ChainRulesCore DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive = true) makedocs(; - modules = [ - Flux, - NNlib, - Functors, - MLUtils, - BSON, - Optimisers, - OneHotArrays, - Zygote, - ChainRulesCore, - Base, - ], - doctest = false, - sitename = "Flux", - # strict = [:cross_references,], - pages = [ - "Home" => "index.md", - "Building Models" => [ - "Overview" => "models/overview.md", - "Basics" => "models/basics.md", - "Recurrence" => "models/recurrence.md", - "Layer Reference" => "models/layers.md", - "Loss Functions" => "models/losses.md", - "Regularisation" => "models/regularisation.md", - "Custom Layers" => "models/advanced.md", - "NNlib.jl" => "models/nnlib.md", - "Activation Functions" => "models/activation.md", - ], - "Handling Data" => [ - "MLUtils.jl" => "data/mlutils.md", - "OneHotArrays.jl" => "data/onehot.md", - ], - "Training Models" => [ - "Optimisers" => "training/optimisers.md", - "Training" => "training/training.md", - "Callback Helpers" => "training/callbacks.md", - "Zygote.jl" => "training/zygote.md", - ], - "GPU Support" => "gpu.md", - "Model Tools" => [ - "Saving & Loading" => "saving.md", - "Shape Inference" => "outputsize.md", - "Weight Initialisation" => "utilities.md", - "Functors.jl" => "models/functors.md", - ], - "Performance Tips" => "performance.md", - "Flux's Ecosystem" => "ecosystem.md", - ], - format = Documenter.HTML(; sidebar_sitename = false, - analytics = "UA-36890222-9", - assets = ["assets/flux.css"], - prettyurls = get(ENV, "CI", nothing) == "true")) + modules = [ + Flux, + NNlib, + Functors, + MLUtils, + BSON, + Optimisers, + OneHotArrays, + Zygote, + ChainRulesCore, + Base, + ], + doctest = false, + sitename = "Flux", + # strict = [:cross_references,], + pages = [ + "Home" => "index.md", + "Building Models" => [ + "Overview" => "models/overview.md", + "Basics" => "models/basics.md", + "Recurrence" => "models/recurrence.md", + "Layer Reference" => "models/layers.md", + "Loss Functions" => "models/losses.md", + "Regularisation" => "models/regularisation.md", + "Custom Layers" => "models/advanced.md", + "NNlib.jl" => "models/nnlib.md", + "Activation Functions" => "models/activation.md", + ], + "Handling Data" => + ["MLUtils.jl" => "data/mlutils.md", "OneHotArrays.jl" => "data/onehot.md"], + "Training Models" => [ + "Optimisers" => "training/optimisers.md", + "Training" => "training/training.md", + "Callback Helpers" => "training/callbacks.md", + "Zygote.jl" => "training/zygote.md", + ], + "GPU Support" => "gpu.md", + "Model Tools" => [ + "Saving & Loading" => "saving.md", + "Shape Inference" => "outputsize.md", + "Weight Initialisation" => "utilities.md", + "Functors.jl" => "models/functors.md", + ], + "Performance Tips" => "performance.md", + "Flux's Ecosystem" => "ecosystem.md", + ], + format = Documenter.HTML(; + sidebar_sitename = false, + analytics = "UA-36890222-9", + assets = ["assets/flux.css"], + prettyurls = get(ENV, "CI", nothing) == "true", + ), +) -deploydocs(; repo = "github.com/FluxML/Flux.jl.git", - target = "build", - push_preview = true) +deploydocs(; repo = "github.com/FluxML/Flux.jl.git", target = "build", push_preview = true) diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl index d7897851a4..f719b01c99 100644 --- a/perf/bench_utils.jl +++ b/perf/bench_utils.jl @@ -24,19 +24,19 @@ function run_benchmark(model, x; cuda = true) fw(model, x) GC.gc() CUDA.reclaim() #warmup - @btime CUDA.@sync(fw($model, $x)) teardown=(GC.gc(); CUDA.reclaim()) + @btime CUDA.@sync(fw($model, $x)) teardown = (GC.gc(); CUDA.reclaim()) println(" backward") bw(back) GC.gc() CUDA.reclaim() #warmup - @btime CUDA.@sync(bw($back)) teardown=(GC.gc(); CUDA.reclaim()) + @btime CUDA.@sync(bw($back)) teardown = (GC.gc(); CUDA.reclaim()) println(" forw and back") fwbw(model, ps, x) GC.gc() CUDA.reclaim() #warmup - @btime CUDA.@sync(fwbw($model, $ps, $x)) teardown=(GC.gc(); CUDA.reclaim()) + @btime CUDA.@sync(fwbw($model, $ps, $x)) teardown = (GC.gc(); CUDA.reclaim()) else println(" forward") fw(model, x) #warmup diff --git a/perf/recurrent.jl b/perf/recurrent.jl index 9002e248d6..bf4a2474da 100644 --- a/perf/recurrent.jl +++ b/perf/recurrent.jl @@ -51,7 +51,7 @@ end for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM] rnn_benchmark_sweep(rnn_type) do n, ts - return [randn(Float32, n, n) for _ in 1:ts], "Vec" + return [randn(Float32, n, n) for _ = 1:ts], "Vec" end end diff --git a/perf/vgg.jl b/perf/vgg.jl index dad9d1aad1..d86fdd6fe1 100644 --- a/perf/vgg.jl +++ b/perf/vgg.jl @@ -6,43 +6,45 @@ using CUDA using Zygote: pullback function vgg16() - return Chain(Conv((3, 3), 3 => 64, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(64), - Conv((3, 3), 64 => 64, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(64), - MaxPool((2, 2)), - Conv((3, 3), 64 => 128, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(128), - Conv((3, 3), 128 => 128, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(128), - MaxPool((2, 2)), - Conv((3, 3), 128 => 256, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(256), - Conv((3, 3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(256), - Conv((3, 3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(256), - MaxPool((2, 2)), - Conv((3, 3), 256 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - MaxPool((2, 2)), - Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - MaxPool((2, 2)), - flatten, - Dense(512, 4096, relu), - Dropout(0.5), - Dense(4096, 4096, relu), - Dropout(0.5), - Dense(4096, 10)) + return Chain( + Conv((3, 3), 3 => 64, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(64), + Conv((3, 3), 64 => 64, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(64), + MaxPool((2, 2)), + Conv((3, 3), 64 => 128, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(128), + Conv((3, 3), 128 => 128, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(128), + MaxPool((2, 2)), + Conv((3, 3), 128 => 256, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(256), + Conv((3, 3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(256), + Conv((3, 3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(256), + MaxPool((2, 2)), + Conv((3, 3), 256 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + MaxPool((2, 2)), + Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + MaxPool((2, 2)), + flatten, + Dense(512, 4096, relu), + Dropout(0.5), + Dense(4096, 4096, relu), + Dropout(0.5), + Dense(4096, 10), + ) end let model = vgg16(), x = rand(Float32, 32, 32, 3, 64) diff --git a/src/Flux.jl b/src/Flux.jl index 54335110f7..987917387b 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -15,29 +15,74 @@ export gradient # Pirate error to catch a common mistake. (Internal function `base` because overloading `update!` is more likely to give ambiguities.) function Optimisers.base(dx::Zygote.Grads) - return error("Optimisers.jl cannot be used with Zygote.jl's implicit gradients, `Params` & `Grads`") + return error( + "Optimisers.jl cannot be used with Zygote.jl's implicit gradients, `Params` & `Grads`", + ) end -export Chain, Dense, Maxout, SkipConnection, Parallel, PairwiseFusion, - RNN, LSTM, GRU, GRUv3, - SamePad, Conv, CrossCor, ConvTranspose, DepthwiseConv, - AdaptiveMaxPool, AdaptiveMeanPool, GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool, - Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, - Upsample, PixelShuffle, - fmap, cpu, gpu, f32, f64, - testmode!, trainmode! +export Chain, + Dense, + Maxout, + SkipConnection, + Parallel, + PairwiseFusion, + RNN, + LSTM, + GRU, + GRUv3, + SamePad, + Conv, + CrossCor, + ConvTranspose, + DepthwiseConv, + AdaptiveMaxPool, + AdaptiveMeanPool, + GlobalMaxPool, + GlobalMeanPool, + MaxPool, + MeanPool, + Dropout, + AlphaDropout, + LayerNorm, + BatchNorm, + InstanceNorm, + GroupNorm, + Upsample, + PixelShuffle, + fmap, + cpu, + gpu, + f32, + f64, + testmode!, + trainmode! include("optimise/Optimise.jl") using .Optimise using .Optimise: @epochs using .Optimise: skip -export Descent, Adam, Momentum, Nesterov, RMSProp, - AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, OAdam, - AdamW, RAdam, AdaBelief, InvDecay, ExpDecay, - WeightDecay, ClipValue, ClipNorm +export Descent, + Adam, + Momentum, + Nesterov, + RMSProp, + AdaGrad, + AdaMax, + AdaDelta, + AMSGrad, + NAdam, + OAdam, + AdamW, + RAdam, + AdaBelief, + InvDecay, + ExpDecay, + WeightDecay, + ClipValue, + ClipNorm using CUDA -const use_cuda = Ref{Union{Nothing, Bool}}(nothing) +const use_cuda = Ref{Union{Nothing,Bool}}(nothing) using Adapt, Functors, OneHotArrays include("utils.jl") @@ -45,7 +90,9 @@ include("functor.jl") # Pirate error to catch a common mistake. function Functors.functor(::Type{<:MLUtils.DataLoader}, x) - return error("`DataLoader` does not support Functors.jl, thus functions like `Flux.gpu` will not act on its contents.") + return error( + "`DataLoader` does not support Functors.jl, thus functions like `Flux.gpu` will not act on its contents.", + ) end include("layers/stateless.jl") diff --git a/src/cuda/cudnn.jl b/src/cuda/cudnn.jl index c20a7f873c..6ffa43e16a 100644 --- a/src/cuda/cudnn.jl +++ b/src/cuda/cudnn.jl @@ -1,17 +1,39 @@ import NNlibCUDA: batchnorm, ∇batchnorm -function (BN::Flux.BatchNorm)(x::Union{CuArray{T, 2}, CuArray{T, 4}, CuArray{T, 5}}, - cache = nothing) where {T <: Union{Float32, Float64}} +function (BN::Flux.BatchNorm)( + x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}}, + cache = nothing, +) where {T<:Union{Float32,Float64}} @assert BN.affine "BatchNorm: only affine=true supported on gpu" @assert BN.track_stats "BatchNorm: only track_stats=true supported on gpu" - @assert length(BN.β)==size(x, ndims(x) - 1) "BatchNorm: input has wrong number of channels" - return BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; - cache = cache, alpha = 1, beta = 0, eps = BN.ϵ, - training = Flux._isactive(BN))) + @assert length(BN.β) == size(x, ndims(x) - 1) "BatchNorm: input has wrong number of channels" + return BN.λ.( + batchnorm( + BN.γ, + BN.β, + x, + BN.μ, + BN.σ², + BN.momentum; + cache = cache, + alpha = 1, + beta = 0, + eps = BN.ϵ, + training = Flux._isactive(BN), + ) + ) end -function ChainRulesCore.rrule(::typeof(batchnorm), g, b, x, running_mean, running_var, - momentum; kw...) +function ChainRulesCore.rrule( + ::typeof(batchnorm), + g, + b, + x, + running_mean, + running_var, + momentum; + kw..., +) y = batchnorm(g, b, x, running_mean, running_var, momentum; kw...) function batchnorm_pullback(Δ) grad = ∇batchnorm(g, b, x, unthunk(Δ), running_mean, running_var, momentum; kw...) diff --git a/src/deprecations.jl b/src/deprecations.jl index b6f56183f7..6d29cb6fd1 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -1,32 +1,49 @@ # v0.12 deprecations function ones(dims...) - Base.depwarn("Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)", - :ones; force = true) + Base.depwarn( + "Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)", + :ones; + force = true, + ) return Base.ones(Float32, dims...) end ones(T::Type, dims...) = Base.ones(T, dims...) function zeros(dims...) - Base.depwarn("Flux.zeros(size...) is deprecated, please use Flux.zeros32(size...) or Base.zeros(Float32, size...)", - :zeros; force = true) + Base.depwarn( + "Flux.zeros(size...) is deprecated, please use Flux.zeros32(size...) or Base.zeros(Float32, size...)", + :zeros; + force = true, + ) return Base.zeros(Float32, dims...) end zeros(T::Type, dims...) = Base.zeros(T, dims...) function ones32(::Type, dims...) - throw(ArgumentError("Flux.ones32 is always Float32, use Base.ones to specify the element type")) + throw( + ArgumentError( + "Flux.ones32 is always Float32, use Base.ones to specify the element type", + ), + ) end function zeros32(::Type, dims...) - throw(ArgumentError("Flux.zeros32 is always Float32, use Base.zeros to specify the element type")) + throw( + ArgumentError( + "Flux.zeros32 is always Float32, use Base.zeros to specify the element type", + ), + ) end # v0.13 deprecations function Broadcast.broadcasted(f::Recur, args...) # This had an explicit @adjoint rule, calling Zygote.∇map(__context__, f, args...), until v0.12 - Base.depwarn("""Broadcasting is not safe to use with RNNs, as it does not guarantee an iteration order. - Re-writing this as a comprehension would be better.""", :broadcasted) + Base.depwarn( + """Broadcasting is not safe to use with RNNs, as it does not guarantee an iteration order. +Re-writing this as a comprehension would be better.""", + :broadcasted, + ) return map(f, args...) # map isn't really safe either, but end @@ -34,37 +51,46 @@ end struct Zeros function Zeros() - Base.depwarn("Flux.Zeros is no more, has ceased to be, is bereft of life, is an ex-boondoggle... please use bias=false instead", - :Zeros) + Base.depwarn( + "Flux.Zeros is no more, has ceased to be, is bereft of life, is an ex-boondoggle... please use bias=false instead", + :Zeros, + ) return false end end Zeros(args...) = Zeros() # was used both Dense(10, 2, initb = Zeros) and Dense(rand(2,10), Zeros()) function Optimise.update!(x::AbstractArray, x̄) - Base.depwarn("`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.", - :update!) + Base.depwarn( + "`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.", + :update!, + ) return x .-= x̄ end function Diagonal(size::Integer...; kw...) - Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", - :Diagonal) + Base.depwarn( + "Flux.Diagonal is now Flux.Scale, and also allows an activation function.", + :Diagonal, + ) return Scale(size...; kw...) end function Diagonal(size::Tuple; kw...) - Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", - :Diagonal) + Base.depwarn( + "Flux.Diagonal is now Flux.Scale, and also allows an activation function.", + :Diagonal, + ) return Scale(size...; kw...) end # Deprecate this eventually once saving models w/o structure is no more function loadparams!(m, xs) - Base.depwarn("loadparams! will be deprecated eventually. Use loadmodel! instead.", - :loadparams!) + Base.depwarn( + "loadparams! will be deprecated eventually. Use loadmodel! instead.", + :loadparams!, + ) for (p, x) in zip(params(m), xs) - size(p) == size(x) || - error("Expected param size $(size(p)), got $(size(x))") + size(p) == size(x) || error("Expected param size $(size(p)), got $(size(x))") copyto!(p, x) end end diff --git a/src/functor.jl b/src/functor.jl index 993ea95693..4463aaced7 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -104,7 +104,9 @@ else end adapt_storage(to::FluxCUDAAdaptor, x::CUDA.RNG) = x function adapt_storage(to::FluxCUDAAdaptor, x::AbstractRNG) - return error("Cannot map RNG of type $(typeof(x)) to GPU. GPU execution only supports Random.default_rng().") + return error( + "Cannot map RNG of type $(typeof(x)) to GPU. GPU execution only supports Random.default_rng().", + ) end # TODO: figure out the correct design for OneElement @@ -116,8 +118,10 @@ struct FluxCPUAdaptor end adapt_storage(to::FluxCPUAdaptor, x::AbstractArray) = adapt(Array, x) adapt_storage(to::FluxCPUAdaptor, x::AbstractRange) = x adapt_storage(to::FluxCPUAdaptor, x::Zygote.FillArrays.AbstractFill) = x -function adapt_storage(to::FluxCPUAdaptor, - x::T) where {T <: CUDA.CUSPARSE.CUDA.CUSPARSE.AbstractCuSparseMatrix} +function adapt_storage( + to::FluxCPUAdaptor, + x::T, +) where {T<:CUDA.CUSPARSE.CUDA.CUSPARSE.AbstractCuSparseMatrix} return adapt(Array, x) end adapt_storage(to::FluxCPUAdaptor, x::Zygote.OneElement) = x @@ -129,10 +133,13 @@ function ChainRulesCore.rrule(::Type{Array}, x::CUDA.CuArray) return Array(x), dx -> (NoTangent(), CUDA.cu(unthunk(dx))) end -function ChainRulesCore.rrule(::typeof(Adapt.adapt_storage), to::FluxCPUAdaptor, - x::CUDA.AbstractGPUArray) +function ChainRulesCore.rrule( + ::typeof(Adapt.adapt_storage), + to::FluxCPUAdaptor, + x::CUDA.AbstractGPUArray, +) return adapt_storage(to, x), - dx -> (NoTangent(), NoTangent(), adapt_storage(FluxCUDAAdaptor(), unthunk(dx))) + dx -> (NoTangent(), NoTangent(), adapt_storage(FluxCUDAAdaptor(), unthunk(dx))) end # CPU/GPU movement conveniences @@ -206,7 +213,8 @@ function check_use_cuda() end if !(use_cuda[]) @info """The GPU function is being called but the GPU is not accessible. - Defaulting back to the CPU. (No action is required if you want to run on the CPU).""" maxlog=1 + Defaulting back to the CPU. (No action is required if you want to run on the CPU).""" maxlog = + 1 end end end diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 97d0d957ee..647b237144 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -32,7 +32,7 @@ For large models, there is a special type-unstable path which can reduce compila times. This can be used by supplying a vector of layers `Chain([layer1, layer2, ...])`. This feature is somewhat experimental, beware! """ -struct Chain{T <: Union{Tuple, NamedTuple, AbstractVector}} +struct Chain{T<:Union{Tuple,NamedTuple,AbstractVector}} layers::T end @@ -44,16 +44,22 @@ function Chain(; kw...) return Chain(values(kw)) end -@forward Chain.layers Base.getindex, Base.length, Base.first, Base.last, - Base.iterate, Base.lastindex, Base.keys, Base.firstindex +@forward Chain.layers Base.getindex, +Base.length, +Base.first, +Base.last, +Base.iterate, +Base.lastindex, +Base.keys, +Base.firstindex @functor Chain (c::Chain)(x) = _applychain(c.layers, x) -@generated function _applychain(layers::Tuple{Vararg{<:Any, N}}, x) where {N} - symbols = vcat(:x, [gensym() for _ in 1:N]) - calls = [:($(symbols[i + 1]) = layers[$i]($(symbols[i]))) for i in 1:N] +@generated function _applychain(layers::Tuple{Vararg{<:Any,N}}, x) where {N} + symbols = vcat(:x, [gensym() for _ = 1:N]) + calls = [:($(symbols[i+1]) = layers[$i]($(symbols[i]))) for i = 1:N] return Expr(:block, calls...) end @@ -156,18 +162,22 @@ julia> Flux.params(d1) # no trainable bias Params([[1.0 1.0 … 1.0 1.0; 1.0 1.0 … 1.0 1.0]]) ``` """ -struct Dense{F, M <: AbstractMatrix, B} +struct Dense{F,M<:AbstractMatrix,B} weight::M bias::B σ::F - function Dense(W::M, bias = true, σ::F = identity) where {M <: AbstractMatrix, F} + function Dense(W::M, bias = true, σ::F = identity) where {M<:AbstractMatrix,F} b = _create_bias(W, bias, size(W, 1)) - return new{F, M, typeof(b)}(W, b, σ) + return new{F,M,typeof(b)}(W, b, σ) end end -function Dense((in, out)::Pair{<:Integer, <:Integer}, σ = identity; - init = glorot_uniform, bias = true) +function Dense( + (in, out)::Pair{<:Integer,<:Integer}, + σ = identity; + init = glorot_uniform, + bias = true, +) return Dense(init(out, in), bias, σ) end @@ -229,15 +239,17 @@ julia> Flux.params(b) Params([[1 2 3 4]]) ``` """ -struct Scale{F, A <: AbstractArray, B} +struct Scale{F,A<:AbstractArray,B} scale::A bias::B σ::F - function Scale(scale::A, bias::B = true, - σ::F = identity) where {A <: AbstractArray, - B <: Union{Bool, AbstractArray}, F} + function Scale( + scale::A, + bias::B = true, + σ::F = identity, + ) where {A<:AbstractArray,B<:Union{Bool,AbstractArray},F} b = _create_bias(scale, bias, size(scale)...) - return new{F, A, typeof(b)}(scale, b, σ) + return new{F,A,typeof(b)}(scale, b, σ) end end @@ -245,7 +257,7 @@ function Scale(s1::Integer, s23::Integer...; bias = true, init = ones32, _act = return Scale(init(s1, s23...), bias, _act) end function Scale(size_act...; bias = true, init = ones32) - return Scale(size_act[1:(end - 1)]...; bias, init, _act = size_act[end]) + return Scale(size_act[1:(end-1)]...; bias, init, _act = size_act[end]) end @functor Scale @@ -298,11 +310,11 @@ julia> Flux.outputsize(m3, (5, 11)) (7, 11) ``` """ -struct Maxout{T <: Tuple} +struct Maxout{T<:Tuple} layers::T end Maxout(layers...) = Maxout(layers) -Maxout(f::Function, n_alts::Integer) = Maxout((f() for _ in 1:n_alts)...) +Maxout(f::Function, n_alts::Integer) = Maxout((f() for _ = 1:n_alts)...) @functor Maxout @@ -346,7 +358,7 @@ true See also [`Parallel`](@ref), [`Maxout`](@ref). """ -struct SkipConnection{T, F} +struct SkipConnection{T,F} layers::T connection::F #user can pass arbitrary connections here, such as (a,b) -> a + b end @@ -409,24 +421,28 @@ julia> Flux.Bilinear(rand(4, 8, 16), false, tanh) # first dim of weight is the Bilinear((8, 16) => 4, tanh; bias=false) # 512 parameters ``` """ -struct Bilinear{F, A, B} +struct Bilinear{F,A,B} weight::A bias::B σ::F - function Bilinear(W::A, bias = true, σ::F = identity) where {A <: AbstractArray, F} + function Bilinear(W::A, bias = true, σ::F = identity) where {A<:AbstractArray,F} ndims(A) == 3 || throw(ArgumentError("expected a 3-array of weights")) b = _create_bias(W, bias, size(W, 1)) - return new{F, A, typeof(b)}(W, b, σ) + return new{F,A,typeof(b)}(W, b, σ) end end @functor Bilinear -function Bilinear(((in1, in2), out)::Pair{<:Tuple, <:Integer}, σ = identity; - bias = true, init = glorot_uniform) +function Bilinear( + ((in1, in2), out)::Pair{<:Tuple,<:Integer}, + σ = identity; + bias = true, + init = glorot_uniform, +) return Bilinear(init(out, in1, in2), bias, σ) end -function Bilinear((in12, out)::Pair{<:Integer, <:Integer}, σ = identity; kw...) +function Bilinear((in12, out)::Pair{<:Integer,<:Integer}, σ = identity; kw...) return Bilinear((in12, in12) => out, σ; kw...) end @@ -436,8 +452,11 @@ function (a::Bilinear)(x::AbstractMatrix, y::AbstractMatrix) d_z, d_x, d_y = size(W) d_x == size(x, 1) && d_y == size(y, 1) || throw(DimensionMismatch("number of rows in data must match W")) - size(x, 2) == size(y, 2) || - throw(DimensionMismatch("Data inputs must agree on number of columns, got $(size(x,2)) and $(size(y,2))")) + size(x, 2) == size(y, 2) || throw( + DimensionMismatch( + "Data inputs must agree on number of columns, got $(size(x,2)) and $(size(y,2))", + ), + ) # @einsum Wy[o,i,s] := W[o,i,j] * y[j,s] Wy = reshape(reshape(W, (:, d_y)) * y, (d_z, d_x, :)) @@ -454,14 +473,21 @@ end function (a::Bilinear)(x::AbstractVector, y::AbstractVector) return vec(a(reshape(x, :, 1), reshape(y, :, 1))) end -(a::Bilinear)(x::NTuple{2, AbstractArray}) = a(x[1], x[2]) +(a::Bilinear)(x::NTuple{2,AbstractArray}) = a(x[1], x[2]) function Base.show(io::IO, l::Bilinear) if size(l.weight, 2) == size(l.weight, 3) print(io, "Bilinear(", size(l.weight, 2), " => ", size(l.weight, 1)) else - print(io, "Bilinear((", size(l.weight, 2), ", ", size(l.weight, 3), ") => ", - size(l.weight, 1)) + print( + io, + "Bilinear((", + size(l.weight, 2), + ", ", + size(l.weight, 3), + ") => ", + size(l.weight, 1), + ) end l.σ == identity || print(io, ", ", l.σ) l.bias === false && print(io, "; bias=false") @@ -511,7 +537,7 @@ julia> model2[:β] == model2[2] true ``` """ -struct Parallel{F, T <: Union{Tuple, NamedTuple}} +struct Parallel{F,T<:Union{Tuple,NamedTuple}} connection::F layers::T end @@ -520,7 +546,11 @@ Parallel(connection, layers...) = Parallel(connection, layers) function Parallel(connection; kw...) layers = NamedTuple(kw) if :layers in keys(layers) || :connection in keys(layers) - throw(ArgumentError("a Parallel layer cannot have a named sub-layer called `connection` or `layers`")) + throw( + ArgumentError( + "a Parallel layer cannot have a named sub-layer called `connection` or `layers`", + ), + ) end isempty(layers) && return Parallel(connection, ()) return Parallel(connection, layers) @@ -535,7 +565,11 @@ function _parallel_check(layers, xs) nl = length(layers) nx = length(xs) if (nl != nx) - throw(ArgumentError("Parallel with $nl sub-layers can take one input or $nl inputs, but got $nx inputs")) + throw( + ArgumentError( + "Parallel with $nl sub-layers can take one input or $nl inputs, but got $nx inputs", + ), + ) end end ChainRulesCore.@non_differentiable _parallel_check(nl, nx) @@ -547,7 +581,7 @@ end Base.getindex(m::Parallel, i) = m.layers[i] Base.getindex(m::Parallel, i::AbstractVector) = Parallel(m.connection, m.layers[i]) -function Base.getindex(m::Parallel{<:Any, <:NamedTuple}, i::AbstractVector) +function Base.getindex(m::Parallel{<:Any,<:NamedTuple}, i::AbstractVector) return Parallel(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i])) end @@ -605,7 +639,7 @@ end A tuple of length N with the output of each fusion ((`y1`, `y2`, ..., `yN`) in the example above). """ -struct PairwiseFusion{F, T <: Union{Tuple, NamedTuple}} +struct PairwiseFusion{F,T<:Union{Tuple,NamedTuple}} connection::F layers::T end @@ -614,7 +648,11 @@ PairwiseFusion(connection, layers...) = PairwiseFusion(connection, layers) function PairwiseFusion(connection; kw...) layers = NamedTuple(kw) if :layers in keys(layers) || :connection in keys(layers) - throw(ArgumentError("a PairwiseFusion layer cannot have a named sub-layer called `connection` or `layers`")) + throw( + ArgumentError( + "a PairwiseFusion layer cannot have a named sub-layer called `connection` or `layers`", + ), + ) end isempty(layers) && return PairwiseFusion(connection, ()) return PairwiseFusion(connection, layers) @@ -624,7 +662,11 @@ function _pairwise_check(x, layers, T) lx = length(x) N = length(layers) if T <: Tuple && lx != N - throw(ArgumentError("PairwiseFusion with $N sub-layers can take one input or $N inputs, but got $lx inputs")) + throw( + ArgumentError( + "PairwiseFusion with $N sub-layers can take one input or $N inputs, but got $lx inputs", + ), + ) end end ChainRulesCore.@non_differentiable _pairwise_check(lx, N, T) @@ -635,19 +677,24 @@ function (m::PairwiseFusion)(x::T) where {T} end (m::PairwiseFusion)(xs...) = m(xs) -@generated function applypairwisefusion(layers::Tuple{Vararg{<:Any, N}}, connection, - x::T) where {N, T} - y_symbols = [gensym() for _ in 1:(N + 1)] +@generated function applypairwisefusion( + layers::Tuple{Vararg{<:Any,N}}, + connection, + x::T, +) where {N,T} + y_symbols = [gensym() for _ = 1:(N+1)] getinput(i) = T <: Tuple ? :(x[$i]) : :x - calls = [:($(y_symbols[N + 1]) = $(getinput(1)))] - for i in 1:(N - 1) - push!(calls, - quote - $(y_symbols[i]) = layers[$i]($(y_symbols[N + 1])) - $(y_symbols[N + 1]) = connection($(y_symbols[i]), $(getinput(i + 1))) - end) + calls = [:($(y_symbols[N+1]) = $(getinput(1)))] + for i = 1:(N-1) + push!( + calls, + quote + $(y_symbols[i]) = layers[$i]($(y_symbols[N+1])) + $(y_symbols[N+1]) = connection($(y_symbols[i]), $(getinput(i + 1))) + end, + ) end - push!(calls, :($(y_symbols[N]) = layers[$N]($(y_symbols[N + 1])))) + push!(calls, :($(y_symbols[N]) = layers[$N]($(y_symbols[N+1])))) push!(calls, :(return tuple($(Tuple(y_symbols[1:N])...)))) return Expr(:block, calls...) end @@ -661,7 +708,7 @@ Base.getindex(m::PairwiseFusion, i) = m.layers[i] function Base.getindex(m::PairwiseFusion, i::AbstractVector) return PairwiseFusion(m.connection, m.layers[i]) end -function Base.getindex(m::PairwiseFusion{<:Any, <:NamedTuple}, i::AbstractVector) +function Base.getindex(m::PairwiseFusion{<:Any,<:NamedTuple}, i::AbstractVector) return PairwiseFusion(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i])) end @@ -710,15 +757,18 @@ end @functor Embedding -Embedding((in, out)::Pair{<:Integer, <:Integer}; init = randn32) = Embedding(init(out, in)) +Embedding((in, out)::Pair{<:Integer,<:Integer}; init = randn32) = Embedding(init(out, in)) (m::Embedding)(x::Integer) = m.weight[:, x] (m::Embedding)(x::AbstractVector) = NNlib.gather(m.weight, x) (m::Embedding)(x::AbstractArray) = reshape(m(vec(x)), :, size(x)...) -function (m::Embedding)(x::Union{OneHotVector{T, L}, OneHotMatrix{T, L}}) where {T, L} - size(m.weight, 2) == L || - throw(DimensionMismatch("Matrix column must correspond with OneHot size: $(size(m.weight, 2)) != $L")) +function (m::Embedding)(x::Union{OneHotVector{T,L},OneHotMatrix{T,L}}) where {T,L} + size(m.weight, 2) == L || throw( + DimensionMismatch( + "Matrix column must correspond with OneHot size: $(size(m.weight, 2)) != $L", + ), + ) return m(onecold(x)) end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 5cd8782606..b620983dbc 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,7 +1,7 @@ using NNlib: conv, ∇conv_data, depthwiseconv, output_size # pad dims of x with dims of y until ndims(x) == ndims(y) -_paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]...) +_paddims(x::Tuple, y::Tuple) = (x..., y[(end-(length(y)-length(x)-1)):end]...) expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) @@ -48,10 +48,10 @@ julia> layer3(xs) |> size # output size = `ceil(input_size/stride)` = 50 """ struct SamePad end -function calc_padding(lt, pad, k::NTuple{N, T}, dilation, stride) where {T, N} +function calc_padding(lt, pad, k::NTuple{N,T}, dilation, stride) where {T,N} return expand(Val(2 * N), pad) end -function calc_padding(lt, ::SamePad, k::NTuple{N, T}, dilation, stride) where {N, T} +function calc_padding(lt, ::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T} #Ref: "A guide to convolution arithmetic for deep learning" https://arxiv.org/abs/1603.07285 # Effective kernel size, including dilation @@ -127,13 +127,13 @@ julia> Conv((5, 5), 3 => 7; stride = 2, dilation = 4)(xs) |> size (42, 42, 7, 50) ``` """ -struct Conv{N, M, F, A, V} +struct Conv{N,M,F,A,V} σ::F weight::A bias::V - stride::NTuple{N, Int} - pad::NTuple{M, Int} - dilation::NTuple{N, Int} + stride::NTuple{N,Int} + pad::NTuple{M,Int} + dilation::NTuple{N,Int} groups::Int end @@ -159,19 +159,34 @@ julia> Flux.params(layer) |> length 2 ``` """ -function Conv(w::AbstractArray{T, N}, b = true, σ = identity; - stride = 1, pad = 0, dilation = 1, groups = 1) where {T, N} - @assert size(w, N) % groups==0 "Output channel dimension must be divisible by groups." +function Conv( + w::AbstractArray{T,N}, + b = true, + σ = identity; + stride = 1, + pad = 0, + dilation = 1, + groups = 1, +) where {T,N} + @assert size(w, N) % groups == 0 "Output channel dimension must be divisible by groups." stride = expand(Val(N - 2), stride) dilation = expand(Val(N - 2), dilation) - pad = calc_padding(Conv, pad, size(w)[1:(N - 2)], dilation, stride) + pad = calc_padding(Conv, pad, size(w)[1:(N-2)], dilation, stride) bias = _create_bias(w, b, size(w, N)) return Conv(σ, w, bias, stride, pad, dilation, groups) end -function Conv(k::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, groups = 1, - bias = true) where {N} +function Conv( + k::NTuple{N,Integer}, + ch::Pair{<:Integer,<:Integer}, + σ = identity; + init = glorot_uniform, + stride = 1, + pad = 0, + dilation = 1, + groups = 1, + bias = true, +) where {N} weight = convfilter(k, ch; init, groups) return Conv(weight, bias, σ; stride, pad, dilation, groups) end @@ -187,19 +202,29 @@ distribution. This is internally used by the [`Conv`](@ref) layer. """ -function convfilter(filter::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}; - init = glorot_uniform, groups = 1) where {N} +function convfilter( + filter::NTuple{N,Integer}, + ch::Pair{<:Integer,<:Integer}; + init = glorot_uniform, + groups = 1, +) where {N} cin, cout = ch - @assert cin % groups==0 "Input channel dimension must be divisible by groups." - @assert cout % groups==0 "Output channel dimension must be divisible by groups." + @assert cin % groups == 0 "Input channel dimension must be divisible by groups." + @assert cout % groups == 0 "Output channel dimension must be divisible by groups." return init(filter..., cin ÷ groups, cout) end @functor Conv function conv_dims(c::Conv, x::AbstractArray) - return DenseConvDims(x, c.weight; stride = c.stride, padding = c.pad, - dilation = c.dilation, groups = c.groups) + return DenseConvDims( + x, + c.weight; + stride = c.stride, + padding = c.pad, + dilation = c.dilation, + groups = c.groups, + ) end ChainRulesCore.@non_differentiable conv_dims(::Any, ::Any) @@ -214,7 +239,7 @@ _channels_in(l::Conv) = size(l.weight, ndims(l.weight) - 1) * l.groups _channels_out(l::Conv) = size(l.weight, ndims(l.weight)) function Base.show(io::IO, l::Conv) - print(io, "Conv(", size(l.weight)[1:(ndims(l.weight) - 2)]) + print(io, "Conv(", size(l.weight)[1:(ndims(l.weight)-2)]) print(io, ", ", _channels_in(l), " => ", _channels_out(l)) _print_conv_opt(io, l) return print(io, ")") @@ -263,18 +288,18 @@ julia> ConvTranspose((5, 5), 3 => 7; stride = 3, pad = SamePad())(xs) |> size (300, 300, 7, 50) ``` """ -struct ConvTranspose{N, M, F, A, V} +struct ConvTranspose{N,M,F,A,V} σ::F weight::A bias::V - stride::NTuple{N, Int} - pad::NTuple{M, Int} - dilation::NTuple{N, Int} + stride::NTuple{N,Int} + pad::NTuple{M,Int} + dilation::NTuple{N,Int} groups::Int end _channels_in(l::ConvTranspose) = size(l.weight)[end] -_channels_out(l::ConvTranspose) = size(l.weight)[end - 1] * l.groups +_channels_out(l::ConvTranspose) = size(l.weight)[end-1] * l.groups """ ConvTranspose(weight::AbstractArray, [bias, activation; stride, pad, dilation, groups]) @@ -300,19 +325,33 @@ julia> Flux.params(layer) |> length 2 ``` """ -function ConvTranspose(w::AbstractArray{T, N}, bias = true, σ = identity; - stride = 1, pad = 0, dilation = 1, groups = 1) where {T, N} +function ConvTranspose( + w::AbstractArray{T,N}, + bias = true, + σ = identity; + stride = 1, + pad = 0, + dilation = 1, + groups = 1, +) where {T,N} stride = expand(Val(N - 2), stride) dilation = expand(Val(N - 2), dilation) - pad = calc_padding(ConvTranspose, pad, size(w)[1:(N - 2)], dilation, stride) + pad = calc_padding(ConvTranspose, pad, size(w)[1:(N-2)], dilation, stride) b = _create_bias(w, bias, size(w, N - 1) * groups) return ConvTranspose(σ, w, b, stride, pad, dilation, groups) end -function ConvTranspose(k::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - groups = 1, - bias = true) where {N} +function ConvTranspose( + k::NTuple{N,Integer}, + ch::Pair{<:Integer,<:Integer}, + σ = identity; + init = glorot_uniform, + stride = 1, + pad = 0, + dilation = 1, + groups = 1, + bias = true, +) where {N} weight = convfilter(k, reverse(ch); init, groups) return ConvTranspose(weight, bias, σ; stride, pad, dilation, groups) end @@ -322,17 +361,21 @@ end function conv_transpose_dims(c::ConvTranspose, x::AbstractArray) # Calculate size of "input", from ∇conv_data()'s perspective... combined_pad = (c.pad[1:2:end] .+ c.pad[2:2:end]) - I = (size(x)[1:(end - 2)] .- 1) .* c.stride .+ 1 .+ - (size(c.weight)[1:(end - 2)] .- 1) .* c.dilation .- combined_pad - C_in = size(c.weight)[end - 1] * c.groups + I = + (size(x)[1:(end-2)] .- 1) .* c.stride .+ 1 .+ + (size(c.weight)[1:(end-2)] .- 1) .* c.dilation .- combined_pad + C_in = size(c.weight)[end-1] * c.groups batch_size = size(x)[end] # Create DenseConvDims() that looks like the corresponding conv() w_size = size(c.weight) - return DenseConvDims((I..., C_in, batch_size), w_size; - stride = c.stride, - padding = c.pad, - dilation = c.dilation, - groups = c.groups) + return DenseConvDims( + (I..., C_in, batch_size), + w_size; + stride = c.stride, + padding = c.pad, + dilation = c.dilation, + groups = c.groups, + ) end ChainRulesCore.@non_differentiable conv_transpose_dims(::Any, ::Any) @@ -344,14 +387,19 @@ function (c::ConvTranspose)(x::AbstractArray) end function Base.show(io::IO, l::ConvTranspose) - print(io, "ConvTranspose(", size(l.weight)[1:(ndims(l.weight) - 2)]) + print(io, "ConvTranspose(", size(l.weight)[1:(ndims(l.weight)-2)]) print(io, ", ", _channels_in(l), " => ", _channels_out(l)) _print_conv_opt(io, l) return print(io, ")") end -function calc_padding(::Type{ConvTranspose}, pad::SamePad, k::NTuple{N, T}, dilation, - stride) where {N, T} +function calc_padding( + ::Type{ConvTranspose}, + pad::SamePad, + k::NTuple{N,T}, + dilation, + stride, +) where {N,T} return calc_padding(Conv, pad, k .- stride .+ 1, dilation, stride) end @@ -379,17 +427,29 @@ julia> DepthwiseConv((5, 5), 3 => 9; stride = 2, pad = 2)(xs) |> size (50, 50, 9, 50) ``` """ -function DepthwiseConv(k::NTuple{<:Any, Integer}, ch::Pair{<:Integer, <:Integer}, - σ = identity; - stride = 1, pad = 0, dilation = 1, bias = true, - init = glorot_uniform) +function DepthwiseConv( + k::NTuple{<:Any,Integer}, + ch::Pair{<:Integer,<:Integer}, + σ = identity; + stride = 1, + pad = 0, + dilation = 1, + bias = true, + init = glorot_uniform, +) return Conv(k, ch, σ; groups = ch.first, stride, pad, dilation, bias, init) end -function DepthwiseConv(w::AbstractArray{T, N}, bias = true, σ = identity; - stride = 1, pad = 0, dilation = 1) where {T, N} - w2 = reshape(w, size(w)[1:(end - 2)]..., 1, :) - return Conv(w2, bias, σ; groups = size(w)[end - 1], stride, pad, dilation) +function DepthwiseConv( + w::AbstractArray{T,N}, + bias = true, + σ = identity; + stride = 1, + pad = 0, + dilation = 1, +) where {T,N} + w2 = reshape(w, size(w)[1:(end-2)]..., 1, :) + return Conv(w2, bias, σ; groups = size(w)[end-1], stride, pad, dilation) end """ @@ -419,13 +479,13 @@ julia> CrossCor((5, 5), 3 => 7; stride = 3, pad = (2, 0))(xs) |> size (34, 32, 7, 50) ``` """ -struct CrossCor{N, M, F, A, V} +struct CrossCor{N,M,F,A,V} σ::F weight::A bias::V - stride::NTuple{N, Int} - pad::NTuple{M, Int} - dilation::NTuple{N, Int} + stride::NTuple{N,Int} + pad::NTuple{M,Int} + dilation::NTuple{N,Int} end """ @@ -449,18 +509,31 @@ julia> layer(randn(100, 4, 64)) |> size (98, 5, 64) ``` """ -function CrossCor(w::AbstractArray{T, N}, bias = true, σ = identity; - stride = 1, pad = 0, dilation = 1) where {T, N} +function CrossCor( + w::AbstractArray{T,N}, + bias = true, + σ = identity; + stride = 1, + pad = 0, + dilation = 1, +) where {T,N} stride = expand(Val(N - 2), stride) dilation = expand(Val(N - 2), dilation) - pad = calc_padding(CrossCor, pad, size(w)[1:(N - 2)], dilation, stride) + pad = calc_padding(CrossCor, pad, size(w)[1:(N-2)], dilation, stride) b = _create_bias(w, bias, size(w, N)) return CrossCor(σ, w, b, stride, pad, dilation) end -function CrossCor(k::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - bias = true) where {N} +function CrossCor( + k::NTuple{N,Integer}, + ch::Pair{<:Integer,<:Integer}, + σ = identity; + init = glorot_uniform, + stride = 1, + pad = 0, + dilation = 1, + bias = true, +) where {N} weight = convfilter(k, ch; init = init) return CrossCor(weight, bias, σ; stride, pad, dilation) end @@ -473,8 +546,13 @@ function crosscor(x, w, ddims::DenseConvDims) end function crosscor_dims(c::CrossCor, x::AbstractArray) - return DenseConvDims(x, c.weight; stride = c.stride, padding = c.pad, - dilation = c.dilation) + return DenseConvDims( + x, + c.weight; + stride = c.stride, + padding = c.pad, + dilation = c.dilation, + ) end ChainRulesCore.@non_differentiable crosscor_dims(::Any, ::Any) @@ -486,9 +564,14 @@ function (c::CrossCor)(x::AbstractArray) end function Base.show(io::IO, l::CrossCor) - print(io, "CrossCor(", size(l.weight)[1:(ndims(l.weight) - 2)]) - print(io, ", ", size(l.weight, ndims(l.weight) - 1), " => ", - size(l.weight, ndims(l.weight))) + print(io, "CrossCor(", size(l.weight)[1:(ndims(l.weight)-2)]) + print( + io, + ", ", + size(l.weight, ndims(l.weight) - 1), + " => ", + size(l.weight, ndims(l.weight)), + ) _print_conv_opt(io, l) return print(io, ")") end @@ -516,13 +599,13 @@ julia> MaxPool((4, 4))(xs) ≈ AdaptiveMaxPool((25, 25))(xs) true ``` """ -struct AdaptiveMaxPool{S, O} - out::NTuple{O, Int} - AdaptiveMaxPool(out::NTuple{O, Int}) where {O} = new{O + 2, O}(out) +struct AdaptiveMaxPool{S,O} + out::NTuple{O,Int} + AdaptiveMaxPool(out::NTuple{O,Int}) where {O} = new{O + 2,O}(out) end -function (a::AdaptiveMaxPool{S})(x::AbstractArray{T, S}) where {S, T} - insize = size(x)[1:(end - 2)] +function (a::AdaptiveMaxPool{S})(x::AbstractArray{T,S}) where {S,T} + insize = size(x)[1:(end-2)] outsize = a.out stride = insize .÷ outsize k = insize .- (outsize .- 1) .* stride @@ -558,13 +641,13 @@ julia> MeanPool((4, 4))(xs) ≈ AdaptiveMeanPool((25, 25))(xs) true ``` """ -struct AdaptiveMeanPool{S, O} - out::NTuple{O, Int} - AdaptiveMeanPool(out::NTuple{O, Int}) where {O} = new{O + 2, O}(out) +struct AdaptiveMeanPool{S,O} + out::NTuple{O,Int} + AdaptiveMeanPool(out::NTuple{O,Int}) where {O} = new{O + 2,O}(out) end -function (a::AdaptiveMeanPool{S})(x::AbstractArray{T, S}) where {S, T} - insize = size(x)[1:(end - 2)] +function (a::AdaptiveMeanPool{S})(x::AbstractArray{T,S}) where {S,T} + insize = size(x)[1:(end-2)] outsize = a.out stride = insize .÷ outsize k = insize .- (outsize .- 1) .* stride @@ -605,7 +688,7 @@ function (g::GlobalMaxPool)(x) # Input size x_size = size(x) # Kernel size - k = x_size[1:(end - 2)] + k = x_size[1:(end-2)] # Pooling dimensions pdims = PoolDims(x, k) @@ -639,7 +722,7 @@ function (g::GlobalMeanPool)(x) # Input size x_size = size(x) # Kernel size - k = x_size[1:(end - 2)] + k = x_size[1:(end-2)] # Pooling dimensions pdims = PoolDims(x, k) @@ -689,13 +772,13 @@ julia> layer(rand(Float32, 100, 7, 50)) |> size (34, 7, 50) ``` """ -struct MaxPool{N, M} - k::NTuple{N, Int} - pad::NTuple{M, Int} - stride::NTuple{N, Int} +struct MaxPool{N,M} + k::NTuple{N,Int} + pad::NTuple{M,Int} + stride::NTuple{N,Int} end -function MaxPool(k::NTuple{N, Integer}; pad = 0, stride = k) where {N} +function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where {N} stride = expand(Val(N), stride) pad = calc_padding(MaxPool, pad, k, 1, stride) return MaxPool(k, pad, stride) @@ -748,13 +831,13 @@ julia> m(xs) |> size (20, 20, 7, 50) ``` """ -struct MeanPool{N, M} - k::NTuple{N, Int} - pad::NTuple{M, Int} - stride::NTuple{N, Int} +struct MeanPool{N,M} + k::NTuple{N,Int} + pad::NTuple{M,Int} + stride::NTuple{N,Int} end -function MeanPool(k::NTuple{N, Integer}; pad = 0, stride = k) where {N} +function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where {N} stride = expand(Val(N), stride) pad = calc_padding(MeanPool, pad, k, 1, stride) return MeanPool(k, pad, stride) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 43c0a317c6..437d709463 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -38,7 +38,11 @@ dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...) dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) function dropout_mask(rng, x::CuArray, p; kwargs...) - throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.")) + throw( + ArgumentError( + "x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.", + ), + ) end dropout_mask(rng, x, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) function _dropout_mask(rng, x, p; dims = :) @@ -92,10 +96,10 @@ julia> isapprox(count(==(0), y) / length(y), 0.5; atol = 0.1) true ``` """ -mutable struct Dropout{F, D, R <: AbstractRNG} +mutable struct Dropout{F,D,R<:AbstractRNG} p::F dims::D - active::Union{Bool, Nothing} + active::Union{Bool,Nothing} rng::R end Dropout(p, dims, active) = Dropout(p, dims, active, default_rng_value()) @@ -150,13 +154,13 @@ julia> isapprox(std(x), std(y); atol = 0.2) true ``` """ -mutable struct AlphaDropout{F, R <: AbstractRNG} +mutable struct AlphaDropout{F,R<:AbstractRNG} p::F - active::Union{Bool, Nothing} + active::Union{Bool,Nothing} rng::R function AlphaDropout(p, active, rng) @assert 0 ≤ p ≤ 1 - return new{typeof(p), typeof(rng)}(p, active, rng) + return new{typeof(p),typeof(rng)}(p, active, rng) end end AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value()) @@ -216,21 +220,25 @@ julia> isapprox(std(y; dims = 1:3), ones(1, 1, 1, 2); atol = 0.1) && true ``` """ -struct LayerNorm{F, D, T, N} +struct LayerNorm{F,D,T,N} λ::F diag::D ϵ::T - size::NTuple{N, Int} + size::NTuple{N,Int} affine::Bool end -function LayerNorm(size::Tuple{Vararg{Int}}, λ = identity; affine::Bool = true, - ϵ::Real = 1.0f-5) +function LayerNorm( + size::Tuple{Vararg{Int}}, + λ = identity; + affine::Bool = true, + ϵ::Real = 1.0f-5, +) diag = affine ? Scale(size..., λ) : λ != identity ? Base.Fix1(broadcast, λ) : identity return LayerNorm(λ, diag, ϵ, size, affine) end LayerNorm(size::Integer...; kw...) = LayerNorm(Int.(size); kw...) -LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:(end - 1)]), size_act[end]; kw...) +LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:(end-1)]), size_act[end]; kw...) @functor LayerNorm @@ -247,8 +255,12 @@ end # Compute the statistics on the slices specified by reduce_dims. # reduce_dims=[1,...,N-2,N] for BatchNorm # reduce_dims=[1,...,N-2] for InstanceNorm and GroupNorm -function _norm_layer_forward(l, x::AbstractArray{T, N}; reduce_dims, - affine_shape) where {T, N} +function _norm_layer_forward( + l, + x::AbstractArray{T,N}; + reduce_dims, + affine_shape, +) where {T,N} if !_isactive(l) && l.track_stats # testmode with tracked stats stats_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N) μ = reshape(l.μ, stats_shape) @@ -271,7 +283,7 @@ end @inline _norm_layer_forward(x, μ, σ², ϵ) = (x .- μ) ./ sqrt.(σ² .+ ϵ) -function _track_stats!(bn, x::AbstractArray{T, N}, μ, σ², reduce_dims) where {T, N} +function _track_stats!(bn, x::AbstractArray{T,N}, μ, σ², reduce_dims) where {T,N} V = eltype(bn.σ²) mtm = bn.momentum res_mtm = one(V) - mtm @@ -328,7 +340,7 @@ julia> isapprox(std(m(xs)), 1; atol = 0.1) && std(xs) != std(m(xs)) true ``` """ -mutable struct BatchNorm{F, V, N, W} +mutable struct BatchNorm{F,V,N,W} λ::F # activation function β::V # bias γ::V # scale @@ -338,23 +350,26 @@ mutable struct BatchNorm{F, V, N, W} momentum::N affine::Bool track_stats::Bool - active::Union{Bool, Nothing} + active::Union{Bool,Nothing} chs::Int # number of channels end -function BatchNorm(chs::Int, λ = identity; - initβ = zeros32, initγ = ones32, - affine = true, track_stats = true, - ϵ = 1.0f-5, momentum = 0.1f0) +function BatchNorm( + chs::Int, + λ = identity; + initβ = zeros32, + initγ = ones32, + affine = true, + track_stats = true, + ϵ = 1.0f-5, + momentum = 0.1f0, +) β = affine ? initβ(chs) : nothing γ = affine ? initγ(chs) : nothing μ = track_stats ? zeros32(chs) : nothing σ² = track_stats ? ones32(chs) : nothing - return BatchNorm(λ, β, γ, - μ, σ², ϵ, momentum, - affine, track_stats, - nothing, chs) + return BatchNorm(λ, β, γ, μ, σ², ϵ, momentum, affine, track_stats, nothing, chs) end @functor BatchNorm @@ -363,7 +378,7 @@ trainable(bn::BatchNorm) = hasaffine(bn) ? (β = bn.β, γ = bn.γ) : (;) function (BN::BatchNorm)(x) @assert size(x, ndims(x) - 1) == BN.chs N = ndims(x) - reduce_dims = [1:(N - 2); N] + reduce_dims = [1:(N-2); N] affine_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N) return _norm_layer_forward(BN, x; reduce_dims, affine_shape) end @@ -419,7 +434,7 @@ julia> isapprox(std(y; dims = 1:2), ones(1, 1, 3, 2); atol = 0.2) && true ``` """ -mutable struct InstanceNorm{F, V, N, W} +mutable struct InstanceNorm{F,V,N,W} λ::F # activation function β::V # bias γ::V # scale @@ -429,17 +444,25 @@ mutable struct InstanceNorm{F, V, N, W} momentum::N affine::Bool track_stats::Bool - active::Union{Bool, Nothing} + active::Union{Bool,Nothing} chs::Int # number of channels end -function InstanceNorm(chs::Int, λ = identity; - initβ = zeros32, initγ = ones32, - affine = false, track_stats = false, - ϵ = 1.0f-5, momentum = 0.1f0) +function InstanceNorm( + chs::Int, + λ = identity; + initβ = zeros32, + initγ = ones32, + affine = false, + track_stats = false, + ϵ = 1.0f-5, + momentum = 0.1f0, +) if track_stats - Base.depwarn("`track_stats=true` will be removed from InstanceNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", - :InstanceNorm) + Base.depwarn( + "`track_stats=true` will be removed from InstanceNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", + :InstanceNorm, + ) end β = affine ? initβ(chs) : nothing @@ -447,10 +470,7 @@ function InstanceNorm(chs::Int, λ = identity; μ = track_stats ? zeros32(chs) : nothing σ² = track_stats ? ones32(chs) : nothing - return InstanceNorm(λ, β, γ, - μ, σ², ϵ, momentum, - affine, track_stats, - nothing, chs) + return InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, affine, track_stats, nothing, chs) end @functor InstanceNorm @@ -460,7 +480,7 @@ function (l::InstanceNorm)(x) @assert ndims(x) > 2 @assert size(x, ndims(x) - 1) == l.chs N = ndims(x) - reduce_dims = 1:(N - 2) + reduce_dims = 1:(N-2) affine_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N) return _norm_layer_forward(l, x; reduce_dims, affine_shape) end @@ -522,7 +542,7 @@ true ``` # number of groups ``` """ -mutable struct GroupNorm{F, V, N, W} +mutable struct GroupNorm{F,V,N,W} G::Int # number of groups λ::F # activation function β::V # bias @@ -533,20 +553,29 @@ mutable struct GroupNorm{F, V, N, W} momentum::N affine::Bool track_stats::Bool - active::Union{Bool, Nothing} + active::Union{Bool,Nothing} chs::Int # number of channels end @functor GroupNorm trainable(gn::GroupNorm) = hasaffine(gn) ? (β = gn.β, γ = gn.γ) : (;) -function GroupNorm(chs::Int, G::Int, λ = identity; - initβ = zeros32, initγ = ones32, - affine = true, track_stats = false, - ϵ = 1.0f-5, momentum = 0.1f0) +function GroupNorm( + chs::Int, + G::Int, + λ = identity; + initβ = zeros32, + initγ = ones32, + affine = true, + track_stats = false, + ϵ = 1.0f-5, + momentum = 0.1f0, +) if track_stats - Base.depwarn("`track_stats=true` will be removed from GroupNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", - :GroupNorm) + Base.depwarn( + "`track_stats=true` will be removed from GroupNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", + :GroupNorm, + ) end chs % G == 0 || @@ -557,12 +586,7 @@ function GroupNorm(chs::Int, G::Int, λ = identity; μ = track_stats ? zeros32(G) : nothing σ² = track_stats ? ones32(G) : nothing - return GroupNorm(G, λ, - β, γ, - μ, σ², - ϵ, momentum, - affine, track_stats, - nothing, chs) + return GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum, affine, track_stats, nothing, chs) end function (gn::GroupNorm)(x) @@ -570,9 +594,9 @@ function (gn::GroupNorm)(x) @assert size(x, ndims(x) - 1) == gn.chs N = ndims(x) sz = size(x) - x = reshape(x, sz[1:(N - 2)]..., sz[N - 1] ÷ gn.G, gn.G, sz[N]) + x = reshape(x, sz[1:(N-2)]..., sz[N-1] ÷ gn.G, gn.G, sz[N]) N = ndims(x) - reduce_dims = 1:(N - 2) + reduce_dims = 1:(N-2) affine_shape = ntuple(i -> i ∈ (N - 1, N - 2) ? size(x, i) : 1, N) x = _norm_layer_forward(gn, x; reduce_dims, affine_shape) return reshape(x, sz) @@ -598,4 +622,4 @@ scale parameters, `false` otherwise. See [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`LayerNorm`](@ref). """ -hasaffine(l::Union{BatchNorm, InstanceNorm, LayerNorm, GroupNorm}) = l.affine +hasaffine(l::Union{BatchNorm,InstanceNorm,LayerNorm,GroupNorm}) = l.affine diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index e1e1b55519..5fdf1e7d00 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -19,13 +19,13 @@ function ChainRulesCore.rrule(::typeof(multigate), x::AbstractArray, h, c) end # Type stable and AD-friendly helper for iterating over the last dimension of an array -function eachlastdim(A::AbstractArray{T, N}) where {T, N} +function eachlastdim(A::AbstractArray{T,N}) where {T,N} inds_before = ntuple(_ -> :, N - 1) return (view(A, inds_before..., i) for i in axes(A, N)) end # adapted from https://github.com/JuliaDiff/ChainRules.jl/blob/f13e0a45d10bb13f48d6208e9c9d5b4a52b96732/src/rulesets/Base/indexing.jl#L77 -function ∇eachlastdim(dys_raw, x::AbstractArray{T, N}) where {T, N} +function ∇eachlastdim(dys_raw, x::AbstractArray{T,N}) where {T,N} dys = unthunk(dys_raw) i1 = findfirst(dy -> dy isa AbstractArray, dys) if isnothing(i1) # all slices are Zero! @@ -44,7 +44,7 @@ function ∇eachlastdim(dys_raw, x::AbstractArray{T, N}) where {T, N} return ProjectTo(x)(dx) end -function ChainRulesCore.rrule(::typeof(eachlastdim), x::AbstractArray{T, N}) where {T, N} +function ChainRulesCore.rrule(::typeof(eachlastdim), x::AbstractArray{T,N}) where {T,N} lastdims(dy) = (NoTangent(), ∇eachlastdim(unthunk(dy), x)) return collect(eachlastdim(x)), lastdims end @@ -126,7 +126,7 @@ julia> rnn.state 60 ``` """ -mutable struct Recur{T, S} +mutable struct Recur{T,S} cell::T state::S end @@ -183,7 +183,7 @@ reset!(m) = foreach(reset!, functor(m)[1]) flip(f, xs) = reverse([f(x) for x in reverse(xs)]) -function (m::Recur)(x::AbstractArray{T, 3}) where {T} +function (m::Recur)(x::AbstractArray{T,3}) where {T} h = [m(x_t) for x_t in eachlastdim(x)] sze = size(h[1]) return reshape(reduce(hcat, h), sze[1], sze[2], length(h)) @@ -192,23 +192,31 @@ end # Vanilla RNN struct RNNCell{F,I,H,V,S} - σ::F - Wi::I - Wh::H - b::V - state0::S + σ::F + Wi::I + Wh::H + b::V + state0::S end -function RNNCell((in, out)::Pair, σ = tanh; init = Flux.glorot_uniform, initb = zeros32, - init_state = zeros32) +function RNNCell( + (in, out)::Pair, + σ = tanh; + init = Flux.glorot_uniform, + initb = zeros32, + init_state = zeros32, +) return RNNCell(σ, init(out, in), init(out, out), initb(out), init_state(out, 1)) end -function (m::RNNCell{F,I,H,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {F,I,H,V,T} - Wi, Wh, b = m.Wi, m.Wh, m.b - σ = NNlib.fast_act(m.σ, x) - h = σ.(Wi*x .+ Wh*h .+ b) - return h, reshape_cell_output(h, x) +function (m::RNNCell{F,I,H,V,<:AbstractMatrix{T}})( + h, + x::Union{AbstractVecOrMat{T},OneHotArray}, +) where {F,I,H,V,T} + Wi, Wh, b = m.Wi, m.Wh, m.b + σ = NNlib.fast_act(m.σ, x) + h = σ.(Wi * x .+ Wh * h .+ b) + return h, reshape_cell_output(h, x) end @functor RNNCell @@ -295,29 +303,38 @@ Recur(m::RNNCell) = Recur(m, m.state0) # LSTM struct LSTMCell{I,H,V,S} - Wi::I - Wh::H - b::V - state0::S + Wi::I + Wh::H + b::V + state0::S end -function LSTMCell((in, out)::Pair; - init = glorot_uniform, - initb = zeros32, - init_state = zeros32) - cell = LSTMCell(init(out * 4, in), init(out * 4, out), initb(out * 4), - (init_state(out, 1), init_state(out, 1))) +function LSTMCell( + (in, out)::Pair; + init = glorot_uniform, + initb = zeros32, + init_state = zeros32, +) + cell = LSTMCell( + init(out * 4, in), + init(out * 4, out), + initb(out * 4), + (init_state(out, 1), init_state(out, 1)), + ) cell.b[gate(out, 2)] .= 1 return cell end -function (m::LSTMCell{I,H,V,<:NTuple{2,AbstractMatrix{T}}})((h, c), x::Union{AbstractVecOrMat{T},OneHotArray}) where {I,H,V,T} - b, o = m.b, size(h, 1) - g = muladd(m.Wi, x, muladd(m.Wh, h, b)) - input, forget, cell, output = multigate(g, o, Val(4)) - c′ = @. sigmoid_fast(forget) * c + sigmoid_fast(input) * tanh_fast(cell) - h′ = @. sigmoid_fast(output) * tanh_fast(c′) - return (h′, c′), reshape_cell_output(h′, x) +function (m::LSTMCell{I,H,V,<:NTuple{2,AbstractMatrix{T}}})( + (h, c), + x::Union{AbstractVecOrMat{T},OneHotArray}, +) where {I,H,V,T} + b, o = m.b, size(h, 1) + g = muladd(m.Wi, x, muladd(m.Wh, h, b)) + input, forget, cell, output = multigate(g, o, Val(4)) + c′ = @. sigmoid_fast(forget) * c + sigmoid_fast(input) * tanh_fast(cell) + h′ = @. sigmoid_fast(output) * tanh_fast(c′) + return (h′, c′), reshape_cell_output(h′, x) end @functor LSTMCell @@ -376,25 +393,37 @@ function _gru_output(gxs, ghs, bs) end struct GRUCell{I,H,V,S} - Wi::I - Wh::H - b::V - state0::S + Wi::I + Wh::H + b::V + state0::S end -function GRUCell((in, out)::Pair; init = glorot_uniform, initb = zeros32, - init_state = zeros32) - return GRUCell(init(out * 3, in), init(out * 3, out), initb(out * 3), - init_state(out, 1)) +function GRUCell( + (in, out)::Pair; + init = glorot_uniform, + initb = zeros32, + init_state = zeros32, +) + return GRUCell( + init(out * 3, in), + init(out * 3, out), + initb(out * 3), + init_state(out, 1), + ) end -function (m::GRUCell{I,H,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {I,H,V,T} - Wi, Wh, b, o = m.Wi, m.Wh, m.b, size(h, 1) - gxs, ghs, bs = multigate(Wi*x, o, Val(3)), multigate(Wh*h, o, Val(3)), multigate(b, o, Val(3)) - r, z = _gru_output(gxs, ghs, bs) - h̃ = @. tanh_fast(gxs[3] + r * ghs[3] + bs[3]) - h′ = @. (1 - z) * h̃ + z * h - return h′, reshape_cell_output(h′, x) +function (m::GRUCell{I,H,V,<:AbstractMatrix{T}})( + h, + x::Union{AbstractVecOrMat{T},OneHotArray}, +) where {I,H,V,T} + Wi, Wh, b, o = m.Wi, m.Wh, m.b, size(h, 1) + gxs, ghs, bs = + multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(3)), multigate(b, o, Val(3)) + r, z = _gru_output(gxs, ghs, bs) + h̃ = @. tanh_fast(gxs[3] + r * ghs[3] + bs[3]) + h′ = @. (1 - z) * h̃ + z * h + return h′, reshape_cell_output(h′, x) end @functor GRUCell @@ -448,26 +477,39 @@ Recur(m::GRUCell) = Recur(m, m.state0) # GRU v3 struct GRUv3Cell{I,H,V,HH,S} - Wi::I - Wh::H - b::V - Wh_h̃::HH - state0::S + Wi::I + Wh::H + b::V + Wh_h̃::HH + state0::S end -function GRUv3Cell((in, out)::Pair; init = glorot_uniform, initb = zeros32, - init_state = zeros32) - return GRUv3Cell(init(out * 3, in), init(out * 2, out), initb(out * 3), - init(out, out), init_state(out, 1)) +function GRUv3Cell( + (in, out)::Pair; + init = glorot_uniform, + initb = zeros32, + init_state = zeros32, +) + return GRUv3Cell( + init(out * 3, in), + init(out * 2, out), + initb(out * 3), + init(out, out), + init_state(out, 1), + ) end -function (m::GRUv3Cell{I,H,V,HH,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {I,H,V,HH,T} - Wi, Wh, b, Wh_h̃, o = m.Wi, m.Wh, m.b, m.Wh_h̃, size(h, 1) - gxs, ghs, bs = multigate(Wi*x, o, Val(3)), multigate(Wh*h, o, Val(2)), multigate(b, o, Val(3)) - r, z = _gru_output(gxs, ghs, bs) - h̃ = tanh_fast.(gxs[3] .+ (Wh_h̃ * (r .* h)) .+ bs[3]) - h′ = @. (1 - z) * h̃ + z * h - return h′, reshape_cell_output(h′, x) +function (m::GRUv3Cell{I,H,V,HH,<:AbstractMatrix{T}})( + h, + x::Union{AbstractVecOrMat{T},OneHotArray}, +) where {I,H,V,HH,T} + Wi, Wh, b, Wh_h̃, o = m.Wi, m.Wh, m.b, m.Wh_h̃, size(h, 1) + gxs, ghs, bs = + multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(2)), multigate(b, o, Val(3)) + r, z = _gru_output(gxs, ghs, bs) + h̃ = tanh_fast.(gxs[3] .+ (Wh_h̃ * (r .* h)) .+ bs[3]) + h′ = @. (1 - z) * h̃ + z * h + return h′, reshape_cell_output(h′, x) end @functor GRUv3Cell diff --git a/src/layers/show.jl b/src/layers/show.jl index 8918bcd51c..b2e69d0b75 100644 --- a/src/layers/show.jl +++ b/src/layers/show.jl @@ -1,6 +1,11 @@ for T in [ - :Chain, :Parallel, :SkipConnection, :Recur, :Maxout, :PairwiseFusion, # container types + :Chain, + :Parallel, + :SkipConnection, + :Recur, + :Maxout, + :PairwiseFusion, # container types ] @eval function Base.show(io::IO, m::MIME"text/plain", x::$T) if get(io, :typeinfo, nothing) === nothing # e.g. top level in REPL @@ -25,8 +30,8 @@ function _big_show(io::IO, obj, indent::Int = 0, name = nothing) for k in Base.keys(obj) _big_show(io, obj[k], indent + 2, k) end - elseif obj isa Parallel{<:Any, <:NamedTuple} || - obj isa PairwiseFusion{<:Any, <:NamedTuple} + elseif obj isa Parallel{<:Any,<:NamedTuple} || + obj isa PairwiseFusion{<:Any,<:NamedTuple} _big_show(io, obj.connection, indent + 2) for k in Base.keys(obj) _big_show(io, obj[k], indent + 2, k) @@ -58,8 +63,17 @@ _show_children(p::Parallel) = (p.connection, p.layers...) _show_children(f::PairwiseFusion) = (f.connection, f.layers...) for T in [ - :Conv, :ConvTranspose, :CrossCor, :Dense, :Scale, :Bilinear, :Embedding, - :BatchNorm, :LayerNorm, :InstanceNorm, :GroupNorm, + :Conv, + :ConvTranspose, + :CrossCor, + :Dense, + :Scale, + :Bilinear, + :Embedding, + :BatchNorm, + :LayerNorm, + :InstanceNorm, + :GroupNorm, ] @eval function Base.show(io::IO, m::MIME"text/plain", x::$T) if !get(io, :compact, false) @@ -76,12 +90,22 @@ function _layer_show(io::IO, layer, indent::Int = 0, name = nothing) print(io, " "^indent, str, indent == 0 ? "" : ",") if !isempty(params(layer)) print(io, " "^max(2, (indent == 0 ? 20 : 39) - indent - length(str))) - printstyled(io, "# ", underscorise(sum(length, params(layer))), " parameters"; - color = :light_black) + printstyled( + io, + "# ", + underscorise(sum(length, params(layer))), + " parameters"; + color = :light_black, + ) nonparam = _childarray_sum(length, layer) - sum(length, params(layer)) if nonparam > 0 - printstyled(io, ", plus ", underscorise(nonparam), - indent == 0 ? " non-trainable" : ""; color = :light_black) + printstyled( + io, + ", plus ", + underscorise(nonparam), + indent == 0 ? " non-trainable" : ""; + color = :light_black, + ) end _nan_show(io, params(layer)) end @@ -96,15 +120,35 @@ function _big_finale(io::IO, m) noncnt = _childarray_sum(_ -> 1, m) - length(ps) if noncnt > 0 nonparam = underscorise(_childarray_sum(length, m) - sum(length, ps)) - printstyled(io, " "^08, "# Total: ", length(ps), " trainable arrays, "; - color = :light_black) + printstyled( + io, + " "^08, + "# Total: ", + length(ps), + " trainable arrays, "; + color = :light_black, + ) println(io, pars, " parameters,") - printstyled(io, " "^10, "# plus ", noncnt, " non-trainable, ", nonparam, - " parameters, summarysize "; color = :light_black) + printstyled( + io, + " "^10, + "# plus ", + noncnt, + " non-trainable, ", + nonparam, + " parameters, summarysize "; + color = :light_black, + ) print(io, bytes, ".") else - printstyled(io, " "^18, "# Total: ", length(ps), " arrays, "; - color = :light_black) + printstyled( + io, + " "^18, + "# Total: ", + length(ps), + " arrays, "; + color = :light_black, + ) print(io, pars, " parameters, ", bytes, ".") end end diff --git a/src/layers/upsample.jl b/src/layers/upsample.jl index dad2a512bb..d67190a49b 100644 --- a/src/layers/upsample.jl +++ b/src/layers/upsample.jl @@ -31,7 +31,7 @@ julia> m(ones(2, 2, 1, 1)) |> size (4, 5, 1, 1) ``` """ -struct Upsample{mode, S, T} +struct Upsample{mode,S,T} scale::S size::T end @@ -42,26 +42,26 @@ function Upsample(mode::Symbol = :nearest; scale = nothing, size = nothing) if !(isnothing(scale) ⊻ isnothing(size)) throw(ArgumentError("Either scale or size should be specified (but not both).")) end - return Upsample{mode, typeof(scale), typeof(size)}(scale, size) + return Upsample{mode,typeof(scale),typeof(size)}(scale, size) end Upsample(scale, mode::Symbol = :nearest) = Upsample(mode; scale) (m::Upsample{:nearest})(x::AbstractArray) = NNlib.upsample_nearest(x, m.scale) -function (m::Upsample{:nearest, Int})(x::AbstractArray{T, N}) where {T, N} +function (m::Upsample{:nearest,Int})(x::AbstractArray{T,N}) where {T,N} return NNlib.upsample_nearest(x, ntuple(i -> m.scale, N - 2)) end -function (m::Upsample{:nearest, Nothing})(x::AbstractArray) +function (m::Upsample{:nearest,Nothing})(x::AbstractArray) return NNlib.upsample_nearest(x; size = m.size) end (m::Upsample{:bilinear})(x::AbstractArray) = NNlib.upsample_bilinear(x, m.scale) -function (m::Upsample{:bilinear, Nothing})(x::AbstractArray) +function (m::Upsample{:bilinear,Nothing})(x::AbstractArray) return NNlib.upsample_bilinear(x; size = m.size) end (m::Upsample{:trilinear})(x::AbstractArray) = NNlib.upsample_trilinear(x, m.scale) -function (m::Upsample{:trilinear, Nothing})(x::AbstractArray) +function (m::Upsample{:trilinear,Nothing})(x::AbstractArray) return NNlib.upsample_trilinear(x; size = m.size) end diff --git a/src/loading.jl b/src/loading.jl index 35e3868189..0dd73a0d59 100644 --- a/src/loading.jl +++ b/src/loading.jl @@ -23,16 +23,19 @@ function loadleaf!(dst::AbstractArray, src::AbstractArray, err) end function _tie_check(dst::Bool, src::AbstractArray) - return iszero(dst) || - error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.") + return iszero(dst) || error( + "Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.", + ) end function _tie_check(dst::AbstractArray, src::Bool) - return (iszero(dst) && iszero(src)) || - error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.") + return (iszero(dst) && iszero(src)) || error( + "Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.", + ) end function _tie_check(dst::AbstractArray, src::AbstractArray) - return (dst == src) || - error("Encountered tied destination parameters with untied and mismatched sources.") + return (dst == src) || error( + "Encountered tied destination parameters with untied and mismatched sources.", + ) end _tie_check(dst, src) = true @@ -97,10 +100,13 @@ but copying a `src` value of `true` will error. function loadmodel!(dst, src; filter = _ -> true, cache = Base.IdSet()) ldsts = _filter_children(filter, functor(dst)[1]) lsrcs = _filter_children(filter, functor(src)[1]) - (keys(ldsts) == keys(lsrcs)) || - throw(ArgumentError("Tried to load $src into $dst but the structures do not match.")) + (keys(ldsts) == keys(lsrcs)) || throw( + ArgumentError("Tried to load $src into $dst but the structures do not match."), + ) - err = DimensionMismatch("Tried to load $src into $dst but the parameter sizes do not match.") + err = DimensionMismatch( + "Tried to load $src into $dst but the parameter sizes do not match.", + ) foreach(ldsts, lsrcs) do ldst, lsrc if ldst in cache # we already loaded this parameter before _tie_check(ldst, lsrc) && return ldst diff --git a/src/losses/Losses.jl b/src/losses/Losses.jl index 863d075916..a35f93af03 100644 --- a/src/losses/Losses.jl +++ b/src/losses/Losses.jl @@ -9,17 +9,24 @@ using CUDA using NNlib: logsoftmax, logσ, ctc_loss, ctc_alpha, ∇ctc_loss import Base.Broadcast: broadcasted -export mse, mae, msle, - label_smoothing, - crossentropy, logitcrossentropy, - binarycrossentropy, logitbinarycrossentropy, - kldivergence, - huber_loss, - tversky_loss, - dice_coeff_loss, - poisson_loss, - hinge_loss, squared_hinge_loss, - binary_focal_loss, focal_loss, siamese_contrastive_loss +export mse, + mae, + msle, + label_smoothing, + crossentropy, + logitcrossentropy, + binarycrossentropy, + logitbinarycrossentropy, + kldivergence, + huber_loss, + tversky_loss, + dice_coeff_loss, + poisson_loss, + hinge_loss, + squared_hinge_loss, + binary_focal_loss, + focal_loss, + siamese_contrastive_loss include("utils.jl") include("functions.jl") diff --git a/src/losses/functions.jl b/src/losses/functions.jl index 65b6b2fe60..674fe3065c 100644 --- a/src/losses/functions.jl +++ b/src/losses/functions.jl @@ -157,7 +157,7 @@ julia> Flux.crossentropy(y_dis, y) > Flux.crossentropy(y_dis, y_smoothed) true ``` """ -function label_smoothing(y::Union{AbstractArray, Number}, α::Number; dims::Int = 1) +function label_smoothing(y::Union{AbstractArray,Number}, α::Number; dims::Int = 1) if !(0 < α < 1) throw(ArgumentError("α must be between 0 and 1")) end @@ -320,7 +320,7 @@ julia> Flux.crossentropy(y_prob, y_hot) """ function binarycrossentropy(ŷ, y; agg = mean, ϵ = epseltype(ŷ)) _check_sizes(ŷ, y) - return agg(@.(-xlogy(y, ŷ + ϵ)-xlogy(1 - y, 1 - ŷ + ϵ))) + return agg(@.(-xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ))) end """ @@ -351,7 +351,7 @@ julia> Flux.binarycrossentropy(sigmoid.(y_model), y_bin) """ function logitbinarycrossentropy(ŷ, y; agg = mean) _check_sizes(ŷ, y) - return agg(@.((1 - y) * ŷ-logσ(ŷ))) + return agg(@.((1 - y) * ŷ - logσ(ŷ))) end """ diff --git a/src/losses/utils.jl b/src/losses/utils.jl index cda3e4a557..43aab12a05 100644 --- a/src/losses/utils.jl +++ b/src/losses/utils.jl @@ -21,17 +21,19 @@ end @adjoint function broadcasted(::typeof(xlogy), x::Zygote.Numeric, y::Zygote.Numeric) res = xlogy.(x, y) return res, - Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), - Zygote.unbroadcast(y, Δ .* x ./ y)) + Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), Zygote.unbroadcast(y, Δ .* x ./ y)) end ChainRulesCore.@scalar_rule xlogy(x, y) (log(y), x / y) # should help Diffractor's broadcasting -ChainRulesCore.@scalar_rule xlogx(x) (log(y)+true) +ChainRulesCore.@scalar_rule xlogx(x) (log(y) + true) function _check_sizes(ŷ::AbstractArray, y::AbstractArray) - for d in 1:max(ndims(ŷ), ndims(y)) - size(ŷ, d) == size(y, d) || - throw(DimensionMismatch("loss function expects size(ŷ) = $(size(ŷ)) to match size(y) = $(size(y))")) + for d = 1:max(ndims(ŷ), ndims(y)) + size(ŷ, d) == size(y, d) || throw( + DimensionMismatch( + "loss function expects size(ŷ) = $(size(ŷ)) to match size(y) = $(size(y))", + ), + ) end end _check_sizes(ŷ, y) = nothing # pass-through, for constant label e.g. y = 1 diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl index fa78f513d8..5bc95d0ab2 100644 --- a/src/optimise/Optimise.jl +++ b/src/optimise/Optimise.jl @@ -3,11 +3,30 @@ module Optimise using LinearAlgebra import ArrayInterface -export train!, update!, - Descent, Adam, Momentum, Nesterov, RMSProp, - AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, AdamW, RAdam, OAdam, AdaBelief, - InvDecay, ExpDecay, WeightDecay, stop, skip, Optimiser, - ClipValue, ClipNorm +export train!, + update!, + Descent, + Adam, + Momentum, + Nesterov, + RMSProp, + AdaGrad, + AdaMax, + AdaDelta, + AMSGrad, + NAdam, + AdamW, + RAdam, + OAdam, + AdaBelief, + InvDecay, + ExpDecay, + WeightDecay, + stop, + skip, + Optimiser, + ClipValue, + ClipNorm include("optimisers.jl") include("train.jl") diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index f4d9687384..e7e40012c6 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -172,9 +172,9 @@ opt = Adam(0.001, (0.9, 0.8)) """ mutable struct Adam <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64, Float64} + beta::Tuple{Float64,Float64} epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end Adam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = Adam(η, β, ϵ, IdDict()) Adam(η::Real, β::Tuple, state::IdDict) = Adam(η, β, EPS, state) @@ -183,9 +183,12 @@ function apply!(o::Adam, x, Δ) η, β = o.eta, o.beta mt, vt, βp = get!(o.state, x) do - return (zero(x), zero(x), - Float64[β[1], β[2]]) - end::Tuple{typeof(x), typeof(x), Vector{Float64}} + return ( + zero(x), + zero(x), + Float64[β[1], β[2]], + ) + end::Tuple{typeof(x),typeof(x),Vector{Float64}} @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) @@ -215,9 +218,9 @@ opt = RAdam(0.001, (0.9, 0.8)) """ mutable struct RAdam <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64, Float64} + beta::Tuple{Float64,Float64} epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end RAdam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = RAdam(η, β, ϵ, IdDict()) RAdam(η::Real, β::Tuple, state::IdDict) = RAdam(η, β, EPS, state) @@ -226,11 +229,14 @@ function apply!(o::RAdam, x, Δ) η, β = o.eta, o.beta ρ∞ = 2 / (1 - β[2]) - 1 - mt, vt, βp, t = get!(o.state, - x) do - return (zero(x), zero(x), Float64[β[1], β[2]], - Ref(1)) - end::Tuple{typeof(x), typeof(x), Vector{Float64}, Base.RefValue{Int}} + mt, vt, βp, t = get!(o.state, x) do + return ( + zero(x), + zero(x), + Float64[β[1], β[2]], + Ref(1), + ) + end::Tuple{typeof(x),typeof(x),Vector{Float64},Base.RefValue{Int}} @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) @@ -267,9 +273,9 @@ opt = AdaMax(0.001, (0.9, 0.995)) """ mutable struct AdaMax <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64, Float64} + beta::Tuple{Float64,Float64} epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end AdaMax(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = AdaMax(η, β, ϵ, IdDict()) AdaMax(η::Real, β::Tuple, state::IdDict) = AdaMax(η, β, EPS, state) @@ -278,9 +284,12 @@ function apply!(o::AdaMax, x, Δ) η, β = o.eta, o.beta mt, ut, βp = get!(o.state, x) do - return (zero(x), zero(x), - Float64[β[1], β[2]]) - end::Tuple{typeof(x), typeof(x), Vector{Float64}} + return ( + zero(x), + zero(x), + Float64[β[1], β[2]], + ) + end::Tuple{typeof(x),typeof(x),Vector{Float64}} @. mt = β[1] * mt + (1 - β[1]) * Δ @. ut = max(β[2] * ut, abs(Δ)) @@ -311,9 +320,9 @@ opt = OAdam(0.001, (0.9, 0.995)) """ mutable struct OAdam <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64, Float64} + beta::Tuple{Float64,Float64} epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end OAdam(η::Real = 0.001, β::Tuple = (0.5, 0.9), ϵ::Real = EPS) = OAdam(η, β, ϵ, IdDict()) OAdam(η::Real, β::Tuple, state::IdDict) = RMSProp(η, β, EPS, state) @@ -321,11 +330,14 @@ OAdam(η::Real, β::Tuple, state::IdDict) = RMSProp(η, β, EPS, state) function apply!(o::OAdam, x, Δ) η, β = o.eta, o.beta - mt, vt, Δ_, βp = get!(o.state, - x) do - return (zero(x), zero(x), zero(x), - Float64[β[1], β[2]]) - end::Tuple{typeof(x), typeof(x), typeof(x), Vector{Float64}} + mt, vt, Δ_, βp = get!(o.state, x) do + return ( + zero(x), + zero(x), + zero(x), + Float64[β[1], β[2]], + ) + end::Tuple{typeof(x),typeof(x),typeof(x),Vector{Float64}} @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) @@ -390,14 +402,14 @@ opt = AdaDelta(0.89) mutable struct AdaDelta <: AbstractOptimiser rho::Float64 epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end AdaDelta(ρ::Real = 0.9, ϵ::Real = EPS) = AdaDelta(ρ, ϵ, IdDict()) AdaDelta(ρ::Real, state::IdDict) = AdaDelta(ρ, EPS, state) function apply!(o::AdaDelta, x, Δ) ρ = o.rho - acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2, typeof(x)} + acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2,typeof(x)} @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ) # DON'T remove epsilon from numerator # or even out of the square roots @@ -427,9 +439,9 @@ opt = AMSGrad(0.001, (0.89, 0.995)) """ mutable struct AMSGrad <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64, Float64} + beta::Tuple{Float64,Float64} epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end AMSGrad(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AMSGrad(η, β, ϵ, IdDict()) AMSGrad(η::Real, β::Tuple, state::IdDict) = AMSGrad(η, β, EPS, state) @@ -438,9 +450,12 @@ function apply!(o::AMSGrad, x, Δ) η, β = o.eta, o.beta mt, vt, v̂t = get!(o.state, x) do - return (fill!(similar(x), o.epsilon), fill!(similar(x), o.epsilon), - fill!(similar(x), o.epsilon)) - end::NTuple{3, typeof(x)} + return ( + fill!(similar(x), o.epsilon), + fill!(similar(x), o.epsilon), + fill!(similar(x), o.epsilon), + ) + end::NTuple{3,typeof(x)} @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ^2 @@ -469,9 +484,9 @@ opt = NAdam(0.002, (0.89, 0.995)) """ mutable struct NAdam <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64, Float64} + beta::Tuple{Float64,Float64} epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end NAdam(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = NAdam(η, β, ϵ, IdDict()) NAdam(η::Real, β::Tuple, state::IdDict) = NAdam(η, β, EPS, state) @@ -480,15 +495,19 @@ function apply!(o::NAdam, x, Δ) η, β = o.eta, o.beta mt, vt, βp = get!(o.state, x) do - return (zero(x), zero(x), - Float64[o.beta[1], o.beta[2]]) - end::Tuple{typeof(x), typeof(x), Vector{Float64}} + return ( + zero(x), + zero(x), + Float64[o.beta[1], o.beta[2]], + ) + end::Tuple{typeof(x),typeof(x),Vector{Float64}} β1p, β2p = βp @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) - @. Δ = (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) / - (√(vt * β[2] / (1 - β2p)) + o.epsilon) * η + @. Δ = + (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) / + (√(vt * β[2] / (1 - β2p)) + o.epsilon) * η βp .= βp .* β return Δ @@ -539,9 +558,9 @@ opt = AdaBelief(0.001, (0.9, 0.8)) """ mutable struct AdaBelief <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64, Float64} + beta::Tuple{Float64,Float64} epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end AdaBelief(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AdaBelief(η, β, ϵ, IdDict()) AdaBelief(η::Real, β::Tuple, state::IdDict) = AdaBelief(η, β, EPS, state) @@ -550,9 +569,12 @@ function apply!(o::AdaBelief, x, Δ) η, β = o.eta, o.beta mt, st, βp = get!(o.state, x) do - return (zero(x), zero(x), - Float64[β[1], β[2]]) - end::Tuple{typeof(x), typeof(x), Vector{Float64}} + return ( + zero(x), + zero(x), + Float64[β[1], β[2]], + ) + end::Tuple{typeof(x),typeof(x),Vector{Float64}} #= st is a variance and can go to zero. This is in contrast to Adam, which uses the second moment which is usually far enough from zero. This is problematic, since st @@ -587,8 +609,12 @@ end Optimiser(opts::AbstractOptimiser...) = Optimiser(Any[opts...]) -@forward Optimiser.os Base.getindex, Base.first, Base.last, Base.lastindex, Base.push!, - Base.setindex! +@forward Optimiser.os Base.getindex, +Base.first, +Base.last, +Base.lastindex, +Base.push!, +Base.setindex! @forward Optimiser.os Base.iterate Base.getindex(c::Optimiser, i::AbstractArray) = Optimiser(c.os[i]...) @@ -623,10 +649,10 @@ opt = Optimiser(Adam(1.0f-3), InvDecay(1.0f-2)) """ mutable struct InvDecay <: AbstractOptimiser gamma::Float64 - state::IdDict{Any, Int} + state::IdDict{Any,Int} end -InvDecay(γ = 0.001) = InvDecay(γ, IdDict{Any, Int}()) +InvDecay(γ = 0.001) = InvDecay(γ, IdDict{Any,Int}()) function apply!(o::InvDecay, x, Δ) γ = o.gamma @@ -683,7 +709,8 @@ end function apply!(o::ExpDecay, x, Δ) η, s, decay, start = o.eta, o.step, o.decay, o.start n = o.current[x] = get(o.current, x, 0) + 1 - if n > start && n % s == 0 && + if n > start && + n % s == 0 && count(x -> x > start && x % s == 0, values(o.current)) == 1 η = max(η * decay, o.clip) o.eta = η diff --git a/src/optimise/train.jl b/src/optimise/train.jl index e32451b0da..bead5860f0 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -50,9 +50,11 @@ end ``` """ function skip() - Base.depwarn("""Flux.skip() will be removed from Flux 0.14. - and should be replaced with `continue` in an ordinary `for` loop.""", - :skip) + Base.depwarn( + """Flux.skip() will be removed from Flux 0.14. + and should be replaced with `continue` in an ordinary `for` loop.""", + :skip, + ) throw(SkipException()) end @@ -77,8 +79,11 @@ end ``` """ function stop() - Base.depwarn("""Flux.stop() will be removed from Flux 0.14. - It should be replaced with `break` in an ordinary `for` loop.""", :stop) + Base.depwarn( + """Flux.stop() will be removed from Flux 0.14. + It should be replaced with `break` in an ordinary `for` loop.""", + :stop, + ) throw(StopException()) end @@ -173,11 +178,14 @@ hello ``` """ macro epochs(n, ex) - Base.depwarn("""The macro `@epochs` will be removed from Flux 0.14. - As an alternative, you can write a simple `for i in 1:epochs` loop.""", - Symbol("@epochs"); force = true) - return :(@progress for i in 1:($(esc(n))) - @info "Epoch $i" - $(esc(ex)) - end) + Base.depwarn( + """The macro `@epochs` will be removed from Flux 0.14. + As an alternative, you can write a simple `for i in 1:epochs` loop.""", + Symbol("@epochs"); + force = true, + ) + return :(@progress for i = 1:($(esc(n))) + @info "Epoch $i" + $(esc(ex)) + end) end diff --git a/src/outputsize.jl b/src/outputsize.jl index ec87107adc..65f006d54f 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -11,19 +11,33 @@ Unlike `Nothing` and `Missing` it is a number: `Nil <: Real <: Number`. """ struct Nil <: Real end -@doc @doc(Nil) -const nil = Nil() +@doc @doc(Nil) const nil = Nil() -Nil(::T) where {T <: Number} = nil -(::Type{T})(::Nil) where {T <: Number} = nil +Nil(::T) where {T<:Number} = nil +(::Type{T})(::Nil) where {T<:Number} = nil Base.convert(::Type{Nil}, ::Number) = nil Base.float(::Type{Nil}) = Nil -for f in [:copy, :zero, :one, :oneunit, - :+, :-, :abs, :abs2, :inv, - :exp, :log, :log1p, :log2, :log10, - :sqrt, :tanh, :conj] +for f in [ + :copy, + :zero, + :one, + :oneunit, + :+, + :-, + :abs, + :abs2, + :inv, + :exp, + :log, + :log1p, + :log2, + :log10, + :sqrt, + :tanh, + :conj, +] @eval Base.$f(::Nil) = nil end @@ -167,8 +181,12 @@ end for (fn, Dims) in ((:conv, DenseConvDims),) @eval begin function NNlib.$fn(a::AbstractArray{Nil}, b::AbstractArray{Nil}, dims::$Dims) - return fill(nil, NNlib.output_size(dims)..., NNlib.channels_out(dims), - size(a)[end]) + return fill( + nil, + NNlib.output_size(dims)..., + NNlib.channels_out(dims), + size(a)[end], + ) end function NNlib.$fn(a::AbstractArray{<:Real}, b::AbstractArray{Nil}, dims::$Dims) diff --git a/src/utils.jl b/src/utils.jl index 2d0137a182..07d99e8b97 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -28,7 +28,7 @@ nfan() = 1, 1 # fan_in, fan_out nfan(n) = 1, n # A vector is treated as a n×1 matrix nfan(n_out, n_in) = n_in, n_out # In case of Dense kernels: arranged as matrices nfan(dims::Tuple) = nfan(dims...) -nfan(dims...) = prod(dims[1:(end - 2)]) .* (dims[end - 1], dims[end]) # In case of convolution kernels +nfan(dims...) = prod(dims[1:(end-2)]) .* (dims[end-1], dims[end]) # In case of convolution kernels ofeltype(x, y) = convert(float(eltype(x)), y) epseltype(x) = eps(float(eltype(x))) @@ -270,11 +270,18 @@ julia> round(std(Flux.truncated_normal(10^6; lo = -100, hi = 100))) 1.0f0 ``` """ -function truncated_normal(rng::AbstractRNG, dims::Integer...; mean = 0, std = 1, lo = -2, - hi = 2) +function truncated_normal( + rng::AbstractRNG, + dims::Integer...; + mean = 0, + std = 1, + lo = -2, + hi = 2, +) norm_cdf(x) = 0.5 * (1 + erf(x / √2)) if (mean < lo - 2 * std) || (mean > hi + 2 * std) - @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." maxlog=1 + @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." maxlog = + 1 end l = norm_cdf((lo - mean) / std) u = norm_cdf((hi - mean) / std) @@ -347,7 +354,7 @@ end function orthogonal(rng::AbstractRNG, d1::Integer, ds::Integer...; kwargs...) dims = (d1, ds...) - rows = prod(dims[1:(end - 1)]) + rows = prod(dims[1:(end-1)]) cols = dims[end] return reshape(orthogonal(rng, rows, cols; kwargs...), dims) end @@ -356,8 +363,8 @@ function orthogonal(dims::Integer...; kwargs...) return orthogonal(default_rng_value(), dims...; kwargs...) end function orthogonal(rng::AbstractRNG = default_rng_value(); init_kwargs...) - return (dims::Integer...; kwargs...) -> orthogonal(rng, dims...; init_kwargs..., - kwargs...) + return (dims::Integer...; kwargs...) -> + orthogonal(rng, dims...; init_kwargs..., kwargs...) end ChainRulesCore.@non_differentiable orthogonal(::Any...) @@ -396,7 +403,11 @@ julia> count(iszero, ans.weight; dims = 1) """ function sparse_init(rng::AbstractRNG, dims::Integer...; sparsity, std = 0.01) if length(dims) != 2 - throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) + throw( + ArgumentError( + "Only 2-dimensional outputs are supported for sparse initialization.", + ), + ) end rows, cols = dims prop_zero = min(1.0, sparsity) @@ -495,10 +506,10 @@ end # Assume convolution function identity_init(dims::Integer...; gain::Real = 1, shift = 0) - nin, nout = dims[end - 1], dims[end] - centers = map(d -> cld(d, 2), dims[1:(end - 2)]) + nin, nout = dims[end-1], dims[end] + centers = map(d -> cld(d, 2), dims[1:(end-2)]) weights = zeros32(dims...) - for i in 1:min(nin, nout) + for i = 1:min(nin, nout) weights[centers..., i, i] = gain end return circshift(weights, shift) diff --git a/test/ctc-gpu.jl b/test/ctc-gpu.jl index 1ed898cd21..6439a3a8f5 100644 --- a/test/ctc-gpu.jl +++ b/test/ctc-gpu.jl @@ -10,7 +10,7 @@ using CUDA function ctc_ngradient(x, y) f = Flux.Losses.ctc_loss grads = zero(x) - for i in 1:length(x) + for i = 1:length(x) δ = sqrt(eps()) tmp = x[i] x[i] = tmp - δ / 2 @@ -30,7 +30,7 @@ end g1 = gradient(ctc_loss, x_cu, y)[1] g1 = g1 |> collect g2 = ctc_ngradient(x, y) - @test g1≈g2 rtol=1e-5 atol=1e-5 + @test g1 ≈ g2 rtol = 1e-5 atol = 1e-5 # test that GPU loss matches CPU implementation l1 = ctc_loss(x_cu, y) @@ -42,18 +42,23 @@ end y = [1, 2] @test ctc_loss(x_cu, y) ≈ 3.6990738275138035 - g = [-0.317671 -0.427729 0.665241; 0.244728 -0.0196172 -0.829811; - 0.0729422 0.447346 0.16457] + g = [ + -0.317671 -0.427729 0.665241 + 0.244728 -0.0196172 -0.829811 + 0.0729422 0.447346 0.16457 + ] ghat = gradient(ctc_loss, x_cu, y)[1] |> collect - @test g≈ghat rtol=1e-5 atol=1e-5 + @test g ≈ ghat rtol = 1e-5 atol = 1e-5 x_cu = [-3.0 12.0 8.0 15.0; 4.0 20.0 -2.0 20.0; 8.0 -33.0 6.0 5.0] |> CuArray y = [1, 2] |> CuArray @test ctc_loss(x_cu, y) ≈ 8.02519869363453 - g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063; - 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307; - -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] + g = [ + -2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063 + 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307 + -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07 + ] ghat = gradient(ctc_loss, x_cu, y)[1] |> collect - @test g≈ghat rtol=1e-5 atol=1e-5 + @test g ≈ ghat rtol = 1e-5 atol = 1e-5 end diff --git a/test/ctc.jl b/test/ctc.jl index 88386ff0e7..059b14f292 100644 --- a/test/ctc.jl +++ b/test/ctc.jl @@ -9,7 +9,7 @@ using LinearAlgebra function ctc_ngradient(x, y) f = Flux.Losses.ctc_loss grads = zero(x) - for i in 1:length(x) + for i = 1:length(x) δ = sqrt(eps()) tmp = x[i] x[i] = tmp - δ / 2 @@ -27,25 +27,30 @@ end y = rand(1:9, 30) g1 = gradient(ctc_loss, x, y)[1] g2 = ctc_ngradient(x, y) - @test g1≈g2 rtol=1e-5 atol=1e-5 + @test g1 ≈ g2 rtol = 1e-5 atol = 1e-5 # tests using hand-calculated values x = [1.0 2.0 3.0; 2.0 1.0 1.0; 3.0 3.0 2.0] y = [1, 2] @test ctc_loss(x, y) ≈ 3.6990738275138035 - g = [-0.317671 -0.427729 0.665241; 0.244728 -0.0196172 -0.829811; - 0.0729422 0.447346 0.16457] + g = [ + -0.317671 -0.427729 0.665241 + 0.244728 -0.0196172 -0.829811 + 0.0729422 0.447346 0.16457 + ] ghat = gradient(ctc_loss, x, y)[1] - @test g≈ghat rtol=1e-5 atol=1e-5 + @test g ≈ ghat rtol = 1e-5 atol = 1e-5 x = [-3.0 12.0 8.0 15.0; 4.0 20.0 -2.0 20.0; 8.0 -33.0 6.0 5.0] y = [1, 2] @test ctc_loss(x, y) ≈ 8.02519869363453 - g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063; - 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307; - -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] + g = [ + -2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063 + 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307 + -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07 + ] ghat = gradient(ctc_loss, x, y)[1] - @test g≈ghat rtol=1e-5 atol=1e-5 + @test g ≈ ghat rtol = 1e-5 atol = 1e-5 end diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 033a08df95..25648eb787 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -21,7 +21,7 @@ using SparseArrays: sparse, SparseMatrixCSC, AbstractSparseArray cm = gpu(m) @test all(p isa CuArray for p in params(cm)) - @test cm(gpu(rand(10, 10))) isa CuArray{Float32, 2} + @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2} xs = rand(5, 5) ys = Flux.onehotbatch(1:5, 1:5) @@ -81,7 +81,7 @@ end M = 2.0 * I(10) |> collect Q = cholesky(M) Q_gpu = Q |> gpu - @test Q_gpu isa Cholesky{<:Any, <:CuArray} + @test Q_gpu isa Cholesky{<:Any,<:CuArray} Q_cpu = Q_gpu |> cpu @test Q_cpu == cholesky(eltype(Q_gpu).(M)) end diff --git a/test/cuda/curnn.jl b/test/cuda/curnn.jl index 2ff137d34f..a46b5684c8 100644 --- a/test/cuda/curnn.jl +++ b/test/cuda/curnn.jl @@ -11,55 +11,55 @@ using Flux, CUDA, Test @test collect(m̄.cell.Wi) == collect(θ[m.cell.Wi]) end -@testset "RNN" begin @testset for R in [RNN, GRU, LSTM, GRUv3], batch_size in (1, 5) - rnn = R(10, 5) - curnn = fmap(gpu, rnn) +@testset "RNN" begin + @testset for R in [RNN, GRU, LSTM, GRUv3], batch_size in (1, 5) + rnn = R(10, 5) + curnn = fmap(gpu, rnn) - Flux.reset!(rnn) - Flux.reset!(curnn) - x = batch_size == 1 ? - rand(Float32, 10) : - rand(Float32, 10, batch_size) - cux = gpu(x) + Flux.reset!(rnn) + Flux.reset!(curnn) + x = batch_size == 1 ? rand(Float32, 10) : rand(Float32, 10, batch_size) + cux = gpu(x) - y, back = pullback((r, x) -> r(x), rnn, x) - cuy, cuback = pullback((r, x) -> r(x), curnn, cux) + y, back = pullback((r, x) -> r(x), rnn, x) + cuy, cuback = pullback((r, x) -> r(x), curnn, cux) - @test y ≈ collect(cuy) + @test y ≈ collect(cuy) - ȳ = randn(size(y)) - m̄, x̄ = back(ȳ) - cum̄, cux̄ = cuback(gpu(ȳ)) + ȳ = randn(size(y)) + m̄, x̄ = back(ȳ) + cum̄, cux̄ = cuback(gpu(ȳ)) - @test x̄ ≈ collect(cux̄) - @test m̄[].cell.Wi ≈ collect(cum̄[].cell.Wi) - @test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh) - @test m̄[].cell.b ≈ collect(cum̄[].cell.b) - if m̄[].state isa Tuple - for (x, cx) in zip(m̄[].state, cum̄[].state) - @test x ≈ collect(cx) + @test x̄ ≈ collect(cux̄) + @test m̄[].cell.Wi ≈ collect(cum̄[].cell.Wi) + @test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh) + @test m̄[].cell.b ≈ collect(cum̄[].cell.b) + if m̄[].state isa Tuple + for (x, cx) in zip(m̄[].state, cum̄[].state) + @test x ≈ collect(cx) + end + else + @test m̄[].state ≈ collect(cum̄[].state) end - else - @test m̄[].state ≈ collect(cum̄[].state) - end - Flux.reset!(rnn) - Flux.reset!(curnn) - ohx = batch_size == 1 ? - Flux.onehot(rand(1:10), 1:10) : - Flux.onehotbatch(rand(1:10, batch_size), 1:10) - cuohx = gpu(ohx) - y = (rnn(ohx); rnn(ohx)) + Flux.reset!(rnn) + Flux.reset!(curnn) + ohx = + batch_size == 1 ? Flux.onehot(rand(1:10), 1:10) : + Flux.onehotbatch(rand(1:10, batch_size), 1:10) + cuohx = gpu(ohx) + y = (rnn(ohx); rnn(ohx)) - cuy = (curnn(cuohx); curnn(cuohx)) - @test y ≈ collect(cuy) + cuy = (curnn(cuohx); curnn(cuohx)) + @test y ≈ collect(cuy) - Flux.reset!(rnn) - Flux.reset!(curnn) - fx = rand(Float32, 10, batch_size, 3) - cufx = gpu(fx) - fy = (rnn(fx); rnn(fx)) + Flux.reset!(rnn) + Flux.reset!(curnn) + fx = rand(Float32, 10, batch_size, 3) + cufx = gpu(fx) + fy = (rnn(fx); rnn(fx)) - cufy = (curnn(cufx); curnn(cufx)) - @test fy ≈ collect(cufy) -end end + cufy = (curnn(cufx); curnn(cufx)) + @test fy ≈ collect(cufy) + end +end diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl index 631b103839..e2da95931d 100644 --- a/test/cuda/layers.jl +++ b/test/cuda/layers.jl @@ -13,63 +13,70 @@ end # TODO: These layers get into scalar indexing issues. const BROKEN_LAYERS = Union{} -const ACTIVATIONS = [identity, relu, tanh, - sigmoid, exp, softplus, - elu, selu] - -function gpu_gradtest(name::String, layers::Vector, x_cpu = nothing, args...; - test_cpu = true) +const ACTIVATIONS = [identity, relu, tanh, sigmoid, exp, softplus, elu, selu] + +function gpu_gradtest( + name::String, + layers::Vector, + x_cpu = nothing, + args...; + test_cpu = true, +) isnothing(x_cpu) && error("Missing input to test the layers against.") - @testset "$name GPU grad tests" begin for layer in layers - @testset "$layer Layer GPU grad test" begin - - # compute output and grad of parameters - l_cpu = layer(args...) - ps_cpu = Flux.params(l_cpu) - y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu) - gs_cpu = back_cpu(1.0f0) - - x_gpu = gpu(x_cpu) - l_gpu = l_cpu |> gpu - ps_gpu = Flux.params(l_gpu) - - if typeof(l_gpu) <: BROKEN_LAYERS - @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa Flux.Zygote.Grads - else - y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu) - gs_gpu = back_gpu(1.0f0) # TODO many layers error out when backprop int 1, should fix - - # compute grad of input - xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1] - xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1] - - # test - if test_cpu - @test y_gpu≈y_cpu rtol=1.0f-3 atol=1.0f-3 - if isnothing(xg_cpu) - @test isnothing(xg_gpu) - else - if layer === GroupedConvTranspose - @test Array(xg_gpu)≈xg_cpu rtol=2.0f-2 atol=1.0f-3 + @testset "$name GPU grad tests" begin + for layer in layers + @testset "$layer Layer GPU grad test" begin + + # compute output and grad of parameters + l_cpu = layer(args...) + ps_cpu = Flux.params(l_cpu) + y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu) + gs_cpu = back_cpu(1.0f0) + + x_gpu = gpu(x_cpu) + l_gpu = l_cpu |> gpu + ps_gpu = Flux.params(l_gpu) + + if typeof(l_gpu) <: BROKEN_LAYERS + @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa + Flux.Zygote.Grads + else + y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu) + gs_gpu = back_gpu(1.0f0) # TODO many layers error out when backprop int 1, should fix + + # compute grad of input + xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1] + xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1] + + # test + if test_cpu + @test y_gpu ≈ y_cpu rtol = 1.0f-3 atol = 1.0f-3 + if isnothing(xg_cpu) + @test isnothing(xg_gpu) else - @test Array(xg_gpu)≈xg_cpu rtol=1.0f-3 atol=1.0f-3 + if layer === GroupedConvTranspose + @test Array(xg_gpu) ≈ xg_cpu rtol = 2.0f-2 atol = 1.0f-3 + else + @test Array(xg_gpu) ≈ xg_cpu rtol = 1.0f-3 atol = 1.0f-3 + end end end - end - @test gs_gpu isa Flux.Zygote.Grads - for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu) - if isnothing(gs_cpu[p_cpu]) - @test isnothing(gs_gpu[p_gpu]) - else - @test gs_gpu[p_gpu] isa Flux.CUDA.CuArray - if test_cpu - @test Array(gs_gpu[p_gpu])≈gs_cpu[p_cpu] rtol=1.0f-3 atol=1.0f-3 + @test gs_gpu isa Flux.Zygote.Grads + for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu) + if isnothing(gs_cpu[p_cpu]) + @test isnothing(gs_gpu[p_gpu]) + else + @test gs_gpu[p_gpu] isa Flux.CUDA.CuArray + if test_cpu + @test Array(gs_gpu[p_gpu]) ≈ gs_cpu[p_cpu] rtol = 1.0f-3 atol = + 1.0f-3 + end end end end end end - end end + end end # Just to give testset in gpu_gradtest meaningful labels @@ -82,29 +89,68 @@ GroupedConvTranspose(args...) = ConvTranspose(args...; groups = 5) for act in ACTIVATIONS r = rand(Float32, 28, 28, 1, 1) - conv_layers = [Conv, ConvNoBias, - ConvTranspose, ConvTransposeNoBias, - CrossCor, CrossCorNoBias, - DepthwiseConv, DepthwiseConvNoBias] - gpu_gradtest("Convolution with $act", conv_layers, r, (2, 2), 1 => 3, act; - test_cpu = false) + conv_layers = [ + Conv, + ConvNoBias, + ConvTranspose, + ConvTransposeNoBias, + CrossCor, + CrossCorNoBias, + DepthwiseConv, + DepthwiseConvNoBias, + ] + gpu_gradtest( + "Convolution with $act", + conv_layers, + r, + (2, 2), + 1 => 3, + act; + test_cpu = false, + ) groupedconv = [GroupedConv, GroupedConvTranspose] - gpu_gradtest("GroupedConvolution with $act", groupedconv, rand(Float32, 28, 28, 100, 2), - (3, 3), 100 => 25, act; test_cpu = true) + gpu_gradtest( + "GroupedConvolution with $act", + groupedconv, + rand(Float32, 28, 28, 100, 2), + (3, 3), + 100 => 25, + act; + test_cpu = true, + ) batch_norm = [BatchNorm] - gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28, 28, 3, 4), 3, act; - test_cpu = false) #TODO fix errors - gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5, 4), 5, act; - test_cpu = false) + gpu_gradtest( + "BatchNorm 1 with $act", + batch_norm, + rand(Float32, 28, 28, 3, 4), + 3, + act; + test_cpu = false, + ) #TODO fix errors + gpu_gradtest( + "BatchNorm 2 with $act", + batch_norm, + rand(Float32, 5, 4), + 5, + act; + test_cpu = false, + ) instancenorm = [InstanceNorm] gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act; test_cpu = false) groupnorm = [GroupNorm] - gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28, 28, 3, 1), 3, 1, act; - test_cpu = false) + gpu_gradtest( + "GroupNorm with $act", + groupnorm, + rand(Float32, 28, 28, 3, 1), + 3, + 1, + act; + test_cpu = false, + ) end r = rand(Float32, 28, 28, 1, 1) @@ -137,8 +183,13 @@ gpu_gradtest("Embedding integer index", embedding, 1, 5, 2) gpu_gradtest("Embedding 2d index", embedding, [1 2; 3 4], 5, 2) gpu_gradtest("Embedding OneHotVec index", embedding, OneHotVector(1, 5), 5, 2) gpu_gradtest("Embedding OneHotMatrix index", embedding, OneHotMatrix([1, 2, 3], 5), 5, 2) -gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, - OneHotMatrix([1, 2, 2], 5), 5, 2) +gpu_gradtest( + "Embedding OneHotMatrix repeated indices", + embedding, + OneHotMatrix([1, 2, 2], 5), + 5, + 2, +) @testset "function layers" begin x = rand(Float32, 3, 3) @@ -287,8 +338,11 @@ end end @testset "Dropout RNGs" begin - @test_throws ArgumentError Flux.dropout(MersenneTwister(), CUDA.rand(Float32, 2, 3), - 0.1) + @test_throws ArgumentError Flux.dropout( + MersenneTwister(), + CUDA.rand(Float32, 2, 3), + 0.1, + ) @testset for layer in (Dropout, AlphaDropout) m = layer(0.1; rng = MersenneTwister(123)) @test_throws ErrorException gpu(m) diff --git a/test/cuda/losses.jl b/test/cuda/losses.jl index 3ecbceb46e..467d3ed46e 100644 --- a/test/cuda/losses.jl +++ b/test/cuda/losses.jl @@ -1,5 +1,5 @@ -using Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, - binary_focal_loss, focal_loss +using Flux.Losses: + crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss @testset "Losses" begin x = [1.0, 2.0, 3.0] @@ -14,16 +14,22 @@ using Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, @test binarycrossentropy(σ.(x), y) ≈ binarycrossentropy(gpu(σ.(x)), gpu(y)) @test logitbinarycrossentropy(x, y) ≈ logitbinarycrossentropy(gpu(x), gpu(y)) - x = [0.268941 0.5 0.268941 - 0.731059 0.5 0.731059] - y = [0 1 0 - 1 0 1] + x = [ + 0.268941 0.5 0.268941 + 0.731059 0.5 0.731059 + ] + y = [ + 0 1 0 + 1 0 1 + ] @test binary_focal_loss(x, y) ≈ binary_focal_loss(gpu(x), gpu(y)) x = softmax(reshape(-7:7, 3, 5) .* 1.0f0) - y = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] + y = [ + 1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0 + ] @test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y)) @testset "GPU grad tests" begin diff --git a/test/cuda/test_utils.jl b/test/cuda/test_utils.jl index 466b08c8b9..027d13a612 100644 --- a/test/cuda/test_utils.jl +++ b/test/cuda/test_utils.jl @@ -7,10 +7,10 @@ function check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol, rtol) end check_grad(g_gpu::Nothing, g_cpu::Nothing, atol, rtol) = @test true function check_grad(g_gpu::Float32, g_cpu::Float32, atol, rtol) - @test g_cpu≈g_gpu rtol=rtol atol=atol + @test g_cpu ≈ g_gpu rtol = rtol atol = atol end function check_grad(g_gpu::CuArray{Float32}, g_cpu::Array{Float32}, atol, rtol) - @test g_cpu≈collect(g_gpu) rtol=rtol atol=atol + @test g_cpu ≈ collect(g_gpu) rtol = rtol atol = atol end function check_grad(g_gpu::Tuple, g_cpu::Tuple, atol, rtol) @@ -27,8 +27,13 @@ function check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple, atol, rtol) end end -function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; - test_equal = true, rtol = 1e-4, atol = 1e-4) +function gpu_autodiff_test( + f_cpu, + xs_cpu::Array{Float32}...; + test_equal = true, + rtol = 1e-4, + atol = 1e-4, +) check_type(x) = false check_type(x::Float32) = true check_type(x::CuArray{Float32}) = true @@ -50,7 +55,7 @@ function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; gs_gpu = back_gpu(Δ_gpu) if test_equal - @test collect(y_cpu)≈collect(y_gpu) rtol=rtol atol=atol + @test collect(y_cpu) ≈ collect(y_gpu) rtol = rtol atol = atol for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu) check_grad(g_gpu, g_cpu, atol, rtol) end @@ -66,7 +71,7 @@ function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; gs_gpu = back_gpu(Δ_gpu) if test_equal - @test collect(y_cpu)≈collect(y_gpu) rtol=rtol atol=atol + @test collect(y_cpu) ≈ collect(y_gpu) rtol = rtol atol = atol @assert length(ps_gpu) == length(ps_cpu) for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu) check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu], atol, rtol) diff --git a/test/data.jl b/test/data.jl index 8ee1d58a89..3d2083af4f 100644 --- a/test/data.jl +++ b/test/data.jl @@ -36,7 +36,7 @@ using Random # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == Tuple{typeof(X), typeof(Y)} - @test eltype(batches) == Tuple{typeof(X), typeof(Y)} + @test eltype(batches) == Tuple{typeof(X),typeof(Y)} @test length(batches) == 3 @test length(batches[1]) == 2 @test length(batches[2]) == 2 @@ -53,7 +53,7 @@ using Random # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}} - @test eltype(batches) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}} + @test eltype(batches) == NamedTuple{(:x, :y),Tuple{typeof(X),typeof(Y)}} @test length(batches) == 3 @test length(batches[1]) == 2 @test length(batches[2]) == 2 @@ -69,7 +69,7 @@ using Random d = DataLoader([1:10;]; shuffle = true) cd = collect(zip(d, d)) # skip the first since it used to be different also before fixing the bug - @test [cd[i][1] for i in 2:10] != [cd[i][2] for i in 2:10] + @test [cd[i][1] for i = 2:10] != [cd[i][2] for i = 2:10] # test interaction with `train!` θ = ones(2) @@ -89,7 +89,13 @@ using Random @test norm(θ .- 1) < 1e-10 # specify the rng - d = map(identity, - DataLoader(X; batchsize = 2, shuffle = true, - rng = Random.seed!(Random.default_rng(), 5))) + d = map( + identity, + DataLoader( + X; + batchsize = 2, + shuffle = true, + rng = Random.seed!(Random.default_rng(), 5), + ), + ) end diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 93e74e7915..f3600850a8 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -2,16 +2,18 @@ using Test, Random import Flux: activations @testset "basic" begin - @testset "helpers" begin @testset "activations" begin - dummy_model = Chain(x -> x .^ 2, x -> x .- 3, x -> tan.(x)) - x = randn(10) - @test activations(dummy_model, x)[1] == x .^ 2 - @test activations(dummy_model, x)[2] == (x .^ 2 .- 3) - @test activations(dummy_model, x)[3] == tan.(x .^ 2 .- 3) - - @test activations(Chain(), x) == () - @test activations(Chain(identity, x -> :foo), x)[2] == :foo # results include `Any` type - end end + @testset "helpers" begin + @testset "activations" begin + dummy_model = Chain(x -> x .^ 2, x -> x .- 3, x -> tan.(x)) + x = randn(10) + @test activations(dummy_model, x)[1] == x .^ 2 + @test activations(dummy_model, x)[2] == (x .^ 2 .- 3) + @test activations(dummy_model, x)[3] == tan.(x .^ 2 .- 3) + + @test activations(Chain(), x) == () + @test activations(Chain(identity, x -> :foo), x)[2] == :foo # results include `Any` type + end + end @testset "Chain" begin @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10)) @@ -87,10 +89,9 @@ import Flux: activations @test Dense(10, 2, identity; init = ones)(ones(10, 1)) == 10 * ones(2, 1) @test Dense(10, 2, identity; init = ones)([ones(10, 1) 2 * ones(10, 1)]) == [10 20; 10 20] - @test Dense(10, 2, identity; init = ones, bias = false)([ones(10, 1) 2 * - ones(10, - 1)]) == - [10 20; 10 20] + @test Dense(10, 2, identity; init = ones, bias = false)( + [ones(10, 1) 2 * ones(10, 1)], + ) == [10 20; 10 20] end end @@ -158,8 +159,9 @@ import Flux: activations @testset "concat size" begin input = randn(10, 2) - @test size(SkipConnection(Dense(10, 10), (a, b) -> cat(a, b; dims = 2))(input)) == - (10, 4) + @test size( + SkipConnection(Dense(10, 10), (a, b) -> cat(a, b; dims = 2))(input), + ) == (10, 4) end end @@ -217,8 +219,9 @@ import Flux: activations @testset "concat size" begin input = randn(10, 2) - @test size(Parallel((a, b) -> cat(a, b; dims = 2), Dense(10, 10), identity)(input)) == - (10, 4) + @test size( + Parallel((a, b) -> cat(a, b; dims = 2), Dense(10, 10), identity)(input), + ) == (10, 4) @test size(Parallel(hcat; one = Dense(10, 10), two = identity)(input)) == (10, 4) end @@ -226,8 +229,9 @@ import Flux: activations @testset "vararg input" begin inputs = randn(10), randn(5), randn(4) @test size(Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2))(inputs)) == (2,) - @test size(Parallel(+; a = Dense(10, 2), b = Dense(5, 2), c = Dense(4, 2))(inputs)) == - (2,) + @test size( + Parallel(+; a = Dense(10, 2), b = Dense(5, 2), c = Dense(4, 2))(inputs), + ) == (2,) @test_throws ArgumentError Parallel(+, sin, cos)(1, 2, 3) # wrong number of inputs @test Parallel(+, sin, cos)(pi / 2) ≈ 1 end @@ -237,10 +241,16 @@ import Flux: activations @test m[1] == m[:one] @test m[1:2] == m - @test_throws ArgumentError Parallel(hcat, layers = Dense(10, 10), - two = identity) # reserved names - @test_throws ArgumentError Parallel(hcat, connection = Dense(10, 10), - two = identity) + @test_throws ArgumentError Parallel( + hcat, + layers = Dense(10, 10), + two = identity, + ) # reserved names + @test_throws ArgumentError Parallel( + hcat, + connection = Dense(10, 10), + two = identity, + ) @test m == fmap(identity, m) # does not forget names @@ -249,7 +259,7 @@ import Flux: activations end @testset "trivial cases" begin - @test Parallel(hcat) isa Parallel{typeof(hcat), Tuple{}} # not a NamedTuple + @test Parallel(hcat) isa Parallel{typeof(hcat),Tuple{}} # not a NamedTuple @test Parallel(hcat)(1) == hcat() @test Parallel(hcat, inv)(2) == hcat(1 / 2) # still calls connection once. end @@ -314,7 +324,7 @@ import Flux: activations x = rand(1:vocab_size, 3, 4) y = m(x) - @test y isa Array{Float32, 3} + @test y isa Array{Float32,3} @test size(y) == (embed_size, 3, 4) @test m(2) ≈ m.weight[:, 2] diff --git a/test/layers/conv.jl b/test/layers/conv.jl index c5e7845833..51082723fb 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -25,12 +25,15 @@ end @testset "CNN" begin r = zeros(Float32, 28, 28, 1, 5) - m = Chain(Conv((2, 2), 1 => 16, relu), - MaxPool((2, 2)), - Conv((2, 2), 16 => 8, relu), - MaxPool((2, 2)), - x -> reshape(x, :, size(x, 4)), - Dense(288, 10), softmax) + m = Chain( + Conv((2, 2), 1 => 16, relu), + MaxPool((2, 2)), + Conv((2, 2), 16 => 8, relu), + MaxPool((2, 2)), + x -> reshape(x, :, size(x, 4)), + Dense(288, 10), + softmax, + ) @test size(m(r)) == (10, 5) @@ -56,7 +59,7 @@ end op = zeros(Float32, 27, 27, 3, 1) .+ 2.0f0 opt = Descent() - for _ in 1:(10^3) + for _ = 1:(10^3) gs = gradient(Flux.params(bias)) do return Flux.Losses.mse(bias(ip), op) end @@ -113,7 +116,7 @@ end @test _channels_out(ConvTranspose((5, 6), 2 => 2; groups = 2)) == 2 for Layer in [Conv, ConvTranspose] - for _ in 1:10 + for _ = 1:10 groups = rand(1:10) kernel_size = Tuple(rand(1:5) for _ in rand(1:3)) cin = rand(1:5) * groups @@ -135,7 +138,7 @@ end @test y_hat[2, 2] ≈ 9.0 @test y_hat[end, 1] ≈ 4.0 @test y_hat[1, end] ≈ 3.0 - @test y_hat[1, end - 1] ≈ 6.0 + @test y_hat[1, end-1] ≈ 6.0 @test y_hat[end, end] ≈ 2.0 end @@ -203,19 +206,22 @@ end w = rand(Float32, 2, 2, 1, 1) y = CrossCor(w, [0.0]) - @test sum(w .* x[1:2, 1:2, :, :])≈y(x)[1, 1, 1, 1] rtol=2e-7 + @test sum(w .* x[1:2, 1:2, :, :]) ≈ y(x)[1, 1, 1, 1] rtol = 2e-7 r = zeros(Float32, 28, 28, 1, 5) - m = Chain(CrossCor((2, 2), 1 => 16, relu), - MaxPool((2, 2)), - CrossCor((2, 2), 16 => 8, relu; bias = false), - MaxPool((2, 2)), - x -> reshape(x, :, size(x, 4)), - Dense(288, 10), softmax) + m = Chain( + CrossCor((2, 2), 1 => 16, relu), + MaxPool((2, 2)), + CrossCor((2, 2), 16 => 8, relu; bias = false), + MaxPool((2, 2)), + x -> reshape(x, :, size(x, 4)), + Dense(288, 10), + softmax, + ) @test size(m(r)) == (10, 5) @test y(x) != Conv(w, [0.0])(x) - @test CrossCor(w[end:-1:1, end:-1:1, :, :], [0.0])(x)≈Conv(w, [0.0])(x) rtol=1e-7 + @test CrossCor(w[end:-1:1, end:-1:1, :, :], [0.0])(x) ≈ Conv(w, [0.0])(x) rtol = 1e-7 end @testset "Conv with non quadratic window #700" begin @@ -224,17 +230,17 @@ end l = Conv((3, 3), 1 => 1) expected = zeros(eltype(l.weight), 5, 5, 1, 1) - expected[2:(end - 1), 2:(end - 1), 1, 1] = l.weight + expected[2:(end-1), 2:(end-1), 1, 1] = l.weight @test expected ≈ l(data) l = Conv((3, 1), 1 => 1) expected = zeros(eltype(l.weight), 5, 7, 1, 1) - expected[2:(end - 1), 4, 1, 1] = l.weight + expected[2:(end-1), 4, 1, 1] = l.weight @test expected ≈ l(data) l = Conv((1, 3), 1 => 1) expected = zeros(eltype(l.weight), 7, 5, 1, 1) - expected[4, 2:(end - 1), 1, 1] = l.weight + expected[4, 2:(end-1), 1, 1] = l.weight @test expected ≈ l(data) @test begin @@ -244,9 +250,9 @@ end end end -@testset "$ltype SamePad kernelsize $k" for ltype in (Conv, ConvTranspose, DepthwiseConv, - CrossCor), - k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) +@testset "$ltype SamePad kernelsize $k" for ltype in + (Conv, ConvTranspose, DepthwiseConv, CrossCor), + k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) data = ones(Float32, (k .+ 3)..., 1, 1) l = ltype(k, 1 => 1; pad = SamePad()) @@ -258,24 +264,25 @@ end stride = 3 l = ltype(k, 1 => 1; pad = SamePad(), stride = stride) if ltype == ConvTranspose - @test size(l(data))[1:(end - 2)] == stride .* size(data)[1:(end - 2)] + @test size(l(data))[1:(end-2)] == stride .* size(data)[1:(end-2)] else - @test size(l(data))[1:(end - 2)] == cld.(size(data)[1:(end - 2)], stride) + @test size(l(data))[1:(end-2)] == cld.(size(data)[1:(end-2)], stride) end end @testset "$ltype SamePad windowsize $k" for ltype in (MeanPool, MaxPool), - k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) + k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) data = ones(Float32, (k .+ 3)..., 1, 1) l = ltype(k; pad = SamePad()) - @test size(l(data))[1:(end - 2)] == cld.(size(data)[1:(end - 2)], k) + @test size(l(data))[1:(end-2)] == cld.(size(data)[1:(end-2)], k) end @testset "bugs fixed" begin -# https://github.com/FluxML/Flux.jl/issues/1421 -@test Conv((5, 5), 10 => 20, identity; init = Base.randn).bias isa Vector{Float64} end + # https://github.com/FluxML/Flux.jl/issues/1421 + @test Conv((5, 5), 10 => 20, identity; init = Base.randn).bias isa Vector{Float64} +end @testset "constructors: $fun" for fun in [Conv, CrossCor, ConvTranspose, DepthwiseConv] @test fun(rand(2, 3, 4)).bias isa Vector{Float64} diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 1d4be15240..32e99245d6 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -3,123 +3,128 @@ using Zygote: pullback evalwgrad(f, x...) = pullback(f, x...)[1] -@testset "Dropout" begin @testset for rng_kwargs in ((), (; rng = MersenneTwister())) - x = [1.0 + 0im, 2.0 + 1im, 3.0 + 3im] - @test x == Dropout(0.1; rng_kwargs...)(x) - @test x == evalwgrad(Dropout(0; rng_kwargs...), x) - @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) - - x = [1.0, 2.0, 3.0] - @test x == Dropout(0.1; rng_kwargs...)(x) - @test x == evalwgrad(Dropout(0; rng_kwargs...), x) - @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) - - x = rand(100) - m = Dropout(0.9; rng_kwargs...) - y = evalwgrad(m, x) - @test count(a -> a == 0, y) > 50 - testmode!(m, true) - y = evalwgrad(m, x) # should override istraining - @test count(a -> a == 0, y) == 0 - testmode!(m, false) - y = evalwgrad(m, x) - @test count(a -> a == 0, y) > 50 - - x = rand(Float32, 100) - m = Chain(Dense(100, 100), - Dropout(0.9; rng_kwargs...)) - y = evalwgrad(m, x) - @test count(a -> a == 0, y) > 50 - testmode!(m, true) - y = evalwgrad(m, x) # should override istraining - @test count(a -> a == 0, y) == 0 - - x = rand(100, 50) - m = Dropout(0.5; dims = 2, rng_kwargs...) - y = m(x) - c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100) - @test minimum(c) == maximum(c) - m = Dropout(0.5; dims = 1, rng_kwargs...) - y = m(x) - c = map(i -> count(a -> a == 0, @view y[:, i]), 1:50) - @test minimum(c) == maximum(c) - - # issue #1084 - m = Dropout(0.9; rng_kwargs...) - x = rand(100) - - testmode!(m) - y = m(x) - @test count(a -> a == 0, y) == 0 - trainmode!(m) - y = m(x) - @test count(a -> a == 0, y) > 50 - - y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = true) - @test count(a -> a == 0, y) > 50 - - y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = false) - @test count(a -> a == 0, y) == 0 - - # CPU RNGs map onto CPU ok - if isempty(rng_kwargs) - if VERSION >= v"1.7" - @test cpu(m).rng isa Random.TaskLocalRNG +@testset "Dropout" begin + @testset for rng_kwargs in ((), (; rng = MersenneTwister())) + x = [1.0 + 0im, 2.0 + 1im, 3.0 + 3im] + @test x == Dropout(0.1; rng_kwargs...)(x) + @test x == evalwgrad(Dropout(0; rng_kwargs...), x) + @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) + + x = [1.0, 2.0, 3.0] + @test x == Dropout(0.1; rng_kwargs...)(x) + @test x == evalwgrad(Dropout(0; rng_kwargs...), x) + @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) + + x = rand(100) + m = Dropout(0.9; rng_kwargs...) + y = evalwgrad(m, x) + @test count(a -> a == 0, y) > 50 + testmode!(m, true) + y = evalwgrad(m, x) # should override istraining + @test count(a -> a == 0, y) == 0 + testmode!(m, false) + y = evalwgrad(m, x) + @test count(a -> a == 0, y) > 50 + + x = rand(Float32, 100) + m = Chain(Dense(100, 100), Dropout(0.9; rng_kwargs...)) + y = evalwgrad(m, x) + @test count(a -> a == 0, y) > 50 + testmode!(m, true) + y = evalwgrad(m, x) # should override istraining + @test count(a -> a == 0, y) == 0 + + x = rand(100, 50) + m = Dropout(0.5; dims = 2, rng_kwargs...) + y = m(x) + c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100) + @test minimum(c) == maximum(c) + m = Dropout(0.5; dims = 1, rng_kwargs...) + y = m(x) + c = map(i -> count(a -> a == 0, @view y[:, i]), 1:50) + @test minimum(c) == maximum(c) + + # issue #1084 + m = Dropout(0.9; rng_kwargs...) + x = rand(100) + + testmode!(m) + y = m(x) + @test count(a -> a == 0, y) == 0 + trainmode!(m) + y = m(x) + @test count(a -> a == 0, y) > 50 + + y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = true) + @test count(a -> a == 0, y) > 50 + + y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = false) + @test count(a -> a == 0, y) == 0 + + # CPU RNGs map onto CPU ok + if isempty(rng_kwargs) + if VERSION >= v"1.7" + @test cpu(m).rng isa Random.TaskLocalRNG + else + @test cpu(m).rng isa Random._GLOBAL_RNG + end else - @test cpu(m).rng isa Random._GLOBAL_RNG + @test cpu(m).rng === only(values(rng_kwargs)) end - else - @test cpu(m).rng === only(values(rng_kwargs)) - end -end end - -@testset "AlphaDropout" begin @testset for rng_kwargs in ((), (; rng = MersenneTwister())) - x = [1.0, 2.0, 3.0] - @test x == AlphaDropout(0.1; rng_kwargs...)(x) - @test x == evalwgrad(AlphaDropout(0; rng_kwargs...), x) - @test zero(x) == evalwgrad(AlphaDropout(1; rng_kwargs...), x) - - x = randn(1000) # large enough to prevent flaky test - m = AlphaDropout(0.5; rng_kwargs...) - - y = evalwgrad(m, x) - # Should preserve unit mean and variance - @test mean(y)≈0 atol=0.2 - @test var(y)≈1 atol=0.2 - - testmode!(m, true) # should override istraining - @test evalwgrad(m, x) == x - - testmode!(m, false) - y = evalwgrad(m, x) - @test mean(y)≈0 atol=0.2 - @test var(y)≈1 atol=0.2 - - # Known good value ranges - # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338 - x = ones(100) - if isempty(rng_kwargs) - @test 40 < sum(evalwgrad(m, x)) < 130 - else - # FIXME: this breaks spuriously for MersenneTwister - @test_skip 40 < sum(evalwgrad(m, x)) < 130 end +end + +@testset "AlphaDropout" begin + @testset for rng_kwargs in ((), (; rng = MersenneTwister())) + x = [1.0, 2.0, 3.0] + @test x == AlphaDropout(0.1; rng_kwargs...)(x) + @test x == evalwgrad(AlphaDropout(0; rng_kwargs...), x) + @test zero(x) == evalwgrad(AlphaDropout(1; rng_kwargs...), x) + + x = randn(1000) # large enough to prevent flaky test + m = AlphaDropout(0.5; rng_kwargs...) + + y = evalwgrad(m, x) + # Should preserve unit mean and variance + @test mean(y) ≈ 0 atol = 0.2 + @test var(y) ≈ 1 atol = 0.2 - # CPU RNGs map onto CPU ok - if isempty(rng_kwargs) - if VERSION >= v"1.7" - @test cpu(m).rng isa Random.TaskLocalRNG + testmode!(m, true) # should override istraining + @test evalwgrad(m, x) == x + + testmode!(m, false) + y = evalwgrad(m, x) + @test mean(y) ≈ 0 atol = 0.2 + @test var(y) ≈ 1 atol = 0.2 + + # Known good value ranges + # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338 + x = ones(100) + if isempty(rng_kwargs) + @test 40 < sum(evalwgrad(m, x)) < 130 + else + # FIXME: this breaks spuriously for MersenneTwister + @test_skip 40 < sum(evalwgrad(m, x)) < 130 + end + + # CPU RNGs map onto CPU ok + if isempty(rng_kwargs) + if VERSION >= v"1.7" + @test cpu(m).rng isa Random.TaskLocalRNG + else + @test cpu(m).rng isa Random._GLOBAL_RNG + end else - @test cpu(m).rng isa Random._GLOBAL_RNG + @test cpu(m).rng === only(values(rng_kwargs)) end - else - @test cpu(m).rng === only(values(rng_kwargs)) end -end end +end @testset "BatchNorm" begin - let m = BatchNorm(2), x = [1.0 3.0 5.0; - 2.0 4.0 6.0] + let m = BatchNorm(2), x = [ + 1.0 3.0 5.0 + 2.0 4.0 6.0 + ] @test Flux.hasaffine(m) == true @test length(Flux.params(m)) == 2 @@ -162,8 +167,10 @@ end end end # with activation function - let m = BatchNorm(2, sigmoid), x = [1.0 3.0 5.0; - 2.0 4.0 6.0] + let m = BatchNorm(2, sigmoid), x = [ + 1.0 3.0 5.0 + 2.0 4.0 6.0 + ] y = m(x) @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7) @inferred m(x) @@ -203,7 +210,8 @@ end @testset "InstanceNorm" begin # begin tests - let m = InstanceNorm(2; affine = true, track_stats = true), sizes = (3, 2, 2), + let m = InstanceNorm(2; affine = true, track_stats = true), + sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) @test length(Flux.params(m)) == 2 @@ -235,21 +243,23 @@ end # (1. - .1) * 0 + .1 * (5. + 11.) / 2 = .8 N = ndims(x) @test m.μ ≈ [0.5, 0.8] - n = prod(size(x, i) for i in 1:(N - 2)) + n = prod(size(x, i) for i = 1:(N-2)) corr = n / (n - 1) - σ² = var(x; dims = 1:(N - 2), corrected = false) + σ² = var(x; dims = 1:(N-2), corrected = false) @test m.σ² ≈ 0.1 * corr * vec(mean(σ²; dims = N)) .+ 0.9 * 1 y = m(x) @test length(m.μ) == 2 @test length(m.σ²) == 2 - @test y≈(x .- reshape(m.μ, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 2, 1) .+ 1.0f-5) atol=1.0e-5 + @test y ≈ (x .- reshape(m.μ, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 2, 1) .+ 1.0f-5) atol = + 1.0e-5 @inferred m(x) end # with activation function - let m = InstanceNorm(2, sigmoid; affine = true, track_stats = true), sizes = (3, 2, 2), + let m = InstanceNorm(2, sigmoid; affine = true, track_stats = true), + sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) x = Float64.(x) @@ -260,13 +270,14 @@ end y = m(x) # inference time after a training step μ = reshape(m.μ, affine_shape...) σ² = reshape(m.σ², affine_shape...) - @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 + @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol = 1.0e-7 @inferred m(x) end # with activation function - let m = InstanceNorm(2, sigmoid; affine = true, track_stats = false), sizes = (3, 2, 2), + let m = InstanceNorm(2, sigmoid; affine = true, track_stats = false), + sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) @test Flux.hasaffine(m) == true @@ -275,12 +286,13 @@ end y = m(x) μ = mean(x; dims = 1) σ² = var(x; dims = 1, corrected = false) - @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 + @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol = 1.0e-7 @inferred m(x) end - let m = InstanceNorm(2, sigmoid), sizes = (3, 2, 2), + let m = InstanceNorm(2, sigmoid), + sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) @test Flux.hasaffine(m) == false @@ -290,12 +302,13 @@ end y = m(x) μ = mean(x; dims = 1) σ² = var(x; dims = 1, corrected = false) - @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 + @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol = 1.0e-7 @inferred m(x) end - let m = trainmode!(InstanceNorm(2; affine = true)), sizes = (2, 4, 1, 2, 3), + let m = trainmode!(InstanceNorm(2; affine = true)), + sizes = (2, 4, 1, 2, 3), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) @@ -306,12 +319,13 @@ end end # check that μ, σ², and the output are the correct size for higher rank tensors - let m = InstanceNorm(2; affine = true, track_stats = true), sizes = (5, 5, 3, 4, 2, 6), + let m = InstanceNorm(2; affine = true, track_stats = true), + sizes = (5, 5, 3, 4, 2, 6), x = reshape(Float32.(collect(1:prod(sizes))), sizes) y = evalwgrad(m, x) - @test size(m.μ) == (sizes[end - 1],) - @test size(m.σ²) == (sizes[end - 1],) + @test size(m.μ) == (sizes[end-1],) + @test size(m.σ²) == (sizes[end-1],) @test size(y) == sizes @inferred m(x) @@ -319,11 +333,11 @@ end # show that instance norm is equal to batch norm when channel and batch dims are squashed let m_inorm = trainmode!(InstanceNorm(2; affine = true)), - m_bnorm = trainmode!(BatchNorm(12)), sizes = (5, 5, 3, 4, 2, 6), + m_bnorm = trainmode!(BatchNorm(12)), + sizes = (5, 5, 3, 4, 2, 6), x = reshape(Float32.(collect(1:prod(sizes))), sizes) - @test m_inorm(x) == - reshape(m_bnorm(reshape(x, (sizes[1:(end - 2)]..., :, 1))), sizes) + @test m_inorm(x) == reshape(m_bnorm(reshape(x, (sizes[1:(end-2)]..., :, 1))), sizes) end let m = InstanceNorm(32), x = randn(Float32, 416, 416, 32, 1) @@ -365,7 +379,8 @@ end # begin tests squeeze(x) = dropdims(x; dims = tuple(findall(size(x) .== 1)...)) # To remove all singular dimensions - let m = GroupNorm(4, 2; track_stats = true), sizes = (3, 4, 2), + let m = GroupNorm(4, 2; track_stats = true), + sizes = (3, 4, 2), x = reshape(collect(1:prod(sizes)), sizes) @test length(Flux.params(m)) == 2 @@ -409,19 +424,20 @@ end y = m(x) out = (z .- reshape(m.μ, 1, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 1, 2, 1) .+ 1.0f-5) - @test y≈reshape(out, size(x)) atol=1.0e-5 + @test y ≈ reshape(out, size(x)) atol = 1.0e-5 end # with activation function - let m = GroupNorm(4, 2, sigmoid; track_stats = true), sizes = (3, 4, 2), + let m = GroupNorm(4, 2, sigmoid; track_stats = true), + sizes = (3, 4, 2), x = reshape(collect(1:prod(sizes)), sizes) x = Float32.(x) μ_affine_shape = ones(Int, length(sizes) + 1) - μ_affine_shape[end - 1] = 2 # Number of groups + μ_affine_shape[end-1] = 2 # Number of groups affine_shape = ones(Int, length(sizes) + 1) - affine_shape[end - 2] = 2 # Channels per group - affine_shape[end - 1] = 2 # Number of groups + affine_shape[end-2] = 2 # Channels per group + affine_shape[end-1] = 2 # Number of groups affine_shape[1] = sizes[1] affine_shape[end] = sizes[end] @@ -429,12 +445,18 @@ end y = m(x) x_ = reshape(x, affine_shape...) - out = reshape(sigmoid.((x_ .- reshape(m.μ, μ_affine_shape...)) ./ - sqrt.(reshape(m.σ², μ_affine_shape...) .+ m.ϵ)), og_shape) - @test y≈out atol=1e-7 + out = reshape( + sigmoid.( + (x_ .- reshape(m.μ, μ_affine_shape...)) ./ + sqrt.(reshape(m.σ², μ_affine_shape...) .+ m.ϵ) + ), + og_shape, + ) + @test y ≈ out atol = 1e-7 end - let m = trainmode!(GroupNorm(2, 2; track_stats = true)), sizes = (2, 4, 1, 2, 3), + let m = trainmode!(GroupNorm(2, 2; track_stats = true)), + sizes = (2, 4, 1, 2, 3), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) @@ -443,7 +465,8 @@ end end # check that μ, σ², and the output are the correct size for higher rank tensors - let m = GroupNorm(4, 2; track_stats = true), sizes = (5, 5, 3, 4, 4, 6), + let m = GroupNorm(4, 2; track_stats = true), + sizes = (5, 5, 3, 4, 4, 6), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) y = evalwgrad(m, x) @@ -453,7 +476,8 @@ end end # show that group norm is the same as instance norm when the group size is the same as the number of channels - let IN = trainmode!(InstanceNorm(4; affine = true)), GN = trainmode!(GroupNorm(4, 4)), + let IN = trainmode!(InstanceNorm(4; affine = true)), + GN = trainmode!(GroupNorm(4, 4)), sizes = (2, 2, 3, 4, 5), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) @@ -461,7 +485,8 @@ end end # show that group norm is the same as batch norm for a group of size 1 and batch of size 1 - let BN = trainmode!(BatchNorm(4)), GN = trainmode!(GroupNorm(4, 4)), + let BN = trainmode!(BatchNorm(4)), + GN = trainmode!(GroupNorm(4, 4)), sizes = (2, 2, 3, 4, 1), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) diff --git a/test/layers/recurrent.jl b/test/layers/recurrent.jl index 19c1506095..225c4d83a9 100644 --- a/test/layers/recurrent.jl +++ b/test/layers/recurrent.jl @@ -2,7 +2,7 @@ using LinearAlgebra # Ref FluxML/Flux.jl#1209 1D input @testset "BPTT-1D" begin - seq = [rand(Float32, 2) for i in 1:3] + seq = [rand(Float32, 2) for i = 1:3] for r in [RNN] rnn = r(2 => 3) Flux.reset!(rnn) @@ -10,23 +10,29 @@ using LinearAlgebra return sum([rnn(s) for s in seq][3]) end Flux.reset!(rnn) - bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + - Wh * - tanh.(rnn.cell.Wi * seq[2] + - Wh * - tanh.(rnn.cell.Wi * seq[1] + - Wh * rnn.cell.state0 - + rnn.cell.b) - + rnn.cell.b) - + rnn.cell.b)), - rnn.cell.Wh) + bptt = gradient( + Wh -> sum( + tanh.( + rnn.cell.Wi * seq[3] + + Wh * + tanh.( + rnn.cell.Wi * seq[2] + + Wh * + tanh.(rnn.cell.Wi * seq[1] + Wh * rnn.cell.state0 + rnn.cell.b) + + rnn.cell.b + ) + + rnn.cell.b + ), + ), + rnn.cell.Wh, + ) @test grads_seq[rnn.cell.Wh] ≈ bptt[1] end end # Ref FluxML/Flux.jl#1209 2D input @testset "BPTT-2D" begin - seq = [rand(Float32, (2, 1)) for i in 1:3] + seq = [rand(Float32, (2, 1)) for i = 1:3] for r in [RNN] rnn = r(2 => 3) Flux.reset!(rnn) @@ -34,16 +40,22 @@ end return sum([rnn(s) for s in seq][3]) end Flux.reset!(rnn) - bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + - Wh * - tanh.(rnn.cell.Wi * seq[2] + - Wh * - tanh.(rnn.cell.Wi * seq[1] + - Wh * rnn.cell.state0 - + rnn.cell.b) - + rnn.cell.b) - + rnn.cell.b)), - rnn.cell.Wh) + bptt = gradient( + Wh -> sum( + tanh.( + rnn.cell.Wi * seq[3] + + Wh * + tanh.( + rnn.cell.Wi * seq[2] + + Wh * + tanh.(rnn.cell.Wi * seq[1] + Wh * rnn.cell.state0 + rnn.cell.b) + + rnn.cell.b + ) + + rnn.cell.b + ), + ), + rnn.cell.Wh, + ) @test grads_seq[rnn.cell.Wh] ≈ bptt[1] end end @@ -58,46 +70,44 @@ end Flux.reset!(rnn) bptt = gradient(rnn.cell.Wh) do Wh # calculate state 1 - s1 = tanh.(rnn.cell.Wi * seq[:, :, 1] + - Wh * rnn.cell.state0 + - rnn.cell.b) + s1 = tanh.(rnn.cell.Wi * seq[:, :, 1] + Wh * rnn.cell.state0 + rnn.cell.b) #calculate state 2 - s2 = tanh.(rnn.cell.Wi * seq[:, :, 2] + - Wh * s1 + - rnn.cell.b) + s2 = tanh.(rnn.cell.Wi * seq[:, :, 2] + Wh * s1 + rnn.cell.b) #calculate state 3 - s3 = tanh.(rnn.cell.Wi * seq[:, :, 3] + - Wh * s2 + - rnn.cell.b) + s3 = tanh.(rnn.cell.Wi * seq[:, :, 3] + Wh * s2 + rnn.cell.b) return sum(s3) # loss is sum of state 3 end @test grads_seq[rnn.cell.Wh] ≈ bptt[1] end -@testset "RNN-shapes" begin @testset for R in [RNN, GRU, LSTM, GRUv3] - m1 = R(3 => 5) - m2 = R(3 => 5) - m3 = R(3, 5) # leave one to test the silently deprecated "," not "=>" notation - x1 = rand(Float32, 3) - x2 = rand(Float32, 3, 1) - x3 = rand(Float32, 3, 1, 2) - Flux.reset!(m1) - Flux.reset!(m2) - Flux.reset!(m3) - @test size(m1(x1)) == (5,) - @test size(m1(x1)) == (5,) # repeat in case of effect from change in state shape - @test size(m2(x2)) == (5, 1) - @test size(m2(x2)) == (5, 1) - @test size(m3(x3)) == (5, 1, 2) - @test size(m3(x3)) == (5, 1, 2) -end end +@testset "RNN-shapes" begin + @testset for R in [RNN, GRU, LSTM, GRUv3] + m1 = R(3 => 5) + m2 = R(3 => 5) + m3 = R(3, 5) # leave one to test the silently deprecated "," not "=>" notation + x1 = rand(Float32, 3) + x2 = rand(Float32, 3, 1) + x3 = rand(Float32, 3, 1, 2) + Flux.reset!(m1) + Flux.reset!(m2) + Flux.reset!(m3) + @test size(m1(x1)) == (5,) + @test size(m1(x1)) == (5,) # repeat in case of effect from change in state shape + @test size(m2(x2)) == (5, 1) + @test size(m2(x2)) == (5, 1) + @test size(m3(x3)) == (5, 1, 2) + @test size(m3(x3)) == (5, 1, 2) + end +end -@testset "RNN-input-state-eltypes" begin @testset for R in [RNN, GRU, LSTM, GRUv3] - m = R(3 => 5) - x = rand(Float64, 3, 1) - Flux.reset!(m) - @test_throws MethodError m(x) -end end +@testset "RNN-input-state-eltypes" begin + @testset for R in [RNN, GRU, LSTM, GRUv3] + m = R(3 => 5) + x = rand(Float64, 3, 1) + Flux.reset!(m) + @test_throws MethodError m(x) + end +end @testset "multigate" begin x = rand(6, 5) @@ -113,14 +123,19 @@ end x = rand(3, 3, 1, 2, 4) @test length(Flux.eachlastdim(x)) == size(x, ndims(x)) @test collect(@inferred(Flux.eachlastdim(x))) == collect(eachslice(x; dims = ndims(x))) - slicedim = (size(x)[1:(end - 1)]..., 1) + slicedim = (size(x)[1:(end-1)]..., 1) res, (dx,) = Flux.withgradient(x) do x x1, _, x3, _ = Flux.eachlastdim(x) return sum(x1) + sum(x3 .* 3) end @test res ≈ sum(selectdim(x, ndims(x), 1)) + 3sum(selectdim(x, ndims(x), 3)) - @test dx ≈ cat(fill(1, slicedim), fill(0, slicedim), - fill(3, slicedim), fill(0, slicedim); dims = ndims(x)) + @test dx ≈ cat( + fill(1, slicedim), + fill(0, slicedim), + fill(3, slicedim), + fill(0, slicedim); + dims = ndims(x), + ) end @testset "∇eachlastdim" begin @@ -132,40 +147,57 @@ end NoTangent = Flux.Zygote.NoTangent abstract_zeros_vector = [ZeroTangent(), ZeroTangent(), NoTangent(), NoTangent()] @test @inferred(Flux.∇eachlastdim(abstract_zeros_vector, x)) == zeros(size(x)) - x2 = rand(Float64, x_size[1:(end - 1)]) - x3 = rand(Float64, x_size[1:(end - 1)]) + x2 = rand(Float64, x_size[1:(end-1)]) + x3 = rand(Float64, x_size[1:(end-1)]) mixed_vector = [ZeroTangent(), x2, x3, ZeroTangent()] - @test @inferred(Flux.∇eachlastdim(mixed_vector, x)) ≈ cat(zeros(x_size[1:(end - 1)]), - x2, - x3, - zeros(x_size[1:(end - 1)]); dims = ndims(x)) + @test @inferred(Flux.∇eachlastdim(mixed_vector, x)) ≈ + cat(zeros(x_size[1:(end-1)]), x2, x3, zeros(x_size[1:(end-1)]); dims = ndims(x)) end @testset "Different Internal Matrix Types" begin - R = Flux.Recur(Flux.RNNCell(tanh, rand(5, 3), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1))) - # don't want to pull in SparseArrays just for this test, but there aren't any - # non-square structured matrix types in LinearAlgebra. so we will use a different - # eltype matrix, which would fail before when `W_i` and `W_h` were required to be the - # same type. - L = Flux.Recur(Flux.LSTMCell(rand(5*4, 3), rand(1:20, 5*4, 5), rand(5*4), (rand(5, 1), rand(5, 1)))) - G = Flux.Recur(Flux.GRUCell(rand(5*3, 3), rand(1:20, 5*3, 5), rand(5*3), rand(5, 1))) - G3 = Flux.Recur(Flux.GRUv3Cell(rand(5*3, 3), rand(1:20, 5*2, 5), rand(5*3), Tridiagonal(rand(5, 5)), rand(5, 1))) + R = Flux.Recur( + Flux.RNNCell(tanh, rand(5, 3), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1)), + ) + # don't want to pull in SparseArrays just for this test, but there aren't any + # non-square structured matrix types in LinearAlgebra. so we will use a different + # eltype matrix, which would fail before when `W_i` and `W_h` were required to be the + # same type. + L = Flux.Recur( + Flux.LSTMCell( + rand(5 * 4, 3), + rand(1:20, 5 * 4, 5), + rand(5 * 4), + (rand(5, 1), rand(5, 1)), + ), + ) + G = Flux.Recur( + Flux.GRUCell(rand(5 * 3, 3), rand(1:20, 5 * 3, 5), rand(5 * 3), rand(5, 1)), + ) + G3 = Flux.Recur( + Flux.GRUv3Cell( + rand(5 * 3, 3), + rand(1:20, 5 * 2, 5), + rand(5 * 3), + Tridiagonal(rand(5, 5)), + rand(5, 1), + ), + ) - for m in [R, L, G, G3] + for m in [R, L, G, G3] - x1 = rand(3) - x2 = rand(3, 1) - x3 = rand(3, 1, 2) - Flux.reset!(m) - @test size(m(x1)) == (5,) - Flux.reset!(m) - @test size(m(x1)) == (5,) # repeat in case of effect from change in state shape - @test size(m(x2)) == (5, 1) - Flux.reset!(m) - @test size(m(x2)) == (5, 1) - Flux.reset!(m) - @test size(m(x3)) == (5, 1, 2) - Flux.reset!(m) - @test size(m(x3)) == (5, 1, 2) - end + x1 = rand(3) + x2 = rand(3, 1) + x3 = rand(3, 1, 2) + Flux.reset!(m) + @test size(m(x1)) == (5,) + Flux.reset!(m) + @test size(m(x1)) == (5,) # repeat in case of effect from change in state shape + @test size(m(x2)) == (5, 1) + Flux.reset!(m) + @test size(m(x2)) == (5, 1) + Flux.reset!(m) + @test size(m(x3)) == (5, 1, 2) + Flux.reset!(m) + @test size(m(x3)) == (5, 1, 2) + end end diff --git a/test/layers/upsample.jl b/test/layers/upsample.jl index c4e1c30341..66831d3d68 100644 --- a/test/layers/upsample.jl +++ b/test/layers/upsample.jl @@ -2,19 +2,19 @@ m = Upsample(:bilinear; scale = (2, 3)) x = rand(Float32, 3, 4, 2, 3) y = m(x) - @test y isa Array{Float32, 4} + @test y isa Array{Float32,4} @test size(y) == (6, 12, 2, 3) m = Upsample(:bilinear; scale = 3) x = rand(Float32, 3, 4, 2, 3) y = m(x) - @test y isa Array{Float32, 4} + @test y isa Array{Float32,4} @test size(y) == (9, 12, 2, 3) m = Upsample(:bilinear; size = (4, 6)) x = rand(Float32, 3, 4, 2, 3) y = m(x) - @test y isa Array{Float32, 4} + @test y isa Array{Float32,4} @test size(y) == (4, 6, 2, 3) end @@ -22,19 +22,19 @@ end m = Upsample(:trilinear; scale = (2, 3, 2)) x = rand(Float32, 3, 4, 2, 3, 4) y = m(x) - @test y isa Array{Float32, 5} + @test y isa Array{Float32,5} @test size(y) == (6, 12, 4, 3, 4) m = Upsample(:trilinear; scale = 3) x = rand(Float32, 3, 4, 2, 3, 4) y = m(x) - @test y isa Array{Float32, 5} + @test y isa Array{Float32,5} @test size(y) == (9, 12, 6, 3, 4) m = Upsample(:trilinear; size = (4, 6, 4)) x = rand(Float32, 3, 4, 2, 3, 4) y = m(x) - @test y isa Array{Float32, 5} + @test y isa Array{Float32,5} @test size(y) == (4, 6, 4, 3, 4) end @@ -42,24 +42,24 @@ end x = rand(Float32, 3, 2, 3) m = Upsample(:nearest; scale = (2,)) y = m(x) - @test y isa Array{Float32, 3} + @test y isa Array{Float32,3} @test size(y) == (6, 2, 3) x = rand(Float32, 3, 4, 2, 3) m = Upsample(:nearest; scale = (2, 3)) y = m(x) - @test y isa Array{Float32, 4} + @test y isa Array{Float32,4} @test size(y) == (6, 12, 2, 3) m = Upsample(:nearest; scale = (2,)) y = m(x) - @test y isa Array{Float32, 4} + @test y isa Array{Float32,4} @test size(y) == (6, 4, 2, 3) m = Upsample(:nearest; scale = 2) y = m(x) - @test y isa Array{Float32, 4} + @test y isa Array{Float32,4} @test size(y) == (6, 8, 2, 3) m = Upsample(2) @@ -68,7 +68,7 @@ end m = Upsample(:nearest; size = (6, 8)) y = m(x) - @test y isa Array{Float32, 4} + @test y isa Array{Float32,4} @test size(y) == (6, 8, 2, 3) end @@ -76,12 +76,12 @@ end m = PixelShuffle(2) x = rand(Float32, 3, 18, 3) y = m(x) - @test y isa Array{Float32, 3} + @test y isa Array{Float32,3} @test size(y) == (6, 9, 3) m = PixelShuffle(3) x = rand(Float32, 3, 4, 18, 3) y = m(x) - @test y isa Array{Float32, 4} + @test y isa Array{Float32,4} @test size(y) == (9, 12, 2, 3) end diff --git a/test/losses.jl b/test/losses.jl index f8f261c7c8..7984941c78 100644 --- a/test/losses.jl +++ b/test/losses.jl @@ -1,22 +1,35 @@ using Test using Flux: onehotbatch, σ -using Flux.Losses: mse, label_smoothing, crossentropy, logitcrossentropy, - binarycrossentropy, logitbinarycrossentropy +using Flux.Losses: + mse, + label_smoothing, + crossentropy, + logitcrossentropy, + binarycrossentropy, + logitbinarycrossentropy using Flux.Losses: xlogx, xlogy # group here all losses, used in tests -const ALL_LOSSES = [Flux.Losses.mse, Flux.Losses.mae, Flux.Losses.msle, - Flux.Losses.crossentropy, Flux.Losses.logitcrossentropy, - Flux.Losses.binarycrossentropy, Flux.Losses.logitbinarycrossentropy, +const ALL_LOSSES = [ + Flux.Losses.mse, + Flux.Losses.mae, + Flux.Losses.msle, + Flux.Losses.crossentropy, + Flux.Losses.logitcrossentropy, + Flux.Losses.binarycrossentropy, + Flux.Losses.logitbinarycrossentropy, Flux.Losses.kldivergence, Flux.Losses.huber_loss, Flux.Losses.tversky_loss, Flux.Losses.dice_coeff_loss, Flux.Losses.poisson_loss, - Flux.Losses.hinge_loss, Flux.Losses.squared_hinge_loss, - Flux.Losses.binary_focal_loss, Flux.Losses.focal_loss, - Flux.Losses.siamese_contrastive_loss] + Flux.Losses.hinge_loss, + Flux.Losses.squared_hinge_loss, + Flux.Losses.binary_focal_loss, + Flux.Losses.focal_loss, + Flux.Losses.siamese_contrastive_loss, +] @testset "xlogx & xlogy" begin @test iszero(xlogx(0)) @@ -45,13 +58,19 @@ y = [1, 1, 0, 0] @test mse(0 + 0im, 1 + 1im) == 2 end -@testset "mae" begin @test Flux.mae(ŷ, y) ≈ 1 / 2 end +@testset "mae" begin + @test Flux.mae(ŷ, y) ≈ 1 / 2 +end -@testset "huber_loss" begin @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002 end +@testset "huber_loss" begin + @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002 +end y = [123.0, 456.0, 789.0] ŷ = [345.0, 332.0, 789.0] -@testset "msle" begin @test Flux.msle(ŷ, y) ≈ 0.38813985859136585 end +@testset "msle" begin + @test Flux.msle(ŷ, y) ≈ 0.38813985859136585 +end # Now onehot y's y = onehotbatch([1, 1, 0, 0], 0:1) @@ -105,8 +124,10 @@ yls = y .* (1 - 2sf) .+ sf -yls .* log.(σ.(logŷ)) - (1 .- yls) .* log.(1 .- σ.(logŷ)) @test binarycrossentropy(σ.(logŷ), y; ϵ = 0) ≈ mean(-y .* log.(σ.(logŷ)) - (1 .- y) .* log.(1 .- σ.(logŷ))) - @test binarycrossentropy(σ.(logŷ), y) ≈ mean(-y .* log.(σ.(logŷ) .+ eps.(σ.(logŷ))) - - (1 .- y) .* log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ)))) + @test binarycrossentropy(σ.(logŷ), y) ≈ mean( + -y .* log.(σ.(logŷ) .+ eps.(σ.(logŷ))) - + (1 .- y) .* log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ))), + ) @test binarycrossentropy([0.1, 0.2, 0.9], 1) ≈ -mean(log, [0.1, 0.2, 0.9]) # constant label end @@ -170,68 +191,94 @@ end @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075 end -@testset "no spurious promotions" begin for T in (Float32, Float64) - y = rand(T, 2) - ŷ = rand(T, 2) - for f in ALL_LOSSES - fwd, back = Flux.pullback(f, ŷ, y) - @test fwd isa T - @test eltype(back(one(T))[1]) == T +@testset "no spurious promotions" begin + for T in (Float32, Float64) + y = rand(T, 2) + ŷ = rand(T, 2) + for f in ALL_LOSSES + fwd, back = Flux.pullback(f, ŷ, y) + @test fwd isa T + @test eltype(back(one(T))[1]) == T + end end -end end +end @testset "binary_focal_loss" begin - y = [0 1 0 - 1 0 1] - ŷ = [0.268941 0.5 0.268941 - 0.731059 0.5 0.731059] - - y1 = [1 0 - 0 1] - ŷ1 = [0.6 0.3 - 0.4 0.7] + y = [ + 0 1 0 + 1 0 1 + ] + ŷ = [ + 0.268941 0.5 0.268941 + 0.731059 0.5 0.731059 + ] + + y1 = [ + 1 0 + 0 1 + ] + ŷ1 = [ + 0.6 0.3 + 0.4 0.7 + ] @test Flux.binary_focal_loss(ŷ, y) ≈ 0.0728675615927385 @test Flux.binary_focal_loss(ŷ1, y1) ≈ 0.05691642237852222 @test Flux.binary_focal_loss(ŷ, y; γ = 0.0) ≈ Flux.binarycrossentropy(ŷ, y) end @testset "focal_loss" begin - y = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] + y = [ + 1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0 + ] ŷ = softmax(reshape(-7:7, 3, 5) .* 1.0f0) - y1 = [1 0 - 0 0 - 0 1] - ŷ1 = [0.4 0.2 - 0.5 0.5 - 0.1 0.3] + y1 = [ + 1 0 + 0 0 + 0 1 + ] + ŷ1 = [ + 0.4 0.2 + 0.5 0.5 + 0.1 0.3 + ] @test Flux.focal_loss(ŷ, y) ≈ 1.1277571935622628 @test Flux.focal_loss(ŷ1, y1) ≈ 0.45990566879720157 @test Flux.focal_loss(ŷ, y; γ = 0.0) ≈ Flux.crossentropy(ŷ, y) end @testset "siamese_contrastive_loss" begin - y = [1 0 - 0 0 - 0 1] - ŷ = [0.4 0.2 - 0.5 0.5 - 0.1 0.3] - y1 = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] + y = [ + 1 0 + 0 0 + 0 1 + ] + ŷ = [ + 0.4 0.2 + 0.5 0.5 + 0.1 0.3 + ] + y1 = [ + 1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0 + ] ŷ1 = softmax(reshape(-7:7, 3, 5) .* 1.0f0) - y2 = [1 - 0 - 0 - 1 - 1] - ŷ2 = [0.6 - 0.4 - 0.1 - 0.2 - 0.7] + y2 = [ + 1 + 0 + 0 + 1 + 1 + ] + ŷ2 = [ + 0.6 + 0.4 + 0.1 + 0.2 + 0.7 + ] @test Flux.siamese_contrastive_loss(ŷ, y) ≈ 0.2333333333333333 @test Flux.siamese_contrastive_loss(ŷ, y; margin = 0.5f0) ≈ 0.10000000000000002 @test Flux.siamese_contrastive_loss(ŷ, y; margin = 1.5f0) ≈ 0.5333333333333333 @@ -246,10 +293,14 @@ end @test Flux.siamese_contrastive_loss(ŷ1, y1; margin = 0) ≈ 0.13161165f0 @test Flux.siamese_contrastive_loss(ŷ2, y2) ≈ 0.21200000000000005 @test Flux.siamese_contrastive_loss(ŷ2, ŷ2) ≈ 0.18800000000000003 - @test_throws DomainError(-0.5, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ1, - y1, - margin = -0.5) - @test_throws DomainError(-1, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ, - y, - margin = -1) + @test_throws DomainError(-0.5, "Margin must be non-negative") Flux.siamese_contrastive_loss( + ŷ1, + y1, + margin = -0.5, + ) + @test_throws DomainError(-1, "Margin must be non-negative") Flux.siamese_contrastive_loss( + ŷ, + y, + margin = -1, + ) end diff --git a/test/optimise.jl b/test/optimise.jl index 49e5c6c913..9f9f788c01 100644 --- a/test/optimise.jl +++ b/test/optimise.jl @@ -10,14 +10,27 @@ using Random # so that w and w' are different Random.seed!(84) w = randn(10, 10) - @testset for opt in [AdamW(), AdaGrad(0.1), AdaMax(), AdaDelta(0.9), AMSGrad(), - NAdam(), RAdam(), Descent(0.1), Adam(), OAdam(), AdaBelief(), - Nesterov(), RMSProp(), Momentum()] + @testset for opt in [ + AdamW(), + AdaGrad(0.1), + AdaMax(), + AdaDelta(0.9), + AMSGrad(), + NAdam(), + RAdam(), + Descent(0.1), + Adam(), + OAdam(), + AdaBelief(), + Nesterov(), + RMSProp(), + Momentum(), + ] Random.seed!(42) w′ = randn(10, 10) b = false loss(x) = Flux.Losses.mse(w * x, w′ * x .+ b) - for t in 1:(10^5) + for t = 1:(10^5) θ = params([w′, b]) x = rand(10) θ̄ = gradient(() -> loss(x), θ) @@ -35,7 +48,7 @@ end w′ = randn(10, 10) loss(x) = Flux.Losses.mse(w * x, w′ * x) opt = Optimiser(Opt(), Adam(0.001)) - for t in 1:(10^5) + for t = 1:(10^5) θ = Params([w′]) x = rand(10) θ̄ = gradient(() -> loss(x), θ) @@ -48,26 +61,32 @@ end @testset "Training Loop" begin i = 0 l = 1 - Flux.train!(() -> (sleep(0.1); Flux.skip(); i += 1), - Params([]), - Iterators.repeated((), 10), - Descent()) + Flux.train!( + () -> (sleep(0.1); Flux.skip(); i += 1), + Params([]), + Iterators.repeated((), 10), + Descent(), + ) @test i == 0 #all skipped - Flux.train!(() -> (sleep(0.1); i == 8 && Flux.skip(); i += 1), - Params([]), - Iterators.repeated((), 10), - Descent()) + Flux.train!( + () -> (sleep(0.1); i == 8 && Flux.skip(); i += 1), + Params([]), + Iterators.repeated((), 10), + Descent(), + ) @test i == 8 #skip after i hit 8 i = 0 - Flux.train!(() -> (sleep(0.1); i += 1; l), - Params([]), - Iterators.repeated((), 100), - Descent(); - cb = Flux.throttle(() -> (i > 3 && Flux.stop()), 1)) + Flux.train!( + () -> (sleep(0.1); i += 1; l), + Params([]), + Iterators.repeated((), 100), + Descent(); + cb = Flux.throttle(() -> (i > 3 && Flux.stop()), 1), + ) @test 3 < i < 50 @@ -109,7 +128,7 @@ end loss(x) = Flux.Losses.mse(w * x, w1 * x) flag = 1 decay_steps = [] - for t in 1:(10^5) + for t = 1:(10^5) prev_eta = o.eta θ = Params([w1]) x = rand(10) @@ -129,7 +148,7 @@ end @test flag == 1 # Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1). ground_truth = [] - for i in 1:4 + for i = 1:4 push!(ground_truth, 1000 * i) # Expected decay steps for this example. end @test decay_steps == ground_truth @@ -204,30 +223,31 @@ end # wreaks all sorts of havoc on our training loops. This test ensures that # a simple optimization is montonically decreasing (up to learning step effects) @testset "Momentum Optimisers and complex values" begin -# Test every optimizer that has momentum internally -for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief] - # Our "model" is just a complex number - w = zeros(ComplexF32, 1) - - # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x` - function loss() - # Deterministic training data is the best training data - x = ones(1, 1) + 1im * ones(1, 1) - - # Manually implement `mse()` to allow demonstration of brokenness - # on older Flux builds that don't have a fixed `mse()` - return sum(abs2.(w * x .- conj(x))) - end + # Test every optimizer that has momentum internally + for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief] + # Our "model" is just a complex number + w = zeros(ComplexF32, 1) + + # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x` + function loss() + # Deterministic training data is the best training data + x = ones(1, 1) + 1im * ones(1, 1) + + # Manually implement `mse()` to allow demonstration of brokenness + # on older Flux builds that don't have a fixed `mse()` + return sum(abs2.(w * x .- conj(x))) + end - params = Flux.Params([w]) - opt = opt_ctor(1e-2) + params = Flux.Params([w]) + opt = opt_ctor(1e-2) - # Train for 10 iterations, enforcing that loss is monotonically decreasing - last_loss = Inf - for idx in 1:10 - grads = Flux.gradient(loss, params) - @test loss() < last_loss - last_loss = loss() - Flux.update!(opt, params, grads) + # Train for 10 iterations, enforcing that loss is monotonically decreasing + last_loss = Inf + for idx = 1:10 + grads = Flux.gradient(loss, params) + @test loss() < last_loss + last_loss = loss() + Flux.update!(opt, params, grads) + end end -end end +end diff --git a/test/outputsize.jl b/test/outputsize.jl index c3c2c7ae7b..64eda2af31 100644 --- a/test/outputsize.jl +++ b/test/outputsize.jl @@ -3,7 +3,7 @@ @test outputsize(m, (10, 10, 3, 1)) == (6, 6, 32, 1) m = Dense(10, 5) - @test_throws DimensionMismatch outputsize(m, (5, 2))==(5, 1) + @test_throws DimensionMismatch outputsize(m, (5, 2)) == (5, 1) @test outputsize(m, (10,); padbatch = true) == (5, 1) m = Chain(Dense(10, 8, σ), Dense(8, 5), Dense(5, 2)) @@ -55,12 +55,33 @@ end @test outputsize(m, (2, 7), (3, 7)) == (13, 7) end -@testset "activations" begin @testset for f in [celu, elu, gelu, hardsigmoid, hardtanh, - leakyrelu, lisht, logcosh, logσ, mish, - relu, relu6, rrelu, selu, σ, softplus, - softshrink, softsign, swish, tanhshrink, trelu] - @test outputsize(Dense(10, 5, f), (10, 1)) == (5, 1) -end end +@testset "activations" begin + @testset for f in [ + celu, + elu, + gelu, + hardsigmoid, + hardtanh, + leakyrelu, + lisht, + logcosh, + logσ, + mish, + relu, + relu6, + rrelu, + selu, + σ, + softplus, + softshrink, + softsign, + swish, + tanhshrink, + trelu, + ] + @test outputsize(Dense(10, 5, f), (10, 1)) == (5, 1) + end +end @testset "conv" begin m = Conv((3, 3), 3 => 16) diff --git a/test/runtests.jl b/test/runtests.jl index 4189ea0dd5..2a1b2913ca 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,12 +10,18 @@ using CUDA Random.seed!(0) -@testset verbose=true "Flux.jl" begin - @testset "Utils" begin include("utils.jl") end +@testset verbose = true "Flux.jl" begin + @testset "Utils" begin + include("utils.jl") + end - @testset "Optimise" begin include("optimise.jl") end + @testset "Optimise" begin + include("optimise.jl") + end - @testset "Data" begin include("data.jl") end + @testset "Data" begin + include("data.jl") + end @testset "Losses" begin include("losses.jl") @@ -38,11 +44,13 @@ Random.seed!(0) include("outputsize.jl") end - @testset "CUDA" begin if CUDA.functional() - include("cuda/runtests.jl") - else - @warn "CUDA unavailable, not testing GPU support" - end end + @testset "CUDA" begin + if CUDA.functional() + include("cuda/runtests.jl") + else + @warn "CUDA unavailable, not testing GPU support" + end + end @static if VERSION == v"1.6" using Documenter diff --git a/test/utils.jl b/test/utils.jl index f71e336c8a..7da452ba02 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,8 +1,22 @@ using Flux -using Flux: throttle, nfan, glorot_uniform, glorot_normal, - kaiming_normal, kaiming_uniform, orthogonal, truncated_normal, - sparse_init, identity_init, unstack, batch, unbatch, - unsqueeze, params, loadparams!, loadmodel! +using Flux: + throttle, + nfan, + glorot_uniform, + glorot_normal, + kaiming_normal, + kaiming_uniform, + orthogonal, + truncated_normal, + sparse_init, + identity_init, + unstack, + batch, + unbatch, + unsqueeze, + params, + loadparams!, + loadmodel! using MLUtils using StatsBase: var, std using Statistics, LinearAlgebra @@ -71,8 +85,10 @@ end end @testset "Basics: $init" for init in [ - glorot_uniform, glorot_normal, - kaiming_uniform, kaiming_normal, + glorot_uniform, + glorot_normal, + kaiming_uniform, + kaiming_normal, orthogonal, sparse_init, truncated_normal, @@ -89,7 +105,7 @@ end end @test size(init(3, 4)) == (3, 4) # only init(size...) is accepted: - @test_throws MethodError size(init((3, 4, 5)))==(3, 4, 5) + @test_throws MethodError size(init((3, 4, 5))) == (3, 4, 5) # rng, and currying: @test size(init(MersenneTwister(1), 3, 4)) == (3, 4) @@ -164,8 +180,8 @@ end for (n_in, n_out, sparsity, σ) in [(100, 100, 0.25, 0.1), (100, 400, 0.75, 0.01)] expected_zeros = ceil(Integer, n_in * sparsity) v = sparse_init(n_in, n_out; sparsity = sparsity, std = σ) - @test all([sum(v[:, col] .== 0) == expected_zeros for col in 1:n_out]) - @test 0.9 * σ < std(v[v .!= 0]) < 1.1 * σ + @test all([sum(v[:, col] .== 0) == expected_zeros for col = 1:n_out]) + @test 0.9 * σ < std(v[v.!=0]) < 1.1 * σ end @test eltype(sparse_init(3, 4; std = 1.5, sparsity = 0.5)) == Float32 @@ -173,9 +189,9 @@ end @testset "truncated_normal" begin m = truncated_normal(100, 100) - @test minimum(m)≈-2 atol=0.05 # default arguments - @test maximum(m)≈2 atol=0.05 - @test mean(m)≈0 atol=0.1 + @test minimum(m) ≈ -2 atol = 0.05 # default arguments + @test maximum(m) ≈ 2 atol = 0.05 + @test mean(m) ≈ 0 atol = 0.1 size100 = (100, 100, 100) for (μ, σ, lo, hi) in [(0.0, 1, -2, 3), (1, 2, -4.0, 5.0)] @@ -225,15 +241,12 @@ end indata = reshape(collect(Float32, 1:9), 3, 3) @test l(indata) == indata end - @testset "$layer ID mapping with kernelsize $kernelsize" for layer in (Conv, - ConvTranspose, - CrossCor), - kernelsize in ((1,), - (3,), - (1, 3), - (3, 5), - (3, 5, - 7)) + @testset "$layer ID mapping with kernelsize $kernelsize" for layer in ( + Conv, + ConvTranspose, + CrossCor, + ), + kernelsize in ((1,), (3,), (1, 3), (3, 5), (3, 5, 7)) nch = 3 l = layer(kernelsize, nch => nch; init = identity_init, pad = SamePad()) @@ -244,10 +257,18 @@ end @testset "Inception identity" begin insize = 7 path1 = Conv((1, 3), insize => 2; init = identity_init, pad = SamePad()) - path2 = Conv((3, 5), insize => 3; init = identity_init(; shift = (0, 0, 2, 0)), - pad = SamePad()) - path3 = Conv((5, 7), insize => 2; init = identity_init(; shift = (0, 0, 5, 0)), - pad = SamePad()) + path2 = Conv( + (3, 5), + insize => 3; + init = identity_init(; shift = (0, 0, 2, 0)), + pad = SamePad(), + ) + path3 = Conv( + (5, 7), + insize => 2; + init = identity_init(; shift = (0, 0, 5, 0)), + pad = SamePad(), + ) block = Parallel((xs...) -> cat(xs...; dims = 3), path1, path2, path3) indata = randn(Float32, 9, 9, 7, 2) @@ -295,7 +316,7 @@ end @test f32(m).bias === m.bias === false @testset "Gradients for broadcasted $op with sizes $s" for op in (+, -, *), - s in ((1,), (2, 3)) + s in ((1,), (2, 3)) o = ones(s) z = zeros(s) @@ -346,10 +367,12 @@ end end @testset "Batching" begin - stacked_array = [8 9 3 5 - 9 6 6 9 - 9 1 7 2 - 7 4 10 6] + stacked_array = [ + 8 9 3 5 + 9 6 6 9 + 9 1 7 2 + 7 4 10 6 + ] unstacked_array = [[8, 9, 9, 7], [9, 6, 1, 4], [3, 6, 7, 10], [5, 9, 2, 6]] @test unbatch(stacked_array) == unstacked_array @test batch(unstacked_array) == stacked_array @@ -359,20 +382,18 @@ end @test unbatch([1, 2, 3]) == [1, 2, 3] # generic iterable - @test batch(ones(2) for i in 1:3) == ones(2, 3) - @test unbatch(ones(2, 3)) == [ones(2) for i in 1:3] + @test batch(ones(2) for i = 1:3) == ones(2, 3) + @test unbatch(ones(2, 3)) == [ones(2) for i = 1:3] end @testset "Param remapping" begin ls(dims...) = reshape(collect(Float32, 1:prod(dims)), dims...) # accepts dims in reverse order to Dense dl(nin, nout, bias) = Dense(ls(nout, nin), bias(nout)) - dm(bias) = Chain(dl(3, 5, bias), - dl(5, 4, bias), - dl(4, 3, bias)) + dm(bias) = Chain(dl(3, 5, bias), dl(5, 4, bias), dl(4, 3, bias)) nobias(n) = false - testdense(m, bt) = @testset "Check layer $i" for (i, (l1, l2)) in enumerate(zip(m, - dm(bt))) + testdense(m, bt) = @testset "Check layer $i" for (i, (l1, l2)) in + enumerate(zip(m, dm(bt))) @test l1.weight == l2.weight @test l1.bias == l2.bias @test_skip typeof(l1.bias) === typeof(l2.bias) @@ -420,8 +441,12 @@ end # tests for BatchNorm and Dropout m1 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), Flux.flatten, Dropout(0.2)) - m2 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), x -> reshape(x, :, size(x)[end]), - Dropout(0.1)) + m2 = Chain( + Conv((3, 3), 3 => 16), + BatchNorm(16), + x -> reshape(x, :, size(x)[end]), + Dropout(0.1), + ) m2[2].μ .= rand(Float32, size(m2[2].μ)...) loadmodel!(m1, m2) # non-trainable parameters are copied as well @@ -436,36 +461,40 @@ end # tests MaxPool # tests testmode!/trainmode! is not copied # tests Dense, Conv, BatchNorm, Dropout (like above) but in a bigger model - chain1 = Chain(Dropout(0.2), - Conv((3, 3), 1 => 32, relu), - BatchNorm(32, relu), - MaxPool((2, 2)), - Dropout(0.2), - Conv((3, 3), 32 => 16, relu), - Dropout(0.2), - MaxPool((2, 2)), - Dropout(0.2), - Conv((3, 3), 16 => 10, relu), - Dropout(0.2), - x -> reshape(x, :, size(x, 4)), - Dropout(0.2), - Dense(90, 10), - softmax) - chain2 = Chain([Dropout(0.1), - Conv((3, 3), 1 => 32, relu), - BatchNorm(32, relu), - MaxPool((3, 3)), - Dropout(0.1), - Conv((3, 3), 32 => 16, relu), - Dropout(0.1), - MaxPool((3, 3)), - Dropout(0.1), - Conv((3, 3), 16 => 10, relu), - Dropout(0.1), - x -> reshape(x, :, size(x, 4)), - Dropout(0.1), - Dense(90, 10), - softmax]) + chain1 = Chain( + Dropout(0.2), + Conv((3, 3), 1 => 32, relu), + BatchNorm(32, relu), + MaxPool((2, 2)), + Dropout(0.2), + Conv((3, 3), 32 => 16, relu), + Dropout(0.2), + MaxPool((2, 2)), + Dropout(0.2), + Conv((3, 3), 16 => 10, relu), + Dropout(0.2), + x -> reshape(x, :, size(x, 4)), + Dropout(0.2), + Dense(90, 10), + softmax, + ) + chain2 = Chain([ + Dropout(0.1), + Conv((3, 3), 1 => 32, relu), + BatchNorm(32, relu), + MaxPool((3, 3)), + Dropout(0.1), + Conv((3, 3), 32 => 16, relu), + Dropout(0.1), + MaxPool((3, 3)), + Dropout(0.1), + Conv((3, 3), 16 => 10, relu), + Dropout(0.1), + x -> reshape(x, :, size(x, 4)), + Dropout(0.1), + Dense(90, 10), + softmax, + ]) chain2[3].μ .= 5.0f0 chain2[3].σ² .= 2.0f0 testmode!(chain2) @@ -473,7 +502,7 @@ end for (dst, src) in zip(chain1, chain2) if dst isa Dropout @test dst.p == 0.2 - elseif dst isa Union{Conv, Dense} + elseif dst isa Union{Conv,Dense} @test dst.weight == src.weight @test dst.bias == src.bias elseif dst isa MaxPool @@ -486,12 +515,12 @@ end end # copy only a subset of the model - chain1[end - 1].weight .= 1.0f0 + chain1[end-1].weight .= 1.0f0 chain1[3].μ .= 3.0f0 chain1[2].bias .= 5.0f0 - loadmodel!(chain2[end - 1], chain1[end - 1]) + loadmodel!(chain2[end-1], chain1[end-1]) loadmodel!(chain2[3], chain1[3]) - @test chain2[end - 1].weight == chain1[end - 1].weight + @test chain2[end-1].weight == chain1[end-1].weight @test chain2[3].μ == chain1[3].μ @test chain2[2].bias != chain1[2].bias @@ -602,17 +631,18 @@ end @test modules[5] === m2 @test modules[6] === m3 - mod_par = Flux.modules(Parallel(Flux.Bilinear(2, 2, 2, cbrt), Dense(2, 2, abs), - Dense(2, 2, abs2))) + mod_par = Flux.modules( + Parallel(Flux.Bilinear(2, 2, 2, cbrt), Dense(2, 2, abs), Dense(2, 2, abs2)), + ) @test length(mod_par) == 5 mod_rnn = Flux.modules(Chain(Dense(2, 3), BatchNorm(3), LSTM(3, 4))) @test length(mod_rnn) == 6 @test mod_rnn[end] isa Flux.LSTMCell - mod_skip = Flux.modules(Chain(SkipConnection(Conv((2, 3), 4 => 5; pad = 6, stride = 7), - +), - LayerNorm(8))) + mod_skip = Flux.modules( + Chain(SkipConnection(Conv((2, 3), 4 => 5; pad = 6, stride = 7), +), LayerNorm(8)), + ) @test length(mod_skip) == 6 @test mod_skip[end] isa Flux.Scale end @@ -631,7 +661,7 @@ end end n_iter = 0 - for i in 1:length(v) + for i = 1:length(v) trigger(i) && break n_iter += 1 end @@ -653,8 +683,11 @@ end end @testset "distance" begin - es = Flux.early_stopping(identity, 10; - distance = (best_score, score) -> score - best_score) + es = Flux.early_stopping( + identity, + 10; + distance = (best_score, score) -> score - best_score, + ) n_iter = 0 while n_iter < 99 @@ -718,8 +751,7 @@ end return out = m.dense(x) end - model = TwoDenses(Dense(3, 1), - Dense(3, 2)) + model = TwoDenses(Dense(3, 1), Dense(3, 2)) p, re = Flux.destructure(model) x = [1.0, 2.0, 3.0] @@ -781,8 +813,10 @@ end n_outputs = [3, 7] data = rand(Float32, n_input, n_batch) - model = Chain(Dense(n_input, n_shared), - Split(Dense(n_shared, n_outputs[1]), Dense(n_shared, n_outputs[2]))) + model = Chain( + Dense(n_input, n_shared), + Split(Dense(n_shared, n_outputs[1]), Dense(n_shared, n_outputs[2])), + ) pvec, re = Flux.destructure(model) loss(x, idx, pv) = sum(abs2, re(pv)(x)[idx]) # loss wrt `idx`th output term @@ -792,16 +826,20 @@ end end end -@testset "Rrule" begin @testset "issue 2033" begin if CUDA.functional() - struct Wrapped{T} - x::T +@testset "Rrule" begin + @testset "issue 2033" begin + if CUDA.functional() + struct Wrapped{T} + x::T + end + y, _ = Flux.pullback(Wrapped, cu(randn(3, 3))) + @test y isa Wrapped{<:CuArray} + end end - y, _ = Flux.pullback(Wrapped, cu(randn(3, 3))) - @test y isa Wrapped{<:CuArray} -end end end +end # make sure rng_from_array is non_differentiable @testset "rng_from_array" begin - m(x) = (rand(rng_from_array(x)) * x)[1] + m(x) = (rand(rng_from_array(x))*x)[1] gradient(m, ones(2)) end