From bf30ed78e89f9a652d885d1848a1d41f414dffc8 Mon Sep 17 00:00:00 2001 From: Saransh Chopra Date: Wed, 19 Oct 2022 14:09:15 +0530 Subject: [PATCH 1/2] Massive reformat --- .JuliaFormatter.toml | 9 + docs/make.jl | 117 +-- perf/bench_utils.jl | 25 +- perf/conv.jl | 6 +- perf/dense.jl | 4 +- perf/recurrent.jl | 78 +- perf/vgg.jl | 82 +- src/Flux.jl | 17 +- src/cuda/cudnn.jl | 32 +- src/deprecations.jl | 71 +- src/functor.jl | 99 +-- src/layers/basic.jl | 413 +++++----- src/layers/conv.jl | 509 ++++++------ src/layers/normalise.jl | 472 ++++++------ src/layers/recurrent.jl | 320 ++++---- src/layers/show.jl | 173 +++-- src/layers/stateless.jl | 26 +- src/layers/upsample.jl | 74 +- src/loading.jl | 105 +-- src/losses/Losses.jl | 20 +- src/losses/functions.jl | 145 ++-- src/losses/utils.jl | 27 +- src/optimise/Optimise.jl | 8 +- src/optimise/optimisers.jl | 463 +++++------ src/optimise/train.jl | 111 +-- src/outputsize.jl | 224 +++--- src/utils.jl | 435 ++++++----- test/ctc-gpu.jl | 83 +- test/ctc.jl | 67 +- test/cuda/cuda.jl | 260 +++---- test/cuda/curnn.jl | 38 +- test/cuda/layers.jl | 436 +++++------ test/cuda/losses.jl | 69 +- test/cuda/runtests.jl | 8 +- test/cuda/test_utils.jl | 111 +-- test/data.jl | 56 +- test/layers/basic.jl | 700 ++++++++--------- test/layers/conv.jl | 462 +++++------ test/layers/normalisation.jl | 702 ++++++++--------- test/layers/recurrent.jl | 233 +++--- test/layers/show.jl | 104 ++- test/layers/stateless.jl | 16 +- test/layers/upsample.jl | 134 ++-- test/losses.jl | 271 +++---- test/optimise.jl | 362 +++++---- test/outputsize.jl | 446 +++++------ test/runtests.jl | 81 +- test/utils.jl | 1410 +++++++++++++++++----------------- 48 files changed, 5224 insertions(+), 4890 deletions(-) create mode 100644 .JuliaFormatter.toml diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 0000000000..e0bc3e19e9 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,9 @@ +style = "sciml" +whitespace_in_kwargs = true +format_docstrings = true +always_for_in = true +join_lines_based_on_source = true +separate_kwargs_with_semicolon = false +always_use_return = true +margin = 92 +indent = 4 diff --git a/docs/make.jl b/docs/make.jl index 40d6033637..a6c6659cde 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,62 +1,67 @@ -using Documenter, Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, ChainRulesCore - +using Documenter, Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, + ChainRulesCore DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive = true) -makedocs( - modules = [Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, ChainRulesCore, Base], - doctest = false, - sitename = "Flux", - # strict = [:cross_references,], - pages = [ - "Getting Started" => [ - "Welcome" => "index.md", - "Quick Start" => "models/quickstart.md", - "Fitting a Line" => "models/overview.md", - "Gradients and Layers" => "models/basics.md", - ], - "Building Models" => [ - "Built-in Layers 📚" => "models/layers.md", - "Recurrence" => "models/recurrence.md", - "Activation Functions 📚" => "models/activation.md", - "NNlib.jl 📚 (`softmax`, `conv`, ...)" => "models/nnlib.md", - ], - "Handling Data" => [ - "MLUtils.jl 📚 (`DataLoader`, ...)" => "data/mlutils.md", - "OneHotArrays.jl 📚 (`onehot`, ...)" => "data/onehot.md", - ], - "Training Models" => [ - "Training" => "training/training.md", - "Regularisation" => "models/regularisation.md", - "Loss Functions 📚" => "models/losses.md", - "Optimisation Rules 📚" => 
"training/optimisers.md", # TODO move optimiser intro up to Training - "Callback Helpers 📚" => "training/callbacks.md", - "Zygote.jl 📚 (`gradient`, ...)" => "training/zygote.md", - ], - "Model Tools" => [ - "GPU Support" => "gpu.md", - "Saving & Loading" => "saving.md", - "Shape Inference 📚" => "outputsize.md", - "Weight Initialisation 📚" => "utilities.md", - "Flat vs. Nested 📚" => "destructure.md", - "Functors.jl 📚 (`fmap`, ...)" => "models/functors.md", +makedocs(modules = [ + Flux, + NNlib, + Functors, + MLUtils, + BSON, + Optimisers, + OneHotArrays, + Zygote, + ChainRulesCore, + Base, ], - "Performance Tips" => "performance.md", - "Flux's Ecosystem" => "ecosystem.md", - "Tutorials" => [ # TODO, maybe - "Custom Layers" => "models/advanced.md", # TODO move freezing to Training + doctest = false, + sitename = "Flux", + # strict = [:cross_references,], + pages = [ + "Getting Started" => [ + "Welcome" => "index.md", + "Quick Start" => "models/quickstart.md", + "Fitting a Line" => "models/overview.md", + "Gradients and Layers" => "models/basics.md", + ], + "Building Models" => [ + "Built-in Layers 📚" => "models/layers.md", + "Recurrence" => "models/recurrence.md", + "Activation Functions 📚" => "models/activation.md", + "NNlib.jl 📚 (`softmax`, `conv`, ...)" => "models/nnlib.md", + ], + "Handling Data" => [ + "MLUtils.jl 📚 (`DataLoader`, ...)" => "data/mlutils.md", + "OneHotArrays.jl 📚 (`onehot`, ...)" => "data/onehot.md", + ], + "Training Models" => [ + "Training" => "training/training.md", + "Regularisation" => "models/regularisation.md", + "Loss Functions 📚" => "models/losses.md", + "Optimisation Rules 📚" => "training/optimisers.md", # TODO move optimiser intro up to Training + "Callback Helpers 📚" => "training/callbacks.md", + "Zygote.jl 📚 (`gradient`, ...)" => "training/zygote.md", + ], + "Model Tools" => [ + "GPU Support" => "gpu.md", + "Saving & Loading" => "saving.md", + "Shape Inference 📚" => "outputsize.md", + "Weight Initialisation 📚" => "utilities.md", + "Flat vs. 
Nested 📚" => "destructure.md", + "Functors.jl 📚 (`fmap`, ...)" => "models/functors.md", + ], + "Performance Tips" => "performance.md", + "Flux's Ecosystem" => "ecosystem.md", + "Tutorials" => [ # TODO, maybe + "Custom Layers" => "models/advanced.md", # TODO move freezing to Training + ], ], - ], - format = Documenter.HTML( - sidebar_sitename = false, - analytics = "UA-36890222-9", - assets = ["assets/flux.css"], - prettyurls = get(ENV, "CI", nothing) == "true" - ), -) + format = Documenter.HTML(sidebar_sitename = false, + analytics = "UA-36890222-9", + assets = ["assets/flux.css"], + prettyurls = get(ENV, "CI", nothing) == "true")) -deploydocs( - repo = "github.com/FluxML/Flux.jl.git", - target = "build", - push_preview = true -) +deploydocs(repo = "github.com/FluxML/Flux.jl.git", + target = "build", + push_preview = true) diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl index 525184f773..d7897851a4 100644 --- a/perf/bench_utils.jl +++ b/perf/bench_utils.jl @@ -3,36 +3,39 @@ using Flux using CUDA using Zygote: pullback, ignore - fw(m, x) = m(x) -bw(back) = back(1f0) +bw(back) = back(1.0f0) fwbw(m, ps, x) = gradient(() -> sum(fw(m, x)), ps) pb(m, ps, x) = pullback(() -> sum(fw(m, x)), ps) -function run_benchmark(model, x; cuda=true) - - if cuda +function run_benchmark(model, x; cuda = true) + if cuda model = model |> gpu x = x |> gpu end ps = Flux.params(model) - y, back = pb(model, ps, x) - + y, back = pb(model, ps, x) if cuda CUDA.allowscalar(false) # CUDA.device!(3) println(" forward") - fw(model, x); GC.gc(); CUDA.reclaim(); #warmup + fw(model, x) + GC.gc() + CUDA.reclaim() #warmup @btime CUDA.@sync(fw($model, $x)) teardown=(GC.gc(); CUDA.reclaim()) println(" backward") - bw(back); GC.gc(); CUDA.reclaim(); #warmup + bw(back) + GC.gc() + CUDA.reclaim() #warmup @btime CUDA.@sync(bw($back)) teardown=(GC.gc(); CUDA.reclaim()) - + println(" forw and back") - fwbw(model, ps, x); GC.gc(); CUDA.reclaim(); #warmup + fwbw(model, ps, x) + GC.gc() + CUDA.reclaim() #warmup @btime CUDA.@sync(fwbw($model, $ps, $x)) teardown=(GC.gc(); CUDA.reclaim()) else println(" forward") diff --git a/perf/conv.jl b/perf/conv.jl index 8da601e480..98dfcf46ce 100644 --- a/perf/conv.jl +++ b/perf/conv.jl @@ -1,8 +1,8 @@ for ch in [1, 3, 16, 64] x = rand(Float32, 64, 64, ch, 64) - model = Conv((3,3), ch=>ch) + model = Conv((3, 3), ch => ch) println("CPU ch=$ch") - run_benchmark(model, x, cuda=false) + run_benchmark(model, x, cuda = false) println("CUDA ch=$ch") - run_benchmark(model, x, cuda=true) + run_benchmark(model, x, cuda = true) end diff --git a/perf/dense.jl b/perf/dense.jl index 005d9360ba..1f77d21c55 100644 --- a/perf/dense.jl +++ b/perf/dense.jl @@ -2,7 +2,7 @@ for n in [2, 20, 200, 2000] x = randn(Float32, n, n) model = Dense(n, n) println("CPU n=$n") - run_benchmark(model, x, cuda=false) + run_benchmark(model, x, cuda = false) println("CUDA n=$n") - run_benchmark(model, x, cuda=true) + run_benchmark(model, x, cuda = true) end diff --git a/perf/recurrent.jl b/perf/recurrent.jl index ef00a8d9a5..ae8f68d0d6 100644 --- a/perf/recurrent.jl +++ b/perf/recurrent.jl @@ -1,62 +1,62 @@ - struct RNNWrapper{T} - rnn::T + rnn::T end Flux.@functor RNNWrapper # Need to specialize for RNNWrapper. 
fw(r::RNNWrapper, X::Vector{<:AbstractArray}) = begin - Flux.reset!(r.rnn) - [r.rnn(x) for x in X] + Flux.reset!(r.rnn) + [r.rnn(x) for x in X] end fw(r::RNNWrapper, X) = begin - Flux.reset!(r.rnn) - r.rnn(X) + Flux.reset!(r.rnn) + r.rnn(X) end -fwbw(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = gradient(ps) do - y = fw(r, X) - sum(sum(y)) -end +fwbw(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = + gradient(ps) do + y = fw(r, X) + return sum(sum(y)) + end -pb(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = pullback(ps) do - y = fw(r, X) - sum(sum(y)) -end +pb(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = + pullback(ps) do + y = fw(r, X) + return sum(sum(y)) + end function rnn_benchmark_sweep(data_creator::Function, rnn_type) - for n in [2, 20, 200, 1000], ts in [1, 4, 16, 64] - x, x_n = data_creator(n, ts) - model = RNNWrapper(rnn_type(n, n)) - - println("$rnn_type $x_n CPU n=$n, ts=$ts") - run_benchmark(model, x, cuda=false) - - println("$rnn_type $x_n CUDA n=$n, ts=$ts") - try - run_benchmark(model, x, cuda=true) - catch ex - @show typeof(ex) - if ex isa OutOfGPUMemoryError - @warn "Not enough GPU memory to run test" - else - rethrow(ex) - end + for n in [2, 20, 200, 1000], ts in [1, 4, 16, 64] + x, x_n = data_creator(n, ts) + model = RNNWrapper(rnn_type(n, n)) + + println("$rnn_type $x_n CPU n=$n, ts=$ts") + run_benchmark(model, x, cuda = false) + + println("$rnn_type $x_n CUDA n=$n, ts=$ts") + try + run_benchmark(model, x, cuda = true) + catch ex + @show typeof(ex) + if ex isa OutOfGPUMemoryError + @warn "Not enough GPU memory to run test" + else + rethrow(ex) + end + end end - end end for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM] - rnn_benchmark_sweep(rnn_type) do n, ts - [randn(Float32, n, n) for _ in 1:ts], "Vec" - end + rnn_benchmark_sweep(rnn_type) do n, ts + return [randn(Float32, n, n) for _ in 1:ts], "Vec" + end end for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM] - rnn_benchmark_sweep(rnn_type) do n, ts - randn(Float32, n, n, ts), "Block" - end + rnn_benchmark_sweep(rnn_type) do n, ts + return randn(Float32, n, n, ts), "Block" + end end - diff --git a/perf/vgg.jl b/perf/vgg.jl index 708c152c90..13ac8e8c77 100644 --- a/perf/vgg.jl +++ b/perf/vgg.jl @@ -6,50 +6,48 @@ using CUDA using Zygote: pullback function vgg16() - Chain( - Conv((3, 3), 3 => 64, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(64), - Conv((3, 3), 64 => 64, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(64), - MaxPool((2,2)), - Conv((3, 3), 64 => 128, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(128), - Conv((3, 3), 128 => 128, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(128), - MaxPool((2,2)), - Conv((3, 3), 128 => 256, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(256), - Conv((3, 3), 256 => 256, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(256), - Conv((3, 3), 256 => 256, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(256), - MaxPool((2,2)), - Conv((3, 3), 256 => 512, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(512), - MaxPool((2,2)), - Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(512), - MaxPool((2,2)), - flatten, - Dense(512, 4096, relu), - Dropout(0.5), - Dense(4096, 4096, relu), - Dropout(0.5), - Dense(4096, 10) - ) + return Chain(Conv((3, 3), 3 => 64, 
relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(64), + Conv((3, 3), 64 => 64, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(64), + MaxPool((2, 2)), + Conv((3, 3), 64 => 128, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(128), + Conv((3, 3), 128 => 128, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(128), + MaxPool((2, 2)), + Conv((3, 3), 128 => 256, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(256), + Conv((3, 3), 256 => 256, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(256), + Conv((3, 3), 256 => 256, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(256), + MaxPool((2, 2)), + Conv((3, 3), 256 => 512, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + MaxPool((2, 2)), + Conv((3, 3), 512 => 512, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + MaxPool((2, 2)), + flatten, + Dense(512, 4096, relu), + Dropout(0.5), + Dense(4096, 4096, relu), + Dropout(0.5), + Dense(4096, 10)) end -let model=vgg16(), x=rand(Float32, 32, 32, 3, 64) +let model = vgg16(), x = rand(Float32, 32, 32, 3, 64) println("CPU benchmark") - run_benchmark(model, x, cuda=false) + run_benchmark(model, x, cuda = false) println("CUDA benchmark") - run_benchmark(model, x, cuda=true) + run_benchmark(model, x, cuda = true) end diff --git a/src/Flux.jl b/src/Flux.jl index fcb473ba2c..251c472fd2 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -14,7 +14,9 @@ using Zygote: Params, @adjoint, gradient, pullback, @nograd export gradient # Pirate error to catch a common mistake. (Internal function `base` because overloading `update!` is more likely to give ambiguities.) -Optimisers.base(dx::Zygote.Grads) = error("Optimisers.jl cannot be used with Zygote.jl's implicit gradients, `Params` & `Grads`") +function Optimisers.base(dx::Zygote.Grads) + return error("Optimisers.jl cannot be used with Zygote.jl's implicit gradients, `Params` & `Grads`") +end export Chain, Dense, Embedding, Maxout, SkipConnection, Parallel, PairwiseFusion, RNN, LSTM, GRU, GRUv3, @@ -30,19 +32,21 @@ using .Optimise using .Optimise: @epochs using .Optimise: skip export Descent, Adam, Momentum, Nesterov, RMSProp, - AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, OAdam, - AdamW, RAdam, AdaBelief, InvDecay, ExpDecay, - WeightDecay, ClipValue, ClipNorm + AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, OAdam, + AdamW, RAdam, AdaBelief, InvDecay, ExpDecay, + WeightDecay, ClipValue, ClipNorm using CUDA -const use_cuda = Ref{Union{Nothing,Bool}}(nothing) +const use_cuda = Ref{Union{Nothing, Bool}}(nothing) using Adapt, Functors, OneHotArrays include("utils.jl") include("functor.jl") # Pirate error to catch a common mistake. 
-Functors.functor(::Type{<:MLUtils.DataLoader}, x) = error("`DataLoader` does not support Functors.jl, thus functions like `Flux.gpu` will not act on its contents.") +function Functors.functor(::Type{<:MLUtils.DataLoader}, x) + return error("`DataLoader` does not support Functors.jl, thus functions like `Flux.gpu` will not act on its contents.") +end include("layers/stateless.jl") include("layers/basic.jl") @@ -60,7 +64,6 @@ export @autosize include("data/Data.jl") using .Data - include("losses/Losses.jl") using .Losses # TODO: stop importing Losses in Flux's namespace in v0.12 diff --git a/src/cuda/cudnn.jl b/src/cuda/cudnn.jl index 9e6bdb53a0..c20a7f873c 100644 --- a/src/cuda/cudnn.jl +++ b/src/cuda/cudnn.jl @@ -1,21 +1,21 @@ import NNlibCUDA: batchnorm, ∇batchnorm -function (BN::Flux.BatchNorm)(x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}}, - cache=nothing) where T<:Union{Float32, Float64} - - @assert BN.affine "BatchNorm: only affine=true supported on gpu" - @assert BN.track_stats "BatchNorm: only track_stats=true supported on gpu" - @assert length(BN.β) == size(x, ndims(x)-1) "BatchNorm: input has wrong number of channels" - return BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; - cache=cache, alpha=1, beta=0, eps=BN.ϵ, - training=Flux._isactive(BN))) +function (BN::Flux.BatchNorm)(x::Union{CuArray{T, 2}, CuArray{T, 4}, CuArray{T, 5}}, + cache = nothing) where {T <: Union{Float32, Float64}} + @assert BN.affine "BatchNorm: only affine=true supported on gpu" + @assert BN.track_stats "BatchNorm: only track_stats=true supported on gpu" + @assert length(BN.β)==size(x, ndims(x) - 1) "BatchNorm: input has wrong number of channels" + return BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; + cache = cache, alpha = 1, beta = 0, eps = BN.ϵ, + training = Flux._isactive(BN))) end -function ChainRulesCore.rrule(::typeof(batchnorm), g, b, x, running_mean, running_var, momentum; kw...) - y = batchnorm(g, b, x, running_mean, running_var, momentum; kw...) - function batchnorm_pullback(Δ) - grad = ∇batchnorm(g, b, x, unthunk(Δ), running_mean, running_var, momentum; kw...) - (NoTangent(), grad..., NoTangent(), NoTangent(), NoTangent()) - end - y, batchnorm_pullback +function ChainRulesCore.rrule(::typeof(batchnorm), g, b, x, running_mean, running_var, + momentum; kw...) + y = batchnorm(g, b, x, running_mean, running_var, momentum; kw...) + function batchnorm_pullback(Δ) + grad = ∇batchnorm(g, b, x, unthunk(Δ), running_mean, running_var, momentum; kw...) + return (NoTangent(), grad..., NoTangent(), NoTangent(), NoTangent()) + end + return y, batchnorm_pullback end diff --git a/src/deprecations.jl b/src/deprecations.jl index 8c3bc963a4..acb87b9625 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -1,69 +1,80 @@ # v0.12 deprecations function ones(dims...) - Base.depwarn("Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)", :ones, force=true) - Base.ones(Float32, dims...) + Base.depwarn("Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)", + :ones, force = true) + return Base.ones(Float32, dims...) end ones(T::Type, dims...) = Base.ones(T, dims...) function zeros(dims...) - Base.depwarn("Flux.zeros(size...) is deprecated, please use Flux.zeros32(size...) or Base.zeros(Float32, size...)", :zeros, force=true) - Base.zeros(Float32, dims...) + Base.depwarn("Flux.zeros(size...) is deprecated, please use Flux.zeros32(size...) 
or Base.zeros(Float32, size...)", + :zeros, force = true) + return Base.zeros(Float32, dims...) end zeros(T::Type, dims...) = Base.zeros(T, dims...) -ones32(::Type, dims...) = throw(ArgumentError("Flux.ones32 is always Float32, use Base.ones to specify the element type")) -zeros32(::Type, dims...) = throw(ArgumentError("Flux.zeros32 is always Float32, use Base.zeros to specify the element type")) +function ones32(::Type, dims...) + throw(ArgumentError("Flux.ones32 is always Float32, use Base.ones to specify the element type")) +end +function zeros32(::Type, dims...) + throw(ArgumentError("Flux.zeros32 is always Float32, use Base.zeros to specify the element type")) +end # v0.13 deprecations function Broadcast.broadcasted(f::Recur, args...) - # This had an explicit @adjoint rule, calling Zygote.∇map(__context__, f, args...), until v0.12 - Base.depwarn("""Broadcasting is not safe to use with RNNs, as it does not guarantee an iteration order. - Re-writing this as a comprehension would be better.""", :broadcasted) - map(f, args...) # map isn't really safe either, but + # This had an explicit @adjoint rule, calling Zygote.∇map(__context__, f, args...), until v0.12 + Base.depwarn("""Broadcasting is not safe to use with RNNs, as it does not guarantee an iteration order. + Re-writing this as a comprehension would be better.""", :broadcasted) + return map(f, args...) # map isn't really safe either, but end @deprecate frequencies(xs) group_counts(xs) struct Zeros - function Zeros() - Base.depwarn("Flux.Zeros is no more, has ceased to be, is bereft of life, is an ex-boondoggle... please use bias=false instead", :Zeros) - false - end + function Zeros() + Base.depwarn("Flux.Zeros is no more, has ceased to be, is bereft of life, is an ex-boondoggle... please use bias=false instead", + :Zeros) + return false + end end Zeros(args...) = Zeros() # was used both Dense(10, 2, initb = Zeros) and Dense(rand(2,10), Zeros()) function Optimise.update!(x::AbstractArray, x̄) - Base.depwarn("`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.", :update!) - x .-= x̄ + Base.depwarn("`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.", + :update!) + return x .-= x̄ end function Diagonal(size::Integer...; kw...) - Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", :Diagonal) - Scale(size...; kw...) + Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", + :Diagonal) + return Scale(size...; kw...) end function Diagonal(size::Tuple; kw...) - Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", :Diagonal) - Scale(size...; kw...) + Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", + :Diagonal) + return Scale(size...; kw...) end # Deprecate this eventually once saving models w/o structure is no more function loadparams!(m, xs) - Base.depwarn("loadparams! will be deprecated eventually. Use loadmodel! instead.", :loadparams!) - for (p, x) in zip(params(m), xs) - size(p) == size(x) || - error("Expected param size $(size(p)), got $(size(x))") - copyto!(p, x) - end + Base.depwarn("loadparams! will be deprecated eventually. Use loadmodel! instead.", + :loadparams!) 
+ for (p, x) in zip(params(m), xs) + size(p) == size(x) || + error("Expected param size $(size(p)), got $(size(x))") + copyto!(p, x) + end end # Channel notation: Changed to match Conv, but very softly deprecated! # Perhaps change to @deprecate for v0.14, but there is no plan to remove these. -Dense(in::Integer, out::Integer, σ = identity; kw...) = - Dense(in => out, σ; kw...) -Bilinear(in1::Integer, in2::Integer, out::Integer, σ = identity; kw...) = - Bilinear((in1, in2) => out, σ; kw...) +Dense(in::Integer, out::Integer, σ = identity; kw...) = Dense(in => out, σ; kw...) +function Bilinear(in1::Integer, in2::Integer, out::Integer, σ = identity; kw...) + return Bilinear((in1, in2) => out, σ; kw...) +end Embedding(in::Integer, out::Integer; kw...) = Embedding(in => out; kw...) RNNCell(in::Integer, out::Integer, σ = tanh; kw...) = RNNCell(in => out, σ; kw...) diff --git a/src/functor.jl b/src/functor.jl index 13adbe13ff..993ea95693 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -1,5 +1,5 @@ import Adapt: adapt, adapt_storage -using LinearAlgebra: Cholesky +using LinearAlgebra: Cholesky using Zygote: IdSet import Functors: Functors, @functor, functor, fmap, isleaf using SparseArrays: AbstractSparseArray @@ -14,9 +14,10 @@ _Note_: if you manually set a model into test mode, you need to manually place it back into train mode during training phase. Possible values include: -- `false` for training -- `true` for testing -- `:auto` or `nothing` for Flux to detect the mode automatically + + - `false` for training + - `true` for testing + - `:auto` or `nothing` for Flux to detect the mode automatically """ testmode!(m, mode = true) = (foreach(x -> testmode!(x, mode), trainable(m)); m) @@ -30,23 +31,24 @@ _Note_: if you manually set a model into train mode, you need to manually place it into test mode during testing phase. Possible values include: -- `true` for training -- `false` for testing -- `:auto` or `nothing` for Flux to detect the mode automatically + + - `true` for training + - `false` for testing + - `:auto` or `nothing` for Flux to detect the mode automatically """ trainmode!(m, mode = true) = mode isa Bool ? testmode!(m, !mode) : testmode!(m, mode) function params!(p::Params, x, seen = IdSet()) - if x isa AbstractArray{<:Number} && Functors.isleaf(x) - return push!(p, x) - elseif x in seen - nothing - else - push!(seen, x) - for child in trainable(x) - params!(p, child, seen) + if x isa AbstractArray{<:Number} && Functors.isleaf(x) + return push!(p, x) + elseif x in seen + nothing + else + push!(seen, x) + for child in trainable(x) + params!(p, child, seen) + end end - end end """ @@ -60,10 +62,11 @@ This can be used with the `gradient` function, see [Taking Gradients](@ref), or The behaviour of `params` on custom types can be customized using [`Functors.@functor`](@ref) or [`Flux.trainable`](@ref). # Examples + ```jldoctest julia> using Flux: params -julia> params(Chain(Dense(ones(2,3)), softmax)) # unpacks Flux models +julia> params(Chain(Dense(ones(2, 3)), softmax)) # unpacks Flux models Params([[1.0 1.0 1.0; 1.0 1.0 1.0], [0.0, 0.0]]) julia> bn = BatchNorm(2, relu) @@ -78,14 +81,14 @@ Params([[1, 2, 3], [4]]) julia> params([[1, 2, 3], [4]]) # unpacks array of arrays Params([[1, 2, 3], [4]]) -julia> params(1, [2 2], (alpha=[3,3,3], beta=Ref(4), gamma=sin)) # ignores scalars, unpacks NamedTuples +julia> params(1, [2 2], (alpha = [3, 3, 3], beta = Ref(4), gamma = sin)) # ignores scalars, unpacks NamedTuples Params([[2 2], [3, 3, 3]]) ``` """ function params(m...) 
- ps = Params() - params!(ps, m) - return ps + ps = Params() + params!(ps, m) + return ps end # Allows caching of the parameters when params is called within gradient() to fix #2040. @@ -95,13 +98,14 @@ struct FluxCUDAAdaptor end adapt_storage(to::FluxCUDAAdaptor, x) = CUDA.cu(x) adapt_storage(to::FluxCUDAAdaptor, x::Zygote.FillArrays.AbstractFill) = CUDA.cu(collect(x)) if VERSION >= v"1.7" - adapt_storage(to::FluxCUDAAdaptor, x::Random.TaskLocalRNG) = CUDA.default_rng() + adapt_storage(to::FluxCUDAAdaptor, x::Random.TaskLocalRNG) = CUDA.default_rng() else - adapt_storage(to::FluxCUDAAdaptor, x::Random._GLOBAL_RNG) = CUDA.default_rng() + adapt_storage(to::FluxCUDAAdaptor, x::Random._GLOBAL_RNG) = CUDA.default_rng() end adapt_storage(to::FluxCUDAAdaptor, x::CUDA.RNG) = x -adapt_storage(to::FluxCUDAAdaptor, x::AbstractRNG) = - error("Cannot map RNG of type $(typeof(x)) to GPU. GPU execution only supports Random.default_rng().") +function adapt_storage(to::FluxCUDAAdaptor, x::AbstractRNG) + return error("Cannot map RNG of type $(typeof(x)) to GPU. GPU execution only supports Random.default_rng().") +end # TODO: figure out the correct design for OneElement adapt_storage(to::FluxCUDAAdaptor, x::Zygote.OneElement) = CUDA.cu(collect(x)) @@ -112,18 +116,23 @@ struct FluxCPUAdaptor end adapt_storage(to::FluxCPUAdaptor, x::AbstractArray) = adapt(Array, x) adapt_storage(to::FluxCPUAdaptor, x::AbstractRange) = x adapt_storage(to::FluxCPUAdaptor, x::Zygote.FillArrays.AbstractFill) = x -adapt_storage(to::FluxCPUAdaptor, x::T) where T <: CUDA.CUSPARSE.CUDA.CUSPARSE.AbstractCuSparseMatrix = adapt(Array, x) +function adapt_storage(to::FluxCPUAdaptor, + x::T) where {T <: CUDA.CUSPARSE.CUDA.CUSPARSE.AbstractCuSparseMatrix} + return adapt(Array, x) +end adapt_storage(to::FluxCPUAdaptor, x::Zygote.OneElement) = x adapt_storage(to::FluxCPUAdaptor, x::AbstractSparseArray) = x adapt_storage(to::FluxCPUAdaptor, x::CUDA.RNG) = Random.default_rng() adapt_storage(to::FluxCPUAdaptor, x::AbstractRNG) = x function ChainRulesCore.rrule(::Type{Array}, x::CUDA.CuArray) - Array(x), dx -> (NoTangent(), CUDA.cu(unthunk(dx)),) + return Array(x), dx -> (NoTangent(), CUDA.cu(unthunk(dx))) end -function ChainRulesCore.rrule(::typeof(Adapt.adapt_storage), to::FluxCPUAdaptor, x::CUDA.AbstractGPUArray) - adapt_storage(to, x), dx -> (NoTangent(), NoTangent(), adapt_storage(FluxCUDAAdaptor(), unthunk(dx)),) +function ChainRulesCore.rrule(::typeof(Adapt.adapt_storage), to::FluxCPUAdaptor, + x::CUDA.AbstractGPUArray) + return adapt_storage(to, x), + dx -> (NoTangent(), NoTangent(), adapt_storage(FluxCUDAAdaptor(), unthunk(dx))) end # CPU/GPU movement conveniences @@ -135,7 +144,7 @@ Moves `m` onto the CPU, the opposite of [`gpu`](@ref). Recurses into structs marked [`@functor`](@ref). ```julia-repl -julia> m = Dense(1,2) +julia> m = Dense(1, 2) Dense(1, 2) julia> m_gpu = gpu(m) @@ -154,7 +163,7 @@ Matrix{Float32} cpu(x) = fmap(x -> adapt(FluxCPUAdaptor(), x), x) _isbitsarray(::AbstractArray{<:Number}) = true -_isbitsarray(::AbstractArray{T}) where T = isbitstype(T) +_isbitsarray(::AbstractArray{T}) where {T} = isbitstype(T) _isbitsarray(x) = false _isleaf(::AbstractRNG) = true @@ -164,13 +173,13 @@ _isleaf(x) = _isbitsarray(x) || Functors.isleaf(x) gpu(x) Moves `m` to the current GPU device, if available. It is a no-op otherwise. -See the [CUDA.jl docs](https://juliagpu.github.io/CUDA.jl/stable/usage/multigpu/) +See the [CUDA.jl docs](https://juliagpu.github.io/CUDA.jl/stable/usage/multigpu/) to help identify the current device. 
This works for functions, and any struct marked with [`@functor`](@ref). ```julia-repl -julia> m = Dense(1,2) +julia> m = Dense(1, 2) Dense(1, 2) julia> typeof(m.W) @@ -184,21 +193,22 @@ CuArray{Float32, 2} ``` """ function gpu(x) - check_use_cuda() - use_cuda[] ? fmap(x -> Adapt.adapt(FluxCUDAAdaptor(), x), x; exclude = _isleaf) : x + check_use_cuda() + return use_cuda[] ? fmap(x -> Adapt.adapt(FluxCUDAAdaptor(), x), x; exclude = _isleaf) : + x end function check_use_cuda() - if use_cuda[] === nothing - use_cuda[] = CUDA.functional() - if use_cuda[] && !CUDA.has_cudnn() - @warn "CUDA.jl found cuda, but did not find libcudnn. Some functionality will not be available." - end - if !(use_cuda[]) - @info """The GPU function is being called but the GPU is not accessible. - Defaulting back to the CPU. (No action is required if you want to run on the CPU).""" maxlog=1 + if use_cuda[] === nothing + use_cuda[] = CUDA.functional() + if use_cuda[] && !CUDA.has_cudnn() + @warn "CUDA.jl found cuda, but did not find libcudnn. Some functionality will not be available." + end + if !(use_cuda[]) + @info """The GPU function is being called but the GPU is not accessible. + Defaulting back to the CPU. (No action is required if you want to run on the CPU).""" maxlog=1 + end end - end end ChainRulesCore.@non_differentiable check_use_cuda() @@ -227,4 +237,3 @@ f64(m) = paramtype(Float64, m) # Functors for certain Julia data structures @functor Cholesky trainable(c::Cholesky) = () - diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 2a3bc9131c..ae94beffb7 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -9,7 +9,7 @@ and if names are given, `m[:name] == m[1]` etc. # Examples ```jldoctest -julia> m = Chain(x -> x^2, x -> x+1); +julia> m = Chain(x -> x^2, x -> x + 1); julia> m(5) == 26 true @@ -21,7 +21,7 @@ julia> x = rand(10, 32); julia> m(x) == m[2](m[1](x)) true -julia> m2 = Chain(enc = Chain(Flux.flatten, Dense(10 => 5, tanh)), +julia> m2 = Chain(enc = Chain(Flux.flatten, Dense(10 => 5, tanh)), dec = Dense(5 => 2)); julia> m2(x) == (m2[:dec] ∘ m2[:enc])(x) @@ -32,51 +32,57 @@ For large models, there is a special type-unstable path which can reduce compila times. This can be used by supplying a vector of layers `Chain([layer1, layer2, ...])`. This feature is somewhat experimental, beware! """ -struct Chain{T<:Union{Tuple, NamedTuple, AbstractVector}} - layers::T +struct Chain{T <: Union{Tuple, NamedTuple, AbstractVector}} + layers::T end Chain(xs...) = Chain(xs) function Chain(; kw...) - :layers in keys(kw) && throw(ArgumentError("a Chain cannot have a named layer called `layers`")) - isempty(kw) && return Chain(()) - Chain(values(kw)) + :layers in keys(kw) && + throw(ArgumentError("a Chain cannot have a named layer called `layers`")) + isempty(kw) && return Chain(()) + return Chain(values(kw)) end @forward Chain.layers Base.getindex, Base.length, Base.first, Base.last, - Base.iterate, Base.lastindex, Base.keys, Base.firstindex + Base.iterate, Base.lastindex, Base.keys, Base.firstindex @functor Chain (c::Chain)(x) = _applychain(c.layers, x) -@generated function _applychain(layers::Tuple{Vararg{<:Any,N}}, x) where {N} - symbols = vcat(:x, [gensym() for _ in 1:N]) - calls = [:($(symbols[i+1]) = layers[$i]($(symbols[i]))) for i in 1:N] - Expr(:block, calls...) +@generated function _applychain(layers::Tuple{Vararg{<:Any, N}}, x) where {N} + symbols = vcat(:x, [gensym() for _ in 1:N]) + calls = [:($(symbols[i + 1]) = layers[$i]($(symbols[i]))) for i in 1:N] + return Expr(:block, calls...) 
end _applychain(layers::NamedTuple, x) = _applychain(Tuple(layers), x) function _applychain(layers::AbstractVector, x) # type-unstable path, helps compile times - for f in layers - x = f(x) - end - x + for f in layers + x = f(x) + end + return x end Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]) -Base.getindex(c::Chain{<:NamedTuple}, i::AbstractArray) = - Chain(NamedTuple{keys(c)[i]}(Tuple(c.layers)[i])) +function Base.getindex(c::Chain{<:NamedTuple}, i::AbstractArray) + return Chain(NamedTuple{keys(c)[i]}(Tuple(c.layers)[i])) +end function Base.show(io::IO, c::Chain) - print(io, "Chain(") - _show_layers(io, c.layers) - print(io, ")") + print(io, "Chain(") + _show_layers(io, c.layers) + return print(io, ")") end _show_layers(io, layers::Tuple) = join(io, layers, ", ") -_show_layers(io, layers::NamedTuple) = join(io, ["$k = $v" for (k, v) in pairs(layers)], ", ") -_show_layers(io, layers::AbstractVector) = (print(io, "["); join(io, layers, ", "); print(io, "]")) +function _show_layers(io, layers::NamedTuple) + return join(io, ["$k = $v" for (k, v) in pairs(layers)], ", ") +end +function _show_layers(io, layers::AbstractVector) + return (print(io, "["); join(io, layers, ", "); print(io, "]")) +end # This is a temporary and naive implementation # it might be replaced in the future for better performance @@ -93,7 +99,7 @@ Like calling a `Chain`, but saves the result of each layer as an output. ```jldoctest julia> using Flux: activations -julia> c = Chain(x -> x + 1, x -> x * 2, x -> x ^ 3); +julia> c = Chain(x -> x + 1, x -> x * 2, x -> x^3); julia> activations(c, 1) (2, 4, 64) @@ -103,12 +109,11 @@ activations(c::Chain, input) = _extraChain(Tuple(c.layers), input) # Calculates the forward results of each layer provided in a `Tuple` with `x` as model input. function _extraChain(fs::Tuple, x) - res = first(fs)(x) - return (res, _extraChain(Base.tail(fs), res)...) + res = first(fs)(x) + return (res, _extraChain(Base.tail(fs), res)...) end _extraChain(::Tuple{}, x) = () - """ Dense(in => out, σ=identity; bias=true, init=glorot_uniform) Dense(W::AbstractMatrix, [bias, σ]) @@ -128,6 +133,7 @@ given to keyword `init`, with default [`glorot_uniform`](@ref Flux.glorot_unifor The weight matrix and/or the bias vector (of length `out`) may also be provided explicitly. # Examples + ```jldoctest julia> d = Dense(5 => 2) Dense(5 => 2) # 12 parameters @@ -150,36 +156,35 @@ julia> Flux.params(d1) # no trainable bias Params([[1.0 1.0 … 1.0 1.0; 1.0 1.0 … 1.0 1.0]]) ``` """ -struct Dense{F, M<:AbstractMatrix, B} - weight::M - bias::B - σ::F - function Dense(W::M, bias = true, σ::F = identity) where {M<:AbstractMatrix, F} - b = create_bias(W, bias, size(W,1)) - new{F,M,typeof(b)}(W, b, σ) - end +struct Dense{F, M <: AbstractMatrix, B} + weight::M + bias::B + σ::F + function Dense(W::M, bias = true, σ::F = identity) where {M <: AbstractMatrix, F} + b = create_bias(W, bias, size(W, 1)) + return new{F, M, typeof(b)}(W, b, σ) + end end function Dense((in, out)::Pair{<:Integer, <:Integer}, σ = identity; init = glorot_uniform, bias = true) - Dense(init(out, in), bias, σ) + return Dense(init(out, in), bias, σ) end @functor Dense function (a::Dense)(x::AbstractVecOrMat) - σ = NNlib.fast_act(a.σ, x) # replaces tanh => tanh_fast, etc - return σ.(a.weight * x .+ a.bias) + σ = NNlib.fast_act(a.σ, x) # replaces tanh => tanh_fast, etc + return σ.(a.weight * x .+ a.bias) end -(a::Dense)(x::AbstractArray) = - reshape(a(reshape(x, size(x,1), :)), :, size(x)[2:end]...) 
+(a::Dense)(x::AbstractArray) = reshape(a(reshape(x, size(x, 1), :)), :, size(x)[2:end]...) function Base.show(io::IO, l::Dense) - print(io, "Dense(", size(l.weight, 2), " => ", size(l.weight, 1)) - l.σ == identity || print(io, ", ", l.σ) - l.bias == false && print(io, "; bias=false") - print(io, ")") + print(io, "Dense(", size(l.weight, 2), " => ", size(l.weight, 1)) + l.σ == identity || print(io, ", ", l.σ) + l.bias == false && print(io, "; bias=false") + return print(io, ")") end """ @@ -191,14 +196,15 @@ Create an element-wise layer, whose forward pass is given by: y = σ.(scale .* x .+ bias) This uses `.*` instead of matrix multiplication `*` of [`Dense`](@ref). - + The learnable scale & bias are initialised `init(size...)` and `zeros32(size...)`, -with `init=ones32` by default. You may specify the function `init`, +with `init=ones32` by default. You may specify the function `init`, turn off trainable bias with `bias=false`, or provide the array(s) explicitly. Used by [`LayerNorm`](@ref) with `affine=true`. # Examples + ```jldoctest julia> a = Flux.Scale(2) Scale(2) # 4 parameters @@ -223,31 +229,37 @@ julia> Flux.params(b) Params([[1 2 3 4]]) ``` """ -struct Scale{F, A<:AbstractArray, B} - scale::A - bias::B - σ::F - function Scale(scale::A, bias::B = true, σ::F = identity) where {A<:AbstractArray, B<:Union{Bool, AbstractArray}, F} - b = create_bias(scale, bias, size(scale)...) - new{F, A, typeof(b)}(scale, b, σ) - end +struct Scale{F, A <: AbstractArray, B} + scale::A + bias::B + σ::F + function Scale(scale::A, bias::B = true, + σ::F = identity) where {A <: AbstractArray, + B <: Union{Bool, AbstractArray}, F} + b = create_bias(scale, bias, size(scale)...) + return new{F, A, typeof(b)}(scale, b, σ) + end end -Scale(s1::Integer, s23::Integer...; bias = true, init = ones32, _act = identity) = Scale(init(s1, s23...), bias, _act) -Scale(size_act...; bias = true, init = ones32) = Scale(size_act[1:end-1]...; bias, init, _act = size_act[end]) +function Scale(s1::Integer, s23::Integer...; bias = true, init = ones32, _act = identity) + return Scale(init(s1, s23...), bias, _act) +end +function Scale(size_act...; bias = true, init = ones32) + return Scale(size_act[1:(end - 1)]...; bias, init, _act = size_act[end]) +end @functor Scale function (a::Scale)(x::AbstractArray) - σ = NNlib.fast_act(a.σ, x) # replaces tanh => tanh_fast, etc - σ.(a.scale .* x .+ a.bias) + σ = NNlib.fast_act(a.σ, x) # replaces tanh => tanh_fast, etc + return σ.(a.scale .* x .+ a.bias) end function Base.show(io::IO, l::Scale) - print(io, "Scale(", join(size(l.scale), ", ")) - l.σ == identity || print(io, ", ", l.σ) - l.bias == false && print(io, "; bias=false") - print(io, ")") + print(io, "Scale(", join(size(l.scale), ", ")) + l.σ == identity || print(io, ", ", l.σ) + l.bias == false && print(io, "; bias=false") + return print(io, ")") end """ @@ -261,12 +273,13 @@ Instead of defining layers individually, you can provide a zero-argument functio which constructs them, and the number to construct. Maxout over linear dense layers satisfies the univeral approximation theorem. -See Goodfellow, Warde-Farley, Mirza, Courville & Bengio "Maxout Networks" +See Goodfellow, Warde-Farley, Mirza, Courville & Bengio "Maxout Networks" [https://arxiv.org/abs/1302.4389](https://arxiv.org/abs/1302.4389). See also [`Parallel`](@ref) to reduce with other operators. 
# Examples + ```jldoctest julia> m = Maxout(x -> abs2.(x), x -> x .* 3); @@ -285,8 +298,8 @@ julia> Flux.outputsize(m3, (5, 11)) (7, 11) ``` """ -struct Maxout{T<:Tuple} - layers::T +struct Maxout{T <: Tuple} + layers::T end Maxout(layers...) = Maxout(layers) Maxout(f::Function, n_alts::Integer) = Maxout((f() for _ in 1:n_alts)...) @@ -294,18 +307,17 @@ Maxout(f::Function, n_alts::Integer) = Maxout((f() for _ in 1:n_alts)...) @functor Maxout function (mo::Maxout)(input::AbstractArray) - # Perhaps surprisingly, pairwise max broadcast is often faster, - # even with Zygote. See #698 and #1794 - mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.layers) + # Perhaps surprisingly, pairwise max broadcast is often faster, + # even with Zygote. See #698 and #1794 + return mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.layers) end function Base.show(io::IO, mo::Maxout) - print(io, "Maxout(") - _show_layers(io, mo.layers) - print(io, ")") + print(io, "Maxout(") + _show_layers(io, mo.layers) + return print(io, ")") end - """ SkipConnection(layer, connection) @@ -317,15 +329,16 @@ will be propagated through the given `layer` while the second is the unchanged, The simplest "ResNet"-type connection is just `SkipConnection(layer, +)`. Here is a more complicated example: + ```jldoctest -julia> m = Conv((3,3), 4 => 7, pad=(1,1)); +julia> m = Conv((3, 3), 4 => 7, pad = (1, 1)); julia> x = ones(Float32, 5, 5, 4, 10); julia> size(m(x)) == (5, 5, 7, 10) true -julia> sm = SkipConnection(m, (mx, x) -> cat(mx, x, dims=3)); +julia> sm = SkipConnection(m, (mx, x) -> cat(mx, x, dims = 3)); julia> size(sm(x)) == (5, 5, 11, 10) true @@ -333,19 +346,19 @@ true See also [`Parallel`](@ref), [`Maxout`](@ref). """ -struct SkipConnection{T,F} - layers::T - connection::F #user can pass arbitrary connections here, such as (a,b) -> a + b +struct SkipConnection{T, F} + layers::T + connection::F #user can pass arbitrary connections here, such as (a,b) -> a + b end @functor SkipConnection function (skip::SkipConnection)(input) - skip.connection(skip.layers(input), input) + return skip.connection(skip.layers(input), input) end function Base.show(io::IO, b::SkipConnection) - print(io, "SkipConnection(", b.layers, ", ", b.connection, ")") + return print(io, "SkipConnection(", b.layers, ", ", b.connection, ")") end """ @@ -373,6 +386,7 @@ By default the bias vector is `zeros(Float32, out)`, option `bias=false` will sw trainable bias. Either of these may be provided explicitly. 
# Examples + ```jldoctest julia> x, y = randn(Float32, 5, 32), randn(Float32, 5, 32); @@ -382,71 +396,76 @@ Bilinear(5 => 7) # 182 parameters julia> B(x) |> size # interactions based on one input (7, 32) -julia> B(x,y) == B((x,y)) # two inputs, may be given as a tuple +julia> B(x, y) == B((x, y)) # two inputs, may be given as a tuple true -julia> sc = SkipConnection( - Chain(Dense(5 => 20, tanh), Dense(20 => 9, tanh)), - Flux.Bilinear((9, 5) => 3, bias=false), - ); # used as the recombinator, with skip as the second input +julia> sc = SkipConnection(Chain(Dense(5 => 20, tanh), Dense(20 => 9, tanh)), + Flux.Bilinear((9, 5) => 3, bias = false)); # used as the recombinator, with skip as the second input julia> sc(x) |> size (3, 32) -julia> Flux.Bilinear(rand(4,8,16), false, tanh) # first dim of weight is the output +julia> Flux.Bilinear(rand(4, 8, 16), false, tanh) # first dim of weight is the output Bilinear((8, 16) => 4, tanh; bias=false) # 512 parameters ``` """ -struct Bilinear{F,A,B} - weight::A - bias::B - σ::F - function Bilinear(W::A, bias = true, σ::F = identity) where {A<:AbstractArray, F} - ndims(A) == 3 || throw(ArgumentError("expected a 3-array of weights")) - b = create_bias(W, bias, size(W,1)) - new{F,A,typeof(b)}(W, b, σ) - end +struct Bilinear{F, A, B} + weight::A + bias::B + σ::F + function Bilinear(W::A, bias = true, σ::F = identity) where {A <: AbstractArray, F} + ndims(A) == 3 || throw(ArgumentError("expected a 3-array of weights")) + b = create_bias(W, bias, size(W, 1)) + return new{F, A, typeof(b)}(W, b, σ) + end end @functor Bilinear function Bilinear(((in1, in2), out)::Pair{<:Tuple, <:Integer}, σ = identity; bias = true, init = glorot_uniform) - Bilinear(init(out, in1, in2), bias, σ) + return Bilinear(init(out, in1, in2), bias, σ) +end +function Bilinear((in12, out)::Pair{<:Integer, <:Integer}, σ = identity; kw...) + return Bilinear((in12, in12) => out, σ; kw...) end -Bilinear((in12, out)::Pair{<:Integer, <:Integer}, σ = identity; kw...) = Bilinear((in12, in12) => out, σ; kw...) 
function (a::Bilinear)(x::AbstractMatrix, y::AbstractMatrix) - W, b, σ = a.weight, a.bias, a.σ + W, b, σ = a.weight, a.bias, a.σ - d_z, d_x, d_y = size(W) - d_x == size(x,1) && d_y == size(y,1) || throw(DimensionMismatch("number of rows in data must match W")) - size(x,2) == size(y,2) || throw(DimensionMismatch("Data inputs must agree on number of columns, got $(size(x,2)) and $(size(y,2))")) + d_z, d_x, d_y = size(W) + d_x == size(x, 1) && d_y == size(y, 1) || + throw(DimensionMismatch("number of rows in data must match W")) + size(x, 2) == size(y, 2) || + throw(DimensionMismatch("Data inputs must agree on number of columns, got $(size(x,2)) and $(size(y,2))")) - # @einsum Wy[o,i,s] := W[o,i,j] * y[j,s] - Wy = reshape(reshape(W, (:, d_y)) * y, (d_z, d_x, :)) + # @einsum Wy[o,i,s] := W[o,i,j] * y[j,s] + Wy = reshape(reshape(W, (:, d_y)) * y, (d_z, d_x, :)) - # @einsum Z[o,s] := Wy[o,i,s] * x[i,s] - Wyx = batched_mul(Wy, reshape(x, (d_x, 1, :))) - Z = reshape(Wyx, (d_z, :)) + # @einsum Z[o,s] := Wy[o,i,s] * x[i,s] + Wyx = batched_mul(Wy, reshape(x, (d_x, 1, :))) + Z = reshape(Wyx, (d_z, :)) - # @einsum out[o,s] := σ(Z[o,i] + b[o]) - σ.(Z .+ b) + # @einsum out[o,s] := σ(Z[o,i] + b[o]) + return σ.(Z .+ b) end (a::Bilinear)(x::AbstractVecOrMat) = a(x, x) -(a::Bilinear)(x::AbstractVector, y::AbstractVector) = vec(a(reshape(x, :,1), reshape(y, :,1))) +function (a::Bilinear)(x::AbstractVector, y::AbstractVector) + return vec(a(reshape(x, :, 1), reshape(y, :, 1))) +end (a::Bilinear)(x::NTuple{2, AbstractArray}) = a(x[1], x[2]) function Base.show(io::IO, l::Bilinear) - if size(l.weight, 2) == size(l.weight, 3) - print(io, "Bilinear(", size(l.weight, 2), " => ", size(l.weight, 1)) - else - print(io, "Bilinear((", size(l.weight, 2), ", ", size(l.weight, 3), ") => ", size(l.weight, 1)) - end - l.σ == identity || print(io, ", ", l.σ) - l.bias === false && print(io, "; bias=false") - print(io, ")") + if size(l.weight, 2) == size(l.weight, 3) + print(io, "Bilinear(", size(l.weight, 2), " => ", size(l.weight, 1)) + else + print(io, "Bilinear((", size(l.weight, 2), ", ", size(l.weight, 3), ") => ", + size(l.weight, 1)) + end + l.σ == identity || print(io, ", ", l.σ) + l.bias === false && print(io, "; bias=false") + return print(io, ")") end """ @@ -492,19 +511,19 @@ julia> model2[:β] == model2[2] true ``` """ -struct Parallel{F, T<:Union{Tuple, NamedTuple}} - connection::F - layers::T +struct Parallel{F, T <: Union{Tuple, NamedTuple}} + connection::F + layers::T end Parallel(connection, layers...) = Parallel(connection, layers) function Parallel(connection; kw...) - layers = NamedTuple(kw) - if :layers in keys(layers) || :connection in keys(layers) - throw(ArgumentError("a Parallel layer cannot have a named sub-layer called `connection` or `layers`")) - end - isempty(layers) && return Parallel(connection, ()) - Parallel(connection, layers) + layers = NamedTuple(kw) + if :layers in keys(layers) || :connection in keys(layers) + throw(ArgumentError("a Parallel layer cannot have a named sub-layer called `connection` or `layers`")) + end + isempty(layers) && return Parallel(connection, ()) + return Parallel(connection, layers) end @functor Parallel @@ -513,30 +532,31 @@ end (m::Parallel)(xs::Tuple) = m(xs...) 
function _parallel_check(layers, xs) - nl = length(layers) - nx = length(xs) - if (nl != nx) - throw(ArgumentError("Parallel with $nl sub-layers can take one input or $nl inputs, but got $nx inputs")) - end + nl = length(layers) + nx = length(xs) + if (nl != nx) + throw(ArgumentError("Parallel with $nl sub-layers can take one input or $nl inputs, but got $nx inputs")) + end end ChainRulesCore.@non_differentiable _parallel_check(nl, nx) function (m::Parallel)(xs...) - _parallel_check(m.layers, xs) - m.connection(map(|>, xs, Tuple(m.layers))...) + _parallel_check(m.layers, xs) + return m.connection(map(|>, xs, Tuple(m.layers))...) end Base.getindex(m::Parallel, i) = m.layers[i] Base.getindex(m::Parallel, i::AbstractVector) = Parallel(m.connection, m.layers[i]) -Base.getindex(m::Parallel{<:Any, <:NamedTuple}, i::AbstractVector) = - Parallel(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i])) +function Base.getindex(m::Parallel{<:Any, <:NamedTuple}, i::AbstractVector) + return Parallel(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i])) +end Base.keys(m::Parallel) = keys(getfield(m, :layers)) function Base.show(io::IO, m::Parallel) - print(io, "Parallel(", m.connection, ", ") - _show_layers(io, m.layers) - print(io, ")") + print(io, "Parallel(", m.connection, ", ") + _show_layers(io, m.layers) + return print(io, ")") end """ @@ -544,37 +564,40 @@ end ## Arguments -- `connection`: A function taking 2 inputs and combining them into a single output -- `layers`: The layers whose outputs are combined + - `connection`: A function taking 2 inputs and combining them into a single output + - `layers`: The layers whose outputs are combined ## Inputs This layer behaves differently based on input type: -1. If input `x` is a tuple of length N (or the input is `xs` with N `x`'s), matching the number of `layers`, - then each layer receives a new input `x[i]` combined with the previous output `y[i-1]` using `connection`. - Thus `(y1, y2, y3) = PairwiseFusion(connection, layer1, layer2, layer3)((x1, x2, x3))` - may be drawn as: + 1. If input `x` is a tuple of length N (or the input is `xs` with N `x`'s), matching the number of `layers`, + then each layer receives a new input `x[i]` combined with the previous output `y[i-1]` using `connection`. + Thus `(y1, y2, y3) = PairwiseFusion(connection, layer1, layer2, layer3)((x1, x2, x3))` + may be drawn as: + ``` x1 → layer1 → y1 ↘ connection → layer2 → y2 ↘ x2 ↗ connection → layer3 → y3 x3 ↗ ``` + ... or written as: + ```julia y1 = layer1(x1) y2 = layer2(connection(x2, y1)) y3 = layer3(connection(x3, y2)) ``` -2. With just one input, each layer receives the same `x` combined with the previous output. - Thus `y = PairwiseFusion(connection, layers...)(x)` obeys: + 2. With just one input, each layer receives the same `x` combined with the previous output. + Thus `y = PairwiseFusion(connection, layers...)(x)` obeys: ```julia y[1] == layers[1](x) for i in 2:length(layers) - y[i] == connection(x, layers[i](y[i-1])) + y[i] == connection(x, layers[i](y[i - 1])) end ``` @@ -582,74 +605,81 @@ end A tuple of length N with the output of each fusion ((`y1`, `y2`, ..., `yN`) in the example above). """ -struct PairwiseFusion{F, T<:Union{Tuple, NamedTuple}} - connection::F - layers::T +struct PairwiseFusion{F, T <: Union{Tuple, NamedTuple}} + connection::F + layers::T end PairwiseFusion(connection, layers...) = PairwiseFusion(connection, layers) function PairwiseFusion(connection; kw...) 
- layers = NamedTuple(kw) - if :layers in keys(layers) || :connection in keys(layers) - throw(ArgumentError("a PairwiseFusion layer cannot have a named sub-layer called `connection` or `layers`")) - end - isempty(layers) && return PairwiseFusion(connection, ()) - PairwiseFusion(connection, layers) + layers = NamedTuple(kw) + if :layers in keys(layers) || :connection in keys(layers) + throw(ArgumentError("a PairwiseFusion layer cannot have a named sub-layer called `connection` or `layers`")) + end + isempty(layers) && return PairwiseFusion(connection, ()) + return PairwiseFusion(connection, layers) end function _pairwise_check(x, layers, T) - lx = length(x) - N = length(layers) - if T <: Tuple && lx != N - throw(ArgumentError("PairwiseFusion with $N sub-layers can take one input or $N inputs, but got $lx inputs")) - end + lx = length(x) + N = length(layers) + if T <: Tuple && lx != N + throw(ArgumentError("PairwiseFusion with $N sub-layers can take one input or $N inputs, but got $lx inputs")) + end end ChainRulesCore.@non_differentiable _pairwise_check(lx, N, T) function (m::PairwiseFusion)(x::T) where {T} - _pairwise_check(x, m.layers, T) - applypairwisefusion(m.layers, m.connection, x) + _pairwise_check(x, m.layers, T) + return applypairwisefusion(m.layers, m.connection, x) end (m::PairwiseFusion)(xs...) = m(xs) -@generated function applypairwisefusion(layers::Tuple{Vararg{<:Any,N}}, connection, x::T) where {N, T} - y_symbols = [gensym() for _ in 1:(N + 1)] - getinput(i) = T <: Tuple ? :(x[$i]) : :x - calls = [:($(y_symbols[N + 1]) = $(getinput(1)))] - for i in 1:N - 1 - push!(calls, quote - $(y_symbols[i]) = layers[$i]($(y_symbols[N + 1])) - $(y_symbols[N + 1]) = connection($(y_symbols[i]), $(getinput(i + 1))) - end) - end - push!(calls, :($(y_symbols[N]) = layers[$N]($(y_symbols[N + 1])))) - push!(calls, :(return tuple($(Tuple(y_symbols[1:N])...)))) - return Expr(:block, calls...) -end -applypairwisefusion(layers::NamedTuple, connection, x) = applypairwisefusion(Tuple(layers), connection, x) +@generated function applypairwisefusion(layers::Tuple{Vararg{<:Any, N}}, connection, + x::T) where {N, T} + y_symbols = [gensym() for _ in 1:(N + 1)] + getinput(i) = T <: Tuple ? :(x[$i]) : :x + calls = [:($(y_symbols[N + 1]) = $(getinput(1)))] + for i in 1:(N - 1) + push!(calls, + quote + $(y_symbols[i]) = layers[$i]($(y_symbols[N + 1])) + $(y_symbols[N + 1]) = connection($(y_symbols[i]), $(getinput(i + 1))) + end) + end + push!(calls, :($(y_symbols[N]) = layers[$N]($(y_symbols[N + 1])))) + push!(calls, :(return tuple($(Tuple(y_symbols[1:N])...)))) + return Expr(:block, calls...) 
+end +function applypairwisefusion(layers::NamedTuple, connection, x) + return applypairwisefusion(Tuple(layers), connection, x) +end @functor PairwiseFusion Base.getindex(m::PairwiseFusion, i) = m.layers[i] -Base.getindex(m::PairwiseFusion, i::AbstractVector) = PairwiseFusion(m.connection, m.layers[i]) -Base.getindex(m::PairwiseFusion{<:Any, <:NamedTuple}, i::AbstractVector) = - PairwiseFusion(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i])) +function Base.getindex(m::PairwiseFusion, i::AbstractVector) + return PairwiseFusion(m.connection, m.layers[i]) +end +function Base.getindex(m::PairwiseFusion{<:Any, <:NamedTuple}, i::AbstractVector) + return PairwiseFusion(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i])) +end Base.keys(m::PairwiseFusion) = keys(getfield(m, :layers)) function Base.show(io::IO, m::PairwiseFusion) - print(io, "PairwiseFusion(", m.connection, ", ") - _show_layers(io, m.layers) - print(io, ")") + print(io, "PairwiseFusion(", m.connection, ", ") + _show_layers(io, m.layers) + return print(io, ")") end """ Embedding(in => out; init=randn32) -A lookup table that stores embeddings of dimension `out` +A lookup table that stores embeddings of dimension `out` for a vocabulary of size `in`, as a trainable matrix. -This layer is often used to store word embeddings and retrieve them using indices. +This layer is often used to store word embeddings and retrieve them using indices. The input to the layer can be a vocabulary index in `1:in`, an array of indices, or the corresponding [`onehot encoding`](@ref OneHotArrays.onehotbatch). @@ -657,8 +687,9 @@ For indices `x`, the result is of size `(out, size(x)...)`, allowing several bat For one-hot `ohx`, the result is of size `(out, size(ohx)[2:end]...)`. # Examples + ```jldoctest -julia> emb = Embedding(26 => 4, init=Flux.identity_init(gain=22)) +julia> emb = Embedding(26 => 4, init = Flux.identity_init(gain = 22)) Embedding(26 => 4) # 104 parameters julia> emb(2) # one column of e.weight (here not random!) @@ -682,8 +713,8 @@ julia> emb(rand(1:26, (10, 1, 12))) |> size # three batch dimensions (4, 10, 1, 12) ``` """ -struct Embedding{W<:AbstractMatrix} - weight::W +struct Embedding{W <: AbstractMatrix} + weight::W end @functor Embedding @@ -696,8 +727,10 @@ Embedding((in, out)::Pair{<:Integer, <:Integer}; init = randn32) = Embedding(ini (m::Embedding)(x::AbstractVector{Bool}) = m.weight * x # usually OneHotVector (m::Embedding)(x::AbstractMatrix{Bool}) = m.weight * x # usually OneHotMatrix -(m::Embedding)(x::AbstractArray{Bool}) = reshape(m(reshape(x, size(x,1), :)), :, size(x)[2:end]...) +function (m::Embedding)(x::AbstractArray{Bool}) + return reshape(m(reshape(x, size(x, 1), :)), :, size(x)[2:end]...) +end function Base.show(io::IO, m::Embedding) - print(io, "Embedding(", size(m.weight, 2), " => ", size(m.weight, 1), ")") + return print(io, "Embedding(", size(m.weight, 2), " => ", size(m.weight, 1), ")") end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 003395c15d..428f460dd1 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -8,7 +8,9 @@ expand(N, i::Integer) = ntuple(_ -> i, N) conv_reshape_bias(c) = conv_reshape_bias(c.bias, c.stride) conv_reshape_bias(@nospecialize(bias), _) = bias -conv_reshape_bias(bias::AbstractVector, stride) = reshape(bias, map(_->1, stride)..., :, 1) +function conv_reshape_bias(bias::AbstractVector, stride) + return reshape(bias, map(_ -> 1, stride)..., :, 1) +end """ SamePad() @@ -21,22 +23,23 @@ When `stride≠1`, the output size equals `ceil(input_size/stride)`. 
See also [`Conv`](@ref), [`MaxPool`](@ref). # Examples + ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # a batch of images -julia> layer = Conv((2,2), 3 => 7, pad=SamePad()) +julia> layer = Conv((2, 2), 3 => 7, pad = SamePad()) Conv((2, 2), 3 => 7, pad=(1, 0, 1, 0)) # 91 parameters julia> layer(xs) |> size # notice how the dimensions stay the same with this padding (100, 100, 7, 50) -julia> layer2 = Conv((2,2), 3 => 7) +julia> layer2 = Conv((2, 2), 3 => 7) Conv((2, 2), 3 => 7) # 91 parameters julia> layer2(xs) |> size # the output dimension changes as the padding was not "same" (99, 99, 7, 50) -julia> layer3 = Conv((5, 5), 3 => 7, stride=2, pad=SamePad()) +julia> layer3 = Conv((5, 5), 3 => 7, stride = 2, pad = SamePad()) Conv((5, 5), 3 => 7, pad=2, stride=2) # 532 parameters julia> layer3(xs) |> size # output size = `ceil(input_size/stride)` = 50 @@ -45,16 +48,18 @@ julia> layer3(xs) |> size # output size = `ceil(input_size/stride)` = 50 """ struct SamePad end -calc_padding(lt, pad, k::NTuple{N,T}, dilation, stride) where {T,N} = expand(Val(2*N), pad) -function calc_padding(lt, ::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T} - #Ref: "A guide to convolution arithmetic for deep learning" https://arxiv.org/abs/1603.07285 +function calc_padding(lt, pad, k::NTuple{N, T}, dilation, stride) where {T, N} + return expand(Val(2 * N), pad) +end +function calc_padding(lt, ::SamePad, k::NTuple{N, T}, dilation, stride) where {N, T} + #Ref: "A guide to convolution arithmetic for deep learning" https://arxiv.org/abs/1603.07285 - # Effective kernel size, including dilation - k_eff = @. k + (k - 1) * (dilation - 1) - # How much total padding needs to be applied? - pad_amt = @. k_eff - 1 - # In case amount of padding is odd we need to apply different amounts to each side. - return Tuple(mapfoldl(i -> [cld(i, 2), fld(i,2)], vcat, pad_amt)) + # Effective kernel size, including dilation + k_eff = @. k + (k - 1) * (dilation - 1) + # How much total padding needs to be applied? + pad_amt = @. k_eff - 1 + # In case amount of padding is odd we need to apply different amounts to each side. + return Tuple(mapfoldl(i -> [cld(i, 2), fld(i, 2)], vcat, pad_amt)) end """ @@ -75,56 +80,61 @@ To take convolutions along `N` feature dimensions, this layer expects as input a with `ndims(x) == N+2`, where `size(x, N+1) == in` is the number of input channels, and `size(x, ndims(x))` is (as always) the number of observations in a batch. Then: -* `filter` should be a tuple of `N` integers. -* Keywords `stride` and `dilation` should each be either single integer, - or a tuple with `N` integers. -* Keyword `pad` specifies the number of elements added to the borders of the data array. It can be - - a single integer for equal padding all around, - - a tuple of `N` integers, to apply the same padding at begin/end of each spatial dimension, - - a tuple of `2*N` integers, for asymmetric padding, or - - the singleton `SamePad()`, to calculate padding such that - `size(output,d) == size(x,d) / stride` (possibly rounded) for each spatial dimension. -* Keyword `groups` is expected to be an `Int`. It specifies the number of groups - to divide a convolution into. + + - `filter` should be a tuple of `N` integers. + + - Keywords `stride` and `dilation` should each be either single integer, + or a tuple with `N` integers. + - Keyword `pad` specifies the number of elements added to the borders of the data array. 
It can be + + + a single integer for equal padding all around, + + a tuple of `N` integers, to apply the same padding at begin/end of each spatial dimension, + + a tuple of `2*N` integers, for asymmetric padding, or + + the singleton `SamePad()`, to calculate padding such that + `size(output,d) == size(x,d) / stride` (possibly rounded) for each spatial dimension. + - Keyword `groups` is expected to be an `Int`. It specifies the number of groups + to divide a convolution into. Keywords to control initialization of the layer: -* `init` - Function used to generate initial weights. Defaults to `glorot_uniform`. -* `bias` - The initial bias vector is all zero by default. Trainable bias can be disabled entirely - by setting this to `false`, or another vector can be provided such as `bias = randn(Float32, out)`. + + - `init` - Function used to generate initial weights. Defaults to `glorot_uniform`. + - `bias` - The initial bias vector is all zero by default. Trainable bias can be disabled entirely + by setting this to `false`, or another vector can be provided such as `bias = randn(Float32, out)`. See also [`ConvTranspose`](@ref), [`DepthwiseConv`](@ref), [`CrossCor`](@ref). # Examples + ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # a batch of images -julia> layer = Conv((5,5), 3 => 7, relu; bias = false) +julia> layer = Conv((5, 5), 3 => 7, relu; bias = false) Conv((5, 5), 3 => 7, relu, bias=false) # 525 parameters julia> layer(xs) |> size (96, 96, 7, 50) -julia> Conv((5,5), 3 => 7; stride = 2)(xs) |> size +julia> Conv((5, 5), 3 => 7; stride = 2)(xs) |> size (48, 48, 7, 50) -julia> Conv((5,5), 3 => 7; stride = 2, pad = SamePad())(xs) |> size +julia> Conv((5, 5), 3 => 7; stride = 2, pad = SamePad())(xs) |> size (50, 50, 7, 50) -julia> Conv((1,1), 3 => 7; pad = (20,10,0,0))(xs) |> size +julia> Conv((1, 1), 3 => 7; pad = (20, 10, 0, 0))(xs) |> size (130, 100, 7, 50) -julia> Conv((5,5), 3 => 7; stride = 2, dilation = 4)(xs) |> size +julia> Conv((5, 5), 3 => 7; stride = 2, dilation = 4)(xs) |> size (42, 42, 7, 50) ``` """ -struct Conv{N,M,F,A,V} - σ::F - weight::A - bias::V - stride::NTuple{N,Int} - pad::NTuple{M,Int} - dilation::NTuple{N,Int} - groups::Int +struct Conv{N, M, F, A, V} + σ::F + weight::A + bias::V + stride::NTuple{N, Int} + pad::NTuple{M, Int} + dilation::NTuple{N, Int} + groups::Int end """ @@ -149,23 +159,21 @@ julia> Flux.params(layer) |> length 2 ``` """ -function Conv(w::AbstractArray{T,N}, b = true, σ = identity; - stride = 1, pad = 0, dilation = 1, groups = 1) where {T,N} - - @assert size(w, N) % groups == 0 "Output channel dimension must be divisible by groups." - stride = expand(Val(N-2), stride) - dilation = expand(Val(N-2), dilation) - pad = calc_padding(Conv, pad, size(w)[1:N-2], dilation, stride) - bias = create_bias(w, b, size(w, N)) - return Conv(σ, w, bias, stride, pad, dilation, groups) +function Conv(w::AbstractArray{T, N}, b = true, σ = identity; + stride = 1, pad = 0, dilation = 1, groups = 1) where {T, N} + @assert size(w, N) % groups==0 "Output channel dimension must be divisible by groups." 
+ stride = expand(Val(N - 2), stride) + dilation = expand(Val(N - 2), dilation) + pad = calc_padding(Conv, pad, size(w)[1:(N - 2)], dilation, stride) + bias = create_bias(w, b, size(w, N)) + return Conv(σ, w, bias, stride, pad, dilation, groups) end -function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, groups = 1, - bias = true) where N - - weight = convfilter(k, ch; init, groups) - Conv(weight, bias, σ; stride, pad, dilation, groups) +function Conv(k::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}, σ = identity; + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, groups = 1, + bias = true) where {N} + weight = convfilter(k, ch; init, groups) + return Conv(weight, bias, σ; stride, pad, dilation, groups) end """ @@ -179,46 +187,48 @@ distribution. This is internally used by the [`Conv`](@ref) layer. """ -function convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; - init = glorot_uniform, groups = 1) where N - cin, cout = ch - @assert cin % groups == 0 "Input channel dimension must be divisible by groups." - @assert cout % groups == 0 "Output channel dimension must be divisible by groups." - init(filter..., cin÷groups, cout) +function convfilter(filter::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}; + init = glorot_uniform, groups = 1) where {N} + cin, cout = ch + @assert cin % groups==0 "Input channel dimension must be divisible by groups." + @assert cout % groups==0 "Output channel dimension must be divisible by groups." + return init(filter..., cin ÷ groups, cout) end @functor Conv -conv_dims(c::Conv, x::AbstractArray) = - DenseConvDims(x, c.weight; stride = c.stride, padding = c.pad, dilation = c.dilation, groups = c.groups) +function conv_dims(c::Conv, x::AbstractArray) + return DenseConvDims(x, c.weight; stride = c.stride, padding = c.pad, + dilation = c.dilation, groups = c.groups) +end ChainRulesCore.@non_differentiable conv_dims(::Any, ::Any) function (c::Conv)(x::AbstractArray) - σ = NNlib.fast_act(c.σ, x) - cdims = conv_dims(c, x) - σ.(conv(x, c.weight, cdims) .+ conv_reshape_bias(c)) + σ = NNlib.fast_act(c.σ, x) + cdims = conv_dims(c, x) + return σ.(conv(x, c.weight, cdims) .+ conv_reshape_bias(c)) end -_channels_in(l::Conv) = size(l.weight, ndims(l.weight)-1) * l.groups +_channels_in(l::Conv) = size(l.weight, ndims(l.weight) - 1) * l.groups _channels_out(l::Conv) = size(l.weight, ndims(l.weight)) function Base.show(io::IO, l::Conv) - print(io, "Conv(", size(l.weight)[1:ndims(l.weight)-2]) - print(io, ", ", _channels_in(l), " => ", _channels_out(l)) - _print_conv_opt(io, l) - print(io, ")") + print(io, "Conv(", size(l.weight)[1:(ndims(l.weight) - 2)]) + print(io, ", ", _channels_in(l), " => ", _channels_out(l)) + _print_conv_opt(io, l) + return print(io, ")") end function _print_conv_opt(io::IO, l) - l.σ == identity || print(io, ", ", l.σ) - all(==(0), l.pad) || print(io, ", pad=", _maybetuple_string(l.pad)) - all(==(1), l.stride) || print(io, ", stride=", _maybetuple_string(l.stride)) - all(==(1), l.dilation) || print(io, ", dilation=", _maybetuple_string(l.dilation)) - if hasproperty(l, :groups) - (l.groups == 1) || print(io, ", groups=", l.groups) - end - (l.bias === false) && print(io, ", bias=false") + l.σ == identity || print(io, ", ", l.σ) + all(==(0), l.pad) || print(io, ", pad=", _maybetuple_string(l.pad)) + all(==(1), l.stride) || print(io, ", stride=", _maybetuple_string(l.stride)) + all(==(1), l.dilation) || print(io, ", dilation=", 
_maybetuple_string(l.dilation)) + if hasproperty(l, :groups) + (l.groups == 1) || print(io, ", groups=", l.groups) + end + return (l.bias === false) && print(io, ", bias=false") end """ @@ -236,34 +246,35 @@ Parameters are controlled by additional keywords, with defaults See also [`Conv`](@ref) for more detailed description of keywords. # Examples + ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # a batch of 50 RGB images -julia> layer = ConvTranspose((5,5), 3 => 7, relu) +julia> layer = ConvTranspose((5, 5), 3 => 7, relu) ConvTranspose((5, 5), 3 => 7, relu) # 532 parameters julia> layer(xs) |> size (104, 104, 7, 50) -julia> ConvTranspose((5,5), 3 => 7, stride=2)(xs) |> size +julia> ConvTranspose((5, 5), 3 => 7, stride = 2)(xs) |> size (203, 203, 7, 50) -julia> ConvTranspose((5,5), 3 => 7, stride=3, pad=SamePad())(xs) |> size +julia> ConvTranspose((5, 5), 3 => 7, stride = 3, pad = SamePad())(xs) |> size (300, 300, 7, 50) ``` """ -struct ConvTranspose{N,M,F,A,V} - σ::F - weight::A - bias::V - stride::NTuple{N,Int} - pad::NTuple{M,Int} - dilation::NTuple{N,Int} - groups::Int +struct ConvTranspose{N, M, F, A, V} + σ::F + weight::A + bias::V + stride::NTuple{N, Int} + pad::NTuple{M, Int} + dilation::NTuple{N, Int} + groups::Int end -_channels_in(l::ConvTranspose) = size(l.weight)[end] -_channels_out(l::ConvTranspose) = size(l.weight)[end-1]*l.groups +_channels_in(l::ConvTranspose) = size(l.weight)[end] +_channels_out(l::ConvTranspose) = size(l.weight)[end - 1] * l.groups """ ConvTranspose(weight::AbstractArray, [bias, activation; stride, pad, dilation, groups]) @@ -273,6 +284,7 @@ Accepts the same keywords and has the same defaults as [`ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ; ...)`](@ref ConvTranspose). # Examples + ```jldoctest julia> weight = rand(3, 4, 5); @@ -288,66 +300,65 @@ julia> Flux.params(layer) |> length 2 ``` """ -function ConvTranspose(w::AbstractArray{T,N}, bias = true, σ = identity; - stride = 1, pad = 0, dilation = 1, groups=1) where {T,N} - stride = expand(Val(N-2), stride) - dilation = expand(Val(N-2), dilation) - pad = calc_padding(ConvTranspose, pad, size(w)[1:N-2], dilation, stride) - b = create_bias(w, bias, size(w, N-1) * groups) - return ConvTranspose(σ, w, b, stride, pad, dilation, groups) +function ConvTranspose(w::AbstractArray{T, N}, bias = true, σ = identity; + stride = 1, pad = 0, dilation = 1, groups = 1) where {T, N} + stride = expand(Val(N - 2), stride) + dilation = expand(Val(N - 2), dilation) + pad = calc_padding(ConvTranspose, pad, size(w)[1:(N - 2)], dilation, stride) + b = create_bias(w, bias, size(w, N - 1) * groups) + return ConvTranspose(σ, w, b, stride, pad, dilation, groups) end -function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - groups = 1, - bias = true, - ) where N - - weight = convfilter(k, reverse(ch); init, groups) - ConvTranspose(weight, bias, σ; stride, pad, dilation, groups) +function ConvTranspose(k::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}, σ = identity; + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, + groups = 1, + bias = true) where {N} + weight = convfilter(k, reverse(ch); init, groups) + return ConvTranspose(weight, bias, σ; stride, pad, dilation, groups) end @functor ConvTranspose function conv_transpose_dims(c::ConvTranspose, x::AbstractArray) - # Calculate size of "input", from ∇conv_data()'s perspective... 
- combined_pad = (c.pad[1:2:end] .+ c.pad[2:2:end]) - I = (size(x)[1:end-2] .- 1).*c.stride .+ 1 .+ (size(c.weight)[1:end-2] .- 1).*c.dilation .- combined_pad - C_in = size(c.weight)[end-1] * c.groups - batch_size = size(x)[end] - # Create DenseConvDims() that looks like the corresponding conv() - w_size = size(c.weight) - return DenseConvDims((I..., C_in, batch_size), w_size; - stride=c.stride, - padding=c.pad, - dilation=c.dilation, - groups=c.groups, - ) + # Calculate size of "input", from ∇conv_data()'s perspective... + combined_pad = (c.pad[1:2:end] .+ c.pad[2:2:end]) + I = (size(x)[1:(end - 2)] .- 1) .* c.stride .+ 1 .+ + (size(c.weight)[1:(end - 2)] .- 1) .* c.dilation .- combined_pad + C_in = size(c.weight)[end - 1] * c.groups + batch_size = size(x)[end] + # Create DenseConvDims() that looks like the corresponding conv() + w_size = size(c.weight) + return DenseConvDims((I..., C_in, batch_size), w_size; + stride = c.stride, + padding = c.pad, + dilation = c.dilation, + groups = c.groups) end ChainRulesCore.@non_differentiable conv_transpose_dims(::Any, ::Any) function (c::ConvTranspose)(x::AbstractArray) - σ = NNlib.fast_act(c.σ, x) - cdims = conv_transpose_dims(c, x) - σ.(∇conv_data(x, c.weight, cdims) .+ conv_reshape_bias(c)) + σ = NNlib.fast_act(c.σ, x) + cdims = conv_transpose_dims(c, x) + return σ.(∇conv_data(x, c.weight, cdims) .+ conv_reshape_bias(c)) end function Base.show(io::IO, l::ConvTranspose) - print(io, "ConvTranspose(", size(l.weight)[1:ndims(l.weight)-2]) - print(io, ", ", _channels_in(l), " => ", _channels_out(l)) - _print_conv_opt(io, l) - print(io, ")") + print(io, "ConvTranspose(", size(l.weight)[1:(ndims(l.weight) - 2)]) + print(io, ", ", _channels_in(l), " => ", _channels_out(l)) + _print_conv_opt(io, l) + return print(io, ")") end -function calc_padding(::Type{ConvTranspose}, pad::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T} - calc_padding(Conv, pad, k .- stride .+ 1, dilation, stride) +function calc_padding(::Type{ConvTranspose}, pad::SamePad, k::NTuple{N, T}, dilation, + stride) where {N, T} + return calc_padding(Conv, pad, k .- stride .+ 1, dilation, stride) end """ DepthwiseConv(filter, in => out, σ=identity; stride=1, pad=0, dilation=1, [bias, init]) DepthwiseConv(weight::AbstractArray, [bias, activation; stride, pad, dilation]) - + Return a depthwise convolutional layer, that is a [`Conv`](@ref) layer with number of groups equal to the number of input channels. @@ -358,28 +369,29 @@ See [`Conv`](@ref) for a description of the arguments. 
```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # a batch of 50 RGB images -julia> layer = DepthwiseConv((5,5), 3 => 6, relu; bias=false) -Conv((5, 5), 3 => 6, relu, groups=3, bias=false) # 150 parameters +julia> layer = DepthwiseConv((5, 5), 3 => 6, relu; bias = false) +Conv((5, 5), 3 => 6, relu, groups=3, bias=false) # 150 parameters julia> layer(xs) |> size (96, 96, 6, 50) -julia> DepthwiseConv((5, 5), 3 => 9, stride=2, pad=2)(xs) |> size +julia> DepthwiseConv((5, 5), 3 => 9, stride = 2, pad = 2)(xs) |> size (50, 50, 9, 50) ``` """ -function DepthwiseConv(k::NTuple{<:Any,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - stride = 1, pad = 0, dilation = 1, bias = true, init = glorot_uniform) - Conv(k, ch, σ; groups=ch.first, stride, pad, dilation, bias, init) +function DepthwiseConv(k::NTuple{<:Any, Integer}, ch::Pair{<:Integer, <:Integer}, + σ = identity; + stride = 1, pad = 0, dilation = 1, bias = true, + init = glorot_uniform) + return Conv(k, ch, σ; groups = ch.first, stride, pad, dilation, bias, init) end -function DepthwiseConv(w::AbstractArray{T,N}, bias = true, σ = identity; - stride = 1, pad = 0, dilation = 1) where {T,N} - w2 = reshape(w, size(w)[1:end-2]..., 1, :) - Conv(w2, bias, σ; groups = size(w)[end-1], stride, pad, dilation) +function DepthwiseConv(w::AbstractArray{T, N}, bias = true, σ = identity; + stride = 1, pad = 0, dilation = 1) where {T, N} + w2 = reshape(w, size(w)[1:(end - 2)]..., 1, :) + return Conv(w2, bias, σ; groups = size(w)[end - 1], stride, pad, dilation) end - """ CrossCor(filter, in => out, σ=identity; stride=1, pad=0, dilation=1, [bias, init]) @@ -397,23 +409,23 @@ See also [`Conv`](@ref) for more detailed description of keywords. ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # a batch of 50 RGB images -julia> layer = CrossCor((5,5), 3 => 6, relu; bias=false) +julia> layer = CrossCor((5, 5), 3 => 6, relu; bias = false) CrossCor((5, 5), 3 => 6, relu, bias=false) # 450 parameters julia> layer(xs) |> size (96, 96, 6, 50) -julia> CrossCor((5,5), 3 => 7, stride=3, pad=(2,0))(xs) |> size +julia> CrossCor((5, 5), 3 => 7, stride = 3, pad = (2, 0))(xs) |> size (34, 32, 7, 50) ``` """ -struct CrossCor{N,M,F,A,V} - σ::F - weight::A - bias::V - stride::NTuple{N,Int} - pad::NTuple{M,Int} - dilation::NTuple{N,Int} +struct CrossCor{N, M, F, A, V} + σ::F + weight::A + bias::V + stride::NTuple{N, Int} + pad::NTuple{M, Int} + dilation::NTuple{N, Int} end """ @@ -424,6 +436,7 @@ Accepts the same keywords and has the same defaults as [`CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ; ...)`](@ref CrossCor). 
# Examples + ```jldoctest julia> weight = rand(3, 4, 5); @@ -436,46 +449,48 @@ julia> layer(randn(100, 4, 64)) |> size (98, 5, 64) ``` """ -function CrossCor(w::AbstractArray{T,N}, bias = true, σ = identity; - stride = 1, pad = 0, dilation = 1) where {T,N} - stride = expand(Val(N-2), stride) - dilation = expand(Val(N-2), dilation) - pad = calc_padding(CrossCor, pad, size(w)[1:N-2], dilation, stride) - b = create_bias(w, bias, size(w, N)) - return CrossCor(σ, w, b, stride, pad, dilation) +function CrossCor(w::AbstractArray{T, N}, bias = true, σ = identity; + stride = 1, pad = 0, dilation = 1) where {T, N} + stride = expand(Val(N - 2), stride) + dilation = expand(Val(N - 2), dilation) + pad = calc_padding(CrossCor, pad, size(w)[1:(N - 2)], dilation, stride) + b = create_bias(w, bias, size(w, N)) + return CrossCor(σ, w, b, stride, pad, dilation) end -function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; +function CrossCor(k::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - bias = true) where N - - weight = convfilter(k, ch, init = init) - return CrossCor(weight, bias, σ; stride, pad, dilation) + bias = true) where {N} + weight = convfilter(k, ch, init = init) + return CrossCor(weight, bias, σ; stride, pad, dilation) end @functor CrossCor function crosscor(x, w, ddims::DenseConvDims) - ddims = DenseConvDims(ddims, F=true) - return conv(x, w, ddims) + ddims = DenseConvDims(ddims, F = true) + return conv(x, w, ddims) end -crosscor_dims(c::CrossCor, x::AbstractArray) = - DenseConvDims(x, c.weight; stride = c.stride, padding = c.pad, dilation = c.dilation) +function crosscor_dims(c::CrossCor, x::AbstractArray) + return DenseConvDims(x, c.weight; stride = c.stride, padding = c.pad, + dilation = c.dilation) +end ChainRulesCore.@non_differentiable crosscor_dims(::Any, ::Any) function (c::CrossCor)(x::AbstractArray) - σ = NNlib.fast_act(c.σ, x) - cdims = crosscor_dims(c, x) - σ.(crosscor(x, c.weight, cdims) .+ conv_reshape_bias(c)) + σ = NNlib.fast_act(c.σ, x) + cdims = crosscor_dims(c, x) + return σ.(crosscor(x, c.weight, cdims) .+ conv_reshape_bias(c)) end function Base.show(io::IO, l::CrossCor) - print(io, "CrossCor(", size(l.weight)[1:ndims(l.weight)-2]) - print(io, ", ", size(l.weight, ndims(l.weight)-1), " => ", size(l.weight, ndims(l.weight))) - _print_conv_opt(io, l) - print(io, ")") + print(io, "CrossCor(", size(l.weight)[1:(ndims(l.weight) - 2)]) + print(io, ", ", size(l.weight, ndims(l.weight) - 1), " => ", + size(l.weight, ndims(l.weight))) + _print_conv_opt(io, l) + return print(io, ")") end """ @@ -490,33 +505,34 @@ batch dimensions, after the `N` feature dimensions, where `N = length(out)`. See also [`MaxPool`](@ref), [`AdaptiveMeanPool`](@ref). 
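The adaptive pooling layers documented here derive an ordinary pooling window from the requested output size. A minimal sketch of that derivation, matching the forward pass shown further below (the sizes are illustrative only):

```julia
# Sketch: how AdaptiveMaxPool / AdaptiveMeanPool pick stride and kernel size
# from the requested output size (illustrative values, not part of the patch).
insize  = (100, 100)                           # spatial size of the input
outsize = (25, 25)                             # requested output size
stride  = insize .÷ outsize                    # (4, 4)
k       = insize .- (outsize .- 1) .* stride   # (4, 4), equivalent to MaxPool((4, 4)) here
```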
# Examples + ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # batch of 50 RGB images julia> AdaptiveMaxPool((25, 25))(xs) |> size (25, 25, 3, 50) -julia> MaxPool((4,4))(xs) ≈ AdaptiveMaxPool((25, 25))(xs) +julia> MaxPool((4, 4))(xs) ≈ AdaptiveMaxPool((25, 25))(xs) true ``` """ struct AdaptiveMaxPool{S, O} - out::NTuple{O, Int} - AdaptiveMaxPool(out::NTuple{O, Int}) where O = new{O + 2, O}(out) + out::NTuple{O, Int} + AdaptiveMaxPool(out::NTuple{O, Int}) where {O} = new{O + 2, O}(out) end function (a::AdaptiveMaxPool{S})(x::AbstractArray{T, S}) where {S, T} - insize = size(x)[1:end-2] - outsize = a.out - stride = insize .÷ outsize - k = insize .- (outsize .- 1) .* stride - pad = 0 - pdims = PoolDims(x, k; padding=pad, stride=stride) - return maxpool(x, pdims) + insize = size(x)[1:(end - 2)] + outsize = a.out + stride = insize .÷ outsize + k = insize .- (outsize .- 1) .* stride + pad = 0 + pdims = PoolDims(x, k; padding = pad, stride = stride) + return maxpool(x, pdims) end function Base.show(io::IO, a::AdaptiveMaxPool) - print(io, "AdaptiveMaxPool(", a.out, ")") + return print(io, "AdaptiveMaxPool(", a.out, ")") end """ @@ -531,33 +547,34 @@ batch dimensions, after the `N` feature dimensions, where `N = length(out)`. See also [`MaxPool`](@ref), [`AdaptiveMaxPool`](@ref). # Examples + ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # batch of 50 RGB images julia> AdaptiveMeanPool((25, 25))(xs) |> size (25, 25, 3, 50) -julia> MeanPool((4,4))(xs) ≈ AdaptiveMeanPool((25, 25))(xs) +julia> MeanPool((4, 4))(xs) ≈ AdaptiveMeanPool((25, 25))(xs) true ``` """ struct AdaptiveMeanPool{S, O} - out::NTuple{O, Int} - AdaptiveMeanPool(out::NTuple{O, Int}) where O = new{O + 2, O}(out) + out::NTuple{O, Int} + AdaptiveMeanPool(out::NTuple{O, Int}) where {O} = new{O + 2, O}(out) end function (a::AdaptiveMeanPool{S})(x::AbstractArray{T, S}) where {S, T} - insize = size(x)[1:end-2] - outsize = a.out - stride = insize .÷ outsize - k = insize .- (outsize .- 1) .* stride - pad = 0 - pdims = PoolDims(x, k; padding=pad, stride=stride) - return meanpool(x, pdims) + insize = size(x)[1:(end - 2)] + outsize = a.out + stride = insize .÷ outsize + k = insize .- (outsize .- 1) .* stride + pad = 0 + pdims = PoolDims(x, k; padding = pad, stride = stride) + return meanpool(x, pdims) end function Base.show(io::IO, a::AdaptiveMeanPool) - print(io, "AdaptiveMeanPool(", a.out, ")") + return print(io, "AdaptiveMeanPool(", a.out, ")") end """ @@ -573,30 +590,30 @@ See also [`MaxPool`](@ref), [`GlobalMeanPool`](@ref). ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); -julia> m = Chain(Conv((3,3), 3 => 7), GlobalMaxPool()); +julia> m = Chain(Conv((3, 3), 3 => 7), GlobalMaxPool()); julia> m(xs) |> size (1, 1, 7, 50) -julia> GlobalMaxPool()(rand(3,5,7)) |> size # preserves 2 dimensions +julia> GlobalMaxPool()(rand(3, 5, 7)) |> size # preserves 2 dimensions (1, 5, 7) ``` """ struct GlobalMaxPool end function (g::GlobalMaxPool)(x) - # Input size - x_size = size(x) - # Kernel size - k = x_size[1:end-2] - # Pooling dimensions - pdims = PoolDims(x, k) + # Input size + x_size = size(x) + # Kernel size + k = x_size[1:(end - 2)] + # Pooling dimensions + pdims = PoolDims(x, k) - return maxpool(x, pdims) + return maxpool(x, pdims) end function Base.show(io::IO, g::GlobalMaxPool) - print(io, "GlobalMaxPool()") + return print(io, "GlobalMaxPool()") end """ @@ -610,7 +627,7 @@ by performing mean pooling on the complete (w,h)-shaped feature maps. 
```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); -julia> m = Chain(Conv((3,3), 3 => 7), GlobalMeanPool()); +julia> m = Chain(Conv((3, 3), 3 => 7), GlobalMeanPool()); julia> m(xs) |> size (1, 1, 7, 50) @@ -619,18 +636,18 @@ julia> m(xs) |> size struct GlobalMeanPool end function (g::GlobalMeanPool)(x) - # Input size - x_size = size(x) - # Kernel size - k = x_size[1:end-2] - # Pooling dimensions - pdims = PoolDims(x, k) + # Input size + x_size = size(x) + # Kernel size + k = x_size[1:(end - 2)] + # Pooling dimensions + pdims = PoolDims(x, k) - return meanpool(x, pdims) + return meanpool(x, pdims) end function Base.show(io::IO, g::GlobalMeanPool) - print(io, "GlobalMeanPool()") + return print(io, "GlobalMeanPool()") end """ @@ -653,7 +670,7 @@ See also [`Conv`](@ref), [`MeanPool`](@ref), [`AdaptiveMaxPool`](@ref), [`Global ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # batch of 50 RGB images -julia> m = Chain(Conv((5, 5), 3 => 7, pad=SamePad()), MaxPool((5, 5), pad=SamePad())) +julia> m = Chain(Conv((5, 5), 3 => 7, pad = SamePad()), MaxPool((5, 5), pad = SamePad())) Chain( Conv((5, 5), 3 => 7, pad=2), # 532 parameters MaxPool((5, 5), pad=2), @@ -665,39 +682,39 @@ julia> m[1](xs) |> size julia> m(xs) |> size (20, 20, 7, 50) -julia> layer = MaxPool((5,), pad=2, stride=(3,)) # one-dimensional window +julia> layer = MaxPool((5,), pad = 2, stride = (3,)) # one-dimensional window MaxPool((5,), pad=2, stride=3) julia> layer(rand(Float32, 100, 7, 50)) |> size (34, 7, 50) ``` """ -struct MaxPool{N,M} - k::NTuple{N,Int} - pad::NTuple{M,Int} - stride::NTuple{N,Int} +struct MaxPool{N, M} + k::NTuple{N, Int} + pad::NTuple{M, Int} + stride::NTuple{N, Int} end -function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N - stride = expand(Val(N), stride) - pad = calc_padding(MaxPool, pad, k, 1, stride) - return MaxPool(k, pad, stride) +function MaxPool(k::NTuple{N, Integer}; pad = 0, stride = k) where {N} + stride = expand(Val(N), stride) + pad = calc_padding(MaxPool, pad, k, 1, stride) + return MaxPool(k, pad, stride) end function (m::MaxPool)(x) - pdims = PoolDims(x, m.k; padding=m.pad, stride=m.stride) - return maxpool(x, pdims) + pdims = PoolDims(x, m.k; padding = m.pad, stride = m.stride) + return maxpool(x, pdims) end function Base.show(io::IO, m::MaxPool) - print(io, "MaxPool(", m.k) - all(==(0), m.pad) || print(io, ", pad=", _maybetuple_string(m.pad)) - m.stride == m.k || print(io, ", stride=", _maybetuple_string(m.stride)) - print(io, ")") + print(io, "MaxPool(", m.k) + all(==(0), m.pad) || print(io, ", pad=", _maybetuple_string(m.pad)) + m.stride == m.k || print(io, ", stride=", _maybetuple_string(m.stride)) + return print(io, ")") end _maybetuple_string(pad) = string(pad) -_maybetuple_string(pad::Tuple) = all(==(pad[1]), pad) ? string(pad[1]) : string(pad) +_maybetuple_string(pad::Tuple) = all(==(pad[1]), pad) ? string(pad[1]) : string(pad) """ MeanPool(window::NTuple; pad=0, stride=window) @@ -718,7 +735,7 @@ See also [`Conv`](@ref), [`MaxPool`](@ref), [`AdaptiveMeanPool`](@ref). 
```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); -julia> m = Chain(Conv((5,5), 3 => 7), MeanPool((5,5), pad=SamePad())) +julia> m = Chain(Conv((5, 5), 3 => 7), MeanPool((5, 5), pad = SamePad())) Chain( Conv((5, 5), 3 => 7), # 532 parameters MeanPool((5, 5), pad=2), @@ -731,26 +748,26 @@ julia> m(xs) |> size (20, 20, 7, 50) ``` """ -struct MeanPool{N,M} - k::NTuple{N,Int} - pad::NTuple{M,Int} - stride::NTuple{N,Int} +struct MeanPool{N, M} + k::NTuple{N, Int} + pad::NTuple{M, Int} + stride::NTuple{N, Int} end -function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N - stride = expand(Val(N), stride) - pad = calc_padding(MeanPool, pad, k, 1, stride) - return MeanPool(k, pad, stride) +function MeanPool(k::NTuple{N, Integer}; pad = 0, stride = k) where {N} + stride = expand(Val(N), stride) + pad = calc_padding(MeanPool, pad, k, 1, stride) + return MeanPool(k, pad, stride) end function (m::MeanPool)(x) - pdims = PoolDims(x, m.k; padding=m.pad, stride=m.stride) - return meanpool(x, pdims) + pdims = PoolDims(x, m.k; padding = m.pad, stride = m.stride) + return meanpool(x, pdims) end function Base.show(io::IO, m::MeanPool) - print(io, "MeanPool(", m.k) - all(==(0), m.pad) || print(io, ", pad=", _maybetuple_string(m.pad)) - m.stride == m.k || print(io, ", stride=", _maybetuple_string(m.stride)) - print(io, ")") + print(io, "MeanPool(", m.k) + all(==(0), m.pad) || print(io, ", pad=", _maybetuple_string(m.pad)) + m.stride == m.k || print(io, ", stride=", _maybetuple_string(m.stride)) + return print(io, ")") end diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 0f2696a50a..e832ce184a 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -5,7 +5,7 @@ ChainRulesCore.rrule(::typeof(istraining)) = true, _ -> (NoTangent(),) _isactive(m) = isnothing(m.active) ? istraining() : m.active _dropout_shape(s, ::Colon) = size(s) -_dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...) +_dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) in enumerate(size(s)))...) _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0) @@ -29,22 +29,23 @@ automatically managed using the [`Dropout`](@ref) layer instead of the The [`Dropout`](@ref) layer is what you should use in most scenarios. """ -function dropout(rng, x, p; dims=:, active::Bool=true) - active || return x - y = dropout_mask(rng, x, p, dims=dims) - return x .* y +function dropout(rng, x, p; dims = :, active::Bool = true) + active || return x + y = dropout_mask(rng, x, p, dims = dims) + return x .* y end dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...) dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) -dropout_mask(rng, x::CuArray, p; kwargs...) = - throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.")) +function dropout_mask(rng, x::CuArray, p; kwargs...) + throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.")) +end dropout_mask(rng, x, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) 
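For reference, a minimal sketch of the inverted-dropout scaling implemented by `_dropout_mask` and `_dropout_kernel` below: each entry is kept with probability `1 - p` and scaled by `1 / (1 - p)`, so the expected value of the output matches the input (the array and probability here are illustrative, not part of the patch).

```julia
# Illustrative sketch of an inverted-dropout mask (not part of the patch).
using Statistics

p = 0.3
x = ones(Float32, 10_000)
keep = rand(Float32, size(x)) .> p            # keep each entry with probability 1 - p
mask = keep ./ (1 - p)                        # surviving entries are scaled by 1/(1 - p)
isapprox(mean(mask .* x), 1; atol = 0.05)     # the expectation is preserved
```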
-function _dropout_mask(rng, x, p; dims=:) - realfptype = float(real(eltype(x))) - y = rand!(rng, similar(x, realfptype, _dropout_shape(x, dims))) - y .= _dropout_kernel.(y, p, 1 - p) - return y +function _dropout_mask(rng, x, p; dims = :) + realfptype = float(real(eltype(x))) + y = rand!(rng, similar(x, realfptype, _dropout_shape(x, dims))) + y .= _dropout_kernel.(y, p, 1 - p) + return y end # TODO move this to NNlib @@ -56,9 +57,9 @@ ChainRulesCore.@non_differentiable dropout_mask(::Any, ::Any, ::Any) Dropout layer. While training, for each input, this layer either sets that input to `0` (with probability -`p`) or scales it by `1 / (1 - p)`. To apply dropout along certain dimension(s), specify the +`p`) or scales it by `1 / (1 - p)`. To apply dropout along certain dimension(s), specify the `dims` keyword. e.g. `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input -(also called 2D dropout). This is used as a regularisation, i.e. it reduces overfitting during +(also called 2D dropout). This is used as a regularisation, i.e. it reduces overfitting during training. In the forward pass, this layer applies the [`Flux.dropout`](@ref) function. See that for more @@ -70,6 +71,7 @@ Custom RNGs are only supported on the CPU. Does nothing to the input once [`Flux.testmode!`](@ref) is `true`. # Examples + ```jldoctest julia> m = Chain(Dense(1 => 1), Dropout(1)); @@ -86,38 +88,39 @@ julia> Flux.trainmode!(m); julia> y = m(ones(1000)); -julia> isapprox(count(==(0), y) / length(y), 0.5, atol=0.1) +julia> isapprox(count(==(0), y) / length(y), 0.5, atol = 0.1) true ``` """ -mutable struct Dropout{F,D,R<:AbstractRNG} - p::F - dims::D - active::Union{Bool, Nothing} - rng::R +mutable struct Dropout{F, D, R <: AbstractRNG} + p::F + dims::D + active::Union{Bool, Nothing} + rng::R end Dropout(p, dims, active) = Dropout(p, dims, active, default_rng_value()) -function Dropout(p; dims=:, rng = default_rng_value()) - @assert 0 ≤ p ≤ 1 - Dropout(p, dims, nothing, rng) +function Dropout(p; dims = :, rng = default_rng_value()) + @assert 0 ≤ p ≤ 1 + return Dropout(p, dims, nothing, rng) end @functor Dropout trainable(a::Dropout) = (;) function (a::Dropout)(x) - _isactive(a) || return x - return dropout(a.rng, x, a.p; dims=a.dims, active=true) + _isactive(a) || return x + return dropout(a.rng, x, a.p; dims = a.dims, active = true) end -testmode!(m::Dropout, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +function testmode!(m::Dropout, mode = true) + return (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +end function Base.show(io::IO, d::Dropout) - print(io, "Dropout(", d.p) - d.dims != (:) && print(io, ", dims = $(repr(d.dims))") - print(io, ")") + print(io, "Dropout(", d.p) + d.dims != (:) && print(io, ", dims = $(repr(d.dims))") + return print(io, ")") end """ @@ -131,10 +134,11 @@ remain the same as before. Does nothing to the input once [`testmode!`](@ref) is true. 
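A minimal sketch of the transform `AlphaDropout` applies, mirroring the forward pass defined later in this hunk (the drop probability and sample size are illustrative): dropped entries are replaced by `α′ = selu(-Inf)`, then an affine correction `(A, B)` keeps a standard-normal input at roughly zero mean and unit variance.

```julia
# Illustrative sketch of alpha dropout (not part of the patch).
using Statistics

p  = 0.2
α′ = -1.7580993408473766                  # selu(-Inf) == -λα
A  = inv(sqrt((1 - p) * (1 + p * α′^2)))
B  = -A * α′ * p
x  = randn(100_000)
keep = rand(length(x)) .> p
y  = A .* ifelse.(keep, x, α′) .+ B
(mean(y), std(y))                         # both remain close to (0, 1)
```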
# Examples + ```jldoctest julia> using Statistics -julia> x = randn(1000,1); +julia> x = randn(1000, 1); julia> m = Chain(Dense(1000 => 1000, selu), AlphaDropout(0.2)); @@ -142,18 +146,18 @@ julia> Flux.trainmode!(m); julia> y = m(x); -julia> isapprox(std(x), std(y), atol=0.2) +julia> isapprox(std(x), std(y), atol = 0.2) true ``` """ -mutable struct AlphaDropout{F,R<:AbstractRNG} - p::F - active::Union{Bool, Nothing} - rng::R - function AlphaDropout(p, active, rng) - @assert 0 ≤ p ≤ 1 - new{typeof(p), typeof(rng)}(p, active, rng) - end +mutable struct AlphaDropout{F, R <: AbstractRNG} + p::F + active::Union{Bool, Nothing} + rng::R + function AlphaDropout(p, active, rng) + @assert 0 ≤ p ≤ 1 + return new{typeof(p), typeof(rng)}(p, active, rng) + end end AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value()) AlphaDropout(p; rng = default_rng_value()) = AlphaDropout(p, nothing, rng) @@ -161,22 +165,23 @@ AlphaDropout(p; rng = default_rng_value()) = AlphaDropout(p, nothing, rng) @functor AlphaDropout trainable(a::AlphaDropout) = (;) -function (a::AlphaDropout)(x::AbstractArray{T}) where T - _isactive(a) || return x - p = a.p - iszero(p) && return x - isone(p) && return sign.(x) .* T(0) +function (a::AlphaDropout)(x::AbstractArray{T}) where {T} + _isactive(a) || return x + p = a.p + iszero(p) && return x + isone(p) && return sign.(x) .* T(0) - α′ = T(-1.7580993408473766) # selu(-Inf) == -λα - A = T(inv(sqrt((1 - p) * (1 + p * α′^2)))) - B = T(-A * α′ * p) + α′ = T(-1.7580993408473766) # selu(-Inf) == -λα + A = T(inv(sqrt((1 - p) * (1 + p * α′^2)))) + B = T(-A * α′ * p) - noise = rand!(a.rng, similar(x)) - return A .* ifelse.(noise .> p, x, α′) .+ B + noise = rand!(a.rng, similar(x)) + return A .* ifelse.(noise .> p, x, α′) .+ B end -testmode!(m::AlphaDropout, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +function testmode!(m::AlphaDropout, mode = true) + return (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +end """ LayerNorm(size..., λ=identity; affine=true, ϵ=1fe-5) @@ -196,6 +201,7 @@ using the [`Scale`](@ref) layer. See also [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`normalise`](@ref). # Examples + ```jldoctest julia> using Statistics @@ -205,79 +211,78 @@ julia> m = LayerNorm(3); julia> y = m(xs); -julia> isapprox(std(y, dims=1:3), ones(1, 1, 1, 2), atol=0.1) && std(y, dims=1:3) != std(xs, dims=1:3) +julia> isapprox(std(y, dims = 1:3), ones(1, 1, 1, 2), atol = 0.1) && + std(y, dims = 1:3) != std(xs, dims = 1:3) true ``` """ -struct LayerNorm{F,D,T,N} - λ::F - diag::D - ϵ::T - size::NTuple{N,Int} - affine::Bool +struct LayerNorm{F, D, T, N} + λ::F + diag::D + ϵ::T + size::NTuple{N, Int} + affine::Bool end -function LayerNorm(size::Tuple{Vararg{Int}}, λ=identity; affine::Bool=true, ϵ::Real=1f-5) - diag = affine ? Scale(size..., λ) : λ!=identity ? Base.Fix1(broadcast, λ) : identity - return LayerNorm(λ, diag, ϵ, size, affine) +function LayerNorm(size::Tuple{Vararg{Int}}, λ = identity; affine::Bool = true, + ϵ::Real = 1.0f-5) + diag = affine ? Scale(size..., λ) : λ != identity ? Base.Fix1(broadcast, λ) : identity + return LayerNorm(λ, diag, ϵ, size, affine) end LayerNorm(size::Integer...; kw...) = LayerNorm(Int.(size); kw...) -LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:end-1]), size_act[end]; kw...) +LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:(end - 1)]), size_act[end]; kw...) 
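The `LayerNorm` forward pass defined just below applies `normalise` over the first `length(size)` dimensions and then the optional `Scale`. A minimal sketch of the standardisation it performs, assuming `affine = false` (the array shape and `ϵ` are illustrative):

```julia
# Sketch of the per-column standardisation behind LayerNorm with affine = false.
using Statistics

x = randn(Float32, 3, 4)                      # features × batch
μ = mean(x; dims = 1)
σ = std(x; mean = μ, dims = 1, corrected = false)
y = (x .- μ) ./ (σ .+ 1.0f-5)                 # roughly zero mean and unit std per column
```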
@functor LayerNorm -(a::LayerNorm)(x) = a.diag(normalise(x, dims=1:length(a.size), ϵ=a.ϵ)) +(a::LayerNorm)(x) = a.diag(normalise(x, dims = 1:length(a.size), ϵ = a.ϵ)) function Base.show(io::IO, l::LayerNorm) - print(io, "LayerNorm(", join(l.size, ", ")) - l.λ === identity || print(io, ", ", l.λ) - hasaffine(l) || print(io, ", affine=false") - print(io, ")") + print(io, "LayerNorm(", join(l.size, ", ")) + l.λ === identity || print(io, ", ", l.λ) + hasaffine(l) || print(io, ", affine=false") + return print(io, ")") end # For InstanceNorm, GroupNorm, and BatchNorm. # Compute the statistics on the slices specified by reduce_dims. # reduce_dims=[1,...,N-2,N] for BatchNorm # reduce_dims=[1,...,N-2] for InstanceNorm and GroupNorm -function _norm_layer_forward( - l, x::AbstractArray{T, N}; reduce_dims, affine_shape, -) where {T, N} - if !_isactive(l) && l.track_stats # testmode with tracked stats - stats_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N) - μ = reshape(l.μ, stats_shape) - σ² = reshape(l.σ², stats_shape) - else # trainmode or testmode without tracked stats - μ = mean(x; dims=reduce_dims) - σ² = var(x; mean=μ, dims=reduce_dims, corrected=false) - if l.track_stats - _track_stats!(l, x, μ, σ², reduce_dims) # update moving mean/std +function _norm_layer_forward(l, x::AbstractArray{T, N}; reduce_dims, + affine_shape) where {T, N} + if !_isactive(l) && l.track_stats # testmode with tracked stats + stats_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N) + μ = reshape(l.μ, stats_shape) + σ² = reshape(l.σ², stats_shape) + else # trainmode or testmode without tracked stats + μ = mean(x; dims = reduce_dims) + σ² = var(x; mean = μ, dims = reduce_dims, corrected = false) + if l.track_stats + _track_stats!(l, x, μ, σ², reduce_dims) # update moving mean/std + end end - end - o = _norm_layer_forward(x, μ, σ², l.ϵ) - hasaffine(l) || return l.λ.(o) + o = _norm_layer_forward(x, μ, σ², l.ϵ) + hasaffine(l) || return l.λ.(o) - γ = reshape(l.γ, affine_shape) - β = reshape(l.β, affine_shape) - return l.λ.(γ .* o .+ β) + γ = reshape(l.γ, affine_shape) + β = reshape(l.β, affine_shape) + return l.λ.(γ .* o .+ β) end @inline _norm_layer_forward(x, μ, σ², ϵ) = (x .- μ) ./ sqrt.(σ² .+ ϵ) -function _track_stats!( - bn, x::AbstractArray{T, N}, μ, σ², reduce_dims, -) where {T, N} - V = eltype(bn.σ²) - mtm = bn.momentum - res_mtm = one(V) - mtm - m = prod(size(x, i) for i in reduce_dims) +function _track_stats!(bn, x::AbstractArray{T, N}, μ, σ², reduce_dims) where {T, N} + V = eltype(bn.σ²) + mtm = bn.momentum + res_mtm = one(V) - mtm + m = prod(size(x, i) for i in reduce_dims) - μnew = vec(N ∈ reduce_dims ? μ : mean(μ, dims=N)) - σ²new = vec(N ∈ reduce_dims ? σ² : mean(σ², dims=N)) + μnew = vec(N ∈ reduce_dims ? μ : mean(μ, dims = N)) + σ²new = vec(N ∈ reduce_dims ? σ² : mean(σ², dims = N)) - bn.μ = res_mtm .* bn.μ .+ mtm .* μnew - bn.σ² = res_mtm .* bn.σ² .+ mtm .* (m / (m - one(V))) .* σ²new - return nothing + bn.μ = res_mtm .* bn.μ .+ mtm .* μnew + bn.σ² = res_mtm .* bn.σ² .+ mtm .* (m / (m - one(V))) .* σ²new + return nothing end ChainRulesCore.@non_differentiable _track_stats!(::Any...) @@ -309,6 +314,7 @@ that will be used to renormalize the input in test phase. Use [`testmode!`](@ref) during inference. 
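When `track_stats = true`, the running statistics are updated by `_track_stats!` above using an exponential moving average. A minimal numeric sketch of that update (all values are illustrative; `mtm` is the `momentum` keyword, default `0.1f0`):

```julia
# Sketch of the moving mean/variance update in _track_stats! (illustrative values).
mtm = 0.1f0                       # momentum
μ, σ² = 0.0f0, 1.0f0              # current running statistics
μ_b, σ²_b = 0.5f0, 2.0f0          # statistics of the current batch
m = 64                            # number of elements each statistic was reduced over
μ  = (1 - mtm) * μ + mtm * μ_b
σ² = (1 - mtm) * σ² + mtm * (m / (m - 1)) * σ²_b   # Bessel-style correction
```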
# Examples + ```julia julia> using Statistics @@ -318,62 +324,61 @@ julia> m = BatchNorm(3); julia> Flux.trainmode!(m); -julia> isapprox(std(m(xs)), 1, atol=0.1) && std(xs) != std(m(xs)) +julia> isapprox(std(m(xs)), 1, atol = 0.1) && std(xs) != std(m(xs)) true ``` """ -mutable struct BatchNorm{F,V,N,W} - λ::F # activation function - β::V # bias - γ::V # scale - μ::W # moving mean - σ²::W # moving var - ϵ::N - momentum::N - affine::Bool - track_stats::Bool - active::Union{Bool, Nothing} - chs::Int # number of channels +mutable struct BatchNorm{F, V, N, W} + λ::F # activation function + β::V # bias + γ::V # scale + μ::W # moving mean + σ²::W # moving var + ϵ::N + momentum::N + affine::Bool + track_stats::Bool + active::Union{Bool, Nothing} + chs::Int # number of channels end -function BatchNorm(chs::Int, λ=identity; - initβ=zeros32, initγ=ones32, - affine=true, track_stats=true, - ϵ=1f-5, momentum=0.1f0) - - β = affine ? initβ(chs) : nothing - γ = affine ? initγ(chs) : nothing - μ = track_stats ? zeros32(chs) : nothing - σ² = track_stats ? ones32(chs) : nothing - - return BatchNorm(λ, β, γ, - μ, σ², ϵ, momentum, - affine, track_stats, - nothing, chs) +function BatchNorm(chs::Int, λ = identity; + initβ = zeros32, initγ = ones32, + affine = true, track_stats = true, + ϵ = 1.0f-5, momentum = 0.1f0) + β = affine ? initβ(chs) : nothing + γ = affine ? initγ(chs) : nothing + μ = track_stats ? zeros32(chs) : nothing + σ² = track_stats ? ones32(chs) : nothing + + return BatchNorm(λ, β, γ, + μ, σ², ϵ, momentum, + affine, track_stats, + nothing, chs) end @functor BatchNorm trainable(bn::BatchNorm) = hasaffine(bn) ? (β = bn.β, γ = bn.γ) : (;) function (BN::BatchNorm)(x) - @assert size(x, ndims(x)-1) == BN.chs - N = ndims(x) - reduce_dims = [1:N-2; N] - affine_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N) - return _norm_layer_forward(BN, x; reduce_dims, affine_shape) + @assert size(x, ndims(x) - 1) == BN.chs + N = ndims(x) + reduce_dims = [1:(N - 2); N] + affine_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N) + return _norm_layer_forward(BN, x; reduce_dims, affine_shape) end -testmode!(m::BatchNorm, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +function testmode!(m::BatchNorm, mode = true) + return (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +end function Base.show(io::IO, l::BatchNorm) - print(io, "BatchNorm($(l.chs)") - (l.λ == identity) || print(io, ", $(l.λ)") - hasaffine(l) || print(io, ", affine=false") - print(io, ")") + print(io, "BatchNorm($(l.chs)") + (l.λ == identity) || print(io, ", $(l.λ)") + hasaffine(l) || print(io, ", affine=false") + return print(io, ")") end - """ InstanceNorm(channels::Integer, λ=identity; initβ=zeros32, initγ=ones32, @@ -399,6 +404,7 @@ that will be used to renormalize the input in test phase. in previous Flux versions (< v0.12). 
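Compared with `BatchNorm`, `InstanceNorm` computes one mean and variance per channel and per sample, reducing only over the spatial dimensions. A minimal sketch of those statistics for a WHCN input (the shape and `ϵ` are illustrative):

```julia
# Sketch: the per-sample, per-channel statistics InstanceNorm reduces over.
using Statistics

x  = randn(Float32, 8, 8, 3, 2)                      # W × H × C × N
μ  = mean(x; dims = (1, 2))                          # size (1, 1, 3, 2)
σ² = var(x; mean = μ, dims = (1, 2), corrected = false)
y  = (x .- μ) ./ sqrt.(σ² .+ 1.0f-5)                 # standardised per channel and sample
```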
# Examples + ```jldoctest julia> using Statistics @@ -408,64 +414,66 @@ julia> m = InstanceNorm(3); julia> y = m(xs); -julia> isapprox(std(y, dims=1:2), ones(1, 1, 3, 2), atol=0.2) && std(y, dims=1:2) != std(xs, dims=1:2) +julia> isapprox(std(y, dims = 1:2), ones(1, 1, 3, 2), atol = 0.2) && + std(y, dims = 1:2) != std(xs, dims = 1:2) true ``` """ -mutable struct InstanceNorm{F,V,N,W} - λ::F # activation function - β::V # bias - γ::V # scale - μ::W # moving mean - σ²::W # moving var - ϵ::N - momentum::N - affine::Bool - track_stats::Bool - active::Union{Bool, Nothing} - chs::Int # number of channels +mutable struct InstanceNorm{F, V, N, W} + λ::F # activation function + β::V # bias + γ::V # scale + μ::W # moving mean + σ²::W # moving var + ϵ::N + momentum::N + affine::Bool + track_stats::Bool + active::Union{Bool, Nothing} + chs::Int # number of channels end -function InstanceNorm(chs::Int, λ=identity; - initβ=zeros32, initγ=ones32, - affine=false, track_stats=false, - ϵ=1f-5, momentum=0.1f0) - - if track_stats - Base.depwarn("`track_stats=true` will be removed from InstanceNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", :InstanceNorm) - end +function InstanceNorm(chs::Int, λ = identity; + initβ = zeros32, initγ = ones32, + affine = false, track_stats = false, + ϵ = 1.0f-5, momentum = 0.1f0) + if track_stats + Base.depwarn("`track_stats=true` will be removed from InstanceNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", + :InstanceNorm) + end - β = affine ? initβ(chs) : nothing - γ = affine ? initγ(chs) : nothing - μ = track_stats ? zeros32(chs) : nothing - σ² = track_stats ? ones32(chs) : nothing + β = affine ? initβ(chs) : nothing + γ = affine ? initγ(chs) : nothing + μ = track_stats ? zeros32(chs) : nothing + σ² = track_stats ? ones32(chs) : nothing - return InstanceNorm(λ, β, γ, - μ, σ², ϵ, momentum, - affine, track_stats, - nothing, chs) + return InstanceNorm(λ, β, γ, + μ, σ², ϵ, momentum, + affine, track_stats, + nothing, chs) end @functor InstanceNorm trainable(in::InstanceNorm) = hasaffine(in) ? (β = in.β, γ = in.γ) : (;) function (l::InstanceNorm)(x) - @assert ndims(x) > 2 - @assert size(x, ndims(x)-1) == l.chs - N = ndims(x) - reduce_dims = 1:N-2 - affine_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N) - return _norm_layer_forward(l, x; reduce_dims, affine_shape) + @assert ndims(x) > 2 + @assert size(x, ndims(x) - 1) == l.chs + N = ndims(x) + reduce_dims = 1:(N - 2) + affine_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N) + return _norm_layer_forward(l, x; reduce_dims, affine_shape) end -testmode!(m::InstanceNorm, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +function testmode!(m::InstanceNorm, mode = true) + return (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +end function Base.show(io::IO, l::InstanceNorm) - print(io, "InstanceNorm($(l.chs)") - l.λ == identity || print(io, ", $(l.λ)") - hasaffine(l) || print(io, ", affine=false") - print(io, ")") + print(io, "InstanceNorm($(l.chs)") + l.λ == identity || print(io, ", $(l.λ)") + hasaffine(l) || print(io, ", affine=false") + return print(io, ")") end """ @@ -494,6 +502,7 @@ If `track_stats=true`, accumulates mean and var statistics in training phase that will be used to renormalize the input in test phase. 
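`GroupNorm` reshapes the channel dimension into `G` groups and then normalises each group per sample, as the forward pass later in this hunk shows. A minimal sketch of that reshape for `chs = 4` and `G = 2` (the sizes are illustrative):

```julia
# Sketch: how GroupNorm splits channels into groups before computing statistics.
x  = randn(Float32, 6, 6, 4, 5)        # W × H × C × N with C = 4 channels
G  = 2
sz = size(x)
xg = reshape(x, sz[1:end-2]..., sz[end-1] ÷ G, G, sz[end])
size(xg)                               # (6, 6, 2, 2, 5); statistics are taken over dims 1:3
```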
# Examples + ```jldoctest julia> using Statistics @@ -503,77 +512,82 @@ julia> m = GroupNorm(4, 2); julia> y = m(xs); -julia> isapprox(std(y[:, :, 1:2, 1]), 1, atol=0.1) && std(xs[:, :, 1:2, 1]) != std(y[:, :, 1:2, 1]) +julia> isapprox(std(y[:, :, 1:2, 1]), 1, atol = 0.1) && + std(xs[:, :, 1:2, 1]) != std(y[:, :, 1:2, 1]) true -julia> isapprox(std(y[:, :, 3:4, 2]), 1, atol=0.1) && std(xs[:, :, 3:4, 2]) != std(y[:, :, 3:4, 2]) +julia> isapprox(std(y[:, :, 3:4, 2]), 1, atol = 0.1) && + std(xs[:, :, 3:4, 2]) != std(y[:, :, 3:4, 2]) true +``` # number of groups ``` """ -mutable struct GroupNorm{F,V,N,W} - G::Int # number of groups - λ::F # activation function - β::V # bias - γ::V # scale - μ::W # moving mean - σ²::W # moving std - ϵ::N - momentum::N - affine::Bool - track_stats::Bool - active::Union{Bool, Nothing} - chs::Int # number of channels +mutable struct GroupNorm{F, V, N, W} + G::Int # number of groups + λ::F # activation function + β::V # bias + γ::V # scale + μ::W # moving mean + σ²::W # moving std + ϵ::N + momentum::N + affine::Bool + track_stats::Bool + active::Union{Bool, Nothing} + chs::Int # number of channels end @functor GroupNorm trainable(gn::GroupNorm) = hasaffine(gn) ? (β = gn.β, γ = gn.γ) : (;) -function GroupNorm(chs::Int, G::Int, λ=identity; - initβ=zeros32, initγ=ones32, - affine=true, track_stats=false, - ϵ=1f-5, momentum=0.1f0) - -if track_stats - Base.depwarn("`track_stats=true` will be removed from GroupNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", :GroupNorm) -end +function GroupNorm(chs::Int, G::Int, λ = identity; + initβ = zeros32, initγ = ones32, + affine = true, track_stats = false, + ϵ = 1.0f-5, momentum = 0.1f0) + if track_stats + Base.depwarn("`track_stats=true` will be removed from GroupNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", + :GroupNorm) + end - chs % G == 0 || error("The number of groups ($(G)) must divide the number of channels ($chs)") + chs % G == 0 || + error("The number of groups ($(G)) must divide the number of channels ($chs)") - β = affine ? initβ(chs) : nothing - γ = affine ? initγ(chs) : nothing - μ = track_stats ? zeros32(G) : nothing - σ² = track_stats ? ones32(G) : nothing + β = affine ? initβ(chs) : nothing + γ = affine ? initγ(chs) : nothing + μ = track_stats ? zeros32(G) : nothing + σ² = track_stats ? ones32(G) : nothing - return GroupNorm(G, λ, - β, γ, - μ, σ², - ϵ, momentum, - affine, track_stats, - nothing, chs) + return GroupNorm(G, λ, + β, γ, + μ, σ², + ϵ, momentum, + affine, track_stats, + nothing, chs) end function (gn::GroupNorm)(x) - @assert ndims(x) > 2 - @assert size(x, ndims(x)-1) == gn.chs - N = ndims(x) - sz = size(x) - x = reshape(x, sz[1:N-2]..., sz[N-1]÷gn.G, gn.G, sz[N]) - N = ndims(x) - reduce_dims = 1:N-2 - affine_shape = ntuple(i -> i ∈ (N-1, N-2) ? size(x, i) : 1, N) - x = _norm_layer_forward(gn, x; reduce_dims, affine_shape) - return reshape(x, sz) + @assert ndims(x) > 2 + @assert size(x, ndims(x) - 1) == gn.chs + N = ndims(x) + sz = size(x) + x = reshape(x, sz[1:(N - 2)]..., sz[N - 1] ÷ gn.G, gn.G, sz[N]) + N = ndims(x) + reduce_dims = 1:(N - 2) + affine_shape = ntuple(i -> i ∈ (N - 1, N - 2) ? size(x, i) : 1, N) + x = _norm_layer_forward(gn, x; reduce_dims, affine_shape) + return reshape(x, sz) end -testmode!(m::GroupNorm, mode = true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +function testmode!(m::GroupNorm, mode = true) + return (m.active = (isnothing(mode) || mode == :auto) ? 
nothing : !mode; m) +end function Base.show(io::IO, l::GroupNorm) - # print(io, "GroupNorm($(join(size(l.β), ", "))", ", ", l.G) - print(io, "GroupNorm($(l.chs), $(l.G)") - l.λ == identity || print(io, ", ", l.λ) - hasaffine(l) || print(io, ", affine=false") - print(io, ")") + # print(io, "GroupNorm($(join(size(l.β), ", "))", ", ", l.G) + print(io, "GroupNorm($(l.chs), $(l.G)") + l.λ == identity || print(io, ", ", l.λ) + hasaffine(l) || print(io, ", affine=false") + return print(io, ")") end """ diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 7cabc9d5b6..ef19714f1f 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -1,52 +1,52 @@ -gate(h, n) = (1:h) .+ h*(n-1) -gate(x::AbstractVector, h, n) = @view x[gate(h,n)] -gate(x::AbstractMatrix, h, n) = view(x, gate(h,n), :) +gate(h, n) = (1:h) .+ h * (n - 1) +gate(x::AbstractVector, h, n) = @view x[gate(h, n)] +gate(x::AbstractMatrix, h, n) = view(x, gate(h, n), :) # AD-friendly helper for dividing monolithic RNN params into equally sized gates -multigate(x::AbstractArray, h, ::Val{N}) where N = ntuple(n -> gate(x,h,n), N) +multigate(x::AbstractArray, h, ::Val{N}) where {N} = ntuple(n -> gate(x, h, n), N) function ChainRulesCore.rrule(::typeof(multigate), x::AbstractArray, h, c) - function multigate_pullback(dy) - dx = map!(zero, similar(x, float(eltype(x)), axes(x)), x) - foreach(multigate(dx, h, c), unthunk(dy)) do dxᵢ, dyᵢ - dyᵢ isa AbstractZero && return - @. dxᵢ += dyᵢ + function multigate_pullback(dy) + dx = map!(zero, similar(x, float(eltype(x)), axes(x)), x) + foreach(multigate(dx, h, c), unthunk(dy)) do dxᵢ, dyᵢ + dyᵢ isa AbstractZero && return + @. dxᵢ += dyᵢ + end + return (NoTangent(), dx, NoTangent(), NoTangent()) end - return (NoTangent(), dx, NoTangent(), NoTangent()) - end - return multigate(x, h, c), multigate_pullback + return multigate(x, h, c), multigate_pullback end # Type stable and AD-friendly helper for iterating over the last dimension of an array -function eachlastdim(A::AbstractArray{T,N}) where {T,N} - inds_before = ntuple(_ -> :, N-1) - return (view(A, inds_before..., i) for i in axes(A, N)) +function eachlastdim(A::AbstractArray{T, N}) where {T, N} + inds_before = ntuple(_ -> :, N - 1) + return (view(A, inds_before..., i) for i in axes(A, N)) end # adapted from https://github.com/JuliaDiff/ChainRules.jl/blob/f13e0a45d10bb13f48d6208e9c9d5b4a52b96732/src/rulesets/Base/indexing.jl#L77 function ∇eachlastdim(dys_raw, x::AbstractArray{T, N}) where {T, N} - dys = unthunk(dys_raw) - i1 = findfirst(dy -> dy isa AbstractArray, dys) - if isnothing(i1) # all slices are Zero! - return fill!(similar(x, T, axes(x)), zero(T)) - end - # The whole point of this gradient is that we can allocate one `dx` array: - dx = similar(x, T, axes(x))::AbstractArray - for i in axes(x, N) - slice = selectdim(dx, N, i) - if dys[i] isa AbstractZero - fill!(slice, zero(eltype(slice))) - else - copyto!(slice, dys[i]) - end - end - return ProjectTo(x)(dx) + dys = unthunk(dys_raw) + i1 = findfirst(dy -> dy isa AbstractArray, dys) + if isnothing(i1) # all slices are Zero! 
+ return fill!(similar(x, T, axes(x)), zero(T)) + end + # The whole point of this gradient is that we can allocate one `dx` array: + dx = similar(x, T, axes(x))::AbstractArray + for i in axes(x, N) + slice = selectdim(dx, N, i) + if dys[i] isa AbstractZero + fill!(slice, zero(eltype(slice))) + else + copyto!(slice, dys[i]) + end + end + return ProjectTo(x)(dx) end -function ChainRulesCore.rrule(::typeof(eachlastdim), x::AbstractArray{T,N}) where {T,N} - lastdims(dy) = (NoTangent(), ∇eachlastdim(unthunk(dy), x)) - collect(eachlastdim(x)), lastdims +function ChainRulesCore.rrule(::typeof(eachlastdim), x::AbstractArray{T, N}) where {T, N} + lastdims(dy) = (NoTangent(), ∇eachlastdim(unthunk(dy), x)) + return collect(eachlastdim(x)), lastdims end reshape_cell_output(h, x) = reshape(h, :, size(x)[2:end]...) @@ -64,6 +64,7 @@ in the background. `cell` should be a model of the form: For example, here's a recurrent network that keeps a running total of its inputs: # Examples + ```jldoctest julia> accum(h, x) = (h + x, x) accum (generic function with 1 method) @@ -71,7 +72,7 @@ accum (generic function with 1 method) julia> rnn = Flux.Recur(accum, 0) Recur(accum) -julia> rnn(2) +julia> rnn(2) 2 julia> rnn(3) @@ -125,14 +126,14 @@ julia> rnn.state 60 ``` """ -mutable struct Recur{T,S} - cell::T - state::S +mutable struct Recur{T, S} + cell::T + state::S end function (m::Recur)(x) - m.state, y = m.cell(m.state, x) - return y + m.state, y = m.cell(m.state, x) + return y end @functor Recur @@ -150,16 +151,17 @@ Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to: rnn.state = hidden(rnn.cell) # Examples + ```jldoctest -julia> r = Flux.RNNCell(relu, ones(1,1), zeros(1,1), ones(1,1), zeros(1,1)); # users should use the RNN wrapper struct instead +julia> r = Flux.RNNCell(relu, ones(1, 1), zeros(1, 1), ones(1, 1), zeros(1, 1)); # users should use the RNN wrapper struct instead -julia> y = Flux.Recur(r, ones(1,1)); +julia> y = Flux.Recur(r, ones(1, 1)); julia> y.state 1×1 Matrix{Float64}: 1.0 -julia> y(ones(1,1)) # relu(1*1 + 1) +julia> y(ones(1, 1)) # relu(1*1 + 1) 1×1 Matrix{Float64}: 2.0 @@ -181,38 +183,44 @@ reset!(m) = foreach(reset!, functor(m)[1]) flip(f, xs) = reverse([f(x) for x in reverse(xs)]) -function (m::Recur)(x::AbstractArray{T, 3}) where T - h = [m(x_t) for x_t in eachlastdim(x)] - sze = size(h[1]) - reshape(reduce(hcat, h), sze[1], sze[2], length(h)) +function (m::Recur)(x::AbstractArray{T, 3}) where {T} + h = [m(x_t) for x_t in eachlastdim(x)] + sze = size(h[1]) + return reshape(reduce(hcat, h), sze[1], sze[2], length(h)) end # Vanilla RNN -struct RNNCell{F,I,H,V,S} - σ::F - Wi::I - Wh::H - b::V - state0::S +struct RNNCell{F, I, H, V, S} + σ::F + Wi::I + Wh::H + b::V + state0::S end -RNNCell((in, out)::Pair, σ=tanh; init=Flux.glorot_uniform, initb=zeros32, init_state=zeros32) = - RNNCell(σ, init(out, in), init(out, out), initb(out), init_state(out,1)) +function RNNCell((in, out)::Pair, σ = tanh; init = Flux.glorot_uniform, initb = zeros32, + init_state = zeros32) + return RNNCell(σ, init(out, in), init(out, out), initb(out), init_state(out, 1)) +end -function (m::RNNCell{F,I,H,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {F,I,H,V,T} - Wi, Wh, b = m.Wi, m.Wh, m.b - σ = NNlib.fast_act(m.σ, x) - h = σ.(Wi*x .+ Wh*h .+ b) - return h, reshape_cell_output(h, x) +function (m::RNNCell{F, I, H, V, <:AbstractMatrix{T}})(h, + x::Union{AbstractVecOrMat{T}, + OneHotArray}) where {F, I, + H, V, T + } + Wi, Wh, b = m.Wi, m.Wh, m.b + σ = NNlib.fast_act(m.σ, 
x) + h = σ.(Wi * x .+ Wh * h .+ b) + return h, reshape_cell_output(h, x) end @functor RNNCell function Base.show(io::IO, l::RNNCell) - print(io, "RNNCell(", size(l.Wi, 2), " => ", size(l.Wi, 1)) - l.σ == identity || print(io, ", ", l.σ) - print(io, ")") + print(io, "RNNCell(", size(l.Wi, 2), " => ", size(l.Wi, 1)) + l.σ == identity || print(io, ", ", l.σ) + return print(io, ")") end """ @@ -226,6 +234,7 @@ The arguments `in` and `out` describe the size of the feature vectors passed as This constructor is syntactic sugar for `Recur(RNNCell(a...))`, and so RNNs are stateful. Note that the state shape can change depending on the inputs, and so it is good to `reset!` the model between inference calls if the batch size changes. See the examples below. # Examples + ```jldoctest julia> r = RNN(3 => 5) Recur( @@ -243,81 +252,93 @@ julia> r(rand(Float32, 3, 10)) |> size # batch size of 10 ``` !!! warning "Batch size changes" - + Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. See the following example: - + ```julia julia> r = RNN(3 => 5) Recur( RNNCell(3 => 5, tanh), # 50 parameters ) # Total: 4 trainable arrays, 50 parameters, # plus 1 non-trainable, 5 parameters, summarysize 432 bytes. - + julia> r.state |> size (5, 1) - + julia> r(rand(Float32, 3)) |> size (5,) - + julia> r.state |> size (5, 1) - + julia> r(rand(Float32, 3, 10)) |> size # batch size of 10 (5, 10) - + julia> r.state |> size # state shape has changed (5, 10) - + julia> r(rand(Float32, 3)) |> size # erroneously outputs a length 5*10 = 50 vector. (50,) ``` # Note: - `RNNCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type, but if `Wh` is `dxd`, then `Wi` should be of shape `dxN`. - ```julia - julia> using LinearAlgebra +`RNNCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type, but if `Wh` is `dxd`, then `Wi` should be of shape `dxN`. + +```julia +julia> using LinearAlgebra - julia> r = Flux.Recur(Flux.RNNCell(tanh, rand(5, 4), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1))) +julia> r = Flux.Recur(Flux.RNNCell(tanh, rand(5, 4), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1))) - julia> r(rand(4, 10)) |> size # batch size of 10 - (5, 10) - ``` +julia> r(rand(4, 10)) |> size # batch size of 10 +(5, 10) +``` """ RNN(a...; ka...) = Recur(RNNCell(a...; ka...)) Recur(m::RNNCell) = Recur(m, m.state0) # LSTM -struct LSTMCell{I,H,V,S} - Wi::I - Wh::H - b::V - state0::S +struct LSTMCell{I, H, V, S} + Wi::I + Wh::H + b::V + state0::S end function LSTMCell((in, out)::Pair; init = glorot_uniform, initb = zeros32, init_state = zeros32) - cell = LSTMCell(init(out * 4, in), init(out * 4, out), initb(out * 4), (init_state(out,1), init_state(out,1))) - cell.b[gate(out, 2)] .= 1 - return cell + cell = LSTMCell(init(out * 4, in), init(out * 4, out), initb(out * 4), + (init_state(out, 1), init_state(out, 1))) + cell.b[gate(out, 2)] .= 1 + return cell end -function (m::LSTMCell{I,H,V,<:NTuple{2,AbstractMatrix{T}}})((h, c), x::Union{AbstractVecOrMat{T},OneHotArray}) where {I,H,V,T} - b, o = m.b, size(h, 1) - g = muladd(m.Wi, x, muladd(m.Wh, h, b)) - input, forget, cell, output = multigate(g, o, Val(4)) - c′ = @. 
sigmoid_fast(forget) * c + sigmoid_fast(input) * tanh_fast(cell) - h′ = @. sigmoid_fast(output) * tanh_fast(c′) - return (h′, c′), reshape_cell_output(h′, x) +function (m::LSTMCell{I, H, V, <:NTuple{2, AbstractMatrix{T}}})((h, c), + x::Union{ + AbstractVecOrMat{T + }, + OneHotArray}) where { + I, + H, + V, + T + } + b, o = m.b, size(h, 1) + g = muladd(m.Wi, x, muladd(m.Wh, h, b)) + input, forget, cell, output = multigate(g, o, Val(4)) + c′ = @. sigmoid_fast(forget) * c + sigmoid_fast(input) * tanh_fast(cell) + h′ = @. sigmoid_fast(output) * tanh_fast(c′) + return (h′, c′), reshape_cell_output(h′, x) end @functor LSTMCell -Base.show(io::IO, l::LSTMCell) = - print(io, "LSTMCell(", size(l.Wi, 2), " => ", size(l.Wi, 1)÷4, ")") +function Base.show(io::IO, l::LSTMCell) + return print(io, "LSTMCell(", size(l.Wi, 2), " => ", size(l.Wi, 1) ÷ 4, ")") +end """ LSTM(in => out) @@ -333,6 +354,7 @@ See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) for a good overview of the internals. # Examples + ```jldoctest julia> l = LSTM(3 => 5) Recur( @@ -350,10 +372,12 @@ julia> l(rand(Float32, 3, 10)) |> size # batch size of 10 ``` !!! warning "Batch size changes" + Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. See the example in [`RNN`](@ref). # Note: - `LSTMCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref). + +`LSTMCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref). """ LSTM(a...; ka...) = Recur(LSTMCell(a...; ka...)) Recur(m::LSTMCell) = Recur(m, m.state0) @@ -361,34 +385,42 @@ Recur(m::LSTMCell) = Recur(m, m.state0) # GRU function _gru_output(gxs, ghs, bs) - r = @. sigmoid_fast(gxs[1] + ghs[1] + bs[1]) - z = @. sigmoid_fast(gxs[2] + ghs[2] + bs[2]) - return r, z + r = @. sigmoid_fast(gxs[1] + ghs[1] + bs[1]) + z = @. sigmoid_fast(gxs[2] + ghs[2] + bs[2]) + return r, z end -struct GRUCell{I,H,V,S} - Wi::I - Wh::H - b::V - state0::S +struct GRUCell{I, H, V, S} + Wi::I + Wh::H + b::V + state0::S end -GRUCell((in, out)::Pair; init = glorot_uniform, initb = zeros32, init_state = zeros32) = - GRUCell(init(out * 3, in), init(out * 3, out), initb(out * 3), init_state(out,1)) +function GRUCell((in, out)::Pair; init = glorot_uniform, initb = zeros32, + init_state = zeros32) + return GRUCell(init(out * 3, in), init(out * 3, out), initb(out * 3), + init_state(out, 1)) +end -function (m::GRUCell{I,H,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {I,H,V,T} - Wi, Wh, b, o = m.Wi, m.Wh, m.b, size(h, 1) - gxs, ghs, bs = multigate(Wi*x, o, Val(3)), multigate(Wh*h, o, Val(3)), multigate(b, o, Val(3)) - r, z = _gru_output(gxs, ghs, bs) - h̃ = @. tanh_fast(gxs[3] + r * ghs[3] + bs[3]) - h′ = @. (1 - z) * h̃ + z * h - return h′, reshape_cell_output(h′, x) +function (m::GRUCell{I, H, V, <:AbstractMatrix{T}})(h, + x::Union{AbstractVecOrMat{T}, + OneHotArray}) where {I, H, V, T + } + Wi, Wh, b, o = m.Wi, m.Wh, m.b, size(h, 1) + gxs, ghs, bs = multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(3)), + multigate(b, o, Val(3)) + r, z = _gru_output(gxs, ghs, bs) + h̃ = @. 
tanh_fast(gxs[3] + r * ghs[3] + bs[3]) + h′ = @. (1 - z) * h̃ + z * h + return h′, reshape_cell_output(h′, x) end @functor GRUCell -Base.show(io::IO, l::GRUCell) = - print(io, "GRUCell(", size(l.Wi, 2), " => ", size(l.Wi, 1)÷3, ")") +function Base.show(io::IO, l::GRUCell) + return print(io, "GRUCell(", size(l.Wi, 2), " => ", size(l.Wi, 1) ÷ 3, ")") +end """ GRU(in => out) @@ -405,6 +437,7 @@ See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) for a good overview of the internals. # Examples + ```jldoctest julia> g = GRU(3 => 5) Recur( @@ -422,41 +455,53 @@ julia> g(rand(Float32, 3, 10)) |> size # batch size of 10 ``` !!! warning "Batch size changes" + Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. See the example in [`RNN`](@ref). # Note: - `GRUCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref). + +`GRUCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref). """ GRU(a...; ka...) = Recur(GRUCell(a...; ka...)) Recur(m::GRUCell) = Recur(m, m.state0) # GRU v3 -struct GRUv3Cell{I,H,V,HH,S} - Wi::I - Wh::H - b::V - Wh_h̃::HH - state0::S +struct GRUv3Cell{I, H, V, HH, S} + Wi::I + Wh::H + b::V + Wh_h̃::HH + state0::S +end + +function GRUv3Cell((in, out)::Pair; init = glorot_uniform, initb = zeros32, + init_state = zeros32) + return GRUv3Cell(init(out * 3, in), init(out * 2, out), initb(out * 3), + init(out, out), init_state(out, 1)) end -GRUv3Cell((in, out)::Pair; init = glorot_uniform, initb = zeros32, init_state = zeros32) = - GRUv3Cell(init(out * 3, in), init(out * 2, out), initb(out * 3), - init(out, out), init_state(out,1)) - -function (m::GRUv3Cell{I,H,V,HH,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {I,H,V,HH,T} - Wi, Wh, b, Wh_h̃, o = m.Wi, m.Wh, m.b, m.Wh_h̃, size(h, 1) - gxs, ghs, bs = multigate(Wi*x, o, Val(3)), multigate(Wh*h, o, Val(2)), multigate(b, o, Val(3)) - r, z = _gru_output(gxs, ghs, bs) - h̃ = tanh_fast.(gxs[3] .+ (Wh_h̃ * (r .* h)) .+ bs[3]) - h′ = @. (1 - z) * h̃ + z * h - return h′, reshape_cell_output(h′, x) +function (m::GRUv3Cell{I, H, V, HH, <:AbstractMatrix{T}})(h, + x::Union{AbstractVecOrMat{T}, + OneHotArray}) where {I, + H, + V, + HH, + T} + Wi, Wh, b, Wh_h̃, o = m.Wi, m.Wh, m.b, m.Wh_h̃, size(h, 1) + gxs, ghs, bs = multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(2)), + multigate(b, o, Val(3)) + r, z = _gru_output(gxs, ghs, bs) + h̃ = tanh_fast.(gxs[3] .+ (Wh_h̃ * (r .* h)) .+ bs[3]) + h′ = @. (1 - z) * h̃ + z * h + return h′, reshape_cell_output(h′, x) end @functor GRUv3Cell -Base.show(io::IO, l::GRUv3Cell) = - print(io, "GRUv3Cell(", size(l.Wi, 2), " => ", size(l.Wi, 1)÷3, ")") +function Base.show(io::IO, l::GRUv3Cell) + return print(io, "GRUv3Cell(", size(l.Wi, 2), " => ", size(l.Wi, 1) ÷ 3, ")") +end """ GRUv3(in => out) @@ -473,6 +518,7 @@ See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) for a good overview of the internals. # Examples + ```jldoctest julia> g = GRUv3(3 => 5) Recur( @@ -490,10 +536,12 @@ julia> g(rand(Float32, 3, 10)) |> size # batch size of 10 ``` !!! 
warning "Batch size changes" + Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. See the example in [`RNN`](@ref). # Note: - `GRUv3Cell`s can be constructed directly by specifying the non-linear function, the `Wi`, `Wh`, and `Wh_h` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi`, `Wh`, and `Wh_h` matrices do not need to be the same type. See the example in [`RNN`](@ref). + +`GRUv3Cell`s can be constructed directly by specifying the non-linear function, the `Wi`, `Wh`, and `Wh_h` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi`, `Wh`, and `Wh_h` matrices do not need to be the same type. See the example in [`RNN`](@ref). """ GRUv3(a...; ka...) = Recur(GRUv3Cell(a...; ka...)) Recur(m::GRUv3Cell) = Recur(m, m.state0) diff --git a/src/layers/show.jl b/src/layers/show.jl index 421131f365..b2f272b9ef 100644 --- a/src/layers/show.jl +++ b/src/layers/show.jl @@ -1,47 +1,48 @@ for T in [ - :Chain, :Parallel, :SkipConnection, :Recur, :Maxout, :PairwiseFusion # container types - ] - @eval function Base.show(io::IO, m::MIME"text/plain", x::$T) - if get(io, :typeinfo, nothing) === nothing # e.g. top level in REPL - _big_show(io, x) - elseif !get(io, :compact, false) # e.g. printed inside a Vector, but not a Matrix - _layer_show(io, x) - else - show(io, x) + :Chain, :Parallel, :SkipConnection, :Recur, :Maxout, :PairwiseFusion, # container types +] + @eval function Base.show(io::IO, m::MIME"text/plain", x::$T) + if get(io, :typeinfo, nothing) === nothing # e.g. top level in REPL + _big_show(io, x) + elseif !get(io, :compact, false) # e.g. printed inside a Vector, but not a Matrix + _layer_show(io, x) + else + show(io, x) + end end - end end -function _big_show(io::IO, obj, indent::Int=0, name=nothing) - pre, post = obj isa Chain{<:AbstractVector} ? ("([", "])") : ("(", ")") - children = _show_children(obj) - if all(_show_leaflike, children) - _layer_show(io, obj, indent, name) - else - println(io, " "^indent, isnothing(name) ? "" : "$name = ", nameof(typeof(obj)), pre) - if obj isa Chain{<:NamedTuple} && children == getfield(obj, :layers) - # then we insert names -- can this be done more generically? - for k in Base.keys(obj) - _big_show(io, obj[k], indent+2, k) - end - elseif obj isa Parallel{<:Any, <:NamedTuple} || obj isa PairwiseFusion{<:Any, <:NamedTuple} - _big_show(io, obj.connection, indent+2) - for k in Base.keys(obj) - _big_show(io, obj[k], indent+2, k) - end - else - for c in children - _big_show(io, c, indent+2) - end - end - if indent == 0 # i.e. this is the outermost container - print(io, rpad(post, 2)) - _big_finale(io, obj) +function _big_show(io::IO, obj, indent::Int = 0, name = nothing) + pre, post = obj isa Chain{<:AbstractVector} ? ("([", "])") : ("(", ")") + children = _show_children(obj) + if all(_show_leaflike, children) + _layer_show(io, obj, indent, name) else - println(io, " "^indent, post, ",") + println(io, " "^indent, isnothing(name) ? "" : "$name = ", nameof(typeof(obj)), pre) + if obj isa Chain{<:NamedTuple} && children == getfield(obj, :layers) + # then we insert names -- can this be done more generically? 
+ for k in Base.keys(obj) + _big_show(io, obj[k], indent + 2, k) + end + elseif obj isa Parallel{<:Any, <:NamedTuple} || + obj isa PairwiseFusion{<:Any, <:NamedTuple} + _big_show(io, obj.connection, indent + 2) + for k in Base.keys(obj) + _big_show(io, obj[k], indent + 2, k) + end + else + for c in children + _big_show(io, c, indent + 2) + end + end + if indent == 0 # i.e. this is the outermost container + print(io, rpad(post, 2)) + _big_finale(io, obj) + else + println(io, " "^indent, post, ",") + end end - end end _show_leaflike(x) = isleaf(x) # mostly follow Functors, except for: @@ -59,67 +60,75 @@ _show_children(f::PairwiseFusion) = (f.connection, f.layers...) for T in [ :Conv, :ConvTranspose, :CrossCor, :Dense, :Scale, :Bilinear, :Embedding, :BatchNorm, :LayerNorm, :InstanceNorm, :GroupNorm, - ] - @eval function Base.show(io::IO, m::MIME"text/plain", x::$T) - if !get(io, :compact, false) - _layer_show(io, x) - else - show(io, x) +] + @eval function Base.show(io::IO, m::MIME"text/plain", x::$T) + if !get(io, :compact, false) + _layer_show(io, x) + else + show(io, x) + end end - end end -function _layer_show(io::IO, layer, indent::Int=0, name=nothing) - _str = isnothing(name) ? "" : "$name = " - str = _str * sprint(show, layer, context=io) - print(io, " "^indent, str, indent==0 ? "" : ",") - if !isempty(params(layer)) - print(io, " "^max(2, (indent==0 ? 20 : 39) - indent - length(str))) - printstyled(io, "# ", underscorise(sum(length, params(layer))), " parameters"; color=:light_black) - nonparam = _childarray_sum(length, layer) - sum(length, params(layer)) - if nonparam > 0 - printstyled(io, ", plus ", underscorise(nonparam), indent==0 ? " non-trainable" : ""; color=:light_black) +function _layer_show(io::IO, layer, indent::Int = 0, name = nothing) + _str = isnothing(name) ? "" : "$name = " + str = _str * sprint(show, layer, context = io) + print(io, " "^indent, str, indent == 0 ? "" : ",") + if !isempty(params(layer)) + print(io, " "^max(2, (indent == 0 ? 20 : 39) - indent - length(str))) + printstyled(io, "# ", underscorise(sum(length, params(layer))), " parameters"; + color = :light_black) + nonparam = _childarray_sum(length, layer) - sum(length, params(layer)) + if nonparam > 0 + printstyled(io, ", plus ", underscorise(nonparam), + indent == 0 ? 
" non-trainable" : ""; color = :light_black) + end + _nan_show(io, params(layer)) end - _nan_show(io, params(layer)) - end - indent==0 || println(io) + return indent == 0 || println(io) end function _big_finale(io::IO, m) - ps = params(m) - if length(ps) > 2 - pars = underscorise(sum(length, ps)) - bytes = Base.format_bytes(Base.summarysize(m)) - noncnt = _childarray_sum(_->1, m) - length(ps) - if noncnt > 0 - nonparam = underscorise(_childarray_sum(length, m) - sum(length, ps)) - printstyled(io, " "^08, "# Total: ", length(ps), " trainable arrays, "; color=:light_black) - println(io, pars, " parameters,") - printstyled(io, " "^10, "# plus ", noncnt, " non-trainable, ", nonparam, " parameters, summarysize "; color=:light_black) - print(io, bytes, ".") - else - printstyled(io, " "^18, "# Total: ", length(ps), " arrays, "; color=:light_black) - print(io, pars, " parameters, ", bytes, ".") + ps = params(m) + if length(ps) > 2 + pars = underscorise(sum(length, ps)) + bytes = Base.format_bytes(Base.summarysize(m)) + noncnt = _childarray_sum(_ -> 1, m) - length(ps) + if noncnt > 0 + nonparam = underscorise(_childarray_sum(length, m) - sum(length, ps)) + printstyled(io, " "^08, "# Total: ", length(ps), " trainable arrays, "; + color = :light_black) + println(io, pars, " parameters,") + printstyled(io, " "^10, "# plus ", noncnt, " non-trainable, ", nonparam, + " parameters, summarysize "; color = :light_black) + print(io, bytes, ".") + else + printstyled(io, " "^18, "# Total: ", length(ps), " arrays, "; + color = :light_black) + print(io, pars, " parameters, ", bytes, ".") + end end - end end _childarray_sum(f, x::AbstractArray{<:Number}) = f(x) -_childarray_sum(f, x) = isleaf(x) ? 0 : sum(y -> _childarray_sum(f, y), Functors.children(x)) +function _childarray_sum(f, x) + return isleaf(x) ? 0 : sum(y -> _childarray_sum(f, y), Functors.children(x)) +end # utility functions -underscorise(n::Integer) = - join(reverse(join.(reverse.(Iterators.partition(digits(n), 3)))), '_') +function underscorise(n::Integer) + return join(reverse(join.(reverse.(Iterators.partition(digits(n), 3)))), '_') +end function _nan_show(io::IO, x) - if !isempty(x) && _all(iszero, x) - printstyled(io, " (all zero)", color=:cyan) - elseif _any(isnan, x) - printstyled(io, " (some NaN)", color=:red) - elseif _any(isinf, x) - printstyled(io, " (some Inf)", color=:red) - end + if !isempty(x) && _all(iszero, x) + printstyled(io, " (all zero)", color = :cyan) + elseif _any(isnan, x) + printstyled(io, " (some NaN)", color = :red) + elseif _any(isinf, x) + printstyled(io, " (some Inf)", color = :red) + end end _any(f, xs::AbstractArray{<:Number}) = any(f, xs) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 06c8b6a4a9..b772fbbcc2 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -7,13 +7,14 @@ preserving the size of the last dimension. See also [`unsqueeze`](@ref). 
# Examples + ```jldoctest -julia> rand(3,4,5) |> Flux.flatten |> size +julia> rand(3, 4, 5) |> Flux.flatten |> size (12, 5) -julia> xs = rand(Float32, 10,10,3,7); +julia> xs = rand(Float32, 10, 10, 3, 7); -julia> m = Chain(Conv((3,3), 3 => 4, pad=1), Flux.flatten, Dense(400 => 33)); +julia> m = Chain(Conv((3, 3), 3 => 4, pad = 1), Flux.flatten, Dense(400 => 33)); julia> xs |> m[1] |> size (10, 10, 4, 7) @@ -23,17 +24,18 @@ julia> xs |> m |> size ``` """ function flatten(x::AbstractArray) - return reshape(x, :, size(x)[end]) + return reshape(x, :, size(x)[end]) end """ normalise(x; dims=ndims(x), ϵ=1e-5) Normalise `x` to mean 0 and standard deviation 1 across the dimension(s) given by `dims`. -Per default, `dims` is the last dimension. +Per default, `dims` is the last dimension. `ϵ` is a small additive factor added to the denominator for numerical stability. # Examples + ```jldoctest julia> using Statistics @@ -41,19 +43,19 @@ julia> x = [9, 10, 20, 60]; julia> y = Flux.normalise(x); -julia> isapprox(std(y), 1, atol=0.2) && std(y) != std(x) +julia> isapprox(std(y), 1, atol = 0.2) && std(y) != std(x) true julia> x = rand(1:100, 10, 2); -julia> y = Flux.normalise(x, dims=1); +julia> y = Flux.normalise(x, dims = 1); -julia> isapprox(std(y, dims=1), ones(1, 2), atol=0.2) && std(y, dims=1) != std(x, dims=1) +julia> isapprox(std(y, dims = 1), ones(1, 2), atol = 0.2) && std(y, dims = 1) != std(x, dims = 1) true ``` """ -@inline function normalise(x::AbstractArray; dims=ndims(x), ϵ=ofeltype(x, 1e-5)) - μ = mean(x, dims=dims) - σ = std(x, dims=dims, mean=μ, corrected=false) - return @. (x - μ) / (σ + ϵ) +@inline function normalise(x::AbstractArray; dims = ndims(x), ϵ = ofeltype(x, 1e-5)) + μ = mean(x, dims = dims) + σ = std(x, dims = dims, mean = μ, corrected = false) + return @. (x - μ) / (σ + ϵ) end diff --git a/src/layers/upsample.jl b/src/layers/upsample.jl index c71a9acc8d..2d50074833 100644 --- a/src/layers/upsample.jl +++ b/src/layers/upsample.jl @@ -4,13 +4,14 @@ An upsampling layer. One of two keywords must be given: -If `scale` is a number, this applies to all but the last two dimensions (channel and batch) of the input. -It may also be a tuple, to control dimensions individually. Alternatively, keyword +If `scale` is a number, this applies to all but the last two dimensions (channel and batch) of the input. +It may also be a tuple, to control dimensions individually. Alternatively, keyword `size` accepts a tuple, to directly specify the leading dimensions of the output. 
-Currently supported upsampling `mode`s +Currently supported upsampling `mode`s and corresponding NNlib's methods are: - - `:nearest` -> [`NNlib.upsample_nearest`](@ref) + + - `:nearest` -> [`NNlib.upsample_nearest`](@ref) - `:bilinear` -> [`NNlib.upsample_bilinear`](@ref) - `:trilinear` -> [`NNlib.upsample_trilinear`](@ref) @@ -31,45 +32,45 @@ julia> m(ones(2, 2, 1, 1)) |> size ``` """ struct Upsample{mode, S, T} - scale::S - size::T + scale::S + size::T end function Upsample(mode::Symbol = :nearest; scale = nothing, size = nothing) - mode in [:nearest, :bilinear, :trilinear] || - throw(ArgumentError("mode=:$mode is not supported.")) - if !(isnothing(scale) ⊻ isnothing(size)) - throw(ArgumentError("Either scale or size should be specified (but not both).")) - end - return Upsample{mode,typeof(scale),typeof(size)}(scale, size) + mode in [:nearest, :bilinear, :trilinear] || + throw(ArgumentError("mode=:$mode is not supported.")) + if !(isnothing(scale) ⊻ isnothing(size)) + throw(ArgumentError("Either scale or size should be specified (but not both).")) + end + return Upsample{mode, typeof(scale), typeof(size)}(scale, size) end Upsample(scale, mode::Symbol = :nearest) = Upsample(mode; scale) -(m::Upsample{:nearest})(x::AbstractArray) = - NNlib.upsample_nearest(x, m.scale) -function (m::Upsample{:nearest, Int})(x::AbstractArray{T, N}) where {T, N} - NNlib.upsample_nearest(x, ntuple(i -> m.scale, N-2)) +(m::Upsample{:nearest})(x::AbstractArray) = NNlib.upsample_nearest(x, m.scale) +function (m::Upsample{:nearest, Int})(x::AbstractArray{T, N}) where {T, N} + return NNlib.upsample_nearest(x, ntuple(i -> m.scale, N - 2)) +end +function (m::Upsample{:nearest, Nothing})(x::AbstractArray) + return NNlib.upsample_nearest(x; size = m.size) end -(m::Upsample{:nearest, Nothing})(x::AbstractArray) = - NNlib.upsample_nearest(x; size=m.size) -(m::Upsample{:bilinear})(x::AbstractArray) = - NNlib.upsample_bilinear(x, m.scale) -(m::Upsample{:bilinear, Nothing})(x::AbstractArray) = - NNlib.upsample_bilinear(x; size=m.size) +(m::Upsample{:bilinear})(x::AbstractArray) = NNlib.upsample_bilinear(x, m.scale) +function (m::Upsample{:bilinear, Nothing})(x::AbstractArray) + return NNlib.upsample_bilinear(x; size = m.size) +end -(m::Upsample{:trilinear})(x::AbstractArray) = - NNlib.upsample_trilinear(x, m.scale) -(m::Upsample{:trilinear, Nothing})(x::AbstractArray) = - NNlib.upsample_trilinear(x; size=m.size) +(m::Upsample{:trilinear})(x::AbstractArray) = NNlib.upsample_trilinear(x, m.scale) +function (m::Upsample{:trilinear, Nothing})(x::AbstractArray) + return NNlib.upsample_trilinear(x; size = m.size) +end function Base.show(io::IO, u::Upsample{mode}) where {mode} - print(io, "Upsample(") - print(io, ":", mode) - u.scale !== nothing && print(io, ", scale = $(u.scale)") - u.size !== nothing && print(io, ", size = $(u.size)") - print(io, ")") + print(io, "Upsample(") + print(io, ":", mode) + u.scale !== nothing && print(io, ", scale = $(u.scale)") + u.size !== nothing && print(io, ", size = $(u.size)") + return print(io, ")") end """ @@ -77,14 +78,15 @@ end Pixel shuffling layer with upscale factor `r`. Usually used for generating higher resolution images while upscaling them. - + See [`NNlib.pixel_shuffle`](@ref). 
# Examples + ```jldoctest julia> p = PixelShuffle(2); -julia> xs = [2row + col + channel/10 for row in 1:2, col in 1:2, channel in 1:4, n in 1:1] +julia> xs = [2row + col + channel / 10 for row in 1:2, col in 1:2, channel in 1:4, n in 1:1] 2×2×4×1 Array{Float64, 4}: [:, :, 1, 1] = 3.1 4.1 @@ -110,7 +112,7 @@ julia> p(xs) 5.1 5.3 6.1 6.3 5.2 5.4 6.2 6.4 -julia> xs = [3row + col + channel/10 for row in 1:2, col in 1:3, channel in 1:4, n in 1:1] +julia> xs = [3row + col + channel / 10 for row in 1:2, col in 1:3, channel in 1:4, n in 1:1] 2×3×4×1 Array{Float64, 4}: [:, :, 1, 1] = 4.1 5.1 6.1 @@ -137,8 +139,8 @@ julia> p(xs) 7.2 7.4 8.2 8.4 9.2 9.4 ``` """ -struct PixelShuffle - r::Int +struct PixelShuffle + r::Int end (m::PixelShuffle)(x) = NNlib.pixel_shuffle(x, m.r) diff --git a/src/loading.jl b/src/loading.jl index 9098828a8b..d360150447 100644 --- a/src/loading.jl +++ b/src/loading.jl @@ -1,35 +1,46 @@ loadleaf!(dst, src, err) = dst -loadleaf!(dst::AbstractArray, src, err) = - error("Tried to copy $src into an array destination; this is not allowed.") -loadleaf!(dst, src::AbstractArray, err) = - error("Tried to copy an array to $dst; this is not allowed.") +function loadleaf!(dst::AbstractArray, src, err) + return error("Tried to copy $src into an array destination; this is not allowed.") +end +function loadleaf!(dst, src::AbstractArray, err) + return error("Tried to copy an array to $dst; this is not allowed.") +end function loadleaf!(dst::AbstractArray, src::Bool, err) - if iszero(src) - dst .= src - else - error("Cannot copy boolean parameter == true to non-zero parameter.") - end - return dst + if iszero(src) + dst .= src + else + error("Cannot copy boolean parameter == true to non-zero parameter.") + end + return dst +end +function loadleaf!(dst::Bool, src::AbstractArray, err) + return iszero(dst) ? dst : + error("Cannot copy non-zero parameter to boolean parameter == true.") end -loadleaf!(dst::Bool, src::AbstractArray, err) = iszero(dst) ? 
dst : - error("Cannot copy non-zero parameter to boolean parameter == true.") function loadleaf!(dst::AbstractArray, src::AbstractArray, err) - (size(dst) == size(src)) || throw(err) - copyto!(dst, src) + (size(dst) == size(src)) || throw(err) + return copyto!(dst, src) end -_tie_check(dst::Bool, src::AbstractArray) = iszero(dst) || - error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.") -_tie_check(dst::AbstractArray, src::Bool) = (iszero(dst) && iszero(src)) || - error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.") -_tie_check(dst::AbstractArray, src::AbstractArray) = (dst == src) || - error("Encountered tied destination parameters with untied and mismatched sources.") +function _tie_check(dst::Bool, src::AbstractArray) + return iszero(dst) || + error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.") +end +function _tie_check(dst::AbstractArray, src::Bool) + return (iszero(dst) && iszero(src)) || + error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.") +end +function _tie_check(dst::AbstractArray, src::AbstractArray) + return (dst == src) || + error("Encountered tied destination parameters with untied and mismatched sources.") +end _tie_check(dst, src) = true _bool_tie_check(dst, src) = true -_filter_children(f, children::NamedTuple) = - NamedTuple(filter(kv -> f(kv[2]), pairs(children))) +function _filter_children(f, children::NamedTuple) + return NamedTuple(filter(kv -> f(kv[2]), pairs(children))) +end _filter_children(f, children) = filter(f, children) """ @@ -44,8 +55,9 @@ Zero bias vectors and `bias=false` are considered equivalent (see extended help for more details). # Examples + ```julia -julia> dst = Chain(Dense(Flux.ones32(2, 5), Flux.ones32(2), tanh), Dense(2 => 1; bias = [1f0])) +julia> dst = Chain(Dense(Flux.ones32(2, 5), Flux.ones32(2), tanh), Dense(2 => 1; bias = [1.0f0])) Chain( Dense(5 => 2, tanh), # 12 parameters Dense(2 => 1), # 3 parameters @@ -54,7 +66,7 @@ Chain( julia> dst[1].weight ≈ ones(2, 5) # by construction true -julia> src = Chain(Dense(5 => 2, relu), Dense(2 => 1, bias=false)); +julia> src = Chain(Dense(5 => 2, relu), Dense(2 => 1, bias = false)); julia> Flux.loadmodel!(dst, src); @@ -68,12 +80,13 @@ true # Extended help Throws an error when: -- `dst` and `src` do not share the same fields (at any level) -- the sizes of leaf nodes are mismatched between `dst` and `src` -- copying non-array values to/from an array parameter - (except inactive parameters described below) -- `dst` is a "tied" parameter (i.e. refers to another parameter) and - loaded into multiple times with mismatched source values + + - `dst` and `src` do not share the same fields (at any level) + - the sizes of leaf nodes are mismatched between `dst` and `src` + - copying non-array values to/from an array parameter + (except inactive parameters described below) + - `dst` is a "tied" parameter (i.e. refers to another parameter) and + loaded into multiple times with mismatched source values Inactive parameters can be encoded by using the boolean value `false` instead of an array. If `dst == false` and `src` is an all-zero array, no error will be raised (and no values copied); @@ -82,22 +95,22 @@ Likewise, copying a `src` value of `false` to any `dst` array is valid, but copying a `src` value of `true` will error. 
""" function loadmodel!(dst, src; filter = _ -> true, cache = Base.IdSet()) - ldsts = _filter_children(filter, functor(dst)[1]) - lsrcs = _filter_children(filter, functor(src)[1]) - (keys(ldsts) == keys(lsrcs)) || - throw(ArgumentError("Tried to load $src into $dst but the structures do not match.")) - - err = DimensionMismatch("Tried to load $src into $dst but the parameter sizes do not match.") - foreach(ldsts, lsrcs) do ldst, lsrc - if ldst in cache # we already loaded this parameter before - _tie_check(ldst, lsrc) && return ldst - elseif Functors.isleaf(ldst) # our first time loading this leaf - push!(cache, ldst) - loadleaf!(ldst, lsrc, err) - else # this isn't a leaf - loadmodel!(ldst, lsrc; filter = filter, cache = cache) + ldsts = _filter_children(filter, functor(dst)[1]) + lsrcs = _filter_children(filter, functor(src)[1]) + (keys(ldsts) == keys(lsrcs)) || + throw(ArgumentError("Tried to load $src into $dst but the structures do not match.")) + + err = DimensionMismatch("Tried to load $src into $dst but the parameter sizes do not match.") + foreach(ldsts, lsrcs) do ldst, lsrc + if ldst in cache # we already loaded this parameter before + _tie_check(ldst, lsrc) && return ldst + elseif Functors.isleaf(ldst) # our first time loading this leaf + push!(cache, ldst) + loadleaf!(ldst, lsrc, err) + else # this isn't a leaf + loadmodel!(ldst, lsrc; filter = filter, cache = cache) + end end - end - return dst + return dst end diff --git a/src/losses/Losses.jl b/src/losses/Losses.jl index 3d8f6f8149..863d075916 100644 --- a/src/losses/Losses.jl +++ b/src/losses/Losses.jl @@ -10,16 +10,16 @@ using NNlib: logsoftmax, logσ, ctc_loss, ctc_alpha, ∇ctc_loss import Base.Broadcast: broadcasted export mse, mae, msle, - label_smoothing, - crossentropy, logitcrossentropy, - binarycrossentropy, logitbinarycrossentropy, - kldivergence, - huber_loss, - tversky_loss, - dice_coeff_loss, - poisson_loss, - hinge_loss, squared_hinge_loss, - binary_focal_loss, focal_loss, siamese_contrastive_loss + label_smoothing, + crossentropy, logitcrossentropy, + binarycrossentropy, logitbinarycrossentropy, + kldivergence, + huber_loss, + tversky_loss, + dice_coeff_loss, + poisson_loss, + hinge_loss, squared_hinge_loss, + binary_focal_loss, focal_loss, siamese_contrastive_loss include("utils.jl") include("functions.jl") diff --git a/src/losses/functions.jl b/src/losses/functions.jl index ea7b4a6c65..da428851d8 100644 --- a/src/losses/functions.jl +++ b/src/losses/functions.jl @@ -11,6 +11,7 @@ Return the loss corresponding to mean absolute error: agg(abs.(ŷ .- y)) # Example + ```jldoctest julia> y_model = [1.1, 1.9, 3.1]; @@ -19,8 +20,8 @@ julia> Flux.mae(y_model, 1:3) ``` """ function mae(ŷ, y; agg = mean) - _check_sizes(ŷ, y) - agg(abs.(ŷ .- y)) + _check_sizes(ŷ, y) + return agg(abs.(ŷ .- y)) end """ @@ -33,6 +34,7 @@ Return the loss corresponding to mean square error: See also: [`mae`](@ref), [`msle`](@ref), [`crossentropy`](@ref). # Example + ```jldoctest julia> y_model = [1.1, 1.9, 3.1]; @@ -43,8 +45,8 @@ julia> Flux.mse(y_model, y_true) ``` """ function mse(ŷ, y; agg = mean) - _check_sizes(ŷ, y) - agg(abs2.(ŷ .- y)) + _check_sizes(ŷ, y) + return agg(abs2.(ŷ .- y)) end """ @@ -58,6 +60,7 @@ The `ϵ` term provides numerical stability. Penalizes an under-estimation more than an over-estimatation. 
# Example + ```jldoctest julia> Flux.msle(Float32[1.1, 2.2, 3.3], 1:3) 0.009084041f0 @@ -67,8 +70,8 @@ julia> Flux.msle(Float32[0.9, 1.8, 2.7], 1:3) ``` """ function msle(ŷ, y; agg = mean, ϵ = epseltype(ŷ)) - _check_sizes(ŷ, y) - agg((log.((ŷ .+ ϵ) ./ (y .+ ϵ))) .^2 ) + _check_sizes(ŷ, y) + return agg((log.((ŷ .+ ϵ) ./ (y .+ ϵ))) .^ 2) end """ @@ -82,23 +85,24 @@ given the prediction `ŷ` and true values `y`. | δ * (|ŷ - y| - 0.5 * δ), otherwise # Example + ```jldoctest julia> ŷ = [1.1, 2.1, 3.1]; julia> Flux.huber_loss(ŷ, 1:3) # default δ = 1 > |ŷ - y| 0.005000000000000009 -julia> Flux.huber_loss(ŷ, 1:3, δ=0.05) # changes behaviour as |ŷ - y| > δ +julia> Flux.huber_loss(ŷ, 1:3, δ = 0.05) # changes behaviour as |ŷ - y| > δ 0.003750000000000005 ``` """ function huber_loss(ŷ, y; agg = mean, δ = ofeltype(ŷ, 1)) - _check_sizes(ŷ, y) - abs_error = abs.(ŷ .- y) - #TODO: remove dropgrad when Zygote can handle this function with CuArrays - temp = Zygote.dropgrad(abs_error .< δ) - x = ofeltype(ŷ, 0.5) - agg(((abs_error .^ 2) .* temp) .* x .+ δ * (abs_error .- x * δ) .* (1 .- temp)) + _check_sizes(ŷ, y) + abs_error = abs.(ŷ .- y) + #TODO: remove dropgrad when Zygote can handle this function with CuArrays + temp = Zygote.dropgrad(abs_error .< δ) + x = ofeltype(ŷ, 0.5) + return agg(((abs_error .^ 2) .* temp) .* x .+ δ * (abs_error .- x * δ) .* (1 .- temp)) end """ @@ -124,6 +128,7 @@ value of α larger the smoothing of `y`. of label smoothing to binary distributions encoded in a single number. # Example + ```jldoctest julia> y = Flux.onehotbatch([1, 1, 1, 0, 1, 0], 0:1) 2×6 OneHotMatrix(::Vector{UInt32}) with eltype Bool: @@ -135,12 +140,12 @@ julia> y_smoothed = Flux.label_smoothing(y, 0.2f0) 0.1 0.1 0.1 0.9 0.1 0.9 0.9 0.9 0.9 0.1 0.9 0.1 -julia> y_sim = softmax(y .* log(2f0)) +julia> y_sim = softmax(y .* log(2.0f0)) 2×6 Matrix{Float32}: 0.333333 0.333333 0.333333 0.666667 0.333333 0.666667 0.666667 0.666667 0.666667 0.333333 0.666667 0.333333 -julia> y_dis = vcat(y_sim[2,:]', y_sim[1,:]') +julia> y_dis = vcat(y_sim[2, :]', y_sim[1, :]') 2×6 Matrix{Float32}: 0.666667 0.666667 0.666667 0.333333 0.666667 0.333333 0.333333 0.333333 0.333333 0.666667 0.333333 0.666667 @@ -152,14 +157,14 @@ julia> Flux.crossentropy(y_dis, y) > Flux.crossentropy(y_dis, y_smoothed) true ``` """ -function label_smoothing(y::Union{AbstractArray,Number}, α::Number; dims::Int = 1) +function label_smoothing(y::Union{AbstractArray, Number}, α::Number; dims::Int = 1) if !(0 < α < 1) throw(ArgumentError("α must be between 0 and 1")) end if dims == 0 - y_smoothed = y .* (1 - α) .+ α*1//2 + y_smoothed = y .* (1 - α) .+ α * 1 // 2 elseif dims == 1 - y_smoothed = y .* (1 - α) .+ α* 1 // size(y, 1) + y_smoothed = y .* (1 - α) .+ α * 1 // size(y, 1) else throw(ArgumentError("`dims` should be either 0 or 1")) end @@ -189,6 +194,7 @@ computing the loss. See also: [`logitcrossentropy`](@ref), [`binarycrossentropy`](@ref), [`logitbinarycrossentropy`](@ref). 
# Example + ```jldoctest julia> y_label = Flux.onehotbatch([0, 1, 2, 1, 0], 0:2) 3×5 OneHotMatrix(::Vector{UInt32}) with eltype Bool: @@ -196,20 +202,20 @@ julia> y_label = Flux.onehotbatch([0, 1, 2, 1, 0], 0:2) ⋅ 1 ⋅ 1 ⋅ ⋅ ⋅ 1 ⋅ ⋅ -julia> y_model = softmax(reshape(-7:7, 3, 5) .* 1f0) +julia> y_model = softmax(reshape(-7:7, 3, 5) .* 1.0f0) 3×5 Matrix{Float32}: 0.0900306 0.0900306 0.0900306 0.0900306 0.0900306 0.244728 0.244728 0.244728 0.244728 0.244728 0.665241 0.665241 0.665241 0.665241 0.665241 -julia> sum(y_model; dims=1) +julia> sum(y_model; dims = 1) 1×5 Matrix{Float32}: 1.0 1.0 1.0 1.0 1.0 julia> Flux.crossentropy(y_model, y_label) 1.6076053f0 -julia> 5 * ans ≈ Flux.crossentropy(y_model, y_label; agg=sum) +julia> 5 * ans ≈ Flux.crossentropy(y_model, y_label; agg = sum) true julia> y_smooth = Flux.label_smoothing(y_label, 0.15f0) @@ -223,8 +229,8 @@ julia> Flux.crossentropy(y_model, y_smooth) ``` """ function crossentropy(ŷ, y; dims = 1, agg = mean, ϵ = epseltype(ŷ)) - _check_sizes(ŷ, y) - agg(.-sum(xlogy.(y, ŷ .+ ϵ); dims = dims)) + _check_sizes(ŷ, y) + return agg(.-sum(xlogy.(y, ŷ .+ ϵ); dims = dims)) end """ @@ -241,6 +247,7 @@ and [softmax](@ref Softmax) separately. See also: [`binarycrossentropy`](@ref), [`logitbinarycrossentropy`](@ref), [`label_smoothing`](@ref). # Example + ```jldoctest julia> y_label = Flux.onehotbatch(collect("abcabaa"), 'a':'c') 3×7 OneHotMatrix(::Vector{UInt32}) with eltype Bool: @@ -262,8 +269,8 @@ julia> Flux.crossentropy(softmax(y_model), y_label) ``` """ function logitcrossentropy(ŷ, y; dims = 1, agg = mean) - _check_sizes(ŷ, y) - agg(.-sum(y .* logsoftmax(ŷ; dims = dims); dims = dims)) + _check_sizes(ŷ, y) + return agg(.-sum(y .* logsoftmax(ŷ; dims = dims); dims = dims)) end """ @@ -283,22 +290,23 @@ computing the loss. See also: [`crossentropy`](@ref), [`logitcrossentropy`](@ref). # Examples + ```jldoctest -julia> y_bin = Bool[1,0,1] +julia> y_bin = Bool[1, 0, 1] 3-element Vector{Bool}: 1 0 1 -julia> y_prob = softmax(reshape(vcat(1:3, 3:5), 2, 3) .* 1f0) +julia> y_prob = softmax(reshape(vcat(1:3, 3:5), 2, 3) .* 1.0f0) 2×3 Matrix{Float32}: 0.268941 0.5 0.268941 0.731059 0.5 0.731059 -julia> Flux.binarycrossentropy(y_prob[2,:], y_bin) +julia> Flux.binarycrossentropy(y_prob[2, :], y_bin) 0.43989f0 -julia> all(p -> 0 < p < 1, y_prob[2,:]) # else DomainError +julia> all(p -> 0 < p < 1, y_prob[2, :]) # else DomainError true julia> y_hot = Flux.onehotbatch(y_bin, 0:1) @@ -311,8 +319,8 @@ julia> Flux.crossentropy(y_prob, y_hot) ``` """ function binarycrossentropy(ŷ, y; agg = mean, ϵ = epseltype(ŷ)) - _check_sizes(ŷ, y) - agg(@.(-xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ))) + _check_sizes(ŷ, y) + return agg(@.(-xlogy(y, ŷ + ϵ)-xlogy(1 - y, 1 - ŷ + ϵ))) end """ @@ -324,8 +332,9 @@ Mathematically equivalent to See also: [`crossentropy`](@ref), [`logitcrossentropy`](@ref). # Examples + ```jldoctest -julia> y_bin = Bool[1,0,1]; +julia> y_bin = Bool[1, 0, 1]; julia> y_model = Float32[2, -1, pi] 3-element Vector{Float32}: @@ -341,8 +350,8 @@ julia> Flux.binarycrossentropy(sigmoid.(y_model), y_bin) ``` """ function logitbinarycrossentropy(ŷ, y; agg = mean) - _check_sizes(ŷ, y) - agg(@.((1 - y) * ŷ - logσ(ŷ))) + _check_sizes(ŷ, y) + return agg(@.((1 - y) * ŷ-logσ(ŷ))) end """ @@ -356,6 +365,7 @@ The KL divergence is a measure of how much one probability distribution is diffe from the other. It is always non-negative, and zero only when both the distributions are equal. 
# Example + ```jldoctest julia> p1 = [1 0; 0 1] 2×2 Matrix{Int64}: @@ -381,10 +391,10 @@ Inf ``` """ function kldivergence(ŷ, y; dims = 1, agg = mean, ϵ = epseltype(ŷ)) - _check_sizes(ŷ, y) - entropy = agg(sum(xlogx.(y), dims = dims)) - cross_entropy = crossentropy(ŷ, y; dims = dims, agg = agg, ϵ = ϵ) - return entropy + cross_entropy + _check_sizes(ŷ, y) + entropy = agg(sum(xlogx.(y), dims = dims)) + cross_entropy = crossentropy(ŷ, y; dims = dims, agg = agg, ϵ = ϵ) + return entropy + cross_entropy end """ @@ -398,6 +408,7 @@ distribution `y`; calculated as - [More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson). # Example + ```jldoctest julia> y_model = [1, 3, 3]; # data should only take integral values @@ -406,8 +417,8 @@ julia> Flux.poisson_loss(y_model, 1:3) ``` """ function poisson_loss(ŷ, y; agg = mean) - _check_sizes(ŷ, y) - agg(ŷ .- xlogy.(y, ŷ)) + _check_sizes(ŷ, y) + return agg(ŷ .- xlogy.(y, ŷ)) end """ @@ -422,6 +433,7 @@ Usually used with classifiers like Support Vector Machines. See also: [`squared_hinge_loss`](@ref) # Example + ```jldoctest julia> y_true = [1, -1, 1, 1]; @@ -441,8 +453,8 @@ true ``` """ function hinge_loss(ŷ, y; agg = mean) - _check_sizes(ŷ, y) - agg(max.(0, 1 .- ŷ .* y)) + _check_sizes(ŷ, y) + return agg(max.(0, 1 .- ŷ .* y)) end """ @@ -457,6 +469,7 @@ Usually used with classifiers like Support Vector Machines. See also: [`hinge_loss`](@ref) # Example + ```jldoctes julia> y_true = [1, -1, 1, 1]; @@ -476,8 +489,8 @@ true ``` """ function squared_hinge_loss(ŷ, y; agg = mean) - _check_sizes(ŷ, y) - agg((max.(0, 1 .- ŷ .* y)) .^ 2) + _check_sizes(ŷ, y) + return agg((max.(0, 1 .- ŷ .* y)) .^ 2) end """ @@ -491,6 +504,7 @@ The dice coefficient is similar to the F1_score. 
Loss calculated as: 1 - 2*sum(|ŷ .* y| + smooth) / (sum(ŷ.^2) + sum(y.^2) + smooth) # Example + ```jldoctest julia> y_pred = [1.1, 2.1, 3.1]; @@ -502,8 +516,8 @@ julia> 1 - Flux.dice_coeff_loss(y_pred, 1:3) # ~ F1 score for image segmentatio ``` """ function dice_coeff_loss(ŷ, y; smooth = ofeltype(ŷ, 1.0)) - _check_sizes(ŷ, y) - 1 - (2 * sum(y .* ŷ) + smooth) / (sum(y .^ 2) + sum(ŷ .^ 2) + smooth) #TODO agg + _check_sizes(ŷ, y) + return 1 - (2 * sum(y .* ŷ) + smooth) / (sum(y .^ 2) + sum(ŷ .^ 2) + smooth) #TODO agg end """ @@ -515,14 +529,13 @@ Larger β weigh recall more than precision (by placing more emphasis on false ne Calculated as: 1 - sum(|y .* ŷ| + 1) / (sum(y .* ŷ + (1 - β)*(1 .- y) .* ŷ + β*y .* (1 .- ŷ)) + 1) - """ function tversky_loss(ŷ, y; β = ofeltype(ŷ, 0.7)) _check_sizes(ŷ, y) #TODO add agg num = sum(y .* ŷ) + 1 den = sum(y .* ŷ + β * (1 .- y) .* ŷ + (1 - β) * y .* (1 .- ŷ)) + 1 - 1 - num / den + return 1 - num / den end """ @@ -536,15 +549,16 @@ For `γ == 0`, the loss is mathematically equivalent to [`Losses.binarycrossentr See also: [`Losses.focal_loss`](@ref) for multi-class setting # Example + ```jldoctest -julia> y = [0 1 0 - 1 0 1] +julia> y = [0 1 0 + 1 0 1] 2×3 Matrix{Int64}: 0 1 0 1 0 1 -julia> ŷ = [0.268941 0.5 0.268941 - 0.731059 0.5 0.731059] +julia> ŷ = [0.268941 0.5 0.268941 + 0.731059 0.5 0.731059] 2×3 Matrix{Float64}: 0.268941 0.5 0.268941 0.731059 0.5 0.731059 @@ -553,14 +567,14 @@ julia> Flux.binary_focal_loss(ŷ, y) ≈ 0.0728675615927385 true ``` """ -function binary_focal_loss(ŷ, y; agg=mean, γ=2, ϵ=epseltype(ŷ)) +function binary_focal_loss(ŷ, y; agg = mean, γ = 2, ϵ = epseltype(ŷ)) _check_sizes(ŷ, y) ŷ = ŷ .+ ϵ - p_t = y .* ŷ + (1 .- y) .* (1 .- ŷ) + p_t = y .* ŷ + (1 .- y) .* (1 .- ŷ) ce = -log.(p_t) weight = (1 .- p_t) .^ γ loss = weight .* ce - agg(loss) + return agg(loss) end """ @@ -575,16 +589,17 @@ The modulating factor, `γ`, controls the down-weighting strength. For `γ == 0`, the loss is mathematically equivalent to [`Losses.crossentropy`](@ref). # Example + ```jldoctest -julia> y = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] +julia> y = [1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0] 3×5 Matrix{Int64}: 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 -julia> ŷ = softmax(reshape(-7:7, 3, 5) .* 1f0) +julia> ŷ = softmax(reshape(-7:7, 3, 5) .* 1.0f0) 3×5 Matrix{Float32}: 0.0900306 0.0900306 0.0900306 0.0900306 0.0900306 0.244728 0.244728 0.244728 0.244728 0.244728 @@ -595,25 +610,25 @@ true ``` See also: [`Losses.binary_focal_loss`](@ref) for binary (not one-hot) labels - """ -function focal_loss(ŷ, y; dims=1, agg=mean, γ=2, ϵ=epseltype(ŷ)) +function focal_loss(ŷ, y; dims = 1, agg = mean, γ = 2, ϵ = epseltype(ŷ)) _check_sizes(ŷ, y) ŷ = ŷ .+ ϵ - agg(sum(@. -y * (1 - ŷ)^γ * log(ŷ); dims=dims)) + return agg(sum(@. -y * (1 - ŷ)^γ * log(ŷ); dims = dims)) end """ siamese_contrastive_loss(ŷ, y; margin = 1, agg = mean) - + Return the [contrastive loss](http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf) which can be useful for training Siamese Networks. It is given by - + agg(@. (1 - y) * ŷ^2 + y * max(0, margin - ŷ)^2) - + Specify `margin` to set the baseline for distance at which pairs are dissimilar. # Example + ```jldoctest julia> ŷ = [0.5, 1.5, 2.5]; diff --git a/src/losses/utils.jl b/src/losses/utils.jl index e42bdfbe2e..cda3e4a557 100644 --- a/src/losses/utils.jl +++ b/src/losses/utils.jl @@ -4,8 +4,8 @@ Return `x * log(x)` for `x ≥ 0`, handling `x == 0` by taking the limit from above, to get zero. 
""" function xlogx(x) - result = x * log(x) - ifelse(iszero(x), zero(result), result) + result = x * log(x) + return ifelse(iszero(x), zero(result), result) end """ @@ -14,24 +14,25 @@ end Return `x * log(y)` for `y > 0`, and zero when `x == 0`. """ function xlogy(x, y) - result = x * log(y) - ifelse(iszero(x), zero(result), result) + result = x * log(y) + return ifelse(iszero(x), zero(result), result) end @adjoint function broadcasted(::typeof(xlogy), x::Zygote.Numeric, y::Zygote.Numeric) - res = xlogy.(x, y) - res, Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), Zygote.unbroadcast(y, Δ .* x ./ y)) + res = xlogy.(x, y) + return res, + Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), + Zygote.unbroadcast(y, Δ .* x ./ y)) end -ChainRulesCore.@scalar_rule xlogy(x, y) (log(y), x/y) # should help Diffractor's broadcasting -ChainRulesCore.@scalar_rule xlogx(x) (log(y) + true) +ChainRulesCore.@scalar_rule xlogy(x, y) (log(y), x / y) # should help Diffractor's broadcasting +ChainRulesCore.@scalar_rule xlogx(x) (log(y)+true) function _check_sizes(ŷ::AbstractArray, y::AbstractArray) - for d in 1:max(ndims(ŷ), ndims(y)) - size(ŷ,d) == size(y,d) || throw(DimensionMismatch( - "loss function expects size(ŷ) = $(size(ŷ)) to match size(y) = $(size(y))" - )) - end + for d in 1:max(ndims(ŷ), ndims(y)) + size(ŷ, d) == size(y, d) || + throw(DimensionMismatch("loss function expects size(ŷ) = $(size(ŷ)) to match size(y) = $(size(y))")) + end end _check_sizes(ŷ, y) = nothing # pass-through, for constant label e.g. y = 1 diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl index e691ce0170..fa78f513d8 100644 --- a/src/optimise/Optimise.jl +++ b/src/optimise/Optimise.jl @@ -4,10 +4,10 @@ using LinearAlgebra import ArrayInterface export train!, update!, - Descent, Adam, Momentum, Nesterov, RMSProp, - AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, AdamW,RAdam, OAdam, AdaBelief, - InvDecay, ExpDecay, WeightDecay, stop, skip, Optimiser, - ClipValue, ClipNorm + Descent, Adam, Momentum, Nesterov, RMSProp, + AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, AdamW, RAdam, OAdam, AdaBelief, + InvDecay, ExpDecay, WeightDecay, stop, skip, Optimiser, + ClipValue, ClipNorm include("optimisers.jl") include("train.jl") diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index ce72a4b0ce..f4d9687384 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -14,10 +14,12 @@ Classic gradient descent optimiser with learning rate `η`. For each parameter `p` and its gradient `δp`, this runs `p -= η*δp` # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. + + - Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. # Examples + ```julia opt = Descent() @@ -26,20 +28,20 @@ opt = Descent(0.3) ps = Flux.params(model) gs = gradient(ps) do - loss(x, y) + return loss(x, y) end Flux.Optimise.update!(opt, ps, gs) ``` """ mutable struct Descent <: AbstractOptimiser - eta::Float64 + eta::Float64 end Descent() = Descent(0.1) function apply!(o::Descent, x, Δ) - Δ .*= o.eta + return Δ .*= o.eta end """ @@ -48,12 +50,14 @@ end Gradient descent optimizer with learning rate `η` and momentum `ρ`. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- Momentum (`ρ`): Controls the acceleration of gradient descent in the - prominent direction, in effect damping oscillations. 
+ + - Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. + - Momentum (`ρ`): Controls the acceleration of gradient descent in the + prominent direction, in effect damping oscillations. # Examples + ```julia opt = Momentum() @@ -61,18 +65,18 @@ opt = Momentum(0.01, 0.99) ``` """ mutable struct Momentum <: AbstractOptimiser - eta::Float64 - rho::Float64 - velocity::IdDict + eta::Float64 + rho::Float64 + velocity::IdDict end Momentum(η = 0.01, ρ = 0.9) = Momentum(η, ρ, IdDict()) function apply!(o::Momentum, x, Δ) - η, ρ = o.eta, o.rho - v = get!(() -> zero(x), o.velocity, x)::typeof(x) - @. v = ρ * v - η * Δ - @. Δ = -v + η, ρ = o.eta, o.rho + v = get!(() -> zero(x), o.velocity, x)::typeof(x) + @. v = ρ * v - η * Δ + @. Δ = -v end """ @@ -81,12 +85,14 @@ end Gradient descent optimizer with learning rate `η` and Nesterov momentum `ρ`. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- Nesterov momentum (`ρ`): Controls the acceleration of gradient descent in the - prominent direction, in effect damping oscillations. + + - Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. + - Nesterov momentum (`ρ`): Controls the acceleration of gradient descent in the + prominent direction, in effect damping oscillations. # Examples + ```julia opt = Nesterov() @@ -94,19 +100,19 @@ opt = Nesterov(0.003, 0.95) ``` """ mutable struct Nesterov <: AbstractOptimiser - eta::Float64 - rho::Float64 - velocity::IdDict + eta::Float64 + rho::Float64 + velocity::IdDict end Nesterov(η = 0.001, ρ = 0.9) = Nesterov(η, ρ, IdDict()) function apply!(o::Nesterov, x, Δ) - η, ρ = o.eta, o.rho - v = get!(() -> zero(x), o.velocity, x)::typeof(x) - d = @. ρ^2 * v - (1+ρ) * η * Δ - @. v = ρ*v - η*Δ - @. Δ = -d + η, ρ = o.eta, o.rho + v = get!(() -> zero(x), o.velocity, x)::typeof(x) + d = @. ρ^2 * v - (1 + ρ) * η * Δ + @. v = ρ * v - η * Δ + @. Δ = -d end """ @@ -131,19 +137,19 @@ opt = RMSProp(0.002, 0.95) ``` """ mutable struct RMSProp <: AbstractOptimiser - eta::Float64 - rho::Float64 - epsilon::Float64 - acc::IdDict + eta::Float64 + rho::Float64 + epsilon::Float64 + acc::IdDict end RMSProp(η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = EPS) = RMSProp(η, ρ, ϵ, IdDict()) RMSProp(η::Real, ρ::Real, acc::IdDict) = RMSProp(η, ρ, EPS, acc) function apply!(o::RMSProp, x, Δ) - η, ρ = o.eta, o.rho - acc = get!(() -> zero(x), o.acc, x)::typeof(x) - @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ) - @. Δ *= η / (√acc + o.epsilon) + η, ρ = o.eta, o.rho + acc = get!(() -> zero(x), o.acc, x)::typeof(x) + @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ) + @. Δ *= η / (√acc + o.epsilon) end """ @@ -165,27 +171,28 @@ opt = Adam(0.001, (0.9, 0.8)) ``` """ mutable struct Adam <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64,Float64} - epsilon::Float64 - state::IdDict{Any, Any} + eta::Float64 + beta::Tuple{Float64, Float64} + epsilon::Float64 + state::IdDict{Any, Any} end Adam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = Adam(η, β, ϵ, IdDict()) Adam(η::Real, β::Tuple, state::IdDict) = Adam(η, β, EPS, state) function apply!(o::Adam, x, Δ) - η, β = o.eta, o.beta + η, β = o.eta, o.beta - mt, vt, βp = get!(o.state, x) do - (zero(x), zero(x), Float64[β[1], β[2]]) - end :: Tuple{typeof(x),typeof(x),Vector{Float64}} + mt, vt, βp = get!(o.state, x) do + return (zero(x), zero(x), + Float64[β[1], β[2]]) + end::Tuple{typeof(x), typeof(x), Vector{Float64}} - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. 
vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) - @. Δ = mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) * η - βp .= βp .* β + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) + @. Δ = mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) * η + βp .= βp .* β - return Δ + return Δ end """ @@ -207,35 +214,37 @@ opt = RAdam(0.001, (0.9, 0.8)) ``` """ mutable struct RAdam <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64,Float64} - epsilon::Float64 - state::IdDict{Any, Any} + eta::Float64 + beta::Tuple{Float64, Float64} + epsilon::Float64 + state::IdDict{Any, Any} end RAdam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = RAdam(η, β, ϵ, IdDict()) RAdam(η::Real, β::Tuple, state::IdDict) = RAdam(η, β, EPS, state) function apply!(o::RAdam, x, Δ) - η, β = o.eta, o.beta - ρ∞ = 2/(1-β[2])-1 - - mt, vt, βp, t = get!(o.state, x) do - (zero(x), zero(x), Float64[β[1], β[2]], Ref(1)) - end :: Tuple{typeof(x),typeof(x),Vector{Float64},Base.RefValue{Int}} - - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) - ρ = ρ∞ - 2t[] * βp[2] / (1 - βp[2]) - if ρ > 4 - r = sqrt((ρ-4)*(ρ-2)*ρ∞/((ρ∞-4)*(ρ∞-2)*ρ)) - @. Δ = mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) * η * r - else - @. Δ = mt / (1 - βp[1]) * η - end - βp .= βp .* β - t[] += 1 + η, β = o.eta, o.beta + ρ∞ = 2 / (1 - β[2]) - 1 + + mt, vt, βp, t = get!(o.state, + x) do + return (zero(x), zero(x), Float64[β[1], β[2]], + Ref(1)) + end::Tuple{typeof(x), typeof(x), Vector{Float64}, Base.RefValue{Int}} + + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) + ρ = ρ∞ - 2t[] * βp[2] / (1 - βp[2]) + if ρ > 4 + r = sqrt((ρ - 4) * (ρ - 2) * ρ∞ / ((ρ∞ - 4) * (ρ∞ - 2) * ρ)) + @. Δ = mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) * η * r + else + @. Δ = mt / (1 - βp[1]) * η + end + βp .= βp .* β + t[] += 1 - return Δ + return Δ end """ @@ -257,27 +266,28 @@ opt = AdaMax(0.001, (0.9, 0.995)) ``` """ mutable struct AdaMax <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64,Float64} - epsilon::Float64 - state::IdDict{Any, Any} + eta::Float64 + beta::Tuple{Float64, Float64} + epsilon::Float64 + state::IdDict{Any, Any} end AdaMax(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = AdaMax(η, β, ϵ, IdDict()) AdaMax(η::Real, β::Tuple, state::IdDict) = AdaMax(η, β, EPS, state) function apply!(o::AdaMax, x, Δ) - η, β = o.eta, o.beta + η, β = o.eta, o.beta - mt, ut, βp = get!(o.state, x) do - (zero(x), zero(x), Float64[β[1], β[2]]) - end :: Tuple{typeof(x),typeof(x),Vector{Float64}} + mt, ut, βp = get!(o.state, x) do + return (zero(x), zero(x), + Float64[β[1], β[2]]) + end::Tuple{typeof(x), typeof(x), Vector{Float64}} - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. ut = max(β[2] * ut, abs(Δ)) - @. Δ = (η/(1 - βp[1])) * mt/(ut + o.epsilon) - βp .= βp .* β + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. ut = max(β[2] * ut, abs(Δ)) + @. 
Δ = (η / (1 - βp[1])) * mt / (ut + o.epsilon) + βp .= βp .* β - return Δ + return Δ end """ @@ -300,29 +310,31 @@ opt = OAdam(0.001, (0.9, 0.995)) ``` """ mutable struct OAdam <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64,Float64} - epsilon::Float64 - state::IdDict{Any, Any} + eta::Float64 + beta::Tuple{Float64, Float64} + epsilon::Float64 + state::IdDict{Any, Any} end OAdam(η::Real = 0.001, β::Tuple = (0.5, 0.9), ϵ::Real = EPS) = OAdam(η, β, ϵ, IdDict()) OAdam(η::Real, β::Tuple, state::IdDict) = RMSProp(η, β, EPS, state) function apply!(o::OAdam, x, Δ) - η, β = o.eta, o.beta + η, β = o.eta, o.beta - mt, vt, Δ_, βp = get!(o.state, x) do - (zero(x), zero(x), zero(x), Float64[β[1], β[2]]) - end :: Tuple{typeof(x),typeof(x),typeof(x),Vector{Float64}} + mt, vt, Δ_, βp = get!(o.state, + x) do + return (zero(x), zero(x), zero(x), + Float64[β[1], β[2]]) + end::Tuple{typeof(x), typeof(x), typeof(x), Vector{Float64}} - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) - @. Δ = -Δ_ - @. Δ_ = η * mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) - @. Δ += 2Δ_ - βp .= βp .* β + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) + @. Δ = -Δ_ + @. Δ_ = η * mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) + @. Δ += 2Δ_ + βp .= βp .* β - return Δ + return Δ end """ @@ -344,18 +356,18 @@ opt = AdaGrad(0.001) ``` """ mutable struct AdaGrad <: AbstractOptimiser - eta::Float64 - epsilon::Float64 - acc::IdDict + eta::Float64 + epsilon::Float64 + acc::IdDict end AdaGrad(η::Real = 0.1, ϵ::Real = EPS) = AdaGrad(η, ϵ, IdDict()) AdaGrad(η::Real, state::IdDict) = AdaGrad(η, EPS, state) function apply!(o::AdaGrad, x, Δ) - η = o.eta - acc = get!(() -> fill!(similar(x), o.epsilon), o.acc, x)::typeof(x) - @. acc += Δ * conj(Δ) - @. Δ *= η / (√acc + o.epsilon) + η = o.eta + acc = get!(() -> fill!(similar(x), o.epsilon), o.acc, x)::typeof(x) + @. acc += Δ * conj(Δ) + @. Δ *= η / (√acc + o.epsilon) end """ @@ -376,22 +388,22 @@ opt = AdaDelta(0.89) ``` """ mutable struct AdaDelta <: AbstractOptimiser - rho::Float64 - epsilon::Float64 - state::IdDict{Any, Any} + rho::Float64 + epsilon::Float64 + state::IdDict{Any, Any} end AdaDelta(ρ::Real = 0.9, ϵ::Real = EPS) = AdaDelta(ρ, ϵ, IdDict()) AdaDelta(ρ::Real, state::IdDict) = AdaDelta(ρ, EPS, state) function apply!(o::AdaDelta, x, Δ) - ρ = o.rho - acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2,typeof(x)} - @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ) - # DON'T remove epsilon from numerator - # or even out of the square roots - @. Δ *= √(Δacc + o.epsilon) / √(acc + o.epsilon) - @. Δacc = ρ * Δacc + (1 - ρ) * Δ * conj(Δ) - return Δ + ρ = o.rho + acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2, typeof(x)} + @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ) + # DON'T remove epsilon from numerator + # or even out of the square roots + @. Δ *= √(Δacc + o.epsilon) / √(acc + o.epsilon) + @. 
Δacc = ρ * Δacc + (1 - ρ) * Δ * conj(Δ) + return Δ end """ @@ -414,25 +426,26 @@ opt = AMSGrad(0.001, (0.89, 0.995)) ``` """ mutable struct AMSGrad <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64, Float64} - epsilon::Float64 - state::IdDict{Any, Any} + eta::Float64 + beta::Tuple{Float64, Float64} + epsilon::Float64 + state::IdDict{Any, Any} end AMSGrad(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AMSGrad(η, β, ϵ, IdDict()) AMSGrad(η::Real, β::Tuple, state::IdDict) = AMSGrad(η, β, EPS, state) function apply!(o::AMSGrad, x, Δ) - η, β = o.eta, o.beta + η, β = o.eta, o.beta - mt, vt, v̂t = get!(o.state, x) do - (fill!(similar(x), o.epsilon), fill!(similar(x), o.epsilon), fill!(similar(x), o.epsilon)) - end :: NTuple{3,typeof(x)} + mt, vt, v̂t = get!(o.state, x) do + return (fill!(similar(x), o.epsilon), fill!(similar(x), o.epsilon), + fill!(similar(x), o.epsilon)) + end::NTuple{3, typeof(x)} - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. vt = β[2] * vt + (1 - β[2]) * Δ ^ 2 - @. v̂t = max(v̂t, vt) - @. Δ = η * mt / (√v̂t + o.epsilon) + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. vt = β[2] * vt + (1 - β[2]) * Δ^2 + @. v̂t = max(v̂t, vt) + @. Δ = η * mt / (√v̂t + o.epsilon) end """ @@ -455,28 +468,30 @@ opt = NAdam(0.002, (0.89, 0.995)) ``` """ mutable struct NAdam <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64, Float64} - epsilon::Float64 - state::IdDict{Any, Any} + eta::Float64 + beta::Tuple{Float64, Float64} + epsilon::Float64 + state::IdDict{Any, Any} end NAdam(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = NAdam(η, β, ϵ, IdDict()) NAdam(η::Real, β::Tuple, state::IdDict) = NAdam(η, β, EPS, state) function apply!(o::NAdam, x, Δ) - η, β = o.eta, o.beta + η, β = o.eta, o.beta - mt, vt, βp = get!(o.state, x) do - (zero(x), zero(x), Float64[o.beta[1], o.beta[2]]) - end :: Tuple{typeof(x),typeof(x),Vector{Float64}} - β1p, β2p = βp + mt, vt, βp = get!(o.state, x) do + return (zero(x), zero(x), + Float64[o.beta[1], o.beta[2]]) + end::Tuple{typeof(x), typeof(x), Vector{Float64}} + β1p, β2p = βp - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) - @. Δ = (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) / (√(vt * β[2] / (1 - β2p)) + o.epsilon) * η - βp .= βp .* β + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) + @. Δ = (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) / + (√(vt * β[2] / (1 - β2p)) + o.epsilon) * η + βp .= βp .* β - return Δ + return Δ end """ @@ -486,21 +501,22 @@ end weight decay regularization. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. -- `decay`: Decay applied to weights during optimisation. + + - Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. + - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. + - `decay`: Decay applied to weights during optimisation. 
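As the method definition just below this docstring shows, `AdamW` is simply `Adam` composed with `WeightDecay`; a minimal sketch of that equivalence, with arbitrary hyperparameters:

```julia
opt1 = AdamW(0.001, (0.9, 0.999), 0.01)
opt2 = Optimiser(Adam(0.001, (0.9, 0.999)), WeightDecay(0.01))  # the same composition, spelled out
```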
# Examples + ```julia opt = AdamW() opt = AdamW(0.001, (0.89, 0.995), 0.1) ``` """ -AdamW(η = 0.001, β = (0.9, 0.999), decay = 0) = - Optimiser(Adam(η, β), WeightDecay(decay)) +AdamW(η = 0.001, β = (0.9, 0.999), decay = 0) = Optimiser(Adam(η, β), WeightDecay(decay)) """ AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) @@ -522,40 +538,40 @@ opt = AdaBelief(0.001, (0.9, 0.8)) ``` """ mutable struct AdaBelief <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64,Float64} - epsilon::Float64 - state::IdDict{Any, Any} + eta::Float64 + beta::Tuple{Float64, Float64} + epsilon::Float64 + state::IdDict{Any, Any} end AdaBelief(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AdaBelief(η, β, ϵ, IdDict()) AdaBelief(η::Real, β::Tuple, state::IdDict) = AdaBelief(η, β, EPS, state) function apply!(o::AdaBelief, x, Δ) - η, β = o.eta, o.beta - - mt, st, βp = get!(o.state, x) do - (zero(x), zero(x), Float64[β[1], β[2]]) - end :: Tuple{typeof(x), typeof(x), Vector{Float64}} + η, β = o.eta, o.beta + + mt, st, βp = get!(o.state, x) do + return (zero(x), zero(x), + Float64[β[1], β[2]]) + end::Tuple{typeof(x), typeof(x), Vector{Float64}} + + #= st is a variance and can go to zero. This is in contrast to Adam, which uses the + second moment which is usually far enough from zero. This is problematic, since st + can be slightly negative due to numerical error, and the square root below will fail. + Also, if we want to differentiate through the optimizer, √0 is not differentiable. + To protect against this, we add a small number, st -> st + eps2. + The original implementation (https://github.com/juntang-zhuang/Adabelief-Optimizer) + uses the square of Adam's epsilon, which we do here. + See also: https://github.com/juntang-zhuang/Adabelief-Optimizer/issues/61 =# + eps2 = o.epsilon^2 # TODO: make epsilon^2 the default in next breaking release + + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. st = β[2] * st + (1 - β[2]) * (Δ - mt) * conj(Δ - mt) + eps2 + @. Δ = η * mt / (1 - βp[1]) / (√(st / (1 - βp[2])) + eps2) + βp .= βp .* β - #= st is a variance and can go to zero. This is in contrast to Adam, which uses the - second moment which is usually far enough from zero. This is problematic, since st - can be slightly negative due to numerical error, and the square root below will fail. - Also, if we want to differentiate through the optimizer, √0 is not differentiable. - To protect against this, we add a small number, st -> st + eps2. - The original implementation (https://github.com/juntang-zhuang/Adabelief-Optimizer) - uses the square of Adam's epsilon, which we do here. - See also: https://github.com/juntang-zhuang/Adabelief-Optimizer/issues/61 =# - eps2 = o.epsilon^2 # TODO: make epsilon^2 the default in next breaking release - - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. st = β[2] * st + (1 - β[2]) * (Δ - mt) * conj(Δ - mt) + eps2 - @. Δ = η * mt / (1 - βp[1]) / (√(st / (1 - βp[2])) + eps2) - βp .= βp .* β - - return Δ + return Δ end - # Compose optimizers """ @@ -566,21 +582,22 @@ that will be fed into the next, and this is finally applied to the parameter as usual. """ mutable struct Optimiser <: AbstractOptimiser - os::Vector{Any} + os::Vector{Any} end Optimiser(opts::AbstractOptimiser...) = Optimiser(Any[opts...]) -@forward Optimiser.os Base.getindex, Base.first, Base.last, Base.lastindex, Base.push!, Base.setindex! +@forward Optimiser.os Base.getindex, Base.first, Base.last, Base.lastindex, Base.push!, + Base.setindex! 
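# A sketch of how these rules compose (hyperparameters are arbitrary): `apply!` below
# folds the gradient through each component in the order given, so
#
#     opt = Optimiser(WeightDecay(1.0f-4), Adam(1.0f-3), ExpDecay(1.0))
#
# first adds `wd .* x` to the gradient (WeightDecay), then rescales it (Adam), and
# finally multiplies the step by a decaying schedule (ExpDecay with `η = 1`).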
@forward Optimiser.os Base.iterate Base.getindex(c::Optimiser, i::AbstractArray) = Optimiser(c.os[i]...) function apply!(o::Optimiser, x, Δ) - for opt in o.os - Δ = apply!(opt, x, Δ) - end - return Δ + for opt in o.os + Δ = apply!(opt, x, Δ) + end + return Δ end """ @@ -595,28 +612,28 @@ for more general scheduling techniques. # Examples -`InvDecay` is typically composed with other optimizers +`InvDecay` is typically composed with other optimizers as the last transformation of the gradient: ```julia # Inverse decay of the learning rate # with starting value 0.001 and decay coefficient 0.01. -opt = Optimiser(Adam(1f-3), InvDecay(1f-2)) +opt = Optimiser(Adam(1.0f-3), InvDecay(1.0f-2)) ``` """ mutable struct InvDecay <: AbstractOptimiser - gamma::Float64 - state::IdDict{Any, Int} + gamma::Float64 + state::IdDict{Any, Int} end InvDecay(γ = 0.001) = InvDecay(γ, IdDict{Any, Int}()) function apply!(o::InvDecay, x, Δ) - γ = o.gamma - n = get!(o.state, x, 1) - Δ .*= 1 / (1 + γ * n) - o.state[x] = n + 1 - return Δ + γ = o.gamma + n = get!(o.state, x, 1) + Δ .*= 1 / (1 + γ * n) + o.state[x] = n + 1 + return Δ end """ @@ -626,73 +643,77 @@ Discount the learning rate `η` by the factor `decay` every `decay_step` steps t a minimum of `clip`. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- `decay`: Factor by which the learning rate is discounted. -- `decay_step`: Schedule decay operations by setting the number of steps between - two decay operations. -- `clip`: Minimum value of learning rate. -- 'start': Step at which the decay starts. + - Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. + - `decay`: Factor by which the learning rate is discounted. + - `decay_step`: Schedule decay operations by setting the number of steps between + two decay operations. + - `clip`: Minimum value of learning rate. + - 'start': Step at which the decay starts. See also the [Scheduling Optimisers](@ref) section of the docs for more general scheduling techniques. # Examples -`ExpDecay` is typically composed with other optimizers +`ExpDecay` is typically composed with other optimizers as the last transformation of the gradient: + ```julia opt = Optimiser(Adam(), ExpDecay(1.0)) ``` + Note: you may want to start with `η=1` in `ExpDecay` when combined with other optimizers (`Adam` in this case) that have their own learning rate. """ mutable struct ExpDecay <: AbstractOptimiser - eta::Float64 - decay::Float64 - step::Int64 - clip::Float64 - start::Int64 - current::IdDict + eta::Float64 + decay::Float64 + step::Int64 + clip::Float64 + start::Int64 + current::IdDict end -ExpDecay(opt = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4, start = 0) = - ExpDecay(opt, decay, decay_step, clip, start, IdDict()) +function ExpDecay(opt = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4, start = 0) + return ExpDecay(opt, decay, decay_step, clip, start, IdDict()) +end function apply!(o::ExpDecay, x, Δ) - η, s, decay, start = o.eta, o.step, o.decay, o.start - n = o.current[x] = get(o.current, x, 0) + 1 - if n > start && n % s == 0 && count(x -> x > start && x % s == 0, values(o.current)) == 1 - η = max(η * decay, o.clip) - o.eta = η - end - @. Δ *= η + η, s, decay, start = o.eta, o.step, o.decay, o.start + n = o.current[x] = get(o.current, x, 0) + 1 + if n > start && n % s == 0 && + count(x -> x > start && x % s == 0, values(o.current)) == 1 + η = max(η * decay, o.clip) + o.eta = η + end + @. 
Δ *= η end """ WeightDecay(λ = 0) -Decay weights by ``λ``. +Decay weights by ``λ``. Typically composed with other optimizers as the first transformation to the gradient, -making it equivalent to adding ``L_2`` regularization +making it equivalent to adding ``L_2`` regularization with coefficient ``λ`` to the loss. # Examples ```julia -opt = Optimiser(WeightDecay(1f-4), Adam()) +opt = Optimiser(WeightDecay(1.0f-4), Adam()) ``` """ mutable struct WeightDecay <: AbstractOptimiser - wd::Real + wd::Real end WeightDecay() = WeightDecay(0) function apply!(o::WeightDecay, x, Δ) - wd = o.wd - @. Δ += wd * x + wd = o.wd + @. Δ += wd * x end """ diff --git a/src/optimise/train.jl b/src/optimise/train.jl index a1c3e9a7aa..3cea1bcc80 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -1,7 +1,6 @@ using ProgressLogging: @progress, @withprogress, @logprogress import Zygote: Params, gradient, withgradient - """ update!(opt, p, g) update!(opt, ps::Params, gs) @@ -13,16 +12,16 @@ As a result, the parameters are mutated and the optimizer's internal state may c The gradient could be mutated as well. """ function update!(opt::AbstractOptimiser, x, x̄) - x̄r = ArrayInterface.restructure(x, x̄) # address some cases where Zygote's - # output are not mutable, see #1510 - x .-= apply!(opt, x, x̄r) + x̄r = ArrayInterface.restructure(x, x̄) # address some cases where Zygote's + # output are not mutable, see #1510 + return x .-= apply!(opt, x, x̄r) end function update!(opt::AbstractOptimiser, xs::Params, gs) - for x in xs - isnothing(gs[x]) && continue - update!(opt, x, gs[x]) - end + for x in xs + isnothing(gs[x]) && continue + update!(opt, x, gs[x]) + end end # Callback niceties @@ -39,22 +38,24 @@ Call `Flux.skip()` in a callback to indicate when a callback condition is met. This will trigger the train loop to skip the current data point and not update with the calculated gradient. !!! note + `Flux.skip()` will be removed from Flux 0.14 # Examples + ```julia cb = function () - loss() > 1e7 && Flux.skip() + return loss() > 1e7 && Flux.skip() end ``` """ function skip() - Base.depwarn("""Flux.skip() will be removed from Flux 0.14. - and should be replaced with `continue` in an ordinary `for` loop.""", :skip) - throw(SkipException()) + Base.depwarn("""Flux.skip() will be removed from Flux 0.14. + and should be replaced with `continue` in an ordinary `for` loop.""", + :skip) + throw(SkipException()) end - struct StopException <: Exception end """ @@ -64,19 +65,21 @@ Call `Flux.stop()` in a callback to indicate when a callback condition is met. This will trigger the train loop to stop and exit. !!! note + `Flux.stop()` will be removed from Flux 0.14. It should be replaced with `break` in an ordinary `for` loop. # Examples + ```julia cb = function () - accuracy() > 0.9 && Flux.stop() + return accuracy() > 0.9 && Flux.stop() end ``` """ function stop() - Base.depwarn("""Flux.stop() will be removed from Flux 0.14. - It should be replaced with `break` in an ordinary `for` loop.""", :stop) - throw(StopException()) + Base.depwarn("""Flux.stop() will be removed from Flux 0.14. + It should be replaced with `break` in an ordinary `for` loop.""", :stop) + throw(StopException()) end batchmemaybe(x) = tuple(x) @@ -84,69 +87,74 @@ batchmemaybe(x::Tuple) = x """ train!(loss, pars::Params, data, opt::AbstractOptimiser; [cb]) - -Uses a `loss` function and training `data` to improve the + +Uses a `loss` function and training `data` to improve the model's parameters according to a particular optimisation rule `opt`. 
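A minimal end-to-end sketch, in which the model, data and hyperparameters are arbitrary placeholders:

```julia
using Flux

model = Dense(2 => 1)
data = [(rand(Float32, 2, 16), rand(Float32, 1, 16)) for _ in 1:10]  # (input, target) batches
loss(x, y) = Flux.Losses.mse(model(x), y)
opt = Adam(1.0f-3)
Flux.train!(loss, Flux.params(model), data, opt)
```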
For each `d in data`, first the gradient of the `loss` is computed like this: + ``` gradient(() -> loss(d...), pars) # if d isa Tuple gradient(() -> loss(d), pars) # otherwise ``` + Here `pars` is produced by calling [`Flux.params`](@ref) on your model. (Or just on the layers you want to train, like `train!(loss, params(model[1:end-2]), data, opt)`.) This is the "implicit" style of parameter handling. This gradient is then used by optimizer `opt` to update the parameters: + ``` update!(opt, pars, grads) ``` + The optimiser should be from the `Flux.Optimise` module (see [Optimisers](@ref)). Different optimisers can be combined using [`Flux.Optimise.Optimiser`](@ref Flux.Optimiser). This training loop iterates through `data` once. It will stop with a `DomainError` if the loss is `NaN` or infinite. -You can use [`@epochs`](@ref) to do this several times, or +You can use [`@epochs`](@ref) to do this several times, or use for instance `Itertools.ncycle` to make a longer `data` iterator. ## Callbacks [Callbacks](@ref) are given with the keyword argument `cb`. For example, this will print "training" every 10 seconds (using [`Flux.throttle`](@ref)): + ``` train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10)) ``` - + The callback can call [`Flux.stop`](@ref) to interrupt the training loop. Multiple callbacks can be passed to `cb` as array. """ function train!(loss, ps::Params, data, opt::AbstractOptimiser; cb = () -> ()) - cb = runall(cb) - itrsz = Base.IteratorSize(typeof(data)) - n = (itrsz == Base.HasLength()) || (itrsz == Base.HasShape{1}()) ? length(data) : 0 - @withprogress for (i, d) in enumerate(data) - try - l, gs = withgradient(ps) do - loss(batchmemaybe(d)...) - end - if !isfinite(l) - throw(DomainError("Loss is $l on data item $i, stopping training")) - end - update!(opt, ps, gs) - cb() - catch ex - if ex isa StopException - break - elseif ex isa SkipException - continue - else - rethrow(ex) - end + cb = runall(cb) + itrsz = Base.IteratorSize(typeof(data)) + n = (itrsz == Base.HasLength()) || (itrsz == Base.HasShape{1}()) ? length(data) : 0 + @withprogress for (i, d) in enumerate(data) + try + l, gs = withgradient(ps) do + return loss(batchmemaybe(d)...) + end + if !isfinite(l) + throw(DomainError("Loss is $l on data item $i, stopping training")) + end + update!(opt, ps, gs) + cb() + catch ex + if ex isa StopException + break + elseif ex isa SkipException + continue + else + rethrow(ex) + end + end + @logprogress iszero(n) ? nothing : i / n end - @logprogress iszero(n) ? nothing : i / n - end end """ @@ -156,9 +164,11 @@ Run `body` `N` times. Mainly useful for quickly doing multiple epochs of training in a REPL. !!! note + The macro `@epochs` will be removed from Flux 0.14. Please just write an ordinary `for` loop. # Examples + ```julia julia> Flux.@epochs 2 println("hello") [ Info: Epoch 1 @@ -168,10 +178,11 @@ hello ``` """ macro epochs(n, ex) - Base.depwarn("""The macro `@epochs` will be removed from Flux 0.14. - As an alternative, you can write a simple `for i in 1:epochs` loop.""", Symbol("@epochs"), force=true) - :(@progress for i = 1:$(esc(n)) - @info "Epoch $i" - $(esc(ex)) - end) + Base.depwarn("""The macro `@epochs` will be removed from Flux 0.14. 
+ As an alternative, you can write a simple `for i in 1:epochs` loop.""", + Symbol("@epochs"), force = true) + return :(@progress for i in 1:($(esc(n))) + @info "Epoch $i" + $(esc(ex)) + end) end diff --git a/src/outputsize.jl b/src/outputsize.jl index 9fd9545b5f..9c67c5d1ae 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -14,21 +14,21 @@ struct Nil <: Real end @doc @doc(Nil) const nil = Nil() -Nil(::T) where T<:Number = nil -(::Type{T})(::Nil) where T<:Number = nil +Nil(::T) where {T <: Number} = nil +(::Type{T})(::Nil) where {T <: Number} = nil Base.convert(::Type{Nil}, ::Number) = nil Base.float(::Type{Nil}) = Nil for f in [:copy, :zero, :one, :oneunit, - :+, :-, :abs, :abs2, :inv, - :exp, :log, :log1p, :log2, :log10, - :sqrt, :tanh, :conj] - @eval Base.$f(::Nil) = nil + :+, :-, :abs, :abs2, :inv, + :exp, :log, :log1p, :log2, :log10, + :sqrt, :tanh, :conj] + @eval Base.$f(::Nil) = nil end for f in [:+, :-, :*, :/, :^, :mod, :div, :rem] - @eval Base.$f(::Nil, ::Nil) = nil + @eval Base.$f(::Nil, ::Nil) = nil end Base.:<(::Nil, ::Nil) = true @@ -62,10 +62,11 @@ which should work out of the box for custom layers. If `m` is a `Tuple` or `Vector`, its elements are applied in sequence, like `Chain(m...)`. # Examples + ```julia-repl julia> using Flux: outputsize -julia> outputsize(Dense(10 => 4), (10,); padbatch=true) +julia> outputsize(Dense(10 => 4), (10,); padbatch = true) (4, 1) julia> m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)); @@ -73,13 +74,17 @@ julia> m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)); julia> m(randn(Float32, 10, 10, 3, 64)) |> size (6, 6, 32, 64) -julia> outputsize(m, (10, 10, 3); padbatch=true) +julia> outputsize(m, (10, 10, 3); padbatch = true) (6, 6, 32, 1) julia> outputsize(m, (10, 10, 3, 64)) (6, 6, 32, 64) -julia> try outputsize(m, (10, 10, 7, 64)) catch e println(e) end +julia> try + outputsize(m, (10, 10, 7, 64)) + catch e + println(e) + end ┌ Error: layer Conv((3, 3), 3=>16), index 1 in Chain, gave an error with input of size (10, 10, 7, 64) └ @ Flux ~/.julia/dev/Flux/src/outputsize.jl:114 DimensionMismatch("Input channels must match! (7 vs. 3)") @@ -88,27 +93,27 @@ julia> outputsize([Dense(10 => 4), Dense(4 => 2)], (10, 1)) # Vector of layers b (2, 1) ``` """ -function outputsize(m, inputsizes::Tuple...; padbatch=false) - x = nil_input(padbatch, inputsizes...) - return size(m(x)) +function outputsize(m, inputsizes::Tuple...; padbatch = false) + x = nil_input(padbatch, inputsizes...) + return size(m(x)) end -nil_input(pad::Bool, s::Tuple{Vararg{Integer}}) = pad ? fill(nil, (s...,1)) : fill(nil, s) +nil_input(pad::Bool, s::Tuple{Vararg{Integer}}) = pad ? fill(nil, (s..., 1)) : fill(nil, s) nil_input(pad::Bool, multi::Tuple{Vararg{Integer}}...) = nil_input.(pad, multi) nil_input(pad::Bool, tup::Tuple{Vararg{Tuple}}) = nil_input(pad, tup...) -function outputsize(m::Chain, inputsizes::Tuple{Vararg{Integer}}...; padbatch=false) - x = nil_input(padbatch, inputsizes...) - for (i,lay) in enumerate(m.layers) - try - x = lay(x) - catch err - str = x isa AbstractArray ? "with input of size $(size(x))" : "" - @error "layer $lay, index $i in Chain, gave an error $str" - rethrow(err) +function outputsize(m::Chain, inputsizes::Tuple{Vararg{Integer}}...; padbatch = false) + x = nil_input(padbatch, inputsizes...) + for (i, lay) in enumerate(m.layers) + try + x = lay(x) + catch err + str = x isa AbstractArray ? 
"with input of size $(size(x))" : "" + @error "layer $lay, index $i in Chain, gave an error $str" + rethrow(err) + end end - end - return size(x) + return size(x) end """ @@ -118,6 +123,7 @@ For model or layer `m` accepting multiple arrays as input, this returns `size(m((x, y, ...)))` given `size_x = size(x)`, etc. # Examples + ```jldoctest julia> x, y = rand(Float32, 5, 64), rand(Float32, 7, 64); @@ -128,12 +134,13 @@ julia> Flux.outputsize(par, (5, 64), (7, 64)) julia> m = Chain(par, Dense(20 => 13), softmax); -julia> Flux.outputsize(m, (5,), (7,); padbatch=true) +julia> Flux.outputsize(m, (5,), (7,); padbatch = true) (13, 1) julia> par(x, y) == par((x, y)) == Chain(par, identity)((x, y)) true ``` + Notice that `Chain` only accepts multiple arrays as a tuple, while `Parallel` also accepts them as multiple arguments; `outputsize` always supplies the tuple. @@ -142,38 +149,43 @@ outputsize ## make tuples and vectors be like Chains -outputsize(m::Tuple, input::Tuple...; padbatch=false) = outputsize(Chain(m...), input...; padbatch=padbatch) -outputsize(m::AbstractVector, input::Tuple...; padbatch=false) = outputsize(Chain(m...), input...; padbatch=padbatch) +function outputsize(m::Tuple, input::Tuple...; padbatch = false) + return outputsize(Chain(m...), input...; padbatch = padbatch) +end +function outputsize(m::AbstractVector, input::Tuple...; padbatch = false) + return outputsize(Chain(m...), input...; padbatch = padbatch) +end ## bypass statistics in normalization layers for layer in (:BatchNorm, :InstanceNorm, :GroupNorm) # LayerNorm works fine - @eval function (l::$layer)(x::AbstractArray{Nil}) - l.chs == size(x, ndims(x)-1) || throw(DimensionMismatch( - string($layer, " expected ", l.chs, " channels, but got size(x) == ", size(x)))) - x - end + @eval function (l::$layer)(x::AbstractArray{Nil}) + l.chs == size(x, ndims(x) - 1) || + throw(DimensionMismatch(string($layer, " expected ", l.chs, + " channels, but got size(x) == ", size(x)))) + return x + end end ## fixes for layers that don't work out of the box for (fn, Dims) in ((:conv, DenseConvDims),) - @eval begin - function NNlib.$fn(a::AbstractArray{Nil}, b::AbstractArray{Nil}, dims::$Dims) - fill(nil, NNlib.output_size(dims)..., NNlib.channels_out(dims), size(a)[end]) - end - - function NNlib.$fn(a::AbstractArray{<:Real}, b::AbstractArray{Nil}, dims::$Dims) - NNlib.$fn(fill(nil, size(a)), b, dims) - end - - function NNlib.$fn(a::AbstractArray{Nil}, b::AbstractArray{<:Real}, dims::$Dims) - NNlib.$fn(a, fill(nil, size(b)), dims) + @eval begin + function NNlib.$fn(a::AbstractArray{Nil}, b::AbstractArray{Nil}, dims::$Dims) + return fill(nil, NNlib.output_size(dims)..., NNlib.channels_out(dims), + size(a)[end]) + end + + function NNlib.$fn(a::AbstractArray{<:Real}, b::AbstractArray{Nil}, dims::$Dims) + return NNlib.$fn(fill(nil, size(a)), b, dims) + end + + function NNlib.$fn(a::AbstractArray{Nil}, b::AbstractArray{<:Real}, dims::$Dims) + return NNlib.$fn(a, fill(nil, size(b)), dims) + end end - end end - """ @autosize (size...,) Chain(Layer(_ => 2), Layer(_), ...) @@ -187,6 +199,7 @@ The underscore may appear as an argument of a layer, or inside a `=>`. It may be used in further calculations, such as `Dense(_ => _÷4)`. # Examples + ``` julia> @autosize (3, 1) Chain(Dense(_ => 2, sigmoid), BatchNorm(_, affine=false)) Chain( @@ -226,45 +239,49 @@ julia> outputsize(ans, (28, 28, 1, 32)) ``` Limitations: -* While `@autosize (5, 32) Flux.Bilinear(_ => 7)` is OK, something like `Bilinear((_, _) => 7)` will fail. 
-* While `Scale(_)` and `LayerNorm(_)` are fine (and use the first dimension), `Scale(_,_)` and `LayerNorm(_,_)` - will fail if `size(x,1) != size(x,2)`. -* RNNs won't work: `@autosize (7, 11) LSTM(_ => 5)` fails, because `outputsize(RNN(3=>7), (3,))` also fails, a known issue. + + - While `@autosize (5, 32) Flux.Bilinear(_ => 7)` is OK, something like `Bilinear((_, _) => 7)` will fail. + - While `Scale(_)` and `LayerNorm(_)` are fine (and use the first dimension), `Scale(_,_)` and `LayerNorm(_,_)` + will fail if `size(x,1) != size(x,2)`. + - RNNs won't work: `@autosize (7, 11) LSTM(_ => 5)` fails, because `outputsize(RNN(3=>7), (3,))` also fails, a known issue. """ macro autosize(size, model) - Meta.isexpr(size, :tuple) || error("@autosize's first argument must be a tuple, the size of the input") - Meta.isexpr(model, :call) || error("@autosize's second argument must be something like Chain(layers...)") - ex = _makelazy(model) - @gensym m - quote - $m = $ex - $outputsize($m, $size) - $striplazy($m) - end |> esc + Meta.isexpr(size, :tuple) || + error("@autosize's first argument must be a tuple, the size of the input") + Meta.isexpr(model, :call) || + error("@autosize's second argument must be something like Chain(layers...)") + ex = _makelazy(model) + @gensym m + return quote + $m = $ex + $outputsize($m, $size) + $striplazy($m) + end |> esc end function _makelazy(ex::Expr) - n = _underscoredepth(ex) - n == 0 && return ex - n == 1 && error("@autosize doesn't expect an underscore here: $ex") - n == 2 && return :($LazyLayer($(string(ex)), $(_makefun(ex)), nothing)) - n > 2 && return Expr(ex.head, map(_makelazy, ex.args)...) + n = _underscoredepth(ex) + n == 0 && return ex + n == 1 && error("@autosize doesn't expect an underscore here: $ex") + n == 2 && return :($LazyLayer($(string(ex)), $(_makefun(ex)), nothing)) + n > 2 && return Expr(ex.head, map(_makelazy, ex.args)...) end _makelazy(x) = x function _underscoredepth(ex::Expr) - # Meta.isexpr(ex, :tuple) && :_ in ex.args && return 10 - ex.head in (:call, :kw, :(->), :block, :parameters) || return 0 - ex.args[1] === :(=>) && ex.args[2] === :_ && return 1 - m = maximum(_underscoredepth, ex.args) - m == 0 ? 0 : m+1 + # Meta.isexpr(ex, :tuple) && :_ in ex.args && return 10 + ex.head in (:call, :kw, :(->), :block, :parameters) || return 0 + ex.args[1] === :(=>) && ex.args[2] === :_ && return 1 + m = maximum(_underscoredepth, ex.args) + return m == 0 ? 0 : m + 1 end _underscoredepth(ex) = Int(ex === :_) function _makefun(ex) - T = Meta.isexpr(ex, :call) ? ex.args[1] : Type - @gensym x s - Expr(:(->), x, Expr(:block, :($s = $autosizefor($T, $x)), _replaceunderscore(ex, s))) + T = Meta.isexpr(ex, :call) ? ex.args[1] : Type + @gensym x s + return Expr(:(->), x, + Expr(:block, :($s = $autosizefor($T, $x)), _replaceunderscore(ex, s))) end """ @@ -274,62 +291,71 @@ If an `_` in your layer's constructor, used within `@autosize`, should *not* mean the 2nd-last dimension, then you can overload this. For instance `autosizefor(::Type{<:Dense}, x::AbstractArray) = size(x, 1)` -is needed to make `@autosize (2,3,4) Dense(_ => 5)` return +is needed to make `@autosize (2,3,4) Dense(_ => 5)` return `Dense(2 => 5)` rather than `Dense(3 => 5)`. 
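A user-defined layer can hook into the same mechanism. A sketch, where `MyFlatten` is a
hypothetical layer whose `_` should mean the product of all non-batch dimensions:

```julia
function Flux.autosizefor(::Type{<:MyFlatten}, x::AbstractArray)
    return prod(size(x)[1:(end - 1)])  # hypothetical: flatten everything except the batch
end
```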
""" -autosizefor(::Type, x::AbstractArray) = size(x, max(1, ndims(x)-1)) +autosizefor(::Type, x::AbstractArray) = size(x, max(1, ndims(x) - 1)) autosizefor(::Type{<:Dense}, x::AbstractArray) = size(x, 1) autosizefor(::Type{<:Embedding}, x::AbstractArray) = size(x, 1) autosizefor(::Type{<:LayerNorm}, x::AbstractArray) = size(x, 1) _replaceunderscore(e, s) = e === :_ ? s : e -_replaceunderscore(ex::Expr, s) = Expr(ex.head, map(a -> _replaceunderscore(a, s), ex.args)...) +function _replaceunderscore(ex::Expr, s) + return Expr(ex.head, map(a -> _replaceunderscore(a, s), ex.args)...) +end mutable struct LazyLayer - str::String - make::Function - layer + str::String + make::Function + layer::Any end function (l::LazyLayer)(x::AbstractArray, ys::AbstractArray...) - l.layer === nothing || return l.layer(x, ys...) - made = l.make(x) # for something like `Bilinear((_,__) => 7)`, perhaps need `make(xy...)`, later. - y = made(x, ys...) - l.layer = made # mutate after we know that call worked - return y + l.layer === nothing || return l.layer(x, ys...) + made = l.make(x) # for something like `Bilinear((_,__) => 7)`, perhaps need `make(xy...)`, later. + y = made(x, ys...) + l.layer = made # mutate after we know that call worked + return y end function striplazy(m) - fs, re = functor(m) - re(map(striplazy, fs)) + fs, re = functor(m) + return re(map(striplazy, fs)) end function striplazy(l::LazyLayer) - l.layer === nothing || return l.layer - error("LazyLayer should be initialised, e.g. by outputsize(model, size), before using stiplazy") + l.layer === nothing || return l.layer + return error("LazyLayer should be initialised, e.g. by outputsize(model, size), before using stiplazy") end # Could make LazyLayer usable outside of @autosize, for instance allow Chain(@lazy Dense(_ => 2))? # But then it will survive to produce weird structural gradients etc. function ChainRulesCore.rrule(l::LazyLayer, x) - l(x), _ -> error("LazyLayer should never be used within a gradient. Call striplazy(model) first to remove all.") + return l(x), + _ -> error("LazyLayer should never be used within a gradient. Call striplazy(model) first to remove all.") end function ChainRulesCore.rrule(::typeof(striplazy), m) - striplazy(m), _ -> error("striplazy should never be used within a gradient") + return striplazy(m), _ -> error("striplazy should never be used within a gradient") end -params!(p::Params, x::LazyLayer, seen = IdSet()) = error("LazyLayer should never be used within params(m). Call striplazy(m) first.") +function params!(p::Params, x::LazyLayer, seen = IdSet()) + return error("LazyLayer should never be used within params(m). 
Call striplazy(m) first.") +end -Functors.functor(::Type{<:LazyLayer}, x) = error("LazyLayer should not be walked with Functors.jl, as the arrays which Flux.gpu wants to move may not exist yet.") +function Functors.functor(::Type{<:LazyLayer}, x) + return error("LazyLayer should not be walked with Functors.jl, as the arrays which Flux.gpu wants to move may not exist yet.") +end function Base.show(io::IO, l::LazyLayer) - printstyled(io, "LazyLayer(", color=:light_black) - if l.layer == nothing - printstyled(io, l.str, color=:magenta) - else - printstyled(io, l.layer, color=:cyan) - end - printstyled(io, ")", color=:light_black) + printstyled(io, "LazyLayer(", color = :light_black) + if l.layer == nothing + printstyled(io, l.str, color = :magenta) + else + printstyled(io, l.layer, color = :cyan) + end + return printstyled(io, ")", color = :light_black) end -_big_show(io::IO, l::LazyLayer, indent::Int=0, name=nothing) = _layer_show(io, l, indent, name) +function _big_show(io::IO, l::LazyLayer, indent::Int = 0, name = nothing) + return _layer_show(io, l, indent, name) +end diff --git a/src/utils.jl b/src/utils.jl index 884fcd7465..d25c00525b 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -18,7 +18,7 @@ julia> layer = Dense(10, 20); julia> Flux.nfan(size(layer.weight)) (10, 20) -julia> layer = Conv((3, 3), 2=>10); +julia> layer = Conv((3, 3), 2 => 10); julia> Flux.nfan(size(layer.weight)) (18, 90) @@ -28,7 +28,7 @@ nfan() = 1, 1 # fan_in, fan_out nfan(n) = 1, n # A vector is treated as a n×1 matrix nfan(n_out, n_in) = n_in, n_out # In case of Dense kernels: arranged as matrices nfan(dims::Tuple) = nfan(dims...) -nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) # In case of convolution kernels +nfan(dims...) = prod(dims[1:(end - 2)]) .* (dims[end - 1], dims[end]) # In case of convolution kernels ofeltype(x, y) = convert(float(eltype(x)), y) epseltype(x) = eps(float(eltype(x))) @@ -38,10 +38,13 @@ epseltype(x) = eps(float(eltype(x))) Create an instance of the RNG most appropriate for `x`. The current defaults are: -- `x isa CuArray`: `CUDA.default_rng()`, else: -- `x isa AbstractArray`, or no `x` provided: - - Julia version is < 1.7: `Random.GLOBAL_RNG` - - Julia version is >= 1.7: `Random.default_rng()` + + - `x isa CuArray`: `CUDA.default_rng()`, else: + + - `x isa AbstractArray`, or no `x` provided: + + + Julia version is < 1.7: `Random.GLOBAL_RNG` + + Julia version is >= 1.7: `Random.default_rng()` """ rng_from_array(::AbstractArray) = default_rng_value() rng_from_array(::CuArray) = CUDA.default_rng() @@ -49,17 +52,18 @@ rng_from_array(::CuArray) = CUDA.default_rng() @non_differentiable rng_from_array(::Any) if VERSION >= v"1.7" - default_rng_value() = Random.default_rng() + default_rng_value() = Random.default_rng() else - default_rng_value() = Random.GLOBAL_RNG + default_rng_value() = Random.GLOBAL_RNG end """ default_rng_value() Create an instance of the default RNG depending on Julia's version. -- Julia version is < 1.7: `Random.GLOBAL_RNG` -- Julia version is >= 1.7: `Random.default_rng()` + + - Julia version is < 1.7: `Random.GLOBAL_RNG` + - Julia version is >= 1.7: `Random.default_rng()` """ default_rng_value @@ -73,17 +77,18 @@ distribution on the interval ``[-x, x]``, where `x = gain * sqrt(6 / (fan_in + f This method is described in [1] and also known as Xavier initialization. 
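A quick check of that bound, using [`nfan`](@ref Flux.nfan) for the weight of a `Dense(10 => 4)` layer:

```julia
using Flux

fan_in, fan_out = Flux.nfan(4, 10)    # the weight matrix has size (4, 10)
bound = sqrt(6 / (fan_in + fan_out))  # ≈ 0.65 with the default gain = 1
extrema(Flux.glorot_uniform(4, 10))   # both extremes should lie within ±bound
```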
# Examples + ```jldoctest; setup = :(using Random; Random.seed!(0)) julia> Flux.glorot_uniform(3, 4) |> summary "3×4 Matrix{Float32}" -julia> round.(extrema(Flux.glorot_uniform(10, 100)), digits=3) +julia> round.(extrema(Flux.glorot_uniform(10, 100)), digits = 3) (-0.232f0, 0.234f0) -julia> round.(extrema(Flux.glorot_uniform(100, 10)), digits=3) +julia> round.(extrema(Flux.glorot_uniform(100, 10)), digits = 3) (-0.233f0, 0.233f0) -julia> round.(extrema(Flux.glorot_uniform(100, 100)), digits=3) +julia> round.(extrema(Flux.glorot_uniform(100, 100)), digits = 3) (-0.173f0, 0.173f0) julia> Dense(3 => 2, tanh; init = Flux.glorot_uniform(MersenneTwister(1))) @@ -99,12 +104,16 @@ julia> ans.bias [1] Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of training deep feedforward neural networks." _Proceedings of the thirteenth international conference on artificial intelligence and statistics_. 2010. """ -function glorot_uniform(rng::AbstractRNG, dims::Integer...; gain::Real=1) - scale = Float32(gain) * sqrt(24.0f0 / sum(nfan(dims...))) - (rand(rng, Float32, dims...) .- 0.5f0) .* scale +function glorot_uniform(rng::AbstractRNG, dims::Integer...; gain::Real = 1) + scale = Float32(gain) * sqrt(24.0f0 / sum(nfan(dims...))) + return (rand(rng, Float32, dims...) .- 0.5f0) .* scale +end +function glorot_uniform(dims::Integer...; kw...) + return glorot_uniform(default_rng_value(), dims...; kw...) +end +function glorot_uniform(rng::AbstractRNG = default_rng_value(); init_kwargs...) + return (dims...; kwargs...) -> glorot_uniform(rng, dims...; init_kwargs..., kwargs...) end -glorot_uniform(dims::Integer...; kw...) = glorot_uniform(default_rng_value(), dims...; kw...) -glorot_uniform(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims...; kwargs...) -> glorot_uniform(rng, dims...; init_kwargs..., kwargs...) ChainRulesCore.@non_differentiable glorot_uniform(::Any...) @@ -119,22 +128,23 @@ using [`nfan`](@ref Flux.nfan). This method is described in [1] and also known as Xavier initialization. # Examples + ```jldoctest; setup = :(using Random; Random.seed!(0)) julia> using Statistics -julia> round(std(Flux.glorot_normal(10, 1000)), digits=3) +julia> round(std(Flux.glorot_normal(10, 1000)), digits = 3) 0.044f0 -julia> round(std(Flux.glorot_normal(1000, 10)), digits=3) +julia> round(std(Flux.glorot_normal(1000, 10)), digits = 3) 0.044f0 -julia> round(std(Flux.glorot_normal(1000, 1000)), digits=3) +julia> round(std(Flux.glorot_normal(1000, 1000)), digits = 3) 0.032f0 -julia> Dense(10 => 1000, tanh; init = Flux.glorot_normal(gain=100)) +julia> Dense(10 => 1000, tanh; init = Flux.glorot_normal(gain = 100)) Dense(10 => 1000, tanh) # 11_000 parameters -julia> round(std(ans.weight), sigdigits=3) +julia> round(std(ans.weight), sigdigits = 3) 4.45f0 ``` @@ -142,12 +152,16 @@ julia> round(std(ans.weight), sigdigits=3) [1] Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of training deep feedforward neural networks." _Proceedings of the thirteenth international conference on artificial intelligence and statistics_. 2010. """ -function glorot_normal(rng::AbstractRNG, dims::Integer...; gain::Real=1) - std = Float32(gain) * sqrt(2.0f0 / sum(nfan(dims...))) - randn(rng, Float32, dims...) .* std +function glorot_normal(rng::AbstractRNG, dims::Integer...; gain::Real = 1) + std = Float32(gain) * sqrt(2.0f0 / sum(nfan(dims...))) + return randn(rng, Float32, dims...) .* std +end +function glorot_normal(dims::Integer...; kwargs...) 
+ return glorot_normal(default_rng_value(), dims...; kwargs...) +end +function glorot_normal(rng::AbstractRNG = default_rng_value(); init_kwargs...) + return (dims...; kwargs...) -> glorot_normal(rng, dims...; init_kwargs..., kwargs...) end -glorot_normal(dims::Integer...; kwargs...) = glorot_normal(default_rng_value(), dims...; kwargs...) -glorot_normal(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims...; kwargs...) -> glorot_normal(rng, dims...; init_kwargs..., kwargs...) ChainRulesCore.@non_differentiable glorot_normal(::Any...) @@ -161,14 +175,15 @@ on the interval `[-x, x]`, where `x = gain * sqrt(3/fan_in)` using [`nfan`](@ref This method is described in [1] and also known as He initialization. # Examples + ```jldoctest; setup = :(using Random; Random.seed!(0)) -julia> round.(extrema(Flux.kaiming_uniform(100, 10)), digits=3) +julia> round.(extrema(Flux.kaiming_uniform(100, 10)), digits = 3) (-0.774f0, 0.774f0) -julia> round.(extrema(Flux.kaiming_uniform(10, 100)), digits=3) +julia> round.(extrema(Flux.kaiming_uniform(10, 100)), digits = 3) (-0.245f0, 0.244f0) -julia> round.(extrema(Flux.kaiming_uniform(100, 100)), digits=3) +julia> round.(extrema(Flux.kaiming_uniform(100, 100)), digits = 3) (-0.245f0, 0.245f0) ``` @@ -177,12 +192,16 @@ julia> round.(extrema(Flux.kaiming_uniform(100, 100)), digits=3) [1] He, Kaiming, et al. "Delving deep into rectifiers: Surpassing human-level performance on imagenet classification." _Proceedings of the IEEE international conference on computer vision_. 2015. """ function kaiming_uniform(rng::AbstractRNG, dims::Integer...; gain::Real = √2) - bound = Float32(√3 * gain / sqrt(first(nfan(dims...)))) # fan_in - return (rand(rng, Float32, dims...) .- 0.5f0) .* 2bound + bound = Float32(√3 * gain / sqrt(first(nfan(dims...)))) # fan_in + return (rand(rng, Float32, dims...) .- 0.5f0) .* 2bound end -kaiming_uniform(dims::Integer...; kwargs...) = kaiming_uniform(default_rng_value(), dims...; kwargs...) -kaiming_uniform(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims...; kwargs...) -> kaiming_uniform(rng, dims...; init_kwargs..., kwargs...) +function kaiming_uniform(dims::Integer...; kwargs...) + return kaiming_uniform(default_rng_value(), dims...; kwargs...) +end +function kaiming_uniform(rng::AbstractRNG = default_rng_value(); init_kwargs...) + return (dims...; kwargs...) -> kaiming_uniform(rng, dims...; init_kwargs..., kwargs...) +end ChainRulesCore.@non_differentiable kaiming_uniform(::Any...) @@ -196,16 +215,17 @@ distribution standard deviation `gain / sqrt(fan_in)`, using [`nfan`](@ref Flux. This method is described in [1] and also known as He initialization. # Examples + ```jldoctest; setup = :(using Random; Random.seed!(0)) julia> using Statistics -julia> round(std(Flux.kaiming_normal(10, 1000)), digits=3) +julia> round(std(Flux.kaiming_normal(10, 1000)), digits = 3) 0.045f0 -julia> round(std(Flux.kaiming_normal(1000, 10)), digits=3) +julia> round(std(Flux.kaiming_normal(1000, 10)), digits = 3) 0.447f0 -julia> round(std(Flux.kaiming_normal(1000, 1000)), digits=3) +julia> round(std(Flux.kaiming_normal(1000, 1000)), digits = 3) 0.045f0 ``` @@ -213,20 +233,24 @@ julia> round(std(Flux.kaiming_normal(1000, 1000)), digits=3) [1] He, Kaiming, et al. "Delving deep into rectifiers: Surpassing human-level performance on imagenet classification." _Proceedings of the IEEE international conference on computer vision_. 2015. 
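Like the Glorot initialisers above, both Kaiming variants can be passed to a layer's `init`
keyword, either directly or as a closure with custom options; a sketch with arbitrary sizes:

```julia
using Flux

Conv((3, 3), 3 => 8; init = Flux.kaiming_normal)       # default gain = √2
Dense(16 => 4; init = Flux.kaiming_uniform(gain = 1))  # closure with a custom gain
```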
""" -function kaiming_normal(rng::AbstractRNG, dims::Integer...; gain::Real = √2f0) - std = Float32(gain / sqrt(first(nfan(dims...)))) # fan_in - return randn(rng, Float32, dims...) .* std +function kaiming_normal(rng::AbstractRNG, dims::Integer...; gain::Real = √2.0f0) + std = Float32(gain / sqrt(first(nfan(dims...)))) # fan_in + return randn(rng, Float32, dims...) .* std end -kaiming_normal(dims::Integer...; kwargs...) = kaiming_normal(default_rng_value(), dims...; kwargs...) -kaiming_normal(rng::AbstractRNG; init_kwargs...) = (dims...; kwargs...) -> kaiming_normal(rng, dims...; init_kwargs..., kwargs...) +function kaiming_normal(dims::Integer...; kwargs...) + return kaiming_normal(default_rng_value(), dims...; kwargs...) +end +function kaiming_normal(rng::AbstractRNG; init_kwargs...) + return (dims...; kwargs...) -> kaiming_normal(rng, dims...; init_kwargs..., kwargs...) +end ChainRulesCore.@non_differentiable kaiming_normal(::Any...) """ truncated_normal([rng = default_rng_value()], size...; mean = 0, std = 1, lo = -2, hi = 2) -> Array truncated_normal([rng]; kw...) -> Function - + Return an `Array{Float32}` of the given `size` where each element is drawn from a truncated normal distribution. The numbers are distributed like `filter(x -> lo<=x<=hi, mean .+ std .* randn(100))`. @@ -235,37 +259,43 @@ applying the inverse CDF of the truncated normal distribution. This method works best when `lo ≤ mean ≤ hi`. # Examples + ```jldoctest julia> using Statistics julia> Flux.truncated_normal(3, 4) |> summary "3×4 Matrix{Float32}" -julia> round.(extrema(Flux.truncated_normal(10^6)); digits=3) +julia> round.(extrema(Flux.truncated_normal(10^6)); digits = 3) (-2.0f0, 2.0f0) julia> round(std(Flux.truncated_normal(10^6; lo = -100, hi = 100))) 1.0f0 ``` """ -function truncated_normal(rng::AbstractRNG, dims::Integer...; mean = 0, std = 1, lo = -2, hi = 2) - norm_cdf(x) = 0.5 * (1 + erf(x/√2)) - if (mean < lo - 2 * std) || (mean > hi + 2 * std) - @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." maxlog=1 - end - l = norm_cdf((lo - mean) / std) - u = norm_cdf((hi - mean) / std) - xs = rand(rng, Float32, dims...) - broadcast!(xs, xs) do x - x = x * 2(u - l) + (2l - 1) - x = erfinv(x) - x = clamp(x * std * √2 + mean, lo, hi) - end - return xs -end - -truncated_normal(dims::Integer...; kwargs...) = truncated_normal(default_rng_value(), dims...; kwargs...) -truncated_normal(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims...; kwargs...) -> truncated_normal(rng, dims...; init_kwargs..., kwargs...) +function truncated_normal(rng::AbstractRNG, dims::Integer...; mean = 0, std = 1, lo = -2, + hi = 2) + norm_cdf(x) = 0.5 * (1 + erf(x / √2)) + if (mean < lo - 2 * std) || (mean > hi + 2 * std) + @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." maxlog=1 + end + l = norm_cdf((lo - mean) / std) + u = norm_cdf((hi - mean) / std) + xs = rand(rng, Float32, dims...) + broadcast!(xs, xs) do x + x = x * 2(u - l) + (2l - 1) + x = erfinv(x) + return x = clamp(x * std * √2 + mean, lo, hi) + end + return xs +end + +function truncated_normal(dims::Integer...; kwargs...) + return truncated_normal(default_rng_value(), dims...; kwargs...) +end +function truncated_normal(rng::AbstractRNG = default_rng_value(); init_kwargs...) + return (dims...; kwargs...) -> truncated_normal(rng, dims...; init_kwargs..., kwargs...) 
+end ChainRulesCore.@non_differentiable truncated_normal(::Any...) @@ -280,6 +310,7 @@ For `length(size) > 2`, a `prod(size[1:(end - 1)])` by `size[end]` orthogonal ma is computed before reshaping it to the original dimensions. # Examples + ```jldoctest; setup = :(using LinearAlgebra) julia> W = Flux.orthogonal(5, 7); @@ -306,27 +337,31 @@ true # References [1] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120 - """ function orthogonal(rng::AbstractRNG, rows::Integer, cols::Integer; gain::Real = 1) - if rows < cols - return permutedims(orthogonal(rng, cols, rows; gain)) - end - mat = randn(rng, Float32, rows, cols) - Q, R = LinearAlgebra.qr(mat) - mat .= Array(Q) * sign.(LinearAlgebra.Diagonal(R)) .* Float32(gain) - return mat + if rows < cols + return permutedims(orthogonal(rng, cols, rows; gain)) + end + mat = randn(rng, Float32, rows, cols) + Q, R = LinearAlgebra.qr(mat) + mat .= Array(Q) * sign.(LinearAlgebra.Diagonal(R)) .* Float32(gain) + return mat end function orthogonal(rng::AbstractRNG, d1::Integer, ds::Integer...; kwargs...) - dims = (d1, ds...) - rows = prod(dims[1:end-1]) - cols = dims[end] - return reshape(orthogonal(rng, rows, cols; kwargs...), dims) + dims = (d1, ds...) + rows = prod(dims[1:(end - 1)]) + cols = dims[end] + return reshape(orthogonal(rng, rows, cols; kwargs...), dims) end -orthogonal(dims::Integer...; kwargs...) = orthogonal(default_rng_value(), dims...; kwargs...) -orthogonal(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims::Integer...; kwargs...) -> orthogonal(rng, dims...; init_kwargs..., kwargs...) +function orthogonal(dims::Integer...; kwargs...) + return orthogonal(default_rng_value(), dims...; kwargs...) +end +function orthogonal(rng::AbstractRNG = default_rng_value(); init_kwargs...) + return (dims::Integer...; kwargs...) -> orthogonal(rng, dims...; init_kwargs..., + kwargs...) +end ChainRulesCore.@non_differentiable orthogonal(::Any...) @@ -341,18 +376,19 @@ with a mean of zero and standard deviation `std`. This method is described in [1]. # Examples + ```jldoctest; setup = :(using Random; Random.seed!(0)) -julia> count(iszero, Flux.sparse_init(10, 10, sparsity=1/5)) +julia> count(iszero, Flux.sparse_init(10, 10, sparsity = 1 / 5)) 20 -julia> sum(0 .== Flux.sparse_init(10, 11, sparsity=0.9), dims=1) +julia> sum(0 .== Flux.sparse_init(10, 11, sparsity = 0.9), dims = 1) 1×11 Matrix{Int64}: 9 9 9 9 9 9 9 9 9 9 9 -julia> Dense(3 => 10, tanh; init=Flux.sparse_init(sparsity=0.5)) +julia> Dense(3 => 10, tanh; init = Flux.sparse_init(sparsity = 0.5)) Dense(3 => 10, tanh) # 40 parameters -julia> count(iszero, ans.weight, dims=1) +julia> count(iszero, ans.weight, dims = 1) 1×3 Matrix{Int64}: 5 5 5 ``` @@ -362,19 +398,23 @@ julia> count(iszero, ans.weight, dims=1) [1] Martens, J, "Deep learning via Hessian-free optimization" _Proceedings of the 27th International Conference on International Conference on Machine Learning_. 2010. """ function sparse_init(rng::AbstractRNG, dims::Integer...; sparsity, std = 0.01) - if length(dims) != 2 - throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) - end - rows, cols = dims - prop_zero = min(1.0, sparsity) - num_zeros = ceil(Integer, prop_zero * rows) - sparse_array = randn(rng, Float32, dims...) 
.* Float32(std) - sparse_array[1:num_zeros, :] .= 0f0 - return mapslices(shuffle, sparse_array, dims=1) + if length(dims) != 2 + throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) + end + rows, cols = dims + prop_zero = min(1.0, sparsity) + num_zeros = ceil(Integer, prop_zero * rows) + sparse_array = randn(rng, Float32, dims...) .* Float32(std) + sparse_array[1:num_zeros, :] .= 0.0f0 + return mapslices(shuffle, sparse_array, dims = 1) end -sparse_init(dims::Integer...; kwargs...) = sparse_init(default_rng_value(), dims...; kwargs...) -sparse_init(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims...; kwargs...) -> sparse_init(rng, dims...; init_kwargs..., kwargs...) +function sparse_init(dims::Integer...; kwargs...) + return sparse_init(default_rng_value(), dims...; kwargs...) +end +function sparse_init(rng::AbstractRNG = default_rng_value(); init_kwargs...) + return (dims...; kwargs...) -> sparse_init(rng, dims...; init_kwargs..., kwargs...) +end ChainRulesCore.@non_differentiable sparse_init(::Any...) @@ -389,20 +429,21 @@ Often useful in the context of transfer learning, i.e when one wants to add more a model but start from the same mapping. Has the following behaviour -* 1D: A `Vector` of `zeros` (useful for an identity bias) -* 2D: An identity matrix (useful for an identity matrix multiplication) -* More than 2D: A dense block array of center tap spatial filters (useful for an identity convolution) -Some caveats: -* Not all layers will be identity mapping when used with this init. Exceptions - include recurrent layers and normalization layers. + - 1D: A `Vector` of `zeros` (useful for an identity bias) + - 2D: An identity matrix (useful for an identity matrix multiplication) + - More than 2D: A dense block array of center tap spatial filters (useful for an identity convolution) + +Some caveats: -* Layers must have `input_size == output_size` for identity mapping to be - possible. When this is not the case, extra dimensions of the array are padded with zeros. + - Not all layers will be identity mapping when used with this init. Exceptions + include recurrent layers and normalization layers. -* For convolutional layers, in addition to the above, the kernel sizes must also be odd and - padding must be applied so that output feature maps have the same size as input feature maps, - e.g by using [`SamePad`](@ref). + - Layers must have `input_size == output_size` for identity mapping to be + possible. When this is not the case, extra dimensions of the array are padded with zeros. + - For convolutional layers, in addition to the above, the kernel sizes must also be odd and + padding must be applied so that output feature maps have the same size as input feature maps, + e.g by using [`SamePad`](@ref). Use keyword `shift` (integer or tuple) to apply circular shift to the output, equivalent to `Base.circshift(identity_init(size...), shift)`. @@ -411,20 +452,21 @@ For consistency with other initialisers, it accepts `rng::AbstractRNG` as an opt first argument. But this is ignored, since the result is not random. 
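The `shift` keyword is easiest to see on a small matrix; a sketch (the values follow from
`circshift` along the first dimension):

```julia
Flux.identity_init(3, 5)             # 3×3 identity block, then two zero columns
Flux.identity_init(3, 5; shift = 1)  # the same matrix with its rows rotated down by one
```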
# Examples + ```jldoctest -julia> Flux.identity_init(3,5) +julia> Flux.identity_init(3, 5) 3×5 Matrix{Float32}: 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 -julia> Dense(5 => 3, relu, init=Flux.identity_init)([1,-2,3,-4,5]) +julia> Dense(5 => 3, relu, init = Flux.identity_init)([1, -2, 3, -4, 5]) 3-element Vector{Float32}: 1.0 0.0 3.0 -julia> Flux.identity_init(3,3,2; gain=100) +julia> Flux.identity_init(3, 3, 2; gain = 100) 3×3×2 Array{Float32, 3}: [:, :, 1] = 0.0 0.0 0.0 @@ -436,35 +478,42 @@ julia> Flux.identity_init(3,3,2; gain=100) 0.0 100.0 0.0 0.0 0.0 0.0 -julia> x4 = cat([1 2 3; 4 5 6; 7 8 9]; dims=4); +julia> x4 = cat([1 2 3; 4 5 6; 7 8 9]; dims = 4); -julia> Conv((2,2), 1 => 1, init=Flux.identity_init(gain=10), pad=SamePad())(x4) +julia> Conv((2, 2), 1 => 1, init = Flux.identity_init(gain = 10), pad = SamePad())(x4) 3×3×1×1 Array{Float32, 4}: [:, :, 1, 1] = 10.0 20.0 30.0 40.0 50.0 60.0 70.0 80.0 90.0 +``` # Assume bias ``` """ -identity_init(cols::Integer; gain::Real=1, shift=0) = zeros32(cols) # Assume bias +identity_init(cols::Integer; gain::Real = 1, shift = 0) = zeros32(cols) # Assume bias # Assume matrix multiplication -identity_init(rows::Integer, cols::Integer; gain::Real=1, shift=0) = circshift(Matrix{Float32}(I * gain, rows,cols), shift) +function identity_init(rows::Integer, cols::Integer; gain::Real = 1, shift = 0) + return circshift(Matrix{Float32}(I * gain, rows, cols), shift) +end # Assume convolution -function identity_init(dims::Integer...; gain::Real=1, shift=0) - nin, nout = dims[end-1], dims[end] - centers = map(d -> cld(d, 2), dims[1:end-2]) - weights = zeros32(dims...) - for i in 1:min(nin,nout) - weights[centers..., i, i] = gain - end - return circshift(weights, shift) +function identity_init(dims::Integer...; gain::Real = 1, shift = 0) + nin, nout = dims[end - 1], dims[end] + centers = map(d -> cld(d, 2), dims[1:(end - 2)]) + weights = zeros32(dims...) + for i in 1:min(nin, nout) + weights[centers..., i, i] = gain + end + return circshift(weights, shift) end # For consistency, it accepts an RNG, but ignores it: -identity_init(::AbstractRNG, dims::Integer...; kwargs...) = identity_init(dims...; kwargs...) -identity_init(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (args...;kwargs...) -> identity_init(rng, args...; init_kwargs..., kwargs...) +function identity_init(::AbstractRNG, dims::Integer...; kwargs...) + return identity_init(dims...; kwargs...) +end +function identity_init(rng::AbstractRNG = default_rng_value(); init_kwargs...) + return (args...; kwargs...) -> identity_init(rng, args...; init_kwargs..., kwargs...) +end ChainRulesCore.@non_differentiable identity_init(::Any...) @@ -511,20 +560,20 @@ randn32(rng::AbstractRNG) = (dims...,) -> Base.randn(rng, Float32, dims...) Return a bias parameter for a layer, based on the value given to the constructor's keyword `bias=bias`. -* `bias == true` creates a trainable array of the given size, of the same type as `weights`, initialised to zero. -* `bias == false` returns `false`, which is understood by AD to be non-differentiable. -* `bias::AbstractArray` uses the array provided, provided it has the correct size. - It does not at present correct the `eltype` to match that of `weights`. + - `bias == true` creates a trainable array of the given size, of the same type as `weights`, initialised to zero. + - `bias == false` returns `false`, which is understood by AD to be non-differentiable. + - `bias::AbstractArray` uses the array provided, provided it has the correct size. 
+ It does not at present correct the `eltype` to match that of `weights`. """ function create_bias(weights::AbstractArray, bias::Bool, dims::Integer...) - bias ? fill!(similar(weights, dims...), 0) : false + return bias ? fill!(similar(weights, dims...), 0) : false end function create_bias(weights::AbstractArray, bias::AbstractArray, dims::Integer...) - size(bias) == dims || throw(DimensionMismatch("expected bias of size $(dims), got size $(size(bias))")) - bias + size(bias) == dims || + throw(DimensionMismatch("expected bias of size $(dims), got size $(size(bias))")) + return bias end - # Other """ @@ -539,10 +588,11 @@ execution on the leading edge, pass `leading=false`. To enable execution on the trailing edge, pass `trailing=true`. # Examples + ```jldoctest julia> a = Flux.throttle(() -> println("Flux"), 2); -julia> for i = 1:4 # a called in alternate iterations +julia> for i in 1:4 # a called in alternate iterations a() sleep(1) end @@ -550,39 +600,38 @@ Flux Flux ``` """ -function throttle(f, timeout; leading=true, trailing=false) - cooldown = true - later = nothing - result = nothing - - function throttled(args...; kwargs...) - yield() - - if cooldown - if leading - result = f(args...; kwargs...) - else - later = () -> f(args...; kwargs...) - end - - cooldown = false - @async try - while (sleep(timeout); later != nothing) - later() - later = nothing +function throttle(f, timeout; leading = true, trailing = false) + cooldown = true + later = nothing + result = nothing + + function throttled(args...; kwargs...) + yield() + + if cooldown + if leading + result = f(args...; kwargs...) + else + later = () -> f(args...; kwargs...) + end + + cooldown = false + @async try + while (sleep(timeout); later != nothing) + later() + later = nothing + end + finally + cooldown = true + end + elseif trailing + later = () -> (result = f(args...; kwargs...)) end - finally - cooldown = true - end - elseif trailing - later = () -> (result = f(args...; kwargs...)) - end - return result - end + return result + end end - """ modules(m) @@ -630,7 +679,7 @@ modules(m) = [x for x in Functors.fcollect(m) if !isleaflike(x)] @nograd modules # TODO: is this correct? might fail with explicit parameters. function ChainRulesCore.rrule(::typeof(modules), m) - modules(m), dm -> error("Flux.modules is not at present differentiable, sorry") + return modules(m), dm -> error("Flux.modules is not at present differentiable, sorry") end isleaflike(x) = Functors.isleaf(x) @@ -646,15 +695,15 @@ If the count is greater than or equal to `wait`, the function returns `true`, otherwise it returns `false`. # Examples + ```jldoctest julia> loss() = rand(); julia> trigger = Flux.patience(() -> loss() < 1, 3); - julia> for i in 1:10 - @info "Epoch \$i" - trigger() && break + @info "Epoch \$i" + trigger() && break end [ Info: Epoch 1 [ Info: Epoch 2 @@ -662,13 +711,13 @@ julia> for i in 1:10 ``` """ function patience(predicate, wait) - let count = 0 - function on_trigger(args...; kwargs...) - count = predicate(args...; kwargs...) ? count + 1 : 0 + let count = 0 + function on_trigger(args...; kwargs...) + count = predicate(args...; kwargs...) ? count + 1 : 0 - return count >= wait + return count >= wait + end end - end end """ @@ -682,17 +731,17 @@ the function returns `true`, otherwise it returns `false`. The count is reset when `distance(best_score, f(...)) > min_dist`. 
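In practice `f` is usually a validation metric evaluated once per epoch; a sketch in which
`model`, `loss`, `train_data`, `opt` and `val_loss` are placeholders:

```julia
es = Flux.early_stopping(() -> val_loss(model), 5; init_score = Inf, min_dist = 1.0f-4)
for epoch in 1:100
    Flux.train!(loss, Flux.params(model), train_data, opt)
    es() && break  # five evaluations in a row without improving by at least min_dist
end
```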
# Examples + ```jldoctest julia> loss = let l = 0 - () -> l += 1 + () -> l += 1 end; # pseudo loss function that returns increasing values julia> es = Flux.early_stopping(loss, 3); - julia> for i in 1:10 - @info "Epoch \$i" - es() && break + @info "Epoch \$i" + es() && break end [ Info: Epoch 1 [ Info: Epoch 2 @@ -700,17 +749,17 @@ julia> for i in 1:10 ``` """ function early_stopping(f, delay; distance = -, init_score = 0, min_dist = 0) - trigger = let best_score = init_score - (args...; kwargs...) -> begin - score = f(args...; kwargs...) - Δ = distance(best_score, score) - best_score = Δ < 0 ? best_score : score + trigger = let best_score = init_score + (args...; kwargs...) -> begin + score = f(args...; kwargs...) + Δ = distance(best_score, score) + best_score = Δ < 0 ? best_score : score - return Δ < min_dist + return Δ < min_dist + end end - end - return patience(trigger, delay) + return patience(trigger, delay) end """ @@ -724,17 +773,17 @@ the function returns `true`, otherwise it returns `false`. The count is reset when `abs(distance(last_score, f(...))) > min_dist`. # Examples + ```jldoctest julia> f = let v = 10 - () -> v = v / abs(v) - v + () -> v = v / abs(v) - v end; # -9, 8, -7, 6, ... -julia> trigger = Flux.plateau(f, 3; init_score=10, min_dist=18); - +julia> trigger = Flux.plateau(f, 3; init_score = 10, min_dist = 18); julia> for i in 1:10 - @info "Epoch \$i" - trigger() && break + @info "Epoch \$i" + trigger() && break end [ Info: Epoch 1 [ Info: Epoch 2 @@ -742,16 +791,16 @@ julia> for i in 1:10 [ Info: Epoch 4 ``` """ -function plateau(f, width; distance = -, init_score = 0, min_dist = 1f-6) - is_plateau = let last_score = init_score - (args...; kwargs...) -> begin - score = f(args...; kwargs...) - Δ = abs(distance(last_score, score)) - last_score = score +function plateau(f, width; distance = -, init_score = 0, min_dist = 1.0f-6) + is_plateau = let last_score = init_score + (args...; kwargs...) -> begin + score = f(args...; kwargs...) + Δ = abs(distance(last_score, score)) + last_score = score - return Δ < min_dist + return Δ < min_dist + end end - end - return patience(is_plateau, width) + return patience(is_plateau, width) end diff --git a/test/ctc-gpu.jl b/test/ctc-gpu.jl index d7ff1bdf9d..1ed898cd21 100644 --- a/test/ctc-gpu.jl +++ b/test/ctc-gpu.jl @@ -8,49 +8,52 @@ using CUDA # Custom function to check numerical gradient of ctc loss, # based on `ngradient` in `Tracker.jl` function ctc_ngradient(x, y) - f = Flux.Losses.ctc_loss - grads = zero(x) - for i in 1:length(x) - δ = sqrt(eps()) - tmp = x[i] - x[i] = tmp - δ/2 - y1 = f(x, y) - x[i] = tmp + δ/2 - y2 = f(x, y) - x[i] = tmp - grads[i] = (y2-y1)/δ - end - return grads + f = Flux.Losses.ctc_loss + grads = zero(x) + for i in 1:length(x) + δ = sqrt(eps()) + tmp = x[i] + x[i] = tmp - δ / 2 + y1 = f(x, y) + x[i] = tmp + δ / 2 + y2 = f(x, y) + x[i] = tmp + grads[i] = (y2 - y1) / δ + end + return grads end @testset "ctc-gpu" begin - x = rand(10, 50) - y = rand(1:9, 30) - x_cu = CuArray(x) - g1 = gradient(ctc_loss, x_cu, y)[1] - g1 = g1 |> collect - g2 = ctc_ngradient(x, y) - @test g1 ≈ g2 rtol=1e-5 atol=1e-5 - - # test that GPU loss matches CPU implementation - l1 = ctc_loss(x_cu, y) - l2 = ctc_loss(x, y) - @test l1 ≈ l2 - - # tests using hand-calculated values - x_cu = [1. 2. 3.; 2. 1. 1.; 3. 3. 2.] 
|> CuArray - y = [1, 2] - @test ctc_loss(x_cu, y) ≈ 3.6990738275138035 - - g = [-0.317671 -0.427729 0.665241; 0.244728 -0.0196172 -0.829811; 0.0729422 0.447346 0.16457] - ghat = gradient(ctc_loss, x_cu, y)[1] |> collect - @test g ≈ ghat rtol=1e-5 atol=1e-5 + x = rand(10, 50) + y = rand(1:9, 30) + x_cu = CuArray(x) + g1 = gradient(ctc_loss, x_cu, y)[1] + g1 = g1 |> collect + g2 = ctc_ngradient(x, y) + @test g1≈g2 rtol=1e-5 atol=1e-5 - x_cu = [-3. 12. 8. 15.; 4. 20. -2. 20.; 8. -33. 6. 5.] |> CuArray - y = [1, 2] |> CuArray - @test ctc_loss(x_cu, y) ≈ 8.02519869363453 + # test that GPU loss matches CPU implementation + l1 = ctc_loss(x_cu, y) + l2 = ctc_loss(x, y) + @test l1 ≈ l2 - g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063; 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307; -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] - ghat = gradient(ctc_loss, x_cu, y)[1] |> collect - @test g ≈ ghat rtol=1e-5 atol=1e-5 + # tests using hand-calculated values + x_cu = [1.0 2.0 3.0; 2.0 1.0 1.0; 3.0 3.0 2.0] |> CuArray + y = [1, 2] + @test ctc_loss(x_cu, y) ≈ 3.6990738275138035 + + g = [-0.317671 -0.427729 0.665241; 0.244728 -0.0196172 -0.829811; + 0.0729422 0.447346 0.16457] + ghat = gradient(ctc_loss, x_cu, y)[1] |> collect + @test g≈ghat rtol=1e-5 atol=1e-5 + + x_cu = [-3.0 12.0 8.0 15.0; 4.0 20.0 -2.0 20.0; 8.0 -33.0 6.0 5.0] |> CuArray + y = [1, 2] |> CuArray + @test ctc_loss(x_cu, y) ≈ 8.02519869363453 + + g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063; + 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307; + -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] + ghat = gradient(ctc_loss, x_cu, y)[1] |> collect + @test g≈ghat rtol=1e-5 atol=1e-5 end diff --git a/test/ctc.jl b/test/ctc.jl index 6fa33c4b99..88386ff0e7 100644 --- a/test/ctc.jl +++ b/test/ctc.jl @@ -7,42 +7,45 @@ using LinearAlgebra # Custom function to check numerical gradient of ctc loss, # based on `ngradient` in `Tracker.jl` function ctc_ngradient(x, y) - f = Flux.Losses.ctc_loss - grads = zero(x) - for i in 1:length(x) - δ = sqrt(eps()) - tmp = x[i] - x[i] = tmp - δ/2 - y1 = f(x, y) - x[i] = tmp + δ/2 - y2 = f(x, y) - x[i] = tmp - grads[i] = (y2-y1)/δ - end - return grads + f = Flux.Losses.ctc_loss + grads = zero(x) + for i in 1:length(x) + δ = sqrt(eps()) + tmp = x[i] + x[i] = tmp - δ / 2 + y1 = f(x, y) + x[i] = tmp + δ / 2 + y2 = f(x, y) + x[i] = tmp + grads[i] = (y2 - y1) / δ + end + return grads end @testset "ctc_loss" begin - x = rand(10, 50) - y = rand(1:9, 30) - g1 = gradient(ctc_loss, x, y)[1] - g2 = ctc_ngradient(x, y) - @test g1 ≈ g2 rtol=1e-5 atol=1e-5 - - # tests using hand-calculated values - x = [1. 2. 3.; 2. 1. 1.; 3. 3. 2.] - y = [1, 2] - @test ctc_loss(x, y) ≈ 3.6990738275138035 + x = rand(10, 50) + y = rand(1:9, 30) + g1 = gradient(ctc_loss, x, y)[1] + g2 = ctc_ngradient(x, y) + @test g1≈g2 rtol=1e-5 atol=1e-5 - g = [-0.317671 -0.427729 0.665241; 0.244728 -0.0196172 -0.829811; 0.0729422 0.447346 0.16457] - ghat = gradient(ctc_loss, x, y)[1] - @test g ≈ ghat rtol=1e-5 atol=1e-5 + # tests using hand-calculated values + x = [1.0 2.0 3.0; 2.0 1.0 1.0; 3.0 3.0 2.0] + y = [1, 2] + @test ctc_loss(x, y) ≈ 3.6990738275138035 - x = [-3. 12. 8. 15.; 4. 20. -2. 20.; 8. -33. 6. 5.] 
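# --- Editor's illustrative sketch (not part of the patch) -----------------------------
# The `ctc_ngradient` helpers above cross-check Zygote's gradient with a central finite
# difference. The same pattern is shown here for a generic toy loss so it runs without
# CUDA; `ngradient` and `f` below are hypothetical names for illustration, not Flux API.
using Test, Zygote

function ngradient(f, x::AbstractArray)
    grads = zero(x)
    δ = sqrt(eps())
    for i in eachindex(x)
        tmp = x[i]
        x[i] = tmp - δ / 2
        y1 = f(x)
        x[i] = tmp + δ / 2
        y2 = f(x)
        x[i] = tmp                 # restore the perturbed entry
        grads[i] = (y2 - y1) / δ   # central-difference estimate of ∂f/∂xᵢ
    end
    return grads
end

f(x) = sum(abs2, x) / 2            # toy loss whose exact gradient is x itself
x = rand(3, 4)
@test ngradient(f, x) ≈ Zygote.gradient(f, x)[1] rtol=1e-5 atol=1e-5
# ---------------------------------------------------------------------------------------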
- y = [1, 2] - @test ctc_loss(x, y) ≈ 8.02519869363453 + g = [-0.317671 -0.427729 0.665241; 0.244728 -0.0196172 -0.829811; + 0.0729422 0.447346 0.16457] + ghat = gradient(ctc_loss, x, y)[1] + @test g≈ghat rtol=1e-5 atol=1e-5 - g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063; 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307; -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] - ghat = gradient(ctc_loss, x, y)[1] - @test g ≈ ghat rtol=1e-5 atol=1e-5 + x = [-3.0 12.0 8.0 15.0; 4.0 20.0 -2.0 20.0; 8.0 -33.0 6.0 5.0] + y = [1, 2] + @test ctc_loss(x, y) ≈ 8.02519869363453 + + g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063; + 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307; + -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] + ghat = gradient(ctc_loss, x, y)[1] + @test g≈ghat rtol=1e-5 atol=1e-5 end diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 2b4fec6e4c..c20236bec6 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -6,163 +6,163 @@ using LinearAlgebra: I, cholesky, Cholesky using SparseArrays: sparse, SparseMatrixCSC, AbstractSparseArray @testset "CUDA" begin - x = randn(5, 5) - cx = gpu(x) - @test cx isa CuArray + x = randn(5, 5) + cx = gpu(x) + @test cx isa CuArray - @test Flux.onecold(gpu([1.0, 2.0, 3.0])) == 3 + @test Flux.onecold(gpu([1.0, 2.0, 3.0])) == 3 - x = Flux.onehotbatch([1, 2, 3], 1:3) - cx = gpu(x) - @test cx isa Flux.OneHotMatrix && cx.indices isa CuArray - @test (cx .+ 1) isa CuArray + x = Flux.onehotbatch([1, 2, 3], 1:3) + cx = gpu(x) + @test cx isa Flux.OneHotMatrix && cx.indices isa CuArray + @test (cx .+ 1) isa CuArray - m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) - cm = gpu(m) + m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) + cm = gpu(m) - @test all(p isa CuArray for p in params(cm)) - @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2} + @test all(p isa CuArray for p in params(cm)) + @test cm(gpu(rand(10, 10))) isa CuArray{Float32, 2} - xs = rand(5, 5) - ys = Flux.onehotbatch(1:5,1:5) - @test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys) + xs = rand(5, 5) + ys = Flux.onehotbatch(1:5, 1:5) + @test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys) - c = gpu(Conv((2,2),3=>4)) - x = gpu(rand(10, 10, 3, 2)) - l = c(gpu(rand(10,10,3,2))) - @test gradient(x -> sum(c(x)), x)[1] isa CuArray - - c = gpu(CrossCor((2,2),3=>4)) - x = gpu(rand(10, 10, 3, 2)) - l = c(gpu(rand(10,10,3,2))) - @test gradient(x -> sum(c(x)), x)[1] isa CuArray + c = gpu(Conv((2, 2), 3 => 4)) + x = gpu(rand(10, 10, 3, 2)) + l = c(gpu(rand(10, 10, 3, 2))) + @test gradient(x -> sum(c(x)), x)[1] isa CuArray + c = gpu(CrossCor((2, 2), 3 => 4)) + x = gpu(rand(10, 10, 3, 2)) + l = c(gpu(rand(10, 10, 3, 2))) + @test gradient(x -> sum(c(x)), x)[1] isa CuArray end @testset "onehot gpu" begin - y = Flux.onehotbatch(ones(3), 1:2) |> gpu; - @test (repr("text/plain", y); true) - - gA = rand(3, 2) |> gpu; - @test gradient(A -> sum(A * y), gA)[1] isa CuArray - - # construct from CuArray - x = [1, 3, 2] - y = Flux.onehotbatch(x, 0:3) - @test_skip begin # https://github.com/FluxML/OneHotArrays.jl/issues/16 - y2 = Flux.onehotbatch(x |> gpu, 0:3) - @test y2.indices isa CuArray - @test y2 |> cpu == y - end + y = Flux.onehotbatch(ones(3), 1:2) |> gpu + @test (repr("text/plain", y); true) + + gA = rand(3, 2) |> gpu + @test gradient(A -> sum(A * y), gA)[1] isa CuArray + + # construct from CuArray + x = [1, 3, 2] + y 
= Flux.onehotbatch(x, 0:3) + @test_skip begin # https://github.com/FluxML/OneHotArrays.jl/issues/16 + y2 = Flux.onehotbatch(x |> gpu, 0:3) + @test y2.indices isa CuArray + @test y2 |> cpu == y + end end @testset "onecold gpu" begin - y = Flux.onehotbatch(ones(3), 1:10) |> gpu; - l = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] - @test Flux.onecold(y) isa CuArray - @test y[3,:] isa CuArray - @test Flux.onecold(y, l) == ['a', 'a', 'a'] + y = Flux.onehotbatch(ones(3), 1:10) |> gpu + l = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] + @test Flux.onecold(y) isa CuArray + @test y[3, :] isa CuArray + @test Flux.onecold(y, l) == ['a', 'a', 'a'] end @testset "onehot forward map to broadcast" begin - oa = OneHotArray(rand(1:10, 5, 5), 10) |> gpu - @test all(map(identity, oa) .== oa) - @test all(map(x -> 2 * x, oa) .== 2 .* oa) + oa = OneHotArray(rand(1:10, 5, 5), 10) |> gpu + @test all(map(identity, oa) .== oa) + @test all(map(x -> 2 * x, oa) .== 2 .* oa) end @testset "restructure gpu" begin - dudt = Dense(1,1) |> gpu - p,re = Flux.destructure(dudt) - foo(x) = sum(re(p)(x)) - @test gradient(foo, cu(rand(1)))[1] isa CuArray + dudt = Dense(1, 1) |> gpu + p, re = Flux.destructure(dudt) + foo(x) = sum(re(p)(x)) + @test gradient(foo, cu(rand(1)))[1] isa CuArray end @testset "GPU functors" begin - @testset "Cholesky" begin - M = 2.0*I(10) |> collect - Q = cholesky(M) - Q_gpu = Q |> gpu - @test Q_gpu isa Cholesky{<:Any,<:CuArray} - Q_cpu = Q_gpu |> cpu - @test Q_cpu == cholesky(eltype(Q_gpu).(M)) - end - - @testset "isbits array types" begin - struct SimpleBits - field::Int32 + @testset "Cholesky" begin + M = 2.0 * I(10) |> collect + Q = cholesky(M) + Q_gpu = Q |> gpu + @test Q_gpu isa Cholesky{<:Any, <:CuArray} + Q_cpu = Q_gpu |> cpu + @test Q_cpu == cholesky(eltype(Q_gpu).(M)) + end + + @testset "isbits array types" begin + struct SimpleBits + field::Int32 + end + + @test gpu((; a = ones(1))).a isa CuVector{Float32} + @test gpu((; a = ['a', 'b', 'c'])).a isa CuVector{Char} + @test gpu((; a = [SimpleBits(1)])).a isa CuVector{SimpleBits} end - - @test gpu((;a=ones(1))).a isa CuVector{Float32} - @test gpu((;a=['a', 'b', 'c'])).a isa CuVector{Char} - @test gpu((;a=[SimpleBits(1)])).a isa CuVector{SimpleBits} - end end @testset "gpu(cpu(x)) inside gradient" begin - a = randn(Float32, 4, 4) - ca = cu(a) - - # Trivial functions - @test gradient(x -> sum(abs, gpu(x)), a)[1] isa Matrix - @test gradient(x -> sum(gpu(x)), a)[1] isa Matrix - @test_skip gradient(x -> sum(gpu(x)), a')[1] isa Matrix # sum(::Adjoint{T,CuArray}) makes a Fill - @test gradient(x -> sum(abs, cpu(x)), ca)[1] isa CuArray - # This test should really not go through indirections and pull out Fills for efficiency - # but we forcefully materialise. 
TODO: remove materialising CuArray here - @test gradient(x -> sum(cpu(x)), ca)[1] isa CuArray # This involves FillArray, which should be GPU compatible - @test gradient(x -> sum(cpu(x)), ca')[1] isa LinearAlgebra.Adjoint - - # Even more trivial: no movement - @test gradient(x -> sum(abs, cpu(x)), a)[1] isa Matrix - @test gradient(x -> sum(abs, cpu(x)), a')[1] isa Matrix - @test gradient(x -> sum(cpu(x)), a)[1] isa typeof(gradient(sum, a)[1]) # FillArray - @test gradient(x -> sum(abs, gpu(x)), ca)[1] isa CuArray - @test_skip gradient(x -> sum(abs, gpu(x)), ca')[1] isa CuArray # KernelError: passing and using non-bitstype argument - - # More complicated, Array * CuArray is an error - g0 = gradient(x -> sum(abs, (a * (a * x))), a)[1] - @test g0 ≈ gradient(x -> sum(abs, cpu(ca * gpu(a * x))), a)[1] - @test cu(g0) ≈ gradient(x -> sum(abs, gpu(a * cpu(ca * x))), ca)[1] - @test gradient(x -> sum(gpu(cpu(x))), a)[1] isa Matrix - @test gradient(x -> sum(gpu(cpu(x))), ca)[1] isa CuArray - - g4 = gradient(x -> sum(a * (a' * x)), a)[1] # no abs, one adjoint - @test g4 ≈ gradient(x -> sum(cpu(ca * gpu(a' * x))), a)[1] - @test cu(g4) ≈ gradient(x -> sum(gpu(a * cpu(ca' * x))), ca)[1] - - # Scalar indexing of an array, needs OneElement to transfer to GPU - # https://github.com/FluxML/Zygote.jl/issues/1005 - @test gradient(x -> cpu(2 .* gpu(x))[1], Float32[1,2,3]) == ([2,0,0],) - @test gradient(x -> cpu(gpu(x) * gpu(x))[1,2], Float32[1 2 3; 4 5 6; 7 8 9]) == ([2 6 8; 0 2 0; 0 3 0],) + a = randn(Float32, 4, 4) + ca = cu(a) + + # Trivial functions + @test gradient(x -> sum(abs, gpu(x)), a)[1] isa Matrix + @test gradient(x -> sum(gpu(x)), a)[1] isa Matrix + @test_skip gradient(x -> sum(gpu(x)), a')[1] isa Matrix # sum(::Adjoint{T,CuArray}) makes a Fill + @test gradient(x -> sum(abs, cpu(x)), ca)[1] isa CuArray + # This test should really not go through indirections and pull out Fills for efficiency + # but we forcefully materialise. 
TODO: remove materialising CuArray here + @test gradient(x -> sum(cpu(x)), ca)[1] isa CuArray # This involves FillArray, which should be GPU compatible + @test gradient(x -> sum(cpu(x)), ca')[1] isa LinearAlgebra.Adjoint + + # Even more trivial: no movement + @test gradient(x -> sum(abs, cpu(x)), a)[1] isa Matrix + @test gradient(x -> sum(abs, cpu(x)), a')[1] isa Matrix + @test gradient(x -> sum(cpu(x)), a)[1] isa typeof(gradient(sum, a)[1]) # FillArray + @test gradient(x -> sum(abs, gpu(x)), ca)[1] isa CuArray + @test_skip gradient(x -> sum(abs, gpu(x)), ca')[1] isa CuArray # KernelError: passing and using non-bitstype argument + + # More complicated, Array * CuArray is an error + g0 = gradient(x -> sum(abs, (a * (a * x))), a)[1] + @test g0 ≈ gradient(x -> sum(abs, cpu(ca * gpu(a * x))), a)[1] + @test cu(g0) ≈ gradient(x -> sum(abs, gpu(a * cpu(ca * x))), ca)[1] + @test gradient(x -> sum(gpu(cpu(x))), a)[1] isa Matrix + @test gradient(x -> sum(gpu(cpu(x))), ca)[1] isa CuArray + + g4 = gradient(x -> sum(a * (a' * x)), a)[1] # no abs, one adjoint + @test g4 ≈ gradient(x -> sum(cpu(ca * gpu(a' * x))), a)[1] + @test cu(g4) ≈ gradient(x -> sum(gpu(a * cpu(ca' * x))), ca)[1] + + # Scalar indexing of an array, needs OneElement to transfer to GPU + # https://github.com/FluxML/Zygote.jl/issues/1005 + @test gradient(x -> cpu(2 .* gpu(x))[1], Float32[1, 2, 3]) == ([2, 0, 0],) + @test gradient(x -> cpu(gpu(x) * gpu(x))[1, 2], Float32[1 2 3; 4 5 6; 7 8 9]) == + ([2 6 8; 0 2 0; 0 3 0],) end @testset "gpu(x) and cpu(x) on structured arrays" begin - @test cpu(1:3) isa UnitRange - @test cpu(range(1, 3, length = 4)) isa AbstractRange - - # OneElement isn't GPU compatible - g1 = Zygote.OneElement(1, (2,3), axes(ones(4,5))) - @test cpu(g1) isa Zygote.OneElement - - g2 = Zygote.Fill(1f0, 2) - @test cpu(g2) isa Zygote.FillArrays.AbstractFill - - g3 = transpose(Float32[1 2; 3 4]) - @test parent(cpu(g3)) isa Matrix{Float32} - - @testset "Sparse Arrays" begin - @test cpu(sparse(rand(3,3))) isa SparseMatrixCSC - a = sparse(rand(3,3)) - @test cpu(a) === a - @test gpu(sparse(rand(3,3))) isa CUDA.CUSPARSE.CuSparseMatrixCSC - end - - # Check that gpu() converts these to CuArrays. This a side-effect of using the same functions - # in gpu() as in the gradient of cpu(). A different design could avoid having gpu() used alone - # move these, if that turns out to be desirable. - @test gpu(g1) isa CuArray - @test gpu(g1) ≈ cu(Matrix(g1)) - @test gpu(g2) isa CuArray - @test gpu(g2) ≈ cu(Vector(g2)) - @test parent(gpu(g3)) isa CuArray + @test cpu(1:3) isa UnitRange + @test cpu(range(1, 3, length = 4)) isa AbstractRange + + # OneElement isn't GPU compatible + g1 = Zygote.OneElement(1, (2, 3), axes(ones(4, 5))) + @test cpu(g1) isa Zygote.OneElement + + g2 = Zygote.Fill(1.0f0, 2) + @test cpu(g2) isa Zygote.FillArrays.AbstractFill + + g3 = transpose(Float32[1 2; 3 4]) + @test parent(cpu(g3)) isa Matrix{Float32} + + @testset "Sparse Arrays" begin + @test cpu(sparse(rand(3, 3))) isa SparseMatrixCSC + a = sparse(rand(3, 3)) + @test cpu(a) === a + @test gpu(sparse(rand(3, 3))) isa CUDA.CUSPARSE.CuSparseMatrixCSC + end + + # Check that gpu() converts these to CuArrays. This a side-effect of using the same functions + # in gpu() as in the gradient of cpu(). A different design could avoid having gpu() used alone + # move these, if that turns out to be desirable. 
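# --- Editor's illustrative note (not part of the patch) -------------------------------
# The structured gradients g1, g2 and g3 checked in this testset are the kinds Zygote
# itself produces on the CPU: `sum` pulls back to a `FillArrays.Fill`, and scalar
# indexing pulls back to a `Zygote.OneElement`, which is why `gpu`/`cpu` must cope with
# both. A minimal CPU-only sketch, assuming the Zygote version used by these tests:
using Zygote
g_fill = Zygote.gradient(sum, rand(Float32, 3))[1]        # a Fill of ones
g_one  = Zygote.gradient(x -> x[2], rand(Float32, 3))[1]  # a OneElement: 1 at index 2
# ---------------------------------------------------------------------------------------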
+ @test gpu(g1) isa CuArray + @test gpu(g1) ≈ cu(Matrix(g1)) + @test gpu(g2) isa CuArray + @test gpu(g2) ≈ cu(Vector(g2)) + @test parent(gpu(g3)) isa CuArray end diff --git a/test/cuda/curnn.jl b/test/cuda/curnn.jl index 5c460d2aa4..2ff137d34f 100644 --- a/test/cuda/curnn.jl +++ b/test/cuda/curnn.jl @@ -1,26 +1,25 @@ using Flux, CUDA, Test @testset for R in [RNN, GRU, LSTM, GRUv3] - m = R(10, 5) |> gpu - x = gpu(rand(10)) - (m̄,) = gradient(m -> sum(m(x)), m) - Flux.reset!(m) - θ = gradient(() -> sum(m(x)), params(m)) - @test x isa CuArray - @test θ[m.cell.Wi] isa CuArray - @test collect(m̄.cell.Wi) == collect(θ[m.cell.Wi]) + m = R(10, 5) |> gpu + x = gpu(rand(10)) + (m̄,) = gradient(m -> sum(m(x)), m) + Flux.reset!(m) + θ = gradient(() -> sum(m(x)), params(m)) + @test x isa CuArray + @test θ[m.cell.Wi] isa CuArray + @test collect(m̄.cell.Wi) == collect(θ[m.cell.Wi]) end -@testset "RNN" begin - @testset for R in [RNN, GRU, LSTM, GRUv3], batch_size in (1, 5) +@testset "RNN" begin @testset for R in [RNN, GRU, LSTM, GRUv3], batch_size in (1, 5) rnn = R(10, 5) curnn = fmap(gpu, rnn) Flux.reset!(rnn) Flux.reset!(curnn) x = batch_size == 1 ? - rand(Float32, 10) : - rand(Float32, 10, batch_size) + rand(Float32, 10) : + rand(Float32, 10, batch_size) cux = gpu(x) y, back = pullback((r, x) -> r(x), rnn, x) @@ -37,18 +36,18 @@ end @test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh) @test m̄[].cell.b ≈ collect(cum̄[].cell.b) if m̄[].state isa Tuple - for (x, cx) in zip(m̄[].state, cum̄[].state) - @test x ≈ collect(cx) - end + for (x, cx) in zip(m̄[].state, cum̄[].state) + @test x ≈ collect(cx) + end else - @test m̄[].state ≈ collect(cum̄[].state) + @test m̄[].state ≈ collect(cum̄[].state) end Flux.reset!(rnn) Flux.reset!(curnn) ohx = batch_size == 1 ? - Flux.onehot(rand(1:10), 1:10) : - Flux.onehotbatch(rand(1:10, batch_size), 1:10) + Flux.onehot(rand(1:10), 1:10) : + Flux.onehotbatch(rand(1:10, batch_size), 1:10) cuohx = gpu(ohx) y = (rnn(ohx); rnn(ohx)) @@ -63,5 +62,4 @@ end cufy = (curnn(cufx); curnn(cufx)) @test fy ≈ collect(cufy) - end -end +end end diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl index 8024681a06..ea4452a47e 100644 --- a/test/cuda/layers.jl +++ b/test/cuda/layers.jl @@ -6,71 +6,70 @@ # generic movement tests @testset "Basic GPU Movement" begin - @test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple - @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple + @test gradient(x -> sum(gpu(x)), rand(3, 3)) isa Tuple + @test gradient(x -> sum(cpu(x)), gpu(rand(3, 3))) isa Tuple end # TODO: These layers get into scalar indexing issues. const BROKEN_LAYERS = Union{} const ACTIVATIONS = [identity, relu, tanh, - sigmoid, exp, softplus, - elu, selu] - -function gpu_gradtest(name::String, layers::Vector, x_cpu = nothing, args...; test_cpu = true) - isnothing(x_cpu) && error("Missing input to test the layers against.") - @testset "$name GPU grad tests" begin - for layer in layers - @testset "$layer Layer GPU grad test" begin - - # compute output and grad of parameters - l_cpu = layer(args...) 
- ps_cpu = Flux.params(l_cpu) - y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu) - gs_cpu = back_cpu(1f0) - - x_gpu = gpu(x_cpu) - l_gpu = l_cpu |> gpu - ps_gpu = Flux.params(l_gpu) - - if typeof(l_gpu) <: BROKEN_LAYERS - @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa Flux.Zygote.Grads - else - y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu) - gs_gpu = back_gpu(1f0) # TODO many layers error out when backprop int 1, should fix - - # compute grad of input - xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1] - xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1] - - # test - if test_cpu - @test y_gpu ≈ y_cpu rtol=1f-3 atol=1f-3 - if isnothing(xg_cpu) - @test isnothing(xg_gpu) + sigmoid, exp, softplus, + elu, selu] + +function gpu_gradtest(name::String, layers::Vector, x_cpu = nothing, args...; + test_cpu = true) + isnothing(x_cpu) && error("Missing input to test the layers against.") + @testset "$name GPU grad tests" begin for layer in layers + @testset "$layer Layer GPU grad test" begin + + # compute output and grad of parameters + l_cpu = layer(args...) + ps_cpu = Flux.params(l_cpu) + y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu) + gs_cpu = back_cpu(1.0f0) + + x_gpu = gpu(x_cpu) + l_gpu = l_cpu |> gpu + ps_gpu = Flux.params(l_gpu) + + if typeof(l_gpu) <: BROKEN_LAYERS + @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa Flux.Zygote.Grads else - if layer === GroupedConvTranspose - @test Array(xg_gpu) ≈ xg_cpu rtol = 2f-2 atol = 1f-3 - else - @test Array(xg_gpu) ≈ xg_cpu rtol = 1f-3 atol = 1f-3 - end + y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu) + gs_gpu = back_gpu(1.0f0) # TODO many layers error out when backprop int 1, should fix + + # compute grad of input + xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1] + xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1] + + # test + if test_cpu + @test y_gpu≈y_cpu rtol=1.0f-3 atol=1.0f-3 + if isnothing(xg_cpu) + @test isnothing(xg_gpu) + else + if layer === GroupedConvTranspose + @test Array(xg_gpu)≈xg_cpu rtol=2.0f-2 atol=1.0f-3 + else + @test Array(xg_gpu)≈xg_cpu rtol=1.0f-3 atol=1.0f-3 + end + end + end + @test gs_gpu isa Flux.Zygote.Grads + for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu) + if isnothing(gs_cpu[p_cpu]) + @test isnothing(gs_gpu[p_gpu]) + else + @test gs_gpu[p_gpu] isa Flux.CUDA.CuArray + if test_cpu + @test Array(gs_gpu[p_gpu])≈gs_cpu[p_cpu] rtol=1.0f-3 atol=1.0f-3 + end + end + end end - end - @test gs_gpu isa Flux.Zygote.Grads - for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu) - if isnothing(gs_cpu[p_cpu]) - @test isnothing(gs_gpu[p_gpu]) - else - @test gs_gpu[p_gpu] isa Flux.CUDA.CuArray - if test_cpu - @test Array(gs_gpu[p_gpu]) ≈ gs_cpu[p_cpu] rtol=1f-3 atol=1f-3 - end - end - end end - end - end - end + end end end # Just to give testset in gpu_gradtest meaningful labels @@ -82,44 +81,49 @@ GroupedConv(args...) = Conv(args..., groups = 5) GroupedConvTranspose(args...) 
= ConvTranspose(args..., groups = 5) for act in ACTIVATIONS - r = rand(Float32, 28, 28, 1, 1) - conv_layers = [Conv, ConvNoBias, - ConvTranspose, ConvTransposeNoBias, - CrossCor, CrossCorNoBias, - DepthwiseConv, DepthwiseConvNoBias] - gpu_gradtest("Convolution with $act", conv_layers, r, (2,2), 1=>3, act, test_cpu = false) - - groupedconv = [GroupedConv, GroupedConvTranspose] - gpu_gradtest("GroupedConvolution with $act", groupedconv, rand(Float32, 28, 28, 100, 2), (3,3), 100 => 25, act, test_cpu = true) - - batch_norm = [BatchNorm] - gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28,28,3,4), 3, act, test_cpu = false) #TODO fix errors - gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5,4), 5, act, test_cpu = false) - - instancenorm = [InstanceNorm] - gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act, test_cpu = false) - - groupnorm = [GroupNorm] - gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28,28,3,1), 3, 1, act, test_cpu = false) + r = rand(Float32, 28, 28, 1, 1) + conv_layers = [Conv, ConvNoBias, + ConvTranspose, ConvTransposeNoBias, + CrossCor, CrossCorNoBias, + DepthwiseConv, DepthwiseConvNoBias] + gpu_gradtest("Convolution with $act", conv_layers, r, (2, 2), 1 => 3, act, + test_cpu = false) + + groupedconv = [GroupedConv, GroupedConvTranspose] + gpu_gradtest("GroupedConvolution with $act", groupedconv, rand(Float32, 28, 28, 100, 2), + (3, 3), 100 => 25, act, test_cpu = true) + + batch_norm = [BatchNorm] + gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28, 28, 3, 4), 3, act, + test_cpu = false) #TODO fix errors + gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5, 4), 5, act, + test_cpu = false) + + instancenorm = [InstanceNorm] + gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act, test_cpu = false) + + groupnorm = [GroupNorm] + gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28, 28, 3, 1), 3, 1, act, + test_cpu = false) end r = rand(Float32, 28, 28, 1, 1) pooling_layers = [MaxPool, MeanPool] -gpu_gradtest("Pooling", pooling_layers, r, (2,2)) +gpu_gradtest("Pooling", pooling_layers, r, (2, 2)) adaptive_pooling_layers = [AdaptiveMaxPool, AdaptiveMeanPool] -gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7,7), test_cpu = false) +gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7, 7), test_cpu = false) dropout_layers = [Dropout, AlphaDropout] gpu_gradtest("Dropout", dropout_layers, r, 0.5f0; test_cpu = false) # dropout is not deterministic layer_norm = [LayerNorm] -gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28,28,3,4), 1, test_cpu = false) #TODO fix errors -gpu_gradtest("LayerNorm 2", layer_norm, rand(Float32, 5,4), 5) +gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28, 28, 3, 4), 1, test_cpu = false) #TODO fix errors +gpu_gradtest("LayerNorm 2", layer_norm, rand(Float32, 5, 4), 5) -upsample = [x -> Upsample(scale=x)] -gpu_gradtest("Upsample 2d", upsample, rand(Float32, 3, 4, 2, 3), (2,2)) +upsample = [x -> Upsample(scale = x)] +gpu_gradtest("Upsample 2d", upsample, rand(Float32, 3, 4, 2, 3), (2, 2)) gpu_gradtest("Upsample 1d", upsample, rand(Float32, 3, 4, 2, 3), (2,)) pixelshuffle = [PixelShuffle] @@ -127,166 +131,168 @@ gpu_gradtest("PixelShuffle 2d", pixelshuffle, rand(Float32, 3, 4, 18, 3), 3) gpu_gradtest("PixelShuffle 1d", pixelshuffle, rand(Float32, 3, 18, 3), 3) embedding = [Flux.Embedding] -gpu_gradtest("Embedding", embedding, [1,3,5], 5, 2) -gpu_gradtest("Embedding repeated indices", embedding, [1,3,5,3], 5, 2) 
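# --- Editor's illustrative sketch (not part of the patch) -----------------------------
# The surrounding Embedding grad tests exercise the different index types the layer
# accepts; on the CPU that looks roughly like the following, with the same `(5, 2)`
# constructor arguments as the tests:
using Flux, OneHotArrays
emb = Flux.Embedding(5, 2)                 # 5 vocabulary entries, 2-dimensional vectors
emb(3)                                     # single integer index      -> length-2 vector
emb([1, 3, 5, 3])                          # index vector, repeats ok  -> 2×4 matrix
emb(OneHotMatrix([1, 2, 2], 5))            # one-hot columns           -> 2×3 matrix
Flux.gradient(m -> sum(m([1, 3, 5, 3])), emb)[1]   # NamedTuple holding ∂/∂weight
# ---------------------------------------------------------------------------------------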
+gpu_gradtest("Embedding", embedding, [1, 3, 5], 5, 2) +gpu_gradtest("Embedding repeated indices", embedding, [1, 3, 5, 3], 5, 2) gpu_gradtest("Embedding integer index", embedding, 1, 5, 2) gpu_gradtest("Embedding 2d index", embedding, [1 2; 3 4], 5, 2) gpu_gradtest("Embedding OneHotVec index", embedding, OneHotVector(1, 5), 5, 2) -gpu_gradtest("Embedding OneHotMatrix index", embedding, OneHotMatrix([1,2,3], 5), 5, 2) -gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, OneHotMatrix([1,2,2], 5), 5, 2) +gpu_gradtest("Embedding OneHotMatrix index", embedding, OneHotMatrix([1, 2, 3], 5), 5, 2) +gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, + OneHotMatrix([1, 2, 2], 5), 5, 2) @testset "function layers" begin - x = rand(Float32, 3,3) - gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=1)), x) - gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=2)), x) - gpu_autodiff_test(x -> sum(Flux.normalise(x)), x) + x = rand(Float32, 3, 3) + gpu_autodiff_test(x -> sum(Flux.normalise(x; dims = 1)), x) + gpu_autodiff_test(x -> sum(Flux.normalise(x; dims = 2)), x) + gpu_autodiff_test(x -> sum(Flux.normalise(x)), x) end @testset "Zeros mapped for $cl" for cl in (Conv, ConvTranspose, CrossCor, DepthwiseConv) - l = cl((2,2), 1=>3, bias = false) |> gpu - ip = zeros(Float32, 28,28,1,1) |> gpu - if typeof(l) <: BROKEN_LAYERS - @test_broken sum(l(ip)) ≈ 0.f0 - @test_broken gradient(() -> sum(l(ip)), Flux.params(l)) isa Flux.Zygote.Grads - else - @test sum(l(ip)) ≈ 0.f0 - gs = gradient(() -> sum(l(ip)), Flux.params(l)) - @test l.bias ∉ gs.params - end + l = cl((2, 2), 1 => 3, bias = false) |> gpu + ip = zeros(Float32, 28, 28, 1, 1) |> gpu + if typeof(l) <: BROKEN_LAYERS + @test_broken sum(l(ip)) ≈ 0.0f0 + @test_broken gradient(() -> sum(l(ip)), Flux.params(l)) isa Flux.Zygote.Grads + else + @test sum(l(ip)) ≈ 0.0f0 + gs = gradient(() -> sum(l(ip)), Flux.params(l)) + @test l.bias ∉ gs.params + end end @testset "Dense without bias" begin - l = Dense(ones(Float32, 4, 3), false) |> gpu - ip = zeros(Float32, 3, 7) |> gpu + l = Dense(ones(Float32, 4, 3), false) |> gpu + ip = zeros(Float32, 3, 7) |> gpu - @test sum(l(ip)) ≈ 0.f0 - gs = gradient(() -> sum(l(ip)), Flux.params(l)) - @test l.bias ∉ gs.params + @test sum(l(ip)) ≈ 0.0f0 + gs = gradient(() -> sum(l(ip)), Flux.params(l)) + @test l.bias ∉ gs.params end @testset "Extended BatchNorm" begin - m_cpu = BatchNorm(2) - m_gpu = m_cpu |> gpu - x_cpu = rand(Float32, 3, 2, 2) - x_gpu = x_cpu |> gpu - - ## In :auto mode, track statistics only in gradient contest - μ_cpu = copy(m_cpu.μ) - m_cpu(x_cpu) - @test m_cpu.μ ≈ μ_cpu - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - @test !(m_cpu.μ ≈ μ_cpu) - - μ_gpu = copy(m_gpu.μ) - m_gpu(x_gpu) - @test m_gpu.μ ≈ μ_gpu - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) - @test !(m_gpu.μ ≈ μ_gpu) - - @test Array(m_gpu.μ) ≈ m_cpu.μ - - ## In testmode, never track statistics - testmode!(m_cpu) - μ_cpu = copy(m_cpu.μ) - m_cpu(x_cpu) - @test m_cpu.μ ≈ μ_cpu - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - @test m_cpu.μ ≈ μ_cpu - - testmode!(m_gpu) - μ_gpu = copy(m_gpu.μ) - m_gpu(x_gpu) - @test m_gpu.μ ≈ μ_gpu - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) - @test m_gpu.μ ≈ μ_gpu - - ## In trainmode, always track statistics - trainmode!(m_cpu) - μ_cpu = copy(m_cpu.μ) - m_cpu(x_cpu) - @test !(m_cpu.μ ≈ μ_cpu) - μ_cpu = copy(m_cpu.μ) - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - @test !(m_cpu.μ ≈ μ_cpu) - - trainmode!(m_gpu) - μ_gpu = copy(m_gpu.μ) - m_gpu(x_gpu) - 
@test !(m_gpu.μ ≈ μ_gpu) - μ_gpu = copy(m_gpu.μ) - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) - @test !(m_gpu.μ ≈ μ_gpu) - - ## No errors if input type mistmatch - # x_cpu = rand(Float64, 3, 2, 2) - # x_gpu = x_cpu |> gpu - # m_cpu(x_cpu) - # gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - # m_gpu(x_gpu) - # gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + m_cpu = BatchNorm(2) + m_gpu = m_cpu |> gpu + x_cpu = rand(Float32, 3, 2, 2) + x_gpu = x_cpu |> gpu + + ## In :auto mode, track statistics only in gradient contest + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test m_cpu.μ ≈ μ_cpu + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test !(m_cpu.μ ≈ μ_cpu) + + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test m_gpu.μ ≈ μ_gpu + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test !(m_gpu.μ ≈ μ_gpu) + + @test Array(m_gpu.μ) ≈ m_cpu.μ + + ## In testmode, never track statistics + testmode!(m_cpu) + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test m_cpu.μ ≈ μ_cpu + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test m_cpu.μ ≈ μ_cpu + + testmode!(m_gpu) + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test m_gpu.μ ≈ μ_gpu + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test m_gpu.μ ≈ μ_gpu + + ## In trainmode, always track statistics + trainmode!(m_cpu) + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test !(m_cpu.μ ≈ μ_cpu) + μ_cpu = copy(m_cpu.μ) + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test !(m_cpu.μ ≈ μ_cpu) + + trainmode!(m_gpu) + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test !(m_gpu.μ ≈ μ_gpu) + μ_gpu = copy(m_gpu.μ) + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test !(m_gpu.μ ≈ μ_gpu) + + ## No errors if input type mistmatch + # x_cpu = rand(Float64, 3, 2, 2) + # x_gpu = x_cpu |> gpu + # m_cpu(x_cpu) + # gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + # m_gpu(x_gpu) + # gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) end @testset "Two-streams Bilinear" begin - x = zeros(Float32,10,9) |> gpu - y = zeros(Float32,2,9) |> gpu - b = Flux.Bilinear(10, 2, 3) |> gpu - @test size(b(x,y)) == (3,9) - @test sum(abs2, b(x,y)) ≈ 0f0 - gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) - b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu - gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) - for (pgpu, pcpu) in zip(params(b), params(b_cpu)) - @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) - end + x = zeros(Float32, 10, 9) |> gpu + y = zeros(Float32, 2, 9) |> gpu + b = Flux.Bilinear(10, 2, 3) |> gpu + @test size(b(x, y)) == (3, 9) + @test sum(abs2, b(x, y)) ≈ 0.0f0 + gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) + b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu + gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) + for (pgpu, pcpu) in zip(params(b), params(b_cpu)) + @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) + end end @testset "Two-streams Bilinear" begin - x = zeros(Float32,10,9) |> gpu - y = zeros(Float32,2,9) |> gpu - b = Flux.Bilinear(10, 2, 3) |> gpu - @test size(b(x,y)) == (3,9) - @test sum(abs2, b(x,y)) ≈ 0f0 - gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) - b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu - gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) - for (pgpu, pcpu) in zip(params(b), params(b_cpu)) - @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) - end + x = zeros(Float32, 10, 9) |> gpu + y = zeros(Float32, 2, 9) |> gpu + b = Flux.Bilinear(10, 2, 3) |> gpu + @test size(b(x, y)) == (3, 9) + @test sum(abs2, b(x, y)) ≈ 0.0f0 + 
gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) + b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu + gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) + for (pgpu, pcpu) in zip(params(b), params(b_cpu)) + @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) + end end @testset "Parallel" begin - @testset "zero sum" begin - input = randn(10, 10, 10, 10) |> gpu - layer_gpu = Parallel(+, zero, identity) |> gpu - @test layer_gpu(input) == input - @test layer_gpu(input) isa Flux.CUDA.CuArray - end - - @testset "vararg input" begin - inputs = (randn(10), randn(5), randn(4)) .|> gpu - layer = Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2)) |> gpu - @test size(layer(inputs)) == (2,) - end - - @testset "gradient" begin - input_cpu = randn(10, 10, 10, 10) - input_gpu = input_cpu |> gpu - layer_cpu = Parallel(+, x -> zero(x), identity) - layer_gpu = layer_cpu |> gpu - gs_cpu = gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu)) - gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu)) - for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu)) - @test gs_cpu[pcpu] ≈ gs_gpu[pgpu] + @testset "zero sum" begin + input = randn(10, 10, 10, 10) |> gpu + layer_gpu = Parallel(+, zero, identity) |> gpu + @test layer_gpu(input) == input + @test layer_gpu(input) isa Flux.CUDA.CuArray + end + + @testset "vararg input" begin + inputs = (randn(10), randn(5), randn(4)) .|> gpu + layer = Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2)) |> gpu + @test size(layer(inputs)) == (2,) + end + + @testset "gradient" begin + input_cpu = randn(10, 10, 10, 10) + input_gpu = input_cpu |> gpu + layer_cpu = Parallel(+, x -> zero(x), identity) + layer_gpu = layer_cpu |> gpu + gs_cpu = gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu)) + gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu)) + for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu)) + @test gs_cpu[pcpu] ≈ gs_gpu[pgpu] + end end - end end @testset "Dropout RNGs" begin - @test_throws ArgumentError Flux.dropout(MersenneTwister(), CUDA.rand(Float32, 2, 3), 0.1) - @testset for layer in (Dropout, AlphaDropout) - m = layer(0.1; rng = MersenneTwister(123)) - @test_throws ErrorException gpu(m) - m = layer(0.1; rng = CUDA.default_rng()) - @test gpu(m).rng isa CUDA.RNG - end + @test_throws ArgumentError Flux.dropout(MersenneTwister(), CUDA.rand(Float32, 2, 3), + 0.1) + @testset for layer in (Dropout, AlphaDropout) + m = layer(0.1; rng = MersenneTwister(123)) + @test_throws ErrorException gpu(m) + m = layer(0.1; rng = CUDA.default_rng()) + @test gpu(m).rng isa CUDA.RNG + end end diff --git a/test/cuda/losses.jl b/test/cuda/losses.jl index a0f7f47d80..6777146b3d 100644 --- a/test/cuda/losses.jl +++ b/test/cuda/losses.jl @@ -1,38 +1,37 @@ -using Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss - +using Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, + binary_focal_loss, focal_loss @testset "Losses" begin - -x = [1.,2.,3.] -cx = gpu(x) -@test crossentropy(x,x) ≈ crossentropy(cx,cx) -@test crossentropy(x,x, agg=identity) ≈ crossentropy(cx,cx, agg=identity) |> cpu -@test crossentropy(x,x, agg=x->mean([1.0;2.0;3.0].*x)) ≈ crossentropy(cx,cx, agg=x->mean(gpu([1.0;2.0;3.0]).*x)) - -x = [-1.1491, 0.8619, 0.3127] -y = [1, 1, 0.] 
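# --- Editor's illustrative sketch (not part of the patch) -----------------------------
# Related to the binarycrossentropy / logitbinarycrossentropy checks that follow:
# applying `logitbinarycrossentropy` to raw logits is the numerically stable equivalent
# of `binarycrossentropy` on the sigmoid of those logits (documented Flux behaviour).
# On the CPU, with the same values as the test data:
using Flux
using Flux.Losses: binarycrossentropy, logitbinarycrossentropy
logits  = [-1.1491, 0.8619, 0.3127]
targets = [1, 1, 0.0]
binarycrossentropy(σ.(logits), targets) ≈ logitbinarycrossentropy(logits, targets)  # true
# ---------------------------------------------------------------------------------------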
-@test binarycrossentropy(σ.(x), y) ≈ binarycrossentropy(gpu(σ.(x)), gpu(y)) -@test logitbinarycrossentropy(x, y) ≈ logitbinarycrossentropy(gpu(x), gpu(y)) - -x = [0.268941 0.5 0.268941 - 0.731059 0.5 0.731059] -y = [0 1 0 - 1 0 1] -@test binary_focal_loss(x, y) ≈ binary_focal_loss(gpu(x), gpu(y)) - -x = softmax(reshape(-7:7, 3, 5) .* 1f0) -y = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] -@test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y)) - -@testset "GPU grad tests" begin - x = rand(Float32, 3,3) - y = rand(Float32, 3,3) - - for loss in ALL_LOSSES - gpu_autodiff_test(loss, x, y) - end -end - + x = [1.0, 2.0, 3.0] + cx = gpu(x) + @test crossentropy(x, x) ≈ crossentropy(cx, cx) + @test crossentropy(x, x, agg = identity) ≈ crossentropy(cx, cx, agg = identity) |> cpu + @test crossentropy(x, x, agg = x -> mean([1.0; 2.0; 3.0] .* x)) ≈ + crossentropy(cx, cx, agg = x -> mean(gpu([1.0; 2.0; 3.0]) .* x)) + + x = [-1.1491, 0.8619, 0.3127] + y = [1, 1, 0.0] + @test binarycrossentropy(σ.(x), y) ≈ binarycrossentropy(gpu(σ.(x)), gpu(y)) + @test logitbinarycrossentropy(x, y) ≈ logitbinarycrossentropy(gpu(x), gpu(y)) + + x = [0.268941 0.5 0.268941 + 0.731059 0.5 0.731059] + y = [0 1 0 + 1 0 1] + @test binary_focal_loss(x, y) ≈ binary_focal_loss(gpu(x), gpu(y)) + + x = softmax(reshape(-7:7, 3, 5) .* 1.0f0) + y = [1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0] + @test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y)) + + @testset "GPU grad tests" begin + x = rand(Float32, 3, 3) + y = rand(Float32, 3, 3) + + for loss in ALL_LOSSES + gpu_autodiff_test(loss, x, y) + end + end end #testset diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl index ebd32b1ec0..ca44286e48 100644 --- a/test/cuda/runtests.jl +++ b/test/cuda/runtests.jl @@ -12,9 +12,9 @@ include("losses.jl") include("layers.jl") if CUDA.has_cudnn() - @info "Testing Flux/CUDNN" - include("cudnn.jl") - include("curnn.jl") + @info "Testing Flux/CUDNN" + include("cudnn.jl") + include("curnn.jl") else - @warn "CUDNN unavailable, not testing GPU DNN support" + @warn "CUDNN unavailable, not testing GPU DNN support" end diff --git a/test/cuda/test_utils.jl b/test/cuda/test_utils.jl index bc0db37474..466b08c8b9 100644 --- a/test/cuda/test_utils.jl +++ b/test/cuda/test_utils.jl @@ -1,72 +1,75 @@ function check_grad(g_gpu, g_cpu, atol, rtol) - @show g_gpu g_cpu - @test false + @show g_gpu g_cpu + @test false +end +function check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol, rtol) + return check_grad(g_gpu[], g_cpu[], atol, rtol) end -check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol, rtol) = - check_grad(g_gpu[], g_cpu[], atol, rtol) check_grad(g_gpu::Nothing, g_cpu::Nothing, atol, rtol) = @test true -check_grad(g_gpu::Float32, g_cpu::Float32, atol, rtol) = @test g_cpu ≈ g_gpu rtol=rtol atol=atol -check_grad(g_gpu::CuArray{Float32}, g_cpu::Array{Float32}, atol, rtol) = - @test g_cpu ≈ collect(g_gpu) rtol=rtol atol=atol +function check_grad(g_gpu::Float32, g_cpu::Float32, atol, rtol) + @test g_cpu≈g_gpu rtol=rtol atol=atol +end +function check_grad(g_gpu::CuArray{Float32}, g_cpu::Array{Float32}, atol, rtol) + @test g_cpu≈collect(g_gpu) rtol=rtol atol=atol +end function check_grad(g_gpu::Tuple, g_cpu::Tuple, atol, rtol) - for (v1, v2) in zip(g_gpu, g_cpu) - check_grad(v1, v2, atol, rtol) - end + for (v1, v2) in zip(g_gpu, g_cpu) + check_grad(v1, v2, atol, rtol) + end end function check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple, atol, rtol) - for ((k1,v1), (k2,v2)) in zip(pairs(g_gpu), pairs(g_cpu)) - @test k1 == k2 - # @show k2 v2 - check_grad(v1, v2, atol, 
rtol) - end + for ((k1, v1), (k2, v2)) in zip(pairs(g_gpu), pairs(g_cpu)) + @test k1 == k2 + # @show k2 v2 + check_grad(v1, v2, atol, rtol) + end end -function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; - test_equal=true, rtol=1e-4, atol=1e-4) - - check_type(x) = false - check_type(x::Float32) = true - check_type(x::CuArray{Float32}) = true - check_type(x::Array{Float32}) = true +function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; + test_equal = true, rtol = 1e-4, atol = 1e-4) + check_type(x) = false + check_type(x::Float32) = true + check_type(x::CuArray{Float32}) = true + check_type(x::Array{Float32}) = true - ### GRADIENT WITH RESPECT TO INPUT ##### - # y_cpu, back_cpu = pullback((f, x...) -> f(x...), f_cpu, xs_cpu...) - y_cpu, back_cpu = pullback((x...) -> f_cpu(x...), xs_cpu...) - @test check_type(y_cpu) - Δ_cpu = size(y_cpu) == () ? randn(Float32) : randn(Float32, size(y_cpu)) - gs_cpu = back_cpu(Δ_cpu) + ### GRADIENT WITH RESPECT TO INPUT ##### + # y_cpu, back_cpu = pullback((f, x...) -> f(x...), f_cpu, xs_cpu...) + y_cpu, back_cpu = pullback((x...) -> f_cpu(x...), xs_cpu...) + @test check_type(y_cpu) + Δ_cpu = size(y_cpu) == () ? randn(Float32) : randn(Float32, size(y_cpu)) + gs_cpu = back_cpu(Δ_cpu) - f_gpu = f_cpu |> gpu - xs_gpu = gpu.(xs_cpu) - Δ_gpu = Δ_cpu |> gpu - # y_gpu, back_gpu = pullback((f, x...) -> f(x...), f_gpu, xs_gpu...) - y_gpu, back_gpu = pullback((x...) -> f_gpu(x...), xs_gpu...) - @test check_type(y_gpu) - gs_gpu = back_gpu(Δ_gpu) + f_gpu = f_cpu |> gpu + xs_gpu = gpu.(xs_cpu) + Δ_gpu = Δ_cpu |> gpu + # y_gpu, back_gpu = pullback((f, x...) -> f(x...), f_gpu, xs_gpu...) + y_gpu, back_gpu = pullback((x...) -> f_gpu(x...), xs_gpu...) + @test check_type(y_gpu) + gs_gpu = back_gpu(Δ_gpu) - if test_equal - @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol - for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu) - check_grad(g_gpu, g_cpu, atol, rtol) + if test_equal + @test collect(y_cpu)≈collect(y_gpu) rtol=rtol atol=atol + for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu) + check_grad(g_gpu, g_cpu, atol, rtol) + end end - end - ### GRADIENT WITH RESPECT TO f ##### - ps_cpu = Flux.params(f_cpu) - y_cpu, back_cpu = pullback(() -> f_cpu(xs_cpu...), ps_cpu) - gs_cpu = back_cpu(Δ_cpu) + ### GRADIENT WITH RESPECT TO f ##### + ps_cpu = Flux.params(f_cpu) + y_cpu, back_cpu = pullback(() -> f_cpu(xs_cpu...), ps_cpu) + gs_cpu = back_cpu(Δ_cpu) + + ps_gpu = Flux.params(f_gpu) + y_gpu, back_gpu = pullback(() -> f_gpu(xs_gpu...), ps_gpu) + gs_gpu = back_gpu(Δ_gpu) - ps_gpu = Flux.params(f_gpu) - y_gpu, back_gpu = pullback(() -> f_gpu(xs_gpu...), ps_gpu) - gs_gpu = back_gpu(Δ_gpu) - - if test_equal - @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol - @assert length(ps_gpu) == length(ps_cpu) - for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu) - check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu], atol, rtol) + if test_equal + @test collect(y_cpu)≈collect(y_gpu) rtol=rtol atol=atol + @assert length(ps_gpu) == length(ps_cpu) + for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu) + check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu], atol, rtol) + end end - end end diff --git a/test/data.jl b/test/data.jl index 4e4c485064..08d3a5809c 100644 --- a/test/data.jl +++ b/test/data.jl @@ -4,35 +4,35 @@ using Random X = reshape([1:10;], (2, 5)) Y = [1:5;] - d = DataLoader(X, batchsize=2) + d = DataLoader(X, batchsize = 2) # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == typeof(X) @test eltype(batches) == typeof(X) @test length(batches) == 3 - @test batches[1] == X[:,1:2] - @test 
batches[2] == X[:,3:4] - @test batches[3] == X[:,5:5] + @test batches[1] == X[:, 1:2] + @test batches[2] == X[:, 3:4] + @test batches[3] == X[:, 5:5] - d = DataLoader(X, batchsize=2, partial=false) + d = DataLoader(X, batchsize = 2, partial = false) # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == typeof(X) @test eltype(batches) == typeof(X) @test length(batches) == 2 - @test batches[1] == X[:,1:2] - @test batches[2] == X[:,3:4] + @test batches[1] == X[:, 1:2] + @test batches[2] == X[:, 3:4] - d = DataLoader((X,), batchsize=2, partial=false) + d = DataLoader((X,), batchsize = 2, partial = false) # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == Tuple{typeof(X)} @test eltype(batches) == Tuple{typeof(X)} @test length(batches) == 2 - @test batches[1] == (X[:,1:2],) - @test batches[2] == (X[:,3:4],) + @test batches[1] == (X[:, 1:2],) + @test batches[2] == (X[:, 3:4],) - d = DataLoader((X, Y), batchsize=2) + d = DataLoader((X, Y), batchsize = 2) # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == Tuple{typeof(X), typeof(Y)} @@ -41,41 +41,41 @@ using Random @test length(batches[1]) == 2 @test length(batches[2]) == 2 @test length(batches[3]) == 2 - @test batches[1][1] == X[:,1:2] + @test batches[1][1] == X[:, 1:2] @test batches[1][2] == Y[1:2] - @test batches[2][1] == X[:,3:4] + @test batches[2][1] == X[:, 3:4] @test batches[2][2] == Y[3:4] - @test batches[3][1] == X[:,5:5] + @test batches[3][1] == X[:, 5:5] @test batches[3][2] == Y[5:5] # test with NamedTuple - d = DataLoader((x=X, y=Y), batchsize=2) + d = DataLoader((x = X, y = Y), batchsize = 2) # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}} - @test eltype(batches) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}} + @test eltype(batches) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}} @test length(batches) == 3 @test length(batches[1]) == 2 @test length(batches[2]) == 2 @test length(batches[3]) == 2 - @test batches[1][1] == batches[1].x == X[:,1:2] + @test batches[1][1] == batches[1].x == X[:, 1:2] @test batches[1][2] == batches[1].y == Y[1:2] - @test batches[2][1] == batches[2].x == X[:,3:4] + @test batches[2][1] == batches[2].x == X[:, 3:4] @test batches[2][2] == batches[2].y == Y[3:4] - @test batches[3][1] == batches[3].x == X[:,5:5] + @test batches[3][1] == batches[3].x == X[:, 5:5] @test batches[3][2] == batches[3].y == Y[5:5] # Don't mutate state https://github.com/FluxML/Flux.jl/issues/1227 - d = DataLoader([1:10;], shuffle=true) + d = DataLoader([1:10;], shuffle = true) cd = collect(zip(d, d)) # skip the first since it used to be different also before fixing the bug - @test [cd[i][1] for i=2:10] != [cd[i][2] for i=2:10] - + @test [cd[i][1] for i in 2:10] != [cd[i][2] for i in 2:10] + # test interaction with `train!` θ = ones(2) X = zeros(2, 10) - loss(x) = sum((x .- θ).^2) - d = DataLoader(X) + loss(x) = sum((x .- θ) .^ 2) + d = DataLoader(X) Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1)) @test norm(θ) < 1e-4 @@ -83,11 +83,13 @@ using Random θ = zeros(2) X = ones(2, 10) Y = fill(2, 10) - loss(x, y) = sum((y - x'*θ).^2) - d = DataLoader((X, Y)) + loss(x, y) = sum((y - x' * θ) .^ 2) + d = DataLoader((X, Y)) Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1)) @test norm(θ .- 1) < 1e-10 # specify the rng - d = map(identity, DataLoader(X, batchsize=2; shuffle=true, rng=Random.seed!(Random.default_rng(), 5))) + d = map(identity, + 
DataLoader(X, batchsize = 2; shuffle = true, + rng = Random.seed!(Random.default_rng(), 5))) end diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 1f9d30dec5..c915087650 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -2,380 +2,396 @@ using Test, Random import Flux: activations @testset "basic" begin - @testset "helpers" begin - @testset "activations" begin - dummy_model = Chain(x->x.^2, x->x .- 3, x -> tan.(x)) - x = randn(10) - @test activations(dummy_model, x)[1] == x.^2 - @test activations(dummy_model, x)[2] == (x.^2 .- 3) - @test activations(dummy_model, x)[3] == tan.(x.^2 .- 3) - - @test activations(Chain(), x) == () - @test activations(Chain(identity, x->:foo), x)[2] == :foo # results include `Any` type - end - end - - @testset "Chain" begin - @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10)) - @test_throws DimensionMismatch Chain(Dense(10, 5, σ),Dense(2, 1))(randn(10)) - # numeric test should be put into testset of corresponding layer - - @test_nowarn Chain(first = Dense(10, 5, σ), second = Dense(5, 2))(randn(10)) - m = Chain(first = Dense(10, 5, σ), second = Dense(5, 2)) - @test m[:first] == m[1] - @test m[1:2] == m - - @test m == m - @test m == fmap(identity, m) # does not forget names - - @test_throws ArgumentError Chain(layers = Dense(10, 10), two = identity) # reserved name - - @test_nowarn Chain([Dense(10, 5, σ), Dense(5, 2)])(randn(Float32, 10)) # vector of layers - - c = Chain(Dense(10, 5, σ), Dense(5, 2), Dense(2, 1, relu)) - @test c[1] == c[begin] - @test c[3] == c[end] - end - - @testset "Activations" begin - c = Chain(Dense(3,5,relu), Dense(5,1,relu)) - X = Float32.([1.0; 1.0; 1.0]) - @test_nowarn gradient(()->Flux.activations(c, X)[2][1], Flux.params(c)) - - c2 = Chain(enc = c[1], dec = c[2]) - @test Flux.activations(c, X) == Flux.activations(c2, X) - @test_nowarn gradient(()->Flux.activations(c2, X)[2][1], Flux.params(c2)) - end - - @testset "Dense" begin - @testset "constructors" begin - @test size(Dense(10, 100).weight) == (100, 10) - @test size(Dense(10, 100).bias) == (100,) - @test Dense(rand(100,10), rand(100)).σ == identity - @test Dense(rand(100,10)).σ == identity - - @test Dense(rand(100,10), false).σ == identity - @test Dense(rand(100,10), false, tanh).σ == tanh - @test Dense(rand(100,10), rand(100)).σ == identity - @test Dense(rand(Float16, 100,10), true).bias isa Vector{Float16} # creates matching type - @test_skip Dense(rand(Float16, 100,10), rand(100)).bias isa Vector{Float16} # converts to match - - @test Dense(3,4; init=Base.randn, bias=true).bias isa Vector{Float64} - @test_skip Dense(3,4; init=Base.randn, bias=[1,2,3,4]).bias isa Vector{Float64} - - @test_throws MethodError Dense(10, 10.5) - @test_throws MethodError Dense(10, 10.5, tanh) - @test_throws DimensionMismatch Dense(3,4; bias=rand(5)) - @test_throws DimensionMismatch Dense(rand(4,3), rand(5)) - @test_throws MethodError Dense(rand(5)) - @test_throws MethodError Dense(rand(5), rand(5)) - @test_throws MethodError Dense(rand(5), rand(5), tanh) - end - @testset "dimensions" begin - @test length(Dense(10, 5)(randn(10))) == 5 - @test_throws DimensionMismatch Dense(10, 5)(randn(1)) - @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting - @test_throws MethodError Dense(10, 5).(randn(10)) # avoid broadcasting - @test size(Dense(10, 5)(randn(10))) == (5,) - @test size(Dense(10, 5)(randn(10,2))) == (5,2) - @test size(Dense(10, 5)(randn(10,2,3))) == (5,2,3) - @test size(Dense(10, 5)(randn(10,2,3,4))) == (5,2,3,4) - @test_throws DimensionMismatch Dense(10, 
5)(randn(11,2,3)) - end - @testset "zeros" begin - @test Dense(10, 1, identity, init = ones)(ones(10,1)) == 10*ones(1, 1) - @test Dense(10, 1, identity, init = ones)(ones(10,2)) == 10*ones(1, 2) - @test Dense(10, 2, identity, init = ones)(ones(10,1)) == 10*ones(2, 1) - @test Dense(10, 2, identity, init = ones)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20] - @test Dense(10, 2, identity, init = ones, bias = false)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20] - end - end - - @testset "Scale" begin - @test length(Flux.Scale(10)(randn(10))) == 10 - @test length(Flux.Scale(10)(randn(1))) == 10 - @test length(Flux.Scale(10; bias = false)(randn(10))) == 10 - @test length(Flux.Scale(10, tanh)(randn(10))) == 10 - @test_throws DimensionMismatch Flux.Scale(10)(randn(2)) - - @test Flux.Scale(2)([1 2]) == [1 2; 1 2] - @test Flux.Scale(2)([1, 2]) == [1, 2] - @test Flux.Scale(2; init = randn)([1, 2]) != [1, 2] - @test Flux.Scale(2; bias = false)([1 2; 3 4]) == [1 2; 3 4] - @test Flux.Scale(2, abs2; bias = false, init = ones)([1 2; 3 4]) == [1 4; 9 16] - - @test Flux.Scale(2)(rand(2, 3, 4)) |> size == (2, 3, 4) - @test Flux.Scale(2, 3;)(rand(2, 3, 4)) |> size == (2, 3, 4) - @test Flux.Scale(2, 3, 4; bias = false)(rand(2, 3, 4)) |> size == (2, 3, 4) - @test Flux.Scale(2, 3; bias = false)(rand(2, 1, 4)) |> size == (2, 3, 4) - @test Flux.Scale(2, 3, tanh; bias = false, init = zeros)(rand(2, 1, 4)) == zeros(2, 3, 4) - - @test_throws MethodError Flux.Scale(1.) - @test_throws MethodError Flux.Scale(1., 2.) - @test_throws Exception Flux.Scale() - @test_throws MethodError Flux.Scale(sin) - end - - @testset "Maxout" begin - # Note that the normal common usage of Maxout is as per the docstring - # These are abnormal constructors used for testing purposes - - @testset "Constructor" begin - mo = Maxout(() -> identity, 4) - input = rand(40) - @test mo(input) == input - end + @testset "helpers" begin @testset "activations" begin + dummy_model = Chain(x -> x .^ 2, x -> x .- 3, x -> tan.(x)) + x = randn(10) + @test activations(dummy_model, x)[1] == x .^ 2 + @test activations(dummy_model, x)[2] == (x .^ 2 .- 3) + @test activations(dummy_model, x)[3] == tan.(x .^ 2 .- 3) - @testset "simple alternatives" begin - mo = Maxout(x -> x, x -> 2x, x -> 0.5x) - input = rand(40) - @test mo(input) == 2*input - end + @test activations(Chain(), x) == () + @test activations(Chain(identity, x -> :foo), x)[2] == :foo # results include `Any` type + end end - @testset "complex alternatives" begin - mo = Maxout(x -> [0.5; 0.1]*x, x -> [0.2; 0.7]*x) - input = [3.0 2.0] - target = [0.5, 0.7].*input - @test mo(input) == target - end + @testset "Chain" begin + @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10)) + @test_throws DimensionMismatch Chain(Dense(10, 5, σ), Dense(2, 1))(randn(10)) + # numeric test should be put into testset of corresponding layer - @testset "params" begin - mo = Maxout(()->Dense(32, 64), 4) - ps = Flux.params(mo) - @test length(ps) == 8 #4 alts, each with weight and bias - end - end + @test_nowarn Chain(first = Dense(10, 5, σ), second = Dense(5, 2))(randn(10)) + m = Chain(first = Dense(10, 5, σ), second = Dense(5, 2)) + @test m[:first] == m[1] + @test m[1:2] == m - @testset "SkipConnection" begin - @testset "zero sum" begin - input = randn(10, 10, 10, 10) - @test SkipConnection(x -> zeros(size(x)), (a,b) -> a + b)(input) == input - end + @test m == m + @test m == fmap(identity, m) # does not forget names - @testset "concat size" begin - input = randn(10, 2) - @test size(SkipConnection(Dense(10,10), (a,b) -> cat(a, 
b, dims = 2))(input)) == (10,4) - end - end - - @testset "Bilinear" begin - @testset "SkipConnection recombinator" begin - d = Dense(10, 10) - b = Flux.Bilinear(10, 10, 5) - x = randn(Float32,10,9) - sc = SkipConnection(d, b) - @test size(sc(x)) == (5,9) - end + @test_throws ArgumentError Chain(layers = Dense(10, 10), two = identity) # reserved name - @testset "Two-streams zero sum" begin - x = zeros(Float32,10,9) - y = zeros(Float32,2,9) - b = Flux.Bilinear(10, 2, 3) - @test size(b(x,y)) == (3,9) - @test sum(abs2, b(x,y)) == 0f0 - end + @test_nowarn Chain([Dense(10, 5, σ), Dense(5, 2)])(randn(Float32, 10)) # vector of layers - @testset "Inner interactions" begin - x = randn(Float32,11,7) - b = Flux.Bilinear(11, 11, 3) - @test size(b(x)) == (3,7) - @test_nowarn gs = gradient(() -> sum(abs2.(b(x))), params(b)) + c = Chain(Dense(10, 5, σ), Dense(5, 2), Dense(2, 1, relu)) + @test c[1] == c[begin] + @test c[3] == c[end] end - @testset "constructors" begin - b1 = Flux.Bilinear(randn(3,4,5)) - @test b1.bias isa Vector{Float64} - @test b1.σ == identity - - b2 = Flux.Bilinear(randn(3,4,5), false) - @test b2.bias === false - - b3 = Flux.Bilinear(randn(Float16, 3,4,5), true, tanh) - @test b3.σ == tanh - @test b3.bias isa Vector{Float16} - @test size(b3(rand(4), rand(5))) == (3,) + @testset "Activations" begin + c = Chain(Dense(3, 5, relu), Dense(5, 1, relu)) + X = Float32.([1.0; 1.0; 1.0]) + @test_nowarn gradient(() -> Flux.activations(c, X)[2][1], Flux.params(c)) - b4 = Flux.Bilinear(3,3,7; bias=1:7, init=Flux.zeros32) - @test_skip b4.bias isa Vector{Float32} - - @test_throws ArgumentError Flux.Bilinear(rand(3)) # expects a 3-array - @test_throws ArgumentError Flux.Bilinear(rand(3,4), false, tanh) - @test_throws DimensionMismatch Flux.Bilinear(rand(3,4,5), rand(6), tanh) # wrong length bias + c2 = Chain(enc = c[1], dec = c[2]) + @test Flux.activations(c, X) == Flux.activations(c2, X) + @test_nowarn gradient(() -> Flux.activations(c2, X)[2][1], Flux.params(c2)) end - end - @testset "Parallel" begin - @testset "zero sum" begin - input = randn(10, 10, 10, 10) - @test Parallel(+, x -> zeros(size(x)), identity)(input) == input + @testset "Dense" begin + @testset "constructors" begin + @test size(Dense(10, 100).weight) == (100, 10) + @test size(Dense(10, 100).bias) == (100,) + @test Dense(rand(100, 10), rand(100)).σ == identity + @test Dense(rand(100, 10)).σ == identity + + @test Dense(rand(100, 10), false).σ == identity + @test Dense(rand(100, 10), false, tanh).σ == tanh + @test Dense(rand(100, 10), rand(100)).σ == identity + @test Dense(rand(Float16, 100, 10), true).bias isa Vector{Float16} # creates matching type + @test_skip Dense(rand(Float16, 100, 10), rand(100)).bias isa Vector{Float16} # converts to match + + @test Dense(3, 4; init = Base.randn, bias = true).bias isa Vector{Float64} + @test_skip Dense(3, 4; init = Base.randn, bias = [1, 2, 3, 4]).bias isa + Vector{Float64} + + @test_throws MethodError Dense(10, 10.5) + @test_throws MethodError Dense(10, 10.5, tanh) + @test_throws DimensionMismatch Dense(3, 4; bias = rand(5)) + @test_throws DimensionMismatch Dense(rand(4, 3), rand(5)) + @test_throws MethodError Dense(rand(5)) + @test_throws MethodError Dense(rand(5), rand(5)) + @test_throws MethodError Dense(rand(5), rand(5), tanh) + end + @testset "dimensions" begin + @test length(Dense(10, 5)(randn(10))) == 5 + @test_throws DimensionMismatch Dense(10, 5)(randn(1)) + @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting + @test_throws MethodError Dense(10, 5).(randn(10)) # avoid 
broadcasting + @test size(Dense(10, 5)(randn(10))) == (5,) + @test size(Dense(10, 5)(randn(10, 2))) == (5, 2) + @test size(Dense(10, 5)(randn(10, 2, 3))) == (5, 2, 3) + @test size(Dense(10, 5)(randn(10, 2, 3, 4))) == (5, 2, 3, 4) + @test_throws DimensionMismatch Dense(10, 5)(randn(11, 2, 3)) + end + @testset "zeros" begin + @test Dense(10, 1, identity, init = ones)(ones(10, 1)) == 10 * ones(1, 1) + @test Dense(10, 1, identity, init = ones)(ones(10, 2)) == 10 * ones(1, 2) + @test Dense(10, 2, identity, init = ones)(ones(10, 1)) == 10 * ones(2, 1) + @test Dense(10, 2, identity, init = ones)([ones(10, 1) 2 * ones(10, 1)]) == + [10 20; 10 20] + @test Dense(10, 2, identity, init = ones, bias = false)([ones(10, 1) 2 * + ones(10, + 1)]) == + [10 20; 10 20] + end end - @testset "concat size" begin - input = randn(10, 2) - @test size(Parallel((a, b) -> cat(a, b; dims=2), Dense(10, 10), identity)(input)) == (10, 4) - @test size(Parallel(hcat, one = Dense(10, 10), two = identity)(input)) == (10, 4) + @testset "Scale" begin + @test length(Flux.Scale(10)(randn(10))) == 10 + @test length(Flux.Scale(10)(randn(1))) == 10 + @test length(Flux.Scale(10; bias = false)(randn(10))) == 10 + @test length(Flux.Scale(10, tanh)(randn(10))) == 10 + @test_throws DimensionMismatch Flux.Scale(10)(randn(2)) + + @test Flux.Scale(2)([1 2]) == [1 2; 1 2] + @test Flux.Scale(2)([1, 2]) == [1, 2] + @test Flux.Scale(2; init = randn)([1, 2]) != [1, 2] + @test Flux.Scale(2; bias = false)([1 2; 3 4]) == [1 2; 3 4] + @test Flux.Scale(2, abs2; bias = false, init = ones)([1 2; 3 4]) == [1 4; 9 16] + + @test Flux.Scale(2)(rand(2, 3, 4)) |> size == (2, 3, 4) + @test Flux.Scale(2, 3;)(rand(2, 3, 4)) |> size == (2, 3, 4) + @test Flux.Scale(2, 3, 4; bias = false)(rand(2, 3, 4)) |> size == (2, 3, 4) + @test Flux.Scale(2, 3; bias = false)(rand(2, 1, 4)) |> size == (2, 3, 4) + @test Flux.Scale(2, 3, tanh; bias = false, init = zeros)(rand(2, 1, 4)) == + zeros(2, 3, 4) + + @test_throws MethodError Flux.Scale(1.0) + @test_throws MethodError Flux.Scale(1.0, 2.0) + @test_throws Exception Flux.Scale() + @test_throws MethodError Flux.Scale(sin) end - @testset "vararg input" begin - inputs = randn(10), randn(5), randn(4) - @test size(Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2))(inputs)) == (2,) - @test size(Parallel(+; a = Dense(10, 2), b = Dense(5, 2), c = Dense(4, 2))(inputs)) == (2,) - @test_throws ArgumentError Parallel(+, sin, cos)(1,2,3) # wrong number of inputs - @test Parallel(+, sin, cos)(pi/2) ≈ 1 + @testset "Maxout" begin + # Note that the normal common usage of Maxout is as per the docstring + # These are abnormal constructors used for testing purposes + + @testset "Constructor" begin + mo = Maxout(() -> identity, 4) + input = rand(40) + @test mo(input) == input + end + + @testset "simple alternatives" begin + mo = Maxout(x -> x, x -> 2x, x -> 0.5x) + input = rand(40) + @test mo(input) == 2 * input + end + + @testset "complex alternatives" begin + mo = Maxout(x -> [0.5; 0.1] * x, x -> [0.2; 0.7] * x) + input = [3.0 2.0] + target = [0.5, 0.7] .* input + @test mo(input) == target + end + + @testset "params" begin + mo = Maxout(() -> Dense(32, 64), 4) + ps = Flux.params(mo) + @test length(ps) == 8 #4 alts, each with weight and bias + end end - @testset "named access" begin - m = Parallel(hcat, one = Dense(10, 10), two = identity) - @test m[1] == m[:one] - @test m[1:2] == m - - @test_throws ArgumentError Parallel(hcat, layers = Dense(10, 10), two = identity) # reserved names - @test_throws ArgumentError Parallel(hcat, connection = 
Dense(10, 10), two = identity) - - @test m == fmap(identity, m) # does not forget names - - @test Parallel(vcat, x = log)(1) == [0] - @test Parallel(vcat, log)(1) == [0] + @testset "SkipConnection" begin + @testset "zero sum" begin + input = randn(10, 10, 10, 10) + @test SkipConnection(x -> zeros(size(x)), (a, b) -> a + b)(input) == input + end + + @testset "concat size" begin + input = randn(10, 2) + @test size(SkipConnection(Dense(10, 10), (a, b) -> cat(a, b, dims = 2))(input)) == + (10, 4) + end end - @testset "trivial cases" begin - @test Parallel(hcat) isa Parallel{typeof(hcat), Tuple{}} # not a NamedTuple - @test Parallel(hcat)(1) == hcat() - @test Parallel(hcat, inv)(2) == hcat(1/2) # still calls connection once. + @testset "Bilinear" begin + @testset "SkipConnection recombinator" begin + d = Dense(10, 10) + b = Flux.Bilinear(10, 10, 5) + x = randn(Float32, 10, 9) + sc = SkipConnection(d, b) + @test size(sc(x)) == (5, 9) + end + + @testset "Two-streams zero sum" begin + x = zeros(Float32, 10, 9) + y = zeros(Float32, 2, 9) + b = Flux.Bilinear(10, 2, 3) + @test size(b(x, y)) == (3, 9) + @test sum(abs2, b(x, y)) == 0.0f0 + end + + @testset "Inner interactions" begin + x = randn(Float32, 11, 7) + b = Flux.Bilinear(11, 11, 3) + @test size(b(x)) == (3, 7) + @test_nowarn gs = gradient(() -> sum(abs2.(b(x))), params(b)) + end + + @testset "constructors" begin + b1 = Flux.Bilinear(randn(3, 4, 5)) + @test b1.bias isa Vector{Float64} + @test b1.σ == identity + + b2 = Flux.Bilinear(randn(3, 4, 5), false) + @test b2.bias === false + + b3 = Flux.Bilinear(randn(Float16, 3, 4, 5), true, tanh) + @test b3.σ == tanh + @test b3.bias isa Vector{Float16} + @test size(b3(rand(4), rand(5))) == (3,) + + b4 = Flux.Bilinear(3, 3, 7; bias = 1:7, init = Flux.zeros32) + @test_skip b4.bias isa Vector{Float32} + + @test_throws ArgumentError Flux.Bilinear(rand(3)) # expects a 3-array + @test_throws ArgumentError Flux.Bilinear(rand(3, 4), false, tanh) + @test_throws DimensionMismatch Flux.Bilinear(rand(3, 4, 5), rand(6), tanh) # wrong length bias + end end - @testset "connection is called once" begin - CNT = Ref(0) - f_cnt = (x...) 
-> (CNT[]+=1; +(x...)) - Parallel(f_cnt, sin, cos, tan)(1) - @test CNT[] == 1 - Parallel(f_cnt, sin, cos, tan)(1,2,3) - @test CNT[] == 2 - Parallel(f_cnt, sin)(1) - @test CNT[] == 3 + @testset "Parallel" begin + @testset "zero sum" begin + input = randn(10, 10, 10, 10) + @test Parallel(+, x -> zeros(size(x)), identity)(input) == input + end + + @testset "concat size" begin + input = randn(10, 2) + @test size(Parallel((a, b) -> cat(a, b; dims = 2), Dense(10, 10), identity)(input)) == + (10, 4) + @test size(Parallel(hcat, one = Dense(10, 10), two = identity)(input)) == + (10, 4) + end + + @testset "vararg input" begin + inputs = randn(10), randn(5), randn(4) + @test size(Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2))(inputs)) == (2,) + @test size(Parallel(+; a = Dense(10, 2), b = Dense(5, 2), c = Dense(4, 2))(inputs)) == + (2,) + @test_throws ArgumentError Parallel(+, sin, cos)(1, 2, 3) # wrong number of inputs + @test Parallel(+, sin, cos)(pi / 2) ≈ 1 + end + + @testset "named access" begin + m = Parallel(hcat, one = Dense(10, 10), two = identity) + @test m[1] == m[:one] + @test m[1:2] == m + + @test_throws ArgumentError Parallel(hcat, layers = Dense(10, 10), + two = identity) # reserved names + @test_throws ArgumentError Parallel(hcat, connection = Dense(10, 10), + two = identity) + + @test m == fmap(identity, m) # does not forget names + + @test Parallel(vcat, x = log)(1) == [0] + @test Parallel(vcat, log)(1) == [0] + end + + @testset "trivial cases" begin + @test Parallel(hcat) isa Parallel{typeof(hcat), Tuple{}} # not a NamedTuple + @test Parallel(hcat)(1) == hcat() + @test Parallel(hcat, inv)(2) == hcat(1 / 2) # still calls connection once. + end + + @testset "connection is called once" begin + CNT = Ref(0) + f_cnt = (x...) -> (CNT[] += 1; +(x...)) + Parallel(f_cnt, sin, cos, tan)(1) + @test CNT[] == 1 + Parallel(f_cnt, sin, cos, tan)(1, 2, 3) + @test CNT[] == 2 + Parallel(f_cnt, sin)(1) + @test CNT[] == 3 + end + + # Ref https://github.com/FluxML/Flux.jl/issues/1673 + @testset "Input domain" begin + struct Input + x::Any + end + + struct L1 + x::Any + end + (l::L1)(x) = l.x * x + Flux.@functor L1 + Base.:*(a::AbstractArray, b::Input) = a * b.x + + par = Parallel(+, L1(rand(Float32, 3, 3)), L1(rand(Float32, 3, 3))) + ip = Input(rand(Float32, 3, 3)) + ip2 = Input(rand(Float32, 3, 3)) + + @test par(ip) ≈ par.layers[1](ip.x) + par.layers[2](ip.x) + @test par(ip, ip2) ≈ par.layers[1](ip.x) + par.layers[2](ip2.x) + gs = gradient((par, x...) -> sum(par(x...)), par, ip, ip2) + gs_reg = gradient(par, ip, ip2) do par, x, y + return sum(par.layers[1](x.x) + par.layers[2](y.x)) + end + + for (a, b) in zip(gs[1].layers, gs_reg[1].layers) + @test a.x ≈ b.x + end + @test gs[2].x ≈ gs_reg[2].x + @test gs[3].x ≈ gs_reg[3].x + end end - # Ref https://github.com/FluxML/Flux.jl/issues/1673 - @testset "Input domain" begin - struct Input - x - end - - struct L1 - x - end - (l::L1)(x) = l.x * x - Flux.@functor L1 - Base.:*(a::AbstractArray, b::Input) = a * b.x - - par = Parallel(+, L1(rand(Float32, 3,3)), L1(rand(Float32, 3,3))) - ip = Input(rand(Float32, 3,3)) - ip2 = Input(rand(Float32, 3,3)) - - @test par(ip) ≈ par.layers[1](ip.x) + par.layers[2](ip.x) - @test par(ip, ip2) ≈ par.layers[1](ip.x) + par.layers[2](ip2.x) - gs = gradient((par, x...) 
-> sum(par(x...)), par, ip, ip2) - gs_reg = gradient(par, ip, ip2) do par, x, y - sum(par.layers[1](x.x) + par.layers[2](y.x)) - end - - for (a,b) in zip(gs[1].layers, gs_reg[1].layers) - @test a.x ≈ b.x - end - @test gs[2].x ≈ gs_reg[2].x - @test gs[3].x ≈ gs_reg[3].x + @testset "Embedding" begin + vocab_size, embed_size = 10, 4 + m = Embedding(vocab_size, embed_size) + @test size(m.weight) == (embed_size, vocab_size) + + # one index + @test m(1) isa Vector{Float32} + @test m(2) ≈ m.weight[:, 2] + @test m(OneHotVector(3, vocab_size)) ≈ m.weight[:, 3] + @test_throws DimensionMismatch m(OneHotVector(3, 1000)) + @test m(4) ≈ m((1:vocab_size) .== 4) + + # a batch of indices + x = rand(1:vocab_size, 3) + y = m(x) + @test y isa Matrix{Float32} + @test y ≈ m.weight[:, x] + x2 = OneHotMatrix(x, vocab_size) + y2 = m(x2) + @test y2 isa Matrix{Float32} + @test y2 ≈ y + @test_throws DimensionMismatch m(OneHotMatrix(x, 1000)) + @test y ≈ m(x' .== (1:vocab_size)) + + # more dimensions via reshape + x = rand(1:vocab_size, 3, 4) + y = m(x) + @test y isa Array{Float32, 3} + @test size(y) == (embed_size, 3, 4) + x3 = onehotbatch(x, 1:1:vocab_size) + @test size(x3) == (vocab_size, 3, 4) + y3 = m(x3) + @test size(y3) == (embed_size, 3, 4) end - end - - @testset "Embedding" begin - vocab_size, embed_size = 10, 4 - m = Embedding(vocab_size, embed_size) - @test size(m.weight) == (embed_size, vocab_size) - - # one index - @test m(1) isa Vector{Float32} - @test m(2) ≈ m.weight[:,2] - @test m(OneHotVector(3, vocab_size)) ≈ m.weight[:,3] - @test_throws DimensionMismatch m(OneHotVector(3, 1000)) - @test m(4) ≈ m((1:vocab_size) .== 4) - - # a batch of indices - x = rand(1:vocab_size, 3) - y = m(x) - @test y isa Matrix{Float32} - @test y ≈ m.weight[:,x] - x2 = OneHotMatrix(x, vocab_size) - y2 = m(x2) - @test y2 isa Matrix{Float32} - @test y2 ≈ y - @test_throws DimensionMismatch m(OneHotMatrix(x, 1000)) - @test y ≈ m(x' .== (1:vocab_size)) - - # more dimensions via reshape - x = rand(1:vocab_size, 3, 4) - y = m(x) - @test y isa Array{Float32, 3} - @test size(y) == (embed_size, 3, 4) - x3 = onehotbatch(x, 1:1:vocab_size) - @test size(x3) == (vocab_size, 3, 4) - y3 = m(x3) - @test size(y3) == (embed_size, 3, 4) - end end @testset "second derivatives" begin - m1 = Chain(Dense(3,4,tanh; bias=false), Dense(4,2)) - @test Zygote.hessian_dual(sum∘m1, [1,2,3]) ≈ Zygote.hessian_reverse(sum∘m1, [1,2,3]) - - m1v = Chain([m1[1], m1[2]]) # vector of layers - @test Zygote.hessian_dual(sum∘m1v, [1,2,3]) ≈ Zygote.hessian_dual(sum∘m1, [1,2,3]) - @test_broken Zygote.hessian_dual(sum∘m1v, [1,2,3]) ≈ Zygote.hessian_reverse(sum∘m1v, [1,2,3]) - - # NNlib's softmax gradient writes in-place - m2 = Chain(Dense(3,4,tanh), Dense(4,2), softmax) - @test_broken Zygote.hessian_dual(sum∘m2, [1,2,3]) ≈ Zygote.hessian_reverse(sum∘m2, [1,2,3]) - - # https://github.com/FluxML/NNlib.jl/issues/362 - m3 = Chain(Conv((3,), 2 => 3, relu), Dense(2,2)) - x3 = cat(Float32[1 2; 3 4; 5 6; 7 8]; dims=3) - @test Zygote.hessian_dual(sum∘m3, x3) ≈ Zygote.hessian_reverse(sum∘m3, x3) + m1 = Chain(Dense(3, 4, tanh; bias = false), Dense(4, 2)) + @test Zygote.hessian_dual(sum ∘ m1, [1, 2, 3]) ≈ + Zygote.hessian_reverse(sum ∘ m1, [1, 2, 3]) + + m1v = Chain([m1[1], m1[2]]) # vector of layers + @test Zygote.hessian_dual(sum ∘ m1v, [1, 2, 3]) ≈ + Zygote.hessian_dual(sum ∘ m1, [1, 2, 3]) + @test_broken Zygote.hessian_dual(sum ∘ m1v, [1, 2, 3]) ≈ + Zygote.hessian_reverse(sum ∘ m1v, [1, 2, 3]) + + # NNlib's softmax gradient writes in-place + m2 = Chain(Dense(3, 4, tanh), Dense(4, 
2), softmax) + @test_broken Zygote.hessian_dual(sum ∘ m2, [1, 2, 3]) ≈ + Zygote.hessian_reverse(sum ∘ m2, [1, 2, 3]) + + # https://github.com/FluxML/NNlib.jl/issues/362 + m3 = Chain(Conv((3,), 2 => 3, relu), Dense(2, 2)) + x3 = cat(Float32[1 2; 3 4; 5 6; 7 8]; dims = 3) + @test Zygote.hessian_dual(sum ∘ m3, x3) ≈ Zygote.hessian_reverse(sum ∘ m3, x3) end @testset "gradients of Chain{Vector}" begin - m1 = Chain(Dense(3,4,tanh; bias=false), Dense(4,2)) - m1v = Chain([m1[1], m1[2]]) - @test sum(length, params(m1)) == sum(length, params(m1v)) - - x1 = randn(Float32,3,5) - @test m1(x1) ≈ m1v(x1) - - y1 = rand(Bool,2,5) - g1 = gradient(() -> Flux.Losses.logitcrossentropy(m1(x1), y1), params(m1)) - g1v = gradient(() -> Flux.Losses.logitcrossentropy(m1v(x1), y1), params(m1v)) - @test g1[m1[1].weight] ≈ g1v[m1v[1].weight] - @test g1[m1[2].bias] ≈ g1v[m1v[2].bias] - - @test Flux.destructure(m1)[1] ≈ Flux.destructure(m1v)[1] - z1 = rand(22); - @test Flux.destructure(m1)[2](z1)[1].weight ≈ Flux.destructure(m1v)[2](z1)[1].weight - # Note that Flux.destructure(m1v)[2](z) has a Chain{Tuple}, as does m1v[1:2] + m1 = Chain(Dense(3, 4, tanh; bias = false), Dense(4, 2)) + m1v = Chain([m1[1], m1[2]]) + @test sum(length, params(m1)) == sum(length, params(m1v)) + + x1 = randn(Float32, 3, 5) + @test m1(x1) ≈ m1v(x1) + + y1 = rand(Bool, 2, 5) + g1 = gradient(() -> Flux.Losses.logitcrossentropy(m1(x1), y1), params(m1)) + g1v = gradient(() -> Flux.Losses.logitcrossentropy(m1v(x1), y1), params(m1v)) + @test g1[m1[1].weight] ≈ g1v[m1v[1].weight] + @test g1[m1[2].bias] ≈ g1v[m1v[2].bias] + + @test Flux.destructure(m1)[1] ≈ Flux.destructure(m1v)[1] + z1 = rand(22) + @test Flux.destructure(m1)[2](z1)[1].weight ≈ Flux.destructure(m1v)[2](z1)[1].weight + # Note that Flux.destructure(m1v)[2](z) has a Chain{Tuple}, as does m1v[1:2] end @testset "PairwiseFusion" begin - x = (rand(1, 10), rand(30, 10)) - layer = PairwiseFusion(+, Dense(1, 30), Dense(30, 10)) - y = layer(x) - @test length(y) == 2 - @test size(y[1]) == (30, 10) - @test size(y[2]) == (10, 10) - - x = rand(1, 10) - layer = PairwiseFusion(.+, Dense(1, 10), Dense(10, 1)) - y = layer(x) - @test length(y) == 2 - @test size(y[1]) == (10, 10) - @test size(y[2]) == (1, 10) - - @test PairwiseFusion(vcat, x->x.+1, x->x.+2, x->x.^3)(2, 10, 20) == (3, [5, 12], [125, 1728, 8000]) - @test PairwiseFusion(vcat, x->x.+1, x->x.+2, x->x.^3)(7) == (8, [10, 9], [1000, 729, 343]) + x = (rand(1, 10), rand(30, 10)) + layer = PairwiseFusion(+, Dense(1, 30), Dense(30, 10)) + y = layer(x) + @test length(y) == 2 + @test size(y[1]) == (30, 10) + @test size(y[2]) == (10, 10) + + x = rand(1, 10) + layer = PairwiseFusion(.+, Dense(1, 10), Dense(10, 1)) + y = layer(x) + @test length(y) == 2 + @test size(y[1]) == (10, 10) + @test size(y[2]) == (1, 10) + + @test PairwiseFusion(vcat, x -> x .+ 1, x -> x .+ 2, x -> x .^ 3)(2, 10, 20) == + (3, [5, 12], [125, 1728, 8000]) + @test PairwiseFusion(vcat, x -> x .+ 1, x -> x .+ 2, x -> x .^ 3)(7) == + (8, [10, 9], [1000, 729, 343]) end diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 019f3fd603..85ed0cb0f4 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -3,115 +3,114 @@ using Flux: maxpool, meanpool using Flux: gradient @testset "Pooling" begin - x = randn(Float32, 10, 10, 3, 2) - y = randn(Float32, 20, 20, 3, 2) - ampx = AdaptiveMaxPool((5,5)) - @test ampx(x) == maxpool(x, PoolDims(x, 2)) - ampx = AdaptiveMeanPool((5,5)) - @test ampx(x) == meanpool(x, PoolDims(x, 2)) - ampy = AdaptiveMaxPool((10, 5)) - @test ampy(y) == maxpool(y, 
PoolDims(y, (2, 4))) - ampy = AdaptiveMeanPool((10, 5)) - @test ampy(y) == meanpool(y, PoolDims(y, (2, 4))) - gmp = GlobalMaxPool() - @test size(gmp(x)) == (1, 1, 3, 2) - gmp = GlobalMeanPool() - @test size(gmp(x)) == (1, 1, 3, 2) - mp = MaxPool((2, 2)) - @test mp(x) == maxpool(x, PoolDims(x, 2)) - mp = MeanPool((2, 2)) - @test mp(x) == meanpool(x, PoolDims(x, 2)) + x = randn(Float32, 10, 10, 3, 2) + y = randn(Float32, 20, 20, 3, 2) + ampx = AdaptiveMaxPool((5, 5)) + @test ampx(x) == maxpool(x, PoolDims(x, 2)) + ampx = AdaptiveMeanPool((5, 5)) + @test ampx(x) == meanpool(x, PoolDims(x, 2)) + ampy = AdaptiveMaxPool((10, 5)) + @test ampy(y) == maxpool(y, PoolDims(y, (2, 4))) + ampy = AdaptiveMeanPool((10, 5)) + @test ampy(y) == meanpool(y, PoolDims(y, (2, 4))) + gmp = GlobalMaxPool() + @test size(gmp(x)) == (1, 1, 3, 2) + gmp = GlobalMeanPool() + @test size(gmp(x)) == (1, 1, 3, 2) + mp = MaxPool((2, 2)) + @test mp(x) == maxpool(x, PoolDims(x, 2)) + mp = MeanPool((2, 2)) + @test mp(x) == meanpool(x, PoolDims(x, 2)) end @testset "CNN" begin - r = zeros(Float32, 28, 28, 1, 5) - m = Chain( - Conv((2, 2), 1 => 16, relu), - MaxPool((2,2)), - Conv((2, 2), 16 => 8, relu), - MaxPool((2,2)), - x -> reshape(x, :, size(x, 4)), - Dense(288, 10), softmax) - - @test size(m(r)) == (10, 5) - - # Test bias switch - bias = Conv(ones(Float32, 2, 2, 1, 3), ones(Float32, 3)) - ip = zeros(Float32, 28,28,1,1) - - op = bias(ip) - @test sum(op) == prod(size(op)) - - @testset "No bias mapped through $lmap" for lmap in (identity, cpu, f32) - bias = Conv((2,2), 1=>3, bias = false) |> lmap + r = zeros(Float32, 28, 28, 1, 5) + m = Chain(Conv((2, 2), 1 => 16, relu), + MaxPool((2, 2)), + Conv((2, 2), 16 => 8, relu), + MaxPool((2, 2)), + x -> reshape(x, :, size(x, 4)), + Dense(288, 10), softmax) + + @test size(m(r)) == (10, 5) + + # Test bias switch + bias = Conv(ones(Float32, 2, 2, 1, 3), ones(Float32, 3)) + ip = zeros(Float32, 28, 28, 1, 1) + op = bias(ip) - @test sum(op) ≈ 0.f0 - gs = gradient(() -> sum(bias(ip)), Flux.params(bias)) - @test bias.bias ∉ gs.params - end - - # Train w/o bias and make sure no convergence happens - # when only bias can be converged - bias = Conv((2, 2), 1=>3, bias = false); - ip = zeros(Float32, 28,28,1,1) - op = zeros(Float32, 27,27,3,1) .+ 2.f0 - opt = Descent() - - for _ = 1:10^3 - gs = gradient(Flux.params(bias)) do - Flux.Losses.mse(bias(ip), op) + @test sum(op) == prod(size(op)) + + @testset "No bias mapped through $lmap" for lmap in (identity, cpu, f32) + bias = Conv((2, 2), 1 => 3, bias = false) |> lmap + op = bias(ip) + @test sum(op) ≈ 0.0f0 + gs = gradient(() -> sum(bias(ip)), Flux.params(bias)) + @test bias.bias ∉ gs.params end - Flux.Optimise.update!(opt, params(bias), gs) - end - @test Flux.Losses.mse(bias(ip), op) ≈ 4.f0 + # Train w/o bias and make sure no convergence happens + # when only bias can be converged + bias = Conv((2, 2), 1 => 3, bias = false) + ip = zeros(Float32, 28, 28, 1, 1) + op = zeros(Float32, 27, 27, 3, 1) .+ 2.0f0 + opt = Descent() - @testset "Grouped Conv" begin - ip = rand(Float32, 28, 100, 2) - c = Conv((3,), 100 => 25, groups = 5) - @test size(c.weight) == (3, 20, 25) - @test size(c(ip)) == (26, 25, 2) + for _ in 1:(10^3) + gs = gradient(Flux.params(bias)) do + return Flux.Losses.mse(bias(ip), op) + end + Flux.Optimise.update!(opt, params(bias), gs) + end - ip = rand(Float32, 28, 28, 100, 2) - c = Conv((3,3), 100 => 25, groups = 5) - @test size(c.weight) == (3, 3, 20, 25) - @test size(c(ip)) == (26, 26, 25, 2) + @test Flux.Losses.mse(bias(ip), op) ≈ 4.0f0 
- ip = rand(Float32, 10, 11, 12, 100, 2) - c = Conv((3,4,5), 100 => 25, groups = 5) - @test size(c.weight) == (3,4,5, 20, 25) - @test size(c(ip)) == (8,8,8, 25, 2) + @testset "Grouped Conv" begin + ip = rand(Float32, 28, 100, 2) + c = Conv((3,), 100 => 25, groups = 5) + @test size(c.weight) == (3, 20, 25) + @test size(c(ip)) == (26, 25, 2) - # Test that we cannot ask for non-integer multiplication factors - @test_throws AssertionError Conv((2, 2), 3=>10, groups=2) - @test_throws AssertionError Conv((2, 2), 2=>9, groups=2) - end + ip = rand(Float32, 28, 28, 100, 2) + c = Conv((3, 3), 100 => 25, groups = 5) + @test size(c.weight) == (3, 3, 20, 25) + @test size(c(ip)) == (26, 26, 25, 2) + + ip = rand(Float32, 10, 11, 12, 100, 2) + c = Conv((3, 4, 5), 100 => 25, groups = 5) + @test size(c.weight) == (3, 4, 5, 20, 25) + @test size(c(ip)) == (8, 8, 8, 25, 2) + + # Test that we cannot ask for non-integer multiplication factors + @test_throws AssertionError Conv((2, 2), 3 => 10, groups = 2) + @test_throws AssertionError Conv((2, 2), 2 => 9, groups = 2) + end end @testset "_channels_in, _channels_out" begin _channels_in = Flux._channels_in _channels_out = Flux._channels_out - @test _channels_in(Conv((3,) , 2=>4)) == 2 - @test _channels_in(Conv((5,6,) , 2=>4)) == 2 - @test _channels_in(Conv((1,2,3), 2=>4)) == 2 - @test _channels_out(Conv((3,) , 2=>4)) == 4 - @test _channels_out(Conv((5,6,) , 2=>4)) == 4 - @test _channels_out(Conv((1,2,3), 2=>4)) == 4 - - @test _channels_in( ConvTranspose((3,) , 1=>4)) == 1 - @test _channels_in( ConvTranspose((5,6,) , 2=>4)) == 2 - @test _channels_in( ConvTranspose((1,2,3), 3=>4)) == 3 - @test _channels_out(ConvTranspose((3,) , 2=>1)) == 1 - @test _channels_out(ConvTranspose((5,6,) , 2=>2)) == 2 - @test _channels_out(ConvTranspose((1,2,3), 2=>3)) == 3 - - @test _channels_in( ConvTranspose((6,) , 8=>4, groups=4)) == 8 - @test _channels_in( ConvTranspose((5,6,) , 2=>4, groups=2)) == 2 - @test _channels_in( ConvTranspose((1,2,3), 3=>6, groups=3)) == 3 - - @test _channels_out(ConvTranspose((1,) , 10=>15, groups=5)) == 15 - @test _channels_out(ConvTranspose((3,2) , 10=>15, groups=5)) == 15 - @test _channels_out(ConvTranspose((5,6,) , 2=>2, groups=2)) == 2 + @test _channels_in(Conv((3,), 2 => 4)) == 2 + @test _channels_in(Conv((5, 6), 2 => 4)) == 2 + @test _channels_in(Conv((1, 2, 3), 2 => 4)) == 2 + @test _channels_out(Conv((3,), 2 => 4)) == 4 + @test _channels_out(Conv((5, 6), 2 => 4)) == 4 + @test _channels_out(Conv((1, 2, 3), 2 => 4)) == 4 + + @test _channels_in(ConvTranspose((3,), 1 => 4)) == 1 + @test _channels_in(ConvTranspose((5, 6), 2 => 4)) == 2 + @test _channels_in(ConvTranspose((1, 2, 3), 3 => 4)) == 3 + @test _channels_out(ConvTranspose((3,), 2 => 1)) == 1 + @test _channels_out(ConvTranspose((5, 6), 2 => 2)) == 2 + @test _channels_out(ConvTranspose((1, 2, 3), 2 => 3)) == 3 + + @test _channels_in(ConvTranspose((6,), 8 => 4, groups = 4)) == 8 + @test _channels_in(ConvTranspose((5, 6), 2 => 4, groups = 2)) == 2 + @test _channels_in(ConvTranspose((1, 2, 3), 3 => 6, groups = 3)) == 3 + + @test _channels_out(ConvTranspose((1,), 10 => 15, groups = 5)) == 15 + @test _channels_out(ConvTranspose((3, 2), 10 => 15, groups = 5)) == 15 + @test _channels_out(ConvTranspose((5, 6), 2 => 2, groups = 2)) == 2 for Layer in [Conv, ConvTranspose] for _ in 1:10 @@ -119,170 +118,173 @@ end kernel_size = Tuple(rand(1:5) for _ in rand(1:3)) cin = rand(1:5) * groups cout = rand(1:5) * groups - @test _channels_in(Layer(kernel_size, cin=>cout; groups)) == cin - @test 
_channels_out(Layer(kernel_size, cin=>cout; groups)) == cout + @test _channels_in(Layer(kernel_size, cin => cout; groups)) == cin + @test _channels_out(Layer(kernel_size, cin => cout; groups)) == cout end end end @testset "asymmetric padding" begin - r = ones(Float32, 28, 28, 1, 1) - m = Conv((3, 3), 1=>1, relu; pad=(0,1,1,2)) - m.weight[:] .= 1.0 - m.bias[:] .= 0.0 - y_hat = m(r)[:,:,1,1] - @test size(y_hat) == (27, 29) - @test y_hat[1, 1] ≈ 6.0 - @test y_hat[2, 2] ≈ 9.0 - @test y_hat[end, 1] ≈ 4.0 - @test y_hat[1, end] ≈ 3.0 - @test y_hat[1, end-1] ≈ 6.0 - @test y_hat[end, end] ≈ 2.0 + r = ones(Float32, 28, 28, 1, 1) + m = Conv((3, 3), 1 => 1, relu; pad = (0, 1, 1, 2)) + m.weight[:] .= 1.0 + m.bias[:] .= 0.0 + y_hat = m(r)[:, :, 1, 1] + @test size(y_hat) == (27, 29) + @test y_hat[1, 1] ≈ 6.0 + @test y_hat[2, 2] ≈ 9.0 + @test y_hat[end, 1] ≈ 4.0 + @test y_hat[1, end] ≈ 3.0 + @test y_hat[1, end - 1] ≈ 6.0 + @test y_hat[end, end] ≈ 2.0 end @testset "Depthwise Conv" begin - r = zeros(Float32, 28, 28, 3, 5) - m1 = DepthwiseConv((2, 2), 3=>15) - @test size(m1(r), 3) == 15 + r = zeros(Float32, 28, 28, 3, 5) + m1 = DepthwiseConv((2, 2), 3 => 15) + @test size(m1(r), 3) == 15 - m2 = DepthwiseConv((2, 3), 3=>9) - @test size(m2(r), 3) == 9 + m2 = DepthwiseConv((2, 3), 3 => 9) + @test size(m2(r), 3) == 9 - m3 = DepthwiseConv((2, 3), 3=>9; bias=false) - @test size(m2(r), 3) == 9 + m3 = DepthwiseConv((2, 3), 3 => 9; bias = false) + @test size(m2(r), 3) == 9 - # Test that we cannot ask for non-integer multiplication factors - @test_throws AssertionError DepthwiseConv((2,2), 3=>10) + # Test that we cannot ask for non-integer multiplication factors + @test_throws AssertionError DepthwiseConv((2, 2), 3 => 10) end @testset "ConvTranspose" begin - x = zeros(Float32, 5, 5, 1, 1) - y = Conv((3,3), 1 => 1)(x) - x_hat1 = ConvTranspose((3, 3), 1 => 1)(y) - x_hat2 = ConvTranspose((3, 3), 1 => 1, bias=false)(y) - @test size(x_hat1) == size(x_hat2) == size(x) - - m = ConvTranspose((3,3), 1=>1) - # Test that the gradient call does not throw: #900 - @test gradient(()->sum(m(x)), Flux.params(m)) isa Flux.Zygote.Grads - - x = zeros(Float32, 5, 5, 2, 4) - m = ConvTranspose((3,3), 2=>3) - @test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads - - # test ConvTranspose supports groups argument - x = randn(Float32, 10, 10, 2, 3) - m1 = ConvTranspose((3,3), 2=>4, pad=SamePad()) - @test size(m1.weight) == (3,3,4,2) - @test size(m1(x)) == (10,10,4,3) - m2 = ConvTranspose((3,3), 2=>4, groups=2, pad=SamePad()) - @test size(m2.weight) == (3,3,2,2) - @test size(m1(x)) == size(m2(x)) - @test gradient(()->sum(m2(x)), params(m2)) isa Flux.Zygote.Grads - - x = randn(Float32, 10, 2,1) - m = ConvTranspose((3,), 2=>4, pad=SamePad(), groups=2) - @test size(m(x)) === (10,4,1) - @test length(m.weight) == (3)*(2*4) / 2 - - x = randn(Float32, 10, 11, 4,2) - m = ConvTranspose((3,5), 4=>4, pad=SamePad(), groups=4) - @test size(m(x)) === (10,11, 4,2) - @test length(m.weight) == (3*5)*(4*4)/4 - - x = randn(Float32, 10, 11, 12, 3,2) - m = ConvTranspose((3,5,3), 3=>6, pad=SamePad(), groups=3) - @test size(m(x)) === (10,11, 12, 6,2) - @test length(m.weight) == (3*5*3) * (3*6) / 3 - - @test occursin("groups=2", sprint(show, ConvTranspose((3,3), 2=>4, groups=2))) - @test occursin("2 => 4" , sprint(show, ConvTranspose((3,3), 2=>4, groups=2))) + x = zeros(Float32, 5, 5, 1, 1) + y = Conv((3, 3), 1 => 1)(x) + x_hat1 = ConvTranspose((3, 3), 1 => 1)(y) + x_hat2 = ConvTranspose((3, 3), 1 => 1, bias = false)(y) + @test size(x_hat1) == size(x_hat2) == 
size(x) + + m = ConvTranspose((3, 3), 1 => 1) + # Test that the gradient call does not throw: #900 + @test gradient(() -> sum(m(x)), Flux.params(m)) isa Flux.Zygote.Grads + + x = zeros(Float32, 5, 5, 2, 4) + m = ConvTranspose((3, 3), 2 => 3) + @test gradient(() -> sum(m(x)), params(m)) isa Flux.Zygote.Grads + + # test ConvTranspose supports groups argument + x = randn(Float32, 10, 10, 2, 3) + m1 = ConvTranspose((3, 3), 2 => 4, pad = SamePad()) + @test size(m1.weight) == (3, 3, 4, 2) + @test size(m1(x)) == (10, 10, 4, 3) + m2 = ConvTranspose((3, 3), 2 => 4, groups = 2, pad = SamePad()) + @test size(m2.weight) == (3, 3, 2, 2) + @test size(m1(x)) == size(m2(x)) + @test gradient(() -> sum(m2(x)), params(m2)) isa Flux.Zygote.Grads + + x = randn(Float32, 10, 2, 1) + m = ConvTranspose((3,), 2 => 4, pad = SamePad(), groups = 2) + @test size(m(x)) === (10, 4, 1) + @test length(m.weight) == (3) * (2 * 4) / 2 + + x = randn(Float32, 10, 11, 4, 2) + m = ConvTranspose((3, 5), 4 => 4, pad = SamePad(), groups = 4) + @test size(m(x)) === (10, 11, 4, 2) + @test length(m.weight) == (3 * 5) * (4 * 4) / 4 + + x = randn(Float32, 10, 11, 12, 3, 2) + m = ConvTranspose((3, 5, 3), 3 => 6, pad = SamePad(), groups = 3) + @test size(m(x)) === (10, 11, 12, 6, 2) + @test length(m.weight) == (3 * 5 * 3) * (3 * 6) / 3 + + @test occursin("groups=2", sprint(show, ConvTranspose((3, 3), 2 => 4, groups = 2))) + @test occursin("2 => 4", sprint(show, ConvTranspose((3, 3), 2 => 4, groups = 2))) end @testset "CrossCor" begin - x = rand(Float32, 28, 28, 1, 1) - w = rand(Float32, 2,2,1,1) - y = CrossCor(w, [0.0]) - - @test sum(w .* x[1:2, 1:2, :, :]) ≈ y(x)[1, 1, 1, 1] rtol=2e-7 - - r = zeros(Float32, 28, 28, 1, 5) - m = Chain( - CrossCor((2, 2), 1=>16, relu), - MaxPool((2,2)), - CrossCor((2, 2), 16=>8, relu; bias=false), - MaxPool((2,2)), - x -> reshape(x, :, size(x, 4)), - Dense(288, 10), softmax) - - @test size(m(r)) == (10, 5) - @test y(x) != Conv(w, [0.0])(x) - @test CrossCor(w[end:-1:1, end:-1:1, :, :], [0.0])(x) ≈ Conv(w, [0.0])(x) rtol=1e-7 + x = rand(Float32, 28, 28, 1, 1) + w = rand(Float32, 2, 2, 1, 1) + y = CrossCor(w, [0.0]) + + @test sum(w .* x[1:2, 1:2, :, :])≈y(x)[1, 1, 1, 1] rtol=2e-7 + + r = zeros(Float32, 28, 28, 1, 5) + m = Chain(CrossCor((2, 2), 1 => 16, relu), + MaxPool((2, 2)), + CrossCor((2, 2), 16 => 8, relu; bias = false), + MaxPool((2, 2)), + x -> reshape(x, :, size(x, 4)), + Dense(288, 10), softmax) + + @test size(m(r)) == (10, 5) + @test y(x) != Conv(w, [0.0])(x) + @test CrossCor(w[end:-1:1, end:-1:1, :, :], [0.0])(x)≈Conv(w, [0.0])(x) rtol=1e-7 end @testset "Conv with non quadratic window #700" begin - data = zeros(Float32, 7,7,1,1) - data[4,4,1,1] = 1 - - l = Conv((3,3), 1=>1) - expected = zeros(eltype(l.weight),5,5,1,1) - expected[2:end-1,2:end-1,1,1] = l.weight - @test expected ≈ l(data) - - l = Conv((3,1), 1=>1) - expected = zeros(eltype(l.weight),5,7,1,1) - expected[2:end-1,4,1,1] = l.weight - @test expected ≈ l(data) - - l = Conv((1,3), 1=>1) - expected = zeros(eltype(l.weight),7,5,1,1) - expected[4,2:end-1,1,1] = l.weight - @test expected ≈ l(data) - - @test begin - # we test that the next expression does not throw - randn(Float32, 10,10,1,1) |> Conv((6,1), 1=>1, Flux.σ) - true - end + data = zeros(Float32, 7, 7, 1, 1) + data[4, 4, 1, 1] = 1 + + l = Conv((3, 3), 1 => 1) + expected = zeros(eltype(l.weight), 5, 5, 1, 1) + expected[2:(end - 1), 2:(end - 1), 1, 1] = l.weight + @test expected ≈ l(data) + + l = Conv((3, 1), 1 => 1) + expected = zeros(eltype(l.weight), 5, 7, 1, 1) + expected[2:(end - 
1), 4, 1, 1] = l.weight + @test expected ≈ l(data) + + l = Conv((1, 3), 1 => 1) + expected = zeros(eltype(l.weight), 7, 5, 1, 1) + expected[4, 2:(end - 1), 1, 1] = l.weight + @test expected ≈ l(data) + + @test begin + # we test that the next expression does not throw + randn(Float32, 10, 10, 1, 1) |> Conv((6, 1), 1 => 1, Flux.σ) + true + end end -@testset "$ltype SamePad kernelsize $k" for ltype in (Conv, ConvTranspose, DepthwiseConv, CrossCor), k in ( (1,), (2,), (3,), (4,5), (6,7,8)) - data = ones(Float32, (k .+ 3)..., 1,1) - l = ltype(k, 1=>1, pad=SamePad()) - @test size(l(data)) == size(data) - - l = ltype(k, 1=>1, pad=SamePad(), dilation = k .÷ 2) - @test size(l(data)) == size(data) - - stride = 3 - l = ltype(k, 1=>1, pad=SamePad(), stride = stride) - if ltype == ConvTranspose - @test size(l(data))[1:end-2] == stride .* size(data)[1:end-2] - else - @test size(l(data))[1:end-2] == cld.(size(data)[1:end-2], stride) - end +@testset "$ltype SamePad kernelsize $k" for ltype in (Conv, ConvTranspose, DepthwiseConv, + CrossCor), + k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) + + data = ones(Float32, (k .+ 3)..., 1, 1) + l = ltype(k, 1 => 1, pad = SamePad()) + @test size(l(data)) == size(data) + + l = ltype(k, 1 => 1, pad = SamePad(), dilation = k .÷ 2) + @test size(l(data)) == size(data) + + stride = 3 + l = ltype(k, 1 => 1, pad = SamePad(), stride = stride) + if ltype == ConvTranspose + @test size(l(data))[1:(end - 2)] == stride .* size(data)[1:(end - 2)] + else + @test size(l(data))[1:(end - 2)] == cld.(size(data)[1:(end - 2)], stride) + end end -@testset "$ltype SamePad windowsize $k" for ltype in (MeanPool, MaxPool), k in ( (1,), (2,), (3,), (4,5), (6,7,8)) - data = ones(Float32, (k .+ 3)..., 1,1) +@testset "$ltype SamePad windowsize $k" for ltype in (MeanPool, MaxPool), + k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) + + data = ones(Float32, (k .+ 3)..., 1, 1) - l = ltype(k, pad=SamePad()) - @test size(l(data))[1:end-2] == cld.(size(data)[1:end-2], k) + l = ltype(k, pad = SamePad()) + @test size(l(data))[1:(end - 2)] == cld.(size(data)[1:(end - 2)], k) end @testset "bugs fixed" begin - # https://github.com/FluxML/Flux.jl/issues/1421 - @test Conv((5, 5), 10 => 20, identity; init = Base.randn).bias isa Vector{Float64} -end +# https://github.com/FluxML/Flux.jl/issues/1421 +@test Conv((5, 5), 10 => 20, identity; init = Base.randn).bias isa Vector{Float64} end @testset "constructors: $fun" for fun in [Conv, CrossCor, ConvTranspose, DepthwiseConv] - @test fun(rand(2,3,4)).bias isa Vector{Float64} - @test fun(rand(2,3,4,5), false).bias === false - if fun == Conv - @test fun(rand(2,3,4,5,6), rand(6)).bias isa Vector{Float64} - @test_skip fun(rand(2,3,4,5,6), 1:6).bias isa Vector{Float64} - elseif fun == DepthwiseConv - @test fun(rand(2,3,4,5,6), rand(30)).bias isa Vector{Float64} - end - @test_throws DimensionMismatch fun(rand(2,3,4), rand(6)) + @test fun(rand(2, 3, 4)).bias isa Vector{Float64} + @test fun(rand(2, 3, 4, 5), false).bias === false + if fun == Conv + @test fun(rand(2, 3, 4, 5, 6), rand(6)).bias isa Vector{Float64} + @test_skip fun(rand(2, 3, 4, 5, 6), 1:6).bias isa Vector{Float64} + elseif fun == DepthwiseConv + @test fun(rand(2, 3, 4, 5, 6), rand(30)).bias isa Vector{Float64} + end + @test_throws DimensionMismatch fun(rand(2, 3, 4), rand(6)) end diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 7ae15aeff9..7a42c5b8e4 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -3,14 +3,13 @@ using Zygote: pullback evalwgrad(f, x...) 
= pullback(f, x...)[1] -@testset "Dropout" begin - @testset for rng_kwargs in ((), (; rng = MersenneTwister())) - x = [1.0+0im,2.0+1im,3.0+3im] +@testset "Dropout" begin @testset for rng_kwargs in ((), (; rng = MersenneTwister())) + x = [1.0 + 0im, 2.0 + 1im, 3.0 + 3im] @test x == Dropout(0.1; rng_kwargs...)(x) @test x == evalwgrad(Dropout(0; rng_kwargs...), x) @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) - x = [1.,2.,3.] + x = [1.0, 2.0, 3.0] @test x == Dropout(0.1; rng_kwargs...)(x) @test x == evalwgrad(Dropout(0; rng_kwargs...), x) @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) @@ -18,31 +17,31 @@ evalwgrad(f, x...) = pullback(f, x...)[1] x = rand(100) m = Dropout(0.9; rng_kwargs...) y = evalwgrad(m, x) - @test count(a->a==0, y) > 50 + @test count(a -> a == 0, y) > 50 testmode!(m, true) y = evalwgrad(m, x) # should override istraining - @test count(a->a==0, y) == 0 + @test count(a -> a == 0, y) == 0 testmode!(m, false) y = evalwgrad(m, x) - @test count(a->a==0, y) > 50 + @test count(a -> a == 0, y) > 50 x = rand(Float32, 100) - m = Chain(Dense(100,100), + m = Chain(Dense(100, 100), Dropout(0.9; rng_kwargs...)) y = evalwgrad(m, x) - @test count(a->a == 0, y) > 50 + @test count(a -> a == 0, y) > 50 testmode!(m, true) y = evalwgrad(m, x) # should override istraining - @test count(a->a == 0, y) == 0 + @test count(a -> a == 0, y) == 0 x = rand(100, 50) m = Dropout(0.5; dims = 2, rng_kwargs...) y = m(x) - c = map(i->count(a->a==0, @view y[i, :]), 1:100) + c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100) @test minimum(c) == maximum(c) m = Dropout(0.5; dims = 1, rng_kwargs...) y = m(x) - c = map(i->count(a->a==0, @view y[:, i]), 1:50) + c = map(i -> count(a -> a == 0, @view y[:, i]), 1:50) @test minimum(c) == maximum(c) # issue #1084 @@ -51,33 +50,31 @@ evalwgrad(f, x...) = pullback(f, x...)[1] testmode!(m) y = m(x) - @test count(a->a == 0, y) == 0 + @test count(a -> a == 0, y) == 0 trainmode!(m) y = m(x) - @test count(a->a == 0, y) > 50 + @test count(a -> a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=true) - @test count(a->a == 0, y) > 50 + y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active = true) + @test count(a -> a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=false) - @test count(a->a == 0, y) == 0 + y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active = false) + @test count(a -> a == 0, y) == 0 # CPU RNGs map onto CPU ok if isempty(rng_kwargs) - if VERSION >= v"1.7" - @test cpu(m).rng isa Random.TaskLocalRNG - else - @test cpu(m).rng isa Random._GLOBAL_RNG - end + if VERSION >= v"1.7" + @test cpu(m).rng isa Random.TaskLocalRNG + else + @test cpu(m).rng isa Random._GLOBAL_RNG + end else - @test cpu(m).rng === only(values(rng_kwargs)) + @test cpu(m).rng === only(values(rng_kwargs)) end - end -end +end end -@testset "AlphaDropout" begin - @testset for rng_kwargs in ((), (; rng = MersenneTwister())) - x = [1., 2., 3.] 
+@testset "AlphaDropout" begin @testset for rng_kwargs in ((), (; rng = MersenneTwister())) + x = [1.0, 2.0, 3.0] @test x == AlphaDropout(0.1; rng_kwargs...)(x) @test x == evalwgrad(AlphaDropout(0; rng_kwargs...), x) @test zero(x) == evalwgrad(AlphaDropout(1; rng_kwargs...), x) @@ -87,379 +84,392 @@ end y = evalwgrad(m, x) # Should preserve unit mean and variance - @test mean(y) ≈ 0 atol=0.2 - @test var(y) ≈ 1 atol=0.2 + @test mean(y)≈0 atol=0.2 + @test var(y)≈1 atol=0.2 testmode!(m, true) # should override istraining @test evalwgrad(m, x) == x testmode!(m, false) y = evalwgrad(m, x) - @test mean(y) ≈ 0 atol=0.2 - @test var(y) ≈ 1 atol=0.2 + @test mean(y)≈0 atol=0.2 + @test var(y)≈1 atol=0.2 # Known good value ranges # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338 x = ones(100) if isempty(rng_kwargs) - @test 40 < sum(evalwgrad(m, x)) < 130 + @test 40 < sum(evalwgrad(m, x)) < 130 else - # FIXME: this breaks spuriously for MersenneTwister - @test_skip 40 < sum(evalwgrad(m, x)) < 130 + # FIXME: this breaks spuriously for MersenneTwister + @test_skip 40 < sum(evalwgrad(m, x)) < 130 end # CPU RNGs map onto CPU ok if isempty(rng_kwargs) - if VERSION >= v"1.7" - @test cpu(m).rng isa Random.TaskLocalRNG - else - @test cpu(m).rng isa Random._GLOBAL_RNG - end + if VERSION >= v"1.7" + @test cpu(m).rng isa Random.TaskLocalRNG + else + @test cpu(m).rng isa Random._GLOBAL_RNG + end else - @test cpu(m).rng === only(values(rng_kwargs)) + @test cpu(m).rng === only(values(rng_kwargs)) end - end -end +end end @testset "BatchNorm" begin - let m = BatchNorm(2), x = [1.0 3.0 5.0; - 2.0 4.0 6.0] + let m = BatchNorm(2), x = [1.0 3.0 5.0; + 2.0 4.0 6.0] + @test Flux.hasaffine(m) == true + @test length(Flux.params(m)) == 2 + + @test m.β == [0, 0] # initβ(2) + @test m.γ == [1, 1] # initγ(2) + # initial m.σ is 1 + # initial m.μ is 0 + + y = evalwgrad(m, x) + @test isapprox(y, [-1.22474 0 1.22474; -1.22474 0 1.22474], atol = 1.0e-5) + # julia> x + # 2×3 Array{Float64,2}: + # 1.0 3.0 5.0 + # 2.0 4.0 6.0 + # + # μ of batch will be + # (1. + 3. + 5.) / 3 = 3 + # (2. + 4. + 6.) / 3 = 4 + # + # ∴ update rule with momentum: + # .1 * 3 + 0 = .3 + # .1 * 4 + 0 = .4 + @test m.μ ≈ reshape([0.3, 0.4], 2, 1) + + # julia> .1 .* var(x, dims = 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.] + # 2×1 Array{Float64,2}: + # 1.3 + # 1.3 + @test m.σ² ≈ + 0.1 .* var(x, dims = 2, corrected = false) .* (3 / 2) .+ 0.9 .* [1.0, 1.0] + + x′ = m(x) + @test isapprox(x′[1], (1 .- 0.3) / sqrt(1.3), atol = 1.0e-5) + + @inferred m(x) + end - @test Flux.hasaffine(m) == true - @test length(Flux.params(m)) == 2 + let m = BatchNorm(2; track_stats = false), x = [1.0 3.0 5.0; 2.0 4.0 6.0] + @inferred m(x) + end - @test m.β == [0, 0] # initβ(2) - @test m.γ == [1, 1] # initγ(2) - # initial m.σ is 1 - # initial m.μ is 0 + # with activation function + let m = BatchNorm(2, sigmoid), x = [1.0 3.0 5.0; + 2.0 4.0 6.0] + y = m(x) + @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7) + @inferred m(x) + end - y = evalwgrad(m, x) - @test isapprox(y, [-1.22474 0 1.22474; -1.22474 0 1.22474], atol = 1.0e-5) - # julia> x - # 2×3 Array{Float64,2}: - # 1.0 3.0 5.0 - # 2.0 4.0 6.0 - # - # μ of batch will be - # (1. + 3. + 5.) / 3 = 3 - # (2. + 4. + 6.) / 3 = 4 - # - # ∴ update rule with momentum: - # .1 * 3 + 0 = .3 - # .1 * 4 + 0 = .4 - @test m.μ ≈ reshape([0.3, 0.4], 2, 1) - - # julia> .1 .* var(x, dims = 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.] 
- # 2×1 Array{Float64,2}: - # 1.3 - # 1.3 - @test m.σ² ≈ .1 .* var(x, dims=2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.] - - x′ = m(x) - @test isapprox(x′[1], (1 .- 0.3) / sqrt(1.3), atol = 1.0e-5) - - @inferred m(x) - end - - let m = BatchNorm(2; track_stats=false), x = [1.0 3.0 5.0; 2.0 4.0 6.0] - @inferred m(x) - end - - # with activation function - let m = BatchNorm(2, sigmoid), x = [1.0 3.0 5.0; - 2.0 4.0 6.0] - y = m(x) - @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7) - @inferred m(x) - end - - let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:6), 3, 2, 1) - y = reshape(permutedims(x, [2, 1, 3]), 2, :) - y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3]) - @test m(x) == y - @inferred m(x) - end - - let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:12), 2, 3, 2, 1) - y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :) - y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4]) - @test m(x) == y - @inferred m(x) - end - - let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1) - y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :) - y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5]) - @test m(x) == y - @inferred m(x) - end - - let m = BatchNorm(32), x = randn(Float32, 416, 416, 32, 1); - m(x) - @test (@allocated m(x)) < 100_000_000 - @inferred m(x) - end - - @test length(Flux.params(BatchNorm(10))) == 2 - @test length(Flux.params(BatchNorm(10, affine=true))) == 2 - @test length(Flux.params(BatchNorm(10, affine=false))) == 0 + let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:6), 3, 2, 1) + y = reshape(permutedims(x, [2, 1, 3]), 2, :) + y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3]) + @test m(x) == y + @inferred m(x) + end + + let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:12), 2, 3, 2, 1) + y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :) + y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4]) + @test m(x) == y + @inferred m(x) + end + + let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1) + y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :) + y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5]) + @test m(x) == y + @inferred m(x) + end + + let m = BatchNorm(32), x = randn(Float32, 416, 416, 32, 1) + m(x) + @test (@allocated m(x)) < 100_000_000 + @inferred m(x) + end + + @test length(Flux.params(BatchNorm(10))) == 2 + @test length(Flux.params(BatchNorm(10, affine = true))) == 2 + @test length(Flux.params(BatchNorm(10, affine = false))) == 0 end @testset "InstanceNorm" begin - # begin tests - let m = InstanceNorm(2; affine=true, track_stats=true), sizes = (3, 2, 2), + # begin tests + let m = InstanceNorm(2; affine = true, track_stats = true), sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) - @test length(Flux.params(m)) == 2 - x = Float32.(x) - @test m.β == [0, 0] # initβ(2) - @test m.γ == [1, 1] # initγ(2) - y = evalwgrad(m, x) - - #julia> x - #[:, :, 1] = - # 1.0 4.0 - # 2.0 5.0 - # 3.0 6.0 - # - #[:, :, 2] = - # 7.0 10.0 - # 8.0 11.0 - # 9.0 12.0 - # - # μ will be - # (1. + 2. + 3.) / 3 = 2. - # (4. + 5. + 6.) / 3 = 5. - # - # (7. + 8. + 9.) / 3 = 8. - # (10. + 11. + 12.) / 3 = 11. - # - # ∴ update rule with momentum: - # (1. - .1) * 0 + .1 * (2. + 8.) / 2 = .5 - # (1. - .1) * 0 + .1 * (5. + 11.) 
/ 2 = .8 - N = ndims(x) - @test m.μ ≈ [0.5, 0.8] - n = prod(size(x,i) for i in 1:N-2) - corr = n / (n-1) - σ² = var(x, dims=1:N-2, corrected=false) - @test m.σ² ≈ 0.1*corr*vec(mean(σ², dims=N)) .+ 0.9 * 1 - - y = m(x) - @test length(m.μ) == 2 - @test length(m.σ²) == 2 - @test y ≈ (x .- reshape(m.μ, 1,2,1)) ./ sqrt.(reshape(m.σ², 1,2,1) .+ 1f-5) atol=1.0e-5 - - @inferred m(x) - end - - # with activation function - let m = InstanceNorm(2, sigmoid; affine=true, track_stats=true), sizes = (3, 2, 2), - x = reshape(collect(1:prod(sizes)), sizes) - x = Float64.(x) - affine_shape = collect(sizes) - affine_shape[[1,3]] .= 1 + @test length(Flux.params(m)) == 2 + x = Float32.(x) + @test m.β == [0, 0] # initβ(2) + @test m.γ == [1, 1] # initγ(2) + y = evalwgrad(m, x) + + #julia> x + #[:, :, 1] = + # 1.0 4.0 + # 2.0 5.0 + # 3.0 6.0 + # + #[:, :, 2] = + # 7.0 10.0 + # 8.0 11.0 + # 9.0 12.0 + # + # μ will be + # (1. + 2. + 3.) / 3 = 2. + # (4. + 5. + 6.) / 3 = 5. + # + # (7. + 8. + 9.) / 3 = 8. + # (10. + 11. + 12.) / 3 = 11. + # + # ∴ update rule with momentum: + # (1. - .1) * 0 + .1 * (2. + 8.) / 2 = .5 + # (1. - .1) * 0 + .1 * (5. + 11.) / 2 = .8 + N = ndims(x) + @test m.μ ≈ [0.5, 0.8] + n = prod(size(x, i) for i in 1:(N - 2)) + corr = n / (n - 1) + σ² = var(x, dims = 1:(N - 2), corrected = false) + @test m.σ² ≈ 0.1 * corr * vec(mean(σ², dims = N)) .+ 0.9 * 1 + + y = m(x) + @test length(m.μ) == 2 + @test length(m.σ²) == 2 + @test y≈(x .- reshape(m.μ, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 2, 1) .+ 1.0f-5) atol=1.0e-5 + + @inferred m(x) + end - y = evalwgrad(m, x) - y = m(x) # inference time after a training step - μ = reshape(m.μ, affine_shape...) - σ² = reshape(m.σ², affine_shape...) - @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 + # with activation function + let m = InstanceNorm(2, sigmoid; affine = true, track_stats = true), sizes = (3, 2, 2), + x = reshape(collect(1:prod(sizes)), sizes) - @inferred m(x) - end + x = Float64.(x) + affine_shape = collect(sizes) + affine_shape[[1, 3]] .= 1 - # with activation function - let m = InstanceNorm(2, sigmoid; affine=true, track_stats=false), sizes = (3, 2, 2), - x = reshape(collect(1:prod(sizes)), sizes) + y = evalwgrad(m, x) + y = m(x) # inference time after a training step + μ = reshape(m.μ, affine_shape...) + σ² = reshape(m.σ², affine_shape...) 
+ @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 - @test Flux.hasaffine(m) == true - @test length(Flux.params(m)) == 2 - x = Float64.(x) - y = m(x) - μ = mean(x, dims=1) - σ² = var(x, dims=1, corrected=false) - @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 + @inferred m(x) + end - @inferred m(x) - end + # with activation function + let m = InstanceNorm(2, sigmoid; affine = true, track_stats = false), sizes = (3, 2, 2), + x = reshape(collect(1:prod(sizes)), sizes) - let m = InstanceNorm(2, sigmoid), sizes = (3, 2, 2), - x = reshape(collect(1:prod(sizes)), sizes) - @test Flux.hasaffine(m) == false - @test length(Flux.params(m)) == 0 + @test Flux.hasaffine(m) == true + @test length(Flux.params(m)) == 2 + x = Float64.(x) + y = m(x) + μ = mean(x, dims = 1) + σ² = var(x, dims = 1, corrected = false) + @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 - x = Float64.(x) - y = m(x) - μ = mean(x, dims=1) - σ² = var(x, dims=1, corrected=false) - @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 + @inferred m(x) + end - @inferred m(x) - end + let m = InstanceNorm(2, sigmoid), sizes = (3, 2, 2), + x = reshape(collect(1:prod(sizes)), sizes) + @test Flux.hasaffine(m) == false + @test length(Flux.params(m)) == 0 - let m = trainmode!(InstanceNorm(2; affine=true)), sizes = (2, 4, 1, 2, 3), - x = Float32.(reshape(collect(1:prod(sizes)), sizes)) - y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) - y = reshape(m(y), sizes...) - @test m(x) == y + x = Float64.(x) + y = m(x) + μ = mean(x, dims = 1) + σ² = var(x, dims = 1, corrected = false) + @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 - @inferred m(x) - end + @inferred m(x) + end - # check that μ, σ², and the output are the correct size for higher rank tensors - let m = InstanceNorm(2; affine=true,track_stats=true), sizes = (5, 5, 3, 4, 2, 6), - x = reshape(Float32.(collect(1:prod(sizes))), sizes) - y = evalwgrad(m, x) - @test size(m.μ) == (sizes[end - 1], ) - @test size(m.σ²) == (sizes[end - 1], ) - @test size(y) == sizes + let m = trainmode!(InstanceNorm(2; affine = true)), sizes = (2, 4, 1, 2, 3), + x = Float32.(reshape(collect(1:prod(sizes)), sizes)) - @inferred m(x) - end + y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) + y = reshape(m(y), sizes...) 
+ @test m(x) == y - # show that instance norm is equal to batch norm when channel and batch dims are squashed - let m_inorm = trainmode!(InstanceNorm(2; affine=true)), m_bnorm = trainmode!(BatchNorm(12)), sizes = (5, 5, 3, 4, 2, 6), - x = reshape(Float32.(collect(1:prod(sizes))), sizes) - @test m_inorm(x) == reshape(m_bnorm(reshape(x, (sizes[1:end - 2]..., :, 1))), sizes) - end + @inferred m(x) + end - let m = InstanceNorm(32), x = randn(Float32, 416, 416, 32, 1); - m(x) - @test (@allocated m(x)) < 100_000_000 + # check that μ, σ², and the output are the correct size for higher rank tensors + let m = InstanceNorm(2; affine = true, track_stats = true), sizes = (5, 5, 3, 4, 2, 6), + x = reshape(Float32.(collect(1:prod(sizes))), sizes) - @inferred m(x) - end + y = evalwgrad(m, x) + @test size(m.μ) == (sizes[end - 1],) + @test size(m.σ²) == (sizes[end - 1],) + @test size(y) == sizes - @test length(Flux.params(InstanceNorm(10))) == 0 - @test length(Flux.params(InstanceNorm(10, affine=true))) == 2 - @test length(Flux.params(InstanceNorm(10, affine=false))) == 0 + @inferred m(x) + end + + # show that instance norm is equal to batch norm when channel and batch dims are squashed + let m_inorm = trainmode!(InstanceNorm(2; affine = true)), + m_bnorm = trainmode!(BatchNorm(12)), sizes = (5, 5, 3, 4, 2, 6), + x = reshape(Float32.(collect(1:prod(sizes))), sizes) + + @test m_inorm(x) == + reshape(m_bnorm(reshape(x, (sizes[1:(end - 2)]..., :, 1))), sizes) + end + + let m = InstanceNorm(32), x = randn(Float32, 416, 416, 32, 1) + m(x) + @test (@allocated m(x)) < 100_000_000 + + @inferred m(x) + end + + @test length(Flux.params(InstanceNorm(10))) == 0 + @test length(Flux.params(InstanceNorm(10, affine = true))) == 2 + @test length(Flux.params(InstanceNorm(10, affine = false))) == 0 end @testset "LayerNorm" begin - x = rand(2,3) - @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims=1) - x = rand(2,3,4) - @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims=1) - x = rand(2,3,4,5) - @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims=1) - x = rand(2) - @test LayerNorm(2, tanh)(x) ≈ tanh.(Flux.normalise(x, dims=1)) - - x = rand(2,3,4,5) - @test LayerNorm((2,3))(x) ≈ Flux.normalise(x, dims=(1,2)) - x = rand(2,3,4,5) - @test LayerNorm((2,3,4))(x) ≈ Flux.normalise(x, dims=1:3) - - m = LayerNorm((2,3,4)) - @test Flux.hasaffine(m) == true - @test length(Flux.params(m)) == 2 - m = LayerNorm((2,3,4), affine=false) - @test Flux.hasaffine(m) == false - @test length(Flux.params(m)) == 0 + x = rand(2, 3) + @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims = 1) + x = rand(2, 3, 4) + @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims = 1) + x = rand(2, 3, 4, 5) + @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims = 1) + x = rand(2) + @test LayerNorm(2, tanh)(x) ≈ tanh.(Flux.normalise(x, dims = 1)) + + x = rand(2, 3, 4, 5) + @test LayerNorm((2, 3))(x) ≈ Flux.normalise(x, dims = (1, 2)) + x = rand(2, 3, 4, 5) + @test LayerNorm((2, 3, 4))(x) ≈ Flux.normalise(x, dims = 1:3) + + m = LayerNorm((2, 3, 4)) + @test Flux.hasaffine(m) == true + @test length(Flux.params(m)) == 2 + m = LayerNorm((2, 3, 4), affine = false) + @test Flux.hasaffine(m) == false + @test length(Flux.params(m)) == 0 end @testset "GroupNorm" begin - # begin tests - squeeze(x) = dropdims(x, dims = tuple(findall(size(x) .== 1)...)) # To remove all singular dimensions + # begin tests + squeeze(x) = dropdims(x, dims = tuple(findall(size(x) .== 1)...)) # To remove all singular dimensions - let m = GroupNorm(4,2, track_stats=true), sizes = (3,4,2), + let m = GroupNorm(4, 2, track_stats = true), 
sizes = (3, 4, 2), x = reshape(collect(1:prod(sizes)), sizes) - @test length(Flux.params(m)) == 2 - x = Float32.(x) - @test m.β == [0, 0, 0, 0] # initβ(32) - @test m.γ == [1, 1, 1, 1] # initγ(32) - - y = evalwgrad(m, x) - - #julia> x - #[:, :, 1] = - # 1.0 4.0 7.0 10.0 - # 2.0 5.0 8.0 11.0 - # 3.0 6.0 9.0 12.0 - # - #[:, :, 2] = - # 13.0 16.0 19.0 22.0 - # 14.0 17.0 20.0 23.0 - # 15.0 18.0 21.0 24.0 - # - # μ will be - # (1. + 2. + 3. + 4. + 5. + 6.) / 6 = 3.5 - # (7. + 8. + 9. + 10. + 11. + 12.) / 6 = 9.5 - # - # (13. + 14. + 15. + 16. + 17. + 18.) / 6 = 15.5 - # (19. + 20. + 21. + 22. + 23. + 24.) / 6 = 21.5 - # - # μ = - # 3.5 15.5 - # 9.5 21.5 - # - # ∴ update rule with momentum: - # (1. - .1) * 0 + .1 * (3.5 + 15.5) / 2 = 0.95 - # (1. - .1) * 0 + .1 * (9.5 + 21.5) / 2 = 1.55 - @test m.μ ≈ [0.95, 1.55] - n = prod(size(x)) ÷ m.G ÷ size(x)[end] - corr = n / (n-1) - z = reshape(x,3,2,2,2) - σ² = var(z, dims=(1,2), corrected=false) - @test m.σ² ≈ 0.1*corr*vec(mean(σ², dims=4)) .+ 0.9 * 1 - - y = m(x) - out = (z .- reshape(m.μ, 1,1,2,1)) ./ sqrt.(reshape(m.σ², 1,1,2,1) .+ 1f-5) - @test y ≈ reshape(out, size(x)) atol=1.0e-5 - end - # with activation function - let m = GroupNorm(4,2, sigmoid, track_stats=true), sizes = (3, 4, 2), - x = reshape(collect(1:prod(sizes)), sizes) - x = Float32.(x) - μ_affine_shape = ones(Int,length(sizes) + 1) - μ_affine_shape[end-1] = 2 # Number of groups - - affine_shape = ones(Int,length(sizes) + 1) - affine_shape[end-2] = 2 # Channels per group - affine_shape[end-1] = 2 # Number of groups - affine_shape[1] = sizes[1] - affine_shape[end] = sizes[end] - - og_shape = size(x) + @test length(Flux.params(m)) == 2 + x = Float32.(x) + @test m.β == [0, 0, 0, 0] # initβ(32) + @test m.γ == [1, 1, 1, 1] # initγ(32) + + y = evalwgrad(m, x) + + #julia> x + #[:, :, 1] = + # 1.0 4.0 7.0 10.0 + # 2.0 5.0 8.0 11.0 + # 3.0 6.0 9.0 12.0 + # + #[:, :, 2] = + # 13.0 16.0 19.0 22.0 + # 14.0 17.0 20.0 23.0 + # 15.0 18.0 21.0 24.0 + # + # μ will be + # (1. + 2. + 3. + 4. + 5. + 6.) / 6 = 3.5 + # (7. + 8. + 9. + 10. + 11. + 12.) / 6 = 9.5 + # + # (13. + 14. + 15. + 16. + 17. + 18.) / 6 = 15.5 + # (19. + 20. + 21. + 22. + 23. + 24.) / 6 = 21.5 + # + # μ = + # 3.5 15.5 + # 9.5 21.5 + # + # ∴ update rule with momentum: + # (1. - .1) * 0 + .1 * (3.5 + 15.5) / 2 = 0.95 + # (1. - .1) * 0 + .1 * (9.5 + 21.5) / 2 = 1.55 + @test m.μ ≈ [0.95, 1.55] + n = prod(size(x)) ÷ m.G ÷ size(x)[end] + corr = n / (n - 1) + z = reshape(x, 3, 2, 2, 2) + σ² = var(z, dims = (1, 2), corrected = false) + @test m.σ² ≈ 0.1 * corr * vec(mean(σ², dims = 4)) .+ 0.9 * 1 + + y = m(x) + out = (z .- reshape(m.μ, 1, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 1, 2, 1) .+ 1.0f-5) + @test y≈reshape(out, size(x)) atol=1.0e-5 + end + # with activation function + let m = GroupNorm(4, 2, sigmoid, track_stats = true), sizes = (3, 4, 2), + x = reshape(collect(1:prod(sizes)), sizes) - y = m(x) - x_ = reshape(x,affine_shape...) - out = reshape(sigmoid.((x_ .- reshape(m.μ,μ_affine_shape...)) ./ sqrt.(reshape(m.σ²,μ_affine_shape...) .+ m.ϵ)),og_shape) - @test y ≈ out atol=1e-7 - end - - let m = trainmode!(GroupNorm(2,2, track_stats=true)), sizes = (2, 4, 1, 2, 3), - x = Float32.(reshape(collect(1:prod(sizes)), sizes)) - y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) - y = reshape(m(y), sizes...) 
- @test m(x) == y - end - - # check that μ, σ², and the output are the correct size for higher rank tensors - let m = GroupNorm(4,2, track_stats=true), sizes = (5, 5, 3, 4, 4, 6), - x = Float32.(reshape(collect(1:prod(sizes)), sizes)) - y = evalwgrad(m, x) - @test size(m.μ) == (m.G,) - @test size(m.σ²) == (m.G,) - @test size(y) == sizes - end - - # show that group norm is the same as instance norm when the group size is the same as the number of channels - let IN = trainmode!(InstanceNorm(4; affine=true)), GN = trainmode!(GroupNorm(4,4)), sizes = (2,2,3,4,5), - x = Float32.(reshape(collect(1:prod(sizes)), sizes)) - @test IN(x) ≈ GN(x) - end - - # show that group norm is the same as batch norm for a group of size 1 and batch of size 1 - let BN = trainmode!(BatchNorm(4)), GN = trainmode!(GroupNorm(4,4)), sizes = (2,2,3,4,1), - x = Float32.(reshape(collect(1:prod(sizes)), sizes)) - @test BN(x) ≈ GN(x) - end + x = Float32.(x) + μ_affine_shape = ones(Int, length(sizes) + 1) + μ_affine_shape[end - 1] = 2 # Number of groups + + affine_shape = ones(Int, length(sizes) + 1) + affine_shape[end - 2] = 2 # Channels per group + affine_shape[end - 1] = 2 # Number of groups + affine_shape[1] = sizes[1] + affine_shape[end] = sizes[end] + + og_shape = size(x) + + y = m(x) + x_ = reshape(x, affine_shape...) + out = reshape(sigmoid.((x_ .- reshape(m.μ, μ_affine_shape...)) ./ + sqrt.(reshape(m.σ², μ_affine_shape...) .+ m.ϵ)), og_shape) + @test y≈out atol=1e-7 + end + + let m = trainmode!(GroupNorm(2, 2, track_stats = true)), sizes = (2, 4, 1, 2, 3), + x = Float32.(reshape(collect(1:prod(sizes)), sizes)) + + y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) + y = reshape(m(y), sizes...) + @test m(x) == y + end + + # check that μ, σ², and the output are the correct size for higher rank tensors + let m = GroupNorm(4, 2, track_stats = true), sizes = (5, 5, 3, 4, 4, 6), + x = Float32.(reshape(collect(1:prod(sizes)), sizes)) + + y = evalwgrad(m, x) + @test size(m.μ) == (m.G,) + @test size(m.σ²) == (m.G,) + @test size(y) == sizes + end + + # show that group norm is the same as instance norm when the group size is the same as the number of channels + let IN = trainmode!(InstanceNorm(4; affine = true)), GN = trainmode!(GroupNorm(4, 4)), + sizes = (2, 2, 3, 4, 5), + x = Float32.(reshape(collect(1:prod(sizes)), sizes)) + + @test IN(x) ≈ GN(x) + end + + # show that group norm is the same as batch norm for a group of size 1 and batch of size 1 + let BN = trainmode!(BatchNorm(4)), GN = trainmode!(GroupNorm(4, 4)), + sizes = (2, 2, 3, 4, 1), + x = Float32.(reshape(collect(1:prod(sizes)), sizes)) + + @test BN(x) ≈ GN(x) + end end @testset "second derivatives" begin - m1 = Dropout(0.5) - @test Zygote.hessian_reverse(sum∘m1, [1.0,2.0,3.0]) == zeros(3, 3) + m1 = Dropout(0.5) + @test Zygote.hessian_reverse(sum ∘ m1, [1.0, 2.0, 3.0]) == zeros(3, 3) end diff --git a/test/layers/recurrent.jl b/test/layers/recurrent.jl index facab8466b..e7a2b3d14f 100644 --- a/test/layers/recurrent.jl +++ b/test/layers/recurrent.jl @@ -2,76 +2,79 @@ using LinearAlgebra # Ref FluxML/Flux.jl#1209 1D input @testset "BPTT-1D" begin - seq = [rand(Float32, 2) for i = 1:3] - for r ∈ [RNN,] - rnn = r(2 => 3) - Flux.reset!(rnn) - grads_seq = gradient(Flux.params(rnn)) do - sum([rnn(s) for s in seq][3]) + seq = [rand(Float32, 2) for i in 1:3] + for r in [RNN] + rnn = r(2 => 3) + Flux.reset!(rnn) + grads_seq = gradient(Flux.params(rnn)) do + return sum([rnn(s) for s in seq][3]) + end + Flux.reset!(rnn) + bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] 
+ + Wh * + tanh.(rnn.cell.Wi * seq[2] + + Wh * + tanh.(rnn.cell.Wi * seq[1] + + Wh * rnn.cell.state0 + + rnn.cell.b) + + rnn.cell.b) + + rnn.cell.b)), + rnn.cell.Wh) + @test grads_seq[rnn.cell.Wh] ≈ bptt[1] end - Flux.reset!(rnn); - bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + Wh * - tanh.(rnn.cell.Wi * seq[2] + Wh * - tanh.(rnn.cell.Wi * seq[1] + - Wh * rnn.cell.state0 - + rnn.cell.b) - + rnn.cell.b) - + rnn.cell.b)), - rnn.cell.Wh) - @test grads_seq[rnn.cell.Wh] ≈ bptt[1] - end end # Ref FluxML/Flux.jl#1209 2D input @testset "BPTT-2D" begin - seq = [rand(Float32, (2, 1)) for i = 1:3] - for r ∈ [RNN,] - rnn = r(2 => 3) - Flux.reset!(rnn) - grads_seq = gradient(Flux.params(rnn)) do - sum([rnn(s) for s in seq][3]) + seq = [rand(Float32, (2, 1)) for i in 1:3] + for r in [RNN] + rnn = r(2 => 3) + Flux.reset!(rnn) + grads_seq = gradient(Flux.params(rnn)) do + return sum([rnn(s) for s in seq][3]) + end + Flux.reset!(rnn) + bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + + Wh * + tanh.(rnn.cell.Wi * seq[2] + + Wh * + tanh.(rnn.cell.Wi * seq[1] + + Wh * rnn.cell.state0 + + rnn.cell.b) + + rnn.cell.b) + + rnn.cell.b)), + rnn.cell.Wh) + @test grads_seq[rnn.cell.Wh] ≈ bptt[1] end - Flux.reset!(rnn); - bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + Wh * - tanh.(rnn.cell.Wi * seq[2] + Wh * - tanh.(rnn.cell.Wi * seq[1] + - Wh * rnn.cell.state0 - + rnn.cell.b) - + rnn.cell.b) - + rnn.cell.b)), - rnn.cell.Wh) - @test grads_seq[rnn.cell.Wh] ≈ bptt[1] - end end @testset "BPTT-3D" begin - seq = rand(Float32, (2, 1, 3)) - rnn = RNN(2 => 3) - Flux.reset!(rnn) - grads_seq = gradient(Flux.params(rnn)) do - sum(rnn(seq)[:, :, 3]) - end - Flux.reset!(rnn); - bptt = gradient(rnn.cell.Wh) do Wh - # calculate state 1 - s1 = tanh.(rnn.cell.Wi * seq[:, :, 1] + - Wh * rnn.cell.state0 + - rnn.cell.b) - #calculate state 2 - s2 = tanh.(rnn.cell.Wi * seq[:, :, 2] + - Wh * s1 + - rnn.cell.b) - #calculate state 3 - s3 = tanh.(rnn.cell.Wi * seq[:, :, 3] + - Wh * s2 + - rnn.cell.b) - sum(s3) # loss is sum of state 3 - end - @test grads_seq[rnn.cell.Wh] ≈ bptt[1] + seq = rand(Float32, (2, 1, 3)) + rnn = RNN(2 => 3) + Flux.reset!(rnn) + grads_seq = gradient(Flux.params(rnn)) do + return sum(rnn(seq)[:, :, 3]) + end + Flux.reset!(rnn) + bptt = gradient(rnn.cell.Wh) do Wh + # calculate state 1 + s1 = tanh.(rnn.cell.Wi * seq[:, :, 1] + + Wh * rnn.cell.state0 + + rnn.cell.b) + #calculate state 2 + s2 = tanh.(rnn.cell.Wi * seq[:, :, 2] + + Wh * s1 + + rnn.cell.b) + #calculate state 3 + s3 = tanh.(rnn.cell.Wi * seq[:, :, 3] + + Wh * s2 + + rnn.cell.b) + return sum(s3) # loss is sum of state 3 + end + @test grads_seq[rnn.cell.Wh] ≈ bptt[1] end -@testset "RNN-shapes" begin - @testset for R in [RNN, GRU, LSTM, GRUv3] +@testset "RNN-shapes" begin @testset for R in [RNN, GRU, LSTM, GRUv3] m1 = R(3 => 5) m2 = R(3 => 5) m3 = R(3, 5) # leave one to test the silently deprecated "," not "=>" notation @@ -87,85 +90,85 @@ end @test size(m2(x2)) == (5, 1) @test size(m3(x3)) == (5, 1, 2) @test size(m3(x3)) == (5, 1, 2) - end -end +end end -@testset "RNN-input-state-eltypes" begin - @testset for R in [RNN, GRU, LSTM, GRUv3] +@testset "RNN-input-state-eltypes" begin @testset for R in [RNN, GRU, LSTM, GRUv3] m = R(3 => 5) x = rand(Float64, 3, 1) Flux.reset!(m) @test_throws MethodError m(x) - end -end +end end @testset "multigate" begin - x = rand(6, 5) - res, (dx,) = Flux.withgradient(x) do x - x1, _, x3 = Flux.multigate(x, 2, Val(3)) - sum(x1) + sum(x3 .* 2) - end - @test res == sum(x[1:2, :]) + 2sum(x[5:6, :]) - @test dx == 
[ones(2, 5); zeros(2, 5); fill(2, 2, 5)] + x = rand(6, 5) + res, (dx,) = Flux.withgradient(x) do x + x1, _, x3 = Flux.multigate(x, 2, Val(3)) + return sum(x1) + sum(x3 .* 2) + end + @test res == sum(x[1:2, :]) + 2sum(x[5:6, :]) + @test dx == [ones(2, 5); zeros(2, 5); fill(2, 2, 5)] end @testset "eachlastdim" begin - x = rand(3, 3, 1, 2, 4) - @test length(Flux.eachlastdim(x)) == size(x, ndims(x)) - @test collect(@inferred(Flux.eachlastdim(x))) == collect(eachslice(x; dims=ndims(x))) - slicedim = (size(x)[1:end-1]..., 1) - res, (dx,) = Flux.withgradient(x) do x - x1, _, x3, _ = Flux.eachlastdim(x) - sum(x1) + sum(x3 .* 3) - end - @test res ≈ sum(selectdim(x, ndims(x), 1)) + 3sum(selectdim(x, ndims(x), 3)) - @test dx ≈ cat(fill(1, slicedim), fill(0, slicedim), - fill(3, slicedim), fill(0, slicedim); dims=ndims(x)) + x = rand(3, 3, 1, 2, 4) + @test length(Flux.eachlastdim(x)) == size(x, ndims(x)) + @test collect(@inferred(Flux.eachlastdim(x))) == collect(eachslice(x; dims = ndims(x))) + slicedim = (size(x)[1:(end - 1)]..., 1) + res, (dx,) = Flux.withgradient(x) do x + x1, _, x3, _ = Flux.eachlastdim(x) + return sum(x1) + sum(x3 .* 3) + end + @test res ≈ sum(selectdim(x, ndims(x), 1)) + 3sum(selectdim(x, ndims(x), 3)) + @test dx ≈ cat(fill(1, slicedim), fill(0, slicedim), + fill(3, slicedim), fill(0, slicedim); dims = ndims(x)) end @testset "∇eachlastdim" begin x = rand(3, 3, 1, 2, 4) x_size = size(x) - y = collect(eachslice(x; dims=ndims(x))) + y = collect(eachslice(x; dims = ndims(x))) @test @inferred(Flux.∇eachlastdim(y, x)) == x ZeroTangent = Flux.Zygote.ZeroTangent NoTangent = Flux.Zygote.NoTangent abstract_zeros_vector = [ZeroTangent(), ZeroTangent(), NoTangent(), NoTangent()] @test @inferred(Flux.∇eachlastdim(abstract_zeros_vector, x)) == zeros(size(x)) - x2 = rand(Float64, x_size[1:end-1]) - x3 = rand(Float64, x_size[1:end-1]) + x2 = rand(Float64, x_size[1:(end - 1)]) + x3 = rand(Float64, x_size[1:(end - 1)]) mixed_vector = [ZeroTangent(), x2, x3, ZeroTangent()] - @test @inferred(Flux.∇eachlastdim(mixed_vector, x)) ≈ cat(zeros(x_size[1:end-1]), - x2, - x3, - zeros(x_size[1:end-1]); dims=ndims(x)) + @test @inferred(Flux.∇eachlastdim(mixed_vector, x)) ≈ cat(zeros(x_size[1:(end - 1)]), + x2, + x3, + zeros(x_size[1:(end - 1)]); dims = ndims(x)) end @testset "Different Internal Matrix Types" begin - R = Flux.Recur(Flux.RNNCell(tanh, rand(5, 3), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1))) - # don't want to pull in SparseArrays just for this test, but there aren't any - # non-square structured matrix types in LinearAlgebra. so we will use a different - # eltype matrix, which would fail before when `W_i` and `W_h` were required to be the - # same type. - L = Flux.Recur(Flux.LSTMCell(rand(5*4, 3), rand(1:20, 5*4, 5), rand(5*4), (rand(5, 1), rand(5, 1)))) - G = Flux.Recur(Flux.GRUCell(rand(5*3, 3), rand(1:20, 5*3, 5), rand(5*3), rand(5, 1))) - G3 = Flux.Recur(Flux.GRUv3Cell(rand(5*3, 3), rand(1:20, 5*2, 5), rand(5*3), Tridiagonal(rand(5, 5)), rand(5, 1))) - - for m in [R, L, G, G3] + R = Flux.Recur(Flux.RNNCell(tanh, rand(5, 3), Tridiagonal(rand(5, 5)), rand(5), + rand(5, 1))) + # don't want to pull in SparseArrays just for this test, but there aren't any + # non-square structured matrix types in LinearAlgebra. so we will use a different + # eltype matrix, which would fail before when `W_i` and `W_h` were required to be the + # same type. 
+ L = Flux.Recur(Flux.LSTMCell(rand(5 * 4, 3), rand(1:20, 5 * 4, 5), rand(5 * 4), + (rand(5, 1), rand(5, 1)))) + G = Flux.Recur(Flux.GRUCell(rand(5 * 3, 3), rand(1:20, 5 * 3, 5), rand(5 * 3), + rand(5, 1))) + G3 = Flux.Recur(Flux.GRUv3Cell(rand(5 * 3, 3), rand(1:20, 5 * 2, 5), rand(5 * 3), + Tridiagonal(rand(5, 5)), rand(5, 1))) - x1 = rand(3) - x2 = rand(3, 1) - x3 = rand(3, 1, 2) - Flux.reset!(m) - @test size(m(x1)) == (5,) - Flux.reset!(m) - @test size(m(x1)) == (5,) # repeat in case of effect from change in state shape - @test size(m(x2)) == (5, 1) - Flux.reset!(m) - @test size(m(x2)) == (5, 1) - Flux.reset!(m) - @test size(m(x3)) == (5, 1, 2) - Flux.reset!(m) - @test size(m(x3)) == (5, 1, 2) - end + for m in [R, L, G, G3] + x1 = rand(3) + x2 = rand(3, 1) + x3 = rand(3, 1, 2) + Flux.reset!(m) + @test size(m(x1)) == (5,) + Flux.reset!(m) + @test size(m(x1)) == (5,) # repeat in case of effect from change in state shape + @test size(m(x2)) == (5, 1) + Flux.reset!(m) + @test size(m(x2)) == (5, 1) + Flux.reset!(m) + @test size(m(x3)) == (5, 1, 2) + Flux.reset!(m) + @test size(m(x3)) == (5, 1, 2) + end end diff --git a/test/layers/show.jl b/test/layers/show.jl index 3fc9bd097b..08d1c845f7 100644 --- a/test/layers/show.jl +++ b/test/layers/show.jl @@ -1,75 +1,73 @@ @testset "layer printing" begin # 2-arg show, defined with layes - - @test repr(Dense(2,3)) == "Dense(2 => 3)" - @test repr(Chain(Dense(2,3))) == "Chain(Dense(2 => 3))" - @test repr(Chain(lay=Dense(2,3))) == "Chain(lay = Dense(2 => 3))" - + @test repr(Dense(2, 3)) == "Dense(2 => 3)" + @test repr(Chain(Dense(2, 3))) == "Chain(Dense(2 => 3))" + @test repr(Chain(lay = Dense(2, 3))) == "Chain(lay = Dense(2 => 3))" end @testset "nested model printing" begin # 3-arg show, defined in show.jl - # Dense -- has parameter count, but not when inside a matrix: + # Dense -- has parameter count, but not when inside a matrix: - toplevel_dense = repr("text/plain", Dense(2,3)) - @test occursin("Dense(2 => 3)", toplevel_dense) - @test occursin("# 9 parameters", toplevel_dense) + toplevel_dense = repr("text/plain", Dense(2, 3)) + @test occursin("Dense(2 => 3)", toplevel_dense) + @test occursin("# 9 parameters", toplevel_dense) - @test Meta.isexpr(Meta.parse(toplevel_dense), :call) # comment is ignored + @test Meta.isexpr(Meta.parse(toplevel_dense), :call) # comment is ignored - vector_dense = repr("text/plain", [Dense(2,3), Dense(2,3)]) - @test occursin("Dense(2 => 3)", vector_dense) - @test occursin("# 9 parameters", vector_dense) + vector_dense = repr("text/plain", [Dense(2, 3), Dense(2, 3)]) + @test occursin("Dense(2 => 3)", vector_dense) + @test occursin("# 9 parameters", vector_dense) - matrix_dense = repr("text/plain", fill(Dense(2,3), 3, 3)) - @test occursin("Dense(2 => 3)", matrix_dense) - @test !occursin("# 9 parameters", matrix_dense) + matrix_dense = repr("text/plain", fill(Dense(2, 3), 3, 3)) + @test occursin("Dense(2 => 3)", matrix_dense) + @test !occursin("# 9 parameters", matrix_dense) - tuple_dense = repr("text/plain", tuple(Dense(2,3))) - @test occursin("Dense(2 => 3)", tuple_dense) - @test !occursin("# 9 parameters", tuple_dense) + tuple_dense = repr("text/plain", tuple(Dense(2, 3))) + @test occursin("Dense(2 => 3)", tuple_dense) + @test !occursin("# 9 parameters", tuple_dense) - # Chain -- gets split over lines at top level only + # Chain -- gets split over lines at top level only - toplevel_chain = repr("text/plain", Chain(Dense(2,3))) - @test occursin("Chain(\n Dense(2 => 3)", toplevel_chain) - @test occursin("# 9 parameters", 
toplevel_chain) - @test !occursin("# Total:", toplevel_chain) + toplevel_chain = repr("text/plain", Chain(Dense(2, 3))) + @test occursin("Chain(\n Dense(2 => 3)", toplevel_chain) + @test occursin("# 9 parameters", toplevel_chain) + @test !occursin("# Total:", toplevel_chain) - vector_chain = repr("text/plain", [Chain(Dense(2,3)), Chain(Dense(2,3))]) - @test occursin("Chain(Dense(2 => 3))", vector_chain) - @test occursin("# 9 parameters", vector_chain) - @test !occursin("# Total:", vector_chain) + vector_chain = repr("text/plain", [Chain(Dense(2, 3)), Chain(Dense(2, 3))]) + @test occursin("Chain(Dense(2 => 3))", vector_chain) + @test occursin("# 9 parameters", vector_chain) + @test !occursin("# Total:", vector_chain) - matrix_chain = repr("text/plain", fill(Chain(Dense(2,3)), 3,3)) - @test occursin("Chain(Dense(2 => 3))", matrix_chain) - @test !occursin("# 9 parameters", matrix_chain) - @test !occursin("# Total:", matrix_chain) + matrix_chain = repr("text/plain", fill(Chain(Dense(2, 3)), 3, 3)) + @test occursin("Chain(Dense(2 => 3))", matrix_chain) + @test !occursin("# 9 parameters", matrix_chain) + @test !occursin("# Total:", matrix_chain) - # ... and only long enough chains get a total at the end: + # ... and only long enough chains get a total at the end: - longchain = Chain(Dense(2 => 3), Dense(3 => 4), Dense(4 => 5), softmax) + longchain = Chain(Dense(2 => 3), Dense(3 => 4), Dense(4 => 5), softmax) - toplevel_longchain = repr("text/plain", longchain) - @test occursin("Chain(\n Dense(2 => 3)", toplevel_longchain) - @test occursin("# 9 parameters", toplevel_longchain) - @test occursin("# Total: 6 arrays, 50 parameters", toplevel_longchain) + toplevel_longchain = repr("text/plain", longchain) + @test occursin("Chain(\n Dense(2 => 3)", toplevel_longchain) + @test occursin("# 9 parameters", toplevel_longchain) + @test occursin("# Total: 6 arrays, 50 parameters", toplevel_longchain) - vector_longchain = repr("text/plain", [longchain, longchain]) # pretty ugly in reality - @test occursin("Chain(Dense(2 => 3)", vector_longchain) - @test occursin("# 50 parameters", vector_longchain) - @test !occursin("# 9 parameters", vector_longchain) - @test !occursin("# Total:", vector_longchain) + vector_longchain = repr("text/plain", [longchain, longchain]) # pretty ugly in reality + @test occursin("Chain(Dense(2 => 3)", vector_longchain) + @test occursin("# 50 parameters", vector_longchain) + @test !occursin("# 9 parameters", vector_longchain) + @test !occursin("# Total:", vector_longchain) - matrix_longchain = repr("text/plain", fill(longchain, 3,3)) - @test occursin("Chain(Dense(2 => 3)", matrix_longchain) - @test !occursin("# 9 parameters", matrix_longchain) - @test !occursin("# Total:", matrix_longchain) + matrix_longchain = repr("text/plain", fill(longchain, 3, 3)) + @test occursin("Chain(Dense(2 => 3)", matrix_longchain) + @test !occursin("# 9 parameters", matrix_longchain) + @test !occursin("# Total:", matrix_longchain) - @test Meta.isexpr(Meta.parse(toplevel_longchain), :call) # comments are ignored - @test Meta.parse(toplevel_longchain).args[1] == :Chain + @test Meta.isexpr(Meta.parse(toplevel_longchain), :call) # comments are ignored + @test Meta.parse(toplevel_longchain).args[1] == :Chain - # Functors@0.3 marks transposed matrices non-leaf, shouldn't affect printing: - adjoint_chain = repr("text/plain", Chain([Dense([1 2; 3 4]')])) - @test occursin("Dense(2 => 2)", adjoint_chain) - @test occursin("Chain([", adjoint_chain) + # Functors@0.3 marks transposed matrices non-leaf, shouldn't affect 
printing: + adjoint_chain = repr("text/plain", Chain([Dense([1 2; 3 4]')])) + @test occursin("Dense(2 => 2)", adjoint_chain) + @test occursin("Chain([", adjoint_chain) end diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index 23caf06e3b..18de58ce7a 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -2,13 +2,13 @@ using Test using Flux: flatten @testset "helpers" begin - @testset "flatten" begin - x = randn(Float32, 10, 10, 3, 2) - @test size(flatten(x)) == (300, 2) - end + @testset "flatten" begin + x = randn(Float32, 10, 10, 3, 2) + @test size(flatten(x)) == (300, 2) + end - @testset "normalise" begin - x = randn(Float32, 3, 2, 2) - @test Flux.normalise(x) == Flux.normalise(x; dims=3) - end + @testset "normalise" begin + x = randn(Float32, 3, 2, 2) + @test Flux.normalise(x) == Flux.normalise(x; dims = 3) + end end diff --git a/test/layers/upsample.jl b/test/layers/upsample.jl index bc752a55a0..f6b80dcd20 100644 --- a/test/layers/upsample.jl +++ b/test/layers/upsample.jl @@ -1,87 +1,87 @@ @testset "upsample bilinear" begin - m = Upsample(:bilinear, scale=(2, 3)) - x = rand(Float32, 3, 4, 2, 3) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (6, 12, 2, 3) + m = Upsample(:bilinear, scale = (2, 3)) + x = rand(Float32, 3, 4, 2, 3) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (6, 12, 2, 3) - m = Upsample(:bilinear, scale=3) - x = rand(Float32, 3, 4, 2, 3) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (9, 12, 2, 3) + m = Upsample(:bilinear, scale = 3) + x = rand(Float32, 3, 4, 2, 3) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (9, 12, 2, 3) - m = Upsample(:bilinear, size=(4, 6)) - x = rand(Float32, 3, 4, 2, 3) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (4, 6, 2, 3) + m = Upsample(:bilinear, size = (4, 6)) + x = rand(Float32, 3, 4, 2, 3) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (4, 6, 2, 3) end @testset "upsample trilinear" begin - m = Upsample(:trilinear, scale=(2, 3, 2)) - x = rand(Float32, 3, 4, 2, 3, 4) - y = m(x) - @test y isa Array{Float32, 5} - @test size(y) == (6, 12, 4, 3, 4) + m = Upsample(:trilinear, scale = (2, 3, 2)) + x = rand(Float32, 3, 4, 2, 3, 4) + y = m(x) + @test y isa Array{Float32, 5} + @test size(y) == (6, 12, 4, 3, 4) - m = Upsample(:trilinear, scale=3) - x = rand(Float32, 3, 4, 2, 3, 4) - y = m(x) - @test y isa Array{Float32, 5} - @test size(y) == (9, 12, 6, 3, 4) + m = Upsample(:trilinear, scale = 3) + x = rand(Float32, 3, 4, 2, 3, 4) + y = m(x) + @test y isa Array{Float32, 5} + @test size(y) == (9, 12, 6, 3, 4) - m = Upsample(:trilinear, size=(4, 6, 4)) - x = rand(Float32, 3, 4, 2, 3, 4) - y = m(x) - @test y isa Array{Float32, 5} - @test size(y) == (4, 6, 4, 3, 4) + m = Upsample(:trilinear, size = (4, 6, 4)) + x = rand(Float32, 3, 4, 2, 3, 4) + y = m(x) + @test y isa Array{Float32, 5} + @test size(y) == (4, 6, 4, 3, 4) end @testset "upsample nearest" begin - x = rand(Float32, 3, 2, 3) - m = Upsample(:nearest, scale=(2,)) - y = m(x) - @test y isa Array{Float32, 3} - @test size(y) == (6, 2, 3) + x = rand(Float32, 3, 2, 3) + m = Upsample(:nearest, scale = (2,)) + y = m(x) + @test y isa Array{Float32, 3} + @test size(y) == (6, 2, 3) - x = rand(Float32, 3, 4, 2, 3) - - m = Upsample(:nearest, scale=(2, 3)) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (6, 12, 2, 3) - - m = Upsample(:nearest, scale=(2,)) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (6, 4, 2, 3) + x = rand(Float32, 3, 4, 2, 3) - m = 
Upsample(:nearest, scale=2) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (6, 8, 2, 3) + m = Upsample(:nearest, scale = (2, 3)) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (6, 12, 2, 3) - m = Upsample(2) - y2 = m(x) - @test y2 ≈ y + m = Upsample(:nearest, scale = (2,)) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (6, 4, 2, 3) - m = Upsample(:nearest, size=(6,8)) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (6, 8, 2, 3) + m = Upsample(:nearest, scale = 2) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (6, 8, 2, 3) + + m = Upsample(2) + y2 = m(x) + @test y2 ≈ y + + m = Upsample(:nearest, size = (6, 8)) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (6, 8, 2, 3) end @testset "PixelShuffle" begin - m = PixelShuffle(2) - x = rand(Float32, 3, 18, 3) - y = m(x) - @test y isa Array{Float32, 3} - @test size(y) == (6, 9, 3) + m = PixelShuffle(2) + x = rand(Float32, 3, 18, 3) + y = m(x) + @test y isa Array{Float32, 3} + @test size(y) == (6, 9, 3) - m = PixelShuffle(3) - x = rand(Float32, 3, 4, 18, 3) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (9, 12, 2, 3) + m = PixelShuffle(3) + x = rand(Float32, 3, 4, 18, 3) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (9, 12, 2, 3) end diff --git a/test/losses.jl b/test/losses.jl index 2ca697a657..288fddd90f 100644 --- a/test/losses.jl +++ b/test/losses.jl @@ -1,74 +1,69 @@ using Test using Flux: onehotbatch, σ -using Flux.Losses: mse, label_smoothing, crossentropy, logitcrossentropy, binarycrossentropy, logitbinarycrossentropy +using Flux.Losses: mse, label_smoothing, crossentropy, logitcrossentropy, + binarycrossentropy, logitbinarycrossentropy using Flux.Losses: xlogx, xlogy # group here all losses, used in tests const ALL_LOSSES = [Flux.Losses.mse, Flux.Losses.mae, Flux.Losses.msle, - Flux.Losses.crossentropy, Flux.Losses.logitcrossentropy, - Flux.Losses.binarycrossentropy, Flux.Losses.logitbinarycrossentropy, - Flux.Losses.kldivergence, - Flux.Losses.huber_loss, - Flux.Losses.tversky_loss, - Flux.Losses.dice_coeff_loss, - Flux.Losses.poisson_loss, - Flux.Losses.hinge_loss, Flux.Losses.squared_hinge_loss, - Flux.Losses.binary_focal_loss, Flux.Losses.focal_loss, Flux.Losses.siamese_contrastive_loss] - + Flux.Losses.crossentropy, Flux.Losses.logitcrossentropy, + Flux.Losses.binarycrossentropy, Flux.Losses.logitbinarycrossentropy, + Flux.Losses.kldivergence, + Flux.Losses.huber_loss, + Flux.Losses.tversky_loss, + Flux.Losses.dice_coeff_loss, + Flux.Losses.poisson_loss, + Flux.Losses.hinge_loss, Flux.Losses.squared_hinge_loss, + Flux.Losses.binary_focal_loss, Flux.Losses.focal_loss, + Flux.Losses.siamese_contrastive_loss] @testset "xlogx & xlogy" begin - @test iszero(xlogx(0)) - @test isnan(xlogx(NaN)) - @test xlogx(2) ≈ 2.0 * log(2.0) - @inferred xlogx(2) - @inferred xlogx(0) + @test iszero(xlogx(0)) + @test isnan(xlogx(NaN)) + @test xlogx(2) ≈ 2.0 * log(2.0) + @inferred xlogx(2) + @inferred xlogx(0) - @test iszero(xlogy(0, 1)) - @test isnan(xlogy(NaN, 1)) - @test isnan(xlogy(1, NaN)) - @test isnan(xlogy(NaN, NaN)) - @test xlogy(2, 3) ≈ 2.0 * log(3.0) - @inferred xlogy(2, 3) - @inferred xlogy(0, 1) + @test iszero(xlogy(0, 1)) + @test isnan(xlogy(NaN, 1)) + @test isnan(xlogy(1, NaN)) + @test isnan(xlogy(NaN, NaN)) + @test xlogy(2, 3) ≈ 2.0 * log(3.0) + @inferred xlogy(2, 3) + @inferred xlogy(0, 1) end # First, regression-style y's y = [1, 1, 0, 0] -ŷ = [.9, .1, .1, .9] +ŷ = [0.9, 0.1, 0.1, 0.9] @testset "mse" begin - @test mse(ŷ, y) 
≈ (.1^2 + .9^2)/2 + @test mse(ŷ, y) ≈ (0.1^2 + 0.9^2) / 2 - # Test that mse() loss works on complex values: - @test mse(0 + 0im, 1 + 1im) == 2 + # Test that mse() loss works on complex values: + @test mse(0 + 0im, 1 + 1im) == 2 end -@testset "mae" begin - @test Flux.mae(ŷ, y) ≈ 1/2 -end +@testset "mae" begin @test Flux.mae(ŷ, y) ≈ 1 / 2 end -@testset "huber_loss" begin - @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002 -end +@testset "huber_loss" begin @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002 end -y = [123.0,456.0,789.0] -ŷ = [345.0,332.0,789.0] -@testset "msle" begin - @test Flux.msle(ŷ, y) ≈ 0.38813985859136585 -end +y = [123.0, 456.0, 789.0] +ŷ = [345.0, 332.0, 789.0] +@testset "msle" begin @test Flux.msle(ŷ, y) ≈ 0.38813985859136585 end # Now onehot y's y = onehotbatch([1, 1, 0, 0], 0:1) y_smoothed = label_smoothing(y, 0.1) -ŷ = [.1 .9; .9 .1; .9 .1; .1 .9]' -v = log(.1 / .9) +ŷ = [0.1 0.9; 0.9 0.1; 0.9 0.1; 0.1 0.9]' +v = log(0.1 / 0.9) logŷ = [v 0.0; 0.0 v; 0.0 v; v 0.0]' lossvalue = 1.203972804325936 lossvalue_smoothed = 1.2039728043259348 yl = onehotbatch([1], 0:1) sf = 0.1 -yls = [sf (1-sf)]' # Effective y after label smoothing +yls = [sf (1 - sf)]' # Effective y after label smoothing ylp = [0.9 0.1]' logylp = [0.0 v]' @@ -78,113 +73,118 @@ logylp = [0.0 v]' ya = onehotbatch([1, 1, 1, 0, 0], 0:1) ya_smoothed = label_smoothing(ya, 2sf) y_same = Float32.(ya) -y_sim = y_same .* (1-2*sf) .+ sf +y_sim = y_same .* (1 - 2 * sf) .+ sf y_dis = copy(y_sim) -y_dis[1,:], y_dis[2,:] = y_dis[2,:], y_dis[1,:] +y_dis[1, :], y_dis[2, :] = y_dis[2, :], y_dis[1, :] @testset "crossentropy" begin - @test crossentropy([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ crossentropy([0.1,0.9], [0.1,0.9]) - @test crossentropy(ŷ, y) ≈ lossvalue - @test crossentropy(ŷ, y_smoothed) ≈ lossvalue_smoothed - @test crossentropy(ylp, label_smoothing(yl, 2sf)) ≈ -sum(yls.*log.(ylp)) - @test crossentropy(ylp, yl) ≈ -sum(yl.*log.(ylp)) - @test iszero(crossentropy(y_same, ya, ϵ=0)) - @test iszero(crossentropy(ya, ya, ϵ=0)) - @test crossentropy(y_sim, ya) < crossentropy(y_sim, ya_smoothed) - @test crossentropy(y_dis, ya) > crossentropy(y_dis, ya_smoothed) + @test crossentropy([0.1, 0.0, 0.9], [0.1, 0.0, 0.9]) ≈ + crossentropy([0.1, 0.9], [0.1, 0.9]) + @test crossentropy(ŷ, y) ≈ lossvalue + @test crossentropy(ŷ, y_smoothed) ≈ lossvalue_smoothed + @test crossentropy(ylp, label_smoothing(yl, 2sf)) ≈ -sum(yls .* log.(ylp)) + @test crossentropy(ylp, yl) ≈ -sum(yl .* log.(ylp)) + @test iszero(crossentropy(y_same, ya, ϵ = 0)) + @test iszero(crossentropy(ya, ya, ϵ = 0)) + @test crossentropy(y_sim, ya) < crossentropy(y_sim, ya_smoothed) + @test crossentropy(y_dis, ya) > crossentropy(y_dis, ya_smoothed) end @testset "logitcrossentropy" begin - @test logitcrossentropy(logŷ, y) ≈ lossvalue - @test logitcrossentropy(logylp, yl) ≈ -sum(yl.*logsoftmax(logylp)) - @test logitcrossentropy(logylp, label_smoothing(yl, 2sf)) ≈ -sum(yls.*logsoftmax(logylp)) + @test logitcrossentropy(logŷ, y) ≈ lossvalue + @test logitcrossentropy(logylp, yl) ≈ -sum(yl .* logsoftmax(logylp)) + @test logitcrossentropy(logylp, label_smoothing(yl, 2sf)) ≈ + -sum(yls .* logsoftmax(logylp)) end logŷ, y = randn(3), rand(3) -yls = y.*(1-2sf).+sf +yls = y .* (1 - 2sf) .+ sf @testset "binarycrossentropy" begin - @test binarycrossentropy.(σ.(logŷ), label_smoothing(y, 2sf; dims=0); ϵ=0) ≈ -yls.*log.(σ.(logŷ)) - (1 .- yls).*log.(1 .- σ.(logŷ)) - @test binarycrossentropy(σ.(logŷ), y; ϵ=0) ≈ mean(-y.*log.(σ.(logŷ)) - (1 .- y).*log.(1 .- σ.(logŷ))) - @test 
binarycrossentropy(σ.(logŷ), y) ≈ mean(-y.*log.(σ.(logŷ) .+ eps.(σ.(logŷ))) - (1 .- y).*log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ)))) - @test binarycrossentropy([0.1,0.2,0.9], 1) ≈ -mean(log, [0.1,0.2,0.9]) # constant label + @test binarycrossentropy.(σ.(logŷ), label_smoothing(y, 2sf; dims = 0); ϵ = 0) ≈ + -yls .* log.(σ.(logŷ)) - (1 .- yls) .* log.(1 .- σ.(logŷ)) + @test binarycrossentropy(σ.(logŷ), y; ϵ = 0) ≈ + mean(-y .* log.(σ.(logŷ)) - (1 .- y) .* log.(1 .- σ.(logŷ))) + @test binarycrossentropy(σ.(logŷ), y) ≈ mean(-y .* log.(σ.(logŷ) .+ eps.(σ.(logŷ))) - + (1 .- y) .* log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ)))) + @test binarycrossentropy([0.1, 0.2, 0.9], 1) ≈ -mean(log, [0.1, 0.2, 0.9]) # constant label end @testset "logitbinarycrossentropy" begin - @test logitbinarycrossentropy.(logŷ, label_smoothing(y, 0.2)) ≈ binarycrossentropy.(σ.(logŷ), label_smoothing(y, 0.2); ϵ=0) - @test logitbinarycrossentropy(logŷ, y) ≈ binarycrossentropy(σ.(logŷ), y; ϵ=0) + @test logitbinarycrossentropy.(logŷ, label_smoothing(y, 0.2)) ≈ + binarycrossentropy.(σ.(logŷ), label_smoothing(y, 0.2); ϵ = 0) + @test logitbinarycrossentropy(logŷ, y) ≈ binarycrossentropy(σ.(logŷ), y; ϵ = 0) end y = onehotbatch([1], 0:1) yls = [0.1 0.9]' @testset "label_smoothing" begin - @test label_smoothing(y, 0.2) == yls - @test label_smoothing(y, 0.2; dims=0) == label_smoothing.(y, 0.2; dims=0) - @test_throws ArgumentError label_smoothing([0., 0., 1., 0.], 1.2) - @test_throws ArgumentError label_smoothing([0., 0., 1., 0.], 0.) + @test label_smoothing(y, 0.2) == yls + @test label_smoothing(y, 0.2; dims = 0) == label_smoothing.(y, 0.2; dims = 0) + @test_throws ArgumentError label_smoothing([0.0, 0.0, 1.0, 0.0], 1.2) + @test_throws ArgumentError label_smoothing([0.0, 0.0, 1.0, 0.0], 0.0) end y = [1 2 3] ŷ = [4.0 5.0 6.0] @testset "kldivergence" begin - @test Flux.kldivergence([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ Flux.kldivergence([0.1,0.9], [0.1,0.9]) - @test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457 - @test Flux.kldivergence(y, y) ≈ 0 + @test Flux.kldivergence([0.1, 0.0, 0.9], [0.1, 0.0, 0.9]) ≈ + Flux.kldivergence([0.1, 0.9], [0.1, 0.9]) + @test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457 + @test Flux.kldivergence(y, y) ≈ 0 end y = [1 2 3 4] ŷ = [5.0 6.0 7.0 8.0] @testset "hinge_loss" begin - @test Flux.hinge_loss(ŷ, y) ≈ 0 - @test Flux.hinge_loss(y, 0.5 .* y) ≈ 0.125 + @test Flux.hinge_loss(ŷ, y) ≈ 0 + @test Flux.hinge_loss(y, 0.5 .* y) ≈ 0.125 end @testset "squared_hinge_loss" begin - @test Flux.squared_hinge_loss(ŷ, y) ≈ 0 - @test Flux.squared_hinge_loss(y, 0.5 .* y) ≈ 0.0625 + @test Flux.squared_hinge_loss(ŷ, y) ≈ 0 + @test Flux.squared_hinge_loss(y, 0.5 .* y) ≈ 0.0625 end y = [0.1 0.2 0.3] ŷ = [0.4 0.5 0.6] @testset "poisson_loss" begin - @test Flux.poisson_loss(ŷ, y) ≈ 0.6278353988097339 - @test Flux.poisson_loss(y, y) ≈ 0.5044459776946685 + @test Flux.poisson_loss(ŷ, y) ≈ 0.6278353988097339 + @test Flux.poisson_loss(y, y) ≈ 0.5044459776946685 end y = [1.0 0.5 0.3 2.4] ŷ = [0 1.4 0.5 1.2] @testset "dice_coeff_loss" begin - @test Flux.dice_coeff_loss(ŷ, y) ≈ 0.2799999999999999 - @test Flux.dice_coeff_loss(y, y) ≈ 0.0 + @test Flux.dice_coeff_loss(ŷ, y) ≈ 0.2799999999999999 + @test Flux.dice_coeff_loss(y, y) ≈ 0.0 end @testset "tversky_loss" begin - @test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383 - @test Flux.tversky_loss(ŷ, y, β=0.8) ≈ -0.09490740740740744 - @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075 + @test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383 + @test Flux.tversky_loss(ŷ, y, β = 0.8) ≈ -0.09490740740740744 + 
@test Flux.tversky_loss(y, y) ≈ -0.5576923076923075 end -@testset "no spurious promotions" begin - for T in (Float32, Float64) +@testset "no spurious promotions" begin for T in (Float32, Float64) y = rand(T, 2) ŷ = rand(T, 2) for f in ALL_LOSSES - fwd, back = Flux.pullback(f, ŷ, y) - @test fwd isa T - @test eltype(back(one(T))[1]) == T + fwd, back = Flux.pullback(f, ŷ, y) + @test fwd isa T + @test eltype(back(one(T))[1]) == T end - end -end +end end @testset "binary_focal_loss" begin - y = [0 1 0 - 1 0 1] - ŷ = [0.268941 0.5 0.268941 - 0.731059 0.5 0.731059] + y = [0 1 0 + 1 0 1] + ŷ = [0.268941 0.5 0.268941 + 0.731059 0.5 0.731059] y1 = [1 0 0 1] @@ -192,14 +192,14 @@ end 0.4 0.7] @test Flux.binary_focal_loss(ŷ, y) ≈ 0.0728675615927385 @test Flux.binary_focal_loss(ŷ1, y1) ≈ 0.05691642237852222 - @test Flux.binary_focal_loss(ŷ, y; γ=0.0) ≈ Flux.binarycrossentropy(ŷ, y) + @test Flux.binary_focal_loss(ŷ, y; γ = 0.0) ≈ Flux.binarycrossentropy(ŷ, y) end @testset "focal_loss" begin - y = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] - ŷ = softmax(reshape(-7:7, 3, 5) .* 1f0) + y = [1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0] + ŷ = softmax(reshape(-7:7, 3, 5) .* 1.0f0) y1 = [1 0 0 0 0 1] @@ -208,43 +208,48 @@ end 0.1 0.3] @test Flux.focal_loss(ŷ, y) ≈ 1.1277571935622628 @test Flux.focal_loss(ŷ1, y1) ≈ 0.45990566879720157 - @test Flux.focal_loss(ŷ, y; γ=0.0) ≈ Flux.crossentropy(ŷ, y) + @test Flux.focal_loss(ŷ, y; γ = 0.0) ≈ Flux.crossentropy(ŷ, y) end - + @testset "siamese_contrastive_loss" begin - y = [1 0 - 0 0 - 0 1] - ŷ = [0.4 0.2 - 0.5 0.5 - 0.1 0.3] - y1 = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] - ŷ1 = softmax(reshape(-7:7, 3, 5) .* 1.0f0) - y2 = [1 - 0 - 0 - 1 - 1] - ŷ2 = [0.6 - 0.4 - 0.1 - 0.2 - 0.7] - @test Flux.siamese_contrastive_loss(ŷ, y) ≈ 0.2333333333333333 - @test Flux.siamese_contrastive_loss(ŷ, y, margin = 0.5f0) ≈ 0.10000000000000002 - @test Flux.siamese_contrastive_loss(ŷ, y, margin = 1.5f0) ≈ 0.5333333333333333 - @test Flux.siamese_contrastive_loss(ŷ1, y1) ≈ 0.32554644f0 - @test Flux.siamese_contrastive_loss(ŷ1, y1, margin = 0.5f0) ≈ 0.16271012f0 - @test Flux.siamese_contrastive_loss(ŷ1, y1, margin = 1.5f0) ≈ 0.6532292f0 - @test Flux.siamese_contrastive_loss(ŷ, y, margin = 1) ≈ Flux.siamese_contrastive_loss(ŷ, y) - @test Flux.siamese_contrastive_loss(y, y) ≈ 0.0 - @test Flux.siamese_contrastive_loss(y1, y1) ≈ 0.0 - @test Flux.siamese_contrastive_loss(ŷ, y, margin = 0) ≈ 0.09166666666666667 - @test Flux.siamese_contrastive_loss(ŷ1, y1, margin = 0) ≈ 0.13161165f0 - @test Flux.siamese_contrastive_loss(ŷ2, y2) ≈ 0.21200000000000005 - @test Flux.siamese_contrastive_loss(ŷ2, ŷ2) ≈ 0.18800000000000003 - @test_throws DomainError(-0.5, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ1, y1, margin = -0.5) - @test_throws DomainError(-1, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ, y, margin = -1) + y = [1 0 + 0 0 + 0 1] + ŷ = [0.4 0.2 + 0.5 0.5 + 0.1 0.3] + y1 = [1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0] + ŷ1 = softmax(reshape(-7:7, 3, 5) .* 1.0f0) + y2 = [1 + 0 + 0 + 1 + 1] + ŷ2 = [0.6 + 0.4 + 0.1 + 0.2 + 0.7] + @test Flux.siamese_contrastive_loss(ŷ, y) ≈ 0.2333333333333333 + @test Flux.siamese_contrastive_loss(ŷ, y, margin = 0.5f0) ≈ 0.10000000000000002 + @test Flux.siamese_contrastive_loss(ŷ, y, margin = 1.5f0) ≈ 0.5333333333333333 + @test Flux.siamese_contrastive_loss(ŷ1, y1) ≈ 0.32554644f0 + @test Flux.siamese_contrastive_loss(ŷ1, y1, margin = 0.5f0) ≈ 0.16271012f0 + @test Flux.siamese_contrastive_loss(ŷ1, y1, margin = 1.5f0) ≈ 0.6532292f0 + @test Flux.siamese_contrastive_loss(ŷ, 
y, margin = 1) ≈ + Flux.siamese_contrastive_loss(ŷ, y) + @test Flux.siamese_contrastive_loss(y, y) ≈ 0.0 + @test Flux.siamese_contrastive_loss(y1, y1) ≈ 0.0 + @test Flux.siamese_contrastive_loss(ŷ, y, margin = 0) ≈ 0.09166666666666667 + @test Flux.siamese_contrastive_loss(ŷ1, y1, margin = 0) ≈ 0.13161165f0 + @test Flux.siamese_contrastive_loss(ŷ2, y2) ≈ 0.21200000000000005 + @test Flux.siamese_contrastive_loss(ŷ2, ŷ2) ≈ 0.18800000000000003 + @test_throws DomainError(-0.5, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ1, + y1, + margin = -0.5) + @test_throws DomainError(-1, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ, + y, + margin = -1) end diff --git a/test/optimise.jl b/test/optimise.jl index 41de5a4a10..0aef531001 100644 --- a/test/optimise.jl +++ b/test/optimise.jl @@ -6,211 +6,206 @@ using Test using Random @testset "Optimise" begin - # Ensure rng has different state inside and outside the inner @testset - # so that w and w' are different - Random.seed!(84) - w = randn(10, 10) - @testset for opt in [AdamW(), AdaGrad(0.1), AdaMax(), AdaDelta(0.9), AMSGrad(), - NAdam(), RAdam(), Descent(0.1), Adam(), OAdam(), AdaBelief(), - Nesterov(), RMSProp(), Momentum()] - Random.seed!(42) - w′ = randn(10, 10) - b = false - loss(x) = Flux.Losses.mse(w*x, w′*x .+ b) - for t = 1: 10^5 - θ = params([w′, b]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - Optimise.update!(opt, θ, θ̄) + # Ensure rng has different state inside and outside the inner @testset + # so that w and w' are different + Random.seed!(84) + w = randn(10, 10) + @testset for opt in [AdamW(), AdaGrad(0.1), AdaMax(), AdaDelta(0.9), AMSGrad(), + NAdam(), RAdam(), Descent(0.1), Adam(), OAdam(), AdaBelief(), + Nesterov(), RMSProp(), Momentum()] + Random.seed!(42) + w′ = randn(10, 10) + b = false + loss(x) = Flux.Losses.mse(w * x, w′ * x .+ b) + for t in 1:(10^5) + θ = params([w′, b]) + x = rand(10) + θ̄ = gradient(() -> loss(x), θ) + Optimise.update!(opt, θ, θ̄) + end + @test loss(rand(10, 10)) < 0.01 end - @test loss(rand(10, 10)) < 0.01 - end end @testset "Optimiser" begin - Random.seed!(84) - w = randn(10, 10) - @testset for Opt in [InvDecay, WeightDecay, ExpDecay] - Random.seed!(42) - w′ = randn(10, 10) - loss(x) = Flux.Losses.mse(w*x, w′*x) - opt = Optimiser(Opt(), Adam(0.001)) - for t = 1:10^5 - θ = Params([w′]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - Optimise.update!(opt, θ, θ̄) + Random.seed!(84) + w = randn(10, 10) + @testset for Opt in [InvDecay, WeightDecay, ExpDecay] + Random.seed!(42) + w′ = randn(10, 10) + loss(x) = Flux.Losses.mse(w * x, w′ * x) + opt = Optimiser(Opt(), Adam(0.001)) + for t in 1:(10^5) + θ = Params([w′]) + x = rand(10) + θ̄ = gradient(() -> loss(x), θ) + Optimise.update!(opt, θ, θ̄) + end + @test loss(rand(10, 10)) < 0.01 end - @test loss(rand(10, 10)) < 0.01 - end end @testset "Training Loop" begin - i = 0 - l = 1 - Flux.train!( - () -> (sleep(0.1); Flux.skip(); i+=1), - Params([]), - Iterators.repeated((), 10), - Descent() - ) - - @test i==0 #all skipped - - Flux.train!( - () -> (sleep(0.1); i==8 && Flux.skip(); i+=1), - Params([]), - Iterators.repeated((), 10), - Descent() - ) - - @test i==8 #skip after i hit 8 - - i = 0 - Flux.train!(() -> (sleep(0.1); i += 1; l), - Params([]), - Iterators.repeated((), 100), - Descent(), - cb = Flux.throttle(() -> (i > 3 && Flux.stop()), 1)) - - @test 3 < i < 50 - - # Test multiple callbacks - x = 0 - fs = [() -> (), () -> x = 1] - cbs = runall(fs) - cbs() - @test x == 1 - - r = rand(3, 3) - loss(x) = sum(x .* x) - 
Flux.train!(loss, Flux.params(r), (r,), Descent()) + i = 0 + l = 1 + Flux.train!(() -> (sleep(0.1); Flux.skip(); i += 1), + Params([]), + Iterators.repeated((), 10), + Descent()) + + @test i == 0 #all skipped + + Flux.train!(() -> (sleep(0.1); i == 8 && Flux.skip(); i += 1), + Params([]), + Iterators.repeated((), 10), + Descent()) + + @test i == 8 #skip after i hit 8 + + i = 0 + Flux.train!(() -> (sleep(0.1); i += 1; l), + Params([]), + Iterators.repeated((), 100), + Descent(), + cb = Flux.throttle(() -> (i > 3 && Flux.stop()), 1)) + + @test 3 < i < 50 + + # Test multiple callbacks + x = 0 + fs = [() -> (), () -> x = 1] + cbs = runall(fs) + cbs() + @test x == 1 + + r = rand(3, 3) + loss(x) = sum(x .* x) + Flux.train!(loss, Flux.params(r), (r,), Descent()) end @testset "Stop on NaN" begin - m = Dense(1 => 1) - m.weight .= 0 - CNT = 0 - @test_throws DomainError Flux.train!(Flux.params(m), 1:100, Descent(0.1)) do i - CNT += 1 - (i == 51 ? NaN32 : 1f0) * sum(m([1.0])) - end - @test CNT == 51 # stopped early - @test m.weight[1] ≈ -5 # did not corrupt weights + m = Dense(1 => 1) + m.weight .= 0 + CNT = 0 + @test_throws DomainError Flux.train!(Flux.params(m), 1:100, Descent(0.1)) do i + CNT += 1 + return (i == 51 ? NaN32 : 1.0f0) * sum(m([1.0])) + end + @test CNT == 51 # stopped early + @test m.weight[1] ≈ -5 # did not corrupt weights end @testset "ExpDecay" begin + @testset "Sanity Check" begin + o = ExpDecay(0.2, 0.5, 1, 1e-3) + p = [0.0] + steps = 1:8 + eta_expected = @. max(o.eta * 0.5^steps, o.clip) + eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] + @test eta_actual == eta_expected + end - @testset "Sanity Check" begin - o = ExpDecay(0.2, 0.5, 1, 1e-3) - p = [0.0] - steps = 1:8 - eta_expected = @. max(o.eta * 0.5 ^ steps, o.clip) - eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] - @test eta_actual == eta_expected - end - - @testset "starting step" begin - start = 4 - o = ExpDecay(0.2, 0.5, 1, 1e-3, start) - p = [0.0] - steps = 1:8 - eta_expected = @. max(o.eta * 0.5 ^ max(steps - start, 0), o.clip) - eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] - @test eta_actual == eta_expected - end - - w = randn(10, 10) - o = ExpDecay(0.1, 0.1, 1000, 1e-4) - w1 = randn(10,10) - loss(x) = Flux.Losses.mse(w*x, w1*x) - flag = 1 - decay_steps = [] - for t = 1:10^5 - prev_eta = o.eta - θ = Params([w1]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - prev_grad = collect(θ̄[w1]) - delta = Optimise.apply!(o, w1, θ̄[w1]) - w1 .-= delta - new_eta = o.eta - if new_eta != prev_eta - push!(decay_steps, t) + @testset "starting step" begin + start = 4 + o = ExpDecay(0.2, 0.5, 1, 1e-3, start) + p = [0.0] + steps = 1:8 + eta_expected = @. max(o.eta * 0.5^max(steps - start, 0), o.clip) + eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] + @test eta_actual == eta_expected end - array = fill(o.eta, size(prev_grad)) - if array .* prev_grad != delta - flag = 0 + + w = randn(10, 10) + o = ExpDecay(0.1, 0.1, 1000, 1e-4) + w1 = randn(10, 10) + loss(x) = Flux.Losses.mse(w * x, w1 * x) + flag = 1 + decay_steps = [] + for t in 1:(10^5) + prev_eta = o.eta + θ = Params([w1]) + x = rand(10) + θ̄ = gradient(() -> loss(x), θ) + prev_grad = collect(θ̄[w1]) + delta = Optimise.apply!(o, w1, θ̄[w1]) + w1 .-= delta + new_eta = o.eta + if new_eta != prev_eta + push!(decay_steps, t) + end + array = fill(o.eta, size(prev_grad)) + if array .* prev_grad != delta + flag = 0 + end end - end - @test flag == 1 - # Test to check if decay happens at decay steps. 
Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1). - ground_truth = [] - for i in 1:4 - push!(ground_truth, 1000*i) # Expected decay steps for this example. - end - @test decay_steps == ground_truth - @test o.eta == o.clip + @test flag == 1 + # Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1). + ground_truth = [] + for i in 1:4 + push!(ground_truth, 1000 * i) # Expected decay steps for this example. + end + @test decay_steps == ground_truth + @test o.eta == o.clip end @testset "Clipping" begin - w = randn(10, 10) - loss(x) = sum(w * x) - θ = Params([w]) - x = 1000 * randn(10) - w̄ = gradient(() -> loss(x), θ)[w] - w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄)) - @test all(w̄_value .<= 1) - w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄)) - @test norm(w̄_norm) <= 1 + w = randn(10, 10) + loss(x) = sum(w * x) + θ = Params([w]) + x = 1000 * randn(10) + w̄ = gradient(() -> loss(x), θ)[w] + w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄)) + @test all(w̄_value .<= 1) + w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄)) + @test norm(w̄_norm) <= 1 end @testset "update!: handle Fills from Zygote" begin - w = randn(10,10) - wold = copy(w) - g = FillArrays.Ones(size(w)) - opt = Descent(0.1) - Flux.update!(opt, w, g) - @test w ≈ wold .- 0.1 - - w = randn(3) - wold = copy(w) - θ = Flux.params([w]) - gs = gradient(() -> w[1], θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w[1] ≈ wold[1] .- 0.1 - @test w[2:3] ≈ wold[2:3] - - ## Issue #1510 - w = randn(10,10) - wold = copy(w) - θ = Flux.params([w]) - gs = gradient(() -> sum(w), θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w ≈ wold .- 0.1 + w = randn(10, 10) + wold = copy(w) + g = FillArrays.Ones(size(w)) + opt = Descent(0.1) + Flux.update!(opt, w, g) + @test w ≈ wold .- 0.1 + + w = randn(3) + wold = copy(w) + θ = Flux.params([w]) + gs = gradient(() -> w[1], θ) + opt = Descent(0.1) + Flux.update!(opt, θ, gs) + @test w[1] ≈ wold[1] .- 0.1 + @test w[2:3] ≈ wold[2:3] + + ## Issue #1510 + w = randn(10, 10) + wold = copy(w) + θ = Flux.params([w]) + gs = gradient(() -> sum(w), θ) + opt = Descent(0.1) + Flux.update!(opt, θ, gs) + @test w ≈ wold .- 0.1 end @testset "update!: handle ComponentArrays" begin - w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2])) - wold = deepcopy(w) - θ = Flux.params([w]) - gs = gradient(() -> sum(w.a) + sum(w.c.b), θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w.a ≈ wold.a .- 0.1 - @test w.b ≈ wold.b - @test w.c.b ≈ wold.c.b .- 0.1 - @test w.c.a ≈ wold.c.a - - w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2])) - wold = deepcopy(w) - θ = Flux.params([w]) - gs = gradient(() -> sum(w), θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w ≈ wold .- 0.1 + w = ComponentArrays.ComponentArray(a = 1.0, b = [2, 1, 4], c = (a = 2, b = [1, 2])) + wold = deepcopy(w) + θ = Flux.params([w]) + gs = gradient(() -> sum(w.a) + sum(w.c.b), θ) + opt = Descent(0.1) + Flux.update!(opt, θ, gs) + @test w.a ≈ wold.a .- 0.1 + @test w.b ≈ wold.b + @test w.c.b ≈ wold.c.b .- 0.1 + @test w.c.a ≈ wold.c.a + + w = ComponentArrays.ComponentArray(a = 1.0, b = [2, 1, 4], c = (a = 2, b = [1, 2])) + wold = deepcopy(w) + θ = Flux.params([w]) + gs = gradient(() -> sum(w), θ) + opt = Descent(0.1) + Flux.update!(opt, θ, gs) + @test w ≈ wold .- 0.1 end # Flux PR #1776 @@ -221,15 +216,15 @@ end # wreaks all sorts of havoc on our 
training loops. This test ensures that # a simple optimization is montonically decreasing (up to learning step effects) @testset "Momentum Optimisers and complex values" begin - # Test every optimizer that has momentum internally - for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief] +# Test every optimizer that has momentum internally +for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief] # Our "model" is just a complex number w = zeros(ComplexF32, 1) # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x` function loss() # Deterministic training data is the best training data - x = ones(1, 1) + 1im*ones(1, 1) + x = ones(1, 1) + 1im * ones(1, 1) # Manually implement `mse()` to allow demonstration of brokenness # on older Flux builds that don't have a fixed `mse()` @@ -247,5 +242,4 @@ end last_loss = loss() Flux.update!(opt, params, grads) end - end -end +end end diff --git a/test/outputsize.jl b/test/outputsize.jl index 0e5b807a60..a6e0238830 100644 --- a/test/outputsize.jl +++ b/test/outputsize.jl @@ -1,259 +1,261 @@ @testset "basic" begin - m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) - @test outputsize(m, (10, 10, 3, 1)) == (6, 6, 32, 1) + m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) + @test outputsize(m, (10, 10, 3, 1)) == (6, 6, 32, 1) - m = Dense(10, 5) - @test_throws DimensionMismatch outputsize(m, (5, 2)) == (5, 1) - @test outputsize(m, (10,); padbatch=true) == (5, 1) + m = Dense(10, 5) + @test_throws DimensionMismatch outputsize(m, (5, 2))==(5, 1) + @test outputsize(m, (10,); padbatch = true) == (5, 1) - m = Chain(Dense(10, 8, σ), Dense(8, 5), Dense(5, 2)) - @test outputsize(m, (10,); padbatch=true) == (2, 1) - @test outputsize(m, (10, 30)) == (2, 30) + m = Chain(Dense(10, 8, σ), Dense(8, 5), Dense(5, 2)) + @test outputsize(m, (10,); padbatch = true) == (2, 1) + @test outputsize(m, (10, 30)) == (2, 30) - @info "Don't mind the following error, it's for testing purpose." - m = Chain(Dense(10, 8, σ), Dense(8, 4), Dense(5, 2)) - @test_throws DimensionMismatch outputsize(m, (10,)) + @info "Don't mind the following error, it's for testing purpose." 
+ m = Chain(Dense(10, 8, σ), Dense(8, 4), Dense(5, 2)) + @test_throws DimensionMismatch outputsize(m, (10,)) - m = Flux.Scale(10) - @test outputsize(m, (10, 1)) == (10, 1) + m = Flux.Scale(10) + @test outputsize(m, (10, 1)) == (10, 1) - m = Maxout(() -> Conv((3, 3), 3 => 16), 2) - @test outputsize(m, (10, 10, 3, 1)) == (8, 8, 16, 1) + m = Maxout(() -> Conv((3, 3), 3 => 16), 2) + @test outputsize(m, (10, 10, 3, 1)) == (8, 8, 16, 1) - m = flatten - @test outputsize(m, (5, 5, 3, 10)) == (75, 10) + m = flatten + @test outputsize(m, (5, 5, 3, 10)) == (75, 10) - m = Flux.unsqueeze(dims=3) - @test outputsize(m, (5, 7, 13)) == (5, 7, 1, 13) + m = Flux.unsqueeze(dims = 3) + @test outputsize(m, (5, 7, 13)) == (5, 7, 1, 13) - m = Flux.Bilinear(10, 10, 7) - @test outputsize(m, (10,)) == (7,) - @test outputsize(m, (10, 32)) == (7, 32) + m = Flux.Bilinear(10, 10, 7) + @test outputsize(m, (10,)) == (7,) + @test outputsize(m, (10, 32)) == (7, 32) - m = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), flatten, Dense(1024, 10)) - @test outputsize(m, (10, 10, 3, 50)) == (10, 50) - @test outputsize(m, (10, 10, 3, 2)) == (10, 2) + m = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), flatten, Dense(1024, 10)) + @test outputsize(m, (10, 10, 3, 50)) == (10, 50) + @test outputsize(m, (10, 10, 3, 2)) == (10, 2) - m = SkipConnection(Conv((3, 3), 3 => 16; pad = 1), (mx, x) -> cat(mx, x; dims = 3)) - @test outputsize(m, (10, 10, 3, 1)) == (10, 10, 19, 1) + m = SkipConnection(Conv((3, 3), 3 => 16; pad = 1), (mx, x) -> cat(mx, x; dims = 3)) + @test outputsize(m, (10, 10, 3, 1)) == (10, 10, 19, 1) - m = Parallel((mx, x) -> cat(mx, x; dims = 3), Conv((3, 3), 3 => 16; pad = 1), identity) - @test outputsize(m, (10, 10, 3, 1)) == (10, 10, 19, 1) + m = Parallel((mx, x) -> cat(mx, x; dims = 3), Conv((3, 3), 3 => 16; pad = 1), identity) + @test outputsize(m, (10, 10, 3, 1)) == (10, 10, 19, 1) end @testset "multiple inputs" begin - m = Parallel(vcat, Dense(2, 4, relu), Dense(3, 6, relu)) - @test outputsize(m, (2,), (3,)) == (10,) - @test outputsize(m, ((2,), (3,))) == (10,) - @test outputsize(m, (2,), (3,); padbatch=true) == (10, 1) - @test outputsize(m, (2,7), (3,7)) == (10, 7) - - m = Chain(m, Dense(10, 13, tanh), softmax) - @test outputsize(m, (2,), (3,)) == (13,) - @test outputsize(m, ((2,), (3,))) == (13,) - @test outputsize(m, (2,), (3,); padbatch=true) == (13, 1) - @test outputsize(m, (2,7), (3,7)) == (13, 7) + m = Parallel(vcat, Dense(2, 4, relu), Dense(3, 6, relu)) + @test outputsize(m, (2,), (3,)) == (10,) + @test outputsize(m, ((2,), (3,))) == (10,) + @test outputsize(m, (2,), (3,); padbatch = true) == (10, 1) + @test outputsize(m, (2, 7), (3, 7)) == (10, 7) + + m = Chain(m, Dense(10, 13, tanh), softmax) + @test outputsize(m, (2,), (3,)) == (13,) + @test outputsize(m, ((2,), (3,))) == (13,) + @test outputsize(m, (2,), (3,); padbatch = true) == (13, 1) + @test outputsize(m, (2, 7), (3, 7)) == (13, 7) end -@testset "activations" begin - @testset for f in [celu, elu, gelu, hardsigmoid, hardtanh, - leakyrelu, lisht, logcosh, logσ, mish, - relu, relu6, rrelu, selu, σ, softplus, - softshrink, softsign, swish, tanhshrink, trelu] +@testset "activations" begin @testset for f in [celu, elu, gelu, hardsigmoid, hardtanh, + leakyrelu, lisht, logcosh, logσ, mish, + relu, relu6, rrelu, selu, σ, softplus, + softshrink, softsign, swish, tanhshrink, trelu] @test outputsize(Dense(10, 5, f), (10, 1)) == (5, 1) - end -end +end end @testset "conv" begin - m = Conv((3, 3), 3 => 16) - @test outputsize(m, (10, 10, 3, 1)) == (8, 8, 16, 1) - m = Conv((3, 
3), 3 => 16; stride = 2) - @test outputsize(m, (5, 5, 3, 1)) == (2, 2, 16, 1) - m = Conv((3, 3), 3 => 16; stride = 2, pad = 3) - @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 16, 1) - m = Conv((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) - @test outputsize(m, (5, 5, 3, 1)) == (4, 4, 16, 1) - @test_throws DimensionMismatch outputsize(m, (5, 5, 2)) - @test outputsize(m, (5, 5, 3, 100)) == (4, 4, 16, 100) - - m = ConvTranspose((3, 3), 3 => 16) - @test outputsize(m, (8, 8, 3, 1)) == (10, 10, 16, 1) - m = ConvTranspose((3, 3), 3 => 16; stride = 2) - @test outputsize(m, (2, 2, 3, 1)) == (5, 5, 16, 1) - m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3) - @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 16, 1) - m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) - @test outputsize(m, (4, 4, 3, 1)) == (5, 5, 16, 1) - - m = DepthwiseConv((3, 3), 3 => 6) - @test outputsize(m, (10, 10, 3, 1)) == (8, 8, 6, 1) - m = DepthwiseConv((3, 3), 3 => 6; stride = 2) - @test outputsize(m, (5, 5, 3, 1)) == (2, 2, 6, 1) - m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3) - @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 6, 1) - m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3, dilation = 2) - @test outputsize(m, (5, 5, 3, 1)) == (4, 4, 6, 1) - - m = CrossCor((3, 3), 3 => 16) - @test outputsize(m, (10, 10, 3, 1)) == (8, 8, 16, 1) - m = CrossCor((3, 3), 3 => 16; stride = 2) - @test outputsize(m, (5, 5, 3, 1)) == (2, 2, 16, 1) - m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3) - @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 16, 1) - m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) - @test outputsize(m, (5, 5, 3, 1)) == (4, 4, 16, 1) - - m = AdaptiveMaxPool((2, 2)) - @test outputsize(m, (10, 10, 3, 1)) == (2, 2, 3, 1) - - m = AdaptiveMeanPool((2, 2)) - @test outputsize(m, (10, 10, 3, 1)) == (2, 2, 3, 1) - - m = GlobalMaxPool() - @test outputsize(m, (10, 10, 3, 1)) == (1, 1, 3, 1) - - m = GlobalMeanPool() - @test outputsize(m, (10, 10, 3, 1)) == (1, 1, 3, 1) - - m = MaxPool((2, 2)) - @test outputsize(m, (10, 10, 3, 1)) == (5, 5, 3, 1) - m = MaxPool((2, 2); stride = 1) - @test outputsize(m, (5, 5, 4, 1)) == (4, 4, 4, 1) - m = MaxPool((2, 2); stride = 2, pad = 3) - @test outputsize(m, (5, 5, 2, 1)) == (5, 5, 2, 1) - - m = MeanPool((2, 2)) - @test outputsize(m, (10, 10, 3, 1)) == (5, 5, 3, 1) - m = MeanPool((2, 2); stride = 1) - @test outputsize(m, (5, 5, 4, 1)) == (4, 4, 4, 1) - m = MeanPool((2, 2); stride = 2, pad = 3) - @test outputsize(m, (5, 5, 2, 1)) == (5, 5, 2, 1) + m = Conv((3, 3), 3 => 16) + @test outputsize(m, (10, 10, 3, 1)) == (8, 8, 16, 1) + m = Conv((3, 3), 3 => 16; stride = 2) + @test outputsize(m, (5, 5, 3, 1)) == (2, 2, 16, 1) + m = Conv((3, 3), 3 => 16; stride = 2, pad = 3) + @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 16, 1) + m = Conv((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test outputsize(m, (5, 5, 3, 1)) == (4, 4, 16, 1) + @test_throws DimensionMismatch outputsize(m, (5, 5, 2)) + @test outputsize(m, (5, 5, 3, 100)) == (4, 4, 16, 100) + + m = ConvTranspose((3, 3), 3 => 16) + @test outputsize(m, (8, 8, 3, 1)) == (10, 10, 16, 1) + m = ConvTranspose((3, 3), 3 => 16; stride = 2) + @test outputsize(m, (2, 2, 3, 1)) == (5, 5, 16, 1) + m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3) + @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 16, 1) + m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test outputsize(m, (4, 4, 3, 1)) == (5, 5, 16, 1) + + m = DepthwiseConv((3, 3), 3 => 6) + @test outputsize(m, (10, 10, 3, 1)) == (8, 
8, 6, 1) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2) + @test outputsize(m, (5, 5, 3, 1)) == (2, 2, 6, 1) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3) + @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 6, 1) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3, dilation = 2) + @test outputsize(m, (5, 5, 3, 1)) == (4, 4, 6, 1) + + m = CrossCor((3, 3), 3 => 16) + @test outputsize(m, (10, 10, 3, 1)) == (8, 8, 16, 1) + m = CrossCor((3, 3), 3 => 16; stride = 2) + @test outputsize(m, (5, 5, 3, 1)) == (2, 2, 16, 1) + m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3) + @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 16, 1) + m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test outputsize(m, (5, 5, 3, 1)) == (4, 4, 16, 1) + + m = AdaptiveMaxPool((2, 2)) + @test outputsize(m, (10, 10, 3, 1)) == (2, 2, 3, 1) + + m = AdaptiveMeanPool((2, 2)) + @test outputsize(m, (10, 10, 3, 1)) == (2, 2, 3, 1) + + m = GlobalMaxPool() + @test outputsize(m, (10, 10, 3, 1)) == (1, 1, 3, 1) + + m = GlobalMeanPool() + @test outputsize(m, (10, 10, 3, 1)) == (1, 1, 3, 1) + + m = MaxPool((2, 2)) + @test outputsize(m, (10, 10, 3, 1)) == (5, 5, 3, 1) + m = MaxPool((2, 2); stride = 1) + @test outputsize(m, (5, 5, 4, 1)) == (4, 4, 4, 1) + m = MaxPool((2, 2); stride = 2, pad = 3) + @test outputsize(m, (5, 5, 2, 1)) == (5, 5, 2, 1) + + m = MeanPool((2, 2)) + @test outputsize(m, (10, 10, 3, 1)) == (5, 5, 3, 1) + m = MeanPool((2, 2); stride = 1) + @test outputsize(m, (5, 5, 4, 1)) == (4, 4, 4, 1) + m = MeanPool((2, 2); stride = 2, pad = 3) + @test outputsize(m, (5, 5, 2, 1)) == (5, 5, 2, 1) end @testset "normalisation" begin - m = Dropout(0.1) - @test outputsize(m, (10, 10)) == (10, 10) - @test outputsize(m, (10,); padbatch=true) == (10, 1) - - m = AlphaDropout(0.1) - @test outputsize(m, (10, 10)) == (10, 10) - @test outputsize(m, (10,); padbatch=true) == (10, 1) - - m = LayerNorm(32) - @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) - @test outputsize(m, (32, 32, 3); padbatch=true) == (32, 32, 3, 1) - m2 = LayerNorm(3, 2) - @test outputsize(m2, (3, 2)) == (3, 2) == size(m2(randn(3, 2))) - @test outputsize(m2, (3,)) == (3, 2) == size(m2(randn(3, 2))) - - m = BatchNorm(3) - @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) - @test outputsize(m, (32, 32, 3); padbatch=true) == (32, 32, 3, 1) - @test_throws Exception m(randn(Float32, 32, 32, 5, 1)) - @test_throws DimensionMismatch outputsize(m, (32, 32, 5, 1)) - - m = InstanceNorm(3) - @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) - @test outputsize(m, (32, 32, 3); padbatch=true) == (32, 32, 3, 1) - @test_throws Exception m(randn(Float32, 32, 32, 5, 1)) - @test_throws DimensionMismatch outputsize(m, (32, 32, 5, 1)) - - m = GroupNorm(16, 4) - @test outputsize(m, (32, 32, 16, 16)) == (32, 32, 16, 16) - @test outputsize(m, (32, 32, 16); padbatch=true) == (32, 32, 16, 1) - @test_throws Exception m(randn(Float32, 32, 32, 15, 4)) - @test_throws DimensionMismatch outputsize(m, (32, 32, 15, 4)) + m = Dropout(0.1) + @test outputsize(m, (10, 10)) == (10, 10) + @test outputsize(m, (10,); padbatch = true) == (10, 1) + + m = AlphaDropout(0.1) + @test outputsize(m, (10, 10)) == (10, 10) + @test outputsize(m, (10,); padbatch = true) == (10, 1) + + m = LayerNorm(32) + @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) + @test outputsize(m, (32, 32, 3); padbatch = true) == (32, 32, 3, 1) + m2 = LayerNorm(3, 2) + @test outputsize(m2, (3, 2)) == (3, 2) == size(m2(randn(3, 2))) + @test outputsize(m2, (3,)) == (3, 2) == size(m2(randn(3, 2))) + + m = 
BatchNorm(3) + @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) + @test outputsize(m, (32, 32, 3); padbatch = true) == (32, 32, 3, 1) + @test_throws Exception m(randn(Float32, 32, 32, 5, 1)) + @test_throws DimensionMismatch outputsize(m, (32, 32, 5, 1)) + + m = InstanceNorm(3) + @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) + @test outputsize(m, (32, 32, 3); padbatch = true) == (32, 32, 3, 1) + @test_throws Exception m(randn(Float32, 32, 32, 5, 1)) + @test_throws DimensionMismatch outputsize(m, (32, 32, 5, 1)) + + m = GroupNorm(16, 4) + @test outputsize(m, (32, 32, 16, 16)) == (32, 32, 16, 16) + @test outputsize(m, (32, 32, 16); padbatch = true) == (32, 32, 16, 1) + @test_throws Exception m(randn(Float32, 32, 32, 15, 4)) + @test_throws DimensionMismatch outputsize(m, (32, 32, 15, 4)) end @testset "autosize macro" begin - m = @autosize (3,) Dense(_ => 4) - @test randn(3) |> m |> size == (4,) + m = @autosize (3,) Dense(_ => 4) + @test randn(3) |> m |> size == (4,) - m = @autosize (3, 1) Chain(Dense(_, 4), Dense(4 => 10), softmax) - @test randn(3, 5) |> m |> size == (10, 5) - - m = @autosize (2, 3, 4, 5) Dense(_ => 10) # goes by first dim, not 2nd-last - @test randn(2, 3, 4, 5) |> m |> size == (10, 3, 4, 5) + m = @autosize (3, 1) Chain(Dense(_, 4), Dense(4 => 10), softmax) + @test randn(3, 5) |> m |> size == (10, 5) - @test_broken begin # outputsize fails on Embedding - m = @autosize (2, 3, 4, 5) Embedding(_ => 10) # goes by first dim, not 2nd-last + m = @autosize (2, 3, 4, 5) Dense(_ => 10) # goes by first dim, not 2nd-last @test randn(2, 3, 4, 5) |> m |> size == (10, 3, 4, 5) - end - - m = @autosize (9,) Dense(_ => div(_,2)) - @test randn(9) |> m |> size == (4,) - - m = @autosize (3,) Chain(one = Dense(_ => 4), two = softmax) # needs kw - @test randn(3) |> m |> size == (4,) - - m = @autosize (3,) Chain(; one = Dense(_ => 4), two = softmax) # needs parameters - @test randn(3) |> m |> size == (4,) - - m = @autosize (3, 45) Maxout(() -> Dense(_ => 6, tanh), 2) # needs ->, block - @test randn(3, 45) |> m |> size == (6, 45) - - # here Parallel gets two inputs, no problem: - m = @autosize (3,) Chain(SkipConnection(Dense(_ => 4), Parallel(vcat, Dense(_ => 5), Dense(_ => 6))), Flux.Scale(_)) - @test randn(3) |> m |> size == (11,) - - # like Dense, LayerNorm goes by the first dimension: - m = @autosize (3, 4, 5) LayerNorm(_) - @test rand(3, 6, 7) |> m |> size == (3, 6, 7) - - m = @autosize (3, 3, 10) LayerNorm(_, _) # does not check that sizes match - @test rand(3, 3, 10) |> m |> size == (3, 3, 10) - - m = @autosize (3,) Flux.Bilinear(_ => 10) - @test randn(3) |> m |> size == (10,) - - m = @autosize (3, 1) Flux.Bilinear(_ => 10) - @test randn(3, 4) |> m |> size == (10, 4) - - m = @autosize (3,) SkipConnection(Dense(_ => _), Flux.Bilinear(_ => 10)) # Bilinear gets two inputs - @test randn(3, 4) |> m |> size == (10, 4) - - @test_throws Exception @eval @autosize (3,) Flux.Bilinear((_,3) => 10) - - # first docstring example - m = @autosize (3, 1) Chain(Dense(_ => 2, sigmoid), BatchNorm(_, affine=false)) - @test randn(3, 4) |> m |> size == (2, 4) - - # evil docstring example - img = [28, 28]; - m = @autosize (img..., 1, 32) Chain( # size is only needed at runtime - Chain(c = Conv((3,3), _ => 5; stride=2, pad=SamePad()), - p = MeanPool((3,3)), - b = BatchNorm(_), - f = Flux.flatten), - Dense(_ => _÷4, relu, init=Flux.rand32), # can calculate output size _÷4 - SkipConnection(Dense(_ => _, relu), +), - Dense(_ => 10), - ) - @test randn(Float32, img..., 1, 32) |> m |> size == (10, 32) - - # 
https://github.com/FluxML/Flux.jl/issues/2086 - m = @autosize (3, 1) Chain(; c = Dense(_ => 2, sigmoid), b = BatchNorm(_, affine=false)) - @test randn(Float32, 3, 32) |> m |> size == (2, 32) + + @test_broken begin # outputsize fails on Embedding + m = @autosize (2, 3, 4, 5) Embedding(_ => 10) # goes by first dim, not 2nd-last + @test randn(2, 3, 4, 5) |> m |> size == (10, 3, 4, 5) + end + + m = @autosize (9,) Dense(_ => div(_, 2)) + @test randn(9) |> m |> size == (4,) + + m = @autosize (3,) Chain(one = Dense(_ => 4), two = softmax) # needs kw + @test randn(3) |> m |> size == (4,) + + m = @autosize (3,) Chain(; one = Dense(_ => 4), two = softmax) # needs parameters + @test randn(3) |> m |> size == (4,) + + m = @autosize (3, 45) Maxout(() -> Dense(_ => 6, tanh), 2) # needs ->, block + @test randn(3, 45) |> m |> size == (6, 45) + + # here Parallel gets two inputs, no problem: + m = @autosize (3,) Chain(SkipConnection(Dense(_ => 4), + Parallel(vcat, Dense(_ => 5), Dense(_ => 6))), + Flux.Scale(_)) + @test randn(3) |> m |> size == (11,) + + # like Dense, LayerNorm goes by the first dimension: + m = @autosize (3, 4, 5) LayerNorm(_) + @test rand(3, 6, 7) |> m |> size == (3, 6, 7) + + m = @autosize (3, 3, 10) LayerNorm(_, _) # does not check that sizes match + @test rand(3, 3, 10) |> m |> size == (3, 3, 10) + + m = @autosize (3,) Flux.Bilinear(_ => 10) + @test randn(3) |> m |> size == (10,) + + m = @autosize (3, 1) Flux.Bilinear(_ => 10) + @test randn(3, 4) |> m |> size == (10, 4) + + m = @autosize (3,) SkipConnection(Dense(_ => _), Flux.Bilinear(_ => 10)) # Bilinear gets two inputs + @test randn(3, 4) |> m |> size == (10, 4) + + @test_throws Exception @eval @autosize (3,) Flux.Bilinear((_, 3) => 10) + + # first docstring example + m = @autosize (3, 1) Chain(Dense(_ => 2, sigmoid), BatchNorm(_, affine = false)) + @test randn(3, 4) |> m |> size == (2, 4) + + # evil docstring example + img = [28, 28] + m = @autosize (img..., 1, 32) Chain(Chain(c = Conv((3, 3), _ => 5; stride = 2, + pad = SamePad()), + p = MeanPool((3, 3)), + b = BatchNorm(_), + f = Flux.flatten), + Dense(_ => _ ÷ 4, relu, init = Flux.rand32), # can calculate output size _÷4 + SkipConnection(Dense(_ => _, relu), +), + Dense(_ => 10)) + @test randn(Float32, img..., 1, 32) |> m |> size == (10, 32) + + # https://github.com/FluxML/Flux.jl/issues/2086 + m = @autosize (3, 1) Chain(; c = Dense(_ => 2, sigmoid), + b = BatchNorm(_, affine = false)) + @test randn(Float32, 3, 32) |> m |> size == (2, 32) end @testset "LazyLayer" begin - # This is what `@autosize` uses, ideally nobody should make these by hand! - # Implicitly testeed by the macro, explicitly here too: - ld = Flux.LazyLayer("Dense(_ => 3, relu; init=??)", x -> Dense(Flux.autosizefor(Dense, x) => 3, relu, init=ones), nothing) + # This is what `@autosize` uses, ideally nobody should make these by hand! 
+ # Implicitly testeed by the macro, explicitly here too: + ld = Flux.LazyLayer("Dense(_ => 3, relu; init=??)", + x -> Dense(Flux.autosizefor(Dense, x) => 3, relu, init = ones), + nothing) + + lm = Chain(ld, Flux.Scale(3)) + @test string(ld) == "LazyLayer(Dense(_ => 3, relu; init=??))" + @test_throws Exception Flux.striplazy(lm) - lm = Chain(ld, Flux.Scale(3)) - @test string(ld) == "LazyLayer(Dense(_ => 3, relu; init=??))" - @test_throws Exception Flux.striplazy(lm) + @test lm([1, 2]) == [3, 3, 3] - @test lm([1,2]) == [3,3,3] + @test string(ld) == "LazyLayer(Dense(2 => 3, relu))" + @test Flux.striplazy(ld) isa Dense - @test string(ld) == "LazyLayer(Dense(2 => 3, relu))" - @test Flux.striplazy(ld) isa Dense + @test_throws Exception Flux.params(lm) + @test_throws Exception gradient(x -> sum(abs2, lm(x)), [1, 2]) + @test_throws Exception gradient(m -> sum(abs2, Flux.striplazy(m)([1, 2])), ld) - @test_throws Exception Flux.params(lm) - @test_throws Exception gradient(x -> sum(abs2, lm(x)), [1,2]) - @test_throws Exception gradient(m -> sum(abs2, Flux.striplazy(m)([1,2])), ld) - - # Can't let |> gpu act before the arrays are materialized... so it's an error: - @test_throws ErrorException @eval @autosize (1,2,3) Dense(_=>2) |> f64 + # Can't let |> gpu act before the arrays are materialized... so it's an error: + @test_throws ErrorException @eval @autosize (1, 2, 3) Dense(_ => 2)|>f64 end diff --git a/test/runtests.jl b/test/runtests.jl index 9027b114fc..4189ea0dd5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,53 +11,44 @@ using CUDA Random.seed!(0) @testset verbose=true "Flux.jl" begin + @testset "Utils" begin include("utils.jl") end - @testset "Utils" begin - include("utils.jl") - end - - @testset "Optimise" begin - include("optimise.jl") - end - - @testset "Data" begin - include("data.jl") - end - - @testset "Losses" begin - include("losses.jl") - include("ctc.jl") - CUDA.functional() && include("ctc-gpu.jl") - end - - @testset "Layers" begin - include("layers/basic.jl") - include("layers/normalisation.jl") - include("layers/stateless.jl") - include("layers/recurrent.jl") - include("layers/conv.jl") - include("layers/upsample.jl") - include("layers/show.jl") - end - - @testset "outputsize" begin - using Flux: outputsize - include("outputsize.jl") - end - - @testset "CUDA" begin - if CUDA.functional() - include("cuda/runtests.jl") - else - @warn "CUDA unavailable, not testing GPU support" + @testset "Optimise" begin include("optimise.jl") end + + @testset "Data" begin include("data.jl") end + + @testset "Losses" begin + include("losses.jl") + include("ctc.jl") + CUDA.functional() && include("ctc-gpu.jl") + end + + @testset "Layers" begin + include("layers/basic.jl") + include("layers/normalisation.jl") + include("layers/stateless.jl") + include("layers/recurrent.jl") + include("layers/conv.jl") + include("layers/upsample.jl") + include("layers/show.jl") + end + + @testset "outputsize" begin + using Flux: outputsize + include("outputsize.jl") end - end - @static if VERSION == v"1.6" - using Documenter - @testset "Docs" begin - DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true) - doctest(Flux) + @testset "CUDA" begin if CUDA.functional() + include("cuda/runtests.jl") + else + @warn "CUDA unavailable, not testing GPU support" + end end + + @static if VERSION == v"1.6" + using Documenter + @testset "Docs" begin + DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive = true) + doctest(Flux) + end end - end end diff --git a/test/utils.jl b/test/utils.jl 
index 20359daf25..3607250883 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,8 +1,8 @@ using Flux using Flux: throttle, nfan, glorot_uniform, glorot_normal, - kaiming_normal, kaiming_uniform, orthogonal, truncated_normal, - sparse_init, identity_init, unstack, batch, unbatch, - unsqueeze, params, loadparams!, loadmodel! + kaiming_normal, kaiming_uniform, orthogonal, truncated_normal, + sparse_init, identity_init, unstack, batch, unbatch, + unsqueeze, params, loadparams!, loadmodel! using MLUtils using StatsBase: var, std using Statistics, LinearAlgebra @@ -10,798 +10,798 @@ using Random using Test @testset "Throttle" begin - @testset "default behaviour" begin - a = [] - f = throttle(()->push!(a, time()), 1, leading=true, trailing=false) - f() - f() - f() - sleep(1.01) - @test length(a) == 1 - end - - @testset "leading behaviour" begin - a = [] - f = throttle(()->push!(a, time()), 1, leading=true, trailing=false) - f() - @test length(a) == 1 - f() - @test length(a) == 1 - sleep(1.01) - f() - @test length(a) == 2 - end - - @testset "trailing behaviour" begin - a = [] - f = throttle(()->push!(a, time()), 1, leading=false, trailing=true) - f() - @test length(a) == 0 - f() - @test length(a) == 0 - sleep(1.01) - @test length(a) == 1 - end - - @testset "arguments" begin - a = [] - f = throttle((x)->push!(a, x), 1, leading=true, trailing=true) - f(1) - @test a == [1] - f(2) - @test a == [1] - f(3) - @test a == [1] - sleep(1.01) - @test a == [1, 3] - end + @testset "default behaviour" begin + a = [] + f = throttle(() -> push!(a, time()), 1, leading = true, trailing = false) + f() + f() + f() + sleep(1.01) + @test length(a) == 1 + end + + @testset "leading behaviour" begin + a = [] + f = throttle(() -> push!(a, time()), 1, leading = true, trailing = false) + f() + @test length(a) == 1 + f() + @test length(a) == 1 + sleep(1.01) + f() + @test length(a) == 2 + end + + @testset "trailing behaviour" begin + a = [] + f = throttle(() -> push!(a, time()), 1, leading = false, trailing = true) + f() + @test length(a) == 0 + f() + @test length(a) == 0 + sleep(1.01) + @test length(a) == 1 + end + + @testset "arguments" begin + a = [] + f = throttle((x) -> push!(a, x), 1, leading = true, trailing = true) + f(1) + @test a == [1] + f(2) + @test a == [1] + f(3) + @test a == [1] + sleep(1.01) + @test a == [1, 3] + end end @testset "Initialization" begin - # Set random seed so that these tests don't fail randomly - Random.seed!(0) - - @testset "Fan in/out" begin - @test nfan() == (1, 1) #For a constant - @test nfan(100) == (1, 100) #For vector - @test nfan(100, 200) == (200, 100) == nfan((100, 200)) #For Dense layer - @test nfan(2, 30, 40) == (2 * 30, 2 * 40) #For 1D Conv layer - @test nfan(2, 3, 40, 50) == (2 * 3 * 40, 2 * 3 * 50) #For 2D Conv layer - @test nfan(2, 3, 4, 50, 60) == (2 * 3 * 4 * 50, 2 * 3 * 4 * 60) #For 3D Conv layer - end - - @testset "Basics: $init" for init in [ - glorot_uniform, glorot_normal, - kaiming_uniform, kaiming_normal, - orthogonal, - sparse_init, - truncated_normal, - identity_init, - Flux.rand32, - Flux.randn32, + # Set random seed so that these tests don't fail randomly + Random.seed!(0) + + @testset "Fan in/out" begin + @test nfan() == (1, 1) #For a constant + @test nfan(100) == (1, 100) #For vector + @test nfan(100, 200) == (200, 100) == nfan((100, 200)) #For Dense layer + @test nfan(2, 30, 40) == (2 * 30, 2 * 40) #For 1D Conv layer + @test nfan(2, 3, 40, 50) == (2 * 3 * 40, 2 * 3 * 50) #For 2D Conv layer + @test nfan(2, 3, 4, 50, 60) == (2 * 3 * 4 * 50, 2 * 3 * 4 * 60) #For 3D 
Conv layer + end + + @testset "Basics: $init" for init in [ + glorot_uniform, glorot_normal, + kaiming_uniform, kaiming_normal, + orthogonal, + sparse_init, + truncated_normal, + identity_init, + Flux.rand32, + Flux.randn32, ] - if init == sparse_init - init = (args...) -> sparse_init(args...; sparsity=0.5) - else - # sparse_init is the only one which accepts only matrices: - @test size(init(3)) == (3,) - @test size(init(3, 4, 5)) == (3, 4, 5) - end - @test size(init(3, 4)) == (3, 4) - # only init(size...) is accepted: - @test_throws MethodError size(init((3, 4, 5))) == (3, 4, 5) - - # rng, and currying: - @test size(init(MersenneTwister(1), 3, 4)) == (3, 4) - closure = init(MersenneTwister(1)) - @test size(closure(3, 4)) == (3, 4) - - # eltype, default Float32 - @test eltype(init(3, 4)) == Float32 - - # @non_differentiable - @test gradient(x -> sum(x .* init(3, 4)), 5.0)[1] isa Number - end - - @testset "glorot: $init" for init ∈ [glorot_uniform, glorot_normal] - # glorot_uniform and glorot_normal should both yield a kernel with - # variance ≈ 2/(fan_in + fan_out) - for dims ∈ [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] - v = init(dims...) - fan_in, fan_out = nfan(dims...) - σ2 = 2 / (fan_in + fan_out) - @test 0.9σ2 < var(v) < 1.1σ2 - end - @test eltype(init(3, 4; gain=1.5)) == Float32 - end - - @testset "kaiming" begin - # kaiming_uniform should yield a kernel in range [-sqrt(6/n_out), sqrt(6/n_out)] - # and kaiming_normal should yield a kernel with stddev ~= sqrt(2/n_out) - for (n_in, n_out) in [(100, 100), (100, 400)] - v = kaiming_uniform(n_in, n_out) - σ2 = sqrt(6/n_out) - @test -1σ2 < minimum(v) < -0.9σ2 - @test 0.9σ2 < maximum(v) < 1σ2 - - v = kaiming_normal(n_in, n_out) - σ2 = sqrt(2/n_out) - @test 0.9σ2 < std(v) < 1.1σ2 - end - @test eltype(kaiming_uniform(3, 4; gain=1.5)) == Float32 - @test eltype(kaiming_normal(3, 4; gain=1.5)) == Float32 - end - - @testset "orthogonal" begin - # A matrix of dim = (m,n) with m > n should produce a QR decomposition. In the other case, the transpose should be taken to compute the QR decomposition. - for (rows,cols) in [(5,3),(3,5)] - v = orthogonal(rows, cols) - rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) - end - for mat in [(3,4,5),(2,2,5)] - v = orthogonal(mat...) - cols = mat[end] - rows = div(prod(mat),cols) - v = reshape(v, (rows,cols)) - rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) - end - @test eltype(orthogonal(3, 4; gain=1.5)) == Float32 - end - - @testset "sparse_init" begin - # sparse_init should yield an error for non 2-d dimensions - # sparse_init should yield no zero elements if sparsity < 0 - # sparse_init should yield all zero elements if sparsity > 1 - # sparse_init should yield exactly ceil(n_in * sparsity) elements in each column for other sparsity values - # sparse_init should yield a kernel in its non-zero elements consistent with the std parameter - - @test_throws ArgumentError sparse_init(100, 100, 100, sparsity=0.1) - v = sparse_init(100, 100, sparsity=-0.1) - @test sum(v .== 0) == 0 - v = sparse_init(100, 100, sparsity=1.1) - @test sum(v .== 0) == length(v) - - for (n_in, n_out, sparsity, σ) in [(100, 100, 0.25, 0.1), (100, 400, 0.75, 0.01)] - expected_zeros = ceil(Integer, n_in * sparsity) - v = sparse_init(n_in, n_out, sparsity=sparsity, std=σ) - @test all([sum(v[:,col] .== 0) == expected_zeros for col in 1:n_out]) - @test 0.9 * σ < std(v[v .!= 0]) < 1.1 * σ + if init == sparse_init + init = (args...) 
-> sparse_init(args...; sparsity = 0.5) + else + # sparse_init is the only one which accepts only matrices: + @test size(init(3)) == (3,) + @test size(init(3, 4, 5)) == (3, 4, 5) + end + @test size(init(3, 4)) == (3, 4) + # only init(size...) is accepted: + @test_throws MethodError size(init((3, 4, 5)))==(3, 4, 5) + + # rng, and currying: + @test size(init(MersenneTwister(1), 3, 4)) == (3, 4) + closure = init(MersenneTwister(1)) + @test size(closure(3, 4)) == (3, 4) + + # eltype, default Float32 + @test eltype(init(3, 4)) == Float32 + + # @non_differentiable + @test gradient(x -> sum(x .* init(3, 4)), 5.0)[1] isa Number + end + + @testset "glorot: $init" for init in [glorot_uniform, glorot_normal] + # glorot_uniform and glorot_normal should both yield a kernel with + # variance ≈ 2/(fan_in + fan_out) + for dims in [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] + v = init(dims...) + fan_in, fan_out = nfan(dims...) + σ2 = 2 / (fan_in + fan_out) + @test 0.9σ2 < var(v) < 1.1σ2 + end + @test eltype(init(3, 4; gain = 1.5)) == Float32 + end + + @testset "kaiming" begin + # kaiming_uniform should yield a kernel in range [-sqrt(6/n_out), sqrt(6/n_out)] + # and kaiming_normal should yield a kernel with stddev ~= sqrt(2/n_out) + for (n_in, n_out) in [(100, 100), (100, 400)] + v = kaiming_uniform(n_in, n_out) + σ2 = sqrt(6 / n_out) + @test -1σ2 < minimum(v) < -0.9σ2 + @test 0.9σ2 < maximum(v) < 1σ2 + + v = kaiming_normal(n_in, n_out) + σ2 = sqrt(2 / n_out) + @test 0.9σ2 < std(v) < 1.1σ2 + end + @test eltype(kaiming_uniform(3, 4; gain = 1.5)) == Float32 + @test eltype(kaiming_normal(3, 4; gain = 1.5)) == Float32 + end + + @testset "orthogonal" begin + # A matrix of dim = (m,n) with m > n should produce a QR decomposition. In the other case, the transpose should be taken to compute the QR decomposition. + for (rows, cols) in [(5, 3), (3, 5)] + v = orthogonal(rows, cols) + rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + end + for mat in [(3, 4, 5), (2, 2, 5)] + v = orthogonal(mat...) + cols = mat[end] + rows = div(prod(mat), cols) + v = reshape(v, (rows, cols)) + rows < cols ? 
(@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + end + @test eltype(orthogonal(3, 4; gain = 1.5)) == Float32 + end + + @testset "sparse_init" begin + # sparse_init should yield an error for non 2-d dimensions + # sparse_init should yield no zero elements if sparsity < 0 + # sparse_init should yield all zero elements if sparsity > 1 + # sparse_init should yield exactly ceil(n_in * sparsity) elements in each column for other sparsity values + # sparse_init should yield a kernel in its non-zero elements consistent with the std parameter + + @test_throws ArgumentError sparse_init(100, 100, 100, sparsity = 0.1) + v = sparse_init(100, 100, sparsity = -0.1) + @test sum(v .== 0) == 0 + v = sparse_init(100, 100, sparsity = 1.1) + @test sum(v .== 0) == length(v) + + for (n_in, n_out, sparsity, σ) in [(100, 100, 0.25, 0.1), (100, 400, 0.75, 0.01)] + expected_zeros = ceil(Integer, n_in * sparsity) + v = sparse_init(n_in, n_out, sparsity = sparsity, std = σ) + @test all([sum(v[:, col] .== 0) == expected_zeros for col in 1:n_out]) + @test 0.9 * σ < std(v[v .!= 0]) < 1.1 * σ + end + + @test eltype(sparse_init(3, 4; std = 1.5, sparsity = 0.5)) == Float32 + end + + @testset "truncated_normal" begin + m = truncated_normal(100, 100) + @test minimum(m)≈-2 atol=0.05 # default arguments + @test maximum(m)≈2 atol=0.05 + @test mean(m)≈0 atol=0.1 + + size100 = (100, 100, 100) + for (μ, σ, lo, hi) in [(0.0, 1, -2, 3), (1, 2, -4.0, 5.0)] + v = truncated_normal(size100...; mean = μ, std = σ, lo, hi) + @test isapprox(mean(v), μ; atol = 1.0f-1) + @test isapprox(minimum(v), lo; atol = 1.0f-2) + @test isapprox(maximum(v), hi; atol = 1.0f-2) + @test eltype(v) == Float32 # despite some Float64 arguments + end + for (μ, σ, lo, hi) in [(6, 2, -100.0, 100), (-7.0, 10, -100, 100)] + v = truncated_normal(size100...; mean = μ, std = σ, lo, hi) + @test isapprox(mean(v), μ; atol = 1.0f-1) + @test isapprox(std(v), σ; atol = 1.0f-1) + end + end + + @testset "Partial application" begin + partial_ku = kaiming_uniform(gain = 1e9) + @test maximum(partial_ku(8, 8)) > 1e9 / 2 + @test maximum(partial_ku(8, 8, gain = 1)) < 1e9 / 2 + + partial_kn = kaiming_normal(gain = 1e9) + @test maximum(partial_kn(8, 8)) > 1e9 / 2 + @test maximum(partial_kn(8, 8, gain = 1)) < 1e9 / 2 + + partial_si = sparse_init(sparsity = 1) + @test maximum(partial_si(8, 8)) == 0 + @test maximum(partial_si(8, 8, sparsity = 0)) > 0 + end + + @testset "identity_init" begin + @testset "Basic" begin + partial = identity_init(gain = 3) + @test partial(3, 3) == identity_init(3, 3; gain = 3) == [3 0 0; 0 3 0; 0 0 3] + @test eltype(identity_init(3, 4; gain = 1.5)) == Float32 # despite Float64 keyword + end + @testset "Non-identity sizes" begin + @test identity_init(2, 3)[:, end] == zeros(Float32, 2) + @test identity_init(3, 2; shift = 1)[1, :] == zeros(Float32, 2) + @test identity_init(1, 1, 3, 4)[:, :, :, end] == zeros(Float32, 1, 1, 3) + @test identity_init(2, 1, 3, 3)[end, :, :, :] == zeros(Float32, 1, 3, 3) + @test identity_init(1, 2, 3, 3)[:, end, :, :] == zeros(Float32, 1, 3, 3) + end + @testset "Dense ID mapping" begin + l = Dense(3, 3, init = identity_init) + + indata = reshape(collect(Float32, 1:9), 3, 3) + @test l(indata) == indata + end + @testset "$layer ID mapping with kernelsize $kernelsize" for layer in (Conv, + ConvTranspose, + CrossCor), + kernelsize in ((1,), + (3,), + (1, 3), + (3, 5), + (3, 5, + 7)) + + nch = 3 + l = layer(kernelsize, nch => nch, init = identity_init, pad = SamePad()) + + indata = randn(Float32, kernelsize..., nch, nch) + @test 
l(indata) == indata + end + @testset "Inception identity" begin + insize = 7 + path1 = Conv((1, 3), insize => 2; init = identity_init, pad = SamePad()) + path2 = Conv((3, 5), insize => 3; init = identity_init(shift = (0, 0, 2, 0)), + pad = SamePad()) + path3 = Conv((5, 7), insize => 2; init = identity_init(shift = (0, 0, 5, 0)), + pad = SamePad()) + block = Parallel((xs...) -> cat(xs...; dims = 3), path1, path2, path3) + + indata = randn(Float32, 9, 9, 7, 2) + @test block(indata) == indata + end end - - @test eltype(sparse_init(3, 4; std=1.5, sparsity=0.5)) == Float32 - end - - @testset "truncated_normal" begin - m = truncated_normal(100, 100) - @test minimum(m) ≈ -2 atol = 0.05 # default arguments - @test maximum(m) ≈ 2 atol = 0.05 - @test mean(m) ≈ 0 atol = 0.1 - - size100 = (100, 100, 100) - for (μ, σ, lo, hi) in [(0.0, 1, -2, 3), (1, 2, -4.0, 5.0)] - v = truncated_normal(size100...; mean = μ, std = σ, lo, hi) - @test isapprox(mean(v), μ; atol = 1f-1) - @test isapprox(minimum(v), lo; atol = 1f-2) - @test isapprox(maximum(v), hi; atol = 1f-2) - @test eltype(v) == Float32 # despite some Float64 arguments - end - for (μ, σ, lo, hi) in [(6, 2, -100.0, 100), (-7.0, 10, -100, 100)] - v = truncated_normal(size100...; mean = μ, std = σ, lo, hi) - @test isapprox(mean(v), μ; atol = 1f-1) - @test isapprox(std(v), σ; atol = 1f-1) - end - end - - @testset "Partial application" begin - partial_ku = kaiming_uniform(gain=1e9) - @test maximum(partial_ku(8, 8)) > 1e9 / 2 - @test maximum(partial_ku(8, 8, gain=1)) < 1e9 / 2 - - partial_kn = kaiming_normal(gain=1e9) - @test maximum(partial_kn(8, 8)) > 1e9 / 2 - @test maximum(partial_kn(8, 8, gain=1)) < 1e9 / 2 - - partial_si = sparse_init(sparsity=1) - @test maximum(partial_si(8, 8)) == 0 - @test maximum(partial_si(8, 8, sparsity=0)) > 0 - end - - @testset "identity_init" begin - @testset "Basic" begin - partial = identity_init(gain=3) - @test partial(3, 3) == identity_init(3, 3; gain=3) == [3 0 0; 0 3 0; 0 0 3] - @test eltype(identity_init(3, 4; gain=1.5)) == Float32 # despite Float64 keyword - end - @testset "Non-identity sizes" begin - @test identity_init(2, 3)[:, end] == zeros(Float32, 2) - @test identity_init(3, 2; shift=1)[1, :] == zeros(Float32, 2) - @test identity_init(1, 1, 3, 4)[:, :, :, end] == zeros(Float32, 1, 1, 3) - @test identity_init(2, 1, 3, 3)[end, :, :, :] == zeros(Float32, 1, 3, 3) - @test identity_init(1, 2, 3, 3)[:, end, :, :] == zeros(Float32, 1, 3, 3) - end - @testset "Dense ID mapping" begin - l = Dense(3,3, init = identity_init) - - indata = reshape(collect(Float32, 1:9), 3, 3) - @test l(indata) == indata - end - @testset "$layer ID mapping with kernelsize $kernelsize" for layer in (Conv, ConvTranspose, CrossCor), kernelsize in ( - (1,), - (3,), - (1, 3), - (3, 5), - (3, 5, 7)) - nch = 3 - l = layer(kernelsize, nch=>nch, init=identity_init, pad=SamePad()) - - indata = randn(Float32, kernelsize..., nch, nch) - @test l(indata) == indata - end - @testset "Inception identity" begin - insize = 7 - path1 = Conv((1, 3), insize=>2; init=identity_init, pad=SamePad()) - path2 = Conv((3, 5), insize=>3; init=identity_init(shift=(0, 0, 2, 0)), pad=SamePad()) - path3 = Conv((5, 7), insize=>2; init=identity_init(shift=(0, 0, 5, 0)), pad=SamePad()) - block = Parallel((xs...) 
-> cat(xs...;dims=3), path1, path2, path3) - - indata = randn(Float32, 9, 9, 7, 2) - @test block(indata) == indata - end - end end @testset "Params" begin - m = Dense(10, 5) - @test size.(params(m)) == [(5, 10), (5,)] - m = RNN(10, 5) - @test size.(params(m)) == [(5, 10), (5, 5), (5,), (5, 1)] - - # Layer duplicated in same chain, params just once pls. - c = Chain(m, m) - @test size.(params(c)) == [(5, 10), (5, 5), (5,), (5, 1)] - - # Self-referential array. Just want params, no stack overflow pls. - r = Any[nothing,m] - r[1] = r - @test size.(params(r)) == [(5, 10), (5, 5), (5,), (5, 1)] - - # Ensure functor explores inside Transpose but not SubArray - m = (x = view([1,2,3]pi, 1:2), y = transpose([4 5]pi)) - @test size.(Flux.params(m)) == [(2,), (1, 2)] + m = Dense(10, 5) + @test size.(params(m)) == [(5, 10), (5,)] + m = RNN(10, 5) + @test size.(params(m)) == [(5, 10), (5, 5), (5,), (5, 1)] + + # Layer duplicated in same chain, params just once pls. + c = Chain(m, m) + @test size.(params(c)) == [(5, 10), (5, 5), (5,), (5, 1)] + + # Self-referential array. Just want params, no stack overflow pls. + r = Any[nothing, m] + r[1] = r + @test size.(params(r)) == [(5, 10), (5, 5), (5,), (5, 1)] + + # Ensure functor explores inside Transpose but not SubArray + m = (x = view([1, 2, 3]pi, 1:2), y = transpose([4 5]pi)) + @test size.(Flux.params(m)) == [(2,), (1, 2)] end @testset "Precision" begin - m = Chain(Dense(10, 5, relu), Dense(5, 2)) - x64 = rand(Float64, 10) - x32 = rand(Float32, 10) - @test eltype(m[1].weight) == Float32 - @test eltype(m(x32)) == Float32 - @test eltype(m(x64)) == Float64 - @test eltype(f64(m)(x32)) == Float64 - @test eltype(f64(m)(x64)) == Float64 - @test eltype(f64(m)[1].weight) == Float64 - @test eltype(f32(f64(m))[1].weight) == Float32 + m = Chain(Dense(10, 5, relu), Dense(5, 2)) + x64 = rand(Float64, 10) + x32 = rand(Float32, 10) + @test eltype(m[1].weight) == Float32 + @test eltype(m(x32)) == Float32 + @test eltype(m(x64)) == Float64 + @test eltype(f64(m)(x32)) == Float64 + @test eltype(f64(m)(x64)) == Float64 + @test eltype(f64(m)[1].weight) == Float64 + @test eltype(f32(f64(m))[1].weight) == Float32 end @testset "zero bias" begin - m = Dense(3 => 2; bias=false) - @test f64(m).bias === m.bias === false - @test f32(m).bias === m.bias === false + m = Dense(3 => 2; bias = false) + @test f64(m).bias === m.bias === false + @test f32(m).bias === m.bias === false - @testset "Gradients for broadcasted $op with sizes $s" for op in (+,-,*), s in ((1,), (2,3)) - o = ones(s) - z = zeros(s) + @testset "Gradients for broadcasted $op with sizes $s" for op in (+, -, *), + s in ((1,), (2, 3)) - @testset "Explicit" begin - gfun(args...) = gradient((x, y) -> sum(op.(x,y)), args...) - g = gfun(o, z) - @test gfun(o, false) == (g[1], nothing) + o = ones(s) + z = zeros(s) - g = gfun(z, o) - @test gfun(false, o) == (nothing, g[2]) - end + @testset "Explicit" begin + gfun(args...) = gradient((x, y) -> sum(op.(x, y)), args...) + g = gfun(o, z) + @test gfun(o, false) == (g[1], nothing) + + g = gfun(z, o) + @test gfun(false, o) == (nothing, g[2]) + end - @testset "Implicit" begin - gfun(args...) = gradient(() -> sum(op.(args...)), params(collect(args))) - g = gfun(o, z) + @testset "Implicit" begin + gfun(args...) 
= gradient(() -> sum(op.(args...)), params(collect(args))) + g = gfun(o, z) - gres = gfun(o, false) - @test gres[o] == g[o] - @test false ∉ gres.params + gres = gfun(o, false) + @test gres[o] == g[o] + @test false ∉ gres.params - g = gfun(z, o) - gres = gfun(false, o) - @test gres[o] == g[o] - @test false ∉ gres.params + g = gfun(z, o) + gres = gfun(false, o) + @test gres[o] == g[o] + @test false ∉ gres.params + end end - end end @testset "unsqueeze" begin - x = randn(2, 3, 2) - @test @inferred(unsqueeze(x, dims=1)) == reshape(x, 1, 2, 3, 2) - @test @inferred(unsqueeze(x, dims=2)) == reshape(x, 2, 1, 3, 2) - @test @inferred(unsqueeze(x, dims=3)) == reshape(x, 2, 3, 1, 2) - @test @inferred(unsqueeze(x, dims=4)) == reshape(x, 2, 3, 2, 1) + x = randn(2, 3, 2) + @test @inferred(unsqueeze(x, dims = 1)) == reshape(x, 1, 2, 3, 2) + @test @inferred(unsqueeze(x, dims = 2)) == reshape(x, 2, 1, 3, 2) + @test @inferred(unsqueeze(x, dims = 3)) == reshape(x, 2, 3, 1, 2) + @test @inferred(unsqueeze(x, dims = 4)) == reshape(x, 2, 3, 2, 1) end @testset "Stacking" begin - x = randn(3,3) - stacked = MLUtils.stack([x, x], dims=2) - @test size(stacked) == (3,2,3) - - stacked_array=[ 8 9 3 5; 9 6 6 9; 9 1 7 2; 7 4 10 6 ] - unstacked_array=[[8, 9, 9, 7], [9, 6, 1, 4], [3, 6, 7, 10], [5, 9, 2, 6]] - @test unstack(stacked_array, dims=2) == unstacked_array - @test MLUtils.stack(unstacked_array, dims=2) == stacked_array - @test MLUtils.stack(unstack(stacked_array, dims=1), dims=1) == stacked_array + x = randn(3, 3) + stacked = MLUtils.stack([x, x], dims = 2) + @test size(stacked) == (3, 2, 3) + + stacked_array = [8 9 3 5; 9 6 6 9; 9 1 7 2; 7 4 10 6] + unstacked_array = [[8, 9, 9, 7], [9, 6, 1, 4], [3, 6, 7, 10], [5, 9, 2, 6]] + @test unstack(stacked_array, dims = 2) == unstacked_array + @test MLUtils.stack(unstacked_array, dims = 2) == stacked_array + @test MLUtils.stack(unstack(stacked_array, dims = 1), dims = 1) == stacked_array end @testset "Batching" begin - stacked_array=[ 8 9 3 5 - 9 6 6 9 - 9 1 7 2 - 7 4 10 6 ] - unstacked_array=[[8, 9, 9, 7], [9, 6, 1, 4], [3, 6, 7, 10], [5, 9, 2, 6]] - @test unbatch(stacked_array) == unstacked_array - @test batch(unstacked_array) == stacked_array - - # no-op for vector of non-arrays - @test batch([1,2,3]) == [1,2,3] - @test unbatch([1,2,3]) == [1,2,3] - - # generic iterable - @test batch(ones(2) for i=1:3) == ones(2, 3) - @test unbatch(ones(2, 3)) == [ones(2) for i=1:3] + stacked_array = [8 9 3 5 + 9 6 6 9 + 9 1 7 2 + 7 4 10 6] + unstacked_array = [[8, 9, 9, 7], [9, 6, 1, 4], [3, 6, 7, 10], [5, 9, 2, 6]] + @test unbatch(stacked_array) == unstacked_array + @test batch(unstacked_array) == stacked_array + + # no-op for vector of non-arrays + @test batch([1, 2, 3]) == [1, 2, 3] + @test unbatch([1, 2, 3]) == [1, 2, 3] + + # generic iterable + @test batch(ones(2) for i in 1:3) == ones(2, 3) + @test unbatch(ones(2, 3)) == [ones(2) for i in 1:3] end @testset "Param remapping" begin - ls(dims...) = reshape(collect(Float32, 1:prod(dims)), dims...) # accepts dims in reverse order to Dense - dl(nin, nout, bias) = Dense(ls(nout, nin), bias(nout)) - dm(bias) = Chain( - dl(3, 5, bias), - dl(5, 4, bias), - dl(4, 3, bias) - ) - - nobias(n) = false - testdense(m, bt) = @testset "Check layer $i" for (i, (l1, l2)) in enumerate(zip(m, dm(bt))) - @test l1.weight == l2.weight - @test l1.bias == l2.bias - @test_skip typeof(l1.bias) === typeof(l2.bias) - end - - @testset "loadparams!" 
begin - pars(w, b) = [w, b] - pars(l) = pars(l.weight, l.bias) - pararray(m) = mapreduce(pars, vcat, m) - weights(m) = mapreduce(l -> [l.weight], vcat, m) - @testset "Bias type $bt" for bt in (Flux.zeros32, nobias) - m = dm(bt) - Flux.loadparams!(m, params(m)) - testdense(m, bt) - end - end - - @testset "loadmodel!(dst, src)" begin - m1 = Chain(Dense(10, 5), Dense(5, 2, relu)) - m2 = Chain(Dense(10, 5), Dense(5, 2)) - m3 = Chain(Conv((3, 3), 3 => 16), Dense(5, 2)) - m4 = Chain(Dense(10, 6), Dense(6, 2)) - m5 = Chain(Dense(10, 5), Parallel(+, Dense(Flux.ones32(2, 5), false), Dense(5, 2))) - m6 = Chain(Dense(10, 5), Parallel(+, Dense(5, 2), Dense(5, 2))) - - loadmodel!(m1, m2) - # trainable parameters copy over - @test m1[1].weight == m2[1].weight - @test m1[1].bias == m2[1].bias - # non-array leaves are untouched - @test m1[2].σ == relu - - loadmodel!(m5, m6) - # more complex nested structures also work - @test m5[1].weight == m6[1].weight - @test m5[2][1].weight == m6[2][1].weight - # false bias is not overwritten - @test m5[2][1].bias == false - - # mismatched nodes throw an error - @test_throws ArgumentError loadmodel!(m1, m3) - @test_throws ArgumentError loadmodel!(m1, m5) - # size mismatches throw an error - @test_throws DimensionMismatch loadmodel!(m1, m4) - - # tests for BatchNorm and Dropout - m1 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), Flux.flatten, Dropout(0.2)) - m2 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), x -> reshape(x, :, size(x)[end]), Dropout(0.1)) - m2[2].μ .= rand(Float32, size(m2[2].μ)...) - loadmodel!(m1, m2) - # non-trainable parameters are copied as well - @test m1[2].μ == m2[2].μ - # functions are not copied - @test m1[3] == Flux.flatten - # dropout rate is not copied - @test m1[4].p == 0.2 - - # from LegolasFlux (https://github.com/beacon-biosignals/LegolasFlux.jl/blob/80569ab63a8248a8a063c76e0bbf701f4ada9bd4/examples/digits.jl#L33) - # tests Chain(...) vs Chain([...]) - # tests MaxPool - # tests testmode!/trainmode! 
is not copied - # tests Dense, Conv, BatchNorm, Dropout (like above) but in a bigger model - chain1 = Chain(Dropout(0.2), - Conv((3, 3), 1 => 32, relu), - BatchNorm(32, relu), - MaxPool((2, 2)), - Dropout(0.2), - Conv((3, 3), 32 => 16, relu), - Dropout(0.2), - MaxPool((2, 2)), - Dropout(0.2), - Conv((3, 3), 16 => 10, relu), - Dropout(0.2), - x -> reshape(x, :, size(x, 4)), - Dropout(0.2), - Dense(90, 10), - softmax) - chain2 = Chain([Dropout(0.1), - Conv((3, 3), 1 => 32, relu), - BatchNorm(32, relu), - MaxPool((3, 3)), - Dropout(0.1), - Conv((3, 3), 32 => 16, relu), - Dropout(0.1), - MaxPool((3, 3)), - Dropout(0.1), - Conv((3, 3), 16 => 10, relu), - Dropout(0.1), - x -> reshape(x, :, size(x, 4)), - Dropout(0.1), - Dense(90, 10), - softmax]) - chain2[3].μ .= 5f0 - chain2[3].σ² .= 2f0 - testmode!(chain2) - loadmodel!(chain1, chain2) - for (dst, src) in zip(chain1, chain2) - if dst isa Dropout - @test dst.p == 0.2 - elseif dst isa Union{Conv, Dense} - @test dst.weight == src.weight - @test dst.bias == src.bias - elseif dst isa MaxPool - @test dst.k == (2, 2) - elseif dst isa BatchNorm - @test dst.μ == src.μ - @test dst.σ² == src.σ² - @test isnothing(dst.active) - end - end - - # copy only a subset of the model - chain1[end - 1].weight .= 1f0 - chain1[3].μ .= 3f0 - chain1[2].bias .= 5f0 - loadmodel!(chain2[end - 1], chain1[end - 1]) - loadmodel!(chain2[3], chain1[3]) - @test chain2[end - 1].weight == chain1[end - 1].weight - @test chain2[3].μ == chain1[3].μ - @test chain2[2].bias != chain1[2].bias - - # test shared weights - shared_dst = Dense(10 => 10) - shared_src = Dense(10 => 10) - # matched weights are okay - m1 = Chain(shared_dst, Dense(shared_dst.weight)) - m2 = Chain(shared_src, Dense(shared_src.weight)) - loadmodel!(m1, m2) - @test m1[1].weight === m1[2].weight - @test m1[1].weight == m2[2].weight - # mismatched weights are an error - m2 = Chain(Dense(10 => 10), Dense(10 => 10)) - @test_throws ErrorException loadmodel!(m1, m2) - # loading into tied weights with absent parameter is okay when the dst == zero - b = Flux.zeros32(5) - m1 = Chain(Dense(10 => 5; bias = b), Dense(5 => 5; bias = b)) - m2 = Chain(Dense(10 => 5; bias = Flux.zeros32(5)), Dense(5 => 5; bias = false)) - loadmodel!(m1, m2) - @test m1[1].bias === m1[2].bias - @test iszero(m1[1].bias) - # loading into tied weights with absent parameter is bad when the dst != zero - m2[1].bias .= 1 - @test_throws ErrorException loadmodel!(m1, m2) - - @testset "loadmodel! & filter" begin - m1 = Chain(Dense(10, 5), Dense(5, 2, relu)) - m2 = Chain(Dense(10, 5), Dropout(0.2), Dense(5, 2)) - m3 = Chain(Dense(10, 5), Dense(5, 2, relu)) - - # this will not error cause Dropout is skipped - loadmodel!(m1, m2; filter = x -> !(x isa Dropout)) - @test m1[1].weight == m2[1].weight - @test m1[2].weight == m2[3].weight - - # this will not error cause Dropout is skipped - loadmodel!(m2, m3; filter = x -> !(x isa Dropout)) - @test m3[1].weight == m2[1].weight - @test m3[2].weight == m2[3].weight - end - - @testset "loadmodel! 
& absent bias" begin - m0 = Chain(Dense(2 => 3; bias=false, init = Flux.ones32), Dense(3 => 1)) - m1 = Chain(Dense(2 => 3; bias = Flux.randn32(3)), Dense(3 => 1)) - m2 = Chain(Dense(Float32[1 2; 3 4; 5 6], Float32[7, 8, 9]), Dense(3 => 1)) - - Flux.loadmodel!(m1, m2) - @test m1[1].bias == 7:9 - @test sum(m1[1].weight) == 21 - - # load from a model without bias -- should ideally recognise the `false` but `Params` doesn't store it - m1 = Flux.loadmodel!(m1, m0) - @test iszero(m1[1].bias) - @test sum(m1[1].weight) == 6 # written before error - - # load into a model without bias -- should it ignore the parameter which has no home, or error? - m0 = Flux.loadmodel!(m0, m2) - @test iszero(m0[1].bias) # obviously unchanged - @test sum(m0[1].weight) == 21 - end - end - - @testset "destructure" begin - import Flux: destructure - @testset "Bias type $bt" for bt in (zeros, nobias) - m = dm(bt) - p, re = destructure(m) - testdense(re(p), bt) + ls(dims...) = reshape(collect(Float32, 1:prod(dims)), dims...) # accepts dims in reverse order to Dense + dl(nin, nout, bias) = Dense(ls(nout, nin), bias(nout)) + dm(bias) = Chain(dl(3, 5, bias), + dl(5, 4, bias), + dl(4, 3, bias)) + + nobias(n) = false + testdense(m, bt) = @testset "Check layer $i" for (i, (l1, l2)) in enumerate(zip(m, + dm(bt))) + @test l1.weight == l2.weight + @test l1.bias == l2.bias + @test_skip typeof(l1.bias) === typeof(l2.bias) + end + + @testset "loadparams!" begin + pars(w, b) = [w, b] + pars(l) = pars(l.weight, l.bias) + pararray(m) = mapreduce(pars, vcat, m) + weights(m) = mapreduce(l -> [l.weight], vcat, m) + @testset "Bias type $bt" for bt in (Flux.zeros32, nobias) + m = dm(bt) + Flux.loadparams!(m, params(m)) + testdense(m, bt) + end + end + + @testset "loadmodel!(dst, src)" begin + m1 = Chain(Dense(10, 5), Dense(5, 2, relu)) + m2 = Chain(Dense(10, 5), Dense(5, 2)) + m3 = Chain(Conv((3, 3), 3 => 16), Dense(5, 2)) + m4 = Chain(Dense(10, 6), Dense(6, 2)) + m5 = Chain(Dense(10, 5), Parallel(+, Dense(Flux.ones32(2, 5), false), Dense(5, 2))) + m6 = Chain(Dense(10, 5), Parallel(+, Dense(5, 2), Dense(5, 2))) + + loadmodel!(m1, m2) + # trainable parameters copy over + @test m1[1].weight == m2[1].weight + @test m1[1].bias == m2[1].bias + # non-array leaves are untouched + @test m1[2].σ == relu + + loadmodel!(m5, m6) + # more complex nested structures also work + @test m5[1].weight == m6[1].weight + @test m5[2][1].weight == m6[2][1].weight + # false bias is not overwritten + @test m5[2][1].bias == false + + # mismatched nodes throw an error + @test_throws ArgumentError loadmodel!(m1, m3) + @test_throws ArgumentError loadmodel!(m1, m5) + # size mismatches throw an error + @test_throws DimensionMismatch loadmodel!(m1, m4) + + # tests for BatchNorm and Dropout + m1 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), Flux.flatten, Dropout(0.2)) + m2 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), x -> reshape(x, :, size(x)[end]), + Dropout(0.1)) + m2[2].μ .= rand(Float32, size(m2[2].μ)...) + loadmodel!(m1, m2) + # non-trainable parameters are copied as well + @test m1[2].μ == m2[2].μ + # functions are not copied + @test m1[3] == Flux.flatten + # dropout rate is not copied + @test m1[4].p == 0.2 + + # from LegolasFlux (https://github.com/beacon-biosignals/LegolasFlux.jl/blob/80569ab63a8248a8a063c76e0bbf701f4ada9bd4/examples/digits.jl#L33) + # tests Chain(...) vs Chain([...]) + # tests MaxPool + # tests testmode!/trainmode! 
is not copied + # tests Dense, Conv, BatchNorm, Dropout (like above) but in a bigger model + chain1 = Chain(Dropout(0.2), + Conv((3, 3), 1 => 32, relu), + BatchNorm(32, relu), + MaxPool((2, 2)), + Dropout(0.2), + Conv((3, 3), 32 => 16, relu), + Dropout(0.2), + MaxPool((2, 2)), + Dropout(0.2), + Conv((3, 3), 16 => 10, relu), + Dropout(0.2), + x -> reshape(x, :, size(x, 4)), + Dropout(0.2), + Dense(90, 10), + softmax) + chain2 = Chain([Dropout(0.1), + Conv((3, 3), 1 => 32, relu), + BatchNorm(32, relu), + MaxPool((3, 3)), + Dropout(0.1), + Conv((3, 3), 32 => 16, relu), + Dropout(0.1), + MaxPool((3, 3)), + Dropout(0.1), + Conv((3, 3), 16 => 10, relu), + Dropout(0.1), + x -> reshape(x, :, size(x, 4)), + Dropout(0.1), + Dense(90, 10), + softmax]) + chain2[3].μ .= 5.0f0 + chain2[3].σ² .= 2.0f0 + testmode!(chain2) + loadmodel!(chain1, chain2) + for (dst, src) in zip(chain1, chain2) + if dst isa Dropout + @test dst.p == 0.2 + elseif dst isa Union{Conv, Dense} + @test dst.weight == src.weight + @test dst.bias == src.bias + elseif dst isa MaxPool + @test dst.k == (2, 2) + elseif dst isa BatchNorm + @test dst.μ == src.μ + @test dst.σ² == src.σ² + @test isnothing(dst.active) + end + end + + # copy only a subset of the model + chain1[end - 1].weight .= 1.0f0 + chain1[3].μ .= 3.0f0 + chain1[2].bias .= 5.0f0 + loadmodel!(chain2[end - 1], chain1[end - 1]) + loadmodel!(chain2[3], chain1[3]) + @test chain2[end - 1].weight == chain1[end - 1].weight + @test chain2[3].μ == chain1[3].μ + @test chain2[2].bias != chain1[2].bias + + # test shared weights + shared_dst = Dense(10 => 10) + shared_src = Dense(10 => 10) + # matched weights are okay + m1 = Chain(shared_dst, Dense(shared_dst.weight)) + m2 = Chain(shared_src, Dense(shared_src.weight)) + loadmodel!(m1, m2) + @test m1[1].weight === m1[2].weight + @test m1[1].weight == m2[2].weight + # mismatched weights are an error + m2 = Chain(Dense(10 => 10), Dense(10 => 10)) + @test_throws ErrorException loadmodel!(m1, m2) + # loading into tied weights with absent parameter is okay when the dst == zero + b = Flux.zeros32(5) + m1 = Chain(Dense(10 => 5; bias = b), Dense(5 => 5; bias = b)) + m2 = Chain(Dense(10 => 5; bias = Flux.zeros32(5)), Dense(5 => 5; bias = false)) + loadmodel!(m1, m2) + @test m1[1].bias === m1[2].bias + @test iszero(m1[1].bias) + # loading into tied weights with absent parameter is bad when the dst != zero + m2[1].bias .= 1 + @test_throws ErrorException loadmodel!(m1, m2) + + @testset "loadmodel! & filter" begin + m1 = Chain(Dense(10, 5), Dense(5, 2, relu)) + m2 = Chain(Dense(10, 5), Dropout(0.2), Dense(5, 2)) + m3 = Chain(Dense(10, 5), Dense(5, 2, relu)) + + # this will not error cause Dropout is skipped + loadmodel!(m1, m2; filter = x -> !(x isa Dropout)) + @test m1[1].weight == m2[1].weight + @test m1[2].weight == m2[3].weight + + # this will not error cause Dropout is skipped + loadmodel!(m2, m3; filter = x -> !(x isa Dropout)) + @test m3[1].weight == m2[1].weight + @test m3[2].weight == m2[3].weight + end + + @testset "loadmodel! 
& absent bias" begin + m0 = Chain(Dense(2 => 3; bias = false, init = Flux.ones32), Dense(3 => 1)) + m1 = Chain(Dense(2 => 3; bias = Flux.randn32(3)), Dense(3 => 1)) + m2 = Chain(Dense(Float32[1 2; 3 4; 5 6], Float32[7, 8, 9]), Dense(3 => 1)) + + Flux.loadmodel!(m1, m2) + @test m1[1].bias == 7:9 + @test sum(m1[1].weight) == 21 + + # load from a model without bias -- should ideally recognise the `false` but `Params` doesn't store it + m1 = Flux.loadmodel!(m1, m0) + @test iszero(m1[1].bias) + @test sum(m1[1].weight) == 6 # written before error + + # load into a model without bias -- should it ignore the parameter which has no home, or error? + m0 = Flux.loadmodel!(m0, m2) + @test iszero(m0[1].bias) # obviously unchanged + @test sum(m0[1].weight) == 21 + end + end + + @testset "destructure" begin + import Flux: destructure + @testset "Bias type $bt" for bt in (zeros, nobias) + m = dm(bt) + p, re = destructure(m) + testdense(re(p), bt) + end + + @testset "restructure in gradient" begin + x = rand(Float32, 3, 1) + m = dm(zeros) + ∇m = gradient(m -> sum(m(x)), m)[1] + p, re = destructure(m) + ∇p = gradient(θ -> sum(re(θ)(x)), p)[1] + @test ∇p ≈ destructure(∇m)[1] + end end - - @testset "restructure in gradient" begin - x = rand(Float32, 3, 1) - m = dm(zeros) - ∇m = gradient(m -> sum(m(x)), m)[1] - p, re = destructure(m) - ∇p = gradient(θ -> sum(re(θ)(x)), p)[1] - @test ∇p ≈ destructure(∇m)[1] - end - end end @testset "Train and test mode" begin - mutable struct DummyLayer - testing::Bool - end - Flux.testmode!(m::DummyLayer, testing=true) = (m.testing = testing; m) - - c = Chain(DummyLayer(true)) - testmode!(c) - @test c[1].testing - trainmode!(c) - @test !c[1].testing + mutable struct DummyLayer + testing::Bool + end + Flux.testmode!(m::DummyLayer, testing = true) = (m.testing = testing; m) + + c = Chain(DummyLayer(true)) + testmode!(c) + @test c[1].testing + trainmode!(c) + @test !c[1].testing end @testset "modules" begin - m1 = Conv((2,3), 4=>5; pad=6, stride=7) - m2 = LayerNorm(8) - m3 = m2.diag - m4 = SkipConnection(m1, +) - m5 = Chain(m4, m2) - modules = Flux.modules(m5) - # Depth-first descent - @test length(modules) == 6 - @test modules[1] === m5 - @test modules[3] === m4 - @test modules[4] === m1 - @test modules[5] === m2 - @test modules[6] === m3 - - mod_par = Flux.modules(Parallel(Flux.Bilinear(2,2,2,cbrt), Dense(2,2,abs), Dense(2,2,abs2))) - @test length(mod_par) == 5 - - mod_rnn = Flux.modules(Chain(Dense(2,3), BatchNorm(3), LSTM(3,4))) - @test length(mod_rnn) == 6 - @test mod_rnn[end] isa Flux.LSTMCell - - mod_skip = Flux.modules(Chain(SkipConnection( - Conv((2,3), 4=>5; pad=6, stride=7), - +), - LayerNorm(8))) - @test length(mod_skip) == 6 - @test mod_skip[end] isa Flux.Scale + m1 = Conv((2, 3), 4 => 5; pad = 6, stride = 7) + m2 = LayerNorm(8) + m3 = m2.diag + m4 = SkipConnection(m1, +) + m5 = Chain(m4, m2) + modules = Flux.modules(m5) + # Depth-first descent + @test length(modules) == 6 + @test modules[1] === m5 + @test modules[3] === m4 + @test modules[4] === m1 + @test modules[5] === m2 + @test modules[6] === m3 + + mod_par = Flux.modules(Parallel(Flux.Bilinear(2, 2, 2, cbrt), Dense(2, 2, abs), + Dense(2, 2, abs2))) + @test length(mod_par) == 5 + + mod_rnn = Flux.modules(Chain(Dense(2, 3), BatchNorm(3), LSTM(3, 4))) + @test length(mod_rnn) == 6 + @test mod_rnn[end] isa Flux.LSTMCell + + mod_skip = Flux.modules(Chain(SkipConnection(Conv((2, 3), 4 => 5; pad = 6, stride = 7), + +), + LayerNorm(8))) + @test length(mod_skip) == 6 + @test mod_skip[end] isa Flux.Scale end @testset 
"Patience triggers" begin - @testset "patience" begin - trigger = Flux.patience(() -> true, 3) + @testset "patience" begin + trigger = Flux.patience(() -> true, 3) - @test trigger() == false - @test trigger() == false - @test trigger() == true + @test trigger() == false + @test trigger() == false + @test trigger() == true - v = [false, true, false, true, true, true] - trigger = let v = v - Flux.patience(i -> v[i], 3) - end + v = [false, true, false, true, true, true] + trigger = let v = v + Flux.patience(i -> v[i], 3) + end - n_iter = 0 - for i in 1:length(v) - trigger(i) && break - n_iter += 1 - end + n_iter = 0 + for i in 1:length(v) + trigger(i) && break + n_iter += 1 + end - @test n_iter == 5 - end + @test n_iter == 5 + end - @testset "early stopping" begin - @testset "args & kwargs" begin - es = Flux.early_stopping((x; y = 1) -> x + y, 10; min_dist=3) + @testset "early stopping" begin + @testset "args & kwargs" begin + es = Flux.early_stopping((x; y = 1) -> x + y, 10; min_dist = 3) - n_iter = 0 - while n_iter < 99 - es(-n_iter; y=-n_iter) && break - n_iter += 1 - end + n_iter = 0 + while n_iter < 99 + es(-n_iter; y = -n_iter) && break + n_iter += 1 + end - @test n_iter == 9 - end + @test n_iter == 9 + end - @testset "distance" begin - es = Flux.early_stopping(identity, 10; distance=(best_score, score) -> score - best_score) + @testset "distance" begin + es = Flux.early_stopping(identity, 10; + distance = (best_score, score) -> score - best_score) - n_iter = 0 - while n_iter < 99 - es(n_iter) && break - n_iter += 1 - end + n_iter = 0 + while n_iter < 99 + es(n_iter) && break + n_iter += 1 + end - @test n_iter == 99 - end + @test n_iter == 99 + end - @testset "init_score" begin - es = Flux.early_stopping(identity, 10; init_score=10) + @testset "init_score" begin + es = Flux.early_stopping(identity, 10; init_score = 10) - n_iter = 0 - while n_iter < 99 - es(n_iter) && break - n_iter += 1 - end + n_iter = 0 + while n_iter < 99 + es(n_iter) && break + n_iter += 1 + end - @test n_iter == 10 + @test n_iter == 10 + end end - end - @testset "plateau" begin - f = let v = 10 - () -> v = v / abs(v) - v - end + @testset "plateau" begin + f = let v = 10 + () -> v = v / abs(v) - v + end - trigger = Flux.plateau(f, 3, init_score=10, min_dist=18) + trigger = Flux.plateau(f, 3, init_score = 10, min_dist = 18) - n_iter = 0 - while n_iter < 99 - trigger() && break - n_iter += 1 - end + n_iter = 0 + while n_iter < 99 + trigger() && break + n_iter += 1 + end - @test n_iter == 3 - end + @test n_iter == 3 + end end @testset "Shared parameters" begin - mat = [1 2; 3 4.0] - simple = ((nothing, mat, (3, mat, 4))) - @test length(Flux.params(simple)) == 1 - - oneadj = (nt = (m = mat, a = mat')) - @test length(Flux.params(oneadj)) == 1 # needs Functors@0.3 - - @test Flux.destructure(simple)[1] == Flux.destructure(oneadj)[1] == [1, 3, 2, 4] -end - -@testset "Various destructure bugs" begin + mat = [1 2; 3 4.0] + simple = ((nothing, mat, (3, mat, 4))) + @test length(Flux.params(simple)) == 1 - @testset "issue 1601" begin - struct TwoDenses - dense::Dense - dense2::Dense - end - Flux.@functor TwoDenses + oneadj = (nt = (m = mat, a = mat')) + @test length(Flux.params(oneadj)) == 1 # needs Functors@0.3 - function (m::TwoDenses)(x) - out = m.dense(x) - end + @test Flux.destructure(simple)[1] == Flux.destructure(oneadj)[1] == [1, 3, 2, 4] +end - model = TwoDenses( - Dense(3,1), - Dense(3,2) - ) - p, re = Flux.destructure(model) +@testset "Various destructure bugs" begin + @testset "issue 1601" begin + struct TwoDenses + 
dense::Dense + dense2::Dense + end + Flux.@functor TwoDenses - x = [1., 2., 3.] - y, back = Flux.Zygote.pullback((x, p) -> re(p)(x), x, p) + function (m::TwoDenses)(x) + return out = m.dense(x) + end - dy = [4.] - dx, dp = back(dy) - @test length(p) == length(dp) - end + model = TwoDenses(Dense(3, 1), + Dense(3, 2)) + p, re = Flux.destructure(model) - @testset "issue 1727" begin - p, re = Flux.destructure(BatchNorm(3)) # 6 parameters, plus 6 non-trainable - @test length(p) == 6 + x = [1.0, 2.0, 3.0] + y, back = Flux.Zygote.pullback((x, p) -> re(p)(x), x, p) - x = rand(Float32, 3, 4) - y, back = Flux.pullback(x, p) do x, p - vec(re(p)(x)) + dy = [4.0] + dx, dp = back(dy) + @test length(p) == length(dp) end - @test_nowarn back(y) - b = back(y) - @test size(b[1]) == size(x) - @test size(b[2]) == size(p) - end + @testset "issue 1727" begin + p, re = Flux.destructure(BatchNorm(3)) # 6 parameters, plus 6 non-trainable + @test length(p) == 6 + + x = rand(Float32, 3, 4) + y, back = Flux.pullback(x, p) do x, p + return vec(re(p)(x)) + end + @test_nowarn back(y) + b = back(y) - @testset "issue 1767" begin - struct Model{A} - a::A - b::A + @test size(b[1]) == size(x) + @test size(b[2]) == size(p) end - Flux.@functor Model - (m::Model)(x) = m.a(x) .+ m.b(x) - d = Dense(1, 1) - x = rand(Float32, 1, 1) + @testset "issue 1767" begin + struct Model{A} + a::A + b::A + end + Flux.@functor Model + (m::Model)(x) = m.a(x) .+ m.b(x) - # Sharing the parameters - model = Model(d, d) + d = Dense(1, 1) + x = rand(Float32, 1, 1) - # Works - g1 = Flux.gradient(() -> sum(model(x)), Flux.params(model)) + # Sharing the parameters + model = Model(d, d) - p, re = Flux.destructure(model) - # Fails - g2 = Flux.gradient(p -> sum(re(p)(x)), p) + # Works + g1 = Flux.gradient(() -> sum(model(x)), Flux.params(model)) - @test g2[1] ≈ vcat(g1[d.weight], g1[d.bias]) - end + p, re = Flux.destructure(model) + # Fails + g2 = Flux.gradient(p -> sum(re(p)(x)), p) - @testset "issue 1826" begin - struct Split{T} # taken from: https://fluxml.ai/Flux.jl/stable/models/advanced/#Multiple-outputs:-a-custom-Split-layer - paths::T + @test g2[1] ≈ vcat(g1[d.weight], g1[d.bias]) end - Split(paths...) = Split(paths) - Flux.@functor Split - (m::Split)(x::AbstractArray) = map(f -> f(x), m.paths) - n_input, n_batch, n_shared = 5, 13, 11 - n_outputs = [3, 7] + @testset "issue 1826" begin + struct Split{T} # taken from: https://fluxml.ai/Flux.jl/stable/models/advanced/#Multiple-outputs:-a-custom-Split-layer + paths::T + end + Split(paths...) 
= Split(paths) + Flux.@functor Split + (m::Split)(x::AbstractArray) = map(f -> f(x), m.paths) - data = rand(Float32, n_input, n_batch) - model = Chain( - Dense(n_input, n_shared), - Split(Dense(n_shared, n_outputs[1]), Dense(n_shared, n_outputs[2])) - ) + n_input, n_batch, n_shared = 5, 13, 11 + n_outputs = [3, 7] - pvec, re = Flux.destructure(model) - loss(x, idx, pv) = sum(abs2, re(pv)(x)[idx]) # loss wrt `idx`th output term + data = rand(Float32, n_input, n_batch) + model = Chain(Dense(n_input, n_shared), + Split(Dense(n_shared, n_outputs[1]), Dense(n_shared, n_outputs[2]))) - g = Flux.Zygote.ForwardDiff.gradient(pv -> loss(data, 1, pv), pvec) - @test g ≈ Flux.Zygote.gradient(pv -> loss(data, 1, pv), pvec)[1] - end -end + pvec, re = Flux.destructure(model) + loss(x, idx, pv) = sum(abs2, re(pv)(x)[idx]) # loss wrt `idx`th output term -@testset "Rrule" begin - @testset "issue 2033" begin - if CUDA.functional() - struct Wrapped{T} - x::T - end - y, _ = Flux.pullback(Wrapped, cu(randn(3,3))) - @test y isa Wrapped{<:CuArray} + g = Flux.Zygote.ForwardDiff.gradient(pv -> loss(data, 1, pv), pvec) + @test g ≈ Flux.Zygote.gradient(pv -> loss(data, 1, pv), pvec)[1] end - end end +@testset "Rrule" begin @testset "issue 2033" begin if CUDA.functional() + struct Wrapped{T} + x::T + end + y, _ = Flux.pullback(Wrapped, cu(randn(3, 3))) + @test y isa Wrapped{<:CuArray} +end end end + # make sure rng_from_array is non_differentiable @testset "rng_from_array" begin - m(x) = (rand(rng_from_array(x)) * x)[1] - gradient(m, ones(2)) + m(x) = (rand(rng_from_array(x)) * x)[1] + gradient(m, ones(2)) end From f263ed7c7b52eea47dece2e12b32646d758fd40a Mon Sep 17 00:00:00 2001 From: Saransh Chopra Date: Wed, 19 Oct 2022 14:11:24 +0530 Subject: [PATCH 2/2] Preserve git blame --- .git-blame-ignore-revs | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000000..0eaecea1ff --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# activated JuliaFormatter +bf30ed78e89f9a652d885d1848a1d41f414dffc8