Check status of broken tests #742

Merged
merged 3 commits into from
Jul 1, 2024
17 changes: 13 additions & 4 deletions .buildkite/testing.yml
@@ -1,7 +1,7 @@
steps:
- group: ":julia: CUDA GPU"
steps:
- label: ":julia: Julia {{matrix.julia}} + CUDA GPU"
- label: ":julia: Julia {{matrix.julia}} + {{matrix.testing_group}} + CUDA GPU"
plugins:
- JuliaCI/julia#v1:
version: "{{matrix.julia}}"
@@ -17,12 +17,16 @@ steps:
cuda: "*"
env:
BACKEND_GROUP: "CUDA"
LUX_TEST_GROUP: "{{matrix.testing_group}}"
if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip ci\]/
timeout_in_minutes: 240
timeout_in_minutes: 60
matrix:
setup:
julia:
- "1"
testing_group:
- "!distributed"
- "distributed"

- group: ":telescope: Downstream CUDA"
steps:
@@ -42,7 +46,7 @@ steps:
env:
RETESTITEMS_NWORKERS: 2
if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip downstream\]/ && build.message !~ /\[skip ci\]/ && build.pull_request.labels includes "run downstream test"
timeout_in_minutes: 240
timeout_in_minutes: 60
matrix:
setup:
repo:
@@ -52,7 +56,7 @@ steps:

- group: ":julia: AMD GPU"
steps:
- label: ":julia: Julia: {{matrix.julia}} + AMD GPU"
- label: ":julia: Julia: {{matrix.julia}} + {{matrix.testing_group}} + AMD GPU"
plugins:
- JuliaCI/julia#v1:
version: "{{matrix.julia}}"
@@ -68,6 +72,8 @@ steps:
JULIA_AMDGPU_HIP_MUST_LOAD: "1"
JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
BACKEND_GROUP: "AMDGPU"
LUX_TEST_GROUP: "{{matrix.testing_group}}"
RETESTITEMS_NWORKERS: 2
agents:
queue: "juliagpu"
rocm: "*"
@@ -78,6 +84,9 @@
setup:
julia:
- "1"
testing_group:
- "!distributed"
- "distributed"

- group: ":telescope: Downstream AMD GPU"
steps:
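
Net effect of this file's changes: each GPU job is split into a "distributed" and a "!distributed" matrix entry, the choice is passed to the test suite through the LUX_TEST_GROUP environment variable, the AMD GPU job additionally gets RETESTITEMS_NWORKERS set to 2, and the per-job timeout drops from 240 to 60 minutes. Below is a minimal sketch of how a runner could consume these variables; only BACKEND_GROUP, LUX_TEST_GROUP, and the "distributed"/"!distributed" values come from the YAML above, while the parsing logic itself is an assumption, not the actual runtests.jl.

    # Hypothetical sketch (not part of this PR): turn the Buildkite env vars into a
    # predicate the test runner can use to pick test groups.
    const BACKEND_GROUP = lowercase(get(ENV, "BACKEND_GROUP", "all"))
    const LUX_TEST_GROUP = lowercase(get(ENV, "LUX_TEST_GROUP", "all"))

    # A leading '!' is read as "every group except this one".
    function should_run_group(group::AbstractString)
        LUX_TEST_GROUP == "all" && return true
        startswith(LUX_TEST_GROUP, "!") && return group != LUX_TEST_GROUP[2:end]
        return group == LUX_TEST_GROUP
    end

    should_run_group("distributed")   # false when LUX_TEST_GROUP == "!distributed"
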
3 changes: 0 additions & 3 deletions codecov.yml

This file was deleted.

9 changes: 6 additions & 3 deletions ext/LuxForwardDiffExt/utils.jl
@@ -1,7 +1,10 @@
# Low-Level functions
@inline function Lux.__partials(::Type{Tag}, x, i) where {Tag}
x isa ForwardDiff.Dual && return ForwardDiff.partials(Tag, x, i)
x isa AbstractArray && return ForwardDiff.partials.(Tag, x, i)
if x isa AbstractArray
bfn(xᵢ, iᵢ) = ForwardDiff.partials(Tag, xᵢ, iᵢ)
return bfn.(x, i)
end
map_fn = @closure(xᵢ->Lux.__partials(Tag, xᵢ, i))
x isa Tuple && return map(map_fn, x)
x isa NamedTuple && return NamedTuple{keys(x)}(map(map_fn, values(x)))
@@ -12,8 +15,8 @@ end

@inline function Lux.__dualify(::Type{Tag}, ::Type{T}, x, u) where {Tag, T}
if x isa AbstractArray
return ForwardDiff.Dual{
Tag, T, 1}.(x, ForwardDiff.Partials{1, T}.(tuple.(reshape(u, size(x)))))
bfn(xᵢ, uᵢ) = ForwardDiff.Dual{Tag, T, 1}(xᵢ, ForwardDiff.Partials{1, T}(uᵢ))
return bfn.(x, tuple.(reshape(u, size(x))))
end
x isa Tuple && return map((xᵢ, uᵢ) -> Lux.__dualify(Tag, T, xᵢ, uᵢ), x, u)
x isa NamedTuple &&
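
The rewrite above avoids broadcasting calls whose first argument is the Tag type (ForwardDiff.partials.(Tag, x, i) and the analogous Dual construction); instead a small function that captures Tag is defined and then broadcast over the array. A standalone sketch of the same pattern follows; MyTag and the concrete arrays are assumptions for illustration, while the ForwardDiff calls mirror the diff.

    using ForwardDiff

    struct MyTag end   # hypothetical tag type, stands in for the Tag parameter above

    x = Float32[1.0, 2.0, 3.0]
    u = Float32[0.1, 0.2, 0.3]

    # __dualify-style seeding: attach a single partial to every element
    dual_fn(xᵢ, uᵢ) = ForwardDiff.Dual{MyTag, Float32, 1}(xᵢ, ForwardDiff.Partials{1, Float32}(uᵢ))
    xd = dual_fn.(x, tuple.(u))

    # __partials-style extraction: read the i-th partial back from every element
    part_fn(xᵢ, iᵢ) = ForwardDiff.partials(MyTag, xᵢ, iᵢ)
    part_fn.(xd, 1)   # recovers u
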
3 changes: 1 addition & 2 deletions src/layers/basic.jl
@@ -247,8 +247,7 @@ function WrappedFunction(f::F) where {F}
# Not a depwarn but helpful to call this
Base.depwarn("The current default of `:direct_call` will be replaced with \
`:runtime_check` from v0.6). Please make sure that the assumptions of \
this function are correct or specific \
`WrappedFunction{:direct_call}(f)`",
this function are correct or specify `WrappedFunction{:direct_call}(f)`",
:WrappedFunction)
return WrappedFunction{:direct_call}(f)
end
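
The reworded warning now reads cleanly: verify the assumptions of direct calling, or construct the layer explicitly. A one-line usage sketch of the explicit form the message recommends (the wrapped function here is an arbitrary example):

    using Lux
    layer = WrappedFunction{:direct_call}(x -> x .^ 2)   # explicit mode, no warning
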
16 changes: 0 additions & 16 deletions test/core_tests.jl

This file was deleted.

7 changes: 1 addition & 6 deletions test/helpers/batched_ad_tests.jl
@@ -4,9 +4,6 @@
rng = StableRNG(12345)

@testset "$mode" for (mode, aType, dev, ongpu) in MODES
# FIXME: AMDGPU takes too long right now
mode === "amdgpu" && continue

models = (
Chain(Conv((3, 3), 2 => 4, gelu; pad=SamePad()),
Conv((3, 3), 4 => 2, gelu; pad=SamePad()), FlattenLayer(), Dense(18 => 2)),
@@ -84,16 +81,14 @@ end
rng = StableRNG(12345)

@testset "$mode" for (mode, aType, dev, ongpu) in MODES
# FIXME: AMDGPU takes too long right now
mode === "amdgpu" && continue

models = (
Chain(Conv((3, 3), 2 => 4, gelu; pad=SamePad()),
Conv((3, 3), 4 => 2, gelu; pad=SamePad()), FlattenLayer(), Dense(18 => 2)),
Chain(Dense(2, 4, gelu), Dense(4, 2)))
Xs = (aType(randn(rng, Float32, 3, 3, 2, 4)), aType(randn(rng, Float32, 2, 4)))

for (model, X) in zip(models, Xs), backend in (AutoZygote(), AutoForwardDiff())
model = maybe_rewrite_to_crosscor(mode, model)
ps, st = Lux.setup(rng, model) |> dev

function loss_function_batched(model, x, ps, st)
28 changes: 9 additions & 19 deletions test/helpers/nestedad_tests.jl
@@ -6,9 +6,6 @@
rng = StableRNG(1234)

@testset "$mode" for (mode, aType, dev, ongpu) in MODES
# FIXME: AMDGPU takes too long right now
mode === "amdgpu" && continue

Xs = (aType(randn(rng, Float32, 3, 3, 2, 4)), aType(randn(rng, Float32, 2, 4)),
aType(randn(rng, Float32, 2, 4)), aType(randn(rng, Float32, 3, 3, 2, 4)))
models = (
@@ -22,6 +19,7 @@
BatchNorm(2), FlattenLayer(), Dense(18 => 1)))

for (X, model) in zip(Xs, models)
model = maybe_rewrite_to_crosscor(mode, model)
ps, st = Lux.setup(rng, model) |> dev

# smodel | ForwardDiff.jacobian
@@ -52,8 +50,9 @@
(loss_function1, loss_function2, loss_function3, loss_function4)

for loss_fn in loss_fns
@test_nowarn loss_fn(model, X, ps, st)
@test loss_fn(model, X, ps, st) isa Number
l = loss_fn(model, X, ps, st)
@test l isa Number
@test isfinite(l) && !isnan(l)

_, ∂x, ∂ps, _ = Zygote.gradient(loss_fn, model, X, ps, st)

@@ -84,9 +83,6 @@ end
rng = StableRNG(1234)

@testset "$mode" for (mode, aType, dev, ongpu) in MODES
# FIXME: AMDGPU takes too long right now
mode === "amdgpu" && continue

Xs = (aType(randn(rng, Float32, 3, 3, 2, 4)), aType(randn(rng, Float32, 2, 4)),
aType(randn(rng, Float32, 2, 4)), aType(randn(rng, Float32, 3, 3, 2, 4)))
models = (
@@ -100,6 +96,7 @@ end
BatchNorm(2), FlattenLayer(), Dense(18 => 1)))

for (X, model) in zip(Xs, models)
model = maybe_rewrite_to_crosscor(mode, model)
ps, st = Lux.setup(rng, model)
ps = ps |> ComponentArray |> dev
st = st |> dev
@@ -134,8 +131,9 @@ end
(loss_function1, loss_function2, loss_function3, loss_function4)

for loss_fn in loss_fns
@test_nowarn loss_fn(model, X, ps, st)
@test loss_fn(model, X, ps, st) isa Number
l = loss_fn(model, X, ps, st)
@test l isa Number
@test isfinite(l) && !isnan(l)

_, ∂x, ∂ps, _ = Zygote.gradient(loss_fn, model, X, ps, st)

@@ -166,9 +164,6 @@ end
rng = StableRNG(1234)

@testset "$mode" for (mode, aType, dev, ongpu) in MODES
# FIXME: AMDGPU takes too long right now
mode === "amdgpu" && continue

@testset "Structured Matrix: Issue LuxDL/Lux.jl#602" begin
model = @compact(; potential=Dense(5 => 5, gelu)) do x
@return reshape(diag(only(Zygote.jacobian(potential, x))), size(x))
@@ -206,16 +201,14 @@ end
rng = StableRNG(1234)

@testset "$mode" for (mode, aType, dev, ongpu) in MODES
# FIXME: AMDGPU takes too long right now
mode === "amdgpu" && continue

models = (
Chain(Conv((3, 3), 2 => 4, gelu; pad=SamePad()), BatchNorm(4),
Conv((3, 3), 4 => 1, gelu; pad=SamePad())),
Chain(Dense(2, 4, gelu), Dense(4, 1)))
Xs = (aType(randn(rng, Float32, 3, 3, 2, 4)), aType(randn(rng, Float32, 2, 4)))

for (model, X) in zip(models, Xs)
model = maybe_rewrite_to_crosscor(mode, model)
ps, st = Lux.setup(rng, model) |> dev

vjp_input = first(model(X, ps, st))
@@ -278,9 +271,6 @@ end
rng = StableRNG(1234)

@testset "$mode" for (mode, aType, dev, ongpu) in MODES
# FIXME: AMDGPU takes too long right now
mode === "amdgpu" && continue

x = rand(rng, 3, 3) |> aType
v = vec(rand(rng, 3, 3)) |> aType

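
Besides dropping the AMDGPU skips, these tests now route models through maybe_rewrite_to_crosscor (a helper from the shared test setup, not shown in this diff) and replace the @test_nowarn double evaluation with a single call whose result must be a finite Number. For orientation, a hedged sketch of the structured-matrix case from issue LuxDL/Lux.jl#602 exercised above; the model definition is copied from the diff, while the input size and the surrounding calls are assumptions.

    using Lux, Zygote, ForwardDiff, LinearAlgebra, StableRNGs

    rng = StableRNG(1234)
    model = @compact(; potential=Dense(5 => 5, gelu)) do x
        @return reshape(diag(only(Zygote.jacobian(potential, x))), size(x))
    end
    ps, st = Lux.setup(rng, model)
    x = randn(rng, Float32, 5, 3)          # assumed input size

    y, _ = model(x, ps, st)                # forward pass runs the inner Zygote.jacobian
    # outer gradient; Lux's nested AD handles differentiating through the inner jacobian
    ∂x, ∂ps = Zygote.gradient((x, ps) -> sum(abs2, first(model(x, ps, st))), x, ps)
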
11 changes: 7 additions & 4 deletions test/layers/containers_tests.jl
@@ -4,7 +4,8 @@

@testset "$mode" for (mode, aType, device, ongpu) in MODES
@testset "zero sum" begin
layer = SkipConnection(WrappedFunction(zero), (a, b) -> a .+ b)
layer = SkipConnection(
WrappedFunction{:direct_call}(Broadcast.BroadcastFunction(zero)), .+)
display(layer)
ps, st = Lux.setup(rng, layer) .|> device
x = randn(rng, 10, 10, 10, 10) |> aType
@@ -13,7 +14,7 @@

@jet layer(x, ps, st)
__f = x -> sum(first(layer(x, ps, st)))
@eval @test_gradients $__f $x atol=1.0f-3 rtol=1.0f-3 reverse_diff_broken=true gpu_testing=$ongpu
@eval @test_gradients $__f $x atol=1.0f-3 rtol=1.0f-3 gpu_testing=$ongpu
end

@testset "concat size" begin
@@ -36,7 +37,9 @@ end

@testset "$mode" for (mode, aType, device, ongpu) in MODES
@testset "zero sum" begin
layer = Parallel(+, WrappedFunction(zero), NoOpLayer())
layer = Parallel(
+, WrappedFunction{:direct_call}(Broadcast.BroadcastFunction(zero)),
NoOpLayer())
@test :layer_1 in keys(layer) && :layer_2 in keys(layer)
display(layer)
ps, st = Lux.setup(rng, layer) .|> device
Expand All @@ -46,7 +49,7 @@ end

@jet layer(x, ps, st)
__f = x -> sum(first(layer(x, ps, st)))
@eval @test_gradients $__f $x atol=1.0f-3 rtol=1.0f-3 reverse_diff_broken=true gpu_testing=$ongpu
@eval @test_gradients $__f $x atol=1.0f-3 rtol=1.0f-3 gpu_testing=$ongpu
end

@testset "concat size" begin
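
The zero-sum tests now wrap Broadcast.BroadcastFunction(zero) in an explicit WrappedFunction{:direct_call} and use .+ as the combiner, which also removes the need for the reverse_diff_broken flag. A minimal sketch of what the two broadcast function objects do on their own (the values are illustrative):

    bzero = Broadcast.BroadcastFunction(zero)   # callable that broadcasts zero over its argument
    bzero([1.0 2.0; 3.0 4.0])                   # == [0.0 0.0; 0.0 0.0]

    bplus = (.+)                                # since Julia 1.6, (.+) is Broadcast.BroadcastFunction(+)
    bplus([1.0, 2.0], [3.0, 4.0])               # == [4.0, 6.0]
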