diff --git a/Project.toml b/Project.toml
index 8efed07..38eefd4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -12,7 +12,8 @@ julia = "1"
 
 [extras]
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "FiniteDifferences"]
+test = ["Test", "FiniteDifferences", "Random"]
diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl
index 6992955..15ba75b 100644
--- a/src/AbstractDifferentiation.jl
+++ b/src/AbstractDifferentiation.jl
@@ -31,16 +31,32 @@ primalvalue(x::Tuple) = map(primalvalue, x)
 primalvalue(x) = x
 
 function derivative(ab::AbstractBackend, f, xs::Number...)
-    return getindex.(jacobian(lowest(ab), f, xs...), 1)
+    der = getindex.(jacobian(lowest(ab), f, xs...), 1)
+    if der isa Tuple
+        return der
+    else
+        return (der,)
+    end
 end
+
 function gradient(ab::AbstractBackend, f, xs...)
-    return adjoint.(jacobian(lowest(ab), f, xs...))
+    return reshape.(adjoint.(jacobian(lowest(ab), f, xs...)), size.(xs))
 end
 function jacobian(ab::AbstractBackend, f, xs...) end
 function hessian(ab::AbstractBackend, f, xs...)
-    return jacobian(secondlowest(ab), (xs...,) -> begin
-        gradient(lowest(ab), f, xs...)
-    end, xs...)
+    xss = collect((xs...,))
+    counter = 0
+    # `gradient` returns a tuple of gradients, one for each input x, y ∈ xs.
+    # The Hessian is the Jacobian of each of these gradients, i.e., for a
+    # scalar-valued function `f`, the tuple of matrices ∂x∂x f and ∂y∂y f.
+    hess = map((xs...,)) do x
+        counter += 1
+        _f = _x -> f(setindex!(deepcopy(xss), _x, counter)...)
+        return jacobian(secondlowest(ab), (x,) -> begin
+            return gradient(lowest(ab), _f, x)
+        end, x)[1]
+    end
+    return hess
 end
 
 function value_and_derivative(ab::AbstractBackend, f, xs::Number...)
@@ -49,11 +65,15 @@ function value_and_derivative(ab::AbstractBackend, f, xs::Number...)
 end
 function value_and_gradient(ab::AbstractBackend, f, xs...)
     value, jacs = value_and_jacobian(lowest(ab), f, xs...)
-    return value, adjoint.(jacs)
+    return value, reshape.(adjoint.(jacs), size.(xs))
 end
 function value_and_jacobian(ab::AbstractBackend, f, xs...)
     local value
     primalcalled = false
+    if lowest(ab) isa AbstractFiniteDifference
+        value = primalvalue(ab, nothing, f, xs)
+        primalcalled = true
+    end
     jacs = jacobian(lowest(ab), (_xs...,) -> begin
         v = f(_xs...)
         if !primalcalled
@@ -62,11 +82,16 @@ function value_and_jacobian(ab::AbstractBackend, f, xs...)
         end
         return v
     end, xs...)
+
     return value, jacs
 end
 function value_and_hessian(ab::AbstractBackend, f, xs...)
     local value
     primalcalled = false
+    if ab isa AbstractFiniteDifference
+        value = primalvalue(ab, nothing, f, xs)
+        primalcalled = true
+    end
     hess = jacobian(secondlowest(ab), (_xs...,) -> begin
         v, g = value_and_gradient(lowest(ab), f, _xs...)
         if !primalcalled
@@ -145,6 +170,10 @@ function value_and_pushforward_function(
     @assert ds isa Tuple && length(ds) == length(xs)
     local value
     primalcalled = false
+    if ab isa AbstractFiniteDifference
+        value = primalvalue(ab, nothing, f, xs)
+        primalcalled = true
+    end
     pf = pushforward_function(lowest(ab), (_xs...,) -> begin
         vs = f(_xs...)
         if !primalcalled
@@ -199,6 +228,10 @@ function value_and_pullback_function(
     return (ws) -> begin
         local value
         primalcalled = false
+        if ab isa AbstractFiniteDifference
+            value = primalvalue(ab, nothing, f, xs)
+            primalcalled = true
+        end
         if ws === nothing
             vs = f(xs...)
             if !primalcalled
@@ -224,63 +257,169 @@ struct LazyDerivative{B, F, X}
     f::F
     xs::X
 end
+
 function Base.:*(d::LazyDerivative, y)
-    return derivative(d.ab, d.f, d.xs...) * y
+    return derivative(d.backend, d.f, d.xs...) * y
 end
+
 function Base.:*(y, d::LazyDerivative)
-    return y * derivative(d.ab, d.f, d.xs...)
+    return y * derivative(d.backend, d.f, d.xs...)
+end
+
+function Base.:*(d::LazyDerivative, y::Union{Number,Tuple})
+    return derivative(d.backend, d.f, d.xs...) .* y
+end
+
+function Base.:*(y::Union{Number,Tuple}, d::LazyDerivative)
+    return y .* derivative(d.backend, d.f, d.xs...)
 end
+function Base.:*(d::LazyDerivative, y::AbstractArray)
+    return map((d) -> d * y, derivative(d.backend, d.f, d.xs...))
+end
+
+function Base.:*(y::AbstractArray, d::LazyDerivative)
+    return map((d) -> y * d, derivative(d.backend, d.f, d.xs...))
+end
+
+
 struct LazyGradient{B, F, X}
     backend::B
     f::F
     xs::X
 end
-Base.:*(d::LazyGradient, y) = gradient(d.ab, d.f, d.xs...) * y
-Base.:*(y, d::LazyGradient) = y * gradient(d.ab, d.f, d.xs...)
+Base.:*(d::LazyGradient, y) = gradient(d.backend, d.f, d.xs...) * y
+Base.:*(y, d::LazyGradient) = y * gradient(d.backend, d.f, d.xs...)
+
+function Base.:*(d::LazyGradient, y::Union{Number,Tuple})
+    if d.xs isa Tuple
+        return gradient(d.backend, d.f, d.xs...) .* y
+    else
+        return gradient(d.backend, d.f, d.xs) .* y
+    end
+end
+
+function Base.:*(y::Union{Number,Tuple}, d::LazyGradient)
+    if d.xs isa Tuple
+        return y .* gradient(d.backend, d.f, d.xs...)
+    else
+        return y .* gradient(d.backend, d.f, d.xs)
+    end
+end
+
 struct LazyJacobian{B, F, X}
     backend::B
     f::F
     xs::X
 end
+
 function Base.:*(d::LazyJacobian, ys)
-    return pushforward_function(d.ab, d.f, d.xs...)(ys)
+    if !(ys isa Tuple)
+        ys = (ys,)
+    end
+    # this computes a Jacobian-vector product (pushforward)
+    if d.xs isa Tuple
+        jvp = pushforward_function(d.backend, d.f, d.xs...)(ys)
+    else
+        jvp = pushforward_function(d.backend, d.f, d.xs)(ys)
+    end
+    if jvp isa Tuple
+        return jvp
+    else
+        return (jvp,)
+    end
 end
+
 function Base.:*(ys, d::LazyJacobian)
     if ys isa Tuple
         ya = adjoint.(ys)
     else
         ya = adjoint(ys)
     end
-    return pullback_function(d.ab, d.f, d.xs...)(ya)
+    if d.xs isa Tuple
+        return pullback_function(d.backend, d.f, d.xs...)(ya)
+    else
+        return pullback_function(d.backend, d.f, d.xs)(ya)
+    end
+end
+
+function Base.:*(d::LazyJacobian, ys::Number)
+    if d.xs isa Tuple
+        return jacobian(d.backend, d.f, d.xs...) .* ys
+    else
+        return jacobian(d.backend, d.f, d.xs) .* ys
+    end
 end
+function Base.:*(ys::Number, d::LazyJacobian)
+    if d.xs isa Tuple
+        return jacobian(d.backend, d.f, d.xs...) .* ys
+    else
+        return jacobian(d.backend, d.f, d.xs) .* ys
+    end
+end
+
+
 struct LazyHessian{B, F, X}
     backend::B
     f::F
     xs::X
 end
+
 function Base.:*(d::LazyHessian, ys)
-    return pushforward_function(
-        secondlowest(d.ab),
-        (xs...,) -> gradient(lowest(d.ab), d.f, xs...),
-        d.xs...,
-    )(ys)
+    if !(ys isa Tuple)
+        ys = (ys,)
+    end
+
+    if d.xs isa Tuple
+        return pushforward_function(
+            secondlowest(d.backend),
+            (xs...,) -> gradient(lowest(d.backend), d.f, xs...), d.xs...,)(ys)
+    else
+        return pushforward_function(
+            secondlowest(d.backend),
+            (xs,) -> gradient(lowest(d.backend), d.f, xs), d.xs,)(ys)
+    end
 end
+
 function Base.:*(ys, d::LazyHessian)
     if ys isa Tuple
         ya = adjoint.(ys)
     else
         ya = adjoint(ys)
     end
-    return pullback_function(
-        secondlowest(d.ab),
-        (xs...,) -> gradient(lowest(d.ab), d.f, xs...),
-        d.xs...,
-    )(ya)
+    if d.xs isa Tuple
+        return pullback_function(
+            secondlowest(d.backend),
+            (xs...,) -> gradient(lowest(d.backend), d.f, xs...),
+            d.xs...,
+        )(ya)
+    else
+        return pullback_function(
+            secondlowest(d.backend),
+            (xs,) -> gradient(lowest(d.backend), d.f, xs)[1],
+            d.xs,
+        )(ya)
+    end
 end
+
+function Base.:*(d::LazyHessian, ys::Number)
+    if d.xs isa Tuple
+        return hessian(d.backend, d.f, d.xs...) .* ys
+    else
+        return hessian(d.backend, d.f, d.xs) .* ys
+    end
+end
+
+function Base.:*(ys::Number, d::LazyHessian)
+    if d.xs isa Tuple
+        return ys .* hessian(d.backend, d.f, d.xs...)
+    else
+        return ys .* hessian(d.backend, d.f, d.xs)
+    end
+end
+
+
 function lazyderivative(ab::AbstractBackend, f, xs::Number...)
     return LazyDerivative(ab, f, xs)
 end
@@ -350,6 +489,20 @@ function define_pushforward_function_and_friends(fdef)
                 pff(cols)
             end
         end
+    elseif eltype(identity_like) <: AbstractMatrix
+        # needed for the computation of the Hessian and the Jacobian
+        ret = hcat.(mapslices(identity_like[1], dims=1) do cols
+            # `cols` loops over the basis vectors
+            pf = pff((cols,))
+            if typeof(pf) <: AbstractVector
+                # wrap in a tuple so the broadcasted hcat produces a
+                # matrix-shaped (non-flat) output
+                return (pf,)
+            else
+                return pf
+            end
+        end ...)
+        return ret isa Tuple ? ret : (ret,)
+
     else
         return pff(identity_like)
     end
@@ -373,6 +526,13 @@ function define_pullback_function_and_friends(fdef)
                 value_and_pbf(cols)[2]'
             end
         end
+    elseif eltype(identity_like) <: AbstractMatrix
+        # needed for the Hessian computation:
+        # the value is a `(grad,)` tuple and `identity_like` is a `(matrix,)`
+        # tuple; `cols` loops over the columns of that matrix
+        return vcat.(mapslices(identity_like[1], dims=1) do cols
+            adjoint.(value_and_pbf((cols,))[2])
+        end ...)
     else
         return adjoint.(value_and_pbf(identity_like)[2])
     end
@@ -427,4 +587,4 @@ function zero_matrix_like(x)
     throw("The function `zero_matrix_like` is not defined for the type $(typeof(x)).")
 end
 
-end
\ No newline at end of file
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 324a265..23ce6dc 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,5 +1,7 @@
 using AbstractDifferentiation
 using Test, FiniteDifferences, LinearAlgebra
+using Random
+Random.seed!(1234)
 
 const FDM = FiniteDifferences
 
@@ -19,7 +21,9 @@ end
 FDMBackend2() = FDMBackend2(central_fdm(5, 1))
 const fdm_backend2 = FDMBackend2()
 AD.@primitive function pushforward_function(ab::FDMBackend2, f, xs...)
-    return (vs) -> jvp(ab.alg, f, tuple.(xs, vs)...)
+    return function (vs)
+        jvp(ab.alg, f, tuple.(xs, vs)...)
+    end
 end
 
 struct FDMBackend3{A} <: AD.AbstractFiniteDifference
@@ -30,16 +34,39 @@ const fdm_backend3 = FDMBackend3()
 AD.@primitive function pullback_function(ab::FDMBackend3, f, xs...)
     return function (vs)
         # Supports only single output
-        @assert length(vs) == 1
-        return j′vp(ab.alg, f, vs[1], xs...)
+        if vs isa AbstractVector
+            return j′vp(ab.alg, f, vs, xs...)
+        else
+            @assert length(vs) == 1
+            return j′vp(ab.alg, f, vs[1], xs...)
+        end
     end
 end
 
 fder(x, y) = exp(y) * x + y * log(x)
+dfderdx(x, y) = exp(y) + y / x
+dfderdy(x, y) = exp(y) * x + log(x)
+
 fgrad(x, y) = prod(x) + sum(y ./ (1:length(y)))
+dfgraddx(x, y) = prod(x) ./ x
+dfgraddy(x, y) = one(eltype(y)) ./ (1:length(y))
+dfgraddxdx(x, y) = prod(x) ./ (x * x') - Diagonal(diag(prod(x) ./ (x * x')))
+dfgraddydy(x, y) = zeros(length(y), length(y))
+
 function fjac(x, y)
     x + Bidiagonal(-ones(length(y)) * 3, ones(length(y) - 1) / 2, :U) * y
 end
+dfjacdx(x, y) = I(length(x))
+dfjacdy(x, y) = Bidiagonal(-ones(length(y)) * 3, ones(length(y) - 1) / 2, :U)
+
+# Jvp
+jxvp(x, y, v) = dfjacdx(x, y) * v
+jyvp(x, y, v) = dfjacdy(x, y) * v
+
+# vJp
+vJxp(x, y, v) = dfjacdx(x, y)' * v
+vJyp(x, y, v) = dfjacdy(x, y)' * v
 
 const xscalar = rand()
 const yscalar = rand()
@@ -47,6 +74,10 @@ const yscalar = rand()
 const xvec = rand(5)
 const yvec = rand(5)
 
+# to check that the input vectors are not mutated
+xvec2 = deepcopy(xvec)
+yvec2 = deepcopy(yvec)
+
 function test_fdm_derivatives(fdm_backend)
     der1 = AD.derivative(fdm_backend, fder, xscalar, yscalar)
     der2 = (
@@ -57,6 +88,15 @@
     valscalar, der3 = AD.value_and_derivative(fdm_backend, fder, xscalar, yscalar)
     @test valscalar == fder(xscalar, yscalar)
     @test der3 .- der1 == (0, 0)
+    der_exact = (dfderdx(xscalar, yscalar), dfderdy(xscalar, yscalar))
+    @test minimum(isapprox.(der_exact, der1, rtol=1e-10))
+    # test that a single input (i.e., no tuple) works
+    valscalara, dera = AD.value_and_derivative(fdm_backend, x -> fder(x, yscalar), xscalar)
+    valscalarb, derb = AD.value_and_derivative(fdm_backend, y -> fder(xscalar, y), yscalar)
+    @test valscalar == valscalara
+    @test valscalar == valscalarb
+    @test isapprox(dera[1], der1[1], rtol=1e-10)
+    @test isapprox(derb[1], der1[2], rtol=1e-10)
 end
 
 function test_fdm_gradients(fdm_backend)
@@ -66,6 +106,17 @@
     valscalar, grad3 = AD.value_and_gradient(fdm_backend, fgrad, xvec, yvec)
     @test valscalar == fgrad(xvec, yvec)
     @test norm.(grad3 .- grad1) == (0, 0)
+    grad_exact = (dfgraddx(xvec, yvec), dfgraddy(xvec, yvec))
+    @test minimum(isapprox.(grad_exact, grad1, rtol=1e-10))
+    @test xvec == xvec2
+    @test yvec == yvec2
+    # test that a single input (i.e., no tuple) works
+    valscalara, grada = AD.value_and_gradient(fdm_backend, x -> fgrad(x, yvec), xvec)
+    valscalarb, gradb = AD.value_and_gradient(fdm_backend, y -> fgrad(xvec, y), yvec)
+    @test valscalar == valscalara
+    @test valscalar == valscalarb
+    @test isapprox(grada[1], grad1[1], rtol=1e-10)
+    @test isapprox(gradb[1], grad1[2], rtol=1e-10)
 end
 
 function test_fdm_jacobians(fdm_backend)
@@ -75,9 +126,25 @@
     valvec, jac3 = AD.value_and_jacobian(fdm_backend, fjac, xvec, yvec)
     @test valvec == fjac(xvec, yvec)
     @test norm.(jac3 .- jac1) == (0, 0)
+    jac_exact = (dfjacdx(xvec, yvec), dfjacdy(xvec, yvec))
+    @test minimum(isapprox.(jac_exact, jac1, rtol=1e-10))
+    @test xvec == xvec2
+    @test yvec == yvec2
+    # test that a single input (i.e., no tuple) works
+    valveca, jaca = AD.value_and_jacobian(fdm_backend, x -> fjac(x, yvec), xvec)
+    valvecb, jacb = AD.value_and_jacobian(fdm_backend, y -> fjac(xvec, y), yvec)
+    @test valvec == valveca
+    @test valvec == valvecb
+    @test isapprox(jaca[1], jac1[1], rtol=1e-10)
+    @test isapprox(jacb[1], jac1[2], rtol=1e-10)
 end
 
 function test_fdm_hessians(fdm_backend)
+    H1 = AD.hessian(fdm_backend, fgrad, xvec, yvec)
+    @test dfgraddxdx(xvec, yvec) ≈ H1[1] atol=1e-10
+    @test dfgraddydy(xvec, yvec) ≈ H1[2] atol=1e-10
+
+    # test that a single input (i.e., no tuple) works
     fhess = x -> fgrad(x, yvec)
     hess1 = AD.hessian(fdm_backend, fhess, xvec)
     hess2 = FDM.jacobian(
@@ -99,19 +166,42 @@
     @test valscalar == fgrad(xvec, yvec)
     @test norm.(grad .- AD.gradient(fdm_backend, fhess, xvec)) == (0,)
     @test norm.(hess4 .- hess1) == (0,)
+    @test dfgraddxdx(xvec, yvec) ≈ hess1[1] atol=1e-10
+    @test xvec == xvec2
+    @test yvec == yvec2
+    fhess2 = x -> dfgraddx(x, yvec)
+    hess5 = AD.jacobian(fdm_backend, fhess2, xvec)
+    @test minimum(isapprox.(hess5, hess1, atol=1e-10))
 end
 
 function test_fdm_jvp(fdm_backend)
     v = (rand(length(xvec)), rand(length(yvec)))
-    pf1 = AD.pushforward_function(fdm_backend, fjac, xvec, yvec)(v)
+
+    if fdm_backend isa FDMBackend2 # needs an augmented version of v
+        identity_like = AD.identity_matrix_like(v)
+        vaug = map(identity_like) do identity_like_i
+            identity_like_i .* v
+        end
+
+        pf1 = map(v -> AD.pushforward_function(fdm_backend, fjac, xvec, yvec)(v), vaug)
+        ((valvec1, pf3x), (valvec2, pf3y)) = map(v -> AD.value_and_pushforward_function(fdm_backend, fjac, xvec, yvec)(v), vaug)
+    else
+        pf1 = AD.pushforward_function(fdm_backend, fjac, xvec, yvec)(v)
+        valvec, pf3 = AD.value_and_pushforward_function(fdm_backend, fjac, xvec, yvec)(v)
+        ((valvec1, pf3x), (valvec2, pf3y)) = ((valvec, pf3[1]), (valvec, pf3[2]))
+    end
     pf2 = (
         FDM.jvp(fdm_backend.alg, x -> fjac(x, yvec), (xvec, v[1])),
        FDM.jvp(fdm_backend.alg, y -> fjac(xvec, y), (yvec, v[2])),
     )
     @test norm.(pf1 .- pf2) == (0, 0)
-    valvec, pf3 = AD.value_and_pushforward_function(fdm_backend, fjac, xvec, yvec)(v)
-    @test valvec == fjac(xvec, yvec)
-    @test norm.(pf3 .- pf1) == (0, 0)
+
+    @test valvec1 == fjac(xvec, yvec)
+    @test valvec2 == fjac(xvec, yvec)
+    @test norm.((pf3x, pf3y) .- pf1) == (0, 0)
+    @test minimum(isapprox.(pf1, (jxvp(xvec, yvec, v[1]), jyvp(xvec, yvec, v[2])), atol=1e-10))
+    @test xvec == xvec2
+    @test yvec == yvec2
 end
 
 function test_fdm_j′vp(fdm_backend)
@@ -122,6 +212,218 @@
     valvec, pb3 = AD.value_and_pullback_function(fdm_backend, fjac, xvec, yvec)(w)
     @test valvec == fjac(xvec, yvec)
     @test norm.(pb3 .- pb1) == (0, 0)
+    @test minimum(isapprox.(pb1, (vJxp(xvec, yvec, w), vJyp(xvec, yvec, w)), atol=1e-10))
+    @test xvec == xvec2
+    @test yvec == yvec2
+end
+
+function test_fdm_lazy_derivatives(fdm_backend)
+    # single-input function
+    der1 = AD.derivative(fdm_backend, x -> fder(x, yscalar), xscalar)
+    der2 = (
+        fdm_backend.alg(x -> fder(x, yscalar), xscalar),
+        fdm_backend.alg(y -> fder(xscalar, y), yscalar),
+    )
+
+    lazyder = AD.LazyDerivative(fdm_backend, x -> fder(x, yscalar), xscalar)
+
+    # multiplication with scalar
+    @test der1[1] * yscalar == der2[1] * yscalar
+    @test lazyder * yscalar == der1 .* yscalar
+    @test lazyder * yscalar isa Tuple
+
+    @test yscalar * der1[1] == yscalar * der2[1]
+    @test yscalar * lazyder == yscalar .* der1
+    @test yscalar * lazyder isa Tuple
+
+    # multiplication with array
+    @test der1[1] * yvec == der2[1] * yvec
+    @test lazyder * yvec == (der1 .* yvec,)
+    @test lazyder * yvec isa Tuple
+
+    @test yvec * der1[1] == yvec * der2[1]
+    @test yvec * lazyder == (yvec .* der1,)
+    @test yvec * lazyder isa Tuple
+
+    # multiplication with tuple
+    @test lazyder * (yscalar,) == lazyder * yscalar
+    @test lazyder * (yvec,) == lazyder * yvec
+
+    @test (yscalar,) * lazyder == yscalar * lazyder
+    @test (yvec,) * lazyder == yvec * lazyder
+
+    # two-input function
+    der1 = AD.derivative(fdm_backend, fder, xscalar, yscalar)
+    der2 = (
+        fdm_backend.alg(x -> fder(x, yscalar), xscalar),
+        fdm_backend.alg(y -> fder(xscalar, y), yscalar),
+    )
+
+    lazyder = AD.LazyDerivative(fdm_backend, fder, (xscalar, yscalar))
+
+    # multiplication with scalar
+    @test der1 .* yscalar == der2 .* yscalar
+    @test lazyder * yscalar == der1 .* yscalar
+    @test lazyder * yscalar isa Tuple
+
+    @test yscalar .* der1 == yscalar .* der2
+    @test yscalar * lazyder == yscalar .* der1
+    @test yscalar * lazyder isa Tuple
+
+    # multiplication with array
+    @test (der1[1] * yvec, der1[2] * yvec) == (der2[1] * yvec, der2[2] * yvec)
+    @test lazyder * yvec == (der1[1] * yvec, der1[2] * yvec)
+    @test lazyder * yvec isa Tuple
+
+    @test (yvec * der1[1], yvec * der1[2]) == (yvec * der2[1], yvec * der2[2])
+    @test yvec * lazyder == (yvec * der1[1], yvec * der1[2])
+    @test yvec * lazyder isa Tuple
+
+    # multiplication with tuple
+    @test lazyder * (yscalar,) == lazyder * yscalar
+    @test lazyder * (yvec,) == lazyder * yvec
+
+    @test (yscalar,) * lazyder == yscalar * lazyder
+    @test (yvec,) * lazyder == yvec * lazyder
+end
+
+function test_fdm_lazy_gradients(fdm_backend)
+    # single-input function
+    grad1 = AD.gradient(fdm_backend, x -> fgrad(x, yvec), xvec)
+    grad2 = FDM.grad(fdm_backend.alg, x -> fgrad(x, yvec), xvec)
+    lazygrad = AD.LazyGradient(fdm_backend, x -> fgrad(x, yvec), xvec)
+
+    # multiplication with scalar
+    @test norm.(grad1 .* yscalar .- grad2 .* yscalar) == (0,)
+    @test norm.(lazygrad * yscalar .- grad1 .* yscalar) == (0,)
+    @test lazygrad * yscalar isa Tuple
+
+    @test norm.(yscalar .* grad1 .- yscalar .* grad2) == (0,)
+    @test norm.(yscalar * lazygrad .- yscalar .* grad1) == (0,)
+    @test yscalar * lazygrad isa Tuple
+
+    # multiplication with tuple
+    @test lazygrad * (yscalar,) == lazygrad * yscalar
+    @test (yscalar,) * lazygrad == yscalar * lazygrad
+
+    # two-input function
+    grad1 = AD.gradient(fdm_backend, fgrad, xvec, yvec)
+    grad2 = FDM.grad(fdm_backend.alg, fgrad, xvec, yvec)
+    lazygrad = AD.LazyGradient(fdm_backend, fgrad, (xvec, yvec))
+
+    # multiplication with scalar
+    @test norm.(grad1 .* yscalar .- grad2 .* yscalar) == (0, 0)
+    @test norm.(lazygrad * yscalar .- grad1 .* yscalar) == (0, 0)
+    @test lazygrad * yscalar isa Tuple
+
+    @test norm.(yscalar .* grad1 .- yscalar .* grad2) == (0, 0)
+    @test norm.(yscalar * lazygrad .- yscalar .* grad1) == (0, 0)
+    @test yscalar * lazygrad isa Tuple
+
+    # multiplication with tuple
+    @test lazygrad * (yscalar,) == lazygrad * yscalar
+    @test (yscalar,) * lazygrad == yscalar * lazygrad
+end
+
+function test_fdm_lazy_jacobians(fdm_backend)
+    # single-input function
+    jac1 = AD.jacobian(fdm_backend, x -> fjac(x, yvec), xvec)
+    jac2 = FDM.jacobian(fdm_backend.alg, x -> fjac(x, yvec), xvec)
+    lazyjac = AD.LazyJacobian(fdm_backend, x -> fjac(x, yvec), xvec)
+
+    # multiplication with scalar
+    @test norm.(jac1 .* yscalar .- jac2 .* yscalar) == (0,)
+    @test norm.(lazyjac * yscalar .- jac1 .* yscalar) == (0,)
+    @test lazyjac * yscalar isa Tuple
+
+    @test norm.(yscalar .* jac1 .- yscalar .* jac2) == (0,)
+    @test norm.(yscalar * lazyjac .- yscalar .* jac1) == (0,)
+    @test yscalar * lazyjac isa Tuple
+
+    w = adjoint(rand(length(fjac(xvec, yvec))))
+    v = (rand(length(xvec)), rand(length(xvec)))
+
+    # vjp
+    pb1 = FDM.j′vp(fdm_backend.alg, x -> fjac(x, yvec), w, xvec)
+    res = w * lazyjac
+    @test minimum(isapprox.(pb1, res, atol=1e-10))
+    @test res isa Tuple
+
+    # jvp
+    pf1 = (FDM.jvp(fdm_backend.alg, x -> fjac(x, yvec), (xvec, v[1])),)
+    res = lazyjac * v[1]
+    @test minimum(isapprox.(pf1, res, atol=1e-10))
+    @test res isa Tuple
+
+    # two-input function
+    jac1 = AD.jacobian(fdm_backend, fjac, xvec, yvec)
+    jac2 = FDM.jacobian(fdm_backend.alg, fjac, xvec, yvec)
+    lazyjac = AD.LazyJacobian(fdm_backend, fjac, (xvec, yvec))
+
+    # multiplication with scalar
+    @test norm.(jac1 .* yscalar .- jac2 .* yscalar) == (0, 0)
+    @test norm.(lazyjac * yscalar .- jac1 .* yscalar) == (0, 0)
+    @test lazyjac * yscalar isa Tuple
+
+    @test norm.(yscalar .* jac1 .- yscalar .* jac2) == (0, 0)
+    @test norm.(yscalar * lazyjac .- yscalar .* jac1) == (0, 0)
+    @test yscalar * lazyjac isa Tuple
+
+    # vjp
+    pb1 = FDM.j′vp(fdm_backend.alg, fjac, w, xvec, yvec)
+    res = w * lazyjac
+    @test minimum(isapprox.(pb1, res, atol=1e-10))
+    @test res isa Tuple
+
+    # jvp
+    pf1 = (
+        FDM.jvp(fdm_backend.alg, x -> fjac(x, yvec), (xvec, v[1])),
+        FDM.jvp(fdm_backend.alg, y -> fjac(xvec, y), (yvec, v[2])),
+    )
+
+    if fdm_backend isa FDMBackend2 # needs an augmented version of v
+        identity_like = AD.identity_matrix_like(v)
+        vaug = map(identity_like) do identity_like_i
+            identity_like_i .* v
+        end
+
+        res = map(v -> (lazyjac * v)[1], vaug)
+    else
+        res = lazyjac * v
+    end
+
+    @test minimum(isapprox.(pf1, res, atol=1e-10))
+    @test res isa Tuple
+end
+
+function test_fdm_lazy_hessians(fdm_backend)
+    # single-input function
+    fhess = x -> fgrad(x, yvec)
+    hess1 = (dfgraddxdx(xvec, yvec),)
+    lazyhess = AD.LazyHessian(fdm_backend, x -> fgrad(x, yvec), xvec)
+
+    # multiplication with scalar from the right
+    @test minimum(isapprox.(lazyhess * yscalar, hess1 .* yscalar, atol=1e-10))
+    @test lazyhess * yscalar isa Tuple
+
+    # multiplication with scalar from the left
+    @test minimum(isapprox.(yscalar * lazyhess, yscalar .* hess1, atol=1e-10))
+    @test yscalar * lazyhess isa Tuple
+
+    w = adjoint(rand(length(xvec)))
+    v = rand(length(xvec))
+
+    # Hvp
+    Hv = map(h -> h * v, hess1)
+    res = lazyhess * v
+    @test minimum(isapprox.(Hv, res, atol=1e-10))
+    @test res isa Tuple
+
+    # H′vp
+    wH = map(h -> h' * adjoint(w), hess1)
+    res = w * lazyhess
+    @test minimum(isapprox.(wH, res, atol=1e-10))
+    @test res isa Tuple
 end
 
 @testset "AbstractDifferentiation.jl" begin
@@ -139,29 +441,43 @@ end
         end
         @testset "Jacobian" begin
            test_fdm_jacobians(fdm_backend1)
            test_fdm_jacobians(fdm_backend2)
-            # Errors
            test_fdm_jacobians(fdm_backend3)
        end
        @testset "Hessian" begin
            # Works but super slow
            test_fdm_hessians(fdm_backend1)
-            # Errors
            test_fdm_hessians(fdm_backend2)
-            # Errors
            test_fdm_hessians(fdm_backend3)
        end
        @testset "jvp" begin
            test_fdm_jvp(fdm_backend1)
-            # Errors
            test_fdm_jvp(fdm_backend2)
-            # Errors
            test_fdm_jvp(fdm_backend3)
        end
        @testset "j′vp" begin
            test_fdm_j′vp(fdm_backend1)
            test_fdm_j′vp(fdm_backend2)
-            # Errors
            test_fdm_j′vp(fdm_backend3)
        end
+        @testset "Lazy Derivative" begin
+            test_fdm_lazy_derivatives(fdm_backend1)
+            test_fdm_lazy_derivatives(fdm_backend2)
+            test_fdm_lazy_derivatives(fdm_backend3)
+        end
+        @testset "Lazy Gradient" begin
+            test_fdm_lazy_gradients(fdm_backend1)
+            test_fdm_lazy_gradients(fdm_backend2)
+            test_fdm_lazy_gradients(fdm_backend3)
+        end
+        @testset "Lazy Jacobian" begin
+            test_fdm_lazy_jacobians(fdm_backend1)
+            test_fdm_lazy_jacobians(fdm_backend2)
+            test_fdm_lazy_jacobians(fdm_backend3)
+        end
+        @testset "Lazy Hessian" begin
+            test_fdm_lazy_hessians(fdm_backend1)
+            test_fdm_lazy_hessians(fdm_backend2)
+            test_fdm_lazy_hessians(fdm_backend3)
+        end
    end
end
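
For reviewers, a minimal sketch of how the API touched by this patch fits together end to end. It assumes a finite-difference backend wired up via `AD.@primitive jacobian`, mirroring how `test/runtests.jl` defines its first backend; the `FDMBackend` struct and the function `f` below are illustrative only and not part of the patch.

```julia
using AbstractDifferentiation, FiniteDifferences
const AD = AbstractDifferentiation

# Illustrative backend (assumption): declared the same way as the
# jacobian-primitive backend used throughout the test suite.
struct FDMBackend{A} <: AD.AbstractFiniteDifference
    alg::A
end
FDMBackend() = FDMBackend(central_fdm(5, 1))
AD.@primitive function jacobian(ab::FDMBackend, f, xs...)
    return FiniteDifferences.jacobian(ab.alg, f, xs...)
end

backend = FDMBackend()
f(x, y) = prod(x) + sum(y ./ (1:length(y)))
x, y = rand(5), rand(5)

# Eager API: one result per input, always returned as a tuple.
gx, gy = AD.gradient(backend, f, x, y)    # gradients reshaped like x and y
Hx, Hy = AD.hessian(backend, f, x, y)     # per-input blocks ∂x∂x f and ∂y∂y f
val, (Jx, Jy) = AD.value_and_jacobian(backend, f, x, y)

# Lazy wrappers defer all differentiation until a multiplication happens.
lazy_grad = AD.LazyGradient(backend, f, (x, y))
2.0 * lazy_grad                           # == 2.0 .* AD.gradient(backend, f, x, y)

lazy_hess = AD.LazyHessian(backend, z -> f(z, y), x)  # single input: no tuple
v = rand(5)
lazy_hess * v                             # Hessian-vector product (∂x∂x f) * v
```

The lazy types never materialize a full derivative unless the multiplication requires it: scalar multiplication falls back to the eager operator, while `lazy_hess * v` and `w * lazy_jac` route through the pushforward and pullback functions, which is what the new `Lazy ...` test sets exercise against `FiniteDifferences` reference values.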