JuliaData · bkamins · Dec 2, 2022 · Nov 25, 2022 · Nov 25, 2022 · Nov 28, 2022
diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,9 @@
 
 * Add `Iterators.partition` support
    ([#3212](https://github.com/JuliaData/DataFrames.jl/pull/3212))
+* Add `allunique` and allow transformations in `cols` argument of `describe`
+  and `nonunique` when working with `SubDataFrame`
+  ([#3232](https://github.com/JuliaData/DataFrames.jl/pull/3232))
 
 # DataFrames.jl v1.4.4 Patch Release Notes
 

diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
@@ -146,6 +146,7 @@ valuecols
 
 ## Filtering rows
 ```@docs
+allunique
 deleteat!
 empty
 empty!

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -449,8 +449,8 @@ $METADATA_FIXED
 """
 function Base.similar(df::AbstractDataFrame, rows::Integer = size(df, 1))
     rows < 0 && throw(ArgumentError("the number of rows must be non-negative"))
-    out_df = DataFrame(AbstractVector[similar(x, rows) for x in eachcol(df)], copy(index(df)),
-                       copycols=false)
+    out_df = DataFrame(AbstractVector[similar(x, rows) for x in eachcol(df)],
+                       copy(index(df)), copycols=false)
     _copy_all_note_metadata!(out_df, df)
     return out_df
 end
@@ -565,7 +565,6 @@ $METADATA_FIXED
 @inline Base.last(df::AbstractDataFrame, n::Integer; view::Bool=false) =
     view ? Base.view(df, max(1, nrow(df)-n+1):nrow(df), :) : df[max(1, nrow(df)-n+1):nrow(df), :]
 
-
 """
     describe(df::AbstractDataFrame; cols=:)
     describe(df::AbstractDataFrame, stats::Union{Symbol, Pair}...; cols=:)
@@ -656,10 +655,10 @@ julia> describe(df, :min, sum => :sum, cols=:x)
 DataAPI.describe(df::AbstractDataFrame,
                  stats::Union{Symbol, Pair{<:Base.Callable, <:SymbolOrString}}...;
                  cols=:) =
-    _describe(select(df, cols, copycols=false), Any[s for s in stats])
+    _describe(_try_select_no_copy(df, cols), Any[s for s in stats])
 
 DataAPI.describe(df::AbstractDataFrame; cols=:) =
-    _describe(select(df, cols, copycols=false),
+    _describe(_try_select_no_copy(df, cols),
               Any[:mean, :min, :median, :max, :nmissing, :eltype])
 
 function _describe(df::AbstractDataFrame, stats::AbstractVector)
@@ -1422,7 +1421,7 @@ function nonunique(df::AbstractDataFrame)
 end
 
 function nonunique(df::AbstractDataFrame, cols)
-    udf = select(df, cols, copycols=false)
+    udf = _try_select_no_copy(df, cols)
     if ncol(df) > 0 && ncol(udf) == 0
          throw(ArgumentError("finding duplicate rows in data frame when " *
                              "`cols` selects no columns is not allowed"))
@@ -1431,6 +1430,49 @@ function nonunique(df::AbstractDataFrame, cols)
     end
 end
 
+"""
+    allunique(df::AbstractDataFrame, cols=:)
+
+Return `true` if no two rows of `df` are duplicates of each other. Two rows are
+duplicates if all columns selected by `cols` hold equal values (per `isequal`).
+
+See also [`unique`](@ref) and [`nonunique`](@ref).
+
+# Arguments
+- `df` : `AbstractDataFrame`
+- `cols` : a selector specifying the column(s) or their transformations to compare.
+  Can be any column selector or transformation accepted by [`select`](@ref).
+
+# Examples
+
+```jldoctest
+julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
+4×2 DataFrame
+ Row │ i      x
+     │ Int64  Int64
+─────┼──────────────
+   1 │     1      1
+   2 │     2      2
+   3 │     3      1
+   4 │     4      2
+
+julia> allunique(df)
+true
+
+julia> allunique(df, :x)
+false
+
+julia> allunique(df, :i => ByRow(isodd))
+false
+```
+"""
+function Base.allunique(df::AbstractDataFrame, cols=:)
+    udf = _try_select_no_copy(df, cols)
+    nrow(udf) == 0 && return true # no rows left to compare
+    return row_group_slots(ntuple(i -> udf[!, i], ncol(udf)), Val(false),
+                           nothing, false, nothing)[1] == nrow(udf) # one group per row
+end
+
 """
     unique(df::AbstractDataFrame; view::Bool=false)
     unique(df::AbstractDataFrame, cols; view::Bool=false)

diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
@@ -1546,3 +1546,6 @@ function allcombinations(::Type{DataFrame}, pairs::Pair{Symbol, <:Any}...)
     @assert size(out_df) == (target_rows, length(colnames))
     return out_df
 end
+
+# DataFrame method: defer to `select`, asking it not to copy columns (`copycols=false`)
+_try_select_no_copy(df::DataFrame, cols) = select(df, cols, copycols=false)
diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl
@@ -372,3 +372,18 @@ function _replace_columns!(sdf::SubDataFrame, newdf::DataFrame; keep_present::Bo
 
     return sdf
 end
+
+# _try_select_no_copy picks `cols` out of a data frame, avoiding a copy when it can;
+# a SubDataFrame has to be copied unless `cols` is a plain column selector
+function _try_select_no_copy(sdf::SubDataFrame, cols)
+    # `cols` may be e.g. an AbstractVector, so its validity as a column selector
+    # cannot be checked statically; a failed index lookup is signalled by `nothing`
+    resolved = try
+        index(sdf)[cols]
+    catch
+        nothing
+    end
+    resolved === nothing && return select(sdf, cols)
+    return select(sdf, resolved, copycols=false)
+end
+
diff --git a/test/dataframe.jl b/test/dataframe.jl
@@ -2254,6 +2254,34 @@ end
     @test !isempty(DataFrame(a=1))
 end
 
+@testset "allunique" begin
+    src = DataFrame(a=[1, 1, 2, 2, 3], b=[1, 2, 1, 2, 3], c=[1, 2, 1, 2, 3])
+    for tbl in (src[1:4, 1:2], @view(src[1:4, 1:2])) # DataFrame and SubDataFrame
+        @test allunique(tbl)
+        @test allunique(tbl, All())
+        @test allunique(tbl, [])
+        @test allunique(tbl, x -> 1:4)
+        @test allunique(tbl, [:a, :b] => ByRow(string))
+        @test !allunique(tbl, 1)
+        @test !allunique(tbl, :b)
+    end
+end
+
+@testset "extra tests describe, nonunique, allunique for SubDataFrame" begin
+    parentdf = DataFrame(a=[1, 1, 2, 2, 3], b=[1, 2, 1, 2, 3], c=[1, 2, 1, 2, 3])
+    sdf = view(parentdf, 1:4, 1:2) # SubDataFrame over the first four rows
+    expected = DataFrame(variable=:a_string, mean=nothing, min="1",
+                         median=nothing, max="2", nmissing=0, eltype=String)
+    @test describe(sdf, cols=:a => ByRow(string)) == expected
+    @test describe(sdf, :min, :max, cols=x -> DataFrame(x=11:14)) ==
+          DataFrame(variable=:x, min=11, max=14)
+    @test nonunique(sdf, x -> [1, 1, 2, 2]) == [false, true, false, true]
+    @test nonunique(sdf, :a => x -> true) == [false, true, true, true]
+    @test allunique(sdf, :a => x -> 1:4)
+    @test !allunique(sdf, x -> [1, 1, 2, 2])
+    @test !allunique(sdf, :a => x -> true)
+end
+
 @testset "Iterators.partition" begin
     for df in (DataFrame(x=1:5), view(DataFrame(x=1:6, y=11:16), 1:5, 1:1))
         p = Iterators.partition(df, 2)