Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add allunique and improve nonunique and describe #3232

Merged
merged 6 commits into from
Dec 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

* Add `Iterators.partition` support
([#3212](https://github.com/JuliaData/DataFrames.jl/pull/3212))
* Add `allunique` and allow transformations in `cols` argument of `describe`
and `nonunique` when working with `SubDataFrame`
([#3232](https://github.com/JuliaData/DataFrames.jl/pull/3232))

# DataFrames.jl v1.4.4 Patch Release Notes

Expand Down
1 change: 1 addition & 0 deletions docs/src/lib/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ valuecols

## Filtering rows
```@docs
allunique
deleteat!
empty
empty!
Expand Down
54 changes: 48 additions & 6 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -449,8 +449,8 @@ $METADATA_FIXED
"""
function Base.similar(df::AbstractDataFrame, rows::Integer = size(df, 1))
    # Reject a negative row count before any column is allocated.
    rows < 0 && throw(ArgumentError("the number of rows must be non-negative"))
    # Build the result exactly once: each column is `similar` to the matching
    # column of `df` but has `rows` elements. The index is copied so that the
    # result does not share it with `df`, and `copycols=false` makes the
    # constructor use the freshly allocated columns directly.
    out_df = DataFrame(AbstractVector[similar(x, rows) for x in eachcol(df)],
                       copy(index(df)), copycols=false)
    _copy_all_note_metadata!(out_df, df)
    return out_df
end
Expand Down Expand Up @@ -565,7 +565,6 @@ $METADATA_FIXED
@inline function Base.last(df::AbstractDataFrame, n::Integer; view::Bool=false)
    # Row range covering the trailing `n` rows; the lower bound is clamped to 1
    # so that asking for more rows than `df` has selects every row.
    tail = max(1, nrow(df) - n + 1):nrow(df)
    # The keyword argument shadows the `view` function, hence the explicit
    # `Base.view` qualification.
    if view
        return Base.view(df, tail, :)
    else
        return df[tail, :]
    end
end


"""
describe(df::AbstractDataFrame; cols=:)
describe(df::AbstractDataFrame, stats::Union{Symbol, Pair}...; cols=:)
Expand Down Expand Up @@ -656,10 +655,10 @@ julia> describe(df, :min, sum => :sum, cols=:x)
DataAPI.describe(df::AbstractDataFrame,
                 stats::Union{Symbol, Pair{<:Base.Callable, <:SymbolOrString}}...;
                 cols=:) =
    # `cols` is resolved through `_try_select_no_copy` and the requested
    # statistics are collected into a `Vector{Any}` before being handed to
    # `_describe`.
    _describe(_try_select_no_copy(df, cols), Any[s for s in stats])

DataAPI.describe(df::AbstractDataFrame; cols=:) =
    # No statistics requested explicitly: fall back to the default set listed
    # below, computed on the columns chosen by `cols`.
    _describe(_try_select_no_copy(df, cols),
              Any[:mean, :min, :median, :max, :nmissing, :eltype])

function _describe(df::AbstractDataFrame, stats::AbstractVector)
Expand Down Expand Up @@ -1422,7 +1421,7 @@ function nonunique(df::AbstractDataFrame)
end

function nonunique(df::AbstractDataFrame, cols)
udf = select(df, cols, copycols=false)
udf = _try_select_no_copy(df, cols)
if ncol(df) > 0 && ncol(udf) == 0
throw(ArgumentError("finding duplicate rows in data frame when " *
"`cols` selects no columns is not allowed"))
Expand All @@ -1431,6 +1430,49 @@ function nonunique(df::AbstractDataFrame, cols)
end
end

"""
allunique(df::AbstractDataFrame, cols=:)

Return `true` if all rows of `df` are not duplicated. Two rows are duplicate if
all their columns contain equal values (according to `isequal`).

See also [`unique`](@ref) and [`nonunique`](@ref).

# Arguments
- `df` : `AbstractDataFrame`
- `cols` : a selector specifying the column(s) or their transformations to compare.
Can be any column selector or transformation accepted by [`select`](@ref).

# Examples

```jldoctest
julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
4×2 DataFrame
Row │ i x
│ Int64 Int64
─────┼──────────────
1 │ 1 1
2 │ 2 2
3 │ 3 1
4 │ 4 2

julia> allunique(df)
true

julia> allunique(df, :x)
false

julia> allunique(df, :i => ByRow(isodd))
false
```
"""
function Base.allunique(df::AbstractDataFrame, cols=:)
udf = _try_select_no_copy(df, cols)
nrow(udf) == 0 && return true
return row_group_slots(ntuple(i -> udf[!, i], ncol(udf)),
Val(false), nothing, false, nothing)[1] == nrow(df)
end

"""
unique(df::AbstractDataFrame; view::Bool=false)
unique(df::AbstractDataFrame, cols; view::Bool=false)
Expand Down
3 changes: 3 additions & 0 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1546,3 +1546,6 @@ function allcombinations(::Type{DataFrame}, pairs::Pair{Symbol, <:Any}...)
@assert size(out_df) == (target_rows, length(colnames))
return out_df
end

# `DataFrame` method of `_try_select_no_copy`: forwards `cols` to `select`,
# asking it not to copy columns (`copycols=false`).
function _try_select_no_copy(df::DataFrame, cols)
    return select(df, cols, copycols=false)
end

15 changes: 15 additions & 0 deletions src/subdataframe/subdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -372,3 +372,18 @@ function _replace_columns!(sdf::SubDataFrame, newdf::DataFrame; keep_present::Bo

return sdf
end

# Select `cols` from `sdf`, avoiding a copy of the data whenever possible.
# When `cols` can be resolved against the index of `sdf` (a simple column
# selector) the selection is done with `copycols=false`; for anything else
# `select` is called with its default settings, which requires copying.
function _try_select_no_copy(sdf::SubDataFrame, cols)
    # `cols` may be, for instance, an AbstractVector, and then there is no way
    # to check statically whether it is a valid column selector; the lookup is
    # therefore attempted and a failure is treated as "not a simple selector"
    resolved = try
        index(sdf)[cols]
    catch
        nothing
    end

    if isnothing(resolved)
        return select(sdf, cols)
    else
        return select(sdf, resolved, copycols=false)
    end
end

28 changes: 28 additions & 0 deletions test/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2254,6 +2254,34 @@ end
@test !isempty(DataFrame(a=1))
end

@testset "allunique" begin
    source = DataFrame(a=[1, 1, 2, 2, 3], b=[1, 2, 1, 2, 3], c=[1, 2, 1, 2, 3])
    # Only the first four rows of columns :a and :b are tested, once as a
    # materialized data frame and once as a view of `source`.
    materialized = source[1:4, 1:2]
    viewed = @view source[1:4, 1:2]
    for df in (materialized, viewed)
        # the (:a, :b) pairs are all distinct ...
        @test allunique(df)
        @test allunique(df, All())
        # ... while each column taken alone contains repeated values
        @test !allunique(df, 1)
        @test !allunique(df, :b)
        # selecting no columns yields `true`
        @test allunique(df, [])
        # transformations are accepted in place of plain column selectors
        @test allunique(df, x -> 1:4)
        @test allunique(df, [:a, :b] => ByRow(string))
    end
end

@testset "extra tests describe, nonunique, allunique for SubDataFrame" begin
    source = DataFrame(a=[1, 1, 2, 2, 3], b=[1, 2, 1, 2, 3], c=[1, 2, 1, 2, 3])
    sdf = view(source, 1:4, 1:2)

    # describe: `cols` given as a transformation of a view
    expected_string_stats = DataFrame(variable=:a_string, mean=nothing, min="1",
                                      median=nothing, max="2", nmissing=0,
                                      eltype=String)
    @test describe(sdf, cols=:a => ByRow(string)) == expected_string_stats
    expected_minmax = DataFrame(variable=:x, min=11, max=14)
    @test describe(sdf, :min, :max, cols=x -> DataFrame(x=11:14)) == expected_minmax

    # nonunique: `cols` given as a transformation of a view
    @test nonunique(sdf, x->[1, 1, 2, 2]) == [false, true, false, true]
    @test nonunique(sdf, :a => x -> true) == [false, true, true, true]

    # allunique: `cols` given as a transformation of a view
    @test !allunique(sdf, x -> [1, 1, 2, 2])
    @test allunique(sdf, :a => x -> 1:4)
    @test !allunique(sdf, :a => x -> true)
end

@testset "Iterators.partition" begin
for df in (DataFrame(x=1:5), view(DataFrame(x=1:6, y=11:16), 1:5, 1:1))
p = Iterators.partition(df, 2)
Expand Down