Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add reverse!, shuffle, shuffle!, permute!, and invpermute! #3010

Merged
merged 12 commits into from
Feb 20, 2022
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
* Add special syntax for `eachindex`, `groupindices`, and `proprow`
to transformation mini-language
([#3001](https://github.com/JuliaData/DataFrames.jl/pull/3001)).
* Add support for `reverse!`, `permute!`, `invpermute!`, `shuffle`,
and `shuffle!` functions. Improve functionality of `reverse`.
([#3010](https://github.com/JuliaData/DataFrames.jl/pull/3010)).
* `first` and `last` for `GroupedDataFrame` now support passing number of elements to get
([#3006](https://github.com/JuliaData/DataFrames.jl/issues/3006))

Expand Down
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SortingAlgorithms = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
Expand Down Expand Up @@ -47,13 +48,12 @@ DataValues = "e7dc6d0d-1eca-5fa6-8ad6-5aecde8b7ea5"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
ShiftedArrays = "1277b4bf-5013-50f5-be3d-901d8477a67a"

[targets]
test = ["CategoricalArrays", "Combinatorics", "DataStructures", "DataValues",
"Dates", "Logging", "OffsetArrays", "Random", "Test", "Unitful",
"Dates", "Logging", "OffsetArrays", "Test", "Unitful",
"ShiftedArrays", "SparseArrays"]
5 changes: 5 additions & 0 deletions docs/src/lib/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,20 @@ combine
flatten
hcat
insertcols!
invpermute!
mapcols
mapcols!
permute!
push!
reduce
repeat
repeat!
reverse
reverse!
select
select!
shuffle
shuffle!
transform
transform!
vcat
Expand Down
1 change: 1 addition & 0 deletions src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ using TableTraits, IteratorInterfaceExtensions
import LinearAlgebra: norm
using Markdown
using PrettyTables
using Random

import DataAPI,
DataAPI.All,
Expand Down
256 changes: 254 additions & 2 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2261,9 +2261,10 @@ Base.setindex!(::AbstractDataFrame, ::Any, ::Union{Symbol, Integer, AbstractStri
throw(ArgumentError("syntax df[column] is not supported use df[!, column] instead"))

"""
reverse(df::AbstractDataFrame, start=1, stop=nrow(df))

Return a data frame containing the rows in `df` in reversed order.
If `start` and `stop` are provided, only rows in the `start:stop` range are affected.

# Examples

Expand All @@ -2289,6 +2290,257 @@ julia> reverse(df)
3 │ 3 8 13
4 │ 2 7 12
5 │ 1 6 11

julia> reverse(df, 2, 3)
5×3 DataFrame
Row │ a b c
│ Int64 Int64 Int64
─────┼─────────────────────
1 │ 1 6 11
2 │ 3 8 13
3 │ 2 7 12
4 │ 4 9 14
5 │ 5 10 15
```
"""
function Base.reverse(df::AbstractDataFrame, start::Integer=1, stop::Integer=nrow(df))
    # The defaults `start=1` and `stop=nrow(df)` make `reverse(df)` flip every
    # row, so this one definition also serves as the one-argument method; a
    # separate `reverse(df)` definition would only be overwritten by it.
    # Each column is reversed independently over the same row range and the
    # results are collected by `mapcols`; `df` itself is not modified.
    return mapcols(col -> reverse(col, start, stop), df)
end

"""
reverse!(df::AbstractDataFrame, start=1, stop=nrow(df))

Mutate data frame in-place to reverse its row order.
If `start` and `stop` are provided, only rows in the `start:stop` range are affected.

`reverse!` will produce a correct result even if some columns of passed data frame
are identical (checked with `===`). Otherwise, if two columns share some part of
memory but are not identical (e.g. are different views of the same parent
vector) then `reverse!` result might be incorrect.

# Examples

```jldoctest
julia> df = DataFrame(a=1:5, b=6:10, c=11:15)
5×3 DataFrame
Row │ a b c
│ Int64 Int64 Int64
─────┼─────────────────────
1 │ 1 6 11
2 │ 2 7 12
3 │ 3 8 13
4 │ 4 9 14
5 │ 5 10 15

julia> reverse!(df)
5×3 DataFrame
Row │ a b c
│ Int64 Int64 Int64
─────┼─────────────────────
1 │ 5 10 15
2 │ 4 9 14
3 │ 3 8 13
4 │ 2 7 12
5 │ 1 6 11

julia> reverse!(df, 2, 3)
5×3 DataFrame
Row │ a b c
│ Int64 Int64 Int64
─────┼─────────────────────
1 │ 5 10 15
2 │ 3 8 13
3 │ 4 9 14
4 │ 2 7 12
5 │ 1 6 11
```
"""
function Base.reverse!(df::AbstractDataFrame, start::Integer=1, stop::Integer=nrow(df))
    # Columns are tracked by identity (`IdDict` compares keys with `===`), so a
    # vector stored under several column names is reversed exactly once;
    # reversing it a second time would undo the operation for that column.
    visited = IdDict{Any, Nothing}()
    for col in eachcol(df)
        haskey(visited, col) && continue
        visited[col] = nothing
        reverse!(col, start, stop)
    end
    return df
end

# Apply `fun` (either `Base.permute!!` or `Base.invpermute!!`) with permutation
# `p` to every distinct column of `df` in place, and return `df`.
#
# Throws `DimensionMismatch` if `length(p)` differs from `nrow(df)`. Whether `p`
# is an actual permutation of the row indices is not verified.
function _permutation_helper!(fun::Union{typeof(Base.permute!!), typeof(Base.invpermute!!)},
                              df::AbstractDataFrame, p::AbstractVector{<:Integer})
    # Reject a permutation of the wrong length before touching any column, so a
    # bad `p` cannot leave `df` with only part of its data reordered.
    if length(p) != nrow(df)
        throw(DimensionMismatch("permutation has length $(length(p)) but the " *
                                "data frame has $(nrow(df)) rows"))
    end

    cols = eachcol(df)

    # `p` might itself be a column of `df`; permuting that column would change
    # the permutation while it is still needed, so work from a private copy in
    # that case. A new binding is used instead of reassigning `p`.
    perm = any(col -> col === p, cols) ? copy(p) : p

    # The `!!` functions from Base overwrite the permutation vector they are
    # handed, so every column gets a freshly refilled scratch buffer.
    scratch = similar(perm)

    # Columns are tracked by identity (`===`) so that a vector stored under
    # several column names is permuted exactly once.
    seen_cols = IdDict{Any, Nothing}()
    for col in cols
        haskey(seen_cols, col) && continue
        seen_cols[col] = nothing
        copyto!(scratch, perm)
        fun(col, scratch)
    end
    return df
end

"""
permute!(df::AbstractDataFrame, p)

Permute data frame `df` in-place, according to permutation `p`.
No checking is done to verify that `p` is a permutation.

To return a new data frame instead of permuting `df` in-place, use `df[p, :]`.
Note that this is generally faster than `permute!(df, p)` for large data frames.

`permute!` will produce a correct result even if some columns of passed data frame
or permutation `p` are identical (checked with `===`). Otherwise, if two columns share
some part of memory but are not identical (e.g. are different views of the same parent
vector) then `permute!` result might be incorrect.

# Examples

```jldoctest
julia> df = DataFrame(a=1:5, b=6:10, c=11:15)
5×3 DataFrame
Row │ a b c
│ Int64 Int64 Int64
─────┼─────────────────────
1 │ 1 6 11
2 │ 2 7 12
3 │ 3 8 13
4 │ 4 9 14
5 │ 5 10 15

julia> permute!(df, [5, 3, 1, 2, 4])
5×3 DataFrame
Row │ a b c
│ Int64 Int64 Int64
─────┼─────────────────────
1 │ 5 10 15
2 │ 3 8 13
3 │ 1 6 11
4 │ 2 7 12
5 │ 4 9 14
```
"""
function Base.permute!(df::AbstractDataFrame, p::AbstractVector{<:Integer})
    # All column handling lives in the shared helper; this method only selects
    # the forward permutation kernel.
    return _permutation_helper!(Base.permute!!, df, p)
end

"""
invpermute!(df::AbstractDataFrame, p)

Like [`permute!`](@ref), but the inverse of the given permutation is applied.

`invpermute!` will produce a correct result even if some columns of passed data
frame or permutation `p` are identical (checked with `===`). Otherwise, if two
columns share some part of memory but are not identical (e.g. are different views
of the same parent vector) then `invpermute!` result might be incorrect.

# Examples

```jldoctest
julia> df = DataFrame(a=1:5, b=6:10, c=11:15)
5×3 DataFrame
Row │ a b c
│ Int64 Int64 Int64
─────┼─────────────────────
1 │ 1 6 11
2 │ 2 7 12
3 │ 3 8 13
4 │ 4 9 14
5 │ 5 10 15

julia> permute!(df, [5, 3, 1, 2, 4])
5×3 DataFrame
Row │ a b c
│ Int64 Int64 Int64
─────┼─────────────────────
1 │ 5 10 15
2 │ 3 8 13
3 │ 1 6 11
4 │ 2 7 12
5 │ 4 9 14

julia> invpermute!(df, [5, 3, 1, 2, 4])
5×3 DataFrame
Row │ a b c
│ Int64 Int64 Int64
─────┼─────────────────────
1 │ 1 6 11
2 │ 2 7 12
3 │ 3 8 13
4 │ 4 9 14
5 │ 5 10 15
```
"""
function Base.invpermute!(df::AbstractDataFrame, p::AbstractVector{<:Integer})
    # Same shared helper as `permute!`, but with the inverse permutation kernel.
    return _permutation_helper!(Base.invpermute!!, df, p)
end

"""
shuffle([rng=GLOBAL_RNG,] df::AbstractDataFrame)

Return a copy of `df` with randomly permuted rows.
The optional `rng` argument specifies a random number generator.

# Examples

julia> rng = MersenneTwister(1234);

julia> shuffle(rng, DataFrame(a=1:5, b=1:5))
5×2 DataFrame
Row │ a b
│ Int64 Int64
─────┼──────────────
1 │ 2 2
2 │ 1 1
3 │ 4 4
4 │ 3 3
5 │ 5 5
"""
function Random.shuffle(df::AbstractDataFrame)
    # No generator is passed, so `randperm` falls back to its own default.
    rowperm = randperm(nrow(df))
    # Row indexing with `:` for columns builds a new data frame; `df` is untouched.
    return df[rowperm, :]
end

function Random.shuffle(r::AbstractRNG, df::AbstractDataFrame)
    # Draw the row order from the caller-supplied generator `r`.
    rowperm = randperm(r, nrow(df))
    return df[rowperm, :]
end

"""
shuffle!([rng=GLOBAL_RNG,] df::AbstractDataFrame)

Randomly permute rows of `df` in-place.
The optional `rng` argument specifies a random number generator.

`shuffle!` will produce a correct result even if some columns of passed data frame
are identical (checked with `===`). Otherwise, if two columns share some part of
memory but are not identical (e.g. are different views of the same parent
vector) then `shuffle!` result might be incorrect.

# Examples

julia> rng = MersenneTwister(1234);

julia> shuffle!(rng, DataFrame(a=1:5, b=1:5))
5×2 DataFrame
Row │ a b
│ Int64 Int64
─────┼──────────────
1 │ 2 2
2 │ 1 1
3 │ 4 4
4 │ 3 3
5 │ 5 5
"""
function Random.shuffle!(df::AbstractDataFrame)
    # No generator is passed, so `randperm` falls back to its own default.
    rowperm = randperm(nrow(df))
    # `permute!` reorders the rows of `df` in place and returns `df`.
    return permute!(df, rowperm)
end

function Random.shuffle!(r::AbstractRNG, df::AbstractDataFrame)
    # Draw the row order from the caller-supplied generator `r`.
    rowperm = randperm(r, nrow(df))
    return permute!(df, rowperm)
end
24 changes: 2 additions & 22 deletions src/abstractdataframe/sort.jl
Original file line number Diff line number Diff line change
Expand Up @@ -682,25 +682,5 @@ function Base.sort!(df::AbstractDataFrame, cols=All();
return sort!(df, _alg, ord)
end

function Base.sort!(df::AbstractDataFrame, a::Base.Sort.Algorithm, o::Base.Sort.Ordering)
    # Compute the sorting permutation once and let `permute!` apply it to the
    # columns in place. `permute!` for data frames already skips columns that
    # are identical (`===`) to one it has processed, so that bookkeeping does
    # not need to be repeated here.
    return permute!(df, _sortperm(df, a, o))
end
Loading