Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add filter and filter! to GroupedDataFrame #2279

Merged
merged 8 commits into from
Jun 24, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -960,7 +960,6 @@ Base.filter((cols, f)::Pair{<:AbstractVector{Symbol}}, df::AbstractDataFrame) =
filter([index(df)[col] for col in cols] => f, df)
# Selector given as a vector of strings: look each name up in `index(df)` and
# re-dispatch `filter` on the vector of looked-up values.
function Base.filter((cols, f)::Pair{<:AbstractVector{<:AbstractString}},
                     df::AbstractDataFrame)
    positions = [index(df)[name] for name in cols]
    return filter(positions => f, df)
end

# Fallback for any other `cols => f` pair: resolve `cols` through `index(df)`
# and re-dispatch `filter` on whatever that lookup returns.
function Base.filter((cols, f)::Pair, df::AbstractDataFrame)
    resolved = index(df)[cols]
    return filter(resolved => f, df)
end

Expand All @@ -977,11 +976,11 @@ function _filter_helper(df::AbstractDataFrame, f, cols...)
end

# `AsTable(cols) => f` selector on a data frame: select the wrapped columns and
# hand a `Tables.namedtupleiterator` over them to `_filter_helper_astable`.
# Throws `ArgumentError` when the selection contains no columns.
function Base.filter((cols, f)::Pair{<:AsTable}, df::AbstractDataFrame)
    # The selection is only iterated below, so `copycols=false` is requested.
    df_tmp = select(df, cols.cols, copycols=false)
    if ncol(df_tmp) == 0
        throw(ArgumentError("At least one column must be passed to filter on"))
    end
    return _filter_helper_astable(df, Tables.namedtupleiterator(df_tmp), f)
end

_filter_helper_astable(df::AbstractDataFrame, nti::Tables.NamedTupleIterator, f) =
Expand Down
174 changes: 174 additions & 0 deletions src/groupeddataframe/groupeddataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -568,3 +568,177 @@ function Base.get(gd::GroupedDataFrame, key::Union{Tuple, NamedTuple}, default)
return default
end
end

"""
    filter(function, gdf::GroupedDataFrame)
    filter(cols => function, gdf::GroupedDataFrame)

Return a new `GroupedDataFrame` containing only groups for which `function`
returns `true`.

If `cols` is not specified then the function is passed `SubDataFrame`s.

If `cols` is specified then the function is passed views of the corresponding
columns as separate positional arguments, unless `cols` is an `AsTable` selector,
in which case a `NamedTuple` of these arguments is passed.
`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR),
and column duplicates are allowed if a vector of `Symbol`s, strings, or integers
is passed.

`function` must return a `Bool` for every group; any other return value
results in a `TypeError`.

See also: [`filter!`](@ref)

# Examples
```
julia> df = DataFrame(g=1:2, x='a':'b');

julia> gd = groupby(df, :g)
GroupedDataFrame with 2 groups based on key: g
First Group (1 row): g = 1
│ Row │ g     │ x    │
│     │ Int64 │ Char │
├─────┼───────┼──────┤
│ 1   │ 1     │ 'a'  │
Last Group (1 row): g = 2
│ Row │ g     │ x    │
│     │ Int64 │ Char │
├─────┼───────┼──────┤
│ 1   │ 2     │ 'b'  │

julia> filter(x -> x.x[1] == 'a', gd)
GroupedDataFrame with 1 group based on key: g
First Group (1 row): g = 1
│ Row │ g     │ x    │
│     │ Int64 │ Char │
├─────┼───────┼──────┤
│ 1   │ 1     │ 'a'  │

julia> filter(:x => x -> x[1] == 'a', gd)
GroupedDataFrame with 1 group based on key: g
First Group (1 row): g = 1
│ Row │ g     │ x    │
│     │ Int64 │ Char │
├─────┼───────┼──────┤
│ 1   │ 1     │ 'a'  │
```
"""
Base.filter(f::Base.Callable, gdf::GroupedDataFrame) =
    # One `Bool` per group, used as a mask to index `gdf`.
    gdf[[f(sdf)::Bool for sdf in gdf]]
# Single column selector: fetch that column from the parent (no copy, `!`)
# and let `_filter_helper` evaluate `f` on its per-group views.
function Base.filter((col, f)::Pair{<:ColumnIndex}, gdf::GroupedDataFrame)
    idx, starts, ends = gdf.idx, gdf.starts, gdf.ends
    return _filter_helper(gdf, f, idx, starts, ends, parent(gdf)[!, col])
end

# Vector of `Symbol`s: look every name up in the parent's index, then
# re-dispatch on the vector of looked-up values.
function Base.filter((cols, f)::Pair{<:AbstractVector{Symbol}},
                     gdf::GroupedDataFrame)
    positions = [index(parent(gdf))[name] for name in cols]
    return filter(positions => f, gdf)
end

# Vector of strings: same lookup-and-re-dispatch as for `Symbol`s.
function Base.filter((cols, f)::Pair{<:AbstractVector{<:AbstractString}},
                     gdf::GroupedDataFrame)
    positions = [index(parent(gdf))[name] for name in cols]
    return filter(positions => f, gdf)
end

# Fallback for any other selector: resolve it through the parent's index and
# re-dispatch on the result.
function Base.filter((cols, f)::Pair, gdf::GroupedDataFrame)
    resolved = index(parent(gdf))[cols]
    return filter(resolved => f, gdf)
end

# Vector of integer positions: splat the selected parent columns (duplicates
# included) into `_filter_helper` as separate positional arguments.
function Base.filter((cols, f)::Pair{<:AbstractVector{Int}},
                     gdf::GroupedDataFrame)
    idx, starts, ends = gdf.idx, gdf.starts, gdf.ends
    columns = (parent(gdf)[!, i] for i in cols)
    return _filter_helper(gdf, f, idx, starts, ends, columns...)
end

# Evaluate `f` once per group of `gdf`, passing it a view of every column in
# `cols` restricted to that group's rows, and return `gdf` indexed by the
# resulting `Bool` mask.
#
# `idx`, `starts` and `ends` are the grouping vectors of `gdf`: the rows of
# group `i` are `idx[starts[i]:ends[i]]`.
#
# Throws `ArgumentError` if no columns are passed, and `TypeError` if `f`
# returns something other than a `Bool`.
function _filter_helper(gdf::GroupedDataFrame, f, idx::Vector{Int},
                        starts::Vector{Int}, ends::Vector{Int}, cols...)
    if isempty(cols)
        throw(ArgumentError("At least one column must be passed to filter on"))
    end

    # Tuple of views, one per column, covering the rows of group `i`.
    function mapper(i)
        idxs = idx[starts[i]:ends[i]]
        return map(x -> view(x, idxs), cols)
    end

    sel = [f(mapper(i)...)::Bool for i in 1:length(gdf)]
    return gdf[sel]
end

# `AsTable(cols) => f` selector on a grouped data frame: select the wrapped
# columns from the parent, convert them with `Tables.columntable`, and let
# `_filter_helper_astable` evaluate `f` per group.
# Throws `ArgumentError` when the selection contains no columns.
function Base.filter((cols, f)::Pair{<:AsTable}, gdf::GroupedDataFrame)
    selected = select(parent(gdf), cols.cols, copycols=false)
    ncol(selected) == 0 &&
        throw(ArgumentError("At least one column must be passed to filter on"))
    columns = Tables.columntable(selected)
    return _filter_helper_astable(gdf, columns, f, gdf.idx, gdf.starts, gdf.ends)
end

# Evaluate `f` once per group of `gdf`, passing it a single `NamedTuple` whose
# fields are views of the columns in `nt` restricted to that group's rows, and
# return `gdf` indexed by the resulting `Bool` mask.
#
# `idx`, `starts` and `ends` are the grouping vectors of `gdf`: the rows of
# group `i` are `idx[starts[i]:ends[i]]`.
#
# Throws `TypeError` if `f` returns something other than a `Bool`.
function _filter_helper_astable(gdf::GroupedDataFrame, nt::NamedTuple, f,
                                idx::Vector{Int}, starts::Vector{Int},
                                ends::Vector{Int})
    # `map` over a `NamedTuple` keeps the field names, so `f` can access the
    # per-group views by column name.
    function mapper(i)
        idxs = idx[starts[i]:ends[i]]
        return map(x -> view(x, idxs), nt)
    end

    return gdf[[f(mapper(i))::Bool for i in 1:length(gdf)]]
end

"""
filter!(function, gdf::GroupedDataFrame)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have any other in-place functions on GroupedDataFrame already, apart from select! and transform!? I'm asking because for consistency with the two latter, it would make sense for filter! to drop rows from parent(gdf). Though that could be unexpected. We need a general policy on this, which could also apply to in-place operations on SubDataFrame.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good question (actually, I initially considered adding only filter for these reasons). It applies both to filter and filter!. I have not thought about it because what I proposed was most natural.

So the considerations are:

  1. we do not have other functions that mutate GroupedDataFrame in-place
  2. we do not have functions that mutate SubDataFrame schema in-place (of course we allow mutating the data, but not schema)
  3. select! and transform! work only for some GroupedDataFrames (ones that have no groups dropped) and they guarantee to retain row count in the parent; also it is in general expected (and documented) that they mutate the parent as they define new columns in general
  4. filter! and filter can work on an already "subsetted" GroupedDataFrame, so it would be confusing what should happen with groups that are already not present.
  5. it is natural to expect that filter(predicate, collection) does the same as collection[predicate.(collection)], which does not mutate the parent of collection if it is present.

All in all I think it is better to keep what we have.

filter!(cols => function, gdf::GroupedDataFrame)

Update a `GroupedDataFrame` in-place to contain only groups for which `function`
returns `true`.

If `cols` is not specified then the function is passed `SubDataFrame`s.

If `cols` is specified then the function is passed views of the corresponding
columns as separate positional arguments, unless `cols` is an `AsTable` selector,
in which case a `NamedTuple` of these arguments is passed.
`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR),
and column duplicates are allowed if a vector of `Symbol`s, strings, or integers
is passed.

See also: [`filter`](@ref)

# Examples
```
julia> df = DataFrame(g=1:2, x='a':'b');

julia> gd = groupby(df, :g)
GroupedDataFrame with 2 groups based on key: g
First Group (1 row): g = 1
│ Row │ g │ x │
│ │ Int64 │ Char │
├─────┼───────┼──────┤
│ 1 │ 1 │ 'a' │
Last Group (1 row): g = 2
│ Row │ g │ x │
│ │ Int64 │ Char │
├─────┼───────┼──────┤
│ 1 │ 2 │ 'b' │

julia> filter!(x -> x.x[1] == 'a', gd); gd
GroupedDataFrame with 1 group based on key: g
First Group (1 row): g = 1
│ Row │ g │ x │
│ │ Int64 │ Char │
├─────┼───────┼──────┤
│ 1 │ 1 │ 'a' │

julia> filter!(:x => x -> x[1] == 'a', gd); gd
GroupedDataFrame with 1 group based on key: g
First Group (1 row): g = 1
│ Row │ g │ x │
│ │ Int64 │ Char │
├─────┼───────┼──────┤
│ 1 │ 1 │ 'a' │
```
"""
function Base.filter!(f, gdf::GroupedDataFrame)
    # Compute the surviving groups with the non-mutating `filter`, then copy
    # the grouping fields of the result into `gdf`. The parent data frame is
    # not touched; only the fields assigned below are mutated.
    gdf_new = filter(f, gdf)

    # Internal invariants: the filtered result must share the same parent
    # object and the same grouping columns.
    @assert gdf.parent === gdf_new.parent
    @assert gdf.cols == gdf_new.cols

    # Compare the number of groups on both sides. Comparing against `gdf`
    # itself (an integer vs. a `GroupedDataFrame`) is always unequal, which
    # would make the `else` branch unreachable.
    if length(gdf_new) != length(gdf)
        gdf.groups = gdf_new.groups
        gdf.idx = gdf_new.idx
        gdf.starts = gdf_new.starts
        gdf.ends = gdf_new.ends
        gdf.ngroups = gdf_new.ngroups
        # NOTE(review): `keymap` is presumably a cache tied to the previous
        # set of groups, hence the reset; confirm against the struct definition.
        gdf.keymap = nothing
    else
        # this check is relatively cheap so we add it
        # to make sure we do not have a bug in the code
        @assert gdf.groups == gdf_new.groups
    end

    return gdf
end
42 changes: 42 additions & 0 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2338,4 +2338,46 @@ end
@test eltype(df2.a) === eltype(df2.b) === Union{UInt, Missing}
end

# Checks `filter` and `filter!` on `GroupedDataFrame`s built both from a
# `DataFrame` and from a view of one, for every supported selector form.
@testset "filter and filter!" begin
    for df in (DataFrame(g1=[1, 3, 2, 1, 4, 1, 2, 5], x1=1:8,
                         g2=[1, 3, 2, 1, 4, 1, 2, 5], x2=1:8),
               view(DataFrame(g1=[1, 3, 2, 1, 4, 1, 2, 5, 4, 5], x1=1:10,
                              g2=[1, 3, 2, 1, 4, 1, 2, 5, 4, 5], x2=1:10, y=1:10),
                    1:8, Not(:y)))
        # Every predicate keeps groups with more than `cutoff` rows, so the
        # expected result depends only on `cutoff`.
        for gcols in (:g1, [:g1, :g2]), cutoff in (1, 0, 10),
            predicate in (x -> nrow(x) > cutoff,
                          1 => x -> length(x) > cutoff,
                          :x1 => x -> length(x) > cutoff,
                          "x1" => x -> length(x) > cutoff,
                          [1, 2] => (x1, x2) -> length(x1) > cutoff,
                          [:x1, :x2] => (x1, x2) -> length(x1) > cutoff,
                          ["x1", "x2"] => (x1, x2) -> length(x1) > cutoff,
                          r"x" => (x1, x2) -> length(x1) > cutoff,
                          AsTable(:x1) => x -> length(x.x1) > cutoff,
                          AsTable(r"x") => x -> length(x.x1) > cutoff)
            gdf1 = groupby(df, gcols)
            gdf2 = filter(predicate, gdf1)
            if cutoff == 1
                @test getindex.(keys(gdf2), 1) == 1:2
            elseif cutoff == 0
                @test gdf1 == gdf2
            elseif cutoff == 10
                @test isempty(gdf2)
            end
            # The in-place variant must end up equal to the `filter` result.
            filter!(predicate, gdf1)
            @test gdf1 == gdf2
        end
        for fun in (filter, filter!)
            # Predicates returning a non-`Bool` value are rejected.
            @test_throws TypeError fun(x -> 1, groupby(df, :g1))
            @test_throws TypeError fun(r"x" => (x...) -> 1, groupby(df, :g1))
            @test_throws TypeError fun(AsTable(r"x") => (x...) -> 1, groupby(df, :g1))

            # Selectors matching no columns are rejected.
            @test_throws ArgumentError fun(r"y" => (x...) -> true, groupby(df, :g1))
            @test_throws ArgumentError fun([] => (x...) -> true, groupby(df, :g1))
            @test_throws ArgumentError fun(AsTable(r"y") => (x...) -> true, groupby(df, :g1))
            @test_throws ArgumentError fun(AsTable([]) => (x...) -> true, groupby(df, :g1))
        end
    end
end

end # module