Skip to content

Commit

Permalink
fix deleteat! and subset! performance (#3249)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins committed Dec 18, 2022
1 parent 83285f8 commit b240458
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 6 deletions.
4 changes: 2 additions & 2 deletions src/abstractdataframe/subset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@ function subset!(df::AbstractDataFrame, @nospecialize(args...);
skipmissing::Bool=false, threads::Bool=true)
isempty(args) && return df
row_selector = _get_subset_conditions(df, Ref{Any}(args), skipmissing, threads)
return deleteat!(df, findall(!, row_selector))
return deleteat!(df, .!row_selector)
end

function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Bool=false,
Expand All @@ -486,7 +486,7 @@ function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Boo
groups = gdf.groups
lazy_lock = gdf.lazy_lock
row_selector = _get_subset_conditions(gdf, Ref{Any}(args), skipmissing, threads)
res = deleteat!(df, findall(!, row_selector))
res = deleteat!(df, .!row_selector)
if nrow(res) == length(groups) # we have not removed any rows
return ungroup ? res : gdf
end
Expand Down
12 changes: 8 additions & 4 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -887,12 +887,11 @@ function Base.deleteat!(df::DataFrame, inds::AbstractVector{Bool})
if length(inds) != size(df, 1)
throw(BoundsError(df, (inds, :)))
end
drop = _findall(inds)
# workaround https://github.com/JuliaLang/julia/pull/41646
if VERSION <= v"1.6.2" && drop isa UnitRange{<:Integer}
drop = collect(drop)
inds = collect(inds)
end
return _deleteat!_helper(df, drop)
return _deleteat!_helper(df, inds)
end

Base.deleteat!(df::DataFrame, inds::Not) =
Expand All @@ -910,7 +909,12 @@ function _deleteat!_helper(df::DataFrame, drop)
deleteat!(col1, drop)
newn = length(col1)
@assert newn <= n

# the 0.06 threshold is heuristic; based on tests
# the assignment makes the code type-unstable but it is a small union
# so the overhead should be small
if drop isa AbstractVector{Bool} && newn < 0.06 * n
drop = findall(drop)
end
for i in 2:length(cols)
col = cols[i]
# this check is done to handle column aliases
Expand Down
10 changes: 10 additions & 0 deletions test/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,16 @@ end
@test_throws BoundsError deleteat!(df, true:true)
df = DataFrame(a=1, b=3.0)
@test isempty(deleteat!(df, true:true))

Random.seed!(1234)
for t in 0:0.005:1.0
# two columns are needed as the second column is affected
# by the adaptive algorithm
df = DataFrame(i=1:10^5, j=1:10^5)
idxs = rand(10^5) .< t
deleteat!(df, idxs)
df.i == df.j == findall(idxs)
end
end

@testset "describe" begin
Expand Down

0 comments on commit b240458

Please sign in to comment.