From b240458aca1681e74a94e979a0141b2b16f1a3e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 18 Dec 2022 15:13:07 +0100 Subject: [PATCH] fix deleteat! and subset! performance (#3249) --- src/abstractdataframe/subset.jl | 4 ++-- src/dataframe/dataframe.jl | 12 ++++++++---- test/dataframe.jl | 10 ++++++++++ 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl index 92542398f8..ec5e90db3a 100644 --- a/src/abstractdataframe/subset.jl +++ b/src/abstractdataframe/subset.jl @@ -468,7 +468,7 @@ function subset!(df::AbstractDataFrame, @nospecialize(args...); skipmissing::Bool=false, threads::Bool=true) isempty(args) && return df row_selector = _get_subset_conditions(df, Ref{Any}(args), skipmissing, threads) - return deleteat!(df, findall(!, row_selector)) + return deleteat!(df, .!row_selector) end function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Bool=false, @@ -486,7 +486,7 @@ function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Boo groups = gdf.groups lazy_lock = gdf.lazy_lock row_selector = _get_subset_conditions(gdf, Ref{Any}(args), skipmissing, threads) - res = deleteat!(df, findall(!, row_selector)) + res = deleteat!(df, .!row_selector) if nrow(res) == length(groups) # we have not removed any rows return ungroup ? res : gdf end diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index b4b11b1b3a..01e05fc206 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -887,12 +887,11 @@ function Base.deleteat!(df::DataFrame, inds::AbstractVector{Bool}) if length(inds) != size(df, 1) throw(BoundsError(df, (inds, :))) end - drop = _findall(inds) # workaround https://github.com/JuliaLang/julia/pull/41646 if VERSION <= v"1.6.2" && drop isa UnitRange{<:Integer} - drop = collect(drop) + inds = collect(inds) end - return _deleteat!_helper(df, drop) + return _deleteat!_helper(df, inds) end Base.deleteat!(df::DataFrame, inds::Not) = @@ -910,7 +909,12 @@ function _deleteat!_helper(df::DataFrame, drop) deleteat!(col1, drop) newn = length(col1) @assert newn <= n - + # the 0.06 threshold is heuristic; based on tests + # the assignment makes the code type-unstable but it is a small union + # so the overhead should be small + if drop isa AbstractVector{Bool} && newn < 0.06 * n + drop = findall(drop) + end for i in 2:length(cols) col = cols[i] # this check is done to handle column aliases diff --git a/test/dataframe.jl b/test/dataframe.jl index 0b1b95d520..d338aa2fed 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -517,6 +517,16 @@ end @test_throws BoundsError deleteat!(df, true:true) df = DataFrame(a=1, b=3.0) @test isempty(deleteat!(df, true:true)) + + Random.seed!(1234) + for t in 0:0.005:1.0 + # two columns are needed as the second column is affected + # by the adaptive algorithm + df = DataFrame(i=1:10^5, j=1:10^5) + idxs = rand(10^5) .< t + deleteat!(df, idxs) + df.i == df.j == findall(idxs) + end end @testset "describe" begin