fix deleteat! and subset! performance (#3249)

JuliaData · Dec 18, 2022 · b240458 · b240458
1 parent 83285f8
commit b240458
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 6 deletions.
diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
@@ -468,7 +468,7 @@ function subset!(df::AbstractDataFrame, @nospecialize(args...);
                 skipmissing::Bool=false, threads::Bool=true)
     isempty(args) && return df
     row_selector = _get_subset_conditions(df, Ref{Any}(args), skipmissing, threads)
-    return deleteat!(df, findall(!, row_selector))
+    return deleteat!(df, .!row_selector)
 end
 
 function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Bool=false,
@@ -486,7 +486,7 @@ function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Boo
     groups = gdf.groups
     lazy_lock = gdf.lazy_lock
     row_selector = _get_subset_conditions(gdf, Ref{Any}(args), skipmissing, threads)
-    res = deleteat!(df, findall(!, row_selector))
+    res = deleteat!(df, .!row_selector)
     if nrow(res) == length(groups) # we have not removed any rows
         return ungroup ? res : gdf
     end

diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
@@ -887,12 +887,11 @@ function Base.deleteat!(df::DataFrame, inds::AbstractVector{Bool})
     if length(inds) != size(df, 1)
         throw(BoundsError(df, (inds, :)))
     end
-    drop = _findall(inds)
     # workaround https://github.com/JuliaLang/julia/pull/41646
     if VERSION <= v"1.6.2" && drop isa UnitRange{<:Integer}
-        drop = collect(drop)
+        inds = collect(inds)
     end
-    return _deleteat!_helper(df, drop)
+    return _deleteat!_helper(df, inds)
 end
 
 Base.deleteat!(df::DataFrame, inds::Not) =
@@ -910,7 +909,12 @@ function _deleteat!_helper(df::DataFrame, drop)
     deleteat!(col1, drop)
     newn = length(col1)
     @assert newn <= n
-
+    # the 0.06 threshold is heuristic; based on tests
+    # the assignment makes the code type-unstable but it is a small union
+    # so the overhead should be small
+    if drop isa AbstractVector{Bool} && newn < 0.06 * n
+        drop = findall(drop)
+    end
     for i in 2:length(cols)
         col = cols[i]
         # this check is done to handle column aliases

diff --git a/test/dataframe.jl b/test/dataframe.jl
@@ -517,6 +517,16 @@ end
     @test_throws BoundsError deleteat!(df, true:true)
     df = DataFrame(a=1, b=3.0)
     @test isempty(deleteat!(df, true:true))
+
+    Random.seed!(1234)
+    for t in 0:0.005:1.0
+        # two columns are needed as the second column is affected
+        # by the adaptive algorithm
+        df = DataFrame(i=1:10^5, j=1:10^5)
+        idxs = rand(10^5) .< t
+        deleteat!(df, idxs)
+        df.i == df.j == findall(idxs)
+    end
 end
 
 @testset "describe" begin