From d86dc4eca03631b1de5c70325daad4189e99dcba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 31 Oct 2020 18:34:25 +0100 Subject: [PATCH 1/5] remove CategoricalArrays dependency from joins --- src/abstractdataframe/join.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 4bba2f2292..2b448067b7 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -417,8 +417,9 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; if indicator !== nothing refs = left_indicator + right_indicator - pool = CategoricalPool{String,UInt8}(["left_only", "right_only", "both"]) - indicatorcol = CategoricalArray{String,1}(refs, pool) + refs_short, invpool, pool = PooledArrays._label(["left_only", "right_only", "both"], String) + @assert refs_short == [1, 2, 3] + indicatorcol = PooledArray(PooledArrays.RefArray(refs), invpool, pool) unique_indicator = indicator if makeunique From 0543b6d1db713f9dae9c0e4cb8451ae3de35858b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 31 Oct 2020 19:31:17 +0100 Subject: [PATCH 2/5] do not use internal methods of PooledArrays.jl --- NEWS.md | 2 ++ src/abstractdataframe/join.jl | 9 +++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 55a7340090..f1e791a572 100644 --- a/NEWS.md +++ b/NEWS.md @@ -36,6 +36,8 @@ choose the fast path only when it is safe; this resolves inconsistencies with what the same functions not using fast path produce ([#2357](https://github.com/JuliaData/DataFrames.jl/pull/2357)) +* joins now return `PooledVector` not `CategoricalVector` in indicator column + ([#2505](https://github.com/JuliaData/DataFrames.jl/pull/2505)) * `GroupKeys` now supports `in` for `GroupKey`, `Tuple`, `NamedTuple` and dictionaries ([2392](https://github.com/JuliaData/DataFrames.jl/pull/2392)) * in `describe` the specification of custom aggregation is now `function => name`; diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 2b448067b7..e0ef13590f 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -416,10 +416,11 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; end if indicator !== nothing - refs = left_indicator + right_indicator - refs_short, invpool, pool = PooledArrays._label(["left_only", "right_only", "both"], String) - @assert refs_short == [1, 2, 3] - indicatorcol = PooledArray(PooledArrays.RefArray(refs), invpool, pool) + left_indicator .+= right_indicator + pa_base = PooledArray(["left_only", "right_only", "both"]) + indicatorcol = PooledArray(PooledArrays.RefArray(left_indicator), + Dict{String, UInt8}("left_only" => 0x1, "right_only" => 0x2, "both" => 0x3), + ["left_only", "right_only", "both"]) unique_indicator = indicator if makeunique From 2bb1b5f4c22f9771abfa240c3e33bd360917229d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 31 Oct 2020 22:53:49 +0100 Subject: [PATCH 3/5] Update src/abstractdataframe/join.jl Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/join.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index e0ef13590f..1169dd70f0 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -417,7 +417,6 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; if indicator !== nothing left_indicator .+= right_indicator - pa_base = PooledArray(["left_only", "right_only", "both"]) indicatorcol = PooledArray(PooledArrays.RefArray(left_indicator), Dict{String, UInt8}("left_only" => 0x1, "right_only" => 0x2, "both" => 0x3), ["left_only", "right_only", "both"]) From b3f66a1f805413835cda65581ba32295e9d73b17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 31 Oct 2020 23:01:04 +0100 Subject: [PATCH 4/5] change indicator to UInt32 internal representation --- src/abstractdataframe/join.jl | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 1169dd70f0..5725917814 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -121,10 +121,10 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, # about the permutation of left data frame in rightjoin as we always # assign 0x1 to it anyway and these rows are guaranteed to come first # (even if they are permuted) - left_indicator = zeros(UInt8, nrow) - left_indicator[axes(all_orig_left_ixs, 1)] .= 0x1 - right_indicator = zeros(UInt8, nrow) - right_indicator[axes(all_orig_right_ixs, 1)] .= 0x2 + left_indicator = zeros(UInt32, nrow) + left_indicator[axes(all_orig_left_ixs, 1)] .= UInt32(1) + right_indicator = zeros(UInt32, nrow) + right_indicator[axes(all_orig_right_ixs, 1)] .= UInt32(2) permute!(right_indicator, right_perm) end @@ -417,9 +417,12 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; if indicator !== nothing left_indicator .+= right_indicator + pool = ["left_only", "right_only", "both"] + invpool = Dict{String, UInt8}("left_only" => UInt32(1), + "right_only" => UInt32(2), + "both" => UInt32(3)) indicatorcol = PooledArray(PooledArrays.RefArray(left_indicator), - Dict{String, UInt8}("left_only" => 0x1, "right_only" => 0x2, "both" => 0x3), - ["left_only", "right_only", "both"]) + invpool, pool) unique_indicator = indicator if makeunique From a8410f92ba3cd0ffb9ba1b52ab5645e78bb87133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 1 Nov 2020 00:15:45 +0100 Subject: [PATCH 5/5] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/join.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 5725917814..bffefd6be3 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -122,9 +122,9 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, # assign 0x1 to it anyway and these rows are guaranteed to come first # (even if they are permuted) left_indicator = zeros(UInt32, nrow) - left_indicator[axes(all_orig_left_ixs, 1)] .= UInt32(1) + left_indicator[axes(all_orig_left_ixs, 1)] .= 1 right_indicator = zeros(UInt32, nrow) - right_indicator[axes(all_orig_right_ixs, 1)] .= UInt32(2) + right_indicator[axes(all_orig_right_ixs, 1)] .= 2 permute!(right_indicator, right_perm) end @@ -418,9 +418,9 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; if indicator !== nothing left_indicator .+= right_indicator pool = ["left_only", "right_only", "both"] - invpool = Dict{String, UInt8}("left_only" => UInt32(1), - "right_only" => UInt32(2), - "both" => UInt32(3)) + invpool = Dict{String, UInt32}("left_only" => 1, + "right_only" => 2, + "both" => 3) indicatorcol = PooledArray(PooledArrays.RefArray(left_indicator), invpool, pool)