From a2281f05be4f7651ecdb5fd3297a1624bd95b59c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 31 Jul 2020 13:10:58 +0200 Subject: [PATCH 1/4] remove median and nunique from describe by default --- src/abstractdataframe/abstractdataframe.jl | 36 +++++++--------------- test/dataframe.jl | 2 +- 2 files changed, 12 insertions(+), 26 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index c8e87d13de..788d6f1618 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -457,8 +457,8 @@ where each row represents a variable and each column a summary statistic. Arguments can be: - A symbol from the list `:mean`, `:std`, `:min`, `:q25`, `:median`, `:q75`, `:max`, `:eltype`, `:nunique`, `:first`, `:last`, and - `:nmissing`. The default statistics used are `:mean`, `:min`, `:median`, - `:max`, `:nunique`, `:nmissing`, and `:eltype`. + `:nmissing`. The default statistics used are `:mean`, `:min`, + `:max`, `:nmissing`, and `:eltype`. - `:all` as the only `Symbol` argument to return all statistics. - A `name => function` pair where `name` is a `Symbol` or string. This will create a column of summary statistics with the provided name. @@ -489,30 +489,16 @@ access missing values. # Examples ```julia -julia> df = DataFrame(i=1:10, x=0.1:0.1:1.0, y='a':'j') -10×3 DataFrame -│ Row │ i │ x │ y │ -│ │ Int64 │ Float64 │ Char │ -├─────┼───────┼─────────┼──────┤ -│ 1 │ 1 │ 0.1 │ 'a' │ -│ 2 │ 2 │ 0.2 │ 'b' │ -│ 3 │ 3 │ 0.3 │ 'c' │ -│ 4 │ 4 │ 0.4 │ 'd' │ -│ 5 │ 5 │ 0.5 │ 'e' │ -│ 6 │ 6 │ 0.6 │ 'f' │ -│ 7 │ 7 │ 0.7 │ 'g' │ -│ 8 │ 8 │ 0.8 │ 'h' │ -│ 9 │ 9 │ 0.9 │ 'i' │ -│ 10 │ 10 │ 1.0 │ 'j' │ +julia> df = DataFrame(i=1:10, x=0.1:0.1:1.0, y='a':'j'); julia> describe(df) -3×8 DataFrame -│ Row │ variable │ mean │ min │ median │ max │ nunique │ nmissing │ eltype │ -│ │ Symbol │ Union… │ Any │ Union… │ Any │ Union… │ Nothing │ DataType │ -├─────┼──────────┼────────┼─────┼────────┼─────┼─────────┼──────────┼──────────┤ -│ 1 │ i │ 5.5 │ 1 │ 5.5 │ 10 │ │ │ Int64 │ -│ 2 │ x │ 0.55 │ 0.1 │ 0.55 │ 1.0 │ │ │ Float64 │ -│ 3 │ y │ │ 'a' │ │ 'j' │ 10 │ │ Char │ +3×6 DataFrame +│ Row │ variable │ mean │ min │ max │ nmissing │ eltype │ +│ │ Symbol │ Union… │ Any │ Any │ Nothing │ DataType │ +├─────┼──────────┼────────┼─────┼─────┼──────────┼──────────┤ +│ 1 │ i │ 5.5 │ 1 │ 10 │ │ Int64 │ +│ 2 │ x │ 0.55 │ 0.1 │ 1.0 │ │ Float64 │ +│ 3 │ y │ │ 'a' │ 'j' │ │ Char │ julia> describe(df, :min, :max) 3×3 DataFrame @@ -547,7 +533,7 @@ DataAPI.describe(df::AbstractDataFrame, DataAPI.describe(df::AbstractDataFrame; cols=:) = _describe(select(df, cols, copycols=false), - [:mean, :min, :median, :max, :nunique, :nmissing, :eltype]) + [:mean, :min, :max, :nmissing, :eltype]) function _describe(df::AbstractDataFrame, stats::AbstractVector) predefined_funs = Symbol[s for s in stats if s isa Symbol] diff --git a/test/dataframe.jl b/test/dataframe.jl index b2fc337bd1..1fff74401c 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -641,7 +641,7 @@ end eltype = [Int, Union{Missing, Int}, String, Union{Missing, String}, Date, CategoricalValue{Int, UInt32}]) - default_fields = [:mean, :min, :median, :max, :nunique, :nmissing, :eltype] + default_fields = [:mean, :min, :max, :nmissing, :eltype] # Test that it works as a whole, without keyword arguments @test describe_output[:, [:variable; default_fields]] == describe(df) From 424bf393e429011764f1f6c37ef53cb3e977ac13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 31 Jul 2020 13:27:47 +0200 Subject: [PATCH 2/4] change unique to Set --- src/abstractdataframe/abstractdataframe.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 788d6f1618..a9c9308795 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -641,7 +641,7 @@ function get_stats(col::AbstractVector, stats::AbstractVector{Symbol}) if eltype(col) <: Real d[:nunique] = nothing else - d[:nunique] = try length(unique(col)) catch end + d[:nunique] = try length(Set(col)) catch end end end From bf7f78ae706a62a9238c13385ef2cc76962b22fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 31 Jul 2020 21:05:03 +0200 Subject: [PATCH 3/4] reintroduce median --- src/abstractdataframe/abstractdataframe.jl | 18 +++++++++--------- test/dataframe.jl | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index a9c9308795..902cc9ffb0 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -457,7 +457,7 @@ where each row represents a variable and each column a summary statistic. Arguments can be: - A symbol from the list `:mean`, `:std`, `:min`, `:q25`, `:median`, `:q75`, `:max`, `:eltype`, `:nunique`, `:first`, `:last`, and - `:nmissing`. The default statistics used are `:mean`, `:min`, + `:nmissing`. The default statistics used are `:mean`, `:min`, `:median` `:max`, `:nmissing`, and `:eltype`. - `:all` as the only `Symbol` argument to return all statistics. - A `name => function` pair where `name` is a `Symbol` or string. This will @@ -492,13 +492,13 @@ access missing values. julia> df = DataFrame(i=1:10, x=0.1:0.1:1.0, y='a':'j'); julia> describe(df) -3×6 DataFrame -│ Row │ variable │ mean │ min │ max │ nmissing │ eltype │ -│ │ Symbol │ Union… │ Any │ Any │ Nothing │ DataType │ -├─────┼──────────┼────────┼─────┼─────┼──────────┼──────────┤ -│ 1 │ i │ 5.5 │ 1 │ 10 │ │ Int64 │ -│ 2 │ x │ 0.55 │ 0.1 │ 1.0 │ │ Float64 │ -│ 3 │ y │ │ 'a' │ 'j' │ │ Char │ +3×7 DataFrame +│ Row │ variable │ mean │ min │ median │ max │ nmissing │ eltype │ +│ │ Symbol │ Union… │ Any │ Union… │ Any │ Nothing │ DataType │ +├─────┼──────────┼────────┼─────┼────────┼─────┼──────────┼──────────┤ +│ 1 │ i │ 5.5 │ 1 │ 5.5 │ 10 │ │ Int64 │ +│ 2 │ x │ 0.55 │ 0.1 │ 0.55 │ 1.0 │ │ Float64 │ +│ 3 │ y │ │ 'a' │ │ 'j' │ │ Char │ julia> describe(df, :min, :max) 3×3 DataFrame @@ -533,7 +533,7 @@ DataAPI.describe(df::AbstractDataFrame, DataAPI.describe(df::AbstractDataFrame; cols=:) = _describe(select(df, cols, copycols=false), - [:mean, :min, :max, :nmissing, :eltype]) + [:mean, :min, :median, :max, :nmissing, :eltype]) function _describe(df::AbstractDataFrame, stats::AbstractVector) predefined_funs = Symbol[s for s in stats if s isa Symbol] diff --git a/test/dataframe.jl b/test/dataframe.jl index 1fff74401c..83ab0da52f 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -641,7 +641,7 @@ end eltype = [Int, Union{Missing, Int}, String, Union{Missing, String}, Date, CategoricalValue{Int, UInt32}]) - default_fields = [:mean, :min, :max, :nmissing, :eltype] + default_fields = [:mean, :min, :median, :max, :nmissing, :eltype] # Test that it works as a whole, without keyword arguments @test describe_output[:, [:variable; default_fields]] == describe(df) From 8f81d1542afc410863b8d5fa388abc2c2cb60ebb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 31 Jul 2020 21:06:14 +0200 Subject: [PATCH 4/4] add comma --- src/abstractdataframe/abstractdataframe.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 902cc9ffb0..e18eb63d05 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -457,7 +457,7 @@ where each row represents a variable and each column a summary statistic. Arguments can be: - A symbol from the list `:mean`, `:std`, `:min`, `:q25`, `:median`, `:q75`, `:max`, `:eltype`, `:nunique`, `:first`, `:last`, and - `:nmissing`. The default statistics used are `:mean`, `:min`, `:median` + `:nmissing`. The default statistics used are `:mean`, `:min`, `:median`, `:max`, `:nmissing`, and `:eltype`. - `:all` as the only `Symbol` argument to return all statistics. - A `name => function` pair where `name` is a `Symbol` or string. This will