diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 1d0697debd..df7fd3594e 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -374,28 +374,35 @@ end Summarize the columns of an AbstractDataFrame ```julia -describe(df::AbstractDataFrame) -describe(io, df::AbstractDataFrame) +describe(df::AbstractDataFrame; colstats = [:mean, :min, :median, :max, :Nmissing, :datatype]) +describe(io, df::AbstractDataFrame; colstats = [:mean, :min, :median, :max, :Nmissing, :datatype]) ``` **Arguments** * `df` : the AbstractDataFrame * `io` : optional output descriptor +* `colstats` : a vector of symbols representing the summary statistics you wish to report **Result** -* nothing +* a DataFrame where each row represents a variable of your input DataFrame and each +column is a summary statistic **Details** -If the column's base type derives from Number, compute the minimum, first -quantile, median, mean, third quantile, and maximum. Missings are filtered and -reported separately. +If the column's base type derives from Number, compute the mean, standard +deviation, minimum, first quantile, median, third quantile, and maximum. If +a column is not numeric, these statistics are populated with `nothing`s. -For boolean columns, report trues, falses, and missings. +For variables of *all* types, `describe` can also report the type of the +variable and the number of unique values. -For other types, show column characteristics and number of missings. +Missings are filtered in the calculation of all statistics; however, the optional +argument `Nmissing` will report the number of missing values of that variable. +If the column does not allow missing values, `nothing` is returned. +Consequently, `Nmissing = 0` (and not `nothing`) indicates that the column allows +missing values, but does not contain any at the time. 
**Examples** @@ -405,10 +412,10 @@ describe(df) ``` """ -StatsBase.describe(df::AbstractDataFrame) = describe(stdout, df) +StatsBase.describe(df::AbstractDataFrame; kwargs...) = describe(stdout, df; kwargs...) function StatsBase.describe(io, df::AbstractDataFrame; colstats = [:mean, :min, :median, :max, :Nmissing, :datatype]) # Check that people don't specify the wrong fields. - allowed_fields = [:mean, :sd, :min, :q25, :median, :q75, :max, :datatype, :NUnique,:Nmissing] + allowed_fields = [:mean, :sd, :min, :q25, :median, :q75, :max, :datatype, :Nunique, :Nmissing] for i in colstats if !contains(==, allowed_fields, i) error(""" @@ -421,7 +428,7 @@ function StatsBase.describe(io, df::AbstractDataFrame; colstats = [:mean, :min, :max, :datatype, :Nmissing, - :NUnique""") + :Nunique""") end end # Define 4 functions for getting summary statistics @@ -438,7 +445,7 @@ function StatsBase.describe(io, df::AbstractDataFrame; colstats = [:mean, :min, :max => stats.max, :datatype=> eltype(col), :Nmissing => nothing, - :NUnique => nothing + :Nunique => nothing ) end @@ -454,7 +461,7 @@ function StatsBase.describe(io, df::AbstractDataFrame; colstats = [:mean, :min, :max => stats.max, :datatype=> Missings.T(eltype(col)), :Nmissing => count(ismissing(col)), - :NUnique => nothing + :Nunique => nothing ) end @@ -469,7 +476,7 @@ function StatsBase.describe(io, df::AbstractDataFrame; colstats = [:mean, :min, :max => nothing, :datatype=> eltype(col), :Nmissing => nothing, - :NUnique => length(unique(col)) + :Nunique => length(unique(col)) ) end @@ -484,7 +491,7 @@ function StatsBase.describe(io, df::AbstractDataFrame; colstats = [:mean, :min, :max => nothing, :datatype=> Missings.T(eltype(col)), :Nmissing => count(ismissing(col)), - :NUnique => length(unique(col)) + :Nunique => length(unique(col)) ) end # Takes in a column and returns a row vector of the statistics diff --git a/test/dataframe.jl b/test/dataframe.jl index 58d7ec3cf7..b93c4113cc 100644 --- a/test/dataframe.jl +++ 
b/test/dataframe.jl @@ -362,7 +362,10 @@ module TestDataFrame @test describe(df) == describe_output - + # Test that the `colstats` keyword argument works + describe_output_mean = describe_output[[:variable, :mean]] + @test describe(df, colstats = [:mean]) == describe_output_mean + #Check the output of unstack df = DataFrame(Fish = CategoricalArray{Union{String, Missing}}(["Bob", "Bob", "Batman", "Batman"]), Key = CategoricalArray{Union{String, Missing}}(["Mass", "Color", "Mass", "Color"]),