From edf22c3cfd062e6ecbd2f9b656ab1f4df58eb4da Mon Sep 17 00:00:00 2001 From: pdeffebach <23196228+pdeffebach@users.noreply.github.com> Date: Tue, 27 Feb 2024 12:31:26 -0500 Subject: [PATCH] Add metadata macros (#377) * begin adding metadata features * tests * some docs * add to main docs * deleting notes * printing docs * printing tests * more docs --- Project.toml | 3 +- docs/src/index.md | 159 ++++++++++++++++++++++++ src/DataFramesMeta.jl | 6 + src/metadata.jl | 283 ++++++++++++++++++++++++++++++++++++++++++ test/metadata.jl | 79 ++++++++++++ 5 files changed, 529 insertions(+), 1 deletion(-) create mode 100644 src/metadata.jl create mode 100644 test/metadata.jl diff --git a/Project.toml b/Project.toml index ebba5ce3..30e2a385 100644 --- a/Project.toml +++ b/Project.toml @@ -8,14 +8,15 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +TableMetadataTools = "9ce81f87-eacc-4366-bf80-b621a3098ee2" [compat] Chain = "0.5" DataFrames = "1" MacroTools = "0.5" +OrderedCollections = "1" Reexport = "0.2, 1" julia = "1.6" -OrderedCollections = "1" [extras] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" diff --git a/docs/src/index.md b/docs/src/index.md index 15ef9c9b..8f54b1c3 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -983,6 +983,165 @@ in the middle of a `@chain` block. end ``` +## Attaching variable labels and notes + +A widely used and appreciated feature of the Stata data analysis +programming language is its tools for column-level metadata in the +form of labels and notes. Like Stata, Julia's data ecosystem implements a common +API for keeping track of information associated with columns. DataFramesMeta.jl +implements the `@label!` and `@note!` macros to attach information to columns. 
+ +DataFramesMeta.jl also provides two convenience functions +for examining metadata, `printlabels` and `printnotes`. + +### `@label!`: For short column labels + +Use `@label!` to attach short-but-informative labels to columns. For example, +a variable `:wage` might be given the label `"Wage (2015 USD)"`. + +```julia +df = DataFrame(wage = [16, 25, 14, 23]); +@label! df :wage = "Wage (2015 USD)" +``` + +View the labels with `printlabels(df)` + +``` +julia> printlabels(df) +┌────────┬─────────────────┐ +│ Column │ Label │ +├────────┼─────────────────┤ +│ wage │ Wage (2015 USD) │ +└────────┴─────────────────┘ +``` + +You can access labels via the `label` function defined in TableMetadataTools.jl. + +``` +julia> label(df, :wage) +"Wage (2015 USD)" +``` + +### `@note!`: For longer column notes + +While labels are useful for pretty printing and clarification of short variable +names, notes are used to give more in-depth information and describe the data +cleaning process. Unlike labels, notes can be stacked on to one another. + +Consider the cleaning process for wages, starting with the data frame + +```julia +julia> df = DataFrame(wage = [-99, 16, 14, 23, 5000]) +5×1 DataFrame + Row │ wage + │ Int64 +─────┼─────── + 1 │ -99 + 2 │ 16 + 3 │ 14 + 4 │ 23 + 5 │ 5000 + +``` + +When data cleaning you might want to do the following: + +1. Record the source of the data + +``` +@note! df :wage = "Hourly wage from 2015 American Community Survey (ACS)" +``` + +2. Fix coded wages. In this example, `-99` corresponds to "no job" + +``` +@rtransform! df :wage = :wage == -99 ? 0 : :wage +@note! df :wage = "Individuals with no job are recorded as 0 wage" +``` + +We use `printnotes` to see the notes for columns. + +``` +julia> printnotes(df) +Column: wage +──────────── +Hourly wage from 2015 American Community Survey (ACS) +Individuals with no job are recorded as 0 wage +``` + +You can access the note via the `note` function. 
+ +``` +julia> note(df, :wage) +"Hourly wage from 2015 American Community Survey (ACS)\nIndividuals with no job are recorded as 0 wage" +``` + +To remove all notes from a column, run + +``` +note!(df, :wage, ""; append = false) +``` + +### Printing metadata + +#### `printlabels`: For printing labels + +Use `printlabels` to print the labels of columns in a data frame. The optional +argument `cols` determines which columns to print, while the keyword +argument `unlabelled` controls whether to print columns without user-defined labels. + +```julia-repl +julia> df = DataFrame(wage = [12], age = [23]); + +julia> @label! df :wage = "Hourly wage (2015 USD)"; + +julia> printlabels(df) +┌────────┬────────────────────────┐ +│ Column │ Label │ +├────────┼────────────────────────┤ +│ wage │ Hourly wage (2015 USD) │ +│ age │ age │ +└────────┴────────────────────────┘ + +julia> printlabels(df, [:wage, :age]; unlabelled = false) +┌────────┬────────────────────────┐ +│ Column │ Label │ +├────────┼────────────────────────┤ +│ wage │ Hourly wage (2015 USD) │ +└────────┴────────────────────────┘ +``` + +#### `printnotes`: For printing notes + +Use `printnotes` to print the notes of columns in a data frame. The optional +argument `cols` determines which columns to print, while the keyword +argument `unnoted` controls whether to print columns without user-defined notes. + +```julia-repl +julia> df = DataFrame(wage = [12], age = [23]); + +julia> @label! df :age = "Age (years)"; + +julia> @note! df :wage = "Derived from American Community Survey"; + +julia> @note! df :wage = "Missing values imputed as 0 wage"; + +julia> @label! 
df :wage = "Hourly wage (2015 USD)"; + +julia> printnotes(df) +Column: wage +──────────── +Label: Hourly wage (2015 USD) +Derived from American Community Survey +Missing values imputed as 0 wage + +Column: age +─────────── +Label: Age (years) + +``` + + ```@contents Pages = ["api/api.md"] Depth = 3 diff --git a/src/DataFramesMeta.jl b/src/DataFramesMeta.jl index 1f42a9f3..52bd069f 100644 --- a/src/DataFramesMeta.jl +++ b/src/DataFramesMeta.jl @@ -6,10 +6,14 @@ using MacroTools using OrderedCollections: OrderedCollections +@reexport using TableMetadataTools + @reexport using DataFrames @reexport using Chain +using DataFrames.PrettyTables + # Basics: export @with, @subset, @subset!, @rsubset, @rsubset!, @@ -21,6 +25,7 @@ export @with, @distinct, @rdistinct, @distinct!, @rdistinct!, @eachrow, @eachrow!, @byrow, @passmissing, @astable, @kwarg, + @label!, @note!, printlabels, printnotes, @groupby, @based_on, @where # deprecated @@ -31,5 +36,6 @@ include("parsing_astable.jl") include("macros.jl") include("linqmacro.jl") include("eachrow.jl") +include("metadata.jl") end # module \ No newline at end of file diff --git a/src/metadata.jl b/src/metadata.jl new file mode 100644 index 00000000..070a2c0a --- /dev/null +++ b/src/metadata.jl @@ -0,0 +1,283 @@ +function get_lhs_rhs(e) # shared by @label! and @note!: split one `:col = value` expression into (column, value) + if !(e isa Expr) || length(e.args) < 2 # too few args would otherwise surface as a BoundsError below + throw(ArgumentError("Malformed @label! or @note! expression")) + else + lhs = let t = e.args[1] + s = get_column_expr_rename(t) + if s === nothing + throw(ArgumentError("Invalid column identifier on LHS in @label! or @note! macro")) + end + s + end + rhs = e.args[2] + return lhs, rhs + end +end + +function addlabel_helper(df, args...) + x, exprs, outer_flags, kw = get_df_args_kwargs(df, args...; wrap_byrow = false) + x_sym = gensym() # bind the data frame expression once so it is not re-evaluated per label + t = map(exprs) do e + lhs, rhs = get_lhs_rhs(e) + :($label!($x_sym, $lhs, $rhs)) + end + labblock = Expr(:block, t...) + quote + $x_sym = $x + $labblock + $x_sym + end +end + +""" + @label!(df, args...) 
+ +Assign labels to columns in a data frame using `:col = label` syntax. +Shorthand for `label!(df, col, label)` from TableMetadataTools.jl. + +```julia-repl +julia> df = DataFrame(wage = 12); + +julia> @label! df :wage = "Wage per hour (USD)"; + +julia> printlabels(df) +┌────────┬─────────────────────┐ +│ Column │ Label │ +├────────┼─────────────────────┤ +│ wage │ Wage per hour (USD) │ +└────────┴─────────────────────┘ +``` + +Use `@label!` for short descriptions, primarily for pretty printing. +Use `@note!` for longer explanations of columns. + +Labels are "note"-style columnar metadata. Labels are preserved upon +renaming and transformations. `@label! df :x = "Lab"` over-writes any +existing label for the column `:x`. To add information without overwriting, +use [`@note!`](@ref). + +`@label!` returns the input data frame for use with `@chain`. + +Like other DataFramesMeta.jl macros, `@label!` can be used in "keyword" +format as well as block format. + +```julia-repl +julia> df = DataFrame(wage = 12, tenure = 4); + +julia> @label! df begin + :wage = "Wage per hour (USD)" + :tenure = "Tenure at job (months)" + end; + +julia> printlabels(df) +┌────────┬────────────────────────┐ +│ Column │ Label │ +├────────┼────────────────────────┤ +│ wage │ Wage per hour (USD) │ +│ tenure │ Tenure at job (months) │ +└────────┴────────────────────────┘ +``` +""" +macro label!(df, args...) + esc(addlabel_helper(df, args...)) +end + +function addnote_helper(df, args...) + x, exprs, outer_flags, kw = get_df_args_kwargs(df, args...; wrap_byrow = false) + x_sym = gensym() + t = map(exprs) do e + lhs, rhs = get_lhs_rhs(e) + :($note!($x_sym, $lhs, string($rhs); append = true)) + end + labblock = Expr(:block, t...) + quote + $x_sym = $x + $labblock + $x_sym + end +end + +""" + @note!(df, args...) + +Assign notes to columns in a data frame using `:col = note` syntax. +Shorthand for `note!(df, col, note)` from TableMetadataTools.jl. + +Use `@note!` for longer explanations of columns. 
+Use `@label!` for short descriptions, primarily for pretty printing. + +```julia-repl +julia> df = DataFrame(wage = 12); + +julia> @note! df :wage = " + Long discussion of variable construction. + "; + +julia> printnotes(df) +Column: wage +──────────── +Long discussion of variable construction. +``` + +Unlike labels, notes are appended. + +```julia-repl +julia> @note! df :wage = "Another comment on variable construction"; + +julia> printnotes(df) +Column: wage +──────────── +Wage per hour in 2014 USD taken from ACS data provided by IPUMS. +Wage per hour is measured directly for hourly workers. For +salaried workers, equal to salary / hours worked. + +Values capped at the 99th percentile +``` +""" +macro note!(df, args...) + esc(addnote_helper(df, args...)) +end + + +""" + printlabels(df, [cols=All()]; unlabelled = true) + +Pretty-print all labels in a data frame. + +## Arguments + +* `cols`: Optional argument to select columns to print. Can + be any valid multi-column selector, such as `Not(...)`, + `Between(...)`, or a regular expression. + +* `unlabelled`: Keyword argument for whether to print + the columns without user-defined labels. Defaults to `true`. + For column `col` without a user-defined label, `label(df, col)` returns + the name of the column, `col`. + +## Examples +```julia-repl +julia> df = DataFrame(wage = [12], age = [23]); + +julia> @label! 
df :wage = "Hourly wage (2015 USD)"; + +julia> printlabels(df) +┌────────┬────────────────────────┐ +│ Column │ Label │ +├────────┼────────────────────────┤ +│ wage │ Hourly wage (2015 USD) │ +│ age │ age │ +└────────┴────────────────────────┘ + +julia> printlabels(df, :wage) +┌────────┬────────────────────────┐ +│ Column │ Label │ +├────────┼────────────────────────┤ +│ wage │ Hourly wage (2015 USD) │ +└────────┴────────────────────────┘ + +julia> printlabels(df; unlabelled = false) +┌────────┬────────────────────────┐ +│ Column │ Label │ +├────────┼────────────────────────┤ +│ wage │ Hourly wage (2015 USD) │ +└────────┴────────────────────────┘ + +julia> printlabels(df, r"^wage") +┌────────┬────────────────────────┐ +│ Column │ Label │ +├────────┼────────────────────────┤ +│ wage │ Hourly wage (2015 USD) │ +└────────┴────────────────────────┘ +``` + +""" +function printlabels(df, cols=All(); unlabelled = true) + cs = String[] + ls = String[] + for n in names(df, cols) + lab = label(df, n) + if unlabelled == true + push!(cs, n) + push!(ls, lab) + else + if n != lab # a column without a user-defined label reports its own name as its label + push!(cs, n) + push!(ls, lab) + end + end + end + t = DataFrame(Column = cs, Label = ls) # two-column table: column name and its label + pretty_table(t; show_subheader = false) + return nothing +end + +""" + printnotes(df, cols = All(); unnoted = false) + +Print the notes and labels in a data frame. + +## Arguments +* `cols`: Optional argument to select columns to print. Can + be any valid multi-column selector, such as `Not(...)`, + `Between(...)`, or a regular expression. +* `unnoted`: Keyword argument for whether to print + the columns without user-defined notes or labels. Defaults to `false`. + +For the purposes of printing, column labels are printed in +addition to notes. However column labels are not returned by +`note(df, col)`. + +```julia-repl +julia> df = DataFrame(wage = [12], age = [23]); + +julia> @label! df :age = "Age (years)"; + +julia> @note! df :wage = "Derived from American Community Survey"; + +julia> @note! 
df :wage = "Missing values imputed as 0 wage"; + +julia> @label! df :wage = "Hourly wage (2015 USD)"; + +julia> printnotes(df) +Column: wage +──────────── +Label: Hourly wage (2015 USD) +Derived from American Community Survey +Missing values imputed as 0 wage + +Column: age +─────────── +Label: Age (years) +``` +""" +function printnotes(df, cols = All(); unnoted = false) + nms = names(df, cols) + for n in nms + nt = note(df, n) + lab = label(df, n) + no_note = nt == "" + no_lab = lab == n # an unlabelled column reports its own name as its label + if unnoted == true + printnote(n, nt, lab, no_note, no_lab) + else + if no_note == false || no_lab == false + printnote(n, nt, lab, no_note, no_lab) + end + end + end + nothing +end + +function printnote(n, nt, lab, no_note, no_lab) + # "Column: " is 8 display columns wide; textwidth keeps the rule aligned for wide characters + println("Column: $n") + println(repeat("─", textwidth(n) + 8)) + if no_lab == false + println("Label: ", lab) + end + if no_note == false + println(nt) + end + println() +end \ No newline at end of file diff --git a/test/metadata.jl b/test/metadata.jl new file mode 100644 index 00000000..70b6e45b --- /dev/null +++ b/test/metadata.jl @@ -0,0 +1,79 @@ +module TestMetaData + +using Test, DataFramesMeta + +@testset "labels" begin + df = DataFrame(a = 1, b = 2) + @label! df :a = "alab" + @test labels(df) == ["alab", "b"] + + df = DataFrame(a = 1, b = 2) + @label! df begin + :a = "alab" + :b = "blab" + end + @test labels(df) == ["alab", "blab"] + + df_new = leftjoin(DataFrame(a = 1, c = 2), df, on = :a) + @test labels(df_new) == ["a", "c", "blab"] + + df_new = @rename df :a2 = :a + @test labels(df_new) == ["alab", "blab"] + + df_new = @rtransform df :a = :a + 1 + @test labels(df_new) == ["alab", "blab"] +end + +@testset "notes" begin + df = DataFrame(a = 1, b = 2) + @note! df :a = "anote" + @test note(df, :a) == "anote" + + @note! df :a = "anote2" + @test note(df, :a) == "anote\nanote2" + + df = DataFrame(a = 1, b = 2) + @note! 
df begin + :a = "anote" + :b = "bnote" + end + @test note(df, :a) == "anote" + @test note(df, :b) == "bnote" + + df_new = leftjoin(DataFrame(a = 1, c = 2), df, on = :a) + @test note(df_new, :a) == "" + @test note(df_new, :b) == "bnote" + + df_new = @rename df :a2 = :a + @test note(df_new, :a2) == "anote" + @test note(df_new, :b) == "bnote" + + df_new = @rtransform df :a = :a + 1 + @test note(df_new, :a) == "anote" + @test note(df_new, :b) == "bnote" +end + +@testset "Metadata printing" begin + df = DataFrame(a = [1], b = [2]) + @label! df :a = "A label" + @note! df :a = "A note" + + # Smoke test: only confirm that printing does not throw; the printed text is not checked + printlabels(df) + printlabels(df, :a) + printlabels(df, [:a, :b]) + printlabels(df; unlabelled = true) + printlabels(df; unlabelled = false) + printlabels(df, [:a, :b], unlabelled = false) + printlabels(df, [:a, :b], unlabelled = true) + + printnotes(df) + printnotes(df, :a) + printnotes(df, [:a, :b]) + printnotes(df; unnoted = true) + printnotes(df; unnoted = false) + printnotes(df, [:a, :b], unnoted = false) + printnotes(df, [:a, :b], unnoted = true) +end + +end # module \ No newline at end of file