diff --git a/src/DataFrames.jl b/src/DataFrames.jl
index 2f54b096b8..b5a1eabe1d 100644
--- a/src/DataFrames.jl
+++ b/src/DataFrames.jl
@@ -69,6 +69,10 @@ export AbstractDataFrame,
        tail,
        permutecols!,
 
+       metadata!,
+       metadata,
+       showmeta,
+
        # Remove after deprecation period
        pool,
        pool!
@@ -82,6 +86,7 @@ export AbstractDataFrame,
 
 include("other/utils.jl")
 include("other/index.jl")
+include("other/metadata.jl")
 
 include("abstractdataframe/abstractdataframe.jl")
 include("dataframe/dataframe.jl")
diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl
index 27cbfd7ac7..9f0b621b28 100644
--- a/src/abstractdataframe/join.jl
+++ b/src/abstractdataframe/join.jl
@@ -100,6 +100,11 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
         copyto!(cols[i+ncleft], view(col, all_orig_right_ixs))
         permute!(cols[i+ncleft], right_perm)
     end
+    # TODO:
+    # 1. Make a new metadata that is append(metadata(joiner.dfl), metadata(dfr_noon))
+    # 2. Make a new constructor so that we can construct a new dataframe
+    # 3. Long term, add optional arguments to choose which metadata gets put in.
+    # Haven't added this yet because I only want to focus on dataframe/dataframe.jl for now.
     res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)), makeunique=makeunique)
 
     if length(rightonly_ixs.join) > 0
diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
index 32e15b817f..de56a00ef7 100644
--- a/src/dataframe/dataframe.jl
+++ b/src/dataframe/dataframe.jl
@@ -82,10 +82,11 @@ size(df1)
 mutable struct DataFrame <: AbstractDataFrame
     columns::Vector
     colindex::Index
+    metadata::MetaData
 
-    function DataFrame(columns::Vector{Any}, colindex::Index)
+    function DataFrame(columns::Vector{Any}, colindex::Index, metadata::MetaData=MetaData())
         if length(columns) == length(colindex) == 0
-            return new(Vector{Any}(undef, 0), Index())
+            return new(Vector{Any}(undef, 0), Index(), metadata)
         elseif length(columns) != length(colindex)
             throw(DimensionMismatch("Number of columns ($(length(columns))) and number of" *
                                     " column names ($(length(colindex))) are not equal"))
@@ -93,7 +94,7 @@ mutable struct DataFrame <: AbstractDataFrame
         lengths = [isa(col, AbstractArray) ? length(col) : 1 for col in columns]
         minlen, maxlen = extrema(lengths)
         if minlen == 0 && maxlen == 0
-            return new(columns, colindex)
+            return new(columns, colindex, metadata)
         elseif minlen != maxlen || minlen == maxlen == 1
             # recycle scalars
             for i in 1:length(columns)
@@ -116,8 +117,9 @@ mutable struct DataFrame <: AbstractDataFrame
                 throw(DimensionMismatch("columns must be 1-dimensional"))
             end
         end
-        new(columns, colindex)
+        new(columns, colindex, metadata)
     end
+
 end
 
 function DataFrame(pairs::Pair{Symbol,<:Any}...; makeunique::Bool=false)::DataFrame
@@ -223,6 +225,7 @@ end
 
 index(df::DataFrame) = getfield(df, :colindex)
 columns(df::DataFrame) = getfield(df, :columns)
+metadata(df::DataFrame) = getfield(df, :metadata)
 
 # TODO: Remove these
 nrow(df::DataFrame) = ncol(df) > 0 ? length(columns(df)[1])::Int : 0
@@ -263,7 +266,7 @@ end
 function Base.getindex(df::DataFrame, col_inds::AbstractVector)
     selected_columns = index(df)[col_inds]
     new_columns = columns(df)[selected_columns]
-    return DataFrame(new_columns, Index(_names(df)[selected_columns]))
+    return DataFrame(new_columns, Index(_names(df)[selected_columns]), metadata(df)[selected_columns])
 end
 
 # df[:] => DataFrame
@@ -279,7 +282,7 @@ end
 function Base.getindex(df::DataFrame, row_ind::Real, col_inds::AbstractVector)
     selected_columns = index(df)[col_inds]
     new_columns = Any[dv[[row_ind]] for dv in columns(df)[selected_columns]]
-    return DataFrame(new_columns, Index(_names(df)[selected_columns]))
+    return DataFrame(new_columns, Index(_names(df)[selected_columns]), metadata(df)[selected_columns])
 end
 
 # df[MultiRowIndex, SingleColumnIndex] => AbstractVector
@@ -292,7 +295,7 @@ end
 function Base.getindex(df::DataFrame, row_inds::AbstractVector, col_inds::AbstractVector)
     selected_columns = index(df)[col_inds]
     new_columns = Any[dv[row_inds] for dv in columns(df)[selected_columns]]
-    return DataFrame(new_columns, Index(_names(df)[selected_columns]))
+    return DataFrame(new_columns, Index(_names(df)[selected_columns]), metadata(df)[selected_columns])
 end
 
 # df[:, SingleColumnIndex] => AbstractVector
@@ -305,7 +308,7 @@ Base.getindex(df::DataFrame, row_ind::Real, col_inds::Colon) = df[[row_ind], col
 # df[MultiRowIndex, :] => DataFrame
 function Base.getindex(df::DataFrame, row_inds::AbstractVector, col_inds::Colon)
     new_columns = Any[dv[row_inds] for dv in columns(df)]
-    return DataFrame(new_columns, copy(index(df)))
+    return DataFrame(new_columns, copy(index(df)), copy(metadata(df)))
 end
 
 # df[:, :] => DataFrame
@@ -344,10 +347,12 @@ function insert_single_column!(df::DataFrame,
         if typeof(col_ind) <: Symbol
             push!(index(df), col_ind)
             push!(columns(df), dv)
+            push!(metadata(df), nothing)
         else
             if ncol(df) + 1 == Int(col_ind)
                 push!(index(df), nextcolname(df))
                 push!(columns(df), dv)
+                push!(metadata(df), nothing)
             else
                 throw(ArgumentError("Cannot assign to non-existent column: $col_ind"))
             end
@@ -606,6 +611,7 @@ function Base.setindex!(df::DataFrame,
                         col_inds::Colon=Colon())
     setfield!(df, :columns, copy(columns(new_df)))
     setfield!(df, :colindex, copy(index(new_df)))
+    setfield!(df, :metadata, copy(metadata(new_df)))
     df
 end
 
@@ -709,6 +715,7 @@ function Base.insert!(df::DataFrame, col_ind::Int, item::AbstractVector, name::S
     end
     insert!(index(df), col_ind, name)
     insert!(columns(df), col_ind, item)
+    insert!(metadata(df), col_ind, nothing)
     df
 end
 
@@ -749,6 +756,7 @@ merge!(df, df2) # column z is added, column id is overwritten
 """
 function Base.merge!(df::DataFrame, others::AbstractDataFrame...)
     for other in others
+        merge!(metadata(df), metadata(other), index(df), index(other))
         for n in _names(other)
             df[n] = other[n]
         end
@@ -764,12 +772,12 @@ end
 
 # A copy of a DataFrame points to the original column vectors but
 # gets its own Index.
-Base.copy(df::DataFrame) = DataFrame(copy(columns(df)), copy(index(df)))
+Base.copy(df::DataFrame) = DataFrame(copy(columns(df)), copy(index(df)), copy(metadata(df)))
 
 # Deepcopy is recursive -- if a column is a vector of DataFrames, each of
 # those DataFrames is deepcopied.
 function Base.deepcopy(df::DataFrame)
-    DataFrame(deepcopy(columns(df)), deepcopy(index(df)))
+    DataFrame(deepcopy(columns(df)), deepcopy(index(df)), deepcopy(metadata(df)))
 end
 
 ##############################################################################
@@ -1100,9 +1108,62 @@ function permutecols!(df::DataFrame, p::AbstractVector)
         throw(ArgumentError("$p is not a valid column permutation for this DataFrame"))
     end
     permute!(columns(df), p)
+    permute!(metadata(df), p)
     setfield!(df, :colindex, Index(names(df)[p]))
 end
 
 function permutecols!(df::DataFrame, p::AbstractVector{Symbol})
     permutecols!(df, getindex.(index(df).lookup, p))
 end
+
+
+##############################################################################
+##
+## Set and Get MetaData
+##
+##############################################################################
+
+"""
+    metadata!(df::DataFrame, var::Symbol, field::Symbol, info)
+
+Store `info` under the metadata field `field` for column `var` of `df`.
+The field is created first if `df` has no metadata under that name yet.
+Returns `df`.
+"""
+function metadata!(df::DataFrame, var::Symbol, field::Symbol, info)
+    # Go through the `metadata(df)` accessor (a `getfield` wrapper), as the rest
+    # of this file does for `index(df)` and `columns(df)`, instead of `df.metadata`.
+    addmeta!(metadata(df), index(df)[var], ncol(df), field, info)
+    return df
+end
+
+"""
+    metadata(df::DataFrame, var::Symbol, field::Symbol)
+
+Return the value stored under the metadata field `field` for column `var` of `df`.
+"""
+function metadata(df::DataFrame, var::Symbol, field::Symbol)
+    return metadata(df).dict[field][index(df)[var]]
+end
+
+"""
+    showmeta(df::DataFrame, fields=collect(keys(metadata(df).dict)))
+
+Return a `DataFrame` with a `variable` column holding the column names of `df`
+plus one column per requested metadata field. `fields` is a single `Symbol` or a
+`Vector{Symbol}` and defaults to every field currently stored in the metadata.
+"""
+function showmeta(df::DataFrame,
+                  fields::Union{Symbol, Vector{Symbol}}=collect(keys(metadata(df).dict)))
+    # Normalise the single-field form without rebinding `fields` to a new type.
+    fieldlist = fields isa Symbol ? [fields] : fields
+
+    d = DataFrame(variable = names(df))
+
+    for field in fieldlist
+        # One lookup per column through the (df, var, field) accessor above.
+        d[field] = [metadata(df, var, field) for var in names(df)]
+    end
+
+    return d
+end
diff --git a/src/other/metadata.jl b/src/other/metadata.jl
new file mode 100644
index 0000000000..5d6ae4c158
--- /dev/null
+++ b/src/other/metadata.jl
@@ -0,0 +1,120 @@
+# Defining behavior for DataFrames metadata
+#
+# A `MetaData` maps each field name to a vector holding one entry per column of
+# the parent DataFrame; `nothing` is the placeholder for "no value".
+struct MetaData
+    dict::Dict{Symbol, Vector}
+end
+
+MetaData() = MetaData(Dict{Symbol,Vector}())
+
+Base.isequal(x::MetaData, y::MetaData) = isequal(x.dict, y.dict)
+Base.:(==)(x::MetaData, y::MetaData) = isequal(x, y)
+# Keep `hash` consistent with the `isequal`/`==` methods above.
+Base.hash(x::MetaData, h::UInt) = hash(x.dict, h)
+
+# The struct is immutable but its per-field vectors are not: push!, insert! and
+# permute! below mutate them in place, so a copy needs its own vectors and not
+# just its own Dict.
+Base.copy(x::MetaData) = MetaData(Dict{Symbol,Vector}(k => copy(v) for (k, v) in x.dict))
+Base.deepcopy(x::MetaData) = MetaData(deepcopy(x.dict))
+
+# Select the metadata entries for the columns `col_inds`; returns a new MetaData.
+function Base.getindex(x::MetaData, col_inds::AbstractVector)
+    return MetaData(Dict{Symbol,Vector}(k => v[col_inds] for (k, v) in x.dict))
+end
+
+# Reorder every field in place by the column permutation `p`; returns `x`.
+function Base.permute!(x::MetaData, p::AbstractVector)
+    for v in values(x.dict)
+        permute!(v, p)
+    end
+    return x
+end
+
+# Non-mutating counterpart of `permute!`; returns the permuted copy.
+# NOTE(review): defined as a package-local function because `Base` appears to
+# provide only `permute!` (no `permute` generic to extend) -- confirm against
+# the minimum supported Julia version.
+function permute(x::MetaData, p::AbstractVector)
+    return permute!(copy(x), p)
+end
+
+# Create `field` with `ncol` entries, all `nothing`; the element type leaves
+# room for values of the same type as `info`.
+function newfield!(x::MetaData, ncol::Int, field::Symbol, info)
+    x.dict[field] = Union{typeof(info), Nothing}[nothing for i in 1:ncol]
+end
+
+# Set `field` for column `col_ind` to `info`, creating the field if needed.
+function addmeta!(x::MetaData, col_ind::Int, ncol::Int, field::Symbol, info)
+    if !haskey(x.dict, field)
+        newfield!(x, ncol, field, info)
+    end
+    x.dict[field][col_ind] = info
+end
+
+# For creating a new column in the dataframe
+function Base.push!(x::MetaData, info)
+    for v in values(x.dict)
+        push!(v, info)
+    end
+    return x
+end
+
+# For inserting a new column at position `col_ind` in the dataframe
+function Base.insert!(x::MetaData, col_ind::Int, item)
+    for v in values(x.dict)
+        insert!(v, col_ind, item)
+    end
+    return x
+end
+
+# Extend `leftmeta` with the metadata of the columns that exist only on the
+# right; fields missing on either side are padded with `nothing`.
+# NOTE(review): the DataFrame-level merge! also assigns each new column through
+# insert_single_column!, which push!es `nothing` onto every field -- confirm the
+# field vectors do not end up longer than the number of columns.
+function Base.merge!(leftmeta::MetaData, rightmeta::MetaData,
+                     leftindex::Index, rightindex::Index)
+    # Find the unique columns on the right
+    right_and_not_left_names = setdiff(names(rightindex), names(leftindex))
+    right_and_not_left_cols = rightindex[right_and_not_left_names]
+    # this imitates what's going on with the parent dataframes in merge!
+    rightonly = rightmeta[right_and_not_left_cols]
+    nrightonly = length(right_and_not_left_cols)
+    # Find the difference in the keys and allocate if needed
+    notonleft = setdiff(keys(rightonly.dict), keys(leftmeta.dict))
+    notonright = setdiff(keys(leftmeta.dict), keys(rightonly.dict))
+
+    for field in notonleft
+        newfield!(leftmeta, length(leftindex), field, nothing)
+    end
+
+    for field in notonright
+        newfield!(rightonly, nrightonly, field, nothing)
+    end
+
+    # Both sides now hold the same set of fields.
+    for (key, rightvals) in rightonly.dict
+        leftmeta.dict[key] = vcat(leftmeta.dict[key], rightvals)
+    end
+    return leftmeta
+end
+
+# NOTE(review): no `append!(::MetaData, ::MetaData)` method is visible in this
+# patch, so this call presumably throws a MethodError -- TODO confirm or add one.
+function append(leftmeta::MetaData, rightmeta::MetaData)
+    append!(copy(leftmeta), rightmeta)
+end
+
+# TODO: confirm that deleting columns is handled by `getindex` above.
+# Return the value stored under `field` for column `col_ind`; errors if the
+# field does not exist.
+function getmeta(x::MetaData, col_ind::Int, field::Symbol)
+    if haskey(x.dict, field)
+        return x.dict[field][col_ind]
+    else
+        error("The field does not exist")
+    end
+end
diff --git a/test/metadata.jl b/test/metadata.jl
new file mode 100644
index 0000000000..95e7fdde2e
--- /dev/null
+++ b/test/metadata.jl
@@ -0,0 +1,29 @@
+module TestMetaData
+    using Compat, Compat.Test, DataFrames, StatsBase, Compat.Random
+    using Suppressor
+    using Compat: @warn
+
+df1 = DataFrame(a = [1, 2], b = [3, 4])
+df2 = DataFrame(c = [3, 4], d = [5, 6])
+
+# Just used to add metadata easily for testing.
+metadata!(df1, :a, :label, "A label for variable a")
+
+testdata = DataFrame(variable = names(df1), label =
+    ["A label for variable a",
+     nothing])
+
+@test showmeta(df1) == testdata
+
+mergeddata = merge!(df1, df2)
+# One label entry per column of the merged frame (a, b, c, d).
+testmergeddata = DataFrame(variable = names(mergeddata),
+    label =
+    ["A label for variable a",
+     nothing,
+     nothing,
+     nothing])
+
+@test showmeta(mergeddata) == testmergeddata
+
+end # module TestMetaData