-
Notifications
You must be signed in to change notification settings - Fork 368
/
subdataframe.jl
399 lines (351 loc) · 16.2 KB
/
subdataframe.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
"""
    SubDataFrame{<:AbstractDataFrame, <:AbstractIndex, <:AbstractVector{Int}} <: AbstractDataFrame

A view of an `AbstractDataFrame`. It is returned by a call to the `view` function
on an `AbstractDataFrame` if a collection of rows and columns is specified.

A `SubDataFrame` is an `AbstractDataFrame`, so expect that most
DataFrame functions should work. Such methods include `describe`,
`summary`, `nrow`, `size`, `by`, `stack`, and `join`.

If the selection of columns in a parent data frame is passed as `:` (a colon)
then `SubDataFrame` will always have all columns from the parent,
even if they are added or removed after its creation.

# Examples
```jldoctest
julia> df = DataFrame(a=repeat([1, 2, 3, 4], outer=[2]),
b=repeat([2, 1], outer=[4]),
c=1:8)
8×3 DataFrame
Row │ a b c
│ Int64 Int64 Int64
─────┼─────────────────────
1 │ 1 2 1
2 │ 2 1 2
3 │ 3 2 3
4 │ 4 1 4
5 │ 1 2 5
6 │ 2 1 6
7 │ 3 2 7
8 │ 4 1 8
julia> sdf1 = view(df, :, 2:3) # column subsetting
8×2 SubDataFrame
Row │ b c
│ Int64 Int64
─────┼──────────────
1 │ 2 1
2 │ 1 2
3 │ 2 3
4 │ 1 4
5 │ 2 5
6 │ 1 6
7 │ 2 7
8 │ 1 8
julia> sdf2 = @view df[end:-1:1, [1, 3]] # row and column subsetting
8×2 SubDataFrame
Row │ a c
│ Int64 Int64
─────┼──────────────
1 │ 4 8
2 │ 3 7
3 │ 2 6
4 │ 1 5
5 │ 4 4
6 │ 3 3
7 │ 2 2
8 │ 1 1
julia> sdf3 = groupby(df, :a)[1] # indexing a GroupedDataFrame returns a SubDataFrame
2×3 SubDataFrame
Row │ a b c
│ Int64 Int64 Int64
─────┼─────────────────────
1 │ 1 2 1
2 │ 1 2 5
```
"""
struct SubDataFrame{D<:AbstractDataFrame, S<:AbstractIndex, T<:AbstractVector{Int}} <: AbstractDataFrame
    parent::D # the data frame this view refers to; read through `Base.parent`
    colindex::S # column index of the view; read through `index`
    rows::T # maps from subdf row indexes to parent row indexes
end
# Safety net: DataFrames.jl code should never reach this method; it only makes sure that
# an attempt to wrap a SubDataFrame in another SubDataFrame fails with a clear error
function SubDataFrame(parent::SubDataFrame, colindex::AbstractIndex,
                      rows::AbstractVector{Int})
    throw(ArgumentError("Creation of a SubDataFrame from a SubDataFrame is not allowed"))
end
# Build a view of `parent` from a vector of `Int` row indices and a column selector.
# Unless bounds checking is elided, throws `BoundsError` when `rows` does not fit
# into the row axis of `parent`.
Base.@propagate_inbounds function SubDataFrame(parent::DataFrame, rows::AbstractVector{Int}, cols)
    @boundscheck begin
        checkindex(Bool, axes(parent, 1), rows) || throw(BoundsError(parent, (rows, cols)))
    end
    colindex = SubIndex(index(parent), cols)
    return SubDataFrame(parent, colindex, rows)
end
# `:` as row selector: take the whole row axis of `parent`
Base.@propagate_inbounds function SubDataFrame(parent::DataFrame, ::Colon, cols)
    all_rows = axes(parent, 1)
    return SubDataFrame(parent, all_rows, cols)
end
# A single integer is rejected as a row selector of a data frame view
@inline function SubDataFrame(parent::DataFrame, row::Integer, cols)
    throw(ArgumentError("invalid row index: $row of type $(typeof(row))"))
end
# Row selector with an integer element type other than `Int`: reject any `Bool`
# element, then convert the indices to `Vector{Int}` and delegate.
Base.@propagate_inbounds function SubDataFrame(parent::DataFrame, rows::AbstractVector{<:Integer}, cols)
    for r in rows
        r isa Bool && throw(ArgumentError("invalid row index of type `Bool`"))
    end
    int_rows = convert(Vector{Int}, rows)
    return SubDataFrame(parent, int_rows, cols)
end
# Boolean mask as row selector: the mask must have one entry per row of `parent`;
# the rows passed on are those returned by `_findall(rows)`.
Base.@propagate_inbounds function SubDataFrame(parent::DataFrame, rows::AbstractVector{Bool}, cols)
    length(rows) == nrow(parent) ||
        throw(ArgumentError("invalid length of `AbstractVector{Bool}` row index " *
                            "(got $(length(rows)), expected $(nrow(parent)))"))
    return SubDataFrame(parent, _findall(rows), cols)
end
# Fallback for row selectors with a non-specific element type: every element has to be
# a non-`Bool` `Integer`; the indices are then converted to `Vector{Int}`.
Base.@propagate_inbounds function SubDataFrame(parent::DataFrame, rows::AbstractVector, cols)
    for r in rows
        if !(r isa Integer) || r isa Bool
            throw(ArgumentError("only `Integer` indices are accepted in `rows`"))
        end
    end
    int_rows = convert(Vector{Int}, rows)
    return SubDataFrame(parent, int_rows, cols)
end
# View of a view: translate `rowind` and `cols` to the parent data frame so that
# the result refers to the parent directly.
Base.@propagate_inbounds function SubDataFrame(sdf::SubDataFrame, rowind, cols)
    # when sdf has no columns the row selector is applied to an empty vector
    # instead of the stored row mapping
    if ncol(sdf) == 0
        selected = (Int[])[rowind]
    else
        selected = rows(sdf)[rowind]
    end
    return SubDataFrame(parent(sdf), selected, parentcols(index(sdf), cols))
end
# A scalar `Bool` is never a valid row selector
Base.@propagate_inbounds function SubDataFrame(sdf::SubDataFrame, rowind::Bool, cols)
    throw(ArgumentError("invalid row index of type Bool"))
end
# View of a view selecting all columns of `sdf`.
Base.@propagate_inbounds function SubDataFrame(sdf::SubDataFrame, rowind, ::Colon)
    # when sdf has no columns the row selector is applied to an empty vector
    # instead of the stored row mapping
    if ncol(sdf) == 0
        selected = (Int[])[rowind]
    else
        selected = rows(sdf)[rowind]
    end
    # an `Index` (as opposed to a `SubIndex`) means sdf was created using : as
    # column selector; in that case `:` is passed on unchanged
    index(sdf) isa Index && return SubDataFrame(parent(sdf), selected, :)
    return SubDataFrame(parent(sdf), selected, parentcols(index(sdf), :))
end
# A scalar `Bool` is never a valid row selector (variant for `:` as column selector)
Base.@propagate_inbounds function SubDataFrame(sdf::SubDataFrame, rowind::Bool, ::Colon)
    throw(ArgumentError("invalid row index of type Bool"))
end
# All rows of `sdf`, selected columns: reuse the stored row mapping and translate
# `cols` to parent columns
Base.@propagate_inbounds function SubDataFrame(sdf::SubDataFrame, ::Colon, cols)
    return SubDataFrame(parent(sdf), rows(sdf), parentcols(index(sdf), cols))
end
# Selecting everything from a view returns the very same view object
@inline function SubDataFrame(sdf::SubDataFrame, ::Colon, ::Colon)
    return sdf
end
# Field accessors; `getfield` is used to read the struct fields directly.

# Row mapping of the view (the `rows` field)
function rows(sdf::SubDataFrame)
    return getfield(sdf, :rows)
end

# Data frame the view refers to (the `parent` field)
function Base.parent(sdf::SubDataFrame)
    return getfield(sdf, :parent)
end

# Tuple of the row mapping and of `parentcols` applied to the view's column index
function Base.parentindices(sdf::SubDataFrame)
    return (rows(sdf), parentcols(index(sdf)))
end
# Single column selector: fetch the column with `!` and return a view of its rows
Base.@propagate_inbounds function Base.view(adf::AbstractDataFrame, rowinds, colind::ColumnIndex)
    column = adf[!, colind]
    return view(column, rowinds)
end
# `!` as row selector with a single column: a view over all elements of the column
Base.@propagate_inbounds function Base.view(adf::AbstractDataFrame, ::typeof(!), colind::ColumnIndex)
    column = adf[!, colind]
    return view(column, :)
end
# A `Bool` is rejected as a column selector
@inline function Base.view(adf::AbstractDataFrame, rowinds, colind::Bool)
    throw(ArgumentError("invalid column index $colind of type `Bool`"))
end
# Multiple column selector: the result is a `SubDataFrame`
Base.@propagate_inbounds function Base.view(adf::AbstractDataFrame, rowinds,
                                            colinds::MultiColumnIndex)
    return SubDataFrame(adf, rowinds, colinds)
end
# `!` as row selector with multiple columns is handled exactly like `:`
Base.@propagate_inbounds function Base.view(adf::AbstractDataFrame, rowinds::typeof(!),
                                            colinds::MultiColumnIndex)
    return SubDataFrame(adf, :, colinds)
end
# `Not` row selector: resolve it against the row axis of `adf` first
Base.@propagate_inbounds function Base.view(adf::AbstractDataFrame, rowinds::Not,
                                            colinds::MultiColumnIndex)
    kept_rows = axes(adf, 1)[rowinds]
    return SubDataFrame(adf, kept_rows, colinds)
end
##############################################################################
##
## AbstractDataFrame interface
##
##############################################################################
# Column index of the view (the `colindex` field)
function index(sdf::SubDataFrame)
    return getfield(sdf, :colindex)
end
# Number of rows of the view; a view without columns always reports 0 rows,
# whatever the length of its row mapping
function nrow(sdf::SubDataFrame)
    ncol(sdf) > 0 || return 0
    return length(rows(sdf))::Int
end
# Single cell: take the parent column, then index it with the mapped row
Base.@propagate_inbounds function Base.getindex(sdf::SubDataFrame, rowind::Integer, colind::ColumnIndex)
    column = parent(sdf)[!, parentcols(index(sdf), colind)]
    return column[rows(sdf)[rowind]]
end
# A scalar `Bool` is never a valid row index
Base.@propagate_inbounds function Base.getindex(sdf::SubDataFrame, rowind::Bool, colind::ColumnIndex)
    throw(ArgumentError("invalid row index of type Bool"))
end
# Several rows of a single column: take the parent column, then index it with
# the mapped rows
Base.@propagate_inbounds function Base.getindex(sdf::SubDataFrame,
                                                rowinds::Union{AbstractVector, Not},
                                                colind::ColumnIndex)
    column = parent(sdf)[!, parentcols(index(sdf), colind)]
    return column[rows(sdf)[rowinds]]
end
# `sdf[:, col]`: index the parent with the stored row mapping and the parent column
Base.@propagate_inbounds function Base.getindex(sdf::SubDataFrame, ::Colon, colind::ColumnIndex)
    return parent(sdf)[rows(sdf), parentcols(index(sdf), colind)]
end
# `sdf[!, col]`: a view into the parent column restricted to the rows of sdf
Base.@propagate_inbounds function Base.getindex(sdf::SubDataFrame, ::typeof(!), colind::ColumnIndex)
    return view(parent(sdf), rows(sdf), parentcols(index(sdf), colind))
end
# Several rows and several columns: index the parent with translated selectors
Base.@propagate_inbounds function Base.getindex(sdf::SubDataFrame,
                                                rowinds::Union{AbstractVector, Not},
                                                colinds::MultiColumnIndex)
    # when sdf has no columns the row selector is applied to an empty vector
    # instead of the stored row mapping
    if ncol(sdf) == 0
        selected = (Int[])[rowinds]
    else
        selected = rows(sdf)[rowinds]
    end
    return parent(sdf)[selected, parentcols(index(sdf), colinds)]
end
# `sdf[:, cols]`: index the parent with the stored row mapping and the parent columns
Base.@propagate_inbounds function Base.getindex(sdf::SubDataFrame, ::Colon,
                                                colinds::MultiColumnIndex)
    return parent(sdf)[rows(sdf), parentcols(index(sdf), colinds)]
end
# `sdf[!, cols]`: resolve the selector against the view's index and delegate to
# `select` with `copycols=false`
Base.@propagate_inbounds function Base.getindex(sdf::SubDataFrame, row_ind::typeof(!),
                                                col_inds::MultiColumnIndex)
    resolved = index(sdf)[col_inds]
    return select(sdf, resolved, copycols=false)
end
# Cartesian indexing: split the index into (row, column) and forward
Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, idx::CartesianIndex{2})
    row, col = Tuple(idx)
    return setindex!(sdf, val, row, col)
end
# `sdf[:, colinds] = val`
#
# If `colinds` names a column that does not exist in sdf yet, a new column is added
# to the parent: it has `missing` in all rows outside of the view and `val` in the rows
# of the view. Otherwise the assignment is forwarded to the parent data frame.
# Returns `sdf`.
Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::Colon, colinds::Any)
    creates_column = colinds isa SymbolOrString && columnindex(sdf, colinds) == 0
    if !creates_column
        parent(sdf)[rows(sdf), parentcols(index(sdf), colinds)] = val
        return sdf
    end

    is_column_insertion_allowed(sdf) ||
        throw(ArgumentError("creating new columns in a SubDataFrame that subsets " *
                            "columns of its parent data frame is disallowed"))
    if !(val isa AbstractVector) || nrow(sdf) != length(val)
        throw(ArgumentError("Assigned value must be a vector with length " *
                            "equal to number of rows in the SubDataFrame"))
    end

    # the element type is widened with Missing, as rows of the parent that are
    # not part of the view are filled with missing
    filled = similar(val, Union{eltype(val), Missing}, nrow(parent(sdf)))
    fill!(filled, missing)
    filled[rows(sdf)] = val
    parent(sdf)[!, colinds] = filled
    return sdf
end
# TODO: in the future, when refactoring source code
# (presumably when we would first define all the types that the package provides
# and then define methods for them)
# consider merging SubDataFrame and DataFrame setindex! methods
# `sdf[!, col_ind] = v`
#
# For a column name not present in sdf this adds a new column (delegating to
# `sdf[:, col_ind] = v`). For an existing column a fresh column is allocated in the
# parent, with element type `promote_type` of the old and the new element types,
# holding the old values with the rows of the view overwritten by `v`.
# Returns `sdf`.
function Base.setindex!(sdf::SubDataFrame, v::AbstractVector,
                        ::typeof(!), col_ind::ColumnIndex)
    if col_ind isa Union{Signed, Unsigned} && !(1 <= col_ind <= ncol(sdf))
        throw(ArgumentError("Cannot assign to non-existent column: $col_ind"))
    end

    if col_ind isa SymbolOrString && columnindex(sdf, col_ind) == 0
        is_column_insertion_allowed(sdf) ||
            throw(ArgumentError("creating new columns in a SubDataFrame that subsets " *
                                "columns of its parent data frame is disallowed"))
        sdf[:, col_ind] = v
        return sdf
    end

    pdf = parent(sdf)
    p_col_ind = parentcols(index(sdf), col_ind)
    old_col = pdf[!, p_col_ind]
    widened = promote_type(eltype(old_col), eltype(v))
    newcol = similar(old_col, widened, length(old_col))
    newcol .= old_col
    newcol[rows(sdf)] = v
    pdf[!, p_col_ind] = newcol
    return sdf
end
# Define `sdf[!, cols] = value` for every multi-column selector type listed in
# MULTICOLUMNINDEX_TUPLE, both for a data frame and for a matrix on the right-hand side.
# In both cases the assignment is performed column by column via `sdf[!, col] = ...`.
for T in MULTICOLUMNINDEX_TUPLE
    @eval begin
        # data frame source: column names must match the selected columns exactly
        function Base.setindex!(sdf::SubDataFrame,
                                new_df::AbstractDataFrame,
                                row_inds::typeof(!),
                                col_inds::$T)
            idxs = index(sdf)[col_inds]
            view(_names(sdf), idxs) == _names(new_df) ||
                throw(ArgumentError("Column names in source and target data frames do not match"))
            for (j, col) in enumerate(idxs)
                sdf[!, col] = new_df[!, j]
            end
            return sdf
        end

        # matrix source: it must have one column per selected column
        function Base.setindex!(sdf::SubDataFrame,
                                mx::AbstractMatrix,
                                row_inds::typeof(!),
                                col_inds::$T)
            idxs = index(sdf)[col_inds]
            size(mx, 2) == length(idxs) ||
                throw(DimensionMismatch("number of selected columns ($(length(idxs))) " *
                                        "and number of columns in " *
                                        "matrix ($(size(mx, 2))) do not match"))
            for (j, col) in enumerate(idxs)
                sdf[!, col] = view(mx, :, j)
            end
            return sdf
        end
    end
end
# Generic assignment: translate both selectors to the parent and assign there.
# Returns `sdf`.
Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, rowinds::Any, colinds::Any)
    target_rows = rows(sdf)[rowinds]
    target_cols = parentcols(index(sdf), colinds)
    parent(sdf)[target_rows, target_cols] = val
    return sdf
end
# A scalar `Bool` is never a valid row index
Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, rowinds::Bool, colinds::Any)
    throw(ArgumentError("invalid row index of type Bool"))
end
# `sdf.col = v` is the same as `sdf[!, col] = v` when `v` is a vector;
# the assigned vector is the value of the call
function Base.setproperty!(df::SubDataFrame, col_ind::Symbol, v::AbstractVector)
    df[!, col_ind] = v
    return v
end

function Base.setproperty!(df::SubDataFrame, col_ind::AbstractString, v::AbstractVector)
    df[!, col_ind] = v
    return v
end

# anything that is not a vector is rejected
function Base.setproperty!(::SubDataFrame, col_ind::Symbol, v::Any)
    throw(ArgumentError("It is only allowed to pass a vector as a column of a SubDataFrame. " *
                        "Instead use `df[!, col_ind] .= v` if you want to use broadcasting."))
end

function Base.setproperty!(::SubDataFrame, col_ind::AbstractString, v::Any)
    throw(ArgumentError("It is only allowed to pass a vector as a column of a SubDataFrame. " *
                        "Instead use `df[!, col_ind] .= v` if you want to use broadcasting."))
end
##############################################################################
##
## Miscellaneous
##
##############################################################################
# Copy of a view: index the parent with the rows and all parent columns of the view
function Base.copy(sdf::SubDataFrame)
    parent_cols = parentcols(index(sdf), :)
    return parent(sdf)[rows(sdf), parent_cols]
end
# Rows cannot be deleted through a view
function Base.deleteat!(df::SubDataFrame, ind)
    throw(ArgumentError("SubDataFrame does not support deleting rows"))
end
# Convert a view to a `DataFrame`.
# With `copycols=true` (the default) the result is `sdf[:, :]`; otherwise the data frame
# is assembled from the columns produced by `eachcol(sdf)` with `copycols=false`
# and `_copy_all_note_metadata!` is applied to it.
function DataFrame(sdf::SubDataFrame; copycols::Bool=true)
    copycols && return sdf[:, :]
    new_df = DataFrame(collect(eachcol(sdf)), _names(sdf), copycols=false)
    _copy_all_note_metadata!(new_df, sdf)
    return new_df
end
# Conversion to `DataFrame` goes through the constructor with default settings
function Base.convert(::Type{DataFrame}, sdf::SubDataFrame)
    return DataFrame(sdf)
end
# Tell whether columns may be added to/removed from/reordered in the passed data frame.
# This is always allowed for a DataFrame. For a SubDataFrame it is currently allowed only
# when it was created with : as column selector, which makes it use an Index as its index
# (other column selectors result in a SubIndex).
# Any other data frame type is rejected with an ArgumentError.
is_column_insertion_allowed(df::DataFrame) = true
is_column_insertion_allowed(df::SubDataFrame) = getfield(df, :colindex) isa Index
is_column_insertion_allowed(df::AbstractDataFrame) =
    throw(ArgumentError("Unsupported data frame type"))
# Write all columns of `newdf` into `sdf` (and therefore into its parent) and
# synchronize the parent's columns and metadata with `newdf`.
#
# With `keep_present=true` columns already present in sdf are kept, so the names of
# newdf only have to be a subset of the names of sdf; with `keep_present=false` they have
# to be identical. If this requirement is not met the operation is still performed
# provided that sdf allows column insertion; otherwise an ArgumentError is thrown.
# Returns `sdf`.
function _replace_columns!(sdf::SubDataFrame, newdf::DataFrame; keep_present::Bool)
    colsmatch = keep_present ? issubset(_names(newdf), _names(sdf)) :
                               _names(newdf) == _names(sdf)

    if !colsmatch && !is_column_insertion_allowed(sdf)
        throw(ArgumentError("changing the sequence of column names in a SubDataFrame " *
                            "that subsets columns of its parent data frame is disallowed"))
    end

    # each assignment allocates a fresh column in parent(sdf)
    for colname in _names(newdf)
        sdf[!, colname] = newdf[!, colname]
    end

    # When called with keep_present=true (the transform case) nothing more has to be
    # done to the set of columns: everything previously present is kept and the
    # column order is already correct.
    # Otherwise a mismatch of column names means that newdf, in a way that was allowed:
    #   1. has some new columns, or
    #   2. has the existing columns reordered, or
    #   3. has some columns dropped.
    # The parent of sdf is then updated in place so that it holds exactly
    # the target columns in the correct order.
    pdf = parent(sdf)
    if !(keep_present || colsmatch)
        @assert pdf isa DataFrame
        select!(pdf, _names(newdf))
    end

    # column-level metadata: clear what the parent has for the columns of newdf,
    # then copy over the :note-style entries of newdf
    for col in _names(newdf)
        emptycolmetadata!(pdf, col)
    end
    for (col, col_keys) in colmetadatakeys(newdf)
        hasproperty(pdf, col) || continue
        for key in col_keys
            val, style = colmetadata(newdf, col, key, style=true)
            if style === :note
                colmetadata!(pdf, col, key, val, style=:note)
            end
        end
    end

    _drop_all_nonnote_metadata!(pdf)
    return sdf
end
# _try_select_no_copy selects cols from sdf, avoiding a copy of the data when possible;
# if cols cannot be resolved against the index of sdf (i.e. it is not a simple column
# selector) a regular, copying `select` is performed instead
function _try_select_no_copy(sdf::SubDataFrame, cols)
    # `cols` could be an AbstractVector, in which case its validity as a column
    # selector cannot be checked statically; hence the try block
    colsidx = try
        index(sdf)[cols]
    catch
        nothing
    end
    colsidx === nothing && return select(sdf, cols)
    return select(sdf, colsidx, copycols=false)
end