Skip to content

Commit

Permalink
grouping: replace groupsort by _group_rows()
Browse files Browse the repository at this point in the history
  • Loading branch information
alyst committed Aug 7, 2015
1 parent 8951261 commit df8411e
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 41 deletions.
41 changes: 14 additions & 27 deletions src/groupeddataframe/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ groupby(cols)
### Arguments
* `d` : an AbstractDataFrame
* `cols` : an
* `cols` : data frame columns to group by
If `d` is not provided, a curried version of groupby is given.
Expand Down Expand Up @@ -82,32 +82,19 @@ df |> groupby([:a, :b]) |> [sum, length]
"""
function groupby{T}(d::AbstractDataFrame, cols::Vector{T})
# Group the rows of `d` by the columns named in `cols` and return a
# GroupedDataFrame built from a row permutation (`idx`) plus per-group
# start/end offsets into that permutation.
#
# NOTE(review): this body appears to interleave two implementations (the
# pre- and post-change lines of a diff): `idx`/`starts` are built twice and
# `ends`/`GroupedDataFrame(...)` appear twice, and the `for`/`end` nesting
# does not balance. Read the two variants below separately.
#
# --- Variant 1: integer encoding of pooled refs + groupsort_indexer ---
## a subset of Wes McKinney's algorithm here:
## http://wesmckinney.com/blog/?p=489

ncols = length(cols)
# use the pool trick to get a set of integer references for each unique item
dv = PooledDataArray(d[cols[ncols]])
# if there are NAs, add 1 to the refs to avoid underflows in x later
dv_has_nas = (findfirst(dv.refs, 0) > 0 ? 1 : 0)
# x starts as the (shifted) refs of the last grouping column
x = copy(dv.refs) .+ dv_has_nas
# also compute the number of groups, which is the product of the set lengths
ngroups = length(dv.pool) + dv_has_nas
# if there's more than 1 column, do roughly the same thing repeatedly
for j = (ncols - 1):-1:1
dv = PooledDataArray(d[cols[j]])
dv_has_nas = (findfirst(dv.refs, 0) > 0 ? 1 : 0)
for i = 1:nrow(d)
# fold column j into the code: its (zero-based) ref is scaled by the number
# of codes used by the columns already processed
x[i] += (dv.refs[i] + dv_has_nas- 1) * ngroups
end
ngroups = ngroups * (length(dv.pool) + dv_has_nas)
# TODO if ngroups is really big, shrink it
# --- Variant 2: group rows via _group_rows() ---
# presumably `_group_rows` returns a mapping from a group key to the indices
# of the rows in that group (it is iterated by key and its values are
# appended to `idx` below) -- TODO confirm against its definition
d_groups = _group_rows(d[cols])
# sort the groups
d_group_keys = sort!(collect(keys(d_groups)))
# generate permutation that arranges rows by groups
idx = sizehint!(@compat(Vector{Int}()), nrow(d))
starts = sizehint!(@compat(Vector{Int}()), length(d_groups))
for gr_row in d_group_keys
# the next group begins right after everything appended to idx so far
push!(starts, length(idx)+1)
append!(idx, d_groups[gr_row])
end
# (Variant 1) NOTE(review): this reassigns `idx` and `starts`, discarding the
# values built by the loop above if both variants were ever run together
(idx, starts) = DataArrays.groupsort_indexer(x, ngroups)
# Remove zero-length groupings
starts = _uniqueofsorted(starts)
# (Variant 1) each group ends one position before the next start; the last
# entry of `starts` is dropped when constructing the result -- presumably it
# is a one-past-the-end sentinel, TODO confirm
ends = starts[2:end] - 1
GroupedDataFrame(d, cols, idx, starts[1:end-1], ends)
# (Variant 2) same rule for all but the last group, which ends at the final
# position of `idx`
ends = push!(starts[2:end] - 1, length(idx))
GroupedDataFrame(d, cols, idx, starts, ends)
end
# Convenience method: wrap `cols` in a one-element vector and delegate to
# the `groupby` method that takes a vector of columns.
function groupby(d::AbstractDataFrame, cols)
    return groupby(d, [cols])
end

Expand Down Expand Up @@ -284,7 +271,7 @@ notation can be used.
### Returns
* `::DataFrame`
* `::DataFrame`
### Examples
Expand Down Expand Up @@ -330,7 +317,7 @@ same length.
### Returns
* `::DataFrame`
* `::DataFrame`
### Examples
Expand Down
15 changes: 1 addition & 14 deletions src/other/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -162,19 +162,6 @@ function _setdiff{T}(a::AbstractVector{T}, b::T)
diff
end

# Return a new vector holding the elements of `x` with runs of consecutive
# equal values (compared with `==`) collapsed to their first element.
# For a sorted `x` this yields its unique values in order.
# An empty `x` yields an empty vector of the same type.
function _uniqueofsorted(x::Vector)
    # Without this guard `x[1]` below throws a BoundsError on empty input.
    isempty(x) && return copy(x)
    keep = fill(true, length(x))
    lastx = x[1]
    for i = 2:length(x)
        if lastx == x[i]
            # same value as the current run: drop it
            keep[i] = false
        else
            # a new run starts here
            lastx = x[i]
        end
    end
    return x[keep]
end

# Gets the name of a function. Used in groupedataframe/grouping.jl
function _fnames(fs::Vector{Function})
λcounter = 0
Expand All @@ -188,4 +175,4 @@ function _fnames(fs::Vector{Function})
name
end
names
end
end

0 comments on commit df8411e

Please sign in to comment.