Skip to content

Commit

Permalink
Clean up indicators
Browse files Browse the repository at this point in the history
* sparse is a keyword argument
* remove special handling for numerical types
  • Loading branch information
AlexanderFabisch committed Sep 5, 2013
1 parent b6214e3 commit 2a6adf8
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 65 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Basic statistics functions for Julia
* `mad(a)`: Compute the median absolute deviation of `a` with a correction factor, which ensures that the MAD will be a consistent estimator of the standard deviation for normally distributed data.
* `midrange(a)`: Compute the midpoint of the range of `a` (e.g. `(max(a) + min(a)) / 2`).
* `modes(a)`: Compute all modes of `a`. Be warned that every element of an array with no repeated elements is considered a mode.
* `indicators(a)`: Encode categories using the one-hot scheme, also known as one-of-C encoding, indicator matrix or dummy variables. Optionally, you can provide either a range min:max for categories that are encoded as integers (so that min will be encoded as [1 0 ...] and max as [... 0 1]) or a list of possible values, e.g. ["A", "B", "C"].
* `indicators(a)`: Encode categories using the one-hot scheme, also known as one-of-C encoding, indicator matrix or dummy variables. Optionally, you can provide a list of possible values, e.g. ["A", "B", "C"] or [1:3].
* `percentile(a)`: Compute the percentiles (0%, 10%, ..., 100%) of `a`.
* `quantile(a)`: Compute any desired quantile of `a`.
* `quartile(a)`: Compute the quartiles of `a`.
Expand Down
66 changes: 6 additions & 60 deletions src/others.jl
Original file line number Diff line number Diff line change
Expand Up @@ -97,81 +97,27 @@ function ecdf{T}(X::AbstractVector{T})
return e
end

# One-hot encode every feature (row) of a real-valued matrix.
#
# `input` is read as nfeatures x nsamples. `categories` holds one entry per
# feature; when it is empty, each feature gets a range built from `minmax` of
# that feature's row. The result has one row per category (features stacked in
# order) and one column per sample, with element type `T`. Pass `sparse=true`
# to build the result with `spzeros` instead of `zeros`.
#
# Raises an error when `categories` is neither empty nor of length nfeatures.
function indicators{T<:Real}(input::AbstractMatrix{T},
                             categories::Array{Any, 1}={},
                             sparse::Bool=false)
    nfeatures, nsamples = size(input)
    ngiven = length(categories)
    if !(ngiven == 0 || ngiven == nfeatures)
        error("You must provide either categories for each feature or no categories")
    end

    # Work on a copy so the caller's category list is never extended.
    cats = copy(categories)
    if length(cats) != nfeatures
        for row in 1:nfeatures
            lo, hi = minmax(input[row, :])
            push!(cats, lo:hi)
        end
    end

    # Total output height: one row per category of every feature.
    total = 0
    for row in 1:nfeatures
        total += length(cats[row])
    end

    output = sparse ? spzeros(T, total, nsamples) : zeros(T, total, nsamples)

    # Fill each feature's band of rows; `firstrow` is where the band starts.
    firstrow = 1
    for row in 1:nfeatures
        indicators!(output, firstrow, slice(input, row, :), cats[row])
        firstrow += length(cats[row])
    end
    return output
end

# One-hot encode a real-valued vector.
#
# Returns a length(categories) x length(input) matrix of element type `T`,
# allocated with `spzeros` when `sparse` is true and with `zeros` otherwise,
# and filled in by `indicators!` starting at row 1. `categories` defaults to
# the range min(input):max(input).
function indicators{T<:Real}(input::AbstractVector{T},
                             categories::Range1{T}=min(input):max(input),
                             sparse::Bool=false)
    nrows = length(categories)
    ncols = length(input)
    output = sparse ? spzeros(T, nrows, ncols) : zeros(T, nrows, ncols)
    indicators!(output, 1, input, categories)
    return output
end

# Write the one-hot encoding of `input` into `output`, in place.
#
# For sample `i`, the entry in column `i` and row
# `input[i] - categories[1] + offset` is set to one(T); i.e. the first
# category lands on row `offset`. Nothing else in `output` is touched and
# nothing is returned.
function indicators!{T<:Real}(output::AbstractArray{T},
                              offset::Integer,
                              input::AbstractVector{T},
                              categories::Range1{T}=min(input):max(input))
    # Constant shift that maps a category value to its output row.
    shift = offset - categories[1]
    col = 1
    while col <= length(input)
        output[input[col] + shift, col] = one(T)
        col += 1
    end
    return
end

function indicators{T}(input::AbstractMatrix{T},
categories::Array{Any, 1}={},
categories::Array{Any,1}={};
sparse::Bool=false)
nfeatures, nsamples = size(input)
if length(categories) != 0 && length(categories) != nfeatures
error("You must provide either categories for each feature or no categories")
end
internal_categories = copy(categories)
nOutputRows = 0
noutrows = 0
if length(internal_categories) != nfeatures
for i in 1:nfeatures
push!(internal_categories, sort(unique(input[i, :])))
end
end
for i in 1:nfeatures
nOutputRows += length(internal_categories[i])
noutrows += length(internal_categories[i])
end
if sparse
output = spzeros(nOutputRows, nsamples)
output = spzeros(noutrows, nsamples)
else
output = zeros(nOutputRows, nsamples)
output = zeros(noutrows, nsamples)
end
offset = 1
for i in 1:nfeatures
Expand All @@ -182,7 +128,7 @@ function indicators{T}(input::AbstractMatrix{T},
end

function indicators{T}(input::AbstractVector{T},
categories::Array{T,1}=sort(unique(input)),
categories::Array{T,1}=sort(unique(input));
sparse::Bool=false)
if sparse
output = spzeros(length(categories), length(input))
Expand Down
8 changes: 4 additions & 4 deletions test/01.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,18 @@ fnecdf = ecdf([0.5])
y = [1, 2, 1, 3, 2]
expected = [1 0 0; 0 1 0; 1 0 0; 0 0 1; 0 1 0]'
@test indicators(y) == expected
@test indicators(y, 1:3, true) == expected
@test indicators(y, [1:3], sparse=true) == expected
y = [2, 3, 2, 4, 3]
@test indicators(y) == expected
X = [1 2 3; 1 1 1; 2 1 1]
expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 1 1; 1 0 0]
@test indicators(X) == expected
expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 0 0; 0 0 0; 0 1 1; 1 0 0]
@test indicators(X, {1:3, 1:3, 1:2}) == expected
@test indicators(X, {[1:3], [1:3], [1:2]}) == expected
y = ["A", "B", "C", "B", "A"]
expected = [1.0 0.0 0.0 0.0; 0.0 1.0 0.0 0.0; 0.0 0.0 1.0 0.0; 0.0 1.0 0.0 0.0; 1.0 0.0 0.0 0.0]'
@test indicators(y, ["A", "B", "C", "D"], true) == expected
@test indicators(y, ["A", "B", "C", "D"], sparse=true) == expected
X = ["A" "B" "C"; "B" "A" "C"]
cats = ["A", "B", "C", "D"]
expected = [1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0; 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0; 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0]'
@test indicators(X, {cats, cats}, false) == expected
@test indicators(X, {cats, cats}, sparse=false) == expected

0 comments on commit 2a6adf8

Please sign in to comment.