diff --git a/README.md b/README.md index 93b578f3a95c0c..02dd68eb5cf421 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Basic statistics functions for Julia * `mad(a)`: Compute the median absolute deviation of `a` with a correction factor, which ensures that the MAD will be a consistent estimator of the mean for normally distributed data. * `midrange(a)`: Compute the mid point of the range of `a` (e.g `(max(a) + min(a) / 2)`). * `modes(a)`: Compute all modes of `a`. Be warned that every element of an array with no repeated elements is considered a mode. -* `indicators(a)`: Encode categories using one-hot scheme aka one-of-C encoding, indicator matrix or dummy variables. Optionally, you can provide either a range min:max for categories that are encoded as integers (so that min will be encoded as [1 0 ...] and max as [... 0 1]) or a list of possible values, e.g. ["A", "B, "C"]. +* `indicators(a)`: Encode categories using one-hot scheme aka one-of-C encoding, indicator matrix or dummy variables. Optionally, you can provide a list of possible values, e.g. ["A", "B", "C"] or [1:3]. * `percentile(a)`: Compute the percentiles (0%, 10%, ..., 100%) of `a`. * `quantile(a)`: Compute any desired quantile of `a`. * `quartile(a): Compute the quartiles of `a`. 
diff --git a/src/others.jl b/src/others.jl index a3db7f666e861b..86e2e53907286b 100644 --- a/src/others.jl +++ b/src/others.jl @@ -97,81 +97,27 @@ function ecdf{T}(X::AbstractVector{T}) return e end -function indicators{T<:Real}(input::AbstractMatrix{T}, - categories::Array{Any, 1}={}, - sparse::Bool=false) - nfeatures, nsamples = size(input) - if length(categories) != 0 && length(categories) != nfeatures - error("You must provide either categories for each feature or no categories") - end - internal_categories = copy(categories) - nOutputRows = 0 - if length(internal_categories) != nfeatures - for i in 1:nfeatures - xmin, xmax = minmax(input[i, :]) - push!(internal_categories, xmin:xmax) - end - end - for i in 1:nfeatures - nOutputRows += length(internal_categories[i]) - end - if sparse - output = spzeros(T, nOutputRows, nsamples) - else - output = zeros(T, nOutputRows, nsamples) - end - offset = 1 - for i in 1:nfeatures - indicators!(output, offset, slice(input, i, :), internal_categories[i]) - offset += length(internal_categories[i]) - end - return output -end - -function indicators{T<:Real}(input::AbstractVector{T}, - categories::Range1{T}=min(input):max(input), - sparse::Bool=false) - if sparse - output = spzeros(T, length(categories), length(input)) - else - output = zeros(T, length(categories), length(input)) - end - indicators!(output, 1, input, categories) - return output -end - -function indicators!{T<:Real}(output::AbstractArray{T}, - offset::Integer, - input::AbstractVector{T}, - categories::Range1{T}=min(input):max(input)) - const lo = offset-categories[1] - for i in 1:length(input) - output[input[i]+lo, i] = one(T) - end - return -end - function indicators{T}(input::AbstractMatrix{T}, - categories::Array{Any, 1}={}, + categories::Array{Any,1}={}; sparse::Bool=false) nfeatures, nsamples = size(input) if length(categories) != 0 && length(categories) != nfeatures error("You must provide either categories for each feature or no categories") end 
internal_categories = copy(categories) - nOutputRows = 0 + noutrows = 0 if length(internal_categories) != nfeatures for i in 1:nfeatures push!(internal_categories, sort(unique(input[i, :]))) end end for i in 1:nfeatures - nOutputRows += length(internal_categories[i]) + noutrows += length(internal_categories[i]) end if sparse - output = spzeros(nOutputRows, nsamples) + output = spzeros(noutrows, nsamples) else - output = zeros(nOutputRows, nsamples) + output = zeros(noutrows, nsamples) end offset = 1 for i in 1:nfeatures @@ -182,7 +128,7 @@ function indicators{T}(input::AbstractMatrix{T}, end function indicators{T}(input::AbstractVector{T}, - categories::Array{T,1}=sort(unique(input)), + categories::Array{T,1}=sort(unique(input)); sparse::Bool=false) if sparse output = spzeros(length(categories), length(input)) diff --git a/test/01.jl b/test/01.jl index 14871f4fbc9b41..fdcfab0e0bcef7 100644 --- a/test/01.jl +++ b/test/01.jl @@ -35,18 +35,18 @@ fnecdf = ecdf([0.5]) y = [1, 2, 1, 3, 2] expected = [1 0 0; 0 1 0; 1 0 0; 0 0 1; 0 1 0]' @test indicators(y) == expected -@test indicators(y, 1:3, true) == expected +@test indicators(y, [1:3], sparse=true) == expected y = [2, 3, 2, 4, 3] @test indicators(y) == expected X = [1 2 3; 1 1 1; 2 1 1] expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 1 1; 1 0 0] @test indicators(X) == expected expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 0 0; 0 0 0; 0 1 1; 1 0 0] -@test indicators(X, {1:3, 1:3, 1:2}) == expected +@test indicators(X, {[1:3], [1:3], [1:2]}) == expected y = ["A", "B", "C", "B", "A"] expected = [1.0 0.0 0.0 0.0; 0.0 1.0 0.0 0.0; 0.0 0.0 1.0 0.0; 0.0 1.0 0.0 0.0; 1.0 0.0 0.0 0.0]' -@test indicators(y, ["A", "B", "C", "D"], true) == expected +@test indicators(y, ["A", "B", "C", "D"], sparse=true) == expected X = ["A" "B" "C"; "B" "A" "C"] cats = ["A", "B", "C", "D"] expected = [1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0; 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0; 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0]' -@test indicators(X, {cats, cats}, false) == expected 
+@test indicators(X, {cats, cats}, sparse=false) == expected