Clean up indicators

* sparse is a keyword argument * remove special handling for numerical types
LilithHafner · Sep 5, 2013 · 2a6adf8 · 2a6adf8
1 parent b6214e3
commit 2a6adf8
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 65 deletions.
diff --git a/README.md b/README.md
@@ -20,7 +20,7 @@ Basic statistics functions for Julia
 * `mad(a)`: Compute the median absolute deviation of `a` with a correction factor, which ensures that the MAD will be a consistent estimator of the standard deviation for normally distributed data.
 * `midrange(a)`: Compute the mid point of the range of `a` (i.e. `(max(a) + min(a)) / 2`).
 * `modes(a)`: Compute all modes of `a`. Be warned that every element of an array with no repeated elements is considered a mode.
-* `indicators(a)`: Encode categories using one-hot scheme aka one-of-C encoding, indicator matrix or dummy variables. Optionally, you can provide either a range min:max for categories that are encoded as integers (so that min will be encoded as [1 0 ...] and max as [... 0 1]) or a list of possible values, e.g. ["A", "B, "C"].
+* `indicators(a)`: Encode categories using one-hot scheme aka one-of-C encoding, indicator matrix or dummy variables. Optionally, you can provide a list of possible values, e.g. ["A", "B", "C"] or [1:3].
 * `percentile(a)`: Compute the percentiles (0%, 10%, ..., 100%) of `a`.
 * `quantile(a)`: Compute any desired quantile of `a`.
 * `quartile(a)`: Compute the quartiles of `a`.

diff --git a/src/others.jl b/src/others.jl
@@ -97,81 +97,27 @@ function ecdf{T}(X::AbstractVector{T})
     return e
 end
 
-function indicators{T<:Real}(input::AbstractMatrix{T},
-                             categories::Array{Any, 1}={},
-                             sparse::Bool=false)
-    nfeatures, nsamples = size(input)
-    if length(categories) != 0 && length(categories) != nfeatures
-        error("You must provide either categories for each feature or no categories")
-    end
-    internal_categories = copy(categories)
-    nOutputRows = 0
-    if length(internal_categories) != nfeatures
-        for i in 1:nfeatures
-            xmin, xmax = minmax(input[i, :])
-            push!(internal_categories, xmin:xmax)
-        end
-    end
-    for i in 1:nfeatures
-        nOutputRows += length(internal_categories[i])
-    end
-    if sparse
-        output = spzeros(T, nOutputRows, nsamples)
-    else
-        output = zeros(T, nOutputRows, nsamples)
-    end
-    offset = 1
-    for i in 1:nfeatures
-        indicators!(output, offset, slice(input, i, :), internal_categories[i])
-        offset += length(internal_categories[i])
-    end
-    return output
-end
-
-function indicators{T<:Real}(input::AbstractVector{T},
-                             categories::Range1{T}=min(input):max(input),
-                             sparse::Bool=false)
-    if sparse
-        output = spzeros(T, length(categories), length(input))
-    else
-        output = zeros(T, length(categories), length(input))
-    end
-    indicators!(output, 1, input, categories)
-    return output
-end
-
-function indicators!{T<:Real}(output::AbstractArray{T},
-                              offset::Integer,
-                              input::AbstractVector{T},
-                              categories::Range1{T}=min(input):max(input))
-    const lo = offset-categories[1]
-    for i in 1:length(input)
-        output[input[i]+lo, i] = one(T)
-    end
-    return
-end
-
 function indicators{T}(input::AbstractMatrix{T},
-                       categories::Array{Any, 1}={},
+                       categories::Array{Any,1}={};
                        sparse::Bool=false)
     nfeatures, nsamples = size(input)
     if length(categories) != 0 && length(categories) != nfeatures
         error("You must provide either categories for each feature or no categories")
     end
     internal_categories = copy(categories)
-    nOutputRows = 0
+    noutrows = 0
     if length(internal_categories) != nfeatures
         for i in 1:nfeatures
             push!(internal_categories, sort(unique(input[i, :])))
         end
     end
     for i in 1:nfeatures
-        nOutputRows += length(internal_categories[i])
+        noutrows += length(internal_categories[i])
     end
     if sparse
-        output = spzeros(nOutputRows, nsamples)
+        output = spzeros(noutrows, nsamples)
     else
-        output = zeros(nOutputRows, nsamples)
+        output = zeros(noutrows, nsamples)
     end
     offset = 1
     for i in 1:nfeatures
@@ -182,7 +128,7 @@ function indicators{T}(input::AbstractMatrix{T},
 end
 
 function indicators{T}(input::AbstractVector{T},
-                       categories::Array{T,1}=sort(unique(input)),
+                       categories::Array{T,1}=sort(unique(input));
                        sparse::Bool=false)
     if sparse
         output = spzeros(length(categories), length(input))

diff --git a/test/01.jl b/test/01.jl
@@ -35,18 +35,18 @@ fnecdf = ecdf([0.5])
 y = [1, 2, 1, 3, 2]
 expected = [1 0 0; 0 1 0; 1 0 0; 0 0 1; 0 1 0]'
 @test indicators(y) == expected
-@test indicators(y, 1:3, true) == expected
+@test indicators(y, [1:3], sparse=true) == expected
 y = [2, 3, 2, 4, 3]
 @test indicators(y) == expected
 X = [1 2 3; 1 1 1; 2 1 1]
 expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 1 1; 1 0 0]
 @test indicators(X) == expected
 expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 0 0; 0 0 0; 0 1 1; 1 0 0]
-@test indicators(X, {1:3, 1:3, 1:2}) == expected
+@test indicators(X, {[1:3], [1:3], [1:2]}) == expected
 y = ["A", "B", "C", "B", "A"]
 expected = [1.0 0.0 0.0 0.0; 0.0 1.0 0.0 0.0; 0.0 0.0 1.0 0.0; 0.0 1.0 0.0 0.0; 1.0 0.0 0.0 0.0]'
-@test indicators(y, ["A", "B", "C", "D"], true) == expected
+@test indicators(y, ["A", "B", "C", "D"], sparse=true) == expected
 X = ["A" "B" "C"; "B" "A" "C"]
 cats = ["A", "B", "C", "D"]
 expected = [1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0; 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0; 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0]'
-@test indicators(X, {cats, cats}, false) == expected
+@test indicators(X, {cats, cats}, sparse=false) == expected