From 2a6adf8729ef78a7ed7cb9de7018a5e593043860 Mon Sep 17 00:00:00 2001
From: Alexander Fabisch <afabisch@googlemail.com>
Date: Fri, 6 Sep 2013 00:29:01 +0200
Subject: [PATCH] Clean up `indicators`

* sparse is a keyword argument
* remove special handling for numerical types
---
 README.md     |  2 +-
 src/others.jl | 66 +++++----------------------------------------------
 test/01.jl    |  8 +++----
 3 files changed, 11 insertions(+), 65 deletions(-)

diff --git a/README.md b/README.md
index 93b578f3a95c0c..02dd68eb5cf421 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Basic statistics functions for Julia
 * `mad(a)`: Compute the median absolute deviation of `a` with a correction factor, which ensures that the MAD will be a consistent estimator of the mean for normally distributed data.
 * `midrange(a)`: Compute the mid point of the range of `a` (e.g `(max(a) + min(a) / 2)`).
 * `modes(a)`: Compute all modes of `a`. Be warned that every element of an array with no repeated elements is considered a mode.
-* `indicators(a)`: Encode categories using one-hot scheme aka one-of-C encoding, indicator matrix or dummy variables. Optionally, you can provide either a range min:max for categories that are encoded as integers (so that min will be encoded as [1 0 ...] and max as [... 0 1]) or a list of possible values, e.g. ["A", "B, "C"].
+* `indicators(a)`: Encode categories using one-hot scheme aka one-of-C encoding, indicator matrix or dummy variables. Optionally, you can provide a list of possible values, e.g. ["A", "B", "C"] or [1:3].
 * `percentile(a)`: Compute the percentiles (0%, 10%, ..., 100%) of `a`.
 * `quantile(a)`: Compute any desired quantile of `a`.
 * `quartile(a): Compute the quartiles of `a`.
diff --git a/src/others.jl b/src/others.jl
index a3db7f666e861b..86e2e53907286b 100644
--- a/src/others.jl
+++ b/src/others.jl
@@ -97,81 +97,27 @@ function ecdf{T}(X::AbstractVector{T})
     return e
 end
 
-function indicators{T<:Real}(input::AbstractMatrix{T},
-                             categories::Array{Any, 1}={},
-                             sparse::Bool=false)
-    nfeatures, nsamples = size(input)
-    if length(categories) != 0 && length(categories) != nfeatures
-        error("You must provide either categories for each feature or no categories")
-    end
-    internal_categories = copy(categories)
-    nOutputRows = 0
-    if length(internal_categories) != nfeatures
-        for i in 1:nfeatures
-            xmin, xmax = minmax(input[i, :])
-            push!(internal_categories, xmin:xmax)
-        end
-    end
-    for i in 1:nfeatures
-        nOutputRows += length(internal_categories[i])
-    end
-    if sparse
-        output = spzeros(T, nOutputRows, nsamples)
-    else
-        output = zeros(T, nOutputRows, nsamples)
-    end
-    offset = 1
-    for i in 1:nfeatures
-        indicators!(output, offset, slice(input, i, :), internal_categories[i])
-        offset += length(internal_categories[i])
-    end
-    return output
-end
-
-function indicators{T<:Real}(input::AbstractVector{T},
-                             categories::Range1{T}=min(input):max(input),
-                             sparse::Bool=false)
-    if sparse
-        output = spzeros(T, length(categories), length(input))
-    else
-        output = zeros(T, length(categories), length(input))
-    end
-    indicators!(output, 1, input, categories)
-    return output
-end
-
-function indicators!{T<:Real}(output::AbstractArray{T},
-                              offset::Integer,
-                              input::AbstractVector{T},
-                              categories::Range1{T}=min(input):max(input))
-    const lo = offset-categories[1]
-    for i in 1:length(input)
-        output[input[i]+lo, i] = one(T)
-    end
-    return
-end
-
 function indicators{T}(input::AbstractMatrix{T},
-                       categories::Array{Any, 1}={},
+                       categories::Array{Any,1}={};
                        sparse::Bool=false)
     nfeatures, nsamples = size(input)
     if length(categories) != 0 && length(categories) != nfeatures
         error("You must provide either categories for each feature or no categories")
     end
     internal_categories = copy(categories)
-    nOutputRows = 0
+    noutrows = 0
     if length(internal_categories) != nfeatures
         for i in 1:nfeatures
             push!(internal_categories, sort(unique(input[i, :])))
         end
     end
     for i in 1:nfeatures
-        nOutputRows += length(internal_categories[i])
+        noutrows += length(internal_categories[i])
     end
     if sparse
-        output = spzeros(nOutputRows, nsamples)
+        output = spzeros(noutrows, nsamples)
     else
-        output = zeros(nOutputRows, nsamples)
+        output = zeros(noutrows, nsamples)
     end
     offset = 1
     for i in 1:nfeatures
@@ -182,7 +128,7 @@ function indicators{T}(input::AbstractMatrix{T},
 end
 
 function indicators{T}(input::AbstractVector{T},
-                       categories::Array{T,1}=sort(unique(input)),
+                       categories::Array{T,1}=sort(unique(input));
                        sparse::Bool=false)
     if sparse
         output = spzeros(length(categories), length(input))
diff --git a/test/01.jl b/test/01.jl
index 14871f4fbc9b41..fdcfab0e0bcef7 100644
--- a/test/01.jl
+++ b/test/01.jl
@@ -35,18 +35,18 @@ fnecdf = ecdf([0.5])
 y = [1, 2, 1, 3, 2]
 expected = [1 0 0; 0 1 0; 1 0 0; 0 0 1; 0 1 0]'
 @test indicators(y) == expected
-@test indicators(y, 1:3, true) == expected
+@test indicators(y, [1:3], sparse=true) == expected
 y = [2, 3, 2, 4, 3]
 @test indicators(y) == expected
 X = [1 2 3; 1 1 1; 2 1 1]
 expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 1 1; 1 0 0]
 @test indicators(X) == expected
 expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 0 0; 0 0 0; 0 1 1; 1 0 0]
-@test indicators(X, {1:3, 1:3, 1:2}) == expected
+@test indicators(X, {[1:3], [1:3], [1:2]}) == expected
 y = ["A", "B", "C", "B", "A"]
 expected = [1.0 0.0 0.0 0.0; 0.0 1.0 0.0 0.0; 0.0 0.0 1.0 0.0; 0.0 1.0 0.0 0.0; 1.0 0.0 0.0 0.0]'
-@test indicators(y, ["A", "B", "C", "D"], true) == expected
+@test indicators(y, ["A", "B", "C", "D"], sparse=true) == expected
 X = ["A" "B" "C"; "B" "A" "C"]
 cats = ["A", "B", "C", "D"]
 expected = [1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0; 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0; 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0]'
-@test indicators(X, {cats, cats}, false) == expected
+@test indicators(X, {cats, cats}, sparse=false) == expected