Skip to content

Commit

Permalink
Data standardization: z-score and 0-1 scaling (JuliaLang#85)
Browse files Browse the repository at this point in the history
  • Loading branch information
wildart authored and nalimilan committed Jan 21, 2019
1 parent 0a489d0 commit 641236d
Show file tree
Hide file tree
Showing 7 changed files with 429 additions and 4 deletions.
3 changes: 2 additions & 1 deletion docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ makedocs(
"signalcorr.md",
"multivariate.md",
"misc.md",
"statmodels.md"]
"statmodels.md",
"transformations.md"]
)

deploydocs(
Expand Down
2 changes: 1 addition & 1 deletion docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ end


```@contents
Pages = ["weights.md", "means.md", "scalarstats.md", "robust.md", "deviation.md", "cov.md", "counts.md", "ranking.md", "sampling.md", "empirical.md", "signalcorr.md", "misc.md", "statmodels.md"]
Pages = ["weights.md", "means.md", "scalarstats.md", "robust.md", "deviation.md", "cov.md", "counts.md", "ranking.md", "sampling.md", "empirical.md", "signalcorr.md", "misc.md", "statmodels.md", "transformations.md"]
Depth = 2
```

Expand Down
43 changes: 43 additions & 0 deletions docs/src/transformations.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Data Transformations

In general, data transformations change raw feature vectors into
a representation that is more suitable for various estimators.

## Standardization

**Standardization** of dataset is a common requirement for many machine
learning techniques. These techniques might perform poorly if the individual
features do not more or less look like standard normally distributed data.

Standardization transforms data points into corresponding standard scores
by removing mean and scaling to unit variance.

The **standard score** is the signed number of standard deviations by which
the value of an observation or data point is above the mean value of what
is being observed or measured.

Standardization can be performed using `fit(ZScoreTransform, ...)`.

```@docs
fit(::Type{ZScoreTransform}, X::AbstractArray{<:Real,2}; center::Bool=true, scale::Bool=true)
```

## Unit range normalization

**Unit range normalization* is an alternative data transformation which scales features
to lie in the interval `[0; 1]`.

Unit range normalization can be performed using `fit(UnitRangeTransform, ...)`.

```@docs
fit(::Type{UnitRangeTransform}, X::AbstractArray{<:Real,2}; unit::Bool=true)
```

## Additional methods
```@docs
StatsBase.transform
StatsBase.transform!
StatsBase.reconstruct
StatsBase.reconstruct!
standardize
```
10 changes: 9 additions & 1 deletion src/StatsBase.jl
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,13 @@ module StatsBase
r2,
r²,

ConvergenceException
ConvergenceException,

# data standardization
standardize,
AbstractDataTransform, # the type to represent a abstract data transformation
ZScoreTransform, # the type to represent a z-score data transformation
UnitRangeTransform # the type to represent a 0-1 data transformation

# source files

Expand All @@ -224,6 +230,8 @@ module StatsBase
include("sampling.jl")
include("statmodels.jl")

include("transformations.jl")

include("deprecates.jl")

end # module
306 changes: 306 additions & 0 deletions src/transformations.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,306 @@
### Transformations

abstract type AbstractDataTransform end

# apply the transform
"""
transform!(t::AbstractDataTransform, x)
Apply transformation `t` to vector or matrix `x` in place.
"""
transform!(t::AbstractDataTransform, x::AbstractArray{<:Real,1}) = transform!(x, t, x)
transform!(t::AbstractDataTransform, x::AbstractArray{<:Real,2}) = transform!(x, t, x)

"""
transform(t::AbstractDataTransform, x)
Return a row-standardized vector or matrix `x` using `t` transformation.
"""
transform(t::AbstractDataTransform, x::AbstractArray{<:Real,1}) = transform!(similar(x), t, x)
transform(t::AbstractDataTransform, x::AbstractArray{<:Real,2}) = transform!(similar(x), t, x)

# reconstruct the original data from transformed values
"""
reconstruct!(t::AbstractDataTransform, y)
Perform an in-place reconstruction into an original data scale from a row-transformed
vector or matrix `y` using `t` transformation.
"""
reconstruct!(t::AbstractDataTransform, y::AbstractArray{<:Real,1}) = reconstruct!(y, t, y)
reconstruct!(t::AbstractDataTransform, y::AbstractArray{<:Real,2}) = reconstruct!(y, t, y)

"""
reconstruct(t::AbstractDataTransform, y)
Return a reconstruction of an originally scaled data from a row-transformed vector
or matrix `y` using `t` transformation.
"""
reconstruct(t::AbstractDataTransform, y::AbstractArray{<:Real,1}) = reconstruct!(similar(y), t, y)
reconstruct(t::AbstractDataTransform, y::AbstractArray{<:Real,2}) = reconstruct!(similar(y), t, y)

"""
Standardization (Z-score transformation)
"""
struct ZScoreTransform{T<:Real} <: AbstractDataTransform
dim::Int
mean::Vector{T}
scale::Vector{T}

function ZScoreTransform(d::Int, m::Vector{T}, s::Vector{T}) where T
lenm = length(m)
lens = length(s)
lenm == d || lenm == 0 || throw(DimensionMismatch("Inconsistent dimensions."))
lens == d || lens == 0 || throw(DimensionMismatch("Inconsistent dimensions."))
new{T}(d, m, s)
end
end

function Base.getproperty(t::ZScoreTransform, p::Symbol)
if p === :indim || p === :outdim
return t.dim
else
return getfield(t, p)
end
end

"""
fit(ZScoreTransform, X; center=true, scale=true)
Fit standardization parameters to `X` and return a `ZScoreTransform` transformation object.
# Arguments
* `data`: matrix of samples to fit transformation parameters.
# Keyword arguments
* `center`: if `true` (the default) center data so that its mean is zero.
* `scale`: if `true` (the default) scale the data so that its variance is equal to one.
# Examples
```jldoctest
julia> using StatsBase
julia> X = [0.0 -0.5 0.5; 0.0 1.0 2.0]
2×3 Array{Float64,2}:
0.0 -0.5 0.5
0.0 1.0 2.0
julia> dt = fit(ZScoreTransform, X)
ZScoreTransform{Float64}(2, [0.0, 1.0], [0.5, 1.0])
julia> StatsBase.transform(dt, X)
2×3 Array{Float64,2}:
0.0 -1.0 1.0
-1.0 0.0 1.0
```
"""
function fit(::Type{ZScoreTransform}, X::AbstractArray{<:Real,2}; center::Bool=true, scale::Bool=true)
d, n = size(X)
n >= 2 || error("X must contain at least two columns.")

T = eltype(X)
m, s = mean_and_std(X, 2)

return ZScoreTransform(d, (center ? vec(m) : zeros(T, 0)),
(scale ? vec(s) : zeros(T, 0)))
end

function transform!(y::AbstractVecOrMat{<:Real}, t::ZScoreTransform, x::AbstractVecOrMat{<:Real})
d = t.dim
size(x,1) == size(y,1) == d || throw(DimensionMismatch("Inconsistent dimensions."))
n = size(y,2)
size(x,2) == n || throw(DimensionMismatch("Inconsistent dimensions."))

m = t.mean
s = t.scale

if isempty(m)
if isempty(s)
if x !== y
copyto!(y, x)
end
else
broadcast!(/, y, x, s)
end
else
if isempty(s)
broadcast!(-, y, x, m)
else
broadcast!((x,m,s)->(x-m)/s, y, x, m, s)
end
end
return y
end

function reconstruct!(x::AbstractVecOrMat{<:Real}, t::ZScoreTransform, y::AbstractVecOrMat{<:Real})
d = t.dim
size(x,1) == size(y,1) == d || throw(DimensionMismatch("Inconsistent dimensions."))
n = size(y,2)
size(x,2) == n || throw(DimensionMismatch("Inconsistent dimensions."))

m = t.mean
s = t.scale

if isempty(m)
if isempty(s)
if y !== x
copyto!(x, y)
end
else
broadcast!(*, x, y, s)
end
else
if isempty(s)
broadcast!(+, x, y, m)
else
broadcast!((y,m,s)->y*s+m, x, y, m, s)
end
end
return x
end

"""
Unit range normalization
"""
struct UnitRangeTransform{T<:Real} <: AbstractDataTransform
dim::Int
unit::Bool
min::Vector{T}
scale::Vector{T}

function UnitRangeTransform(d::Int, unit::Bool, min::Vector{T}, max::Vector{T}) where {T}
lenmin = length(min)
lenmax = length(max)
lenmin == d || lenmin == 0 || throw(DimensionMismatch("Inconsistent dimensions."))
lenmax == d || lenmax == 0 || throw(DimensionMismatch("Inconsistent dimensions."))
new{T}(d, unit, min, max)
end
end

function Base.getproperty(t::UnitRangeTransform, p::Symbol)
if p === :indim || p === :outdim
return t.dim
else
return getfield(t, p)
end
end

# fit a unit transform
"""
fit(UnitRangeTransform, X; center=true, scale=true)
Fit a scaling parameters to `X` and return transformation description.
# Arguments
* `data`: matrix of samples to fit transformation parameters.
# Keyword arguments
* `center`: if `true` (the default) centere data around zero.
* `scale`: if `true` (the default) perform variance scaling.
# Examples
```jldoctest
julia> using StatsBase
julia> X = [0.0 -0.5 0.5; 0.0 1.0 2.0]
2×3 Array{Float64,2}:
0.0 -0.5 0.5
0.0 1.0 2.0
julia> dt = fit(UnitRangeTransform, X)
UnitRangeTransform{Float64}(2, true, [-0.5, 0.0], [1.0, 0.5])
julia> StatsBase.transform(dt, X)
2×3 Array{Float64,2}:
0.5 0.0 1.0
0.0 0.5 1.0
```
"""
function fit(::Type{UnitRangeTransform}, X::AbstractArray{<:Real,2}; unit::Bool=true)
d, n = size(X)

tmin = X[:, 1]
tmax = X[:, 1]
for j = 2:n
@inbounds for i = 1:d
if X[i, j] < tmin[i]
tmin[i] = X[i, j]
elseif X[i, j] > tmax[i]
tmax[i] = X[i, j]
end
end
end
for i = 1:d
@inbounds tmax[i] = 1 / (tmax[i] - tmin[i])
end
return UnitRangeTransform(d, unit, tmin, tmax)
end

function transform!(y::AbstractVecOrMat{<:Real}, t::UnitRangeTransform, x::AbstractVecOrMat{<:Real})
d = t.dim
size(x,1) == size(y,1) == d || throw(DimensionMismatch("Inconsistent dimensions."))
n = size(x,2)
size(y,2) == n || throw(DimensionMismatch("Inconsistent dimensions."))

tmin = t.min
tscale = t.scale

if t.unit
broadcast!((x,s,m) -> (x-m)*s, y, x, tscale, tmin)
else
broadcast!(*, y, x, tscale)
end
return y
end

function reconstruct!(x::AbstractVecOrMat{<:Real}, t::UnitRangeTransform, y::AbstractVecOrMat{<:Real})
d = t.dim
size(x,1) == size(y,1) == d || throw(DimensionMismatch("Inconsistent dimensions."))
n = size(y,2)
size(x,2) == n || throw(DimensionMismatch("Inconsistent dimensions."))

tmin = t.min
tscale = t.scale

if t.unit
broadcast!((y,s,m) -> y/s + m, x, y, tscale, tmin)
else
broadcast!(/, x, y, tscale)
end
return x
end

"""
standardize(DT, X; kwargs...)
Return a row-standardized matrix `X` using `DT` transformation which is a subtype of `AbstractDataTransform`:
- `ZScoreTransform`
- `UnitRangeTransform`
# Example
```jldoctest
julia> using StatsBase
julia> standardize(ZScoreTransform, [0.0 -0.5 0.5; 0.0 1.0 2.0])
2×3 Array{Float64,2}:
0.0 -1.0 1.0
-1.0 0.0 1.0
julia> standardize(UnitRangeTransform, [0.0 -0.5 0.5; 0.0 1.0 2.0])
2×3 Array{Float64,2}:
0.5 0.0 1.0
0.0 0.5 1.0
```
"""
function standardize(::Type{DT}, X::AbstractArray{<:Real,2}; kwargs...) where {DT<:AbstractDataTransform}
return transform(fit(DT, X; kwargs...), X)
end
Loading

0 comments on commit 641236d

Please sign in to comment.