Skip to content

Commit

Permalink
Merge pull request #11 from clara-labs/feature/autoformat
Browse files Browse the repository at this point in the history
Feature/autoformat
  • Loading branch information
jasonlaska authored Sep 9, 2018
2 parents 7d7697c + 8a1cee9 commit 823e0f4
Show file tree
Hide file tree
Showing 9 changed files with 355 additions and 316 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ Both `SphericalKMeans` and `VonMisesFisherMixture` are standard sklearn estimato

# skm.cluster_centers_
# skm.labels_
# skm.intertia_
# skm.inertia_

# movMF-soft
from spherecluster import VonMisesFisherMixture
Expand All @@ -74,7 +74,7 @@ Both `SphericalKMeans` and `VonMisesFisherMixture` are standard sklearn estimato
# vmf_soft.labels_
# vmf_soft.weights_
# vmf_soft.concentrations_
# vmf_soft.intertia_
# vmf_soft.inertia_

# movMF-hard
from spherecluster import VonMisesFisherMixture
Expand All @@ -85,7 +85,7 @@ Both `SphericalKMeans` and `VonMisesFisherMixture` are standard sklearn estimato
# vmf_hard.labels_
# vmf_hard.weights_
# vmf_hard.concentrations_
# vmf_hard.intertia_
# vmf_hard.inertia_

The full set of parameters for the `VonMisesFisherMixture` class is documented in the class docstring; see `help(VonMisesFisherMixture)`.

Expand Down
8 changes: 5 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
scikit-learn==0.19.0
pytest==3.2.1
numpy
scipy
scipy
scikit-learn>=0.19.0
pytest
nose
black==18.6b4
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

setup(
name='spherecluster',
version='0.1.5',
version='0.1.6',
description='Clustering on the unit hypersphere in scikit-learn.',
author='Jason Laska',
author_email='jason@claralabs.com',
Expand Down
6 changes: 1 addition & 5 deletions spherecluster/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,4 @@
from .von_mises_fisher_mixture import VonMisesFisherMixture
from .util import sample_vMF

__all__ = [
'SphericalKMeans',
'VonMisesFisherMixture',
'sample_vMF',
]
__all__ = ["SphericalKMeans", "VonMisesFisherMixture", "sample_vMF"]
165 changes: 107 additions & 58 deletions spherecluster/spherical_kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,32 +10,35 @@
_tolerance,
_validate_center_shape,
)
from sklearn.utils import (
check_array,
check_random_state,
as_float_array,
)
from sklearn.utils import check_array, check_random_state, as_float_array
from sklearn.cluster import _k_means
from sklearn.preprocessing import normalize
from sklearn.externals.joblib import Parallel, delayed
from sklearn.utils.extmath import row_norms, squared_norm


def _spherical_kmeans_single_lloyd(X, n_clusters, max_iter=300,
init='k-means++', verbose=False,
x_squared_norms=None,
random_state=None, tol=1e-4,
precompute_distances=True):
'''
def _spherical_kmeans_single_lloyd(
X,
n_clusters,
max_iter=300,
init="k-means++",
verbose=False,
x_squared_norms=None,
random_state=None,
tol=1e-4,
precompute_distances=True,
):
"""
Modified from sklearn.cluster.k_means_.k_means_single_lloyd.
'''
"""
random_state = check_random_state(random_state)

best_labels, best_inertia, best_centers = None, None, None

# init
centers = _init_centroids(X, n_clusters, init, random_state=random_state,
x_squared_norms=x_squared_norms)
centers = _init_centroids(
X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms
)
if verbose:
print("Initialization complete")

Expand All @@ -51,15 +54,17 @@ def _spherical_kmeans_single_lloyd(X, n_clusters, max_iter=300,
# TODO: _labels_inertia should be done with cosine distance
# since ||a - b||^2 = 2(1 - cos(a,b)) when a,b are unit normalized
# this doesn't really matter.
labels, inertia = \
_labels_inertia(X, x_squared_norms, centers,
precompute_distances=precompute_distances,
distances=distances)
labels, inertia = _labels_inertia(
X,
x_squared_norms,
centers,
precompute_distances=precompute_distances,
distances=distances,
)

# computation of the means
if sp.issparse(X):
centers = _k_means._centers_sparse(X, labels, n_clusters,
distances)
centers = _k_means._centers_sparse(X, labels, n_clusters, distances)
else:
centers = _k_means._centers_dense(X, labels, n_clusters, distances)

Expand All @@ -77,50 +82,70 @@ def _spherical_kmeans_single_lloyd(X, n_clusters, max_iter=300,
center_shift_total = squared_norm(centers_old - centers)
if center_shift_total <= tol:
if verbose:
print("Converged at iteration %d: "
"center shift %e within tolerance %e"
% (i, center_shift_total, tol))
print(
"Converged at iteration %d: "
"center shift %e within tolerance %e" % (i, center_shift_total, tol)
)
break

if center_shift_total > 0:
# rerun E-step in case of non-convergence so that predicted labels
# match cluster centers
best_labels, best_inertia = \
_labels_inertia(X, x_squared_norms, best_centers,
precompute_distances=precompute_distances,
distances=distances)
best_labels, best_inertia = _labels_inertia(
X,
x_squared_norms,
best_centers,
precompute_distances=precompute_distances,
distances=distances,
)

return best_labels, best_inertia, best_centers, i + 1


def spherical_k_means(X, n_clusters, init='k-means++', n_init=10,
max_iter=300, verbose=False, tol=1e-4, random_state=None,
copy_x=True, n_jobs=1, algorithm="auto",
return_n_iter=False):
def spherical_k_means(
X,
n_clusters,
init="k-means++",
n_init=10,
max_iter=300,
verbose=False,
tol=1e-4,
random_state=None,
copy_x=True,
n_jobs=1,
algorithm="auto",
return_n_iter=False,
):
"""Modified from sklearn.cluster.k_means_.k_means.
"""
if n_init <= 0:
raise ValueError("Invalid number of initializations."
" n_init=%d must be bigger than zero." % n_init)
raise ValueError(
"Invalid number of initializations."
" n_init=%d must be bigger than zero." % n_init
)
random_state = check_random_state(random_state)

if max_iter <= 0:
raise ValueError('Number of iterations should be a positive number,'
' got %d instead' % max_iter)
raise ValueError(
"Number of iterations should be a positive number,"
" got %d instead" % max_iter
)

best_inertia = np.infty
X = as_float_array(X, copy=copy_x)
tol = _tolerance(X, tol)

if hasattr(init, '__array__'):
if hasattr(init, "__array__"):
init = check_array(init, dtype=X.dtype.type, copy=True)
_validate_center_shape(X, n_clusters, init)

if n_init != 1:
warnings.warn(
'Explicit initial center position passed: '
'performing only one init in k-means instead of n_init=%d'
% n_init, RuntimeWarning, stacklevel=2)
"Explicit initial center position passed: "
"performing only one init in k-means instead of n_init=%d" % n_init,
RuntimeWarning,
stacklevel=2,
)
n_init = 1

# precompute squared norms of data points
Expand All @@ -132,9 +157,15 @@ def spherical_k_means(X, n_clusters, init='k-means++', n_init=10,
for it in range(n_init):
# run a k-means once
labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd(
X, n_clusters, max_iter=max_iter, init=init, verbose=verbose,
tol=tol, x_squared_norms=x_squared_norms,
random_state=random_state)
X,
n_clusters,
max_iter=max_iter,
init=init,
verbose=verbose,
tol=tol,
x_squared_norms=x_squared_norms,
random_state=random_state,
)

# determine if these results are the best so far
if best_inertia is None or inertia < best_inertia:
Expand All @@ -149,13 +180,16 @@ def spherical_k_means(X, n_clusters, init='k-means++', n_init=10,
delayed(_spherical_kmeans_single_lloyd)(
X,
n_clusters,
max_iter=max_iter, init=init,
verbose=verbose, tol=tol,
max_iter=max_iter,
init=init,
verbose=verbose,
tol=tol,
x_squared_norms=x_squared_norms,
# Change seed to ensure variety
random_state=seed
random_state=seed,
)
for seed in seeds)
for seed in seeds
)

# Get results with the lowest inertia
labels, inertia, centers, n_iters = zip(*results)
Expand Down Expand Up @@ -244,9 +278,20 @@ class SphericalKMeans(KMeans):
inertia_ : float
Sum of squared distances of samples to their closest cluster center.
"""
def __init__(self, n_clusters=8, init='k-means++', n_init=10,
max_iter=300, tol=1e-4, n_jobs=1,
verbose=0, random_state=None, copy_x=True, normalize=True):

def __init__(
self,
n_clusters=8,
init="k-means++",
n_init=10,
max_iter=300,
tol=1e-4,
n_jobs=1,
verbose=0,
random_state=None,
copy_x=True,
normalize=True,
):
self.n_clusters = n_clusters
self.init = init
self.max_iter = max_iter
Expand Down Expand Up @@ -274,14 +319,18 @@ def fit(self, X, y=None):

# TODO: add check that all data is unit-normalized

self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
spherical_k_means(
X, n_clusters=self.n_clusters, init=self.init,
n_init=self.n_init, max_iter=self.max_iter,
verbose=self.verbose,
tol=self.tol, random_state=random_state, copy_x=self.copy_x,
n_jobs=self.n_jobs,
return_n_iter=True
)
self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = spherical_k_means(
X,
n_clusters=self.n_clusters,
init=self.init,
n_init=self.n_init,
max_iter=self.max_iter,
verbose=self.verbose,
tol=self.tol,
random_state=random_state,
copy_x=self.copy_x,
n_jobs=self.n_jobs,
return_n_iter=True,
)

return self
1 change: 1 addition & 0 deletions spherecluster/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@

def test_estimator_spherical_k_means():
return check_estimator(SphericalKMeans)

Loading

0 comments on commit 823e0f4

Please sign in to comment.