Merge pull request #11 from clara-labs/feature/autoformat

Feature/autoformat
jasonlaska · Sep 9, 2018 · 823e0f4 · 823e0f4
2 parents 7d7697c + 8a1cee9
commit 823e0f4
Show file tree

Hide file tree

Showing 9 changed files with 355 additions and 316 deletions.
diff --git a/README.md b/README.md
@@ -63,7 +63,7 @@ Both `SphericalKMeans` and `VonMisesFisherMixture` are standard sklearn estimato
 
     # skm.cluster_centers_
     # skm.labels_
-    # skm.intertia_
+    # skm.inertia_
 
     # movMF-soft
     from spherecluster import VonMisesFisherMixture
@@ -74,7 +74,7 @@ Both `SphericalKMeans` and `VonMisesFisherMixture` are standard sklearn estimato
     # vmf_soft.labels_
     # vmf_soft.weights_
     # vmf_soft.concentrations_
-    # vmf_soft.intertia_
+    # vmf_soft.inertia_
 
     # movMF-hard
     from spherecluster import VonMisesFisherMixture
@@ -85,7 +85,7 @@ Both `SphericalKMeans` and `VonMisesFisherMixture` are standard sklearn estimato
     # vmf_hard.labels_
     # vmf_hard.weights_
     # vmf_hard.concentrations_
-    # vmf_hard.intertia_
+    # vmf_hard.inertia_
 
 The full set of parameters for the `VonMisesFisherMixture` class is documented in the class docstring; see `help(VonMisesFisherMixture)`.
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,6 @@
-scikit-learn==0.19.0
-pytest==3.2.1
 numpy
-scipy
+scipy
+scikit-learn>=0.19.0
+pytest
+nose
+black==18.6b4
diff --git a/setup.py b/setup.py
@@ -20,7 +20,7 @@
 
 setup(
     name='spherecluster',
-    version='0.1.5',
+    version='0.1.6',
     description='Clustering on the unit hypersphere in scikit-learn.',
     author='Jason Laska',
     author_email='jason@claralabs.com',

diff --git a/spherecluster/__init__.py b/spherecluster/__init__.py
@@ -3,8 +3,4 @@
 from .von_mises_fisher_mixture import VonMisesFisherMixture
 from .util import sample_vMF
 
-__all__ = [
-    'SphericalKMeans',
-    'VonMisesFisherMixture',
-    'sample_vMF',
-]
+__all__ = ["SphericalKMeans", "VonMisesFisherMixture", "sample_vMF"]
diff --git a/spherecluster/spherical_kmeans.py b/spherecluster/spherical_kmeans.py
@@ -10,32 +10,35 @@
     _tolerance,
     _validate_center_shape,
 )
-from sklearn.utils import (
-    check_array,
-    check_random_state,
-    as_float_array,
-)
+from sklearn.utils import check_array, check_random_state, as_float_array
 from sklearn.cluster import _k_means
 from sklearn.preprocessing import normalize
 from sklearn.externals.joblib import Parallel, delayed
 from sklearn.utils.extmath import row_norms, squared_norm
 
 
-def _spherical_kmeans_single_lloyd(X, n_clusters, max_iter=300,
-                                   init='k-means++', verbose=False,
-                                   x_squared_norms=None,
-                                   random_state=None, tol=1e-4,
-                                   precompute_distances=True):
-    '''
+def _spherical_kmeans_single_lloyd(
+    X,
+    n_clusters,
+    max_iter=300,
+    init="k-means++",
+    verbose=False,
+    x_squared_norms=None,
+    random_state=None,
+    tol=1e-4,
+    precompute_distances=True,
+):
+    """
     Modified from sklearn.cluster.k_means_.k_means_single_lloyd.
-    '''
+    """
     random_state = check_random_state(random_state)
 
     best_labels, best_inertia, best_centers = None, None, None
 
     # init
-    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
-                              x_squared_norms=x_squared_norms)
+    centers = _init_centroids(
+        X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms
+    )
     if verbose:
         print("Initialization complete")
 
@@ -51,15 +54,17 @@ def _spherical_kmeans_single_lloyd(X, n_clusters, max_iter=300,
         # TODO: _labels_inertia should be done with cosine distance, but
         #       since ||a - b||^2 = 2(1 - cos(a,b)) when a,b are unit normalized,
         #       this doesn't really matter.
-        labels, inertia = \
-            _labels_inertia(X, x_squared_norms, centers,
-                            precompute_distances=precompute_distances,
-                            distances=distances)
+        labels, inertia = _labels_inertia(
+            X,
+            x_squared_norms,
+            centers,
+            precompute_distances=precompute_distances,
+            distances=distances,
+        )
 
         # computation of the means
         if sp.issparse(X):
-            centers = _k_means._centers_sparse(X, labels, n_clusters,
-                                               distances)
+            centers = _k_means._centers_sparse(X, labels, n_clusters, distances)
         else:
             centers = _k_means._centers_dense(X, labels, n_clusters, distances)
 
@@ -77,50 +82,70 @@ def _spherical_kmeans_single_lloyd(X, n_clusters, max_iter=300,
         center_shift_total = squared_norm(centers_old - centers)
         if center_shift_total <= tol:
             if verbose:
-                print("Converged at iteration %d: "
-                      "center shift %e within tolerance %e"
-                      % (i, center_shift_total, tol))
+                print(
+                    "Converged at iteration %d: "
+                    "center shift %e within tolerance %e" % (i, center_shift_total, tol)
+                )
             break
 
     if center_shift_total > 0:
         # rerun E-step in case of non-convergence so that predicted labels
         # match cluster centers
-        best_labels, best_inertia = \
-            _labels_inertia(X, x_squared_norms, best_centers,
-                            precompute_distances=precompute_distances,
-                            distances=distances)
+        best_labels, best_inertia = _labels_inertia(
+            X,
+            x_squared_norms,
+            best_centers,
+            precompute_distances=precompute_distances,
+            distances=distances,
+        )
 
     return best_labels, best_inertia, best_centers, i + 1
 
 
-def spherical_k_means(X, n_clusters, init='k-means++', n_init=10,
-                      max_iter=300, verbose=False, tol=1e-4, random_state=None,
-                      copy_x=True, n_jobs=1, algorithm="auto",
-                      return_n_iter=False):
+def spherical_k_means(
+    X,
+    n_clusters,
+    init="k-means++",
+    n_init=10,
+    max_iter=300,
+    verbose=False,
+    tol=1e-4,
+    random_state=None,
+    copy_x=True,
+    n_jobs=1,
+    algorithm="auto",
+    return_n_iter=False,
+):
     """Modified from sklearn.cluster.k_means_.k_means.
     """
     if n_init <= 0:
-        raise ValueError("Invalid number of initializations."
-                         " n_init=%d must be bigger than zero." % n_init)
+        raise ValueError(
+            "Invalid number of initializations."
+            " n_init=%d must be bigger than zero." % n_init
+        )
     random_state = check_random_state(random_state)
 
     if max_iter <= 0:
-        raise ValueError('Number of iterations should be a positive number,'
-                         ' got %d instead' % max_iter)
+        raise ValueError(
+            "Number of iterations should be a positive number,"
+            " got %d instead" % max_iter
+        )
 
     best_inertia = np.infty
     X = as_float_array(X, copy=copy_x)
     tol = _tolerance(X, tol)
 
-    if hasattr(init, '__array__'):
+    if hasattr(init, "__array__"):
         init = check_array(init, dtype=X.dtype.type, copy=True)
         _validate_center_shape(X, n_clusters, init)
 
         if n_init != 1:
             warnings.warn(
-                'Explicit initial center position passed: '
-                'performing only one init in k-means instead of n_init=%d'
-                % n_init, RuntimeWarning, stacklevel=2)
+                "Explicit initial center position passed: "
+                "performing only one init in k-means instead of n_init=%d" % n_init,
+                RuntimeWarning,
+                stacklevel=2,
+            )
             n_init = 1
 
     # precompute squared norms of data points
@@ -132,9 +157,15 @@ def spherical_k_means(X, n_clusters, init='k-means++', n_init=10,
         for it in range(n_init):
             # run a k-means once
             labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd(
-                X, n_clusters, max_iter=max_iter, init=init, verbose=verbose,
-                tol=tol, x_squared_norms=x_squared_norms,
-                random_state=random_state)
+                X,
+                n_clusters,
+                max_iter=max_iter,
+                init=init,
+                verbose=verbose,
+                tol=tol,
+                x_squared_norms=x_squared_norms,
+                random_state=random_state,
+            )
 
             # determine if these results are the best so far
             if best_inertia is None or inertia < best_inertia:
@@ -149,13 +180,16 @@ def spherical_k_means(X, n_clusters, init='k-means++', n_init=10,
             delayed(_spherical_kmeans_single_lloyd)(
                 X,
                 n_clusters,
-                max_iter=max_iter, init=init,
-                verbose=verbose, tol=tol,
+                max_iter=max_iter,
+                init=init,
+                verbose=verbose,
+                tol=tol,
                 x_squared_norms=x_squared_norms,
                 # Change seed to ensure variety
-                random_state=seed
+                random_state=seed,
             )
-            for seed in seeds)
+            for seed in seeds
+        )
 
         # Get results with the lowest inertia
         labels, inertia, centers, n_iters = zip(*results)
@@ -244,9 +278,20 @@ class SphericalKMeans(KMeans):
     inertia_ : float
         Sum of distances of samples to their closest cluster center.
     """
-    def __init__(self, n_clusters=8, init='k-means++', n_init=10,
-                 max_iter=300, tol=1e-4, n_jobs=1,
-                 verbose=0, random_state=None, copy_x=True, normalize=True):
+
+    def __init__(
+        self,
+        n_clusters=8,
+        init="k-means++",
+        n_init=10,
+        max_iter=300,
+        tol=1e-4,
+        n_jobs=1,
+        verbose=0,
+        random_state=None,
+        copy_x=True,
+        normalize=True,
+    ):
         self.n_clusters = n_clusters
         self.init = init
         self.max_iter = max_iter
@@ -274,14 +319,18 @@ def fit(self, X, y=None):
 
         # TODO: add check that all data is unit-normalized
 
-        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
-            spherical_k_means(
-                X, n_clusters=self.n_clusters, init=self.init,
-                n_init=self.n_init, max_iter=self.max_iter,
-                verbose=self.verbose,
-                tol=self.tol, random_state=random_state, copy_x=self.copy_x,
-                n_jobs=self.n_jobs,
-                return_n_iter=True
-            )
+        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = spherical_k_means(
+            X,
+            n_clusters=self.n_clusters,
+            init=self.init,
+            n_init=self.n_init,
+            max_iter=self.max_iter,
+            verbose=self.verbose,
+            tol=self.tol,
+            random_state=random_state,
+            copy_x=self.copy_x,
+            n_jobs=self.n_jobs,
+            return_n_iter=True,
+        )
 
         return self
diff --git a/spherecluster/tests/test_common.py b/spherecluster/tests/test_common.py
@@ -4,3 +4,4 @@
 
 def test_estimator_spherical_k_means():
     return check_estimator(SphericalKMeans)
+