[REVIEW][PROPOSAL] Add tags and preferred memory order tags to estimat…

…ors (#3113) * FEA Add preferred_order class parameter to linear models * ENH adopt tags from scikit-learn API to support preferred order attribute * DOC remove attribute docstrings * FIX Change straggling classes * FIX Change straggling classes * FIX Add missing self * FIX straggling attribute * ENH Add device data tag for proposal * FEA Add all scikit-learn API tags to base and improve gpu input types tag * FEA Add preferred_order tag to cluster models * FEA Add preferred_order tag to most models * ENH Improvements and PR review feedback * DOC add tag documentation to estimator guide * DOC add scikit link * Update wiki/python/ESTIMATOR_GUIDE.md Co-authored-by: Corey J. Nolet <cjnolet@users.noreply.github.com> * Update wiki/python/ESTIMATOR_GUIDE.md Co-authored-by: Corey J. Nolet <cjnolet@users.noreply.github.com> * Update wiki/python/ESTIMATOR_GUIDE.md Co-authored-by: Corey J. Nolet <cjnolet@users.noreply.github.com> * Update wiki/python/ESTIMATOR_GUIDE.md Co-authored-by: Corey J. Nolet <cjnolet@users.noreply.github.com> * Update wiki/python/ESTIMATOR_GUIDE.md Co-authored-by: Corey J. Nolet <cjnolet@users.noreply.github.com> * ENH Rename test_fit to test_api and add tags tests * FIX fixes from PR review * DOC Added entry to changelog * FIX PEP8 fixes Co-authored-by: Corey J. Nolet <cjnolet@users.noreply.github.com>
rapidsai · Nov 20, 2020 · b3e4827 · b3e4827
1 parent b7bfb7e
commit b3e4827
Show file tree

Hide file tree

Showing 29 changed files with 324 additions and 28 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -33,6 +33,7 @@
 - PR #3135: Add QuasiNewton tests
 - PR #3040: Improved Array Conversion with CumlArrayDescriptor and Decorators
 - PR #3134: Improving the Deprecation Message Formatting in Documentation
+- PR #3113: Add tags and preferred memory order tags to estimators
 - PR #3137: Reorganize Pytest Config and Add Quick Run Option
 - PR #3144: Adding Ability to Set Arbitrary Cmake Flags in ./build.sh
 - PR #3155: Eliminate unnecessary warnings from random projection test
@@ -58,7 +59,7 @@
 - PR #3086: Reverting FIL Notebook Testing
 - PR #3114: Fixed a typo in SVC's predict_proba AttributeError
 - PR #3117: Fix two crashes in experimental RF backend
-- PR #3119: Fix memset args for benchmark 
+- PR #3119: Fix memset args for benchmark
 - PR #3130: Return Python string from `dump_as_json()` of RF
 - PR #3132: Add `min_samples_split` + Rename `min_rows_per_node` -> `min_samples_leaf`
 - PR #3136: Fix stochastic gradient descent example

diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx
@@ -346,3 +346,8 @@ class DBSCAN(Base):
             "max_mbytes_per_batch",
             "calc_core_sample_indices",
         ]
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'C' preferred input order."""
+        return {
+            'preferred_input_order': 'C'
+        }
diff --git a/python/cuml/cluster/kmeans.pyx b/python/cuml/cluster/kmeans.pyx
@@ -622,3 +622,8 @@ class KMeans(Base):
             ['n_init', 'oversampling_factor', 'max_samples_per_batch',
                 'init', 'max_iter', 'n_clusters', 'random_state',
                 'tol']
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'C' preferred input order."""
+        return {
+            'preferred_input_order': 'C'
+        }
diff --git a/python/cuml/common/base.pyx b/python/cuml/common/base.pyx
@@ -27,6 +27,34 @@ from cuml.common.doc_utils import generate_docstring
 import cuml.common.input_utils
 
 
+# tag system based on experimental tag system from Scikit-learn >=0.21
+# https://scikit-learn.org/stable/developers/develop.html#estimator-tags
+#
+# Default value of every estimator tag. Classes override individual entries
+# by returning a dict from `_more_tags`. This dict is module-level state
+# shared by all estimators, so it should be treated as read-only and copied
+# before per-estimator overrides are applied.
+_default_tags = {
+    # cuML specific tags
+    'preferred_input_order': None,
+    'X_types_gpu': ['2darray'],
+
+    # Scikit-learn API standard tags
+    'non_deterministic': False,
+    'requires_positive_X': False,
+    'requires_positive_y': False,
+    'X_types': ['2darray'],
+    'poor_score': False,
+    'no_validation': False,
+    'multioutput': False,
+    'allow_nan': False,
+    'stateless': False,
+    'multilabel': False,
+    '_skip_test': False,
+    '_xfail_checks': False,
+    'multioutput_only': False,
+    'binary_only': False,
+    'requires_fit': True,
+    'requires_y': False,
+    'pairwise': False,
+}
+
+
 class Base(metaclass=cuml.internals.BaseMetaClass):
     """
     Base class for all the ML algos. It handles some of the common operations
@@ -348,6 +376,16 @@ class Base(metaclass=cuml.internals.BaseMetaClass):
         else:
             self.n_features_in_ = X.shape[1]
 
+    def _get_tags(self):
+        """
+        Collect the estimator tags for this instance.
+
+        Starts from a copy of the module-level ``_default_tags`` and applies
+        the ``_more_tags`` overrides found along the class MRO, base classes
+        first, so that more derived classes take precedence.
+
+        Returns
+        -------
+        tags : dict
+            A new dictionary mapping tag names to values. Mutating it does
+            not affect the defaults or the tags of any other estimator.
+        """
+        # method and code based on scikit-learn 0.21 _get_tags functionality:
+        # https://scikit-learn.org/stable/developers/develop.html#estimator-tags
+        import copy
+
+        # Work on a copy: updating the shared module-level dict in place
+        # would leak one estimator's tags into the defaults seen by every
+        # later _get_tags() call, on any estimator.
+        collected_tags = copy.deepcopy(_default_tags)
+        for cl in reversed(inspect.getmro(self.__class__)):
+            if hasattr(cl, '_more_tags') and cl is not Base:
+                collected_tags.update(cl._more_tags(self))
+        return collected_tags
+
 
 class RegressorMixin:
     """Mixin class for regression estimators in cuML"""
@@ -379,6 +417,11 @@ class RegressorMixin:
         preds = self.predict(X, **kwargs)
         return r2_score(y, preds, handle=handle)
 
+    def _more_tags(self):
+        """Tag overrides for regressors: set 'requires_y' to True."""
+        return {
+            'requires_y': True
+        }
+
 
 class ClassifierMixin:
     """Mixin class for classifier estimators in cuML"""
@@ -410,6 +453,11 @@ class ClassifierMixin:
         preds = self.predict(X, **kwargs)
         return accuracy_score(y, preds, handle=handle)
 
+    def _more_tags(self):
+        """Tag overrides for classifiers: set 'requires_y' to True."""
+        return {
+            'requires_y': True
+        }
+
 
 # Internal, non class owned helper functions
 def _check_output_type_str(output_str):

diff --git a/python/cuml/decomposition/pca.pyx b/python/cuml/decomposition/pca.pyx
@@ -728,3 +728,10 @@ class PCA(Base):
     def __setstate__(self, state):
         self.__dict__.update(state)
         self.handle = Handle()
+
+    def _more_tags(self):
+        """Tag overrides: 'F' preferred input order; '2darray' and 'sparse'
+        listed for both the 'X_types_gpu' and 'X_types' tags."""
+        return {
+            'preferred_input_order': 'F',
+            'X_types_gpu': ['2darray', 'sparse'],
+            'X_types': ['2darray', 'sparse']
+        }
diff --git a/python/cuml/decomposition/tsvd.pyx b/python/cuml/decomposition/tsvd.pyx
@@ -476,3 +476,8 @@ class TruncatedSVD(Base):
     def get_param_names(self):
         return super().get_param_names() + \
             ["algorithm", "n_components", "n_iter", "random_state", "tol"]
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -959,3 +959,9 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin):
         if self.dtype == np.float64:
             return dump_rf_as_json(rf_forest64).decode('utf-8')
         return dump_rf_as_json(rf_forest).decode('utf-8')
+
+    def _more_tags(self):
+        """Tag overrides: preferred input order is explicitly None."""
+        return {
+            # fit and predict require conflicting memory layouts
+            'preferred_input_order': None
+        }
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
@@ -756,3 +756,9 @@ class RandomForestRegressor(BaseRandomForestModel, RegressorMixin):
         if self.dtype == np.float64:
             return dump_rf_as_json(rf_forest64).decode('utf-8')
         return dump_rf_as_json(rf_forest).decode('utf-8')
+
+    def _more_tags(self):
+        """Tag overrides: preferred input order is explicitly None."""
+        return {
+            # fit and predict require conflicting memory layouts
+            'preferred_input_order': None
+        }
diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
@@ -728,3 +728,8 @@ class ForestInference(Base):
 
         # DO NOT RETURN self._impl here!!
         return self
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'C' preferred input order."""
+        return {
+            'preferred_input_order': 'C'
+        }
diff --git a/python/cuml/linear_model/elastic_net.pyx b/python/cuml/linear_model/elastic_net.pyx
@@ -240,3 +240,8 @@ class ElasticNet(Base, RegressorMixin):
             "tol",
             "selection",
         ]
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/linear_model/lasso.pyx b/python/cuml/linear_model/lasso.pyx
@@ -208,3 +208,8 @@ class Lasso(Base, RegressorMixin):
             "tol",
             "selection",
         ]
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/linear_model/linear_regression.pyx b/python/cuml/linear_model/linear_regression.pyx
@@ -352,3 +352,8 @@ class LinearRegression(Base, RegressorMixin):
     def get_param_names(self):
         return super().get_param_names() + \
             ['algorithm', 'fit_intercept', 'normalize']
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/linear_model/logistic_regression.pyx b/python/cuml/linear_model/logistic_regression.pyx
@@ -424,3 +424,8 @@ class LogisticRegression(Base, ClassifierMixin):
         super(LogisticRegression, self).__init__(handle=None,
                                                  verbose=state["verbose"])
         self.__dict__.update(state)
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/linear_model/mbsgd_classifier.pyx b/python/cuml/linear_model/mbsgd_classifier.pyx
@@ -219,3 +219,8 @@ class MBSGDClassifier(Base, ClassifierMixin):
             "batch_size",
             "n_iter_no_change",
         ]
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/linear_model/mbsgd_regressor.pyx b/python/cuml/linear_model/mbsgd_regressor.pyx
@@ -213,3 +213,8 @@ class MBSGDRegressor(Base, RegressorMixin):
             "batch_size",
             "n_iter_no_change",
         ]
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/linear_model/ridge.pyx b/python/cuml/linear_model/ridge.pyx
@@ -214,7 +214,6 @@ class Ridge(Base, RegressorMixin):
     def __init__(self, alpha=1.0, solver='eig', fit_intercept=True,
                  normalize=False, handle=None, output_type=None,
                  verbose=False):
-
         """
         Initializes the linear ridge regression class.
 
@@ -394,3 +393,8 @@ class Ridge(Base, RegressorMixin):
     def get_param_names(self):
         return super().get_param_names() + \
             ['solver', 'fit_intercept', 'normalize', 'alpha']
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx
@@ -485,3 +485,8 @@ class TSNE(Base):
             "pre_momentum",
             "post_momentum",
         ]
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'C' preferred input order."""
+        return {
+            'preferred_input_order': 'C'
+        }
diff --git a/python/cuml/manifold/umap.pyx b/python/cuml/manifold/umap.pyx
@@ -877,3 +877,8 @@ class UMAP(Base):
             "optim_batch_size",
             "callback",
         ]
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'C' preferred input order."""
+        return {
+            'preferred_input_order': 'C'
+        }
diff --git a/python/cuml/neighbors/kneighbors_classifier.pyx b/python/cuml/neighbors/kneighbors_classifier.pyx
@@ -305,3 +305,9 @@ class KNeighborsClassifier(NearestNeighbors, ClassifierMixin):
 
     def get_param_names(self):
         return super().get_param_names() + ["weights"]
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            # fit and predict require conflicting memory layouts
+            # NOTE(review): the random forest estimators pair this same
+            # comment with None, but here the tag is 'F' — confirm whether
+            # the comment or the value is the intended one.
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/neighbors/kneighbors_regressor.pyx b/python/cuml/neighbors/kneighbors_regressor.pyx
@@ -231,3 +231,9 @@ class KNeighborsRegressor(NearestNeighbors, RegressorMixin):
 
     def get_param_names(self):
         return super().get_param_names() + ["weights"]
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            # fit and predict require conflicting memory layouts
+            # NOTE(review): the random forest estimators pair this same
+            # comment with None, but here the tag is 'F' — confirm whether
+            # the comment or the value is the intended one.
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/neighbors/nearest_neighbors.pyx b/python/cuml/neighbors/nearest_neighbors.pyx
@@ -746,3 +746,8 @@ def kneighbors_graph(X=None, n_neighbors=5, mode='connectivity', verbose=False,
             query = X.X_m
 
     return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode)
+
+    # NOTE(review): this def sits at the body indentation of the
+    # module-level kneighbors_graph() function, after its return statement,
+    # so as written it is an unreachable nested function rather than a
+    # method. Presumably it was meant for the NearestNeighbors class —
+    # confirm and relocate.
+    def _more_tags(self):
+        return {
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/random_projection/random_projection.pyx b/python/cuml/random_projection/random_projection.pyx
@@ -589,3 +589,8 @@ class SparseRandomProjection(Base, BaseRandomProjection):
             "dense_output",
             "random_state"
         ]
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/solvers/cd.pyx b/python/cuml/solvers/cd.pyx
@@ -349,3 +349,8 @@ class CD(Base):
             "tol",
             "shuffle",
         ]
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/solvers/qn.pyx b/python/cuml/solvers/qn.pyx
@@ -537,3 +537,8 @@ class QN(Base):
         return super().get_param_names() + \
             ['loss', 'fit_intercept', 'l1_strength', 'l2_strength',
                 'max_iter', 'tol', 'linesearch_max_iter', 'lbfgs_memory']
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/solvers/sgd.pyx b/python/cuml/solvers/sgd.pyx
@@ -507,3 +507,8 @@ class SGD(Base):
             "batch_size",
             "n_iter_no_change",
         ]
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/svm/svc.pyx b/python/cuml/svm/svc.pyx
@@ -519,3 +519,8 @@ class SVC(SVMBase, ClassifierMixin):
             params.remove("epsilon")
 
         return params
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            'preferred_input_order': 'F'
+        }
diff --git a/python/cuml/svm/svm_base.pyx b/python/cuml/svm/svm_base.pyx
@@ -574,3 +574,8 @@ class SVMBase(Base):
         self.__dict__.update(state)
         self._model = self._get_svm_model()
         self._freeSvmBuffers = False
+
+    def _more_tags(self):
+        """Tag overrides for this estimator: 'F' preferred input order."""
+        return {
+            'preferred_input_order': 'F'
+        }