From da3491760911104bee424d23cc458c4605871735 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= Date: Thu, 25 Nov 2021 23:48:08 +0100 Subject: [PATCH] Improve error message in ColumnTransformer parser (#792) * Improves error message in ColumnTransformer parser * fix attribute error --- skl2onnx/_parse.py | 15 ++- skl2onnx/_supported_operators.py | 12 +- skl2onnx/common/utils.py | 5 +- ...st_sklearn_k_bins_discretiser_converter.py | 111 ++++-------------- tests/test_sklearn_pipeline.py | 105 ++++++++++++++++- 5 files changed, 151 insertions(+), 97 deletions(-) diff --git a/skl2onnx/_parse.py b/skl2onnx/_parse.py index 2df59ba50..8a9e1ff6b 100644 --- a/skl2onnx/_parse.py +++ b/skl2onnx/_parse.py @@ -223,9 +223,20 @@ def _parse_sklearn_simple_model(scope, model, inputs, custom_parsers=None, variable = scope.declare_local_variable('variable', otype) this_operator.outputs.append(variable) else: - # We assume that all scikit-learn operator produce a single output. + if hasattr(model, 'get_feature_names_out'): + try: + out_names = model.get_feature_names_out() + except AttributeError: + # Catch a bug in scikit-learn. + out_names = None + this_operator.feature_names_out_ = out_names + if out_names is not None and len(out_names) == 0: + raise RuntimeError( + "get_feature_names_out() cannot return an empty value, " + "model is %r." % type(model)) + input_type = guess_tensor_type(inputs[0].type) variable = scope.declare_local_variable( - 'variable', guess_tensor_type(inputs[0].type)) + 'variable', input_type) this_operator.outputs.append(variable) options = scope.get_options(model, dict(decision_path=False), fail=False) diff --git a/skl2onnx/_supported_operators.py b/skl2onnx/_supported_operators.py index 7f51a3cee..dcceb1242 100644 --- a/skl2onnx/_supported_operators.py +++ b/skl2onnx/_supported_operators.py @@ -2,6 +2,7 @@ import warnings +import logging # Calibrated classifier CV from sklearn.calibration import CalibratedClassifierCV @@ -241,6 +242,8 @@ from .common._registration import register_converter, register_shape_calculator +logger = logging.getLogger('skl2onnx') + # In most cases, scikit-learn operator produces only one output. # However, each classifier has basically two outputs; one is the # predicted label and the other one is the probabilities of all @@ -505,9 +508,12 @@ def _get_sklearn_operator_name(model_type): our conversion framework """ if model_type not in sklearn_operator_name_map: - # "No proper operator name found, it means a local operator. - return None - return sklearn_operator_name_map[model_type] + # No proper operator name found, it means a local operator. + alias = None + else: + alias = sklearn_operator_name_map[model_type] + logger.debug('[parsing] found alias=%r for type=%r.', alias, model_type) + return alias def get_model_alias(model_type): diff --git a/skl2onnx/common/utils.py b/skl2onnx/common/utils.py index a4c536e28..bda65dcf6 100644 --- a/skl2onnx/common/utils.py +++ b/skl2onnx/common/utils.py @@ -112,7 +112,10 @@ def get_column_index(i, inputs): "Unable to find column name %r among names %r. " "Make sure the input names specified with parameter " "initial_types fits the column names specified in the " - "pipeline to convert." % (i, [n.onnx_name for n in inputs])) + "pipeline to convert. This may happen because a " + "ColumnTransformer follows a transformer without " + "any mapped converter in a pipeline." % ( + i, [n.onnx_name for n in inputs])) def get_column_indices(indices, inputs, multiple): diff --git a/tests/test_sklearn_k_bins_discretiser_converter.py b/tests/test_sklearn_k_bins_discretiser_converter.py index ff98cff42..d6bb803c7 100644 --- a/tests/test_sklearn_k_bins_discretiser_converter.py +++ b/tests/test_sklearn_k_bins_discretiser_converter.py @@ -35,14 +35,8 @@ def test_model_k_bins_discretiser_ordinal_uniform(self): ) self.assertTrue(model_onnx is not None) dump_data_and_model( - X.astype(np.float32), - model, - model_onnx, - basename="SklearnKBinsDiscretiserOrdinalUniform", - allow_failure="StrictVersion(" - "onnxruntime.__version__)" - "<= StrictVersion('0.2.1')", - ) + X.astype(np.float32), model, model_onnx, + basename="SklearnKBinsDiscretiserOrdinalUniform") @unittest.skipIf( KBinsDiscretizer is None, @@ -67,14 +61,8 @@ def test_model_k_bins_discretiser_ordinal_quantile(self): ) self.assertTrue(model_onnx is not None) dump_data_and_model( - X.astype(np.float32), - model, - model_onnx, - basename="SklearnKBinsDiscretiserOrdinalQuantile", - allow_failure="StrictVersion(" - "onnxruntime.__version__)" - "<= StrictVersion('0.2.1')", - ) + X.astype(np.float32), model, model_onnx, + basename="SklearnKBinsDiscretiserOrdinalQuantile") @unittest.skipIf( KBinsDiscretizer is None, @@ -98,14 +86,8 @@ def test_model_k_bins_discretiser_ordinal_kmeans(self): ) self.assertTrue(model_onnx is not None) dump_data_and_model( - X.astype(np.float32), - model, - model_onnx, - basename="SklearnKBinsDiscretiserOrdinalKMeans", - allow_failure="StrictVersion(" - "onnxruntime.__version__)" - "<= StrictVersion('0.2.1')", - ) + X.astype(np.float32), model, model_onnx, + basename="SklearnKBinsDiscretiserOrdinalKMeans") @unittest.skipIf( KBinsDiscretizer is None, @@ -125,13 +107,8 @@ def test_model_k_bins_discretiser_onehot_dense_uniform(self): ) self.assertTrue(model_onnx is not None) dump_data_and_model( - X.astype(np.float32), - model, - model_onnx, - basename="SklearnKBinsDiscretiserOneHotDenseUniform", - allow_failure="StrictVersion(onnxruntime.__version__)" - "<= StrictVersion('0.2.1')", - ) + X.astype(np.float32), model, model_onnx, + basename="SklearnKBinsDiscretiserOneHotDenseUniform") @unittest.skipIf( KBinsDiscretizer is None, @@ -156,13 +133,8 @@ def test_model_k_bins_discretiser_onehot_dense_quantile(self): ) self.assertTrue(model_onnx is not None) dump_data_and_model( - X.astype(np.float32), - model, - model_onnx, - basename="SklearnKBinsDiscretiserOneHotDenseQuantile", - allow_failure="StrictVersion(onnxruntime.__version__)" - "<= StrictVersion('0.2.1')", - ) + X.astype(np.float32), model, model_onnx, + basename="SklearnKBinsDiscretiserOneHotDenseQuantile") @unittest.skipIf( KBinsDiscretizer is None, @@ -187,13 +159,8 @@ def test_model_k_bins_discretiser_onehot_dense_kmeans(self): ) self.assertTrue(model_onnx is not None) dump_data_and_model( - X.astype(np.float32), - model, - model_onnx, - basename="SklearnKBinsDiscretiserOneHotDenseKMeans", - allow_failure="StrictVersion(onnxruntime.__version__)" - "<= StrictVersion('0.2.1')", - ) + X.astype(np.float32), model, model_onnx, + basename="SklearnKBinsDiscretiserOneHotDenseKMeans") @unittest.skipIf( KBinsDiscretizer is None, @@ -212,13 +179,8 @@ def test_model_k_bins_discretiser_ordinal_uniform_int(self): ) self.assertTrue(model_onnx is not None) dump_data_and_model( - X.astype(np.int64), - model, - model_onnx, - basename="SklearnKBinsDiscretiserOrdinalUniformInt", - allow_failure="StrictVersion(onnxruntime.__version__)" - "<= StrictVersion('0.2.1')", - ) + X.astype(np.int64), model, model_onnx, + basename="SklearnKBinsDiscretiserOrdinalUniformInt") @unittest.skipIf( KBinsDiscretizer is None, @@ -241,13 +203,8 @@ def test_model_k_bins_discretiser_ordinal_quantile_int(self): ) self.assertTrue(model_onnx is not None) dump_data_and_model( - X.astype(np.int64), - model, - model_onnx, - basename="SklearnKBinsDiscretiserOrdinalQuantileInt", - allow_failure="StrictVersion(onnxruntime.__version__)" - "<= StrictVersion('0.2.1')", - ) + X.astype(np.int64), model, model_onnx, + basename="SklearnKBinsDiscretiserOrdinalQuantileInt") @unittest.skipIf( KBinsDiscretizer is None, @@ -268,13 +225,8 @@ def test_model_k_bins_discretiser_ordinal_kmeans_int(self): ) self.assertTrue(model_onnx is not None) dump_data_and_model( - X.astype(np.int64), - model, - model_onnx, - basename="SklearnKBinsDiscretiserOrdinalKMeansInt", - allow_failure="StrictVersion(onnxruntime.__version__)" - "<= StrictVersion('0.2.1')", - ) + X.astype(np.int64), model, model_onnx, + basename="SklearnKBinsDiscretiserOrdinalKMeansInt") @unittest.skipIf( KBinsDiscretizer is None, @@ -293,13 +245,8 @@ def test_model_k_bins_discretiser_onehot_dense_uniform_int(self): ) self.assertTrue(model_onnx is not None) dump_data_and_model( - X.astype(np.int64), - model, - model_onnx, - basename="SklearnKBinsDiscretiserOneHotDenseUniformInt", - allow_failure="StrictVersion(onnxruntime.__version__)" - "<= StrictVersion('0.2.1')", - ) + X.astype(np.int64), model, model_onnx, + basename="SklearnKBinsDiscretiserOneHotDenseUniformInt") @unittest.skipIf( KBinsDiscretizer is None, @@ -318,13 +265,8 @@ def test_model_k_bins_discretiser_onehot_dense_quantile_int(self): ) self.assertTrue(model_onnx is not None) dump_data_and_model( - X.astype(np.int64), - model, - model_onnx, - basename="SklearnKBinsDiscretiserOneHotDenseQuantileInt", - allow_failure="StrictVersion(onnxruntime.__version__)" - "<= StrictVersion('0.2.1')", - ) + X.astype(np.int64), model, model_onnx, + basename="SklearnKBinsDiscretiserOneHotDenseQuantileInt") @unittest.skipIf( KBinsDiscretizer is None, @@ -347,13 +289,8 @@ def test_model_k_bins_discretiser_onehot_dense_kmeans_int(self): ) self.assertTrue(model_onnx is not None) dump_data_and_model( - X.astype(np.int64), - model, - model_onnx, - basename="SklearnKBinsDiscretiserOneHotDenseKMeansInt", - allow_failure="StrictVersion(onnxruntime.__version__)" - "<= StrictVersion('0.2.1')", - ) + X.astype(np.int64), model, model_onnx, + basename="SklearnKBinsDiscretiserOneHotDenseKMeansInt") if __name__ == "__main__": diff --git a/tests/test_sklearn_pipeline.py b/tests/test_sklearn_pipeline.py index 989b55155..0cf455c49 100644 --- a/tests/test_sklearn_pipeline.py +++ b/tests/test_sklearn_pipeline.py @@ -8,9 +8,11 @@ import numpy from numpy.testing import assert_almost_equal import pandas -from sklearn import __version__ as sklearn_version +from sklearn import __version__ as skl_version from sklearn import datasets from sklearn.calibration import CalibratedClassifierCV +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.tree import DecisionTreeClassifier try: # scikit-learn >= 0.22 @@ -20,9 +22,13 @@ from sklearn.utils.testing import ignore_warnings try: from sklearn.compose import ColumnTransformer + from sklearn.compose import ( + make_column_transformer, make_column_selector) except ImportError: # not available in 0.19 ColumnTransformer = None + make_column_selector = None + make_column_transformer = None from sklearn.decomposition import PCA, TruncatedSVD try: @@ -52,13 +58,13 @@ from onnxruntime import __version__ as ort_version, InferenceSession +# StrictVersion does not work with development versions ort_version = ".".join(ort_version.split('.')[:2]) +skl_version = ".".join(skl_version.split('.')[:2]) def check_scikit_version(): - # StrictVersion does not work with development versions - vers = '.'.join(sklearn_version.split('.')[:2]) - return StrictVersion(vers) >= StrictVersion("0.21.0") + return StrictVersion(skl_version) >= StrictVersion("0.22") class PipeConcatenateInput: @@ -667,6 +673,9 @@ def test_pipeline_pipeline_voting_tfidf_svc(self): @unittest.skipIf(TARGET_OPSET < 11, reason="SequenceConstruct not available") + @unittest.skipIf( + not check_scikit_version(), + reason="Scikit 0.21 too old") @ignore_warnings(category=(FutureWarning, UserWarning)) def test_pipeline_pipeline_rf(self): cat_feat = ['A', 'B'] @@ -716,6 +725,9 @@ def test_pipeline_pipeline_rf(self): @unittest.skipIf(TARGET_OPSET < 11, reason="SequenceConstruct not available") + @unittest.skipIf( + not check_scikit_version(), + reason="Scikit 0.21 too old") @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) def test_issue_712_multio(self): dfx = pandas.DataFrame( @@ -759,6 +771,9 @@ def test_issue_712_multio(self): @unittest.skipIf(TARGET_OPSET < 11, reason="SequenceConstruct not available") + @unittest.skipIf( + not check_scikit_version(), + reason="Scikit 0.21 too old") @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) def test_issue_712_svc_multio(self): for sub_model in [LinearSVC(), SVC()]: @@ -820,6 +835,9 @@ def test_issue_712_svc_multio(self): @unittest.skipIf(TARGET_OPSET < 11, reason="SequenceConstruct not available") + @unittest.skipIf( + not check_scikit_version(), + reason="Scikit 0.21 too old") @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) def test_issue_712_svc_binary0(self): for sub_model in [LinearSVC(), SVC()]: @@ -871,6 +889,9 @@ def test_issue_712_svc_binary0(self): @unittest.skipIf(TARGET_OPSET < 11, reason="SequenceConstruct not available") + @unittest.skipIf( + not check_scikit_version(), + reason="Scikit 0.21 too old") @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) def test_issue_712_svc_multi(self): for sub_model in [SVC(), LinearSVC()]: @@ -925,6 +946,82 @@ def test_issue_712_svc_multi(self): else: assert_almost_equal(expected_proba, got[1], decimal=5) + @unittest.skipIf(TARGET_OPSET < 11, + reason="SequenceConstruct not available") + @unittest.skipIf( + not check_scikit_version(), + reason="Scikit 0.21 too old") + @ignore_warnings(category=(FutureWarning, UserWarning)) + def test_pipeline_make_column_selector(self): + X = pandas.DataFrame({ + 'city': ['London', 'London', 'Paris', 'Sallisaw'], + 'rating': [5, 3, 4, 5]}) + X['rating'] = X['rating'].astype(numpy.float32) + ct = make_column_transformer( + (StandardScaler(), make_column_selector( + dtype_include=numpy.number)), + (OneHotEncoder(), make_column_selector( + dtype_include=object))) + expected = ct.fit_transform(X) + onx = to_onnx(ct, X, target_opset=TARGET_OPSET) + sess = InferenceSession(onx.SerializeToString()) + names = [i.name for i in sess.get_inputs()] + got = sess.run(None, {names[0]: X[names[0]].values.reshape((-1, 1)), + names[1]: X[names[1]].values.reshape((-1, 1))}) + assert_almost_equal(expected, got[0]) + + @unittest.skipIf( + not check_scikit_version(), + reason="Scikit 0.21 too old") + def test_feature_selector_no_converter(self): + + class ColumnSelector(TransformerMixin, BaseEstimator): + def __init__(self, cols): + if not isinstance(cols, list): + self.cols = [cols] + else: + self.cols = cols + + def fit(self, X, y): + return self + + def transform(self, X): + X = X.copy() + return X[self.cols] + + # Inspired from + # https://github.com/databricks/automl/blob/main/ + # runtime/tests/automl_runtime/sklearn/column_selector_test.py + X_in = pandas.DataFrame( + numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + dtype=numpy.float32), + columns=["a", "b", "c"]) + y = pandas.DataFrame(numpy.array([[1], [0], [1]]), + columns=["label"]) + X_out_expected = numpy.array([1, 0, 1]) + + standardizer = StandardScaler() + selected_cols = ["a", "b"] + col_selector = ColumnSelector(selected_cols) + preprocessor = ColumnTransformer( + [("standardizer", standardizer, selected_cols)], remainder="drop") + + model = Pipeline([ + ("column_selector", col_selector), + ("preprocessor", preprocessor), + ("decision_tree", DecisionTreeClassifier()) + ]) + model.fit(X=X_in, y=y) + # Add one column so that the dataframe for prediction is + # different with the data for training + X_in["useless"] = 1 + X_out = model.predict(X_in) + assert_almost_equal(X_out, X_out_expected) + + with self.assertRaises(RuntimeError) as e: + to_onnx(model, X_in) + self.assertIn('ColumnTransformer', str(e)) + if __name__ == "__main__": # import logging