diff --git a/skl2onnx/operator_converters/text_vectoriser.py b/skl2onnx/operator_converters/text_vectoriser.py
index 4a80d648c..4b4390903 100644
--- a/skl2onnx/operator_converters/text_vectoriser.py
+++ b/skl2onnx/operator_converters/text_vectoriser.py
@@ -226,8 +226,12 @@ def convert_sklearn_text_vectorizer(scope: Scope, operator: Operator,
             "You may raise an issue at "
             "https://github.com/onnx/sklearn-onnx/issues.")
 
-    stop_words = op.stop_words_ | (
-        set(op.stop_words) if op.stop_words else set())
+    # Constructor stop words apply even when the fitted ``stop_words_``
+    # attribute is absent (e.g. with a user-supplied vocabulary).
+    stop_words = set(op.stop_words) if op.stop_words else set()
+    fitted_stop_words = getattr(op, "stop_words_", None)
+    if fitted_stop_words:
+        stop_words |= set(fitted_stop_words)
 
     if op.lowercase or stop_words:
         if len(operator.input_full_names) != 1:
diff --git a/skl2onnx/operator_converters/tfidf_transformer.py b/skl2onnx/operator_converters/tfidf_transformer.py
index ff7dec1a1..e755e78c8 100644
--- a/skl2onnx/operator_converters/tfidf_transformer.py
+++ b/skl2onnx/operator_converters/tfidf_transformer.py
@@ -8,7 +8,7 @@
 from ..common._topology import Scope, Operator
 from ..common._container import ModelComponentContainer
 from ..common._apply_operation import (
-    apply_mul, apply_identity, apply_normalizer)
+    apply_add, apply_log, apply_mul, apply_identity, apply_normalizer)
 
 
 def convert_sklearn_tfidf_transformer(scope: Scope, operator: Operator,
@@ -30,23 +30,23 @@ def convert_sklearn_tfidf_transformer(scope: Scope, operator: Operator,
         # code scikit-learn
         # np.log(X.data, X.data) --> does not apply on null coefficient
         # X.data += 1
-        raise RuntimeError(
-            "ONNX does not support sparse tensors before opset < 11, "
-            "sublinear_tf must be False.")
-
-        # In case sparse is enabled.
-        # C = operator.inputs[0].type.shape[1]
-        # logged = scope.get_unique_variable_name('logged')
-        # apply_log(scope, data, logged, container)
-        # if not op.use_idf and op.norm is None:
-        #     loggedplus1 = final
-        # else:
-        #     loggedplus1 = scope.get_unique_variable_name('loggedplus1')
-        # ones = scope.get_unique_variable_name('ones')
-        # cst = np.ones((C,), dtype=float_type)
-        # container.add_initializer(ones, proto_dtype, [C], cst.flatten())
-        # apply_add(scope, [logged, ones], loggedplus1, container, broadcast=1)
-        # data = [loggedplus1]
+        # ONNX does not support sparse tensors before opset < 11
+        # approximated by X.data += 1 --> np.log(X.data, X.data)
+        if operator.target_opset < 11:
+            plus1 = scope.get_unique_variable_name("plus1")
+            C = operator.inputs[0].type.shape[1]
+            ones = scope.get_unique_variable_name("ones")
+            cst = np.ones((C,), dtype=float_type)
+            container.add_initializer(ones, proto_dtype, [C], cst.flatten())
+            apply_add(scope, data + [ones], plus1, container, broadcast=1)
+            plus1logged = scope.get_unique_variable_name("plus1logged")
+            apply_log(scope, plus1, plus1logged, container)
+            data = [plus1logged]
+        else:
+            # sparse containers have not yet been implemented.
+            raise RuntimeError(
+                "ONNX does not support sparse tensors before opset < 11, "
+                "sublinear_tf must be False.")
 
     if op.use_idf:
         cst = op.idf_.astype(float_type)
diff --git a/tests/test_sklearn_tfidf_vectorizer_converter.py b/tests/test_sklearn_tfidf_vectorizer_converter.py
index 26595ffcd..9bcc8ce0c 100644
--- a/tests/test_sklearn_tfidf_vectorizer_converter.py
+++ b/tests/test_sklearn_tfidf_vectorizer_converter.py
@@ -577,6 +577,34 @@ def test_model_tfidf_vectorizer_nan(self):
         assert res.shape == (4, 9)
         assert numpy.isnan(res[0, 0])
 
+    @unittest.skipIf(
+        StrictVersion(onnx.__version__) <= StrictVersion("1.4.1"),
+        reason="Requires opset 9.")
+    def test_model_tfidf_vectorizer11_custom_vocabulary(self):
+        corpus = numpy.array([
+            "This is the first document.",
+            "This document is the second document.",
+            "And this is the third one.",
+            "Is this the first document?",
+        ]).reshape((4, 1))
+        vc = ["first", "second", "third", "document", "this"]
+        vect = TfidfVectorizer(ngram_range=(1, 1), norm=None, vocabulary=vc)
+        vect.fit(corpus.ravel())
+        self.assertFalse(hasattr(vect, "stop_words_"))
+        model_onnx = convert_sklearn(vect, "TfidfVectorizer",
+                                     [("input", StringTensorType())],
+                                     options=self.get_options(),
+                                     target_opset=TARGET_OPSET)
+        self.assertTrue(model_onnx is not None)
+        dump_data_and_model(
+            corpus,
+            vect,
+            model_onnx,
+            basename="SklearnTfidfVectorizer11CustomVocab-OneOff-SklCol",
+            allow_failure="StrictVersion(onnxruntime.__version__)"
+                          " <= StrictVersion('0.4.0')",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()