diff --git a/.azure-pipelines/linux-conda-CI.yml b/.azure-pipelines/linux-conda-CI.yml
index 8dbf4de2b..1f3871779 100644
--- a/.azure-pipelines/linux-conda-CI.yml
+++ b/.azure-pipelines/linux-conda-CI.yml
@@ -36,7 +36,7 @@ jobs:
         sklearn.version: '==0.24.2'
         lgbm.version: ''
         onnxcc.version: '>=1.8.1'  # git
-        run.example: '1'
+        run.example: '0'
       Py39-Onnx1101-Rt181-Skl0242:
         do.bench: '0'
         python.version: '3.9'
diff --git a/skl2onnx/common/_topology.py b/skl2onnx/common/_topology.py
index 2389db998..7419a56e6 100644
--- a/skl2onnx/common/_topology.py
+++ b/skl2onnx/common/_topology.py
@@ -958,7 +958,7 @@ def _generate_unique_name(seed, existing_names):
 
         # Make the seed meet C-style naming convention
-        # Only alphabets and numbers are allowed
-        seed = re.sub('[^0-9a-zA-Z]', '_', seed)
+        # Only word characters (letters, digits, underscore) are allowed
+        seed = re.sub(r'\W', '_', seed)
         # The first symbol cannot be a number
         if re.match('^[0-9]', seed):
             seed = '_' + seed
diff --git a/tests/test_variable_names.py b/tests/test_variable_names.py
index 0030fb1db..d6fddfa11 100644
--- a/tests/test_variable_names.py
+++ b/tests/test_variable_names.py
@@ -1,16 +1,29 @@
 # SPDX-License-Identifier: Apache-2.0
+# coding: utf-8
 
 import unittest
 import copy
+from textwrap import dedent
+from io import StringIO
 import numpy as np
+import pandas as pd
 from numpy.testing import assert_almost_equal
 from sklearn.pipeline import Pipeline
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.linear_model import LinearRegression
 from onnxruntime import InferenceSession
+try:
+    from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument
+except ImportError:
+    InvalidArgument = RuntimeError
 from skl2onnx import get_model_alias, update_registered_converter
 from skl2onnx.algebra.onnx_ops import OnnxIdentity
-from skl2onnx import convert_sklearn
-from onnxconverter_common.data_types import FloatTensorType
-from test_utils import TARGET_OPSET
+from skl2onnx import convert_sklearn, to_onnx
+from onnxconverter_common.data_types import (
+    FloatTensorType, Int64TensorType, StringTensorType)
+from test_utils import fit_regression_model, TARGET_OPSET
 
 
 class Passthrough:
@@ -96,6 +109,89 @@ def test_variable_names_output(self):
                             target_opset=TARGET_OPSET,
                             final_types=final_types)
 
+    def _test_non_ascii_variable_name(self):
+        """Convert a linear regression whose input is named '年齢'.
+
+        The leading underscore keeps unittest from collecting this test.
+        """
+        model, X = fit_regression_model(LinearRegression())
+        model_onnx = to_onnx(
+            model, name="linear regression",
+            initial_types=[("年齢", FloatTensorType([None, X.shape[1]]))],
+            target_opset=TARGET_OPSET)
+        sess = InferenceSession(model_onnx.SerializeToString())
+        # Invalid Feed Input Name:\u5e74\u9f62
+        # sess.run(None, {'年齢': X})
+        self.assertIsNotNone(sess)
+
+    def test_non_ascii_variable_name_pipeline(self):
+        """Convert a ColumnTransformer fed a column named '年齢'.
+
+        Only conversion and session creation are checked; the inference
+        call is kept commented out at the end of the test.
+        """
+
+        data = dedent("""
+            pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
+            1,1,"A",female,29.0,0,0,24160,211.3375,B5,S,2,,"MO"
+            1,1,"B",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Can"
+            1,0,"C",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Can"
+            1,0,"D",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Can"
+            1,0,"E",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Can"
+            1,1,"F",male,48.0,0,0,19952,26.55,E12,S,3,,"NY"
+            1,1,"G",female,63.0,1,0,13502,77.9583,D7,S,10,,"NY"
+            1,0,"H",male,39.0,0,0,112050,0.0,A36,S,,,"NI"
+            1,1,"I",female,53.0,2,0,11769,51.4792,C101,S,D,,"NY"
+            1,0,"J",male,71.0,0,0,PC 17609,49.5042,,C,,22.0,"Uruguay"
+            1,0,"K",male,47.0,1,0,PC 17757,227.525,C62 C64,C,,124.0,"NY"
+            1,1,"L",female,18.0,1,0,PC 17757,227.525,C62 C64,C,4,,"NY"
+            1,1,"M",female,24.0,0,0,PC 17477,69.3,B35,C,9,,"F"
+            1,1,"N",female,26.0,0,0,19877,78.85,,S,6,,
+            1,1,"L",male,80.0,0,0,27042,30.0,A23,S,B,,"Yorks"
+            1,0,"O",male,,0,0,PC 17318,25.925,,S,,,"NY"
+            1,0,"P",male,24.0,0,1,PC 17558,247.5208,B58 B60,C,,,"PQ"
+            1,1,"Q",female,50.0,0,1,PC 17558,247.5208,B58 B60,C,6,,"PQ"
+            1,1,"R",female,32.0,0,0,11813,76.2917,D15,C,8,,
+            1,0,"S",male,36.0,0,0,13050,75.2417,C6,C,A,,"MN"
+        """).strip(" \n")
+        data = pd.read_csv(StringIO(data))
+        data.rename(columns={"age": "年齢"}, inplace=True)
+        X = data.drop('survived', axis=1)
+        # y = data['survived']
+        cols = ['embarked', 'sex', 'pclass', '年齢', 'fare']
+        # Copy so the fillna below writes to X itself, not to a slice view
+        X = X[cols].copy()
+        for cat in ['embarked', 'sex', 'pclass']:
+            X[cat] = X[cat].fillna('missing')
+        numeric_features = ['年齢', 'fare']
+        numeric_transformer = Pipeline(steps=[
+            ('imputer', SimpleImputer(strategy='median')),
+            ('scaler', StandardScaler())])
+        categorical_features = ['embarked', 'sex', 'pclass']
+        categorical_transformer = Pipeline(steps=[
+            ('onehot', OneHotEncoder(handle_unknown='ignore'))])
+        preprocessor = ColumnTransformer(
+            transformers=[
+                ('num', numeric_transformer, numeric_features),
+                ('cat', categorical_transformer, categorical_features)])
+        preprocessor.fit_transform(X)
+        initial_type = [('pclass', Int64TensorType(shape=[None, 1])),
+                        ('sex', StringTensorType(shape=[None, 1])),
+                        ('年齢', FloatTensorType(shape=[None, 1])),
+                        ('fare', FloatTensorType(shape=[None, 1])),
+                        ('embarked', StringTensorType(shape=[None, 1]))]
+
+        onnx_object = convert_sklearn(
+            preprocessor, initial_types=initial_type,
+            target_opset=TARGET_OPSET)
+        sess = InferenceSession(onnx_object.SerializeToString())
+        self.assertIsNotNone(sess)
+        # Invalid Feed Input Name:\u5e74\u9f62
+        # onx_data = {}
+        # for col in initial_type:
+        #     onx_data[col[0]] = X[col[0]].values.reshape((-1, 1))
+        # sess.run(None, onx_data)
+
 
 if __name__ == "__main__":
     unittest.main()