Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support non ascii characters in variable names #784

Merged
merged 4 commits into from
Nov 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .azure-pipelines/linux-conda-CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
sklearn.version: '==0.24.2'
lgbm.version: ''
onnxcc.version: '>=1.8.1' # git
run.example: '1'
run.example: '0'
Py39-Onnx1101-Rt181-Skl0242:
do.bench: '0'
python.version: '3.9'
Expand Down
2 changes: 1 addition & 1 deletion skl2onnx/common/_topology.py
Original file line number Diff line number Diff line change
Expand Up @@ -958,7 +958,7 @@ def _generate_unique_name(seed, existing_names):

# Make the seed meet C-style naming convention
# Only alphabets and numbers are allowed
seed = re.sub('[^0-9a-zA-Z]', '_', seed)
seed = re.sub('[^\\w+]', '_', seed)
# The first symbol cannot be a number
if re.match('^[0-9]', seed):
seed = '_' + seed
Expand Down
92 changes: 89 additions & 3 deletions tests/test_variable_names.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
# coding: utf-8

import unittest
import copy
from textwrap import dedent
from io import StringIO
import numpy as np
import pandas as pd
from numpy.testing import assert_almost_equal
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from onnxruntime import InferenceSession
try:
from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument
except ImportError:
InvalidArgument = RuntimeError
from skl2onnx import get_model_alias, update_registered_converter
from skl2onnx.algebra.onnx_ops import OnnxIdentity
from skl2onnx import convert_sklearn
from onnxconverter_common.data_types import FloatTensorType
from test_utils import TARGET_OPSET
from skl2onnx import convert_sklearn, to_onnx
from onnxconverter_common.data_types import (
FloatTensorType, Int64TensorType, StringTensorType)
from test_utils import fit_regression_model, TARGET_OPSET


class Passthrough:
Expand Down Expand Up @@ -96,6 +109,79 @@ def test_variable_names_output(self):
target_opset=TARGET_OPSET,
final_types=final_types)

def _test_non_ascii_variable_name(self):
    """Convert a linear regression whose input has a non-ASCII name.

    The model input is declared as "年齢" and the converted model is only
    loaded into an onnxruntime session; no inference is run.

    The method name starts with an underscore, so the default unittest
    loader (which collects ``test*`` methods) does not pick it up.
    """
    model, X = fit_regression_model(LinearRegression())
    model_onnx = to_onnx(
        model, name="linear regression",
        initial_types=[("年齢", FloatTensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET)
    sess = InferenceSession(model_onnx.SerializeToString())
    # Inference is not exercised here. The call
    #     sess.run(None, {'年齢': X})
    # was left disabled together with the note
    #     Invalid Feed Input Name:\u5e74\u9f62
    # assertIsNotNone reports the offending value on failure, unlike
    # assertTrue(sess is not None).
    self.assertIsNotNone(sess)

def test_non_ascii_variable_name_pipeline(self):
    """Convert a ColumnTransformer fed by a column with a non-ASCII name.

    A small Titanic-like CSV sample is loaded, its ``age`` column is
    renamed to "年齢", and a preprocessing ColumnTransformer (median
    imputation + scaling for numeric columns, one-hot encoding for
    categorical ones) is fitted and converted. The test only checks that
    onnxruntime can load the converted model; no inference is run.
    """
    # The CSV rows are kept flush-left; dedent() and strip() make the
    # parsed text independent of how the literal is indented.
    csv_text = dedent("""
pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1,1,"A",female,29.0,0,0,24160,211.3375,B5,S,2,,"MO"
1,1,"B",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Can"
1,0,"C",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Can"
1,0,"D",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Can"
1,0,"E",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Can"
1,1,"F",male,48.0,0,0,19952,26.55,E12,S,3,,"NY"
1,1,"G",female,63.0,1,0,13502,77.9583,D7,S,10,,"NY"
1,0,"H",male,39.0,0,0,112050,0.0,A36,S,,,"NI"
1,1,"I",female,53.0,2,0,11769,51.4792,C101,S,D,,"NY"
1,0,"J",male,71.0,0,0,PC 17609,49.5042,,C,,22.0,"Uruguay"
1,0,"K",male,47.0,1,0,PC 17757,227.525,C62 C64,C,,124.0,"NY"
1,1,"L",female,18.0,1,0,PC 17757,227.525,C62 C64,C,4,,"NY"
1,1,"M",female,24.0,0,0,PC 17477,69.3,B35,C,9,,"F"
1,1,"N",female,26.0,0,0,19877,78.85,,S,6,,
1,1,"L",male,80.0,0,0,27042,30.0,A23,S,B,,"Yorks"
1,0,"O",male,,0,0,PC 17318,25.925,,S,,,"NY"
1,0,"P",male,24.0,0,1,PC 17558,247.5208,B58 B60,C,,,"PQ"
1,1,"Q",female,50.0,0,1,PC 17558,247.5208,B58 B60,C,6,,"PQ"
1,1,"R",female,32.0,0,0,11813,76.2917,D15,C,8,,
1,0,"S",male,36.0,0,0,13050,75.2417,C6,C,A,,"MN"
""").strip(" \n")
    data = pd.read_csv(StringIO(csv_text))
    data = data.rename(columns={"age": "年齢"})

    numeric_features = ['年齢', 'fare']
    categorical_features = ['embarked', 'sex', 'pclass']
    cols = ['embarked', 'sex', 'pclass', '年齢', 'fare']

    # Work on an explicit copy and assign the filled column back.
    # The chained form `X[cat].fillna(..., inplace=True)` acts on a
    # temporary under pandas Copy-on-Write and leaves X unchanged.
    X = data.drop('survived', axis=1)[cols].copy()
    for cat in categorical_features:
        X[cat] = X[cat].fillna('missing')

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])
    # Only the fitted state is needed; the transformed output is unused.
    preprocessor.fit_transform(X)

    # One ONNX input per dataframe column, each a single-column tensor.
    initial_type = [('pclass', Int64TensorType(shape=[None, 1])),
                    ('sex', StringTensorType(shape=[None, 1])),
                    ('年齢', FloatTensorType(shape=[None, 1])),
                    ('fare', FloatTensorType(shape=[None, 1])),
                    ('embarked', StringTensorType(shape=[None, 1]))]

    onnx_object = convert_sklearn(
        preprocessor, initial_types=initial_type,
        target_opset=TARGET_OPSET)
    sess = InferenceSession(onnx_object.SerializeToString())
    self.assertIsNotNone(sess)
    # Inference is not exercised here. The snippet below was left
    # disabled together with the note
    #     Invalid Feed Input Name:\u5e74\u9f62
    #
    # onx_data = {}
    # for col in initial_type:
    #     onx_data[col[0]] = X[col[0]].values.reshape((-1, 1))
    # sess.run(None, onx_data)


if __name__ == "__main__":
unittest.main()