diff --git a/docs/cn/bucketizer.md b/docs/cn/bucketizer.md index 8ea6cec11..9a9df3a9d 100644 --- a/docs/cn/bucketizer.md +++ b/docs/cn/bucketizer.md @@ -1,19 +1,23 @@ ## 功能介绍 给定切分点,将连续变量分桶,可支持单列输入或多列输入,对应需要给出单列切分点或者多列切分点。 -每列切分点需要严格递增,且至少有三个点。 - ## 参数说明 + | 名称 | 中文名称 | 描述 | 类型 | 是否必须? | 默认值 | | --- | --- | --- | --- | --- | --- | -| handleInvalid | 如何处理无效值 | 可以选择skip:跳过,error:报错抛异常。 | String | | "error" | -| selectedCols | 计算列对应的列名列表 | 计算列对应的列名列表 | String[] | | | -| splitsArray | 多列的切分点 | 多列的切分点 | String[] | | | +| selectedCols | 选择的列名 | 计算列对应的列名列表 | String[] | ✓ | | +| reservedCols | 算法保留列名 | 算法保留列 | String[] | | null | | outputCols | 输出结果列列名数组 | 输出结果列列名数组,可选,默认null | String[] | | null | -| reservedCols | 算法保留列名 | 算法保留列 | String[] | | null | +| handleInvalid | 未知Token处理策略 | 未知Token处理策略,"keep", "skip", "error" | String | | "keep" | +| encode | 编码方式 | 编码方式,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | |INDEX | +| dropLast | 是否删除最后一个元素 | 是否删除最后一个元素 | Boolean | | true | +| leftOpen | 是否左开右闭 | 是否左开右闭 | Boolean | | true | +| cutsArray | 多列的切分点 | 多列的切分点 | double[][] | ✓ | | + + ## 脚本示例 #### 脚本代码 @@ -29,7 +33,7 @@ data = np.array([ df = pd.DataFrame({"double": data[:, 0], "bool": data[:, 1], "number": data[:, 2], "str": data[:, 3]}) inOp = BatchOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') -bucketizer = Bucketizer().setSelectedCols(["double"]).setSplitsArray(["-Infinity:2:Infinity"]) +bucketizer = Bucketizer().setSelectedCols(["double"]).setCutsArray([[2]]) bucketizer.transform(inOp).print() ``` #### 脚本运行结果 diff --git a/docs/cn/bucketizerbatchop.md b/docs/cn/bucketizerbatchop.md index ad66ec3e2..81d9249cd 100644 --- a/docs/cn/bucketizerbatchop.md +++ b/docs/cn/bucketizerbatchop.md @@ -1,19 +1,20 @@ ## 功能介绍 给定切分点,将连续变量分桶,可支持单列输入或多列输入,对应需要给出单列切分点或者多列切分点。 -每列切分点需要严格递增,且至少有三个点。 - ## 参数说明 | 名称 | 中文名称 | 描述 | 类型 | 是否必须? 
| 默认值 | | --- | --- | --- | --- | --- | --- | -| handleInvalid | 如何处理无效值 | 可以选择skip:跳过,error:报错抛异常。 | String | | "error" | -| selectedCols | 计算列对应的列名列表 | 计算列对应的列名列表 | String[] | | | -| splitsArray | 多列的切分点 | 多列的切分点 | String[] | | | +| selectedCols | 选择的列名 | 计算列对应的列名列表 | String[] | ✓ | | +| reservedCols | 算法保留列名 | 算法保留列 | String[] | | null | | outputCols | 输出结果列列名数组 | 输出结果列列名数组,可选,默认null | String[] | | null | -| reservedCols | 算法保留列名 | 算法保留列 | String[] | | null | +| handleInvalid | 未知Token处理策略 | 未知Token处理策略,"keep", "skip", "error" | String | | "keep" | +| encode | 编码方式 | 编码方式,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | |INDEX | +| dropLast | 是否删除最后一个元素 | 是否删除最后一个元素 | Boolean | | true | +| leftOpen | 是否左开右闭 | 是否左开右闭 | Boolean | | true | +| cutsArray | 多列的切分点 | 多列的切分点 | double[][] | ✓ | | ## 脚本示例 #### 脚本代码 @@ -31,10 +32,10 @@ df = pd.DataFrame({"double": data[:, 0], "bool": data[:, 1], "number": data[:, 2 inOp1 = BatchOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') inOp2 = StreamOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') -bucketizer = BucketizerBatchOp().setSelectedCols(["double"]).setSplitsArray(["-Infinity:2:Infinity"]) +bucketizer = BucketizerBatchOp().setSelectedCols(["double"]).setCutsArray([[2]]) bucketizer.linkFrom(inOp1).print() -bucketizer = BucketizerStreamOp().setSelectedCols(["double"]).setSplitsArray(["-Infinity:2:Infinity"]) +bucketizer = BucketizerStreamOp().setSelectedCols(["double"]).setCutsArray([[2]]) bucketizer.linkFrom(inOp2).print() StreamOperator.execute() diff --git a/docs/cn/bucketizerstreamop.md b/docs/cn/bucketizerstreamop.md index ab57c3508..81d9249cd 100644 --- a/docs/cn/bucketizerstreamop.md +++ b/docs/cn/bucketizerstreamop.md @@ -1,18 +1,20 @@ ## 功能介绍 给定切分点,将连续变量分桶,可支持单列输入或多列输入,对应需要给出单列切分点或者多列切分点。 -每列切分点需要严格递增,且至少有三个点。 - +## 参数说明 | 名称 | 中文名称 | 描述 | 类型 | 是否必须? 
| 默认值 | | --- | --- | --- | --- | --- | --- | -| handleInvalid | 如何处理无效值 | 可以选择skip:跳过,error:报错抛异常。 | String | | "error" | -| selectedCols | 计算列对应的列名列表 | 计算列对应的列名列表 | String[] | | | -| splitsArray | 多列的切分点 | 多列的切分点 | String[] | | | +| selectedCols | 选择的列名 | 计算列对应的列名列表 | String[] | ✓ | | +| reservedCols | 算法保留列名 | 算法保留列 | String[] | | null | | outputCols | 输出结果列列名数组 | 输出结果列列名数组,可选,默认null | String[] | | null | -| reservedCols | 算法保留列名 | 算法保留列 | String[] | | null | +| handleInvalid | 未知Token处理策略 | 未知Token处理策略,"keep", "skip", "error" | String | | "keep" | +| encode | 编码方式 | 编码方式,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | |INDEX | +| dropLast | 是否删除最后一个元素 | 是否删除最后一个元素 | Boolean | | true | +| leftOpen | 是否左开右闭 | 是否左开右闭 | Boolean | | true | +| cutsArray | 多列的切分点 | 多列的切分点 | double[][] | ✓ | | ## 脚本示例 #### 脚本代码 @@ -30,10 +32,10 @@ df = pd.DataFrame({"double": data[:, 0], "bool": data[:, 1], "number": data[:, 2 inOp1 = BatchOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') inOp2 = StreamOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') -bucketizer = BucketizerBatchOp().setSelectedCols(["double"]).setSplitsArray(["-Infinity:2:Infinity"]) +bucketizer = BucketizerBatchOp().setSelectedCols(["double"]).setCutsArray([[2]]) bucketizer.linkFrom(inOp1).print() -bucketizer = BucketizerStreamOp().setSelectedCols(["double"]).setSplitsArray(["-Infinity:2:Infinity"]) +bucketizer = BucketizerStreamOp().setSelectedCols(["double"]).setCutsArray([[2]]) bucketizer.linkFrom(inOp2).print() StreamOperator.execute() diff --git a/docs/cn/onehotencoder.md b/docs/cn/onehotencoder.md index 66886cb52..1e4726417 100644 --- a/docs/cn/onehotencoder.md +++ b/docs/cn/onehotencoder.md @@ -9,66 +9,48 @@ one-hot编码,也称独热编码,对于每一个特征,如果它有m个可 -| 名称 | 中文名称 | 描述 | 类型 | 是否必须? | 默认值 | + 名称 | 中文名称 | 描述 | 类型 | 是否必须? 
| 默认值 | | --- | --- | --- | --- | --- | --- | -| dropLast | 是否删除最后一个元素 | 删除最后一个元素是为了保证线性无关性。默认true | Boolean | | true | -| ignoreNull | 受否忽略null | 忽略将不对null 编码 | Boolean | | false | +| discreteThresholdsArray | 离散个数阈值 | 离散个数阈值,每一列对应数组中一个元素 | Integer[] | | | +| discreteThresholds | 离散个数阈值 | 离散个数阈值,低于该阈值的离散样本将不会单独成一个组别 | Integer | | Integer.MIN_VALUE | | selectedCols | 选择的列名 | 计算列对应的列名列表 | String[] | ✓ | | + selectedCols | 选择的列名 | 计算列对应的列名列表 | String[] | ✓ | | | reservedCols | 算法保留列名 | 算法保留列 | String[] | | null | -| outputCol | 输出结果列列名 | 输出结果列列名,必选 | String | ✓ | | +| outputCols | 输出结果列列名数组 | 输出结果列列名数组,可选,默认null | String[] | | null | +| handleInvalid | 未知Token处理策略 | 未知Token处理策略,"keep", "skip", "error" | String | | "keep" | +| encode | 编码方式 | 编码方式,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | | "ASSEMBLED_VECTOR" | +| dropLast | 是否删除最后一个元素 | 是否删除最后一个元素 | Boolean | | true | + + ## 脚本示例 #### 运行脚本 ```python +import numpy as np +import pandas as pd data = np.array([ - ["assisbragasm", 1], - ["assiseduc", 1], - ["assist", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assistencialgsamsung", 1] + [1.1, True, "2", "A"], + [1.1, False, "2", "B"], + [1.1, True, "1", "B"], + [2.2, True, "1", "A"] ]) +df = pd.DataFrame({"double": data[:, 0], "bool": data[:, 1], "number": data[:, 2], "str": data[:, 3]}) -# load data -df = pd.DataFrame({"query": data[:, 0], "weight": data[:, 1]}) - -inOp = dataframeToOperator(df, schemaStr='query string, weight long', op_type='batch') - -# one hot train -one_hot = OneHotEncoder()\ - .setSelectedCols(["query"])\ - .setDropLast(False)\ - .setIgnoreNull(False)\ - .setOutputCol("predicted_r")\ - .setReservedCols(["weight"]) - - -model = one_hot.fit(inOp) -model.transform(inOp).print() +inOp1 = BatchOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') -# stream predict -inOp2 = dataframeToOperator(df, schemaStr='query string, weight long', op_type='stream') 
-model.transform(inOp2).print() - -StreamOperator.execute() +onehot = OneHotEncoder().setSelectedCols(["double", "bool"]).setDiscreteThresholds(2).setEncode("ASSEMBLED_VECTOR").setOutputCols(["pred"]).setDropLast(False) +onehot.fit(inOp1).transform(inOp1).collectToDataframe() ``` #### 运行结果 ```python - weight predicted_r -0 1 $6$4:1.0 -1 1 $6$3:1.0 -2 1 $6$2:1.0 -3 1 $6$3:1.0 -4 1 $6$1:1.0 -5 1 $6$3:1.0 -6 1 $6$1:1.0 -7 1 $6$0:1.0 - + double bool number str pred +0 1.1 True 2 A $6$0:1.0 3:1.0 +1 1.1 False 2 B $6$0:1.0 5:1.0 +2 1.1 True 1 B $6$0:1.0 3:1.0 +3 2.2 True 1 A $6$2:1.0 3:1.0 ``` diff --git a/docs/cn/onehotpredictbatchop.md b/docs/cn/onehotpredictbatchop.md index 5936c4f51..520edc725 100644 --- a/docs/cn/onehotpredictbatchop.md +++ b/docs/cn/onehotpredictbatchop.md @@ -13,55 +13,53 @@ one-hot编码,也称独热编码,对于每一个特征,如果它有m个可 | 名称 | 中文名称 | 描述 | 类型 | 是否必须? | 默认值 | | --- | --- | --- | --- | --- | --- | +| selectedCols | 选择的列名 | 计算列对应的列名列表 | String[] | ✓ | | | reservedCols | 算法保留列名 | 算法保留列 | String[] | | null | -| outputCol | 输出结果列列名 | 输出结果列列名,必选 | String | ✓ | | +| outputCols | 输出结果列列名数组 | 输出结果列列名数组,可选,默认null | String[] | | null | +| handleInvalid | 未知Token处理策略 | 未知Token处理策略,"keep", "skip", "error" | String | | "keep" | +| encode | 编码方式 | 编码方式,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | | "ASSEMBLED_VECTOR" | +| dropLast | 是否删除最后一个元素 | 是否删除最后一个元素 | Boolean | | true | + + + ## 脚本示例 #### 运行脚本 ```python +import numpy as np +import pandas as pd data = np.array([ - ["assisbragasm", 1], - ["assiseduc", 1], - ["assist", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assistencialgsamsung", 1] + [1.1, True, "2", "A"], + [1.1, False, "2", "B"], + [1.1, True, "1", "B"], + [2.2, True, "1", "A"] ]) - -# load data -df = pd.DataFrame({"query": data[:, 0], "weight": data[:, 1]}) - -inOp = dataframeToOperator(df, schemaStr='query string, weight long', op_type='batch') - -# one hot train -one_hot = 
OneHotTrainBatchOp().setSelectedCols(["query"]).setDropLast(False).setIgnoreNull(False) -model = inOp.link(one_hot) - -# batch predict -predictor = OneHotPredictBatchOp().setOutputCol("predicted_r").setReservedCols(["weight"]) -print(BatchOperator.collectToDataframe(predictor.linkFrom(model, inOp))) - -# stream predict -inOp2 = dataframeToOperator(df, schemaStr='query string, weight long', op_type='stream') -predictor = OneHotPredictStreamOp(model).setOutputCol("predicted_r").setReservedCols(["weight"]) -predictor.linkFrom(inOp2).print() - +df = pd.DataFrame({"double": data[:, 0], "bool": data[:, 1], "number": data[:, 2], "str": data[:, 3]}) + +inOp1 = BatchOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') +inOp2 = StreamOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') + +onehot = OneHotTrainBatchOp().setSelectedCols(["double", "bool", "number", "str"]).setDiscreteThresholds(2) +predictBatch = OneHotPredictBatchOp().setSelectedCols(["double", "bool"]).setEncode("ASSEMBLED_VECTOR").setOutputCols(["pred"]).setDropLast(False) +onehot.linkFrom(inOp1) +predictBatch.linkFrom(onehot, inOp1) +[model,predict] = collectToDataframes(onehot, predictBatch) +print(model) +print(predict) + +predictStream = OneHotPredictStreamOp(onehot).setSelectedCols(["double", "bool"]).setEncode("ASSEMBLED_VECTOR").setOutputCols(["vec"]) +predictStream.linkFrom(inOp2) +predictStream.print(refreshInterval=-1) StreamOperator.execute() ``` #### 运行结果 ```python - weight predicted_r -0 1 $6$4:1.0 -1 1 $6$3:1.0 -2 1 $6$2:1.0 -3 1 $6$3:1.0 -4 1 $6$1:1.0 -5 1 $6$3:1.0 -6 1 $6$1:1.0 -7 1 $6$0:1.0 + double bool number str pred +0 1.1 True 2 A $6$0:1.0 3:1.0 +1 1.1 False 2 B $6$0:1.0 5:1.0 +2 1.1 True 1 B $6$0:1.0 3:1.0 +3 2.2 True 1 A $6$2:1.0 3:1.0 ``` @@ -69,3 +67,7 @@ StreamOperator.execute() + + + + diff --git a/docs/cn/onehotpredictstreamop.md b/docs/cn/onehotpredictstreamop.md index ae38fb0c0..520edc725 100644 --- 
a/docs/cn/onehotpredictstreamop.md +++ b/docs/cn/onehotpredictstreamop.md @@ -1,3 +1,5 @@ + + # one-hot编码组件 ## 算法介绍 @@ -11,56 +13,53 @@ one-hot编码,也称独热编码,对于每一个特征,如果它有m个可 | 名称 | 中文名称 | 描述 | 类型 | 是否必须? | 默认值 | | --- | --- | --- | --- | --- | --- | +| selectedCols | 选择的列名 | 计算列对应的列名列表 | String[] | ✓ | | | reservedCols | 算法保留列名 | 算法保留列 | String[] | | null | -| outputCol | 输出结果列列名 | 输出结果列列名,必选 | String | ✓ | | +| outputCols | 输出结果列列名数组 | 输出结果列列名数组,可选,默认null | String[] | | null | +| handleInvalid | 未知Token处理策略 | 未知Token处理策略,"keep", "skip", "error" | String | | "keep" | +| encode | 编码方式 | 编码方式,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | | "ASSEMBLED_VECTOR" | +| dropLast | 是否删除最后一个元素 | 是否删除最后一个元素 | Boolean | | true | + + ## 脚本示例 #### 运行脚本 ```python +import numpy as np +import pandas as pd data = np.array([ - ["assisbragasm", 1], - ["assiseduc", 1], - ["assist", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assistencialgsamsung", 1] + [1.1, True, "2", "A"], + [1.1, False, "2", "B"], + [1.1, True, "1", "B"], + [2.2, True, "1", "A"] ]) - -# load data -df = pd.DataFrame({"query": data[:, 0], "weight": data[:, 1]}) - -inOp = dataframeToOperator(df, schemaStr='query string, weight long', op_type='batch') - -# one hot train -one_hot = OneHotTrainBatchOp().setSelectedCols(["query"]).setDropLast(False).setIgnoreNull(False) -model = inOp.link(one_hot) - -# batch predict -predictor = OneHotPredictBatchOp().setOutputCol("predicted_r").setReservedCols(["weight"]) -print(BatchOperator.collectToDataframe(predictor.linkFrom(model, inOp))) - -# stream predict -inOp2 = dataframeToOperator(df, schemaStr='query string, weight long', op_type='stream') -predictor = OneHotPredictStreamOp(model).setOutputCol("predicted_r").setReservedCols(["weight"]) -predictor.linkFrom(inOp2).print() - +df = pd.DataFrame({"double": data[:, 0], "bool": data[:, 1], "number": data[:, 2], "str": data[:, 3]}) + +inOp1 = BatchOperator.fromDataframe(df, 
schemaStr='double double, bool boolean, number int, str string') +inOp2 = StreamOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') + +onehot = OneHotTrainBatchOp().setSelectedCols(["double", "bool", "number", "str"]).setDiscreteThresholds(2) +predictBatch = OneHotPredictBatchOp().setSelectedCols(["double", "bool"]).setEncode("ASSEMBLED_VECTOR").setOutputCols(["pred"]).setDropLast(False) +onehot.linkFrom(inOp1) +predictBatch.linkFrom(onehot, inOp1) +[model,predict] = collectToDataframes(onehot, predictBatch) +print(model) +print(predict) + +predictStream = OneHotPredictStreamOp(onehot).setSelectedCols(["double", "bool"]).setEncode("ASSEMBLED_VECTOR").setOutputCols(["vec"]) +predictStream.linkFrom(inOp2) +predictStream.print(refreshInterval=-1) StreamOperator.execute() ``` #### 运行结果 ```python - weight predicted_r -0 1 $6$4:1.0 -1 1 $6$3:1.0 -2 1 $6$2:1.0 -3 1 $6$3:1.0 -4 1 $6$1:1.0 -5 1 $6$3:1.0 -6 1 $6$1:1.0 -7 1 $6$0:1.0 + double bool number str pred +0 1.1 True 2 A $6$0:1.0 3:1.0 +1 1.1 False 2 B $6$0:1.0 5:1.0 +2 1.1 True 1 B $6$0:1.0 3:1.0 +3 2.2 True 1 A $6$2:1.0 3:1.0 ``` @@ -68,3 +67,7 @@ StreamOperator.execute() + + + + diff --git a/docs/cn/onehottrainbatchop.md b/docs/cn/onehottrainbatchop.md index 25062fdf0..48a49a74a 100644 --- a/docs/cn/onehottrainbatchop.md +++ b/docs/cn/onehottrainbatchop.md @@ -10,57 +10,50 @@ one-hot编码,也称独热编码,对于每一个特征,如果它有m个可 | 名称 | 中文名称 | 描述 | 类型 | 是否必须? 
| 默认值 | | --- | --- | --- | --- | --- | --- | -| dropLast | 是否删除最后一个元素 | 删除最后一个元素是为了保证线性无关性。默认true | Boolean | | true | -| ignoreNull | 受否忽略null | 忽略将不对null 编码 | Boolean | | false | -| selectedCols | 选择的列名 | 计算列对应的列名列表 | String[] | ✓ | | +| discreteThresholdsArray | 离散个数阈值 | 离散个数阈值,每一列对应数组中一个元素 | Integer[] | | | +| discreteThresholds | 离散个数阈值 | 离散个数阈值,低于该阈值的离散样本将不会单独成一个组别 | Integer | | Integer.MIN_VALUE | +| selectedCols | 选择的列名 | 计算列对应的列名列表 | String[] | ✓ | | + + ## 脚本示例 #### 运行脚本 ```python +import numpy as np +import pandas as pd data = np.array([ - ["assisbragasm", 1], - ["assiseduc", 1], - ["assist", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assistencialgsamsung", 1] + [1.1, True, "2", "A"], + [1.1, False, "2", "B"], + [1.1, True, "1", "B"], + [2.2, True, "1", "A"] ]) - -# load data -df = pd.DataFrame({"query": data[:, 0], "weight": data[:, 1]}) - -inOp = dataframeToOperator(df, schemaStr='query string, weight long', op_type='batch') - -# one hot train -one_hot = OneHotTrainBatchOp().setSelectedCols(["query"]).setDropLast(False).setIgnoreNull(False) -model = inOp.link(one_hot) - -# batch predict -predictor = OneHotPredictBatchOp().setOutputCol("predicted_r").setReservedCols(["weight"]) -print(BatchOperator.collectToDataframe(predictor.linkFrom(model, inOp))) - -# stream predict -inOp2 = dataframeToOperator(df, schemaStr='query string, weight long', op_type='stream') -predictor = OneHotPredictStreamOp(model).setOutputCol("predicted_r").setReservedCols(["weight"]) -predictor.linkFrom(inOp2).print() - +df = pd.DataFrame({"double": data[:, 0], "bool": data[:, 1], "number": data[:, 2], "str": data[:, 3]}) + +inOp1 = BatchOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') +inOp2 = StreamOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') + +onehot = OneHotTrainBatchOp().setSelectedCols(["double", "bool", "number", 
"str"]).setDiscreteThresholds(2) +predictBatch = OneHotPredictBatchOp().setSelectedCols(["double", "bool"]).setEncode("ASSEMBLED_VECTOR").setOutputCols(["pred"]).setDropLast(False) +onehot.linkFrom(inOp1) +predictBatch.linkFrom(onehot, inOp1) +[model,predict] = collectToDataframes(onehot, predictBatch) +print(model) +print(predict) + +predictStream = OneHotPredictStreamOp(onehot).setSelectedCols(["double", "bool"]).setEncode("ASSEMBLED_VECTOR").setOutputCols(["vec"]) +predictStream.linkFrom(inOp2) +predictStream.print(refreshInterval=-1) StreamOperator.execute() ``` #### 运行结果 ```python - weight predicted_r -0 1 $6$4:1.0 -1 1 $6$3:1.0 -2 1 $6$2:1.0 -3 1 $6$3:1.0 -4 1 $6$1:1.0 -5 1 $6$3:1.0 -6 1 $6$1:1.0 -7 1 $6$0:1.0 + double bool number str pred +0 1.1 True 2 A $6$0:1.0 3:1.0 +1 1.1 False 2 B $6$0:1.0 5:1.0 +2 1.1 True 1 B $6$0:1.0 3:1.0 +3 2.2 True 1 A $6$2:1.0 3:1.0 ``` diff --git a/docs/cn/quantilediscretizer.md b/docs/cn/quantilediscretizer.md index ea03b150f..709e723f1 100644 --- a/docs/cn/quantilediscretizer.md +++ b/docs/cn/quantilediscretizer.md @@ -10,14 +10,21 @@ + | 名称 | 中文名称 | 描述 | 类型 | 是否必须? 
| 默认值 | | --- | --- | --- | --- | --- | --- | | selectedCols | 选择的列名 | 计算列对应的列名列表 | String[] | ✓ | | | numBuckets | quantile个数 | quantile个数,对所有列有效。 | Integer | | 2 | | numBucketsArray | quantile个数 | quantile个数,每一列对应数组中一个元素。 | Integer[] | | null | +| leftOpen | 左开右闭 | 左开右闭 | Boolean | | true | | selectedCols | 选择的列名 | 计算列对应的列名列表 | String[] | ✓ | | | reservedCols | 算法保留列名 | 算法保留列 | String[] | | null | -| outputCols | 输出结果列列名数组 | 输出结果列列名数组,可选,默认null | String[] | | null | +| outputCols | 输出结果列列名数组 | 输出结果列列名数组,可选,默认null | String[] | | null | +| handleInvalid | 未知Token处理策略 | 未知Token处理策略,"keep", "skip", "error" | String | | "keep" | +| encode | 编码方式 | 编码方式,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | |INDEX | +| dropLast | 是否删除最后一个元素 | 是否删除最后一个元素 | Boolean | | true | + + ## 脚本示例 diff --git a/docs/cn/quantilediscretizerpredictbatchop.md b/docs/cn/quantilediscretizerpredictbatchop.md index 93d5a4f91..a968051e5 100644 --- a/docs/cn/quantilediscretizerpredictbatchop.md +++ b/docs/cn/quantilediscretizerpredictbatchop.md @@ -6,11 +6,17 @@ ## 参数说明 + | 名称 | 中文名称 | 描述 | 类型 | 是否必须? | 默认值 | | --- | --- | --- | --- | --- | --- | | selectedCols | 选择的列名 | 计算列对应的列名列表 | String[] | ✓ | | | reservedCols | 算法保留列名 | 算法保留列 | String[] | | null | -| outputCols | 输出结果列列名数组 | 输出结果列列名数组,可选,默认null | String[] | | null | +| outputCols | 输出结果列列名数组 | 输出结果列列名数组,可选,默认null | String[] | | null | +| handleInvalid | 未知Token处理策略 | 未知Token处理策略,"keep", "skip", "error" | String | | "keep" | +| encode | 编码方式 | 编码方式,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | |INDEX | +| dropLast | 是否删除最后一个元素 | 是否删除最后一个元素 | Boolean | | true | + + ## 脚本示例 diff --git a/docs/cn/quantilediscretizerpredictstreamop.md b/docs/cn/quantilediscretizerpredictstreamop.md index f2edc6421..25d761f38 100644 --- a/docs/cn/quantilediscretizerpredictstreamop.md +++ b/docs/cn/quantilediscretizerpredictstreamop.md @@ -6,11 +6,17 @@ ## 参数说明 + | 名称 | 中文名称 | 描述 | 类型 | 是否必须? 
| 默认值 | | --- | --- | --- | --- | --- | --- | | selectedCols | 选择的列名 | 计算列对应的列名列表 | String[] | ✓ | | | reservedCols | 算法保留列名 | 算法保留列 | String[] | | null | -| outputCols | 输出结果列列名数组 | 输出结果列列名数组,可选,默认null | String[] | | null | +| outputCols | 输出结果列列名数组 | 输出结果列列名数组,可选,默认null | String[] | | null | +| handleInvalid | 未知Token处理策略 | 未知Token处理策略,"keep", "skip", "error" | String | | "keep" | +| encode | 编码方式 | 编码方式,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | |INDEX | +| dropLast | 是否删除最后一个元素 | 是否删除最后一个元素 | Boolean | | true | + + ## 脚本示例 diff --git a/docs/cn/quantilediscretizertrainbatchop.md b/docs/cn/quantilediscretizertrainbatchop.md index f46b03568..a50f555cd 100644 --- a/docs/cn/quantilediscretizertrainbatchop.md +++ b/docs/cn/quantilediscretizertrainbatchop.md @@ -11,6 +11,7 @@ | 名称 | 中文名称 | 描述 | 类型 | 是否必须? | 默认值 | | --- | --- | --- | --- | --- | --- | | selectedCols | 选择的列名 | 计算列对应的列名列表 | String[] | ✓ | | +| leftOpen | 左开右闭 | 左开右闭 | Boolean | | true | | numBuckets | quantile个数 | quantile个数,对所有列有效。 | Integer | | 2 | | numBucketsArray | quantile个数 | quantile个数,每一列对应数组中一个元素。 | Integer[] | | null | diff --git a/docs/en/bucketizer.md b/docs/en/bucketizer.md index a392d4425..04d8ce908 100644 --- a/docs/en/bucketizer.md +++ b/docs/en/bucketizer.md @@ -6,15 +6,15 @@ Map a continuous variable into several buckets. and splitsArray should be set, and the lengths of them should be equal. In the case of multiple columns, each column used the corresponding splits. - Split array must be strictly increasing and have at least three points. It's a string input with split points - segments with delimiter ",". - ## Parameters | Name | Description | Type | Required? 
| Default Value | | --- | --- | --- | --- | --- | -| handleInvalid | parameter for how to handle invalid data (NULL values) | String | | "error" | +| handleInvalid | Strategy to handle unseen token when doing prediction, one of "keep", "skip" or "error" | String | | "keep" | +| encode | Encode method,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | |INDEX | +| dropLast | drop last | Boolean | | true | +| leftOpen | left open | Boolean | | true | +| cutsArray | Split points array, each of them is used for the corresponding selected column. | double[][] | ✓ | | | selectedCols | Names of the columns used for processing | String[] | | | -| splitsArray | Split points array, each of them is used for the corresponding selected column. | String[] | | | | outputCols | Names of the output columns | String[] | | null | | reservedCols | Names of the columns to be retained in the output table | String[] | | null | @@ -33,7 +33,7 @@ data = np.array([ df = pd.DataFrame({"double": data[:, 0], "bool": data[:, 1], "number": data[:, 2], "str": data[:, 3]}) inOp = BatchOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') -bucketizer = Bucketizer().setSelectedCols(["double"]).setSplitsArray(["-Infinity:2:Infinity"]) +bucketizer = Bucketizer().setSelectedCols(["double"]).setCutsArray([[2]]) bucketizer.transform(inOp).print() ``` #### Results diff --git a/docs/en/bucketizerbatchop.md b/docs/en/bucketizerbatchop.md index 0d5690be3..7b4bb49f2 100644 --- a/docs/en/bucketizerbatchop.md +++ b/docs/en/bucketizerbatchop.md @@ -6,19 +6,20 @@ Map a continuous variable into several buckets. and splitsArray should be set, and the lengths of them should be equal. In the case of multiple columns, each column used the corresponding splits. - Split array must be strictly increasing and have at least three points. It's a string input with split points - segments with delimiter ",". - ## Parameters | Name | Description | Type | Required? 
| Default Value | | --- | --- | --- | --- | --- | -| handleInvalid | parameter for how to handle invalid data (NULL values) | String | | "error" | -| selectedCols | Names of the columns used for processing | String[] | | | -| splitsArray | Split points array, each of them is used for the corresponding selected column. | String[] | | | +| handleInvalid | Strategy to handle unseen token when doing prediction, one of "keep", "skip" or "error" | String | | "keep" | +| encode | Encode method,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | |INDEX | +| dropLast | drop last | Boolean | | true | +| leftOpen | left open | Boolean | | true | +| cutsArray | Split points array, each of them is used for the corresponding selected column. | double[][] | ✓ | | +| selectedCols | Names of the columns used for processing | String[] | ✓ | | | outputCols | Names of the output columns | String[] | | null | | reservedCols | Names of the columns to be retained in the output table | String[] | | null | + ## Script Example #### Code ``` @@ -35,10 +36,10 @@ df = pd.DataFrame({"double": data[:, 0], "bool": data[:, 1], "number": data[:, 2 inOp1 = BatchOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') inOp2 = StreamOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') -bucketizer = BucketizerBatchOp().setSelectedCols(["double"]).setSplitsArray(["-Infinity:2:Infinity"]) +bucketizer = BucketizerBatchOp().setSelectedCols(["double"])..setCutsArray([[2]]) bucketizer.linkFrom(inOp1).print() -bucketizer = BucketizerStreamOp().setSelectedCols(["double"]).setSplitsArray(["-Infinity:2:Infinity"]) +bucketizer = BucketizerStreamOp().setSelectedCols(["double"]).setCutsArray([[2]]) bucketizer.linkFrom(inOp2).print() StreamOperator.execute() diff --git a/docs/en/bucketizerstreamop.md b/docs/en/bucketizerstreamop.md index 6b12abb0c..9cdf0a5d8 100644 --- a/docs/en/bucketizerstreamop.md +++ b/docs/en/bucketizerstreamop.md @@ -8,13 
+8,17 @@ Map a continuous variable into several buckets. ## Parameters | Name | Description | Type | Required? | Default Value | | --- | --- | --- | --- | --- | -| handleInvalid | parameter for how to handle invalid data (NULL values) | String | | "error" | -| selectedCols | Names of the columns used for processing | String[] | | | -| splitsArray | Split points array, each of them is used for the corresponding selected column. | String[] | | | +| handleInvalid | Strategy to handle unseen token when doing prediction, one of "keep", "skip" or "error" | String | | "keep" | +| encode | Encode method,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | |INDEX | +| dropLast | drop last | Boolean | | true | +| leftOpen | left open | Boolean | | true | +| cutsArray | Split points array, each of them is used for the corresponding selected column. | double[][] | ✓ | | +| selectedCols | Names of the columns used for processing | String[] | ✓ | | | outputCols | Names of the output columns | String[] | | null | | reservedCols | Names of the columns to be retained in the output table | String[] | | null | + ## Script Example #### Code ``` diff --git a/docs/en/onehotencoder.md b/docs/en/onehotencoder.md index ecb03b780..f764b7107 100644 --- a/docs/en/onehotencoder.md +++ b/docs/en/onehotencoder.md @@ -4,63 +4,55 @@ One hot pipeline op. ## Parameters | Name | Description | Type | Required? 
| Default Value | | --- | --- | --- | --- | --- | +| discreteThresholdsArray | discrete thresholds array | Integer[] | | | +| discreteThresholds | discrete thresholds array | Integer | | Integer.MIN_VALUE | +| selectedCols | Names of the columns used for processing | String[] | | | +| handleInvalid | Strategy to handle unseen token when doing prediction, one of "keep", "skip" or "error" | String | | "keep" | +| encode | Encode method,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | |INDEX | | dropLast | drop last | Boolean | | true | -| ignoreNull | ignore null | Boolean | | false | -| selectedCols | Names of the columns used for processing | String[] | ✓ | | +| selectedCols | Names of the columns used for processing | String[] | | | +| outputCols | Names of the output columns | String[] | | null | | reservedCols | Names of the columns to be retained in the output table | String[] | | null | -| outputCol | Name of the output column | String | ✓ | | ## Script Example #### Script ```python +import numpy as np +import pandas as pd data = np.array([ - ["assisbragasm", 1], - ["assiseduc", 1], - ["assist", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assistencialgsamsung", 1] + [1.1, True, "2", "A"], + [1.1, False, "2", "B"], + [1.1, True, "1", "B"], + [2.2, True, "1", "A"] ]) - -# load data -df = pd.DataFrame({"query": data[:, 0], "weight": data[:, 1]}) - -inOp = dataframeToOperator(df, schemaStr='query string, weight long', op_type='batch') - -# one hot train -one_hot = OneHotEncoder()\ - .setSelectedCols(["query"])\ - .setDropLast(False)\ - .setIgnoreNull(False)\ - .setOutputCol("predicted_r")\ - .setReservedCols(["weight"]) - - -model = one_hot.fit(inOp) -model.transform(inOp).print() - -# stream predict -inOp2 = dataframeToOperator(df, schemaStr='query string, weight long', op_type='stream') -model.transform(inOp2).print() - +df = pd.DataFrame({"double": data[:, 0], "bool": data[:, 1], "number": data[:, 2], "str": 
data[:, 3]}) + +inOp1 = BatchOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') +inOp2 = StreamOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') + +onehot = OneHotTrainBatchOp().setSelectedCols(["double", "bool", "number", "str"]).setDiscreteThresholds(2) +predictBatch = OneHotPredictBatchOp().setSelectedCols(["double", "bool"]).setEncode("ASSEMBLED_VECTOR").setOutputCols(["pred"]).setDropLast(False) +onehot.linkFrom(inOp1) +predictBatch.linkFrom(onehot, inOp1) +[model,predict] = collectToDataframes(onehot, predictBatch) +print(model) +print(predict) + +predictStream = OneHotPredictStreamOp(onehot).setSelectedCols(["double", "bool"]).setEncode("ASSEMBLED_VECTOR").setOutputCols(["vec"]) +predictStream.linkFrom(inOp2) +predictStream.print(refreshInterval=-1) StreamOperator.execute() ``` #### Result ```python - weight predicted_r -0 1 $6$4:1.0 -1 1 $6$3:1.0 -2 1 $6$2:1.0 -3 1 $6$3:1.0 -4 1 $6$1:1.0 -5 1 $6$3:1.0 -6 1 $6$1:1.0 -7 1 $6$0:1.0 + double bool number str pred +0 1.1 True 2 A $6$0:1.0 3:1.0 +1 1.1 False 2 B $6$0:1.0 5:1.0 +2 1.1 True 1 B $6$0:1.0 3:1.0 +3 2.2 True 1 A $6$2:1.0 3:1.0 ``` diff --git a/docs/en/onehotpredictbatchop.md b/docs/en/onehotpredictbatchop.md index 9b4177045..9b848fbcd 100644 --- a/docs/en/onehotpredictbatchop.md +++ b/docs/en/onehotpredictbatchop.md @@ -5,61 +5,50 @@ One-hot batch operator maps a serial of columns of category indices to a column ## Parameters | Name | Description | Type | Required? 
| Default Value | | --- | --- | --- | --- | --- | +| handleInvalid | Strategy to handle unseen token when doing prediction, one of "keep", "skip" or "error" | String | | "keep" | +| encode | Encode method,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | |INDEX | +| dropLast | drop last | Boolean | | true | +| selectedCols | Names of the columns used for processing | String[] | | | +| outputCols | Names of the output columns | String[] | | null | | reservedCols | Names of the columns to be retained in the output table | String[] | | null | -| outputCol | Name of the output column | String | ✓ | | ## Script Example #### Script ```python +import numpy as np +import pandas as pd data = np.array([ - ["assisbragasm", 1], - ["assiseduc", 1], - ["assist", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assistencialgsamsung", 1] + [1.1, True, "2", "A"], + [1.1, False, "2", "B"], + [1.1, True, "1", "B"], + [2.2, True, "1", "A"] ]) - -# load data -df = pd.DataFrame({"query": data[:, 0], "weight": data[:, 1]}) - -inOp = dataframeToOperator(df, schemaStr='query string, weight long', op_type='batch') - -# one hot train -one_hot = OneHotTrainBatchOp().setSelectedCols(["query"]).setDropLast(False).setIgnoreNull(False) -model = inOp.link(one_hot) - -# batch predict -predictor = OneHotPredictBatchOp().setOutputCol("predicted_r").setReservedCols(["weight"]) -print(BatchOperator.collectToDataframe(predictor.linkFrom(model, inOp))) - -# stream predict -inOp2 = dataframeToOperator(df, schemaStr='query string, weight long', op_type='stream') -predictor = OneHotPredictStreamOp(model).setOutputCol("predicted_r").setReservedCols(["weight"]) -predictor.linkFrom(inOp2).print() - +df = pd.DataFrame({"double": data[:, 0], "bool": data[:, 1], "number": data[:, 2], "str": data[:, 3]}) + +inOp1 = BatchOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') +inOp2 = StreamOperator.fromDataframe(df, schemaStr='double 
double, bool boolean, number int, str string') + +onehot = OneHotTrainBatchOp().setSelectedCols(["double", "bool", "number", "str"]).setDiscreteThresholds(2) +predictBatch = OneHotPredictBatchOp().setSelectedCols(["double", "bool"]).setEncode("ASSEMBLED_VECTOR").setOutputCols(["pred"]).setDropLast(False) +onehot.linkFrom(inOp1) +predictBatch.linkFrom(onehot, inOp1) +[model,predict] = collectToDataframes(onehot, predictBatch) +print(model) +print(predict) + +predictStream = OneHotPredictStreamOp(onehot).setSelectedCols(["double", "bool"]).setEncode("ASSEMBLED_VECTOR").setOutputCols(["vec"]) +predictStream.linkFrom(inOp2) +predictStream.print(refreshInterval=-1) StreamOperator.execute() ``` #### Result ```python - weight predicted_r -0 1 $6$4:1.0 -1 1 $6$3:1.0 -2 1 $6$2:1.0 -3 1 $6$3:1.0 -4 1 $6$1:1.0 -5 1 $6$3:1.0 -6 1 $6$1:1.0 -7 1 $6$0:1.0 + double bool number str pred +0 1.1 True 2 A $6$0:1.0 3:1.0 +1 1.1 False 2 B $6$0:1.0 5:1.0 +2 1.1 True 1 B $6$0:1.0 3:1.0 +3 2.2 True 1 A $6$2:1.0 3:1.0 ``` - - - - - - diff --git a/docs/en/onehotpredictstreamop.md b/docs/en/onehotpredictstreamop.md index 948bf1f05..6041127d5 100644 --- a/docs/en/onehotpredictstreamop.md +++ b/docs/en/onehotpredictstreamop.md @@ -6,56 +6,50 @@ ## Parameters | Name | Description | Type | Required? 
| Default Value | | --- | --- | --- | --- | --- | +| handleInvalid | Strategy to handle unseen token when doing prediction, one of "keep", "skip" or "error" | String | | "keep" | +| encode | Encode method,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | |INDEX | +| dropLast | drop last | Boolean | | true | +| selectedCols | Names of the columns used for processing | String[] | | | +| outputCols | Names of the output columns | String[] | | null | | reservedCols | Names of the columns to be retained in the output table | String[] | | null | -| outputCol | Name of the output column | String | ✓ | | - ## Script Example #### Script ```python +import numpy as np +import pandas as pd data = np.array([ - ["assisbragasm", 1], - ["assiseduc", 1], - ["assist", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assistencialgsamsung", 1] + [1.1, True, "2", "A"], + [1.1, False, "2", "B"], + [1.1, True, "1", "B"], + [2.2, True, "1", "A"] ]) - -# load data -df = pd.DataFrame({"query": data[:, 0], "weight": data[:, 1]}) - -inOp = dataframeToOperator(df, schemaStr='query string, weight long', op_type='batch') - -# one hot train -one_hot = OneHotTrainBatchOp().setSelectedCols(["query"]).setDropLast(False).setIgnoreNull(False) -model = inOp.link(one_hot) - -# batch predict -predictor = OneHotPredictBatchOp().setOutputCol("predicted_r").setReservedCols(["weight"]) -print(BatchOperator.collectToDataframe(predictor.linkFrom(model, inOp))) - -# stream predict -inOp2 = dataframeToOperator(df, schemaStr='query string, weight long', op_type='stream') -predictor = OneHotPredictStreamOp(model).setOutputCol("predicted_r").setReservedCols(["weight"]) -predictor.linkFrom(inOp2).print() - +df = pd.DataFrame({"double": data[:, 0], "bool": data[:, 1], "number": data[:, 2], "str": data[:, 3]}) + +inOp1 = BatchOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') +inOp2 = StreamOperator.fromDataframe(df, 
schemaStr='double double, bool boolean, number int, str string') + +onehot = OneHotTrainBatchOp().setSelectedCols(["double", "bool", "number", "str"]).setDiscreteThresholds(2) +predictBatch = OneHotPredictBatchOp().setSelectedCols(["double", "bool"]).setEncode("ASSEMBLED_VECTOR").setOutputCols(["pred"]).setDropLast(False) +onehot.linkFrom(inOp1) +predictBatch.linkFrom(onehot, inOp1) +[model,predict] = collectToDataframes(onehot, predictBatch) +print(model) +print(predict) + +predictStream = OneHotPredictStreamOp(onehot).setSelectedCols(["double", "bool"]).setEncode("ASSEMBLED_VECTOR").setOutputCols(["vec"]) +predictStream.linkFrom(inOp2) +predictStream.print(refreshInterval=-1) StreamOperator.execute() ``` #### Result ```python - weight predicted_r -0 1 $6$4:1.0 -1 1 $6$3:1.0 -2 1 $6$2:1.0 -3 1 $6$3:1.0 -4 1 $6$1:1.0 -5 1 $6$3:1.0 -6 1 $6$1:1.0 -7 1 $6$0:1.0 + double bool number str pred +0 1.1 True 2 A $6$0:1.0 3:1.0 +1 1.1 False 2 B $6$0:1.0 5:1.0 +2 1.1 True 1 B $6$0:1.0 3:1.0 +3 2.2 True 1 A $6$2:1.0 3:1.0 ``` diff --git a/docs/en/onehottrainbatchop.md b/docs/en/onehottrainbatchop.md index 8216d5a1f..42345e273 100644 --- a/docs/en/onehottrainbatchop.md +++ b/docs/en/onehottrainbatchop.md @@ -6,57 +6,47 @@ One-hot maps a serial of columns of category indices to a column of ## Parameters | Name | Description | Type | Required? 
| Default Value | | --- | --- | --- | --- | --- | -| dropLast | drop last | Boolean | | true | -| ignoreNull | ignore null | Boolean | | false | -| selectedCols | Names of the columns used for processing | String[] | ✓ | | - +| discreteThresholdsArray | discrete thresholds array | Integer[] | | | +| discreteThresholds | discrete thresholds array | Integer | | Integer.MIN_VALUE | +| selectedCols | Names of the columns used for processing | String[] | | | ## Script Example #### Script ```python +import numpy as np +import pandas as pd data = np.array([ - ["assisbragasm", 1], - ["assiseduc", 1], - ["assist", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assiseduc", 1], - ["assistebrasil", 1], - ["assistencialgsamsung", 1] + [1.1, True, "2", "A"], + [1.1, False, "2", "B"], + [1.1, True, "1", "B"], + [2.2, True, "1", "A"] ]) - -# load data -df = pd.DataFrame({"query": data[:, 0], "weight": data[:, 1]}) - -inOp = dataframeToOperator(df, schemaStr='query string, weight long', op_type='batch') - -# one hot train -one_hot = OneHotTrainBatchOp().setSelectedCols(["query"]).setDropLast(False).setIgnoreNull(False) -model = inOp.link(one_hot) - -# batch predict -predictor = OneHotPredictBatchOp().setOutputCol("predicted_r").setReservedCols(["weight"]) -print(BatchOperator.collectToDataframe(predictor.linkFrom(model, inOp))) - -# stream predict -inOp2 = dataframeToOperator(df, schemaStr='query string, weight long', op_type='stream') -predictor = OneHotPredictStreamOp(model).setOutputCol("predicted_r").setReservedCols(["weight"]) -predictor.linkFrom(inOp2).print() - +df = pd.DataFrame({"double": data[:, 0], "bool": data[:, 1], "number": data[:, 2], "str": data[:, 3]}) + +inOp1 = BatchOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') +inOp2 = StreamOperator.fromDataframe(df, schemaStr='double double, bool boolean, number int, str string') + +onehot = OneHotTrainBatchOp().setSelectedCols(["double", "bool", "number", 
"str"]).setDiscreteThresholds(2) +predictBatch = OneHotPredictBatchOp().setSelectedCols(["double", "bool"]).setEncode("ASSEMBLED_VECTOR").setOutputCols(["pred"]).setDropLast(False) +onehot.linkFrom(inOp1) +predictBatch.linkFrom(onehot, inOp1) +[model,predict] = collectToDataframes(onehot, predictBatch) +print(model) +print(predict) + +predictStream = OneHotPredictStreamOp(onehot).setSelectedCols(["double", "bool"]).setEncode("ASSEMBLED_VECTOR").setOutputCols(["vec"]) +predictStream.linkFrom(inOp2) +predictStream.print(refreshInterval=-1) StreamOperator.execute() ``` #### Result ```python - weight predicted_r -0 1 $6$4:1.0 -1 1 $6$3:1.0 -2 1 $6$2:1.0 -3 1 $6$3:1.0 -4 1 $6$1:1.0 -5 1 $6$3:1.0 -6 1 $6$1:1.0 -7 1 $6$0:1.0 + double bool number str pred +0 1.1 True 2 A $6$0:1.0 3:1.0 +1 1.1 False 2 B $6$0:1.0 5:1.0 +2 1.1 True 1 B $6$0:1.0 3:1.0 +3 2.2 True 1 A $6$2:1.0 3:1.0 ``` diff --git a/docs/en/quantilediscretizer.md b/docs/en/quantilediscretizer.md index b972a5d4a..d422e0045 100644 --- a/docs/en/quantilediscretizer.md +++ b/docs/en/quantilediscretizer.md @@ -9,10 +9,13 @@ Quantile discretizer calculate the q-quantile as the interval, output the interv | selectedCols | Names of the columns used for processing | String[] | ✓ | | | numBuckets | number of buckets | Integer | | 2 | | numBucketsArray | Array of num bucket | Integer[] | | null | +| leftOpen | left open | Boolean | | true | | selectedCols | Names of the columns used for processing | String[] | ✓ | | | reservedCols | Names of the columns to be retained in the output table | String[] | | null | | outputCols | Names of the output columns | String[] | | null | - +| handleInvalid | Strategy to handle unseen token when doing prediction, one of "keep", "skip" or "error" | String | | "keep" | +| encode | Encode method,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | |INDEX | +| dropLast | drop last | Boolean | | true | ## Script Example diff --git a/docs/en/quantilediscretizerpredictbatchop.md 
b/docs/en/quantilediscretizerpredictbatchop.md index 21db72570..0999d200d 100644 --- a/docs/en/quantilediscretizerpredictbatchop.md +++ b/docs/en/quantilediscretizerpredictbatchop.md @@ -7,6 +7,9 @@ The batch operator that predict the data using the quantile discretizer model. | selectedCols | Names of the columns used for processing | String[] | ✓ | | | reservedCols | Names of the columns to be retained in the output table | String[] | | null | | outputCols | Names of the output columns | String[] | | null | +| handleInvalid | Strategy to handle unseen token when doing prediction, one of "keep", "skip" or "error" | String | | "keep" | +| encode | Encode method,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | |INDEX | +| dropLast | drop last | Boolean | | true | ## Script Example diff --git a/docs/en/quantilediscretizerpredictstreamop.md b/docs/en/quantilediscretizerpredictstreamop.md index d0f75aff6..13efed22a 100644 --- a/docs/en/quantilediscretizerpredictstreamop.md +++ b/docs/en/quantilediscretizerpredictstreamop.md @@ -7,7 +7,9 @@ The stream operator that predict the data using the quantile discretizer model. | selectedCols | Names of the columns used for processing | String[] | ✓ | | | reservedCols | Names of the columns to be retained in the output table | String[] | | null | | outputCols | Names of the output columns | String[] | | null | - +| handleInvalid | Strategy to handle unseen token when doing prediction, one of "keep", "skip" or "error" | String | | "keep" | +| encode | Encode method,"INDEX", "VECTOR", "ASSEMBLED_VECTOR" | String | |INDEX | +| dropLast | drop last | Boolean | | true | ## Script Example diff --git a/docs/en/quantilediscretizertrainbatchop.md b/docs/en/quantilediscretizertrainbatchop.md index 31b07edfe..1b5399b98 100644 --- a/docs/en/quantilediscretizertrainbatchop.md +++ b/docs/en/quantilediscretizertrainbatchop.md @@ -7,6 +7,7 @@ Fit a quantile discretizer model. 
| selectedCols | Names of the columns used for processing | String[] | ✓ | | | numBuckets | number of buckets | Integer | | 2 | | numBucketsArray | Array of num bucket | Integer[] | | null | +| leftOpen | left open | Boolean | | true | ## Script Example diff --git a/pyalink/adult.ipynb b/pyalink/adult.ipynb index a70a6e786..6736fa151 100644 --- a/pyalink/adult.ipynb +++ b/pyalink/adult.ipynb @@ -12,20 +12,18 @@ "output_type": "stream", "text": [ "\n", - "Use one of the following command to start using pyalink:\n", - "使用以下一条命令来开始使用 pyalink:\n", + "Use one of the following commands to start using PyAlink:\n", " - useLocalEnv(parallelism, flinkHome=None, config=None)\n", " - useRemoteEnv(host, port, parallelism, flinkHome=None, localIp=\"localhost\", config=None)\n", "Call resetEnv() to reset environment and switch to another.\n", - "使用 resetEnv() 来重置运行环境,并切换到另一个。\n", "\n", - "JVM listening on 127.0.0.1:57247\n" + "JVM listening on 127.0.0.1:64158\n" ] }, { "data": { "text/plain": [ - "JavaObject id=o6" + "MLEnv(benv=, btenv=, senv=, stenv=)" ] }, "execution_count": 1, @@ -84,7 +82,7 @@ "numerialColNames = [\"age\", \"fnlwgt\", \"education_num\", \"capital_gain\",\n", " \"capital_loss\", \"hours_per_week\"]\n", "onehot = OneHotEncoder().setSelectedCols(categoricalColNames) \\\n", - " .setOutputCol(\"output\").setReservedCols(numerialColNames + [\"label\"])\n", + " .setOutputCols([\"output\"]).setReservedCols(numerialColNames + [\"label\"])\n", "assembler = VectorAssembler().setSelectedCols([\"output\"] + numerialColNames) \\\n", " .setOutputCol(\"vec\").setReservedCols([\"label\"])\n", "pipeline = Pipeline().add(onehot).add(assembler)" @@ -129,20 +127,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "AUC: 0.9071346253140332\n", - "KS: 0.6508855101121852\n", - "PRC: 0.7654668375809972\n", - "Precision: 0.7311696264543784\n", - "Recall: 0.609105981379926\n", - "F1: 0.6645794197453558\n", - "ConfusionMatrix: [[4776, 1756], [3065, 22964]]\n", + "AUC: 
0.9066240193960077\n", + "KS: 0.6495268264606959\n", + "PRC: 0.7662328278289783\n", + "Precision: 0.733230531996916\n", + "Recall: 0.6064277515623008\n", + "F1: 0.6638280050258272\n", + "ConfusionMatrix: [[4755, 1730], [3086, 22990]]\n", "LabelArray: ['>50K', '<=50K']\n", - "LogLoss: 0.31880016560096547\n", + "LogLoss: 0.3192012545654014\n", "TotalSamples: 32561\n", "ActualLabelProportion: [0.2408095574460244, 0.7591904425539756]\n", "ActualLabelFrequency: [7841, 24720]\n", - "Accuracy: 0.8519394367494856\n", - "Kappa: 0.5705912048680206\n" + "Accuracy: 0.8520929946868954\n", + "Kappa: 0.5701036372627706\n" ] } ], @@ -180,7 +178,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.7.0" } }, "nbformat": 4,