From 0a38e822dba836996198410339bc1b88977b41ca Mon Sep 17 00:00:00 2001 From: bluesjjw Date: Fri, 14 Dec 2018 10:36:29 +0800 Subject: [PATCH 1/2] wrapper of spark.ml transformer --- .../ml/automl/feature/PipelineBuilder.scala | 30 ++++++++++ .../automl/feature/TransformerWrapper.scala | 56 +++++++++++++++++++ .../feature/preprocess/SamplerWrapper.scala | 41 ++++++++++++++ .../preprocess/StopWordsRemoverWrapper.scala | 39 +++++++++++++ .../feature/preprocess/TokenizerWrapper.scala | 40 +++++++++++++ 5 files changed, 206 insertions(+) create mode 100644 spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/PipelineBuilder.scala create mode 100644 spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/TransformerWrapper.scala create mode 100644 spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/SamplerWrapper.scala create mode 100644 spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/StopWordsRemoverWrapper.scala create mode 100644 spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/TokenizerWrapper.scala diff --git a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/PipelineBuilder.scala b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/PipelineBuilder.scala new file mode 100644 index 000000000..8bddd7240 --- /dev/null +++ b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/PipelineBuilder.scala @@ -0,0 +1,30 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.ml.automl.feature + +import org.apache.spark.ml.PipelineStage +import org.apache.spark.ml.Transformer + +object PipelineBuilder { + + def declareFields(pipeline: Array[PipelineStage]): Unit = { + + } + +} diff --git a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/TransformerWrapper.scala b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/TransformerWrapper.scala new file mode 100644 index 000000000..a3d09ad26 --- /dev/null +++ b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/TransformerWrapper.scala @@ -0,0 +1,56 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.ml.automl.feature + +import org.apache.spark.ml.Transformer + +import scala.collection.mutable.ArrayBuffer + +abstract class TransformerWrapper { + + val transformer: Transformer + var parentTransformer: Transformer + + val requiredInputCols: Array[String] + val requiredOutputCols: Array[String] + + val inputCols: ArrayBuffer[String] + val outputCols: ArrayBuffer[String] + + var parentCols: Array[String] + + def getTransformer: Transformer = transformer + + def setParent(parent: Transformer) = parentTransformer = parent + + def hasInputCol: Boolean + + def hasOutputCol: Boolean + + def getInputCols: Array[String] = inputCols.toArray + + def getOutputCols: Array[String] = outputCols.toArray + + def addInputCol(col: String): Unit = inputCols += col + + def addOutputCol(col: String): Unit = outputCols += col + + def setParentCols: Array[String] = parentCols + +} diff --git a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/SamplerWrapper.scala b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/SamplerWrapper.scala new file mode 100644 index 000000000..ab9d139ef --- /dev/null +++ b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/SamplerWrapper.scala @@ -0,0 +1,41 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.ml.automl.feature.preprocess + +import com.tencent.angel.spark.ml.automl.feature.TransformerWrapper +import org.apache.spark.ml.Transformer + +import scala.collection.mutable.ArrayBuffer + +class SamplerWrapper(fraction: Double) extends TransformerWrapper { + + override val transformer: Transformer = new Sampler(fraction) + + override var parentTransformer: Transformer = _ + override var parentCols: Array[String] = _ + override val requiredInputCols: Array[String] = null + override val requiredOutputCols: Array[String] = null + override val inputCols: ArrayBuffer[String] = new ArrayBuffer[String]() + override val outputCols: ArrayBuffer[String] = new ArrayBuffer[String]() + + override def hasInputCol: Boolean = true + + override def hasOutputCol: Boolean = false + +} diff --git a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/StopWordsRemoverWrapper.scala b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/StopWordsRemoverWrapper.scala new file mode 100644 index 000000000..ce98b1933 --- /dev/null +++ b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/StopWordsRemoverWrapper.scala @@ -0,0 +1,39 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + +package com.tencent.angel.spark.ml.automl.feature.preprocess + +import com.tencent.angel.spark.ml.automl.feature.TransformerWrapper +import org.apache.spark.ml.Transformer + +import scala.collection.mutable.ArrayBuffer +import org.apache.spark.ml.feature.StopWordsRemover + +class StopWordsRemoverWrapper extends TransformerWrapper { + + override val transformer: Transformer = new StopWordsRemover() + override var parentTransformer: Transformer = _ + override val requiredInputCols: Array[String] = Array("words") + override val requiredOutputCols: Array[String] = Array("filteredwords") + override val inputCols: ArrayBuffer[String] = _ + override val outputCols: ArrayBuffer[String] = _ + override var parentCols: Array[String] = _ + + override def hasInputCol: Boolean = true + + override def hasOutputCol: Boolean = true +} diff --git a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/TokenizerWrapper.scala b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/TokenizerWrapper.scala new file mode 100644 index 000000000..c7bbbcd3f --- /dev/null +++ b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/TokenizerWrapper.scala @@ -0,0 +1,40 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.ml.automl.feature.preprocess + +import com.tencent.angel.spark.ml.automl.feature.TransformerWrapper +import org.apache.spark.ml.Transformer + +import scala.collection.mutable.ArrayBuffer +import org.apache.spark.ml.feature.Tokenizer + +class TokenizerWrapper extends TransformerWrapper { + + override val transformer: Transformer = new Tokenizer() + override var parentTransformer: Transformer = _ + override val requiredInputCols: Array[String] = Array("sentence") + override val requiredOutputCols: Array[String] = Array("words") + override val inputCols: ArrayBuffer[String] = _ + override val outputCols: ArrayBuffer[String] = _ + override var parentCols: Array[String] = _ + + override def hasInputCol: Boolean = true + + override def hasOutputCol: Boolean = true +} From d194a946cfc2a79afae514640b104ed12c01a9a1 Mon Sep 17 00:00:00 2001 From: bluesjjw Date: Fri, 14 Dec 2018 10:39:43 +0800 Subject: [PATCH 2/2] code format --- .../spark/ml/automl/feature/preprocess/SamplerWrapper.scala | 6 ++++-- .../automl/feature/preprocess/StopWordsRemoverWrapper.scala | 3 +++ .../ml/automl/feature/preprocess/TokenizerWrapper.scala | 3 +++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/SamplerWrapper.scala b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/SamplerWrapper.scala index ab9d139ef..016a807f0 100644 --- a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/SamplerWrapper.scala +++ b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/SamplerWrapper.scala @@ -26,14 +26,16 @@ import scala.collection.mutable.ArrayBuffer class SamplerWrapper(fraction: Double) extends TransformerWrapper { override val transformer: Transformer = new Sampler(fraction) - override var parentTransformer: Transformer = _ - override var parentCols: Array[String] = _ + override val requiredInputCols: Array[String] = null override val requiredOutputCols: Array[String] = null + override val inputCols: ArrayBuffer[String] = new ArrayBuffer[String]() override val outputCols: ArrayBuffer[String] = new ArrayBuffer[String]() + override var parentCols: Array[String] = _ + override def hasInputCol: Boolean = true override def hasOutputCol: Boolean = false diff --git a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/StopWordsRemoverWrapper.scala b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/StopWordsRemoverWrapper.scala index ce98b1933..19fecc442 100644 --- a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/StopWordsRemoverWrapper.scala +++ b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/StopWordsRemoverWrapper.scala @@ -27,10 +27,13 @@ class StopWordsRemoverWrapper extends TransformerWrapper { override val transformer: Transformer = new StopWordsRemover() override var parentTransformer: Transformer = _ + override val requiredInputCols: Array[String] = Array("words") override val requiredOutputCols: Array[String] = Array("filteredwords") + override val inputCols: ArrayBuffer[String] = _ override val outputCols: ArrayBuffer[String] = _ + override var parentCols: Array[String] = _ override def hasInputCol: Boolean = true diff --git a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/TokenizerWrapper.scala b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/TokenizerWrapper.scala index c7bbbcd3f..63357e382 100644 --- a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/TokenizerWrapper.scala +++ b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/TokenizerWrapper.scala @@ -28,10 +28,13 @@ class TokenizerWrapper extends TransformerWrapper { override val transformer: Transformer = new Tokenizer() override var parentTransformer: Transformer = _ + override val requiredInputCols: Array[String] = Array("sentence") override val requiredOutputCols: Array[String] = Array("words") + override val inputCols: ArrayBuffer[String] = _ override val outputCols: ArrayBuffer[String] = _ + override var parentCols: Array[String] = _ override def hasInputCol: Boolean = true