diff --git a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/PipelineBuilder.scala b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/PipelineBuilder.scala
new file mode 100644
index 000000000..8bddd7240
--- /dev/null
+++ b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/PipelineBuilder.scala
@@ -0,0 +1,30 @@
+/*
+ * Tencent is pleased to support the open source community by making Angel available.
+ *
+ * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/Apache-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ */
+
+
+package com.tencent.angel.spark.ml.automl.feature
+
+import org.apache.spark.ml.PipelineStage
+import org.apache.spark.ml.Transformer
+
+/**
+ * Entry point for pipeline-level helpers of the automl feature package.
+ *
+ * At present it only exposes [[declareFields]], which is an empty stub.
+ */
+object PipelineBuilder {
+
+  /**
+   * Stub: accepts the stages of a pipeline and currently performs no work.
+   *
+   * @param pipeline the pipeline stages; not read by the current implementation
+   */
+  def declareFields(pipeline: Array[PipelineStage]): Unit = ()
+
+}
diff --git a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/TransformerWrapper.scala b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/TransformerWrapper.scala
new file mode 100644
index 000000000..a3d09ad26
--- /dev/null
+++ b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/TransformerWrapper.scala
@@ -0,0 +1,56 @@
+/*
+ * Tencent is pleased to support the open source community by making Angel available.
+ *
+ * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/Apache-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ */
+
+
+package com.tencent.angel.spark.ml.automl.feature
+
+import org.apache.spark.ml.Transformer
+
+import scala.collection.mutable.ArrayBuffer
+
+/**
+ * Pairs a Spark ML [[org.apache.spark.ml.Transformer]] with bookkeeping about a parent
+ * transformer and about input/output column names.
+ *
+ * Every field is abstract: concrete wrappers supply the transformer, the required column
+ * arrays and the mutable column buffers. This class only provides accessors and mutators
+ * on top of those fields.
+ */
+abstract class TransformerWrapper {
+
+  /** The wrapped transformer instance. */
+  val transformer: Transformer
+
+  /** Transformer recorded through [[setParent]]; subclasses decide its initial value. */
+  var parentTransformer: Transformer
+
+  // NOTE(review): presumably the column names the wrapped transformer needs/produces;
+  // nothing in this class reads them - confirm against the pipeline builder.
+  val requiredInputCols: Array[String]
+  val requiredOutputCols: Array[String]
+
+  /** Mutable buffer appended to by [[addInputCol]]. */
+  val inputCols: ArrayBuffer[String]
+
+  /** Mutable buffer appended to by [[addOutputCol]]. */
+  val outputCols: ArrayBuffer[String]
+
+  /** Columns associated with the parent; exposed through [[getParentCols]]. */
+  var parentCols: Array[String]
+
+  /** @return the wrapped transformer */
+  def getTransformer: Transformer = transformer
+
+  /**
+   * Records `parent` as this wrapper's parent transformer.
+   *
+   * The result type is spelled out so the public signature does not depend on inference.
+   */
+  def setParent(parent: Transformer): Unit = {
+    parentTransformer = parent
+  }
+
+  def hasInputCol: Boolean
+
+  def hasOutputCol: Boolean
+
+  /** @return a snapshot array of the input columns added so far */
+  def getInputCols: Array[String] = inputCols.toArray
+
+  /** @return a snapshot array of the output columns added so far */
+  def getOutputCols: Array[String] = outputCols.toArray
+
+  /** Appends `col` to the input column buffer. */
+  def addInputCol(col: String): Unit = inputCols += col
+
+  /** Appends `col` to the output column buffer. */
+  def addOutputCol(col: String): Unit = outputCols += col
+
+  /** @return the current value of `parentCols` */
+  def getParentCols: Array[String] = parentCols
+
+  /**
+   * Despite its name this method does not set anything: it returns `parentCols`.
+   * It is kept unchanged for source compatibility; new code should call [[getParentCols]].
+   */
+  def setParentCols: Array[String] = getParentCols
+
+}
diff --git a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/SamplerWrapper.scala b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/SamplerWrapper.scala
new file mode 100644
index 000000000..016a807f0
--- /dev/null
+++ b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/SamplerWrapper.scala
@@ -0,0 +1,43 @@
+/*
+ * Tencent is pleased to support the open source community by making Angel available.
+ *
+ * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/Apache-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ */
+
+
+package com.tencent.angel.spark.ml.automl.feature.preprocess
+
+import com.tencent.angel.spark.ml.automl.feature.TransformerWrapper
+import org.apache.spark.ml.Transformer
+
+import scala.collection.mutable.ArrayBuffer
+
+/**
+ * [[TransformerWrapper]] around a `Sampler` built from the given fraction.
+ *
+ * @param fraction value handed unchanged to the `Sampler` constructor
+ */
+class SamplerWrapper(fraction: Double) extends TransformerWrapper {
+
+  // Wrapped transformer and its (initially unset) parent.
+  override val transformer: Transformer = new Sampler(fraction)
+  override var parentTransformer: Transformer = _
+
+  // Column buffers start out empty and are filled through addInputCol / addOutputCol.
+  override val inputCols: ArrayBuffer[String] = ArrayBuffer.empty[String]
+  override val outputCols: ArrayBuffer[String] = ArrayBuffer.empty[String]
+
+  // This wrapper declares no required columns; both arrays are null, not empty.
+  override val requiredInputCols: Array[String] = null
+  override val requiredOutputCols: Array[String] = null
+
+  override var parentCols: Array[String] = _
+
+  /** Always `true` for this wrapper. */
+  override def hasInputCol: Boolean = true
+
+  /** Always `false` for this wrapper. */
+  override def hasOutputCol: Boolean = false
+
+}
diff --git a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/StopWordsRemoverWrapper.scala b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/StopWordsRemoverWrapper.scala
new file mode 100644
index 000000000..19fecc442
--- /dev/null
+++ b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/StopWordsRemoverWrapper.scala
@@ -0,0 +1,42 @@
+/*
+ * Tencent is pleased to support the open source community by
making Angel available.
+ *
+ * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/Apache-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ */
+
+package com.tencent.angel.spark.ml.automl.feature.preprocess
+
+import com.tencent.angel.spark.ml.automl.feature.TransformerWrapper
+import org.apache.spark.ml.Transformer
+
+import scala.collection.mutable.ArrayBuffer
+import org.apache.spark.ml.feature.StopWordsRemover
+
+/**
+ * [[TransformerWrapper]] around a default-constructed Spark `StopWordsRemover`.
+ *
+ * Declares "words" as its required input column and "filteredwords" as its required
+ * output column. The wrapped transformer itself is not configured here.
+ */
+class StopWordsRemoverWrapper extends TransformerWrapper {
+
+  override val transformer: Transformer = new StopWordsRemover()
+  override var parentTransformer: Transformer = _
+
+  override val requiredInputCols: Array[String] = Array("words")
+  override val requiredOutputCols: Array[String] = Array("filteredwords")
+
+  // A `val` cannot use the `_` default initializer (only `var` may), and a null buffer
+  // would make addInputCol / getInputCols throw. Start with empty buffers instead.
+  override val inputCols: ArrayBuffer[String] = new ArrayBuffer[String]()
+  override val outputCols: ArrayBuffer[String] = new ArrayBuffer[String]()
+
+  override var parentCols: Array[String] = _
+
+  /** Always `true` for this wrapper. */
+  override def hasInputCol: Boolean = true
+
+  /** Always `true` for this wrapper. */
+  override def hasOutputCol: Boolean = true
+}
diff --git a/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/TokenizerWrapper.scala b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/TokenizerWrapper.scala
new file mode 100644
index 000000000..63357e382
--- /dev/null
+++ b/spark-on-angel/mllib/src/main/scala/com/tencent/angel/spark/ml/automl/feature/preprocess/TokenizerWrapper.scala
@@ -0,0 +1,43 @@
+/*
+ * Tencent is pleased to support the
open source community by making Angel available.
+ *
+ * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/Apache-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ */
+
+
+package com.tencent.angel.spark.ml.automl.feature.preprocess
+
+import com.tencent.angel.spark.ml.automl.feature.TransformerWrapper
+import org.apache.spark.ml.Transformer
+
+import scala.collection.mutable.ArrayBuffer
+import org.apache.spark.ml.feature.Tokenizer
+
+/**
+ * [[TransformerWrapper]] around a default-constructed Spark `Tokenizer`.
+ *
+ * Declares "sentence" as its required input column and "words" as its required output
+ * column. The wrapped transformer itself is not configured here.
+ */
+class TokenizerWrapper extends TransformerWrapper {
+
+  override val transformer: Transformer = new Tokenizer()
+  override var parentTransformer: Transformer = _
+
+  override val requiredInputCols: Array[String] = Array("sentence")
+  override val requiredOutputCols: Array[String] = Array("words")
+
+  // A `val` cannot use the `_` default initializer (only `var` may), and a null buffer
+  // would make addInputCol / getInputCols throw. Start with empty buffers instead.
+  override val inputCols: ArrayBuffer[String] = new ArrayBuffer[String]()
+  override val outputCols: ArrayBuffer[String] = new ArrayBuffer[String]()
+
+  override var parentCols: Array[String] = _
+
+  /** Always `true` for this wrapper. */
+  override def hasInputCol: Boolean = true
+
+  /** Always `true` for this wrapper. */
+  override def hasOutputCol: Boolean = true
+}