move feature size validating tests into a separate suite
wbo4958 committed Mar 2, 2020
1 parent 74544e5 commit cf38b06
Showing 2 changed files with 72 additions and 49 deletions.
FeatureSizeValidatingSuite.scala
@@ -0,0 +1,71 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package ml.dmlc.xgboost4j.scala.spark
+
+import ml.dmlc.xgboost4j.java.XGBoostError
+import org.apache.spark.Partitioner
+import org.apache.spark.ml.feature.VectorAssembler
+import org.apache.spark.sql.SparkSession
+import org.scalatest.FunSuite
+
+import scala.util.Random
+
+class FeatureSizeValidatingSuite extends FunSuite with PerTest {
+
+  test("transform throwing exception if feature size of dataset is different with model's") {
+    val modelPath = getClass.getResource("/model/0.82/model").getPath
+    val model = XGBoostClassificationModel.read.load(modelPath)
+    val r = new Random(0)
+    // 0.82/model was trained with 251 features, and transform will throw an exception
+    // if the feature size of the data is not equal to 251
+    val df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))).
+      toDF("feature", "label")
+    val assembler = new VectorAssembler()
+      .setInputCols(df.columns.filter(!_.contains("label")))
+      .setOutputCol("features")
+    val thrown = intercept[Exception] {
+      model.transform(assembler.transform(df)).show()
+    }
+    assert(thrown.getMessage.contains(
+      "Number of columns does not match number of features in booster"))
+  }
+
+  test("train throwing exception if feature size of dataset is different on distributed train") {
+    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+      "objective" -> "binary:logistic",
+      "num_round" -> 5, "num_workers" -> 2, "use_external_memory" -> true, "missing" -> 0)
+    import DataUtils._
+    val sparkSession = SparkSession.builder().getOrCreate()
+    import sparkSession.implicits._
+    val repartitioned = sc.parallelize(Synthetic.trainWithDiffFeatureSize, 2)
+      .map(lp => (lp.label, lp)).partitionBy(
+        new Partitioner {
+          override def numPartitions: Int = 2
+
+          override def getPartition(key: Any): Int = key.asInstanceOf[Float].toInt
+        }
+      ).map(_._2).zipWithIndex().map {
+        case (lp, id) =>
+          (id, lp.label, lp.features)
+      }.toDF("id", "label", "features")
+    val xgb = new XGBoostClassifier(paramMap)
+    intercept[XGBoostError] {
+      xgb.fit(repartitioned)
+    }
+  }
+}
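
Note: Synthetic.trainWithDiffFeatureSize is a fixture defined elsewhere in the test sources; this diff does not show it. All the second test needs is a set of rows keyed by labels 0.0f and 1.0f whose feature vectors come in two different sizes, so that the custom Partitioner (label 0.0 to partition 0, label 1.0 to partition 1) hands each of the two workers a different feature size and xgb.fit fails with an XGBoostError. A minimal sketch of such a fixture, assuming xgboost4j's LabeledPoint(label, size, indices, values) constructor, where a null indices array denotes a dense vector; the actual fixture in the repository may differ:

import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}

// Hypothetical stand-in for Synthetic.trainWithDiffFeatureSize (not from this commit).
object Synthetic {
  // Label 0.0f rows carry 3 features, label 1.0f rows carry 4, so the two
  // label-keyed partitions disagree on feature size during distributed training.
  val trainWithDiffFeatureSize: Seq[XGBLabeledPoint] = Seq(
    XGBLabeledPoint(0.0f, 3, null, Array(1.0f, 2.0f, 3.0f)),
    XGBLabeledPoint(1.0f, 4, null, Array(1.0f, 2.0f, 3.0f, 4.0f)),
    XGBLabeledPoint(0.0f, 3, null, Array(0.5f, 1.5f, 2.5f)),
    XGBLabeledPoint(1.0f, 4, null, Array(0.5f, 1.5f, 2.5f, 3.5f))
  )
}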
XGBoostGeneralSuite.scala
@@ -16,19 +16,12 @@

package ml.dmlc.xgboost4j.scala.spark

-import java.nio.file.Files
-
-import ml.dmlc.xgboost4j.java.XGBoostError
-
-import scala.util.Random
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
import ml.dmlc.xgboost4j.scala.DMatrix
import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
import org.apache.hadoop.fs.{FileSystem, Path}
-import org.apache.spark.{Partitioner, TaskContext}
+import org.apache.spark.{TaskContext}
import org.scalatest.FunSuite
-import org.apache.spark.ml.feature.VectorAssembler
-import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.lit

class XGBoostGeneralSuite extends FunSuite with TmpFolderPerSuite with PerTest {
@@ -374,45 +367,4 @@ class XGBoostGeneralSuite extends FunSuite with TmpFolderPerSuite with PerTest {
    df2.collect()
  }

-  test("transform throwing exception when feature size of dataset is different with model's") {
-    val modelPath = getClass.getResource("/model/0.82/model").getPath
-    val model = XGBoostClassificationModel.read.load(modelPath)
-    val r = new Random(0)
-    // 0.82/model was trained with 251 features, and transform will throw an exception
-    // if the feature size of the data is not equal to 251
-    val df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))).
-      toDF("feature", "label")
-    val assembler = new VectorAssembler()
-      .setInputCols(df.columns.filter(!_.contains("label")))
-      .setOutputCol("features")
-    val thrown = intercept[Exception] {
-      model.transform(assembler.transform(df)).show()
-    }
-    assert(thrown.getMessage.contains(
-      "Number of columns does not match number of features in booster"))
-  }
-
-  test("train throwing exception when feature size of dataset is different on distributed train") {
-    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
-      "objective" -> "binary:logistic",
-      "num_round" -> 5, "num_workers" -> 2, "use_external_memory" -> true, "missing" -> 0)
-    import DataUtils._
-    val sparkSession = SparkSession.builder().getOrCreate()
-    import sparkSession.implicits._
-    val repartitioned = sc.parallelize(Synthetic.trainWithDiffFeatureSize, 2)
-      .map(lp => (lp.label, lp)).partitionBy(
-        new Partitioner {
-          override def numPartitions: Int = 2
-
-          override def getPartition(key: Any): Int = key.asInstanceOf[Float].toInt
-        }
-      ).map(_._2).zipWithIndex().map {
-        case (lp, id) =>
-          (id, lp.label, lp.features)
-      }.toDF("id", "label", "features")
-    val xgb = new XGBoostClassifier(paramMap)
-    intercept[XGBoostError] {
-      xgb.fit(repartitioned)
-    }
-  }
 }

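The message asserted in the transform test ("Number of columns does not match number of features in booster") is raised by the booster itself once the mismatched data reaches it. A caller who prefers to fail fast can compare the assembled vector width against the expected feature count before invoking the model. A minimal sketch, where the helper name and the hard-coded 251 (the 0.82 model's feature count, per the comment in the test) are illustrative rather than part of this commit:

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.DataFrame

// Hypothetical pre-flight check: verify the width of the assembled "features"
// column before handing the DataFrame to an XGBoost model.
def requireFeatureSize(df: DataFrame, expected: Int): Unit = {
  val actual = df.select("features").head().getAs[Vector](0).size
  require(actual == expected,
    s"feature size $actual does not match the booster's expected size $expected")
}

// e.g. requireFeatureSize(assembler.transform(df), 251)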