From 34df92a74d38674866e52ef61e6a13277ebfbc48 Mon Sep 17 00:00:00 2001
From: danyaljj
Date: Thu, 30 Jun 2016 23:31:44 -0700
Subject: [PATCH] functionality to print features in arff format.

---
 .../cogcomp/saul/classifier/Learnable.scala | 50 ++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/saul-core/src/main/scala/edu/illinois/cs/cogcomp/saul/classifier/Learnable.scala b/saul-core/src/main/scala/edu/illinois/cs/cogcomp/saul/classifier/Learnable.scala
index 4daabc35..09978b6d 100644
--- a/saul-core/src/main/scala/edu/illinois/cs/cogcomp/saul/classifier/Learnable.scala
+++ b/saul-core/src/main/scala/edu/illinois/cs/cogcomp/saul/classifier/Learnable.scala
@@ -1,6 +1,6 @@
 package edu.illinois.cs.cogcomp.saul.classifier
 
-import java.io.File
+import java.io.{ File, PrintWriter }
 import java.net.URL
 
 import edu.illinois.cs.cogcomp.core.io.IOUtils
@@ -371,6 +371,54 @@ abstract class Learnable[T <: AnyRef](val node: Node[T], val parameters: Paramet
     using(properties: _*)
   }
 
+  /** Writes the feature vectors of the training and the testing instances, along with their output labels,
+    * to disk in sparse ARFF format: http://www.cs.waikato.ac.nz/ml/weka/arff.html
+    *
+    * @param location directory in which `outputFeatures_train.arff` and `outputFeatures_test.arff` are created
+    */
+  def printAllFeatures(location: String): Unit = {
+    // one-time dry run over all the instances, to add all their features to the lexicon
+    node.getAllInstances.foreach { instance => this.classifier.getExampleArray(instance, true) }
+    logger.info("Feature length: " + this.classifier.getPrunedLexiconSize)
+    printFeatures(location, train = true)
+    printFeatures(location, train = false)
+  }
+
+  /** Writes either the training or the testing instances to `outputFeatures_train.arff` or
+    * `outputFeatures_test.arff` under `location`, one sparse ARFF data row per instance.
+    */
+  private def printFeatures(location: String, train: Boolean): Unit = {
+    val pw = new PrintWriter(new File(s"$location/outputFeatures_${if (train) "train" else "test"}.arff"))
+    // close the writer even if feature extraction fails half way, so that the file handle is not leaked
+    try {
+      val featureLength = this.classifier.getPrunedLexiconSize
+
+      pw.write("@RELATION EssentialTerms\n")
+      (0 until featureLength).foreach { idx => pw.write(s"@ATTRIBUTE f$idx NUMERIC\n") }
+      pw.write("@ATTRIBUTE class {IMPORTANT, NOT-IMPORTANT}\n")
+      pw.write("@DATA\n")
+
+      val examples = if (train) node.trainingSet else node.testingSet
+      examples.foreach { cons =>
+        val out = this.classifier.getExampleArray(cons, true)
+        val indices = out(0).asInstanceOf[Array[Int]].toList
+        val values = out(1).asInstanceOf[Array[Double]].toList
+
+        val featureEntries = indices.zip(values).
+          groupBy { _._1 }.map { _._2.head }.toList. // remove the repeated features
+          filter { case (_, value) => value != 0.0 }. // drop zero features
+          sortBy { case (ind, _) => ind }.
+          map { case (ind, value) => s"$ind ${if (value == 1.0) "1" else value}" } // print 1.0 as an integer
+        // the class attribute is declared after the features, hence its index is `featureLength`
+        val labelEntry = s"$featureLength ${getLabeler.discreteValue(cons)}"
+        // join all the entries in one go, so that an instance with no active feature does not start with ", "
+        pw.write((featureEntries :+ labelEntry).mkString("{", ", ", "}\n"))
+      }
+    } finally {
+      pw.close()
+    }
+  }
+
   // TODO Move the window properties out of Learner class.
   /** A windows of properties
     *