diff --git a/config/checkstyle/scalastyle_config.xml b/config/checkstyle/scalastyle_config.xml
index d8598040..48c4e16a 100644
--- a/config/checkstyle/scalastyle_config.xml
+++ b/config/checkstyle/scalastyle_config.xml
@@ -67,7 +67,7 @@
-
+
@@ -75,7 +75,7 @@
-
+
diff --git a/src/main/scala/io/archivesunleashed/DataFrameLoader.scala b/src/main/scala/io/archivesunleashed/DataFrameLoader.scala
index e90eae6d..1ab86e0e 100644
--- a/src/main/scala/io/archivesunleashed/DataFrameLoader.scala
+++ b/src/main/scala/io/archivesunleashed/DataFrameLoader.scala
@@ -18,9 +18,7 @@
package io.archivesunleashed
import org.apache.spark.SparkContext
-// scalastyle:off underscore.import
-import org.apache.spark.sql._
-// scalastyle:on underscore.import
+import org.apache.spark.sql.DataFrame
class DataFrameLoader(sc: SparkContext) {
def extractValidPages(path: String): DataFrame = {
diff --git a/src/main/scala/io/archivesunleashed/app/ExtractEntities.scala b/src/main/scala/io/archivesunleashed/app/ExtractEntities.scala
index a51d195b..00627049 100644
--- a/src/main/scala/io/archivesunleashed/app/ExtractEntities.scala
+++ b/src/main/scala/io/archivesunleashed/app/ExtractEntities.scala
@@ -16,9 +16,7 @@
*/
package io.archivesunleashed.app
-// scalastyle:off underscore.import
-import io.archivesunleashed._
-// scalastyle:on underscore.import
+import io.archivesunleashed.RecordLoader
import io.archivesunleashed.matchbox.{NERClassifier, RemoveHTML}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
diff --git a/src/main/scala/io/archivesunleashed/app/ExtractGraphX.scala b/src/main/scala/io/archivesunleashed/app/ExtractGraphX.scala
index ddb72a6e..1d107e34 100644
--- a/src/main/scala/io/archivesunleashed/app/ExtractGraphX.scala
+++ b/src/main/scala/io/archivesunleashed/app/ExtractGraphX.scala
@@ -16,11 +16,7 @@
*/
package io.archivesunleashed.app
-// scalastyle:off underscore.import
-import io.archivesunleashed._
-import io.archivesunleashed.matchbox
-import org.apache.spark.graphx._
-// scalastyle:on underscore.import
+import org.apache.spark.graphx.{Edge, Graph, PartitionStrategy, VertexId}
import org.apache.spark.rdd.RDD
/** Extracts a site link structure using Spark's GraphX utility. */
diff --git a/src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala b/src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala
index cbabe535..4654be18 100644
--- a/src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala
+++ b/src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala
@@ -16,9 +16,7 @@
*/
package io.archivesunleashed.app
-// scalastyle:off underscore.import
-import io.archivesunleashed._
-// scalastyle:on underscore.import
+import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.matchbox.{ComputeImageSize, ComputeMD5}
import org.apache.spark.rdd.RDD
import org.apache.spark.{RangePartitioner, SparkContext}
diff --git a/src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala b/src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala
index 1a2fcfd3..4e2c29b5 100644
--- a/src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala
+++ b/src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala
@@ -21,9 +21,7 @@ import java.io.{BufferedReader, BufferedWriter, InputStreamReader, OutputStreamW
import io.archivesunleashed.matchbox.NERClassifier
import io.archivesunleashed.util.JsonUtils
import org.apache.hadoop.conf.Configuration
-// scalastyle:off underscore.import
-import org.apache.hadoop.fs._
-// scalastyle:on underscore.import
+import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.SparkContext
import scala.collection.mutable.MutableList
diff --git a/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala b/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala
index dd00b836..a3d26bc0 100644
--- a/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala
+++ b/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala
@@ -15,9 +15,7 @@
* limitations under the License.
*/
package io.archivesunleashed.app
-// scalastyle:off underscore.import
-import io.archivesunleashed.matchbox._
-// scalastyle:on underscore.import
+import io.archivesunleashed.matchbox.{ComputeMD5, WWWLink}
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import org.apache.spark.rdd.RDD
diff --git a/src/main/scala/io/archivesunleashed/app/WriteGraph.scala b/src/main/scala/io/archivesunleashed/app/WriteGraph.scala
index 26f64f5c..af91e3e7 100644
--- a/src/main/scala/io/archivesunleashed/app/WriteGraph.scala
+++ b/src/main/scala/io/archivesunleashed/app/WriteGraph.scala
@@ -15,9 +15,7 @@
* limitations under the License.
*/
package io.archivesunleashed.app
-// scalastyle:off underscore.import
-import io.archivesunleashed.matchbox._
-// scalastyle:on underscore.import
+import io.archivesunleashed.matchbox.{ComputeMD5, WWWLink}
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import org.apache.spark.rdd.RDD
diff --git a/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala b/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala
index 0044d52a..fa8a7d04 100644
--- a/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala
+++ b/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala
@@ -15,9 +15,7 @@
* limitations under the License.
*/
package io.archivesunleashed.app
-// scalastyle:off underscore.import
-import io.archivesunleashed.matchbox._
-// scalastyle: on underscore.import
+import io.archivesunleashed.matchbox.{ComputeMD5, WWWLink}
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import org.apache.spark.rdd.RDD
diff --git a/src/main/scala/io/archivesunleashed/app/WriteGraphXML.scala b/src/main/scala/io/archivesunleashed/app/WriteGraphXML.scala
index e109a225..afac0bf6 100644
--- a/src/main/scala/io/archivesunleashed/app/WriteGraphXML.scala
+++ b/src/main/scala/io/archivesunleashed/app/WriteGraphXML.scala
@@ -15,10 +15,7 @@
* limitations under the License.
*/
package io.archivesunleashed.app
-// scalastyle:off underscore.import
-import io.archivesunleashed.matchbox._
-import org.apache.spark.graphx._
-// scalastyle:on underscore.import
+import org.apache.spark.graphx.Graph
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import io.archivesunleashed.app.ExtractGraphX.{VertexData,EdgeData,VertexDataPR}
diff --git a/src/main/scala/io/archivesunleashed/df/package.scala b/src/main/scala/io/archivesunleashed/df/package.scala
index 21f6ef7a..00ccbe01 100644
--- a/src/main/scala/io/archivesunleashed/df/package.scala
+++ b/src/main/scala/io/archivesunleashed/df/package.scala
@@ -17,9 +17,7 @@
package io.archivesunleashed
-// scalastyle:off underscore.import
-import io.archivesunleashed.matchbox._
-// scalastyle:on underscore.import
+import io.archivesunleashed.matchbox.{ComputeMD5, ExtractDomain, RemoveHTML}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.DataFrame
import java.io.ByteArrayInputStream
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala
index ca07de7b..a7be3447 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala
@@ -23,9 +23,8 @@ object ExtractDate {
type DateComponent = Value
val YYYY, MM, DD, YYYYMM, YYYYMMDD = Value
}
- // scalastyle:off underscore.import
- import DateComponent._
- // scalastyle:on underscore.import
+
+ import DateComponent.{DateComponent, DD, MM, YYYY, YYYYMM}
/** Extracts the wanted date component from a date.
*
diff --git a/src/main/scala/io/archivesunleashed/matchbox/package.scala b/src/main/scala/io/archivesunleashed/matchbox/package.scala
index a3340dde..8b953d37 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/package.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/package.scala
@@ -19,9 +19,7 @@ package io.archivesunleashed
import java.io.IOException
import java.security.MessageDigest
-// scalastyle:off underscore.import
-import scala.xml.Utility._
-// scalastyle:on underscore.import
+import scala.xml.Utility.escape
/** Package object which supplies implicits providing common UDF-related functionalities. */
diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala
index d6ed4ede..d8327fbd 100644
--- a/src/main/scala/io/archivesunleashed/package.scala
+++ b/src/main/scala/io/archivesunleashed/package.scala
@@ -24,11 +24,9 @@ import io.archivesunleashed.matchbox.ImageDetails
import io.archivesunleashed.matchbox.ExtractDate.DateComponent
import java.net.URI
import org.apache.hadoop.fs.{FileSystem, Path}
-// scalastyle:off underscore.import
-import io.archivesunleashed.matchbox.ExtractDate.DateComponent._
-import org.apache.spark.sql._
-import org.apache.spark.sql.types._
-// scalastyle:on: underscore.import
+import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent
+import org.apache.spark.sql.{DataFrame, Row, SparkSession}
+import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.hadoop.io.LongWritable
import org.apache.spark.{SerializableWritable, SparkContext}
import org.apache.spark.rdd.RDD
diff --git a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala
index f13113fc..59db63b9 100644
--- a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala
+++ b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala
@@ -31,6 +31,13 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
private val master = "local[4]"
private val appName = "example-spark"
private var sc: SparkContext = _
+ private val exampleArc = "example.arc.gz"
+ private val exampleWarc = "example.warc.gz"
+ private val exampleDate = "20080430"
+ private val exampleUrl = "www.archive.org"
+ private val exampleStatusCode1 = "000"
+ private val exampleStatusCode2 = "200"
+ private val exampleMimeType = "text/plain"
before {
val conf = new SparkConf()
@@ -51,10 +58,10 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
.take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => FilenameUtils.getName(x.getArchiveFilename)).take(3)
- assert(textSampleArc.deep == Array("example.arc.gz",
- "example.arc.gz", "example.arc.gz").deep)
- assert(textSampleWarc.deep == Array("example.warc.gz",
- "example.warc.gz", "example.warc.gz").deep)
+ assert(textSampleArc.deep == Array(exampleArc,
+ exampleArc, exampleArc).deep)
+ assert(textSampleWarc.deep == Array(exampleWarc,
+ exampleWarc, exampleWarc).deep)
}
test("Crawl Dates") {
@@ -62,8 +69,8 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
.map(x => x.getCrawlDate).take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => x.getCrawlDate).take(3)
- assert(textSampleArc.deep == Array("20080430", "20080430", "20080430").deep)
- assert(textSampleWarc.deep == Array("20080430", "20080430", "20080430").deep)
+ assert(textSampleArc.deep == Array(exampleDate, exampleDate, exampleDate).deep)
+ assert(textSampleWarc.deep == Array(exampleDate, exampleDate, exampleDate).deep)
}
test("Domains") {
@@ -71,8 +78,8 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
.map(x => x.getDomain).take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => x.getDomain).take(3)
- assert(textSampleArc.deep == Array("", "", "www.archive.org").deep)
- assert(textSampleWarc.deep == Array("", "www.archive.org", "www.archive.org").deep)
+ assert(textSampleArc.deep == Array("", "", exampleUrl).deep)
+ assert(textSampleWarc.deep == Array("", exampleUrl, exampleUrl).deep)
}
test("Urls") {
@@ -91,8 +98,10 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
.map(x => x.getMimeType).take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => x.getMimeType).take(3)
- assert (textSampleArc.deep == Array ("text/plain", "text/dns", "text/plain").deep)
- assert (textSampleWarc.deep == Array("unknown", "text/plain", "text/html").deep)
+ assert (textSampleArc.deep == Array (exampleMimeType, "text/dns",
+ exampleMimeType).deep)
+ assert (textSampleWarc.deep == Array("unknown", exampleMimeType,
+ "text/html").deep)
}
test("Get Http Status") {
@@ -100,8 +109,10 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
.map(x => x.getHttpStatus).take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => x.getHttpStatus).take(3)
- assert (textSampleArc.deep == Array("000", "000", "200").deep)
- assert (textSampleWarc.deep == Array("000", "200", "200").deep)
+ assert (textSampleArc.deep == Array(exampleStatusCode1, exampleStatusCode1,
+ exampleStatusCode2).deep)
+ assert (textSampleWarc.deep == Array(exampleStatusCode1, exampleStatusCode2,
+ exampleStatusCode2).deep)
}
after {
diff --git a/src/test/scala/io/archivesunleashed/app/WriteGraphTest.scala b/src/test/scala/io/archivesunleashed/app/WriteGraphTest.scala
index a6bf5f95..d41288ef 100644
--- a/src/test/scala/io/archivesunleashed/app/WriteGraphTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/WriteGraphTest.scala
@@ -32,18 +32,28 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
private var sc: SparkContext = _
private val master = "local[4]"
private val appName = "example-spark"
- private val network = Seq((("Date1", "Source1", "Destination1"), 3),
- (("Date2", "Source2", "Destination2"), 4),
- (("Date3", "Source3", "Destination3"), 100))
- private val unescapedNetwork = Seq((("Date1", "Source1", "Destination1"), 3),
- (("Date2", "Source2", "Destination2"), 4),
- (("Date3", "Source<3", "Destination<3"), 100))
- private val networkDf = Seq(("Date1", "Source1", "Destination1", 3),
- ("Date2", "Source2", "Destination2", 4),
- ("Date3", "Source3", "Destination3", 100))
- private val networkWithDuplication = Seq((("Date1", "Source1", "Destination1"), 3),
- (("Date2", "Source2", "Source2"), 4),
- (("Date3", "Source3", "Destination3"), 100))
+ private val date1 = "Date1"
+ private val date2 = "Date2"
+ private val date3 = "Date3"
+ private val source1 = "Source1"
+ private val source2 = "Source2"
+ private val source3 = "Source3"
+ private val destination1 = "Destination1"
+ private val destination2 = "Destination2"
+ private val destination3 = "Destination3"
+ private val xmlDeclaration = """<?xml version="1.0" encoding="UTF-8"?>"""
+ private val network = Seq(((date1, source1, destination1), 3),
+ ((date2, source2, destination2), 4),
+ ((date3, source3, destination3), 100))
+ private val unescapedNetwork = Seq(((date1, source1, destination1), 3),
+ ((date2, source2, destination2), 4),
+ ((date3, "Source<3", "Destination<3"), 100))
+ private val networkDf = Seq((date1, source1, destination1, 3),
+ (date2, source2, destination2, 4),
+ (date3, source3, destination3, 100))
+ private val networkWithDuplication = Seq(((date1, source1, destination1), 3),
+ ((date2, source2, source2), 4),
+ ((date3, source3, destination3), 100))
private val testFile = "temporaryTestFile.txt"
private val testFile2 = "temporaryTestFile2.txt"
@@ -61,7 +71,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
WriteGraph.asGexf(networkrdd, testFile)
assert(Files.exists(Paths.get(testFile)))
val lines = Source.fromFile(testFile).getLines.toList
- assert(lines(testLines._1) == """""")
+ assert(lines(testLines._1) == xmlDeclaration)
assert(lines(testLines._2) == """""")
assert(lines(testLines._3) == """""")
assert(lines(testLines._4) == """""")
@@ -77,7 +87,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
val ret = WriteGraph.asGexf(networkarray, testFile)
assert(ret)
val lines = Source.fromFile(testFile).getLines.toList
- assert(lines(testLines._1) == """""")
+ assert(lines(testLines._1) == xmlDeclaration)
assert(lines(testLines._2) == """""")
assert(lines(testLines._3) == """""")
assert(lines(testLines._4) == """""")
@@ -104,7 +114,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
test ("Nodelookup returns a option") {
val networkrdd = sc.parallelize(network)
val nodes = WriteGraph.nodesWithIds(networkrdd)
- val lookup = "Source1"
+ val lookup = source1
val badlookup = "NOTTHERE"
assert (WriteGraph.nodeLookup(nodes, badlookup) == None)
assert (WriteGraph.nodeLookup(nodes, lookup) == Some((lookup, 6)))
@@ -115,7 +125,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
val nodes = WriteGraph.nodesWithIds(sc.parallelize(network))
val empty = -1
val expected = 6
- val lookup = WriteGraph.nodeLookup(nodes, "Source1")
+ val lookup = WriteGraph.nodeLookup(nodes, source1)
val badlookup = WriteGraph.nodeLookup(nodes, "NOTTHERE")
assert (WriteGraph.nodeIdFromLabel(lookup) == expected)
assert (WriteGraph.nodeIdFromLabel(badlookup) == empty)
@@ -123,9 +133,9 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
test ("Edge ids are captured from lookup") {
val edges = WriteGraph.edgeNodes(sc.parallelize(network))
- val expected = Array(("Date1", 6, 3, 3),
- ("Date2", 7, 4, 4),
- ("Date3", 0, 5, 100)).deep
+ val expected = Array((date1, 6, 3, 3),
+ (date2, 7, 4, 4),
+ (date3, 0, 5, 100)).deep
assert(edges.collect.deep == expected)
}
@@ -135,7 +145,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
WriteGraph.asGraphml(networkrdd, testFile)
assert(Files.exists(Paths.get(testFile)))
val lines = Source.fromFile(testFile).getLines.toList
- assert(lines(testLines._1) == """""")
+ assert(lines(testLines._1) == xmlDeclaration)
assert(lines(testLines._2) == """Source3""")
assert(lines(testLines._3) == """3""")
assert(lines(testLines._4) == """""")
@@ -147,7 +157,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
WriteGraph.asGraphml(networkrdd, testFile)
assert(Files.exists(Paths.get(testFile)))
val lines = Source.fromFile(testFile).getLines.toList
- assert(lines(testLines._1) == """""")
+ assert(lines(testLines._1) == xmlDeclaration)
assert(lines(testLines._2) == """Destination<3""")
assert(lines(testLines._3) == """100""")
assert(lines(testLines._4) == """""")
@@ -159,7 +169,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
WriteGraph(networkrdd, testFile2)
assert(Files.exists(Paths.get(testFile2)))
val lines = Source.fromFile(testFile2).getLines.toList
- assert(lines(testLines._1) == """""")
+ assert(lines(testLines._1) == xmlDeclaration)
assert(lines(testLines._2) == """""")
assert(lines(testLines._3) == """""")
assert(lines(testLines._4) == """""")
diff --git a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala
index da1cf5bd..8b32d69e 100644
--- a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala
@@ -33,6 +33,9 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
private val master = "local[4]"
private val appName = "example-df"
private var sc: SparkContext = _
+ private val url = "url"
+ private val mime_type = "mime_type"
+ private val md5 = "md5"
before {
val conf = new SparkConf()
@@ -48,9 +51,9 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
val imageLinks = df.extractImageLinks(arcPath)
val images = df.extractImages(arcPath)
- val r_1 = validPages.select("url", "mime_type").take(1)(0)
- assert(r_1.getAs[String]("url") == "http://www.archive.org/")
- assert(r_1.getAs[String]("mime_type") == "text/html")
+ val r_1 = validPages.select(url, mime_type).take(1)(0)
+ assert(r_1.getAs[String](url) == "http://www.archive.org/")
+ assert(r_1.getAs[String](mime_type) == "text/html")
val r_2 = hyperlinks.select("Dest", "Anchor").take(3)(2)
assert(r_2(0) == "http://web.archive.org/collections/web/advanced.html")
@@ -61,8 +64,8 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
assert(r_3.get(1) == "http://www.archive.org/images/star.png")
val r_4 = images.take(1)(0)
- assert(r_4.getAs[String]("url") == "http://www.archive.org/images/logoc.jpg")
- assert(r_4.getAs[String]("md5") == "8211d1fbb9b03d8522a1ae378f9d1b24")
+ assert(r_4.getAs[String](url) == "http://www.archive.org/images/logoc.jpg")
+ assert(r_4.getAs[String](md5) == "8211d1fbb9b03d8522a1ae378f9d1b24")
}
after {
diff --git a/src/test/scala/io/archivesunleashed/df/SaveImageTest.scala b/src/test/scala/io/archivesunleashed/df/SaveImageTest.scala
index c38ddc50..0db9fccb 100644
--- a/src/test/scala/io/archivesunleashed/df/SaveImageTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/SaveImageTest.scala
@@ -43,6 +43,7 @@ class SaveImageTest extends FunSuite with BeforeAndAfter {
private val master = "local[4]"
private val appName = "example-df"
private var sc: SparkContext = _
+ private val testString = "bytes"
before {
val conf = new SparkConf()
@@ -52,11 +53,10 @@ class SaveImageTest extends FunSuite with BeforeAndAfter {
}
test("Save image") {
- val testString = "bytes"
val df = RecordLoader.loadArchives(arcPath, sc)
.extractImageDetailsDF()
- val extracted = df.select("bytes")
+ val extracted = df.select(testString)
.orderBy(desc(testString)).limit(1)
extracted.saveToDisk(testString, "/tmp/foo")
@@ -97,7 +97,7 @@ class SaveImageTest extends FunSuite with BeforeAndAfter {
// scalastyle:on
val df = Seq(dummyImg).toDF
- df.saveToDisk("bytes", "/tmp/bar")
+ df.saveToDisk(testString, "/tmp/bar")
// Check that no file was written.
assert(new File("/tmp").listFiles.filter(_.isFile).toList
diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala
index d365291c..76ce1121 100644
--- a/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala
+++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala
@@ -25,17 +25,20 @@ import org.scalatest.junit.JUnitRunner
class ExtractDomainTest extends FunSuite {
private val index = "index.html"
private val umiacs = "www.umiacs.umd.edu"
+ private val jimmylin = "http://www.umiacs.umd.edu/~jimmylin/"
+ private val lintool = "https://github.com/lintool"
+ private val github = "github.com"
private val data1: Seq[(String, String)] = Seq.newBuilder.+=(
- ("http://www.umiacs.umd.edu/~jimmylin/", umiacs),
- ("https://github.com/lintool", "github.com"),
+ (jimmylin, umiacs),
+ (lintool, github),
("http://ianmilligan.ca/2015/05/04/iipc-2015-slides-for-warcs-wats-and-wgets-presentation/", "ianmilligan.ca"),
(index, "")).result()
private val data2 = Seq.newBuilder.+=(
- (index, "http://www.umiacs.umd.edu/~jimmylin/", umiacs),
- ("https://github.com/lintool", "http://www.umiacs.umd.edu/~jimmylin/", "github.com"),
- (index, "https://github.com/lintool", "github.com")).result()
+ (index, jimmylin, umiacs),
+ (lintool, jimmylin, github),
+ (index, lintool, github)).result()
private val data3 = Seq.newBuilder.+=(
("http://www.seetorontonow.canada-booknow.com\\booking_results.php", "www.seetorontonow.canada-booknow.com")).result()