diff --git a/config/checkstyle/scalastyle_config.xml b/config/checkstyle/scalastyle_config.xml index d8598040..48c4e16a 100644 --- a/config/checkstyle/scalastyle_config.xml +++ b/config/checkstyle/scalastyle_config.xml @@ -67,7 +67,7 @@ - + @@ -75,7 +75,7 @@ - + diff --git a/src/main/scala/io/archivesunleashed/DataFrameLoader.scala b/src/main/scala/io/archivesunleashed/DataFrameLoader.scala index e90eae6d..1ab86e0e 100644 --- a/src/main/scala/io/archivesunleashed/DataFrameLoader.scala +++ b/src/main/scala/io/archivesunleashed/DataFrameLoader.scala @@ -18,9 +18,7 @@ package io.archivesunleashed import org.apache.spark.SparkContext -// scalastyle:off underscore.import -import org.apache.spark.sql._ -// scalastyle:on underscore.import +import org.apache.spark.sql.DataFrame class DataFrameLoader(sc: SparkContext) { def extractValidPages(path: String): DataFrame = { diff --git a/src/main/scala/io/archivesunleashed/app/ExtractEntities.scala b/src/main/scala/io/archivesunleashed/app/ExtractEntities.scala index a51d195b..00627049 100644 --- a/src/main/scala/io/archivesunleashed/app/ExtractEntities.scala +++ b/src/main/scala/io/archivesunleashed/app/ExtractEntities.scala @@ -16,9 +16,7 @@ */ package io.archivesunleashed.app -// scalastyle:off underscore.import -import io.archivesunleashed._ -// scalastyle:on underscore.import +import io.archivesunleashed.RecordLoader import io.archivesunleashed.matchbox.{NERClassifier, RemoveHTML} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD diff --git a/src/main/scala/io/archivesunleashed/app/ExtractGraphX.scala b/src/main/scala/io/archivesunleashed/app/ExtractGraphX.scala index ddb72a6e..1d107e34 100644 --- a/src/main/scala/io/archivesunleashed/app/ExtractGraphX.scala +++ b/src/main/scala/io/archivesunleashed/app/ExtractGraphX.scala @@ -16,11 +16,7 @@ */ package io.archivesunleashed.app -// scalastyle:off underscore.import -import io.archivesunleashed._ -import io.archivesunleashed.matchbox -import org.apache.spark.graphx._ -// scalastyle:on underscore.import +import org.apache.spark.graphx.{Edge, Graph, PartitionStrategy, VertexId} import org.apache.spark.rdd.RDD /** Extracts a site link structure using Spark's GraphX utility. */ diff --git a/src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala b/src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala index cbabe535..4654be18 100644 --- a/src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala +++ b/src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala @@ -16,9 +16,7 @@ */ package io.archivesunleashed.app -// scalastyle:off underscore.import -import io.archivesunleashed._ -// scalastyle:on underscore.import +import io.archivesunleashed.ArchiveRecord import io.archivesunleashed.matchbox.{ComputeImageSize, ComputeMD5} import org.apache.spark.rdd.RDD import org.apache.spark.{RangePartitioner, SparkContext} diff --git a/src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala b/src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala index 1a2fcfd3..4e2c29b5 100644 --- a/src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala +++ b/src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala @@ -21,9 +21,7 @@ import java.io.{BufferedReader, BufferedWriter, InputStreamReader, OutputStreamW import io.archivesunleashed.matchbox.NERClassifier import io.archivesunleashed.util.JsonUtils import org.apache.hadoop.conf.Configuration -// scalastyle:off underscore.import -import org.apache.hadoop.fs._ -// scalastyle:on underscore.import +import org.apache.hadoop.fs.{FileUtil, FileSystem, Path} import org.apache.spark.SparkContext import scala.collection.mutable.MutableList diff --git a/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala b/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala index dd00b836..a3d26bc0 100644 --- a/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala +++ b/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala @@ -15,9 +15,7 @@ * limitations under the License. */ package io.archivesunleashed.app -// scalastyle:off underscore.import -import io.archivesunleashed.matchbox._ -// scalastyle:on underscore.import +import io.archivesunleashed.matchbox.{ComputeMD5, WWWLink} import java.nio.charset.StandardCharsets import java.nio.file.{Files, Paths} import org.apache.spark.rdd.RDD diff --git a/src/main/scala/io/archivesunleashed/app/WriteGraph.scala b/src/main/scala/io/archivesunleashed/app/WriteGraph.scala index 26f64f5c..af91e3e7 100644 --- a/src/main/scala/io/archivesunleashed/app/WriteGraph.scala +++ b/src/main/scala/io/archivesunleashed/app/WriteGraph.scala @@ -15,9 +15,7 @@ * limitations under the License. */ package io.archivesunleashed.app -// scalastyle:off underscore.import -import io.archivesunleashed.matchbox._ -// scalastyle:on underscore.import +import io.archivesunleashed.matchbox.{ComputeMD5, WWWLink} import java.nio.charset.StandardCharsets import java.nio.file.{Files, Paths} import org.apache.spark.rdd.RDD diff --git a/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala b/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala index 0044d52a..fa8a7d04 100644 --- a/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala +++ b/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala @@ -15,9 +15,7 @@ * limitations under the License. */ package io.archivesunleashed.app -// scalastyle:off underscore.import -import io.archivesunleashed.matchbox._ -// scalastyle: on underscore.import +import io.archivesunleashed.matchbox.{ComputeMD5, WWWLink} import java.nio.charset.StandardCharsets import java.nio.file.{Files, Paths} import org.apache.spark.rdd.RDD diff --git a/src/main/scala/io/archivesunleashed/app/WriteGraphXML.scala b/src/main/scala/io/archivesunleashed/app/WriteGraphXML.scala index e109a225..afac0bf6 100644 --- a/src/main/scala/io/archivesunleashed/app/WriteGraphXML.scala +++ b/src/main/scala/io/archivesunleashed/app/WriteGraphXML.scala @@ -15,10 +15,7 @@ * limitations under the License. */ package io.archivesunleashed.app -// scalastyle:off underscore.import -import io.archivesunleashed.matchbox._ -import org.apache.spark.graphx._ -// scalastyle:on underscore.import +import org.apache.spark.graphx.Graph import java.nio.charset.StandardCharsets import java.nio.file.{Files, Paths} import io.archivesunleashed.app.ExtractGraphX.{VertexData,EdgeData,VertexDataPR} diff --git a/src/main/scala/io/archivesunleashed/df/package.scala b/src/main/scala/io/archivesunleashed/df/package.scala index 21f6ef7a..00ccbe01 100644 --- a/src/main/scala/io/archivesunleashed/df/package.scala +++ b/src/main/scala/io/archivesunleashed/df/package.scala @@ -17,9 +17,7 @@ package io.archivesunleashed -// scalastyle:off underscore.import -import io.archivesunleashed.matchbox._ -// scalastyle:on underscore.import +import io.archivesunleashed.matchbox.{ComputeMD5, ExtractDomain, RemoveHTML} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.DataFrame import java.io.ByteArrayInputStream diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala index ca07de7b..a7be3447 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala @@ -23,9 +23,8 @@ object ExtractDate { type DateComponent = Value val YYYY, MM, DD, YYYYMM, YYYYMMDD = Value } - // scalastyle:off underscore.import - import DateComponent._ - // scalastyle:on underscore.import + + import DateComponent.{DateComponent, DD, MM, YYYY, YYYYMM} /** Extracts the wanted date component from a date. * diff --git a/src/main/scala/io/archivesunleashed/matchbox/package.scala b/src/main/scala/io/archivesunleashed/matchbox/package.scala index a3340dde..8b953d37 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/package.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/package.scala @@ -19,9 +19,7 @@ package io.archivesunleashed import java.io.IOException import java.security.MessageDigest -// scalastyle:off underscore.import -import scala.xml.Utility._ -// scalastyle:on underscore.import +import scala.xml.Utility.escape /** Package object which supplies implicits providing common UDF-related functionalities. */ diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index d6ed4ede..d8327fbd 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -24,11 +24,9 @@ import io.archivesunleashed.matchbox.ImageDetails import io.archivesunleashed.matchbox.ExtractDate.DateComponent import java.net.URI import org.apache.hadoop.fs.{FileSystem, Path} -// scalastyle:off underscore.import -import io.archivesunleashed.matchbox.ExtractDate.DateComponent._ -import org.apache.spark.sql._ -import org.apache.spark.sql.types._ -// scalastyle:on: underscore.import +import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.apache.hadoop.io.LongWritable import org.apache.spark.{SerializableWritable, SparkContext} import org.apache.spark.rdd.RDD diff --git a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala index f13113fc..59db63b9 100644 --- a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala +++ b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala @@ -31,6 +31,13 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { private val master = "local[4]" private val appName = "example-spark" private var sc: SparkContext = _ + private val exampleArc = "example.arc.gz" + private val exampleWarc = "example.warc.gz" + private val exampleDate = "20080430" + private val exampleUrl = "www.archive.org" + private val exampleStatusCode1 = "000" + private val exampleStatusCode2 = "200" + private val exampleMimeType = "text/plain" before { val conf = new SparkConf() @@ -51,10 +58,10 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { .take(3) val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) .map(x => FilenameUtils.getName(x.getArchiveFilename)).take(3) - assert(textSampleArc.deep == Array("example.arc.gz", - "example.arc.gz", "example.arc.gz").deep) - assert(textSampleWarc.deep == Array("example.warc.gz", - "example.warc.gz", "example.warc.gz").deep) + assert(textSampleArc.deep == Array(exampleArc, + exampleArc, exampleArc).deep) + assert(textSampleWarc.deep == Array(exampleWarc, + exampleWarc, exampleWarc).deep) } test("Crawl Dates") { @@ -62,8 +69,8 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { .map(x => x.getCrawlDate).take(3) val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) .map(x => x.getCrawlDate).take(3) - assert(textSampleArc.deep == Array("20080430", "20080430", "20080430").deep) - assert(textSampleWarc.deep == Array("20080430", "20080430", "20080430").deep) + assert(textSampleArc.deep == Array(exampleDate, exampleDate, exampleDate).deep) + assert(textSampleWarc.deep == Array(exampleDate, exampleDate, exampleDate).deep) } test("Domains") { @@ -71,8 +78,8 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { .map(x => x.getDomain).take(3) val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) .map(x => x.getDomain).take(3) - assert(textSampleArc.deep == Array("", "", "www.archive.org").deep) - assert(textSampleWarc.deep == Array("", "www.archive.org", "www.archive.org").deep) + assert(textSampleArc.deep == Array("", "", exampleUrl).deep) + assert(textSampleWarc.deep == Array("", exampleUrl, exampleUrl).deep) } test("Urls") { @@ -91,8 +98,10 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { .map(x => x.getMimeType).take(3) val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) .map(x => x.getMimeType).take(3) - assert (textSampleArc.deep == Array ("text/plain", "text/dns", "text/plain").deep) - assert (textSampleWarc.deep == Array("unknown", "text/plain", "text/html").deep) + assert (textSampleArc.deep == Array (exampleMimeType, "text/dns", + exampleMimeType).deep) + assert (textSampleWarc.deep == Array("unknown", exampleMimeType, + "text/html").deep) } test("Get Http Status") { @@ -100,8 +109,10 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { .map(x => x.getHttpStatus).take(3) val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) .map(x => x.getHttpStatus).take(3) - assert (textSampleArc.deep == Array("000", "000", "200").deep) - assert (textSampleWarc.deep == Array("000", "200", "200").deep) + assert (textSampleArc.deep == Array(exampleStatusCode1, exampleStatusCode1, + exampleStatusCode2).deep) + assert (textSampleWarc.deep == Array(exampleStatusCode1, exampleStatusCode2, + exampleStatusCode2).deep) } after { diff --git a/src/test/scala/io/archivesunleashed/app/WriteGraphTest.scala b/src/test/scala/io/archivesunleashed/app/WriteGraphTest.scala index a6bf5f95..d41288ef 100644 --- a/src/test/scala/io/archivesunleashed/app/WriteGraphTest.scala +++ b/src/test/scala/io/archivesunleashed/app/WriteGraphTest.scala @@ -32,18 +32,28 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{ private var sc: SparkContext = _ private val master = "local[4]" private val appName = "example-spark" - private val network = Seq((("Date1", "Source1", "Destination1"), 3), - (("Date2", "Source2", "Destination2"), 4), - (("Date3", "Source3", "Destination3"), 100)) - private val unescapedNetwork = Seq((("Date1", "Source1", "Destination1"), 3), - (("Date2", "Source2", "Destination2"), 4), - (("Date3", "Source<3", "Destination<3"), 100)) - private val networkDf = Seq(("Date1", "Source1", "Destination1", 3), - ("Date2", "Source2", "Destination2", 4), - ("Date3", "Source3", "Destination3", 100)) - private val networkWithDuplication = Seq((("Date1", "Source1", "Destination1"), 3), - (("Date2", "Source2", "Source2"), 4), - (("Date3", "Source3", "Destination3"), 100)) + private val date1 = "Date1" + private val date2 = "Date2" + private val date3 = "Date3" + private val source1 = "Source1" + private val source2 = "Source2" + private val source3 = "Source3" + private val destination1 = "Destination1" + private val destination2 = "Destination2" + private val destination3 = "Destination3" + private val xmlDeclaration = """""" + private val network = Seq(((date1, source1, destination1), 3), + ((date2, source2, destination2), 4), + ((date3, source3, destination3), 100)) + private val unescapedNetwork = Seq(((date1, source1, destination1), 3), + ((date2, source2, destination2), 4), + ((date3, "Source<3", "Destination<3"), 100)) + private val networkDf = Seq((date1, source1, destination1, 3), + (date2, source2, destination2, 4), + (date3, source3, destination3, 100)) + private val networkWithDuplication = Seq(((date1, source1, destination1), 3), + ((date2, source2, source2), 4), + ((date3, source3, destination3), 100)) private val testFile = "temporaryTestFile.txt" private val testFile2 = "temporaryTestFile2.txt" @@ -61,7 +71,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{ WriteGraph.asGexf(networkrdd, testFile) assert(Files.exists(Paths.get(testFile))) val lines = Source.fromFile(testFile).getLines.toList - assert(lines(testLines._1) == """""") + assert(lines(testLines._1) == xmlDeclaration) assert(lines(testLines._2) == """""") assert(lines(testLines._3) == """""") assert(lines(testLines._4) == """""") @@ -77,7 +87,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{ val ret = WriteGraph.asGexf(networkarray, testFile) assert(ret) val lines = Source.fromFile(testFile).getLines.toList - assert(lines(testLines._1) == """""") + assert(lines(testLines._1) == xmlDeclaration) assert(lines(testLines._2) == """""") assert(lines(testLines._3) == """""") assert(lines(testLines._4) == """""") @@ -104,7 +114,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{ test ("Nodelookup returns a option") { val networkrdd = sc.parallelize(network) val nodes = WriteGraph.nodesWithIds(networkrdd) - val lookup = "Source1" + val lookup = source1 val badlookup = "NOTTHERE" assert (WriteGraph.nodeLookup(nodes, badlookup) == None) assert (WriteGraph.nodeLookup(nodes, lookup) == Some((lookup, 6))) @@ -115,7 +125,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{ val nodes = WriteGraph.nodesWithIds(sc.parallelize(network)) val empty = -1 val expected = 6 - val lookup = WriteGraph.nodeLookup(nodes, "Source1") + val lookup = WriteGraph.nodeLookup(nodes, source1) val badlookup = WriteGraph.nodeLookup(nodes, "NOTTHERE") assert (WriteGraph.nodeIdFromLabel(lookup) == expected) assert (WriteGraph.nodeIdFromLabel(badlookup) == empty) @@ -123,9 +133,9 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{ test ("Edge ids are captured from lookup") { val edges = WriteGraph.edgeNodes(sc.parallelize(network)) - val expected = Array(("Date1", 6, 3, 3), - ("Date2", 7, 4, 4), - ("Date3", 0, 5, 100)).deep + val expected = Array((date1, 6, 3, 3), + (date2, 7, 4, 4), + (date3, 0, 5, 100)).deep assert(edges.collect.deep == expected) } @@ -135,7 +145,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{ WriteGraph.asGraphml(networkrdd, testFile) assert(Files.exists(Paths.get(testFile))) val lines = Source.fromFile(testFile).getLines.toList - assert(lines(testLines._1) == """""") + assert(lines(testLines._1) == xmlDeclaration) assert(lines(testLines._2) == """Source3""") assert(lines(testLines._3) == """3""") assert(lines(testLines._4) == """""") @@ -147,7 +157,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{ WriteGraph.asGraphml(networkrdd, testFile) assert(Files.exists(Paths.get(testFile))) val lines = Source.fromFile(testFile).getLines.toList - assert(lines(testLines._1) == """""") + assert(lines(testLines._1) == xmlDeclaration) assert(lines(testLines._2) == """Destination<3""") assert(lines(testLines._3) == """100""") assert(lines(testLines._4) == """""") @@ -159,7 +169,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{ WriteGraph(networkrdd, testFile2) assert(Files.exists(Paths.get(testFile2))) val lines = Source.fromFile(testFile2).getLines.toList - assert(lines(testLines._1) == """""") + assert(lines(testLines._1) == xmlDeclaration) assert(lines(testLines._2) == """""") assert(lines(testLines._3) == """""") assert(lines(testLines._4) == """""") diff --git a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala index da1cf5bd..8b32d69e 100644 --- a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala +++ b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala @@ -33,6 +33,9 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { private val master = "local[4]" private val appName = "example-df" private var sc: SparkContext = _ + private val url = "url" + private val mime_type = "mime_type" + private val md5 = "md5" before { val conf = new SparkConf() @@ -48,9 +51,9 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { val imageLinks = df.extractImageLinks(arcPath) val images = df.extractImages(arcPath) - val r_1 = validPages.select("url", "mime_type").take(1)(0) - assert(r_1.getAs[String]("url") == "http://www.archive.org/") - assert(r_1.getAs[String]("mime_type") == "text/html") + val r_1 = validPages.select(url, mime_type).take(1)(0) + assert(r_1.getAs[String](url) == "http://www.archive.org/") + assert(r_1.getAs[String](mime_type) == "text/html") val r_2 = hyperlinks.select("Dest", "Anchor").take(3)(2) assert(r_2(0) == "http://web.archive.org/collections/web/advanced.html") @@ -61,8 +64,8 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { assert(r_3.get(1) == "http://www.archive.org/images/star.png") val r_4 = images.take(1)(0) - assert(r_4.getAs[String]("url") == "http://www.archive.org/images/logoc.jpg") - assert(r_4.getAs[String]("md5") == "8211d1fbb9b03d8522a1ae378f9d1b24") + assert(r_4.getAs[String](url) == "http://www.archive.org/images/logoc.jpg") + assert(r_4.getAs[String](md5) == "8211d1fbb9b03d8522a1ae378f9d1b24") } after { diff --git a/src/test/scala/io/archivesunleashed/df/SaveImageTest.scala b/src/test/scala/io/archivesunleashed/df/SaveImageTest.scala index c38ddc50..0db9fccb 100644 --- a/src/test/scala/io/archivesunleashed/df/SaveImageTest.scala +++ b/src/test/scala/io/archivesunleashed/df/SaveImageTest.scala @@ -43,6 +43,7 @@ class SaveImageTest extends FunSuite with BeforeAndAfter { private val master = "local[4]" private val appName = "example-df" private var sc: SparkContext = _ + private val testString = "bytes" before { val conf = new SparkConf() @@ -52,11 +53,10 @@ class SaveImageTest extends FunSuite with BeforeAndAfter { } test("Save image") { - val testString = "bytes" val df = RecordLoader.loadArchives(arcPath, sc) .extractImageDetailsDF() - val extracted = df.select("bytes") + val extracted = df.select(testString) .orderBy(desc(testString)).limit(1) extracted.saveToDisk(testString, "/tmp/foo") @@ -97,7 +97,7 @@ class SaveImageTest extends FunSuite with BeforeAndAfter { // scalastyle:on val df = Seq(dummyImg).toDF - df.saveToDisk("bytes", "/tmp/bar") + df.saveToDisk(testString, "/tmp/bar") // Check that no file was written. assert(new File("/tmp").listFiles.filter(_.isFile).toList diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala index d365291c..76ce1121 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala @@ -25,17 +25,20 @@ import org.scalatest.junit.JUnitRunner class ExtractDomainTest extends FunSuite { private val index = "index.html" private val umiacs = "www.umiacs.umd.edu" + private val jimmylin = "http://www.umiacs.umd.edu/~jimmylin/" + private val lintool = "https://github.com/lintool" + private val github = "github.com" private val data1: Seq[(String, String)] = Seq.newBuilder.+=( - ("http://www.umiacs.umd.edu/~jimmylin/", umiacs), - ("https://github.com/lintool", "github.com"), + (jimmylin, umiacs), + (lintool, github), ("http://ianmilligan.ca/2015/05/04/iipc-2015-slides-for-warcs-wats-and-wgets-presentation/", "ianmilligan.ca"), (index, "")).result() private val data2 = Seq.newBuilder.+=( - (index, "http://www.umiacs.umd.edu/~jimmylin/", umiacs), - ("https://github.com/lintool", "http://www.umiacs.umd.edu/~jimmylin/", "github.com"), - (index, "https://github.com/lintool", "github.com")).result() + (index, jimmylin, umiacs), + (lintool, jimmylin, github), + (index, lintool, github)).result() private val data3 = Seq.newBuilder.+=( ("http://www.seetorontonow.canada-booknow.com\\booking_results.php", "www.seetorontonow.canada-booknow.com")).result()