Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More scalastyle work; addresses #196. #339

Merged
merged 1 commit into from
Aug 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions config/checkstyle/scalastyle_config.xml
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,15 @@
<parameter name="maxParameters"><![CDATA[8]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.MagicNumberChecker" level="warning" enabled="true">
<check class="org.scalastyle.scalariform.MagicNumberChecker" level="warning" enabled="false">
<parameters>
<parameter name="ignore"><![CDATA[-1,0,1,2,3]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.ReturnChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NullChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NullChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.NoCloneChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NoFinalizeChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.CovariantEqualsChecker" level="warning" enabled="true"></check>
Expand Down
4 changes: 1 addition & 3 deletions src/main/scala/io/archivesunleashed/DataFrameLoader.scala
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,7 @@
package io.archivesunleashed

import org.apache.spark.SparkContext
// scalastyle:off underscore.import
import org.apache.spark.sql._
// scalastyle:on underscore.import
import org.apache.spark.sql.DataFrame

class DataFrameLoader(sc: SparkContext) {
def extractValidPages(path: String): DataFrame = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@
*/
package io.archivesunleashed.app

// scalastyle:off underscore.import
import io.archivesunleashed._
// scalastyle:on underscore.import
import io.archivesunleashed.RecordLoader
import io.archivesunleashed.matchbox.{NERClassifier, RemoveHTML}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
Expand Down
6 changes: 1 addition & 5 deletions src/main/scala/io/archivesunleashed/app/ExtractGraphX.scala
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,7 @@
*/
package io.archivesunleashed.app

// scalastyle:off underscore.import
import io.archivesunleashed._
import io.archivesunleashed.matchbox
import org.apache.spark.graphx._
// scalastyle:on underscore.import
import org.apache.spark.graphx.{Edge, Graph, PartitionStrategy, VertexId}
import org.apache.spark.rdd.RDD

/** Extracts a site link structure using Spark's GraphX utility. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@
*/
package io.archivesunleashed.app

// scalastyle:off underscore.import
import io.archivesunleashed._
// scalastyle:on underscore.import
import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.matchbox.{ComputeImageSize, ComputeMD5}
import org.apache.spark.rdd.RDD
import org.apache.spark.{RangePartitioner, SparkContext}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,7 @@ import java.io.{BufferedReader, BufferedWriter, InputStreamReader, OutputStreamW
import io.archivesunleashed.matchbox.NERClassifier
import io.archivesunleashed.util.JsonUtils
import org.apache.hadoop.conf.Configuration
// scalastyle:off underscore.import
import org.apache.hadoop.fs._
// scalastyle:on underscore.import
import org.apache.hadoop.fs.{FileUtil, FileSystem, Path}
import org.apache.spark.SparkContext

import scala.collection.mutable.MutableList
Expand Down
4 changes: 1 addition & 3 deletions src/main/scala/io/archivesunleashed/app/WriteGEXF.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@
* limitations under the License.
*/
package io.archivesunleashed.app
// scalastyle:off underscore.import
import io.archivesunleashed.matchbox._
// scalastyle:on underscore.import
import io.archivesunleashed.matchbox.{ComputeMD5, WWWLink}
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import org.apache.spark.rdd.RDD
Expand Down
4 changes: 1 addition & 3 deletions src/main/scala/io/archivesunleashed/app/WriteGraph.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@
* limitations under the License.
*/
package io.archivesunleashed.app
// scalastyle:off underscore.import
import io.archivesunleashed.matchbox._
// scalastyle:on underscore.import
import io.archivesunleashed.matchbox.{ComputeMD5, WWWLink}
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import org.apache.spark.rdd.RDD
Expand Down
4 changes: 1 addition & 3 deletions src/main/scala/io/archivesunleashed/app/WriteGraphML.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@
* limitations under the License.
*/
package io.archivesunleashed.app
// scalastyle:off underscore.import
import io.archivesunleashed.matchbox._
// scalastyle:on underscore.import
import io.archivesunleashed.matchbox.{ComputeMD5, WWWLink}
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import org.apache.spark.rdd.RDD
Expand Down
5 changes: 1 addition & 4 deletions src/main/scala/io/archivesunleashed/app/WriteGraphXML.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,7 @@
* limitations under the License.
*/
package io.archivesunleashed.app
// scalastyle:off underscore.import
import io.archivesunleashed.matchbox._
import org.apache.spark.graphx._
// scalastyle:on underscore.import
import org.apache.spark.graphx.Graph
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import io.archivesunleashed.app.ExtractGraphX.{VertexData,EdgeData,VertexDataPR}
Expand Down
4 changes: 1 addition & 3 deletions src/main/scala/io/archivesunleashed/df/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,7 @@

package io.archivesunleashed

// scalastyle:off underscore.import
import io.archivesunleashed.matchbox._
// scalastyle:on underscore.import
import io.archivesunleashed.matchbox.{ComputeMD5, ExtractDomain, RemoveHTML}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.DataFrame
import java.io.ByteArrayInputStream
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@ object ExtractDate {
type DateComponent = Value
val YYYY, MM, DD, YYYYMM, YYYYMMDD = Value
}
// scalastyle:off underscore.import
import DateComponent._
// scalastyle:on underscore.import

import DateComponent.{DateComponent, DD, MM, YYYY, YYYYMM}

/** Extracts the wanted date component from a date.
*
Expand Down
4 changes: 1 addition & 3 deletions src/main/scala/io/archivesunleashed/matchbox/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,7 @@ package io.archivesunleashed

import java.io.IOException
import java.security.MessageDigest
// scalastyle:off underscore.import
import scala.xml.Utility._
// scalastyle:on underscore.import
import scala.xml.Utility.escape


/** Package object which supplies implicits providing common UDF-related functionalities. */
Expand Down
8 changes: 3 additions & 5 deletions src/main/scala/io/archivesunleashed/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,9 @@ import io.archivesunleashed.matchbox.ImageDetails
import io.archivesunleashed.matchbox.ExtractDate.DateComponent
import java.net.URI
import org.apache.hadoop.fs.{FileSystem, Path}
// scalastyle:off underscore.import
import io.archivesunleashed.matchbox.ExtractDate.DateComponent._
import org.apache.spark.sql._
import org.apache.spark.sql.types._
// scalastyle:on underscore.import
import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.hadoop.io.LongWritable
import org.apache.spark.{SerializableWritable, SparkContext}
import org.apache.spark.rdd.RDD
Expand Down
35 changes: 23 additions & 12 deletions src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
private val master = "local[4]"
private val appName = "example-spark"
private var sc: SparkContext = _
private val exampleArc = "example.arc.gz"
private val exampleWarc = "example.warc.gz"
private val exampleDate = "20080430"
private val exampleUrl = "www.archive.org"
private val exampleStatusCode1 = "000"
private val exampleStatusCode2 = "200"
private val exampleMimeType = "text/plain"

before {
val conf = new SparkConf()
Expand All @@ -51,28 +58,28 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
.take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => FilenameUtils.getName(x.getArchiveFilename)).take(3)
assert(textSampleArc.deep == Array("example.arc.gz",
"example.arc.gz", "example.arc.gz").deep)
assert(textSampleWarc.deep == Array("example.warc.gz",
"example.warc.gz", "example.warc.gz").deep)
assert(textSampleArc.deep == Array(exampleArc,
exampleArc, exampleArc).deep)
assert(textSampleWarc.deep == Array(exampleWarc,
exampleWarc, exampleWarc).deep)
}

test("Crawl Dates") {
val textSampleArc = RecordLoader.loadArchives(arcPath, sc)
.map(x => x.getCrawlDate).take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => x.getCrawlDate).take(3)
assert(textSampleArc.deep == Array("20080430", "20080430", "20080430").deep)
assert(textSampleWarc.deep == Array("20080430", "20080430", "20080430").deep)
assert(textSampleArc.deep == Array(exampleDate, exampleDate, exampleDate).deep)
assert(textSampleWarc.deep == Array(exampleDate, exampleDate, exampleDate).deep)
}

test("Domains") {
val textSampleArc = RecordLoader.loadArchives(arcPath, sc)
.map(x => x.getDomain).take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => x.getDomain).take(3)
assert(textSampleArc.deep == Array("", "", "www.archive.org").deep)
assert(textSampleWarc.deep == Array("", "www.archive.org", "www.archive.org").deep)
assert(textSampleArc.deep == Array("", "", exampleUrl).deep)
assert(textSampleWarc.deep == Array("", exampleUrl, exampleUrl).deep)
}

test("Urls") {
Expand All @@ -91,17 +98,21 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
.map(x => x.getMimeType).take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => x.getMimeType).take(3)
assert (textSampleArc.deep == Array ("text/plain", "text/dns", "text/plain").deep)
assert (textSampleWarc.deep == Array("unknown", "text/plain", "text/html").deep)
assert (textSampleArc.deep == Array (exampleMimeType, "text/dns",
exampleMimeType).deep)
assert (textSampleWarc.deep == Array("unknown", exampleMimeType,
"text/html").deep)
}

test("Get Http Status") {
val textSampleArc = RecordLoader.loadArchives(arcPath, sc)
.map(x => x.getHttpStatus).take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => x.getHttpStatus).take(3)
assert (textSampleArc.deep == Array("000", "000", "200").deep)
assert (textSampleWarc.deep == Array("000", "200", "200").deep)
assert (textSampleArc.deep == Array(exampleStatusCode1, exampleStatusCode1,
exampleStatusCode2).deep)
assert (textSampleWarc.deep == Array(exampleStatusCode1, exampleStatusCode2,
exampleStatusCode2).deep)
}

after {
Expand Down
54 changes: 32 additions & 22 deletions src/test/scala/io/archivesunleashed/app/WriteGraphTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,28 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
private var sc: SparkContext = _
private val master = "local[4]"
private val appName = "example-spark"
private val network = Seq((("Date1", "Source1", "Destination1"), 3),
(("Date2", "Source2", "Destination2"), 4),
(("Date3", "Source3", "Destination3"), 100))
private val unescapedNetwork = Seq((("Date1", "Source1", "Destination1"), 3),
(("Date2", "Source2", "Destination2"), 4),
(("Date3", "Source<3", "Destination<3"), 100))
private val networkDf = Seq(("Date1", "Source1", "Destination1", 3),
("Date2", "Source2", "Destination2", 4),
("Date3", "Source3", "Destination3", 100))
private val networkWithDuplication = Seq((("Date1", "Source1", "Destination1"), 3),
(("Date2", "Source2", "Source2"), 4),
(("Date3", "Source3", "Destination3"), 100))
private val date1 = "Date1"
private val date2 = "Date2"
private val date3 = "Date3"
private val source1 = "Source1"
private val source2 = "Source2"
private val source3 = "Source3"
private val destination1 = "Destination1"
private val destination2 = "Destination2"
private val destination3 = "Destination3"
private val xmlDeclaration = """<?xml version="1.0" encoding="UTF-8"?>"""
private val network = Seq(((date1, source1, destination1), 3),
((date2, source2, destination2), 4),
((date3, source3, destination3), 100))
private val unescapedNetwork = Seq(((date1, source1, destination1), 3),
((date2, source2, destination2), 4),
((date3, "Source<3", "Destination<3"), 100))
private val networkDf = Seq((date1, source1, destination1, 3),
(date2, source2, destination2, 4),
(date3, source3, destination3, 100))
private val networkWithDuplication = Seq(((date1, source1, destination1), 3),
((date2, source2, source2), 4),
((date3, source3, destination3), 100))
private val testFile = "temporaryTestFile.txt"
private val testFile2 = "temporaryTestFile2.txt"

Expand All @@ -61,7 +71,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
WriteGraph.asGexf(networkrdd, testFile)
assert(Files.exists(Paths.get(testFile)))
val lines = Source.fromFile(testFile).getLines.toList
assert(lines(testLines._1) == """<?xml version="1.0" encoding="UTF-8"?>""")
assert(lines(testLines._1) == xmlDeclaration)
assert(lines(testLines._2) == """<node id="3" label="Destination1" />""")
assert(lines(testLines._3) == """</attvalues>""")
assert(lines(testLines._4) == """</edges>""")
Expand All @@ -77,7 +87,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
val ret = WriteGraph.asGexf(networkarray, testFile)
assert(ret)
val lines = Source.fromFile(testFile).getLines.toList
assert(lines(testLines._1) == """<?xml version="1.0" encoding="UTF-8"?>""")
assert(lines(testLines._1) == xmlDeclaration)
assert(lines(testLines._2) == """<node id="8d3ab53ec817a1e5bf9ffd6e749b3983" label="Destination2" />""")
assert(lines(testLines._3) == """</attvalues>""")
assert(lines(testLines._4) == """</edges>""")
Expand All @@ -104,7 +114,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
test ("Nodelookup returns a option") {
val networkrdd = sc.parallelize(network)
val nodes = WriteGraph.nodesWithIds(networkrdd)
val lookup = "Source1"
val lookup = source1
val badlookup = "NOTTHERE"
assert (WriteGraph.nodeLookup(nodes, badlookup) == None)
assert (WriteGraph.nodeLookup(nodes, lookup) == Some((lookup, 6)))
Expand All @@ -115,17 +125,17 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
val nodes = WriteGraph.nodesWithIds(sc.parallelize(network))
val empty = -1
val expected = 6
val lookup = WriteGraph.nodeLookup(nodes, "Source1")
val lookup = WriteGraph.nodeLookup(nodes, source1)
val badlookup = WriteGraph.nodeLookup(nodes, "NOTTHERE")
assert (WriteGraph.nodeIdFromLabel(lookup) == expected)
assert (WriteGraph.nodeIdFromLabel(badlookup) == empty)
}

test ("Edge ids are captured from lookup") {
val edges = WriteGraph.edgeNodes(sc.parallelize(network))
val expected = Array(("Date1", 6, 3, 3),
("Date2", 7, 4, 4),
("Date3", 0, 5, 100)).deep
val expected = Array((date1, 6, 3, 3),
(date2, 7, 4, 4),
(date3, 0, 5, 100)).deep
assert(edges.collect.deep == expected)
}

Expand All @@ -135,7 +145,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
WriteGraph.asGraphml(networkrdd, testFile)
assert(Files.exists(Paths.get(testFile)))
val lines = Source.fromFile(testFile).getLines.toList
assert(lines(testLines._1) == """<?xml version="1.0" encoding="UTF-8"?>""")
assert(lines(testLines._1) == xmlDeclaration)
assert(lines(testLines._2) == """<data key="label">Source3</data>""")
assert(lines(testLines._3) == """<data key="weight">3</data>""")
assert(lines(testLines._4) == """<edge source="0" target="5" type="directed">""")
Expand All @@ -147,7 +157,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
WriteGraph.asGraphml(networkrdd, testFile)
assert(Files.exists(Paths.get(testFile)))
val lines = Source.fromFile(testFile).getLines.toList
assert(lines(testLines._1) == """<?xml version="1.0" encoding="UTF-8"?>""")
assert(lines(testLines._1) == xmlDeclaration)
assert(lines(testLines._2) == """<data key="label">Destination&lt;3</data>""")
assert(lines(testLines._3) == """<data key="weight">100</data>""")
assert(lines(testLines._4) == """<edge source="7" target="4" type="directed">""")
Expand All @@ -159,7 +169,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
WriteGraph(networkrdd, testFile2)
assert(Files.exists(Paths.get(testFile2)))
val lines = Source.fromFile(testFile2).getLines.toList
assert(lines(testLines._1) == """<?xml version="1.0" encoding="UTF-8"?>""")
assert(lines(testLines._1) == xmlDeclaration)
assert(lines(testLines._2) == """<node id="3" label="Source&lt;3" />""")
assert(lines(testLines._3) == """<edge source="7" target="4" weight="4" type="directed">""")
assert(lines(testLines._4) == """<attvalue for="0" value="Date2" />""")
Expand Down
13 changes: 8 additions & 5 deletions src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
private val master = "local[4]"
private val appName = "example-df"
private var sc: SparkContext = _
private val url = "url"
private val mime_type = "mime_type"
private val md5 = "md5"

before {
val conf = new SparkConf()
Expand All @@ -48,9 +51,9 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
val imageLinks = df.extractImageLinks(arcPath)
val images = df.extractImages(arcPath)

val r_1 = validPages.select("url", "mime_type").take(1)(0)
assert(r_1.getAs[String]("url") == "http://www.archive.org/")
assert(r_1.getAs[String]("mime_type") == "text/html")
val r_1 = validPages.select(url, mime_type).take(1)(0)
assert(r_1.getAs[String](url) == "http://www.archive.org/")
assert(r_1.getAs[String](mime_type) == "text/html")

val r_2 = hyperlinks.select("Dest", "Anchor").take(3)(2)
assert(r_2(0) == "http://web.archive.org/collections/web/advanced.html")
Expand All @@ -61,8 +64,8 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
assert(r_3.get(1) == "http://www.archive.org/images/star.png")

val r_4 = images.take(1)(0)
assert(r_4.getAs[String]("url") == "http://www.archive.org/images/logoc.jpg")
assert(r_4.getAs[String]("md5") == "8211d1fbb9b03d8522a1ae378f9d1b24")
assert(r_4.getAs[String](url) == "http://www.archive.org/images/logoc.jpg")
assert(r_4.getAs[String](md5) == "8211d1fbb9b03d8522a1ae378f9d1b24")
}

after {
Expand Down
Loading