Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update keepValidPages to include a filter on 200 OK. #360

Merged
merged 1 commit into from
Sep 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions src/main/scala/io/archivesunleashed/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -95,17 +95,20 @@ package object archivesunleashed {
|| r.getMimeType == "application/xhtml+xml"
|| r.getUrl.toLowerCase.endsWith("htm")
|| r.getUrl.toLowerCase.endsWith("html"))
- && !r.getUrl.toLowerCase.endsWith("robots.txt"))
+ && !r.getUrl.toLowerCase.endsWith("robots.txt")
+ && r.getHttpStatus == "200")
}

def extractValidPagesDF(): DataFrame = {
val records = rdd.keepValidPages()
- .map(r => Row(r.getCrawlDate, r.getUrl, r.getMimeType, r.getContentString))
+ .map(r => Row(r.getCrawlDate, r.getUrl, r.getMimeType,
+ DetectMimeTypeTika(r.getBinaryBytes), r.getContentString))

val schema = new StructType()
.add(StructField("crawl_date", StringType, true))
.add(StructField("url", StringType, true))
.add(StructField("mime_type_web_server", StringType, true))
+ .add(StructField("mime_type_tika", StringType, true))
.add(StructField("content", StringType, true))

val sqlContext = SparkSession.builder()
Expand All @@ -115,7 +118,8 @@ package object archivesunleashed {
def extractHyperlinksDF(): DataFrame = {
val records = rdd
.keepValidPages()
- .flatMap(r => ExtractLinks(r.getUrl, r.getContentString).map(t => (r.getCrawlDate, t._1, t._2, t._3)))
+ .flatMap(r => ExtractLinks(r.getUrl, r.getContentString)
+ .map(t => (r.getCrawlDate, t._1, t._2, t._3)))
.map(t => Row(t._1, t._2, t._3, t._4))

val schema = new StructType()
Expand Down
15 changes: 7 additions & 8 deletions src/test/scala/io/archivesunleashed/RecordRDDTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
assert (r2.sameElements(r)) }

test ("keep http status codes") {
- val expected = 129
+ val expected = 94
val base = RecordLoader.loadArchives(arcPath, sc)
.keepValidPages()
val statusCodes: Set[String] = Set ("200", "404")
Expand All @@ -98,7 +98,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
}

test ("check for domains") {
- val expected = 132
+ val expected = 91
val base2 = RecordLoader.loadArchives(arcPath, sc)
.keepValidPages()
val urls: Set[String] = Set("www.archive.org", "www.sloan.org")
Expand Down Expand Up @@ -185,7 +185,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
}

test ("discard urls") {
- val expected = 135
+ val expected = 94
val base = RecordLoader.loadArchives(arcPath, sc)
.keepValidPages()
val urls: Set[String] = Set (sloan)
Expand All @@ -194,7 +194,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
}

test ("discard url patterns") {
- val expected = 134
+ val expected = 93
val base = RecordLoader.loadArchives(arcPath, sc)
.keepValidPages()
val urls = Set (archive.r, sloan.r, "".r)
Expand All @@ -203,16 +203,15 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
}

test ("discard http status codes") {
- val expected = 6
+ val expected = 46
val base = RecordLoader.loadArchives(arcPath, sc)
- .keepValidPages()
val statusCodes: Set[String] = Set ("200", "404")
val r2 = base.discardHttpStatus(statusCodes).count
assert (r2 == expected)
}

test ("discard domains") {
- val expected = 135
+ val expected = 94
val base = RecordLoader.loadArchives(arcPath, sc)
.keepValidPages()
val urls: Set[String] = Set ("www.sloan.org")
Expand All @@ -221,7 +220,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
}

test ("discard content") {
- val expected = 134
+ val expected = 93
val base = RecordLoader.loadArchives(arcPath, sc)
.keepValidPages()
val regno = Set(regex, raw"UNINTELLIBLEDFSJKLS".r)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,20 +49,20 @@ class DomainFrequencyExtractorTest extends FunSuite with BeforeAndAfter {
// +------------------+-----+
// | Domain|count|
// +------------------+-----+
- // | www.archive.org| 132|
+ // | www.archive.org| 91|
// | deadlists.com| 2|
// |www.hideout.com.br| 1|
// +------------------+-----+

assert(dfResults(0).get(0) == "www.archive.org")
- assert(dfResults(0).get(1) == 132)
+ assert(dfResults(0).get(1) == 91)
assert(dfResults(1).get(0) == "deadlists.com")
assert(dfResults(1).get(1) == 2)
assert(dfResults(2).get(0) == "www.hideout.com.br")
assert(dfResults(2).get(1) == 1)

assert(rddResults(0)._1 == "www.archive.org")
- assert(rddResults(0)._2 == 132)
+ assert(rddResults(0)._2 == 91)
assert(rddResults(1)._1 == "deadlists.com")
assert(rddResults(1)._2 == 2)
assert(rddResults(2)._1 == "www.hideout.com.br")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class DomainGraphExtractorDfTest extends FunSuite with BeforeAndAfter {

test("DomainGraphExtractor") {
val TESTLENGTH = 166
- val TESTRESULT = 316
+ val TESTRESULT = 280
val df = RecordLoader.loadArchives(arcPath, sc).extractHyperlinksDF()
val dfResult = DomainGraphExtractor(df).collect()
assert(dfResult.length == TESTLENGTH)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class DomainGraphExtractorTest extends FunSuite with BeforeAndAfter {
assert(rddResult(0)._1._1 == "20080430")
assert(rddResult(0)._1._2 == "www.archive.org")
assert(rddResult(0)._1._3 == "www.archive.org")
- assert(rddResult(0)._2 == 305)
+ assert(rddResult(0)._2 == 269)
}

after {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class PlainTextExtractorTest extends FunSuite with BeforeAndAfter {
val df = RecordLoader.loadArchives(arcPath, sc).extractValidPagesDF()
val rddResults = PlainTextExtractor(rdd).collect()
val dfResults = PlainTextExtractor(df).collect()
- val RESULTSLENGTH = 135
+ val RESULTSLENGTH = 94

assert(rddResults.length == RESULTSLENGTH)
assert(rddResults(0)._1 == "20080430")
Expand Down
4 changes: 2 additions & 2 deletions src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,13 @@ class SimpleDfTest extends FunSuite with BeforeAndAfter {
// +------------------+-----+
// | Domain|count|
// +------------------+-----+
- // | www.archive.org| 132|
+ // | www.archive.org| 91|
// | deadlists.com| 2|
// |www.hideout.com.br| 1|
// +------------------+-----+

assert(results(0).get(0) == "www.archive.org")
- assert(results(0).get(1) == 132)
+ assert(results(0).get(1) == 91)

assert(results(1).get(0) == "deadlists.com")
assert(results(1).get(1) == 2)
Expand Down