diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index 19370fa7..739252d4 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -95,17 +95,20 @@ package object archivesunleashed { || r.getMimeType == "application/xhtml+xml" || r.getUrl.toLowerCase.endsWith("htm") || r.getUrl.toLowerCase.endsWith("html")) - && !r.getUrl.toLowerCase.endsWith("robots.txt")) + && !r.getUrl.toLowerCase.endsWith("robots.txt") + && r.getHttpStatus == "200") } def extractValidPagesDF(): DataFrame = { val records = rdd.keepValidPages() - .map(r => Row(r.getCrawlDate, r.getUrl, r.getMimeType, r.getContentString)) + .map(r => Row(r.getCrawlDate, r.getUrl, r.getMimeType, + DetectMimeTypeTika(r.getBinaryBytes), r.getContentString)) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("mime_type_web_server", StringType, true)) + .add(StructField("mime_type_tika", StringType, true)) .add(StructField("content", StringType, true)) val sqlContext = SparkSession.builder() @@ -115,7 +118,8 @@ package object archivesunleashed { def extractHyperlinksDF(): DataFrame = { val records = rdd .keepValidPages() - .flatMap(r => ExtractLinks(r.getUrl, r.getContentString).map(t => (r.getCrawlDate, t._1, t._2, t._3))) + .flatMap(r => ExtractLinks(r.getUrl, r.getContentString) + .map(t => (r.getCrawlDate, t._1, t._2, t._3))) .map(t => Row(t._1, t._2, t._3, t._4)) val schema = new StructType() diff --git a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala index 9ab5d378..94d25052 100644 --- a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala @@ -71,7 +71,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { assert (r2.sameElements(r)) } test ("keep http status codes") { - val expected = 129 + val expected = 94 val base = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val statusCodes: Set[String] = Set ("200", "404") @@ -98,7 +98,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test ("check for domains") { - val expected = 132 + val expected = 91 val base2 = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val urls: Set[String] = Set("www.archive.org", "www.sloan.org") @@ -185,7 +185,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test ("discard urls") { - val expected = 135 + val expected = 94 val base = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val urls: Set[String] = Set (sloan) @@ -194,7 +194,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test ("discard url patterns") { - val expected = 134 + val expected = 93 val base = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val urls = Set (archive.r, sloan.r, "".r) @@ -203,16 +203,15 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test ("discard http status codes") { - val expected = 6 + val expected = 46 val base = RecordLoader.loadArchives(arcPath, sc) - .keepValidPages() val statusCodes: Set[String] = Set ("200", "404") val r2 = base.discardHttpStatus(statusCodes).count assert (r2 == expected) } test ("discard domains") { - val expected = 135 + val expected = 94 val base = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val urls: Set[String] = Set ("www.sloan.org") @@ -221,7 +220,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test ("discard content") { - val expected = 134 + val expected = 93 val base = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val regno = Set(regex, raw"UNINTELLIBLEDFSJKLS".r) diff --git a/src/test/scala/io/archivesunleashed/app/DomainFrequencyExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/DomainFrequencyExtractorTest.scala index 4a30ff59..473d81b4 100644 --- a/src/test/scala/io/archivesunleashed/app/DomainFrequencyExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/DomainFrequencyExtractorTest.scala @@ -49,20 +49,20 @@ class DomainFrequencyExtractorTest extends FunSuite with BeforeAndAfter { // +------------------+-----+ // | Domain|count| // +------------------+-----+ - // | www.archive.org| 132| + // | www.archive.org| 91| // | deadlists.com| 2| // |www.hideout.com.br| 1| // +------------------+-----+ assert(dfResults(0).get(0) == "www.archive.org") - assert(dfResults(0).get(1) == 132) + assert(dfResults(0).get(1) == 91) assert(dfResults(1).get(0) == "deadlists.com") assert(dfResults(1).get(1) == 2) assert(dfResults(2).get(0) == "www.hideout.com.br") assert(dfResults(2).get(1) == 1) assert(rddResults(0)._1 == "www.archive.org") - assert(rddResults(0)._2 == 132) + assert(rddResults(0)._2 == 91) assert(rddResults(1)._1 == "deadlists.com") assert(rddResults(1)._2 == 2) assert(rddResults(2)._1 == "www.hideout.com.br") diff --git a/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorDfTest.scala b/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorDfTest.scala index 5b3065fb..d5151b58 100644 --- a/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorDfTest.scala +++ b/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorDfTest.scala @@ -40,7 +40,7 @@ class DomainGraphExtractorDfTest extends FunSuite with BeforeAndAfter { test("DomainGraphExtractor") { val TESTLENGTH = 166 - val TESTRESULT = 316 + val TESTRESULT = 280 val df = RecordLoader.loadArchives(arcPath, sc).extractHyperlinksDF() val dfResult = DomainGraphExtractor(df).collect() assert(dfResult.length == TESTLENGTH) diff --git a/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala index f50c5c34..76402b81 100644 --- a/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala @@ -47,7 +47,7 @@ class DomainGraphExtractorTest extends FunSuite with BeforeAndAfter { assert(rddResult(0)._1._1 == "20080430") assert(rddResult(0)._1._2 == "www.archive.org") assert(rddResult(0)._1._3 == "www.archive.org") - assert(rddResult(0)._2 == 305) + assert(rddResult(0)._2 == 269) } after { diff --git a/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala index 8f00bb36..e5ac5e9c 100644 --- a/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala @@ -43,7 +43,7 @@ class PlainTextExtractorTest extends FunSuite with BeforeAndAfter { val df = RecordLoader.loadArchives(arcPath, sc).extractValidPagesDF() val rddResults = PlainTextExtractor(rdd).collect() val dfResults = PlainTextExtractor(df).collect() - val RESULTSLENGTH = 135 + val RESULTSLENGTH = 94 assert(rddResults.length == RESULTSLENGTH) assert(rddResults(0)._1 == "20080430") diff --git a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala index f587399a..469b9c05 100644 --- a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala +++ b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala @@ -58,13 +58,13 @@ class SimpleDfTest extends FunSuite with BeforeAndAfter { // +------------------+-----+ // | Domain|count| // +------------------+-----+ - // | www.archive.org| 132| + // | www.archive.org| 91| // | deadlists.com| 2| // |www.hideout.com.br| 1| // +------------------+-----+ assert(results(0).get(0) == "www.archive.org") - assert(results(0).get(1) == 132) + assert(results(0).get(1) == 91) assert(results(1).get(0) == "deadlists.com") assert(results(1).get(1) == 2)