
Filter out filedesc and dns records from arcs. (#517)
- Resolves #516
- Add a removeFiledesc method, and apply it in all(), webpages(), webgraph(), and imagegraph() (see the sketch below)
- Update tests
ruebot authored May 12, 2021
1 parent 5cb0665 commit a6d3265
Showing 3 changed files with 26 additions and 13 deletions.
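
In practice the change is transparent to users of the DataFrame API: the ARC's filedesc:// header record and dns: lookup records are dropped before rows are built. A minimal sketch of the effect (the archive path and SparkContext sc are placeholders, and the http_status_code column name is an assumption based on the schema region of package.scala, which is only partly visible in this diff):

    import io.archivesunleashed._

    // After this commit, all() no longer yields rows for the ARC's
    // filedesc:// header record or for dns: lookup records.
    val pages = RecordLoader
      .loadArchives("/path/to/example.arc.gz", sc)
      .all()

    // The first row is now a real HTTP response rather than the ARC
    // header (see the updated expectations in the tests below).
    pages.select("url", "http_status_code").show(1, false)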
35 changes: 24 additions & 11 deletions src/main/scala/io/archivesunleashed/package.scala
@@ -165,18 +165,20 @@ package object archivesunleashed {
   /* Creates a column for Bytes as well in Dataframe.
   Call KeepImages OR KeepValidPages on RDD depending upon the requirement before calling this method */
   def all(): DataFrame = {
-    val records = rdd.map(r =>
-      Row(
-        r.getCrawlDate,
-        r.getUrl,
-        r.getMimeType,
-        DetectMimeTypeTika(r.getBinaryBytes),
-        r.getContentString,
-        r.getBinaryBytes,
-        r.getHttpStatus,
-        r.getArchiveFilename
+    val records = rdd
+      .removeFiledesc()
+      .map(r =>
+        Row(
+          r.getCrawlDate,
+          r.getUrl,
+          r.getMimeType,
+          DetectMimeTypeTika(r.getBinaryBytes),
+          r.getContentString,
+          r.getBinaryBytes,
+          r.getHttpStatus,
+          r.getArchiveFilename
+        )
       )
-    )
 
     val schema = new StructType()
       .add(StructField("crawl_date", StringType, true))
@@ -192,6 +194,14 @@
     sqlContext.getOrCreate().createDataFrame(records, schema)
   }
 
+  /** Filters out filedesc:// and dns: records. */
+  def removeFiledesc(): RDD[ArchiveRecord] = {
+    rdd.filter(r =>
+      !r.getUrl.toLowerCase.startsWith("filedesc:")
+        && !r.getUrl.toLowerCase.startsWith("dns:")
+    )
+  }
+
   /** Removes all non-html-based data (images, executables, etc.) from html text. */
   def keepValidPages(): RDD[ArchiveRecord] = {
     rdd.filter(r =>
@@ -208,6 +218,7 @@
   /** Extracts webpages with columns for crawl date, url, MIME type, and content. */
   def webpages(): DataFrame = {
     val records = rdd
+      .removeFiledesc()
       .keepValidPages()
       .map(r =>
         Row(
@@ -235,6 +246,7 @@
   /** Extracts a webgraph with columns for crawl date, source url, destination url, and anchor text. */
   def webgraph(): DataFrame = {
     val records = rdd
+      .removeFiledesc()
       .keepValidPages()
       .flatMap(r =>
         ExtractLinks(r.getUrl, r.getContentString)
@@ -256,6 +268,7 @@
   /* Extracts all the image links from a source page. */
   def imagegraph(): DataFrame = {
     val records = rdd
+      .removeFiledesc()
      .keepValidPages()
      .flatMap(r =>
        ExtractImageLinks(r.getUrl, r.getContentString)
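
Because removeFiledesc() lives in the same implicit wrapper over RDD[ArchiveRecord] as keepValidPages(), it can also be chained explicitly when working at the RDD level, just as webpages() now does internally. A minimal sketch under the same assumptions (placeholder archive path, existing SparkContext sc):

    import io.archivesunleashed._

    val urls = RecordLoader
      .loadArchives("/path/to/example.arc.gz", sc)
      .removeFiledesc()  // drop filedesc:// and dns: records
      .keepValidPages()  // then keep only HTML pages, as webpages() does
      .map(r => r.getUrl)

    urls.take(3).foreach(println)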
2 changes: 1 addition & 1 deletion src/test/scala/io/archivesunleashed/RecordDFTest.scala
@@ -72,7 +72,7 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
     import spark.implicits._
     // scalastyle:on
 
-    val expected = "000"
+    val expected = "200"
     val base = RecordLoader
       .loadArchives(arcPath, sc)
       .all()
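
The expected value flips from "000" to "200" because the first row of all() previously came from the filedesc:// ARC header, which carries no real HTTP response and is reported with the "000" placeholder; with the filter in place, the first row is an actual fetch. A hedged spot-check of the same behavior (the http_status_code column name is an assumption; it is not visible in this excerpt):

    RecordLoader
      .loadArchives(arcPath, sc)
      .all()
      .select("http_status_code")
      .show(1) // expect 200 after this change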
@@ -124,7 +124,7 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
 
     val r_11 = all.select(url, mime_type).take(1)(0)
     assert(
-      r_11.getAs[String](url) == "filedesc://IAH-20080430204825-00000-blackbook.arc"
+      r_11.getAs[String](url) == "http://www.archive.org/robots.txt"
     )
     assert(r_11.getAs[String](mime_type) == "text/plain")
   }
