Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add discardLanguage filter to RecordLoader. #353

Merged
merged 2 commits into from
Aug 23, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 20 additions & 12 deletions src/main/scala/io/archivesunleashed/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,7 @@ package object archivesunleashed {
&& DetectMimeTypeTika(r.getBinaryBytes).startsWith("image/"))
}

/** Removes all data but selected mimeTypes specified in ArchiveRecord.
/** Removes all data but selected mimeTypes specified.
*
* @param mimeTypes a list of Mime Types
*/
Expand All @@ -488,15 +488,15 @@ package object archivesunleashed {
rdd.filter(r => mimeTypes.contains(DetectMimeTypeTika(r.getBinaryBytes)))
}

/** Removes all data that does not have selected status codes.
/** Removes all data that does not have selected HTTP status codes.
*
* @param statusCodes a list of HTTP status codes
*/
def keepHttpStatus(statusCodes: Set[String]): RDD[ArchiveRecord] = {
  // A record survives only when its HTTP status is a member of statusCodes.
  rdd.filter { record =>
    val status = record.getHttpStatus
    statusCodes.contains(status)
  }
}

/** Removes all data that does not have selected data.
/** Removes all data that does not have selected date.
*
* @param dates a list of dates
* @param component the selected DateComponent enum value
Expand All @@ -513,7 +513,7 @@ package object archivesunleashed {
rdd.filter(r => urls.contains(r.getUrl))
}

/** Removes all data but selected url patterns.
/** Removes all data but selected URL patterns.
*
* @param urlREs a list of regular expressions
*/
Expand Down Expand Up @@ -555,47 +555,47 @@ package object archivesunleashed {
}).exists(identity))
}

/** Filters ArchiveRecord MimeTypes from RDDs.
/** Filters ArchiveRecord MimeTypes (web server).
*
* @param mimeTypes a list of Mime Types
*/
def discardMimeTypes(mimeTypes: Set[String]): RDD[ArchiveRecord] = {
  // Drop every record whose getMimeType value is a member of mimeTypes.
  rdd.filter { record =>
    val recordMime = record.getMimeType
    !mimeTypes.contains(recordMime)
  }
}

/** Filters detected MimeTypes from RDDs.
/** Filters detected MimeTypes (Tika).
*
* @param mimeTypes a list of Mime Types
*/
def discardMimeTypesTika(mimeTypes: Set[String]): RDD[ArchiveRecord] = {
  // The MIME type is detected from the record's binary bytes via DetectMimeTypeTika;
  // records whose detected type is a member of mimeTypes are dropped.
  rdd.filter { record =>
    val detectedMime = DetectMimeTypeTika(record.getBinaryBytes)
    !mimeTypes.contains(detectedMime)
  }
}

/** Filters detected dates from RDDs.
/** Filters detected dates.
*
* @param date a single date; records whose crawl date equals it are discarded
*/
def discardDate(date: String): RDD[ArchiveRecord] = {
  // Keep a record only when its crawl date differs from the given date.
  rdd.filter { record =>
    val crawlDate = record.getCrawlDate
    crawlDate != date
  }
}

/** Filters detected urls from RDDs.
/** Filters detected URLs.
*
* @param urls a list of urls
*/
def discardUrls(urls: Set[String]): RDD[ArchiveRecord] = {
  // Drop every record whose URL is a member of urls.
  rdd.filter { record =>
    val recordUrl = record.getUrl
    !urls.contains(recordUrl)
  }
}

/** Filters detected status codes from RDDs.
/** Filters detected HTTP status codes.
*
* @param statusCodes a list of HTTP status codes
*/
def discardHttpStatus(statusCodes: Set[String]): RDD[ArchiveRecord] = {
  // Drop every record whose HTTP status is a member of statusCodes.
  rdd.filter { record =>
    val status = record.getHttpStatus
    !statusCodes.contains(status)
  }
}

/** Filters detected url patterns from RDDs.
/** Filters detected URL patterns (regex).
*
* @param urlREs a list of Regular expressions
*/
Expand All @@ -608,15 +608,15 @@ package object archivesunleashed {
}).exists(identity))
}

/** Filters detected domains (regex) from RDDs.
/** Filters detected domains (exact match against the given set).
*
* @param urls a set of domains; records whose domain is in this set are discarded
*/
def discardDomains(urls: Set[String]): RDD[ArchiveRecord] = {
  // Membership test on the record's domain (set lookup, not a regex match).
  rdd.filter { record =>
    val recordDomain = record.getDomain
    !urls.contains(recordDomain)
  }
}

/** Filters detected content (regex) from RDDs.
/** Filters detected content (regex).
*
* @param contentREs a list of regular expressions
*/
Expand All @@ -628,5 +628,13 @@ package object archivesunleashed {
case None => false
}).exists(identity))
}

/** Filters detected language.
*
* @param lang a set of language codes to discard, in the form DetectLanguage returns (e.g. "fr")
*/
def discardLanguages(lang: Set[String]): RDD[ArchiveRecord] = {
  // RemoveHTML is applied to the content string first, then DetectLanguage runs on
  // its result; records whose detected language is a member of lang are dropped.
  rdd.filter { record =>
    val plainText = RemoveHTML(record.getContentString)
    val detected = DetectLanguage(plainText)
    !lang.contains(detected)
  }
}
}
}
10 changes: 10 additions & 0 deletions src/test/scala/io/archivesunleashed/RecordRDDTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,16 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
assert (r2.sameElements(r))
}

test ("discard languages") {
  // Load the sample archive and apply keepValidPages before the language filter.
  val validPages = RecordLoader.loadArchives(arcPath, sc)
    .keepValidPages()
  val discarded: Set[String] = Set("fr")
  val expectedUrls = Array("http://www.archive.org/", "http://www.archive.org/index.php")
  // The first two URLs left after discarding must equal the expected ones, in order.
  val actualUrls = validPages.discardLanguages(discarded)
    .map(record => record.getUrl).take(2)
  assert (actualUrls.sameElements(expectedUrls))
}

test ("keep mime tika") {
val base = RecordLoader.loadArchives(arcPath, sc)
val mime = Set ("text/plain", "image/jpeg")
Expand Down