Skip to content

Commit

Permalink
Add discardLanguage filter to RecordLoader.
Browse files Browse the repository at this point in the history
- Clean up doc comments
- Add test
- Resolves #352
  • Loading branch information
ruebot committed Aug 21, 2019
1 parent 4313174 commit 3e70492
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 12 deletions.
32 changes: 20 additions & 12 deletions src/main/scala/io/archivesunleashed/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,7 @@ package object archivesunleashed {
&& DetectMimeTypeTika(r.getBinaryBytes).startsWith("image/"))
}

/** Removes all data but selected mimeTypes specified in ArchiveRecord.
/** Removes all data but selected mimeTypes specified.
*
* @param mimeTypes a list of Mime Types
*/
Expand All @@ -488,15 +488,15 @@ package object archivesunleashed {
rdd.filter(r => mimeTypes.contains(DetectMimeTypeTika(r.getBinaryBytes)))
}

/** Removes all data that does not have selected status codes.
/** Removes all data that does not have selected HTTP status codes.
*
* @param statusCodes a list of HTTP status codes
*/
def keepHttpStatus(statusCodes: Set[String]): RDD[ArchiveRecord] = {
  // Retain only the records whose HTTP status is one of the requested codes.
  rdd.filter { record =>
    val status = record.getHttpStatus
    statusCodes.contains(status)
  }
}

/** Removes all data that does not have selected data.
/** Removes all data that does not have selected date.
*
* @param dates a list of dates
* @param component the selected DateComponent enum value
Expand All @@ -513,7 +513,7 @@ package object archivesunleashed {
rdd.filter(r => urls.contains(r.getUrl))
}

/** Removes all data but selected url patterns.
/** Removes all data but selected URL patterns.
*
* @param urlREs a list of regular expressions
*/
Expand Down Expand Up @@ -555,47 +555,47 @@ package object archivesunleashed {
}).exists(identity))
}

/** Filters ArchiveRecord MimeTypes from RDDs.
/** Filters ArchiveRecord MimeTypes (web server).
*
* @param mimeTypes a list of Mime Types
*/
def discardMimeTypes(mimeTypes: Set[String]): RDD[ArchiveRecord] = {
  // A record survives only when its getMimeType value is absent from the set.
  rdd.filter { record =>
    val recordMimeType = record.getMimeType
    !mimeTypes.contains(recordMimeType)
  }
}

/** Filters detected MimeTypes from RDDs.
/** Filters detected MimeTypes (Tika).
*
* @param mimeTypes a list of Mime Types
*/
def discardMimeTypesTika(mimeTypes: Set[String]): RDD[ArchiveRecord] = {
  rdd.filter { record =>
    // The MIME type is derived from the record's bytes via DetectMimeTypeTika.
    val detectedMimeType = DetectMimeTypeTika(record.getBinaryBytes)
    !mimeTypes.contains(detectedMimeType)
  }
}

/** Filters detected dates from RDDs.
/** Filters detected dates.
*
* @param date a single crawl date; records whose crawl date equals it are removed
*/
def discardDate(date: String): RDD[ArchiveRecord] = {
  // Drop records whose crawl date equals `date`; every other record is kept.
  rdd.filter { record =>
    val crawlDate = record.getCrawlDate
    crawlDate != date
  }
}

/** Filters detected urls from RDDs.
/** Filters detected URLs.
*
* @param urls a list of urls
*/
def discardUrls(urls: Set[String]): RDD[ArchiveRecord] = {
  // Exact-match exclusion: a record is dropped when its URL is a member of `urls`.
  rdd.filter { record =>
    val recordUrl = record.getUrl
    !urls.contains(recordUrl)
  }
}

/** Filters detected status codes from RDDs.
/** Filters detected HTTP status codes.
*
* @param statusCodes a list of HTTP status codes
*/
def discardHttpStatus(statusCodes: Set[String]): RDD[ArchiveRecord] = {
  // Keep a record only when its HTTP status is not among the codes to discard.
  rdd.filter { record =>
    val status = record.getHttpStatus
    !statusCodes.contains(status)
  }
}

/** Filters detected url patterns from RDDs.
/** Filters detected URL patterns (regex).
*
* @param urlREs a list of Regular expressions
*/
Expand All @@ -608,15 +608,15 @@ package object archivesunleashed {
}).exists(identity))
}

/** Filters detected domains (regex) from RDDs.
/** Filters detected domains (exact match against the given set, not regex).
*
* @param urls a set of source domains; records whose domain is in the set are removed
*/
def discardDomains(urls: Set[String]): RDD[ArchiveRecord] = {
  // Membership test on the record's getDomain value; no pattern matching is involved.
  rdd.filter { record =>
    val domain = record.getDomain
    !urls.contains(domain)
  }
}

/** Filters detected content (regex) from RDDs.
/** Filters detected content (regex).
*
* @param contentREs a list of regular expressions
*/
Expand All @@ -628,5 +628,13 @@ package object archivesunleashed {
case None => false
}).exists(identity))
}

/** Filters detected language.
*
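* @param lang a set of language codes as returned by DetectLanguage (the accompanying test passes the two-letter code "fr", i.e. ISO 639-1 rather than ISO 639-2 — TODO confirm)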
*/
def discardLanguages(lang: Set[String]): RDD[ArchiveRecord] = {
  rdd.filter { record =>
    // Pass the content through RemoveHTML first, then run DetectLanguage on the result.
    val text = RemoveHTML(record.getContentString)
    val detectedLanguage = DetectLanguage(text)
    !lang.contains(detectedLanguage)
  }
}
}
}
10 changes: 10 additions & 0 deletions src/test/scala/io/archivesunleashed/RecordRDDTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,16 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
assert (r2.sameElements(r))
}

test ("discard languages") {
  // Load the archive and keep only valid pages before applying the filter.
  val validPages = RecordLoader.loadArchives(arcPath, sc)
    .keepValidPages()
  val langs: Set[String] = Set("fr")
  val expectedUrls = Array("http://www.archive.org/", "http://www.archive.org/index.php")
  // The first two URLs left after the filter are compared with the expected array.
  val remainingUrls = validPages.discardLanguages(langs)
    .map(record => record.getUrl)
    .take(2)
  assert (remainingUrls.sameElements(expectedUrls))
}

test ("check for keep content"){
val expected = 1
val base = RecordLoader.loadArchives(arcPath, sc)
Expand Down

0 comments on commit 3e70492

Please sign in to comment.