archivesunleashed · ianmilligan1 · Aug 16, 2019 · Aug 14, 2019 · Aug 15, 2019 · Aug 15, 2019
diff --git a/.codecov.yml b/.codecov.yml
@@ -0,0 +1,26 @@
+codecov:
+  notify:
+    require_ci_to_pass: yes  # presumably: hold Codecov notifications until CI is green (confirm in Codecov docs)
+
+coverage:
+  precision: 2  # presumably decimal places in reported percentages (confirm in Codecov docs)
+  round: down
+  range: "50...80"  # presumably the red-to-green colour band for coverage (confirm in Codecov docs)
+
+  status:
+    project: yes  # project-level status enabled
+    patch: yes  # patch-level status enabled
+    changes: no  # "changes" status disabled
+
+parsers:
+  gcov:  # NOTE(review): gcov parser options in a Scala project; confirm they are needed
+    branch_detection:
+      conditional: yes
+      loop: yes
+      method: no
+      macro: no
+
+comment:
+  layout: "header, diff"  # sections requested for the Codecov comment
+  behavior: default
+  require_changes: no  # presumably: comment even when coverage is unchanged (confirm in Codecov docs)
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractAtMentions.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractAtMentions.scala
diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala
@@ -42,7 +42,7 @@ import scala.util.matching.Regex
   * Package object which supplies implicits to augment generic RDDs with AUT-specific transformations.
   */
 package object archivesunleashed {
-  /** Loads records from either WARCs, ARCs or Twitter API data (JSON). */
+  /** Loads records from either WARCs or ARCs. */
   object RecordLoader {
     /** Gets all non-empty archive files.
       *
@@ -211,18 +211,18 @@ package object archivesunleashed {
             (r, (DetectMimeTypeTika(r.getBinaryBytes)))
             )
         .filter(r => r._2.startsWith("audio/")
-          || r._1.getUrl.endsWith("aac")
-          || r._1.getUrl.endsWith("mid")
-          || r._1.getUrl.endsWith("midi")
-          || r._1.getUrl.endsWith("mp3")
-          || r._1.getUrl.endsWith("wav")
-          || r._1.getUrl.endsWith("oga")
-          || r._1.getUrl.endsWith("ogg")
-          || r._1.getUrl.endsWith("weba")
-          || r._1.getUrl.endsWith("ra")
-          || r._1.getUrl.endsWith("rm")
-          || r._1.getUrl.endsWith("3gp")
-          || r._1.getUrl.endsWith("3g2"))
+          || r._1.getUrl.endsWith(".aac")
+          || r._1.getUrl.endsWith(".mid")
+          || r._1.getUrl.endsWith(".midi")
+          || r._1.getUrl.endsWith(".mp3")
+          || r._1.getUrl.endsWith(".wav")
+          || r._1.getUrl.endsWith(".oga")
+          || r._1.getUrl.endsWith(".ogg")
+          || r._1.getUrl.endsWith(".weba")
+          || r._1.getUrl.endsWith(".ra")
+          || r._1.getUrl.endsWith(".rm")
+          || r._1.getUrl.endsWith(".3gp")
+          || r._1.getUrl.endsWith(".3g2"))
         .map(r => {
           val bytes = r._1.getBinaryBytes
           val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
@@ -255,18 +255,205 @@ package object archivesunleashed {
             (r, (DetectMimeTypeTika(r.getBinaryBytes)))
             )
         .filter(r => r._2.startsWith("video/")
-          || r._1.getUrl.endsWith("flv")
-          || r._1.getUrl.endsWith("mp4")
-          || r._1.getUrl.endsWith("mov")
-          || r._1.getUrl.endsWith("avi")
-          || r._1.getUrl.endsWith("wmv")
-          || r._1.getUrl.endsWith("rv")
-          || r._1.getUrl.endsWith("mpeg")
-          || r._1.getUrl.endsWith("ogv")
-          || r._1.getUrl.endsWith("webm")
-          || r._1.getUrl.endsWith("ts")
-          || r._1.getUrl.endsWith("3gp")
-          || r._1.getUrl.endsWith("3g2"))
+          || r._1.getUrl.endsWith(".flv")
+          || r._1.getUrl.endsWith(".mp4")
+          || r._1.getUrl.endsWith(".mov")
+          || r._1.getUrl.endsWith(".avi")
+          || r._1.getUrl.endsWith(".wmv")
+          || r._1.getUrl.endsWith(".rv")
+          || r._1.getUrl.endsWith(".mpeg")
+          || r._1.getUrl.endsWith(".ogv")
+          || r._1.getUrl.endsWith(".webm")
+          || r._1.getUrl.endsWith(".ts")
+          || r._1.getUrl.endsWith(".3gp")
+          || r._1.getUrl.endsWith(".3g2"))
+        .map(r => {
+          val bytes = r._1.getBinaryBytes
+          val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
+          val encodedBytes = Base64.getEncoder.encodeToString(bytes)
+          val url = new URL(r._1.getUrl)
+          val filename = FilenameUtils.getName(url.getPath())
+          val extension = FilenameUtils.getExtension(url.getPath())
+          (r._1.getUrl, filename, extension, r._1.getMimeType,
+            DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
+        })
+        .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))
+
+      val schema = new StructType()
+        .add(StructField("url", StringType, true))
+        .add(StructField("filename", StringType, true))
+        .add(StructField("extension", StringType, true))
+        .add(StructField("mime_type_web_server", StringType, true))
+        .add(StructField("mime_type_tika", StringType, true))
+        .add(StructField("md5", StringType, true))
+        .add(StructField("bytes", StringType, true))
+
+      val sqlContext = SparkSession.builder();
+      sqlContext.getOrCreate().createDataFrame(records, schema)
+    }
+
+    /** Extract spreadsheet bytes and spreadsheet metadata.
+      *
+      * Keeps records with a listed Tika type, a `text/csv` server type, or a `.tsv`/`.csv`
+      * URL, then drops audio, video, image and HTML detections and `.js` URLs.
+      */
+    def extractSpreadsheetDetailsDF(): DataFrame = {
+      val spreadsheetMimeTypes = Set(
+        "application/vnd.ms-excel",
+        "application/vnd.ms-excel.workspace.3",
+        "application/vnd.ms-excel.workspace.4",
+        "application/vnd.ms-excel.sheet.2",
+        "application/vnd.ms-excel.sheet.3",
+        "application/vnd.ms-excel.sheet.4", // previously a second copy of sheet.3
+        "application/vnd.ms-excel.addin.macroenabled.12",
+        "application/vnd.ms-excel.sheet.binary.macroenabled.12",
+        "application/vnd.ms-excel.sheet.macroenabled.12",
+        "application/vnd.ms-excel.template.macroenabled.12",
+        "application/vnd.ms-spreadsheetml",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.template",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "application/x-vnd.oasis.opendocument.spreadsheet-template",
+        "application/vnd.oasis.opendocument.spreadsheet-template",
+        "application/vnd.oasis.opendocument.spreadsheet",
+        "application/x-vnd.oasis.opendocument.spreadsheet",
+        "application/x-tika-msworks-spreadsheet",
+        "application/vnd.lotus-1-2-3"
+      )
+
+      val records = rdd
+        .map(r => (r, DetectMimeTypeTika(r.getBinaryBytes)))
+        .filter { case (record, tikaMimeType) =>
+          ((spreadsheetMimeTypes.contains(tikaMimeType)
+            || record.getUrl.endsWith(".tsv")
+            || record.getMimeType == "text/csv"
+            || record.getUrl.endsWith(".csv"))
+            && !tikaMimeType.startsWith("audio/")
+            && !tikaMimeType.startsWith("video/")
+            && !tikaMimeType.startsWith("image/")
+            && tikaMimeType != "text/html"
+            && !record.getUrl.endsWith(".js"))
+        }
+        .map { case (record, tikaMimeType) =>
+          // Reuse the type detected above instead of running Tika on the same bytes again.
+          val bytes = record.getBinaryBytes
+          val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
+          val encodedBytes = Base64.getEncoder.encodeToString(bytes)
+          val path = new URL(record.getUrl).getPath
+          Row(record.getUrl, FilenameUtils.getName(path), FilenameUtils.getExtension(path),
+            record.getMimeType, tikaMimeType, hash, encodedBytes)
+        }
+
+      val schema = StructType(
+        Seq("url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "bytes")
+          .map(StructField(_, StringType, nullable = true)))
+
+      SparkSession.builder().getOrCreate().createDataFrame(records, schema)
+    }
+
+    /** Extract presentation program bytes and presentation program metadata.
+      *
+      * Keeps records whose Tika-detected MIME type is one of the types listed below.
+      * The macro-enabled entries are lower case, as in the spreadsheet filter;
+      * NOTE(review): assumes DetectMimeTypeTika yields lower-case names - confirm.
+      */
+    def extractPresentationProgramDetailsDF(): DataFrame = {
+      val presentationMimeTypes = Set(
+        "application/vnd.ms-powerpoint",
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        "application/vnd.oasis.opendocument.presentation",
+        "application/vnd.oasis.opendocument.presentation-template",
+        "application/vnd.sun.xml.impress",
+        "application/vnd.sun.xml.impress.template",
+        "application/vnd.stardivision.impress",
+        "application/x-starimpress",
+        "application/vnd.ms-powerpoint.addin.macroenabled.12",
+        "application/vnd.ms-powerpoint.presentation.macroenabled.12",
+        "application/vnd.ms-powerpoint.slide.macroenabled.12",
+        "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
+        "application/vnd.ms-powerpoint.template.macroenabled.12"
+      )
+
+      val records = rdd
+        .map(r => (r, DetectMimeTypeTika(r.getBinaryBytes)))
+        .filter { case (_, tikaMimeType) => presentationMimeTypes.contains(tikaMimeType) }
+        .map { case (record, tikaMimeType) =>
+          // Reuse the type detected above instead of running Tika on the same bytes again.
+          val bytes = record.getBinaryBytes
+          val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
+          val encodedBytes = Base64.getEncoder.encodeToString(bytes)
+          val path = new URL(record.getUrl).getPath
+          Row(record.getUrl, FilenameUtils.getName(path), FilenameUtils.getExtension(path),
+            record.getMimeType, tikaMimeType, hash, encodedBytes)
+        }
+
+      val schema = StructType(
+        Seq("url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "bytes")
+          .map(StructField(_, StringType, nullable = true)))
+
+      SparkSession.builder().getOrCreate().createDataFrame(records, schema)
+    }
+
+    /** Extract word processor bytes and word processor metadata.
+      *
+      * Keeps records whose Tika-detected MIME type is one of the types listed below.
+      * Each type is listed once (the two macro-enabled entries used to appear twice)
+      * and the macro-enabled entries are lower case, as in the spreadsheet filter.
+      * NOTE(review): assumes DetectMimeTypeTika yields lower-case names - confirm.
+      *
+      * @return a DataFrame of string columns: url, filename, extension,
+      *         mime_type_web_server, mime_type_tika, md5 and bytes (Base64 text)
+      */
+    def extractWordProcessorDetailsDF(): DataFrame = {
+      val wordProcessorMimeTypes = Set(
+        "application/vnd.lotus-wordpro",
+        "application/vnd.kde.kword",
+        "application/vnd.ms-word.document.macroenabled.12",
+        "application/vnd.ms-word.template.macroenabled.12",
+        "application/vnd.oasis.opendocument.text",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml",
+        "application/vnd.wordperfect",
+        "application/wordperfect5.1",
+        "application/msword",
+        "application/vnd.apple.pages",
+        "application/macwriteii",
+        "application/vnd.ms-works",
+        "application/rtf"
+      )
+
+      val records = rdd
+        .map(r => (r, DetectMimeTypeTika(r.getBinaryBytes)))
+        .filter { case (_, tikaMimeType) => wordProcessorMimeTypes.contains(tikaMimeType) }
+        .map { case (record, tikaMimeType) =>
+          // Reuse the type detected above instead of running Tika on the same bytes again.
+          val bytes = record.getBinaryBytes
+          val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
+          val encodedBytes = Base64.getEncoder.encodeToString(bytes)
+          val path = new URL(record.getUrl).getPath
+          Row(record.getUrl, FilenameUtils.getName(path), FilenameUtils.getExtension(path),
+            record.getMimeType, tikaMimeType, hash, encodedBytes)
+        }
+
+      val schema = StructType(
+        Seq("url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "bytes")
+          .map(StructField(_, StringType, nullable = true)))
+
+      SparkSession.builder().getOrCreate().createDataFrame(records, schema)
+    }
+
+    /* Extract plain text bytes and plain text metadata. */
+    def extractTextFilesDetailsDF(): DataFrame = {
+      val records = rdd
+        .map(r =>
+            (r, (DetectMimeTypeTika(r.getBinaryBytes)))
+            )
+        .filter(r => (r._2 == "text/plain"
+          || r._1.getUrl.endsWith(".txt"))
+        && !r._1.getUrl.endsWith("robots.txt")
+        && r._2 != "text/html"
+        && !r._1.getUrl.endsWith(".js"))
         .map(r => {
           val bytes = r._1.getBinaryBytes
           val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))

diff --git a/src/test/resources/warc/example.docs.warc.gz b/src/test/resources/warc/example.docs.warc.gz
diff --git a/src/test/resources/warc/example.txt.warc.gz b/src/test/resources/warc/example.txt.warc.gz
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala
@@ -0,0 +1,72 @@
+/*
+ * Archives Unleashed Toolkit (AUT):
+ * An open-source toolkit for analyzing web archives.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.archivesunleashed
+
+import com.google.common.io.Resources
+import org.apache.spark.sql.SparkSession
+// scalastyle:off underscore.import
+import io.archivesunleashed.df._
+import org.apache.spark.sql.functions._
+// scalastyle:on underscore.import
+import org.apache.spark.{SparkConf, SparkContext}
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+import org.scalatest.{BeforeAndAfter, FunSuite}
+
+/** Checks extractPresentationProgramDetailsDF against the example.docs WARC fixture. */
+@RunWith(classOf[JUnitRunner])
+class ExtractPresentationProgramDetailsTest extends FunSuite with BeforeAndAfter {
+  private val warcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
+  private val master = "local[4]"
+  private val appName = "example-df"
+  private var sc: SparkContext = _
+
+  before {
+    val conf = new SparkConf()
+      .setMaster(master)
+      .setAppName(appName)
+    sc = new SparkContext(conf)
+  }
+
+  test("Presentation Program DF extraction") {
+    val df = RecordLoader.loadArchives(warcPath, sc)
+      .extractPresentationProgramDetailsDF()
+
+    val extracted = df.select("url", "filename", "extension",
+      "mime_type_web_server", "mime_type_tika", "md5")
+      .orderBy(desc("md5")).head(2).toList
+    assert(extracted.size == 2)
+    assert("https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.odp" == extracted(0)(0))
+    assert("aut-test-fixtures.odp" == extracted(0)(1))
+    assert("odp" == extracted(0)(2))
+    assert("application/vnd.oasis.opendocument.presentation" == extracted(0)(3))
+    assert("application/vnd.oasis.opendocument.presentation" == extracted(0)(4))
+    assert("f38b2679029cf3453c8151b92c615c70" == extracted(0)(5))
+    assert("https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx" == extracted(1)(0))
+    assert("aut-test-fixtures.pptx" == extracted(1)(1))
+    assert("pptx" == extracted(1)(2))
+    assert("application/vnd.openxmlformats-officedocument.presentationml.presentation" == extracted(1)(3))
+    assert("application/vnd.openxmlformats-officedocument.presentationml.presentation" == extracted(1)(4))
+    assert("7a7b1fe4b6d311376eaced9de3b682ee" == extracted(1)(5))
+  }
+
+  after {
+    // sc is still null if `before` failed before assigning it, so stop it only when set.
+    Option(sc).foreach(_.stop())
+  }
+}