archivesunleashed · ruebot · May 21, 2018 · May 15, 2018 · May 16, 2018 · May 16, 2018
diff --git a/src/main/scala/io/archivesunleashed/DataFrameLoader.scala b/src/main/scala/io/archivesunleashed/DataFrameLoader.scala
@@ -19,4 +19,10 @@ class DataFrameLoader(sc: SparkContext) {
   	RecordLoader.loadArchives(path, sc)
   		.extractImageLinksDF()
   }
+
+  /** Loads the archives at `path` into a dataframe of (url, type, width, height, md5, raw bytes). */
+  def extractImages(path: String): DataFrame = {
+    val archives = RecordLoader.loadArchives(path, sc)
+    archives.extractImageDetailsDF()
+  }
 }
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractImageDetails.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractImageDetails.scala
@@ -0,0 +1,58 @@
+/*
+ * Archives Unleashed Toolkit (AUT):
+ * An open-source platform for analyzing web archives.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.archivesunleashed.matchbox
+
+import java.io.ByteArrayInputStream
+import java.io.IOException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.image.{ImageParser, TiffParser};
+import org.apache.tika.parser.jpeg.JpegParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+
+/** Width and height of an image, both kept as strings (e.g. "275 pixels"). */
+class ImageDetails(w: String, h: String) {
+  val height: String = h
+  val width: String = w
+}
+
+/** Extracts image details given raw bytes (using Apache Tika) */
+object ExtractImageDetails {
+
+  /** Parses an image with the Tika parser matching its type and reads its dimensions.
+   * @param url the URL of the image; only its suffix is inspected (may be null)
+   * @param mimetype the reported MIME type of the image (may be null)
+   * @param bytes the raw bytes of the image
+   * @return the "Image Width" / "Image Height" metadata values, null where Tika reported none
+   */
+  def apply(url: String, mimetype: String, bytes: Array[Byte]): ImageDetails = {
+    val mime = Option(mimetype).getOrElse("")
+    val link = Option(url).getOrElse("") // a null url falls through to the generic parser
+    val parser: org.apache.tika.parser.Parser =
+      if (mime.contains("image/jpeg") || link.endsWith("jpg") || link.endsWith("jpeg")) new JpegParser()
+      else if (mime.contains("image/tiff") || link.endsWith("tiff")) new TiffParser()
+      else new ImageParser()
+    val metadata = new Metadata()
+    try {
+      parser.parse(new ByteArrayInputStream(bytes), new BodyContentHandler(), metadata, new ParseContext())
+    } catch {
+      // One unreadable image must not fail the whole job; its dimensions are simply left unset.
+      case scala.util.control.NonFatal(_) =>
+    }
+    new ImageDetails(metadata.get("Image Width"), metadata.get("Image Height"))
+  }
+}
diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala
@@ -19,9 +19,10 @@ package io
 
 import io.archivesunleashed.data.{ArchiveRecordWritable, ArchiveRecordInputFormat}
 import ArchiveRecordWritable.ArchiveFormat
-import io.archivesunleashed.matchbox.{DetectLanguage, ExtractDate, ExtractLinks, ExtractImageLinks, ExtractDomain, RemoveHTML}
+import io.archivesunleashed.matchbox.{DetectLanguage, ExtractDate, ExtractLinks, ExtractImageLinks, ExtractImageDetails, ExtractDomain, RemoveHTML, ComputeMD5}
 import io.archivesunleashed.matchbox.ExtractDate.DateComponent
 import io.archivesunleashed.matchbox.ExtractDate.DateComponent._
+import io.archivesunleashed.matchbox.ImageDetails
 
 import org.apache.spark.sql._
 import org.apache.spark.sql.types._
@@ -139,6 +140,27 @@ package object archivesunleashed {
       sqlContext.getOrCreate().createDataFrame(records, schema)
     }
 
+    /** Builds a DataFrame with one row per image record: Url, Type, Width, Height, MD5, Bytes. */
+    def extractImageDetailsDF(): DataFrame = {
+      val schema = new StructType()
+        .add(StructField("Url", StringType, true))
+        .add(StructField("Type", StringType, true))
+        .add(StructField("Width", StringType, true))
+        .add(StructField("Height", StringType, true))
+        .add(StructField("MD5", StringType, true))
+        .add(StructField("Bytes", BinaryType, true))
+
+      // Read each accessor once per record and emit the Row directly, in schema order.
+      val records = rdd.keepImages().map { r =>
+        val url = r.getUrl
+        val mimeType = r.getMimeType
+        val bytes = r.getImageBytes
+        val details = ExtractImageDetails(url, mimeType, bytes)
+        Row(url, mimeType, details.width, details.height, ComputeMD5(bytes), bytes)
+      }
+      SparkSession.builder().getOrCreate().createDataFrame(records, schema)
+    }
+
     /** Removes all data except images. */
     def keepImages() = {
       rdd.filter(r =>

diff --git a/src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala
@@ -0,0 +1,69 @@
+/*
+ * Archives Unleashed Toolkit (AUT):
+ * An open-source platform for analyzing web archives.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.archivesunleashed
+
+import com.google.common.io.Resources
+import io.archivesunleashed.df._
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions._
+import org.apache.spark.{SparkConf, SparkContext}
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+import org.scalatest.{BeforeAndAfter, FunSuite}
+
+/** Checks extractImageDetailsDF() against the images in the bundled example ARC file. */
+@RunWith(classOf[JUnitRunner])
+class ExtractImageDetailsTest extends FunSuite with BeforeAndAfter {
+  private val arcPath = Resources.getResource("arc/example.arc.gz").getPath
+  private val master = "local[4]"
+  private val appName = "example-df"
+  private var sc: SparkContext = _
+
+  before {
+    val conf = new SparkConf()
+      .setMaster(master)
+      .setAppName(appName)
+    sc = new SparkContext(conf)
+  }
+
+  test("Extract image details") {
+    val df = RecordLoader.loadArchives(arcPath, sc)
+      .extractImageDetailsDF()
+
+    // We need this in order to use the $-notation
+    val spark = SparkSession.builder().master("local").getOrCreate()
+    import spark.implicits._
+
+    // Ordering by MD5 fixes which two rows are inspected below.
+    val extracted = df.select($"Url", $"Type", $"Width", $"Height", $"MD5")
+      .orderBy(desc("MD5")).head(2).toList
+    assert(extracted.size == 2)
+    assert(extracted(0)(0) == "http://www.archive.org/images/LOCLogoSmall.jpg")
+    assert(extracted(0)(1) == "image/jpeg")
+    assert(extracted(0)(2) == "275 pixels")
+    assert(extracted(0)(3) == "300 pixels")
+    assert(extracted(1)(0) == "http://www.archive.org/images/lma.jpg")
+    assert(extracted(1)(1) == "image/jpeg")
+    assert(extracted(1)(2) == "215 pixels")
+    assert(extracted(1)(3) == "71 pixels")
+  }
+
+  after {
+    Option(sc).foreach(_.stop())
+  }
+}