Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Extract Image Details API #226

Merged
merged 12 commits into from
May 21, 2018
6 changes: 6 additions & 0 deletions src/main/scala/io/archivesunleashed/DataFrameLoader.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,10 @@ class DataFrameLoader(sc: SparkContext) {
RecordLoader.loadArchives(path, sc)
.extractImageLinksDF()
}

/** Builds a DataFrame describing the images found in the archives at `path`.
  *
  * Each row carries (image url, type, width, height, md5, raw bytes).
  *
  * @param path location of the web archive file(s) to load
  * @return the DataFrame produced by `extractImageDetailsDF()` on the loaded records
  */
def extractImages(path: String): DataFrame = {
  val archives = RecordLoader.loadArchives(path, sc)
  archives.extractImageDetailsDF()
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.matchbox

import java.io.ByteArrayInputStream
import java.io.IOException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.image.{ImageParser, TiffParser};
import org.apache.tika.parser.jpeg.JpegParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;

/** Dimensions of a single image.
  *
  * Both values are kept as the strings handed to the constructor; no parsing
  * or validation is performed here.
  *
  * @param w the image width
  * @param h the image height
  */
class ImageDetails(w: String, h: String) {

  /** The image width, exactly as supplied at construction. */
  val width: String = w

  /** The image height, exactly as supplied at construction. */
  val height: String = h
}

/** Extracts image details given raw bytes (using Apache Tika) */
object ExtractImageDetails {

  /**
   * Reads the width and height of an image from its raw bytes.
   *
   * The Tika parser is chosen from the MIME type and the URL suffix:
   * `JpegParser` for "image/jpeg" or a URL ending in "jpg"/"jpeg",
   * `TiffParser` for "image/tiff" or a URL ending in "tiff", and the
   * generic `ImageParser` for everything else.
   *
   * A parse failure does not propagate: the result then carries whatever
   * metadata was collected before the failure, so that a single malformed
   * image does not abort the caller's whole job.
   *
   * @param url the URL the image was fetched from; only its suffix is inspected (may be null)
   * @param mimetype the MIME type reported for the image (may be null)
   * @param bytes the raw bytes of the image
   * @return an [[ImageDetails]] holding the values Tika stored under the
   *         "Image Width" and "Image Height" metadata keys (e.g. "275 pixels");
   *         presumably null for a key Tika did not populate -- confirm against
   *         the Tika `Metadata.get` documentation
   */
  def apply(url: String, mimetype: String, bytes: Array[Byte]): ImageDetails = {
    import scala.util.control.NonFatal

    val metadata = new Metadata()
    val parser: org.apache.tika.parser.Parser =
      if (hasMimeType(mimetype, "image/jpeg") || hasSuffix(url, "jpg", "jpeg")) {
        new JpegParser()
      } else if (hasMimeType(mimetype, "image/tiff") || hasSuffix(url, "tiff")) {
        new TiffParser()
      } else {
        new ImageParser()
      }

    try {
      // The stream is created inside the try so that null bytes are handled
      // the same way as unparseable bytes.
      parser.parse(new ByteArrayInputStream(bytes), new BodyContentHandler(), metadata, new ParseContext())
    } catch {
      // Deliberate best-effort: fall through with the metadata gathered so far.
      case NonFatal(_) =>
    }

    new ImageDetails(metadata.get("Image Width"), metadata.get("Image Height"))
  }

  /** True when `mimetype` is non-null and contains `expected`. */
  private def hasMimeType(mimetype: String, expected: String): Boolean =
    mimetype != null && mimetype.contains(expected)

  /** True when `url` is non-null and ends with any of `suffixes` (case-sensitive). */
  private def hasSuffix(url: String, suffixes: String*): Boolean =
    url != null && suffixes.exists(suffix => url.endsWith(suffix))
}
24 changes: 23 additions & 1 deletion src/main/scala/io/archivesunleashed/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@ package io

import io.archivesunleashed.data.{ArchiveRecordWritable, ArchiveRecordInputFormat}
import ArchiveRecordWritable.ArchiveFormat
import io.archivesunleashed.matchbox.{DetectLanguage, ExtractDate, ExtractLinks, ExtractImageLinks, ExtractDomain, RemoveHTML}
import io.archivesunleashed.matchbox.{DetectLanguage, ExtractDate, ExtractLinks, ExtractImageLinks, ExtractImageDetails, ExtractDomain, RemoveHTML, ComputeMD5}
import io.archivesunleashed.matchbox.ExtractDate.DateComponent
import io.archivesunleashed.matchbox.ExtractDate.DateComponent._
import io.archivesunleashed.matchbox.ImageDetails

import org.apache.spark.sql._
import org.apache.spark.sql.types._
Expand Down Expand Up @@ -139,6 +140,27 @@ package object archivesunleashed {
sqlContext.getOrCreate().createDataFrame(records, schema)
}

/** Extracts details of the image records in this RDD into a DataFrame.
  *
  * Non-image records are dropped with `keepImages()`. Each remaining record
  * becomes one row with the columns Url, Type, Width, Height, MD5 (all
  * strings) and Bytes (binary). Width and Height come from
  * `ExtractImageDetails`, and MD5 from `ComputeMD5` over the image bytes.
  *
  * @return a DataFrame with one row per image record
  */
def extractImageDetailsDF(): DataFrame = {
  val rows = rdd
    .keepImages()
    .map { record =>
      val details = ExtractImageDetails(record.getUrl, record.getMimeType, record.getImageBytes)
      Row(
        record.getUrl,
        record.getMimeType,
        details.width,
        details.height,
        ComputeMD5(record.getImageBytes),
        record.getImageBytes
      )
    }

  val schema = StructType(Seq(
    StructField("Url", StringType, nullable = true),
    StructField("Type", StringType, nullable = true),
    StructField("Width", StringType, nullable = true),
    StructField("Height", StringType, nullable = true),
    StructField("MD5", StringType, nullable = true),
    StructField("Bytes", BinaryType, nullable = true)
  ))

  SparkSession.builder().getOrCreate().createDataFrame(rows, schema)
}

/** Removes all data except images. */
def keepImages() = {
rdd.filter(r =>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed

import com.google.common.io.Resources
import io.archivesunleashed.df._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

/** Checks `extractImageDetailsDF()` against the bundled example ARC file. */
@RunWith(classOf[JUnitRunner])
class ExtractImageDetailsTest extends FunSuite with BeforeAndAfter {
  private val arcPath = Resources.getResource("arc/example.arc.gz").getPath
  private val master = "local[4]"
  private val appName = "example-df"
  private var sc: SparkContext = _

  // A fresh SparkContext is created for each test and stopped in `after`.
  before {
    val conf = new SparkConf()
      .setMaster(master)
      .setAppName(appName)
    sc = new SparkContext(conf)
  }

  test("Fetch image details") {
    val df = RecordLoader.loadArchives(arcPath, sc)
      .extractImageDetailsDF()

    // We need this in order to use the $-notation
    val spark = SparkSession.builder().master("local").getOrCreate()
    import spark.implicits._

    // Ordering by MD5 makes the choice of the first two rows deterministic.
    val extracted = df.select($"Url", $"Type", $"Width", $"Height", $"MD5")
      .orderBy(desc("MD5")).head(2).toList

    // Expected (Url, Type, Width, Height) for the two rows, in order.
    val expected = List(
      ("http://www.archive.org/images/LOCLogoSmall.jpg", "image/jpeg", "275 pixels", "300 pixels"),
      ("http://www.archive.org/images/lma.jpg", "image/jpeg", "215 pixels", "71 pixels")
    )

    assert(extracted.size == 2)
    extracted.zip(expected).foreach { case (row, (url, mimeType, width, height)) =>
      assert(url == row(0))
      assert(mimeType == row(1))
      assert(width == row(2))
      assert(height == row(3))
    }
  }

  after {
    if (sc != null) {
      sc.stop()
    }
  }
}