-
Notifications
You must be signed in to change notification settings - Fork 33
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Extract Image Details API #226
Changes from 3 commits
1a30357
fdd09fc
002bcb2
05bd317
9723c05
136d00d
ea3aa40
6fc87f8
f9d3621
a67e6d2
2d44030
2fcbe83
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
/* | ||
* Archives Unleashed Toolkit (AUT): | ||
* An open-source platform for analyzing web archives. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package io.archivesunleashed.matchbox | ||
|
||
import java.io.ByteArrayInputStream | ||
import java.io.IOException; | ||
import org.apache.tika.metadata.Metadata; | ||
import org.apache.tika.parser.image.{ImageParser, TiffParser}; | ||
import org.apache.tika.parser.jpeg.JpegParser; | ||
import org.apache.tika.parser.ParseContext; | ||
import org.apache.tika.sax.BodyContentHandler; | ||
|
||
/** Holds an image's width and height as the strings given to the constructor (ExtractImageDetails passes Tika metadata values). */ | ||
class ImageDetails(w: String, h: String) { | ||
// Width exactly as supplied in `w`; no parsing or unit conversion happens here.
val width: String = w | ||
// Height exactly as supplied in `h`; same treatment as width.
val height: String = h | ||
} | ||
|
||
/** Extracts image details given raw bytes (using Apache Tika) */ | ||
object ExtractImageDetails { | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. two space indent please. |
||
/** | ||
* Parses the image bytes with a Tika parser chosen from the MIME type and URL. | ||
* | ||
* @param url the URL of the image; its suffix (jpg, jpeg, tiff) helps select the parser | ||
* @param mimetype the MIME type of the image; may be null, in which case only the URL is consulted | ||
* @param bytes the raw bytes of the image | ||
* @return an ImageDetails built from the "Image Width" and "Image Height" metadata entries | ||
*/ | ||
def apply(url: String, mimetype: String, bytes: Array[Byte]): ImageDetails = { | ||
val inputStream = new ByteArrayInputStream(bytes) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indenting is off. |
||
val handler = new BodyContentHandler(); | ||
val metadata = new Metadata(); | ||
val pcontext = new ParseContext(); | ||
|
||
// JPEG parser when the MIME type contains "image/jpeg" or the URL ends in "jpg"/"jpeg".
// NOTE(review): url is dereferenced without a null check, unlike mimetype.
if ((mimetype != null && mimetype.contains("image/jpeg")) || url.endsWith("jpg") || url.endsWith("jpeg")) { | ||
val parser = new JpegParser(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indenting is off |
||
val results = parser.parse(inputStream, handler, metadata, pcontext) | ||
} else if ((mimetype != null && mimetype.contains("image/tiff")) || url.endsWith("tiff")) { | ||
// TIFF parser when the MIME type contains "image/tiff" or the URL ends in "tiff".
val parser = new TiffParser(); | ||
val results = parser.parse(inputStream, handler, metadata, pcontext) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indenting if off. |
||
} else { | ||
// Everything else goes through the generic ImageParser.
val parser = new ImageParser(); | ||
val results = parser.parse(inputStream, handler, metadata, pcontext) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indenting is way off here. |
||
} | ||
// Width and height are read back from the metadata object that was handed to parse().
// NOTE(review): `results` in each branch above is never read.
return new ImageDetails(metadata.get("Image Width"), metadata.get("Image Height")) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,9 +19,10 @@ package io | |
|
||
import io.archivesunleashed.data.{ArchiveRecordWritable, ArchiveRecordInputFormat} | ||
import ArchiveRecordWritable.ArchiveFormat | ||
import io.archivesunleashed.matchbox.{DetectLanguage, ExtractDate, ExtractLinks, ExtractImageLinks, ExtractDomain, RemoveHTML} | ||
import io.archivesunleashed.matchbox.{DetectLanguage, ExtractDate, ExtractLinks, ExtractImageLinks, ExtractImageDetails, ExtractDomain, RemoveHTML, ComputeMD5} | ||
import io.archivesunleashed.matchbox.ExtractDate.DateComponent | ||
import io.archivesunleashed.matchbox.ExtractDate.DateComponent._ | ||
import io.archivesunleashed.matchbox.ImageDetails | ||
|
||
import org.apache.spark.sql._ | ||
import org.apache.spark.sql.types._ | ||
|
@@ -139,6 +140,27 @@ package object archivesunleashed { | |
sqlContext.getOrCreate().createDataFrame(records, schema) | ||
} | ||
|
||
/**
 * Builds a DataFrame describing the records retained by keepImages().
 *
 * Each row carries Url, Type, Width, Height and MD5 as strings plus the
 * image Bytes as binary. Width and Height come from ExtractImageDetails,
 * and MD5 from ComputeMD5 applied to the same image bytes.
 *
 * @return a DataFrame with columns Url, Type, Width, Height, MD5, Bytes (all nullable)
 */
def extractImageDetailsDF(): DataFrame = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Need doc comment here, and we're good to go. |
||
// `rdd` comes from the enclosing scope, which is not visible in this hunk.
val records = rdd | ||
.keepImages() | ||
.map(r => { | ||
val details = ExtractImageDetails(r.getUrl, r.getMimeType, r.getImageBytes) | ||
(r.getUrl, r.getMimeType, details.width, details.height, ComputeMD5(r.getImageBytes), r.getImageBytes) | ||
}) | ||
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6)) | ||
|
||
// Field order here must match the positional order of the Row built above.
val schema = new StructType() | ||
.add(StructField("Url", StringType, true)) | ||
.add(StructField("Type", StringType, true)) | ||
.add(StructField("Width", StringType, true)) | ||
.add(StructField("Height", StringType, true)) | ||
.add(StructField("MD5", StringType, true)) | ||
.add(StructField("Bytes", BinaryType, true)) | ||
|
||
// Despite its name, `sqlContext` holds a SparkSession builder; the session is obtained on the next line.
val sqlContext = SparkSession.builder(); | ||
sqlContext.getOrCreate().createDataFrame(records, schema) | ||
} | ||
|
||
/** Removes all data except images. */ | ||
def keepImages() = { | ||
rdd.filter(r => | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
/* | ||
* Archives Unleashed Toolkit (AUT): | ||
* An open-source platform for analyzing web archives. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.archivesunleashed | ||
|
||
import com.google.common.io.Resources | ||
import io.archivesunleashed.df._ | ||
import org.apache.spark.sql.SparkSession | ||
import org.apache.spark.sql.functions._ | ||
import org.apache.spark.{SparkConf, SparkContext} | ||
import org.junit.runner.RunWith | ||
import org.scalatest.junit.JUnitRunner | ||
import org.scalatest.{BeforeAndAfter, FunSuite} | ||
|
||
// Exercises extractImageDetailsDF() against the bundled arc/example.arc.gz resource.
@RunWith(classOf[JUnitRunner]) | ||
class ExtractImageDetailsTest extends FunSuite with BeforeAndAfter { | ||
private val arcPath = Resources.getResource("arc/example.arc.gz").getPath | ||
private val master = "local[4]" | ||
private val appName = "example-df" | ||
// Assigned in `before` and stopped in `after`, hence a var.
private var sc: SparkContext = _ | ||
|
||
// Creates a fresh SparkContext before each test.
before { | ||
val conf = new SparkConf() | ||
.setMaster(master) | ||
.setAppName(appName) | ||
sc = new SparkContext(conf) | ||
} | ||
|
||
// NOTE(review): the test name says "image links", but the assertions check image details.
test("Fetch image links") { | ||
val df = RecordLoader.loadArchives(arcPath, sc) | ||
.extractImageDetailsDF() | ||
|
||
// We need this in order to use the $-notation | ||
val spark = SparkSession.builder().master("local").getOrCreate() | ||
import spark.implicits._ | ||
|
||
// Take the two rows with the largest MD5 values and check their URL, type, width and height.
val extracted = df.select($"Url", $"Type", $"Width", $"Height", $"MD5") | ||
.orderBy(desc("MD5")).head(2).toList | ||
assert(extracted.size == 2) | ||
assert("http://www.archive.org/images/LOCLogoSmall.jpg" == extracted(0)(0)) | ||
assert("image/jpeg" == extracted(0)(1)) | ||
// Width and Height are compared as strings that include the " pixels" suffix.
assert("275 pixels" == extracted(0)(2)) | ||
assert("300 pixels" == extracted(0)(3)) | ||
assert("http://www.archive.org/images/lma.jpg" == extracted(1)(0)) | ||
assert("image/jpeg" == extracted(1)(1)) | ||
assert("215 pixels" == extracted(1)(2)) | ||
assert("71 pixels" == extracted(1)(3)) | ||
} | ||
|
||
// Stops the SparkContext after each test; the null check covers the case where `before` never assigned it.
after { | ||
if (sc != null) { | ||
sc.stop() | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about just `extractImages`?