Skip to content

Commit

Permalink
make ArchiveRecord a trait (#175)
Browse files Browse the repository at this point in the history
  • Loading branch information
helgeho authored and ruebot committed Mar 7, 2018
1 parent 9d40244 commit cd0d7b0
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,31 @@ import io.archivesunleashed.io.ArchiveRecordWritable.ArchiveFormat
import io.archivesunleashed.spark.matchbox.ExtractDate.DateComponent
import io.archivesunleashed.spark.matchbox.{RemoveHttpHeader, ExtractDate, ExtractDomain}

class ArchiveRecord(r: SerializableWritable[ArchiveRecordWritable]) extends Serializable {
/** Shared constants and helpers for [[ArchiveRecord]] implementations. */
object ArchiveRecord {
  /** Pattern for ISO 8601 timestamps with an offset, e.g. `2018-03-07T12:34:56Z`. */
  private val Iso8601Pattern = "yyyy-MM-dd'T'HH:mm:ssX"

  /**
   * Returns a fresh ISO 8601 date formatter.
   *
   * `java.text.SimpleDateFormat` keeps mutable internal state and is not
   * synchronized, so a single instance stored in this singleton would be
   * shared by every thread in the JVM and could produce corrupted results
   * under concurrent use. Handing out a new instance per access keeps call
   * sites such as `ISO8601.format(...)` / `ISO8601.parse(...)` unchanged
   * while making them safe to run concurrently.
   *
   * @return a new, caller-owned formatter for the ISO 8601 pattern
   */
  def ISO8601: SimpleDateFormat = new SimpleDateFormat(Iso8601Pattern)
}

/**
 * Serializable, read-only view of a single web-archive record.
 *
 * Implementations expose the record's crawl time, payload, MIME type and
 * URL-derived fields. `ArchiveRecordImpl` is the implementation visible in
 * this file; it wraps either an ARC or a WARC record.
 */
trait ArchiveRecord extends Serializable {
/** Crawl date as a string (the visible ARC implementation extracts a YYYYMMDD component). */
def getCrawlDate: String

/** Crawl month as a string. NOTE(review): presumably a year+month component - confirm against the implementation. */
def getCrawlMonth: String

/** Content of the record as bytes. */
def getContentBytes: Array[Byte]

/** Content of the record as a string (the visible implementation decodes `getContentBytes` with the platform default charset). */
def getContentString: String

/** MIME type of the record. */
def getMimeType: String

/** URL of the record. */
def getUrl: String

/** Domain of the record. NOTE(review): presumably derived from `getUrl` - confirm against the implementation. */
def getDomain: String

/** Image bytes of the record. NOTE(review): behaviour for non-image records is not visible here - confirm. */
def getImageBytes: Array[Byte]
}

class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends ArchiveRecord {
import ArchiveRecord._

var arcRecord: ARCRecord = null
var warcRecord: WARCRecord = null

Expand All @@ -38,9 +62,6 @@ class ArchiveRecord(r: SerializableWritable[ArchiveRecordWritable]) extends Seri
else if (r.t.getFormat == ArchiveFormat.WARC)
warcRecord = r.t.getRecord.asInstanceOf[WARCRecord]


val ISO8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX")

val getCrawlDate: String = {
if (r.t.getFormat == ArchiveFormat.ARC) {
ExtractDate(arcRecord.getMetaData.getDate, DateComponent.YYYYMMDD)
Expand All @@ -67,15 +88,15 @@ class ArchiveRecord(r: SerializableWritable[ArchiveRecordWritable]) extends Seri

val getContentString: String = new String(getContentBytes)

val getMimeType = {
val getMimeType: String = {
if (r.t.getFormat == ArchiveFormat.ARC) {
arcRecord.getMetaData.getMimetype
} else {
WarcRecordUtils.getWarcResponseMimeType(getContentBytes)
}
}

val getUrl = {
val getUrl: String = {
if (r.t.getFormat == ArchiveFormat.ARC) {
arcRecord.getMetaData.getUrl
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import org.json4s.jackson.JsonMethods._
import io.archivesunleashed.io.ArchiveRecordWritable.ArchiveFormat
import io.archivesunleashed.io.ArchiveRecordWritable
import io.archivesunleashed.mapreduce.WacInputFormat
import io.archivesunleashed.spark.archive.io.ArchiveRecord
import io.archivesunleashed.spark.archive.io._
import io.archivesunleashed.spark.rdd.RecordRDD._

object RecordLoader {
Expand All @@ -34,7 +34,7 @@ object RecordLoader {
sc.newAPIHadoopFile(path, classOf[WacInputFormat], classOf[LongWritable], classOf[ArchiveRecordWritable])
.filter(r => (r._2.getFormat == ArchiveFormat.ARC) ||
((r._2.getFormat == ArchiveFormat.WARC) && r._2.getRecord.getHeader.getHeaderValue("WARC-Type").equals("response")))
.map(r => new ArchiveRecord(new SerializableWritable(r._2)))
.map(r => new ArchiveRecordImpl(new SerializableWritable(r._2)))

if (keepValidPages) rdd.keepValidPages() else rdd
}
Expand Down

0 comments on commit cd0d7b0

Please sign in to comment.