diff --git a/src/main/scala/io/archivesunleashed/spark/archive/io/ArchiveRecord.scala b/src/main/scala/io/archivesunleashed/spark/archive/io/ArchiveRecord.scala index 9229f772..eea02a99 100644 --- a/src/main/scala/io/archivesunleashed/spark/archive/io/ArchiveRecord.scala +++ b/src/main/scala/io/archivesunleashed/spark/archive/io/ArchiveRecord.scala @@ -29,31 +29,7 @@ import io.archivesunleashed.io.ArchiveRecordWritable.ArchiveFormat import io.archivesunleashed.spark.matchbox.ExtractDate.DateComponent import io.archivesunleashed.spark.matchbox.{RemoveHttpHeader, ExtractDate, ExtractDomain} -object ArchiveRecord { - val ISO8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX") -} - -trait ArchiveRecord extends Serializable { - def getCrawlDate: String - - def getCrawlMonth: String - - def getContentBytes: Array[Byte] - - def getContentString: String - - def getMimeType: String - - def getUrl: String - - def getDomain: String - - def getImageBytes: Array[Byte] -} - -class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends ArchiveRecord { - import ArchiveRecord._ - +class ArchiveRecord(r: SerializableWritable[ArchiveRecordWritable]) extends Serializable { var arcRecord: ARCRecord = null var warcRecord: WARCRecord = null @@ -62,6 +38,9 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends else if (r.t.getFormat == ArchiveFormat.WARC) warcRecord = r.t.getRecord.asInstanceOf[WARCRecord] + + val ISO8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX") + val getCrawlDate: String = { if (r.t.getFormat == ArchiveFormat.ARC) { ExtractDate(arcRecord.getMetaData.getDate, DateComponent.YYYYMMDD) @@ -88,7 +67,7 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends val getContentString: String = new String(getContentBytes) - val getMimeType: String = { + val getMimeType = { if (r.t.getFormat == ArchiveFormat.ARC) { arcRecord.getMetaData.getMimetype } else { @@ -96,7 +75,7 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends } } - val getUrl: String = { + val getUrl = { if (r.t.getFormat == ArchiveFormat.ARC) { arcRecord.getMetaData.getUrl } else { diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/RecordLoader.scala b/src/main/scala/io/archivesunleashed/spark/matchbox/RecordLoader.scala index 853f091a..b0651d25 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/RecordLoader.scala +++ b/src/main/scala/io/archivesunleashed/spark/matchbox/RecordLoader.scala @@ -24,7 +24,7 @@ import org.json4s.jackson.JsonMethods._ import io.archivesunleashed.io.ArchiveRecordWritable.ArchiveFormat import io.archivesunleashed.io.ArchiveRecordWritable import io.archivesunleashed.mapreduce.WacInputFormat -import io.archivesunleashed.spark.archive.io._ +import io.archivesunleashed.spark.archive.io.ArchiveRecord import io.archivesunleashed.spark.rdd.RecordRDD._ object RecordLoader { @@ -34,7 +34,7 @@ object RecordLoader { sc.newAPIHadoopFile(path, classOf[WacInputFormat], classOf[LongWritable], classOf[ArchiveRecordWritable]) .filter(r => (r._2.getFormat == ArchiveFormat.ARC) || ((r._2.getFormat == ArchiveFormat.WARC) && r._2.getRecord.getHeader.getHeaderValue("WARC-Type").equals("response"))) - .map(r => new ArchiveRecordImpl(new SerializableWritable(r._2))) + .map(r => new ArchiveRecord(new SerializableWritable(r._2))) if (keepValidPages) rdd.keepValidPages() else rdd }