Skip to content

Commit

Permalink
Scala imports cleanup. (#398)
Browse files — browse the repository at this point in the history
  • Loading branch information
ruebot authored and ianmilligan1 committed Jan 5, 2020
1 parent b915f82 commit d5c7bf7
Show file tree
Hide file tree
Showing 33 changed files with 142 additions and 142 deletions.
4 changes: 2 additions & 2 deletions src/main/scala/io/archivesunleashed/ArchiveRecord.scala
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,18 @@

package io.archivesunleashed

import java.text.SimpleDateFormat
import java.io.ByteArrayInputStream
import java.security.MessageDigest
import java.text.SimpleDateFormat

import io.archivesunleashed.data.{ArcRecordUtils, WarcRecordUtils, ArchiveRecordWritable}
import io.archivesunleashed.matchbox.{ComputeMD5RDD, ExtractDateRDD, ExtractDomainRDD, RemoveHTTPHeaderRDD}
import org.apache.commons.httpclient.{Header, HttpParser, StatusLine}
import org.apache.spark.SerializableWritable
import org.archive.io.arc.ARCRecord
import org.archive.io.warc.WARCRecord
import org.archive.util.ArchiveUtils
import scala.util.Try
import org.apache.commons.httpclient.{Header, HttpParser, StatusLine}

/** Trait for a record in a web archive. */
trait ArchiveRecord extends Serializable {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.{SparkConf, SparkContext}
import org.rogach.scallop.ScallopConf
import org.rogach.scallop.exceptions.ScallopException
import org.rogach.scallop.ScallopConf

/* Usage:
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@

package io.archivesunleashed.app

import io.archivesunleashed.{ArchiveRecord, DataFrameLoader, CountableRDD}
import io.archivesunleashed.matchbox
import io.archivesunleashed.df
import io.archivesunleashed.matchbox
import io.archivesunleashed.{ArchiveRecord, DataFrameLoader, CountableRDD}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@

package io.archivesunleashed.app

import io.archivesunleashed.{ArchiveRecord, DataFrameLoader, CountableRDD}
import io.archivesunleashed.matchbox.{ExtractDomainRDD, ExtractLinksRDD}
import io.archivesunleashed.df
import io.archivesunleashed.matchbox.{ExtractDomainRDD, ExtractLinksRDD}
import io.archivesunleashed.{ArchiveRecord, DataFrameLoader, CountableRDD}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
Expand Down
4 changes: 2 additions & 2 deletions src/main/scala/io/archivesunleashed/app/ExtractEntities.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
*/
package io.archivesunleashed.app

import io.archivesunleashed.RecordLoader
import io.archivesunleashed.matchbox.{ComputeMD5RDD, NERClassifier, RemoveHTMLRDD}
import org.apache.spark.SparkContext
import io.archivesunleashed.RecordLoader
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext

/** Performs Named Entity Recognition (NER) on a WARC or ARC file.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@
package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.{RangePartitioner, SparkContext}
import org.apache.spark.sql.functions.{desc,first}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.{RangePartitioner, SparkContext}

/** Extract most popular images from a Data Frame. */
object ExtractPopularImagesDF {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@

package io.archivesunleashed.app

import io.archivesunleashed.{ArchiveRecord, df}
import io.archivesunleashed.matchbox.RemoveHTMLRDD
import io.archivesunleashed.{ArchiveRecord, df}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

Expand Down
4 changes: 2 additions & 2 deletions src/main/scala/io/archivesunleashed/app/WriteGraphXML.scala
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@
* limitations under the License.
*/
package io.archivesunleashed.app
import org.apache.spark.graphx.Graph
import io.archivesunleashed.app.ExtractGraphX.{EdgeData, VertexData, VertexDataPR}
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import io.archivesunleashed.app.ExtractGraphX.{EdgeData, VertexData, VertexDataPR}
import org.apache.spark.graphx.Graph

import org.apache.spark.rdd.RDD

Expand Down
6 changes: 3 additions & 3 deletions src/main/scala/io/archivesunleashed/df/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@
*/
package io.archivesunleashed

import org.apache.commons.io.IOUtils
import io.archivesunleashed.matchbox.{ComputeMD5RDD}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.DataFrame
import java.io.ByteArrayInputStream
import java.io.FileOutputStream
import java.util.Base64
import org.apache.commons.io.IOUtils
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.udf

/**
* UDFs for data frames.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ package io.archivesunleashed.matchbox
// scalastyle:off underscore.import
import scala.collection.JavaConverters._
// scalastyle:on underscore.import
import org.apache.tika.Tika
import org.apache.tika.detect.DefaultDetector
import org.apache.tika.io.TikaInputStream
import org.apache.tika.mime.MimeTypes
import org.apache.tika.parser.AutoDetectParser
import org.apache.tika.Tika

/** Detect MIME type using Apache Tika. */
object DetectMimeTypeTika {
Expand Down
18 changes: 9 additions & 9 deletions src/main/scala/io/archivesunleashed/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,25 @@ package io
import java.security.MessageDigest
import java.util.Base64

import io.archivesunleashed.data.ArchiveRecordWritable.ArchiveFormat
import io.archivesunleashed.data.{ArchiveRecordInputFormat, ArchiveRecordWritable}
import ArchiveRecordWritable.ArchiveFormat
import io.archivesunleashed.df.{ExtractDateDF,ExtractDomainDF}
import io.archivesunleashed.matchbox.{DetectLanguageRDD, DetectMimeTypeTika, ExtractDateRDD,
ExtractDomainRDD, ExtractImageDetails, ExtractImageLinksRDD,
ExtractLinksRDD, GetExtensionMimeRDD, RemoveHTMLRDD}
import io.archivesunleashed.matchbox.ExtractDateRDD.DateComponent
import org.apache.commons.codec.binary.Hex
import org.apache.commons.io.FilenameUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import io.archivesunleashed.matchbox.ExtractDateRDD.DateComponent.DateComponent
import java.net.URI
import java.net.URL
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.commons.codec.binary.Hex
import org.apache.commons.io.FilenameUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.LongWritable
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.{lit, udf}
import org.apache.spark.sql.types.{BinaryType, IntegerType, StringType, StructField, StructType}
import org.apache.hadoop.io.LongWritable
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.{RangePartitioner, SerializableWritable, SparkContext}
import org.apache.spark.rdd.RDD
import scala.reflect.ClassTag
import scala.util.matching.Regex

Expand Down Expand Up @@ -101,7 +101,7 @@ package object archivesunleashed {
def keepValidPagesDF(): DataFrame = {
df.filter($"crawl_date" isNotNull)
.filter(!($"url".rlike(".*robots\\.txt$")) &&
( $"mime_type_web_server".rlike("text/html") ||
( $"mime_type_web_server".rlike("text/html") ||
$"mime_type_web_server".rlike("application/xhtml+xml") ||
$"url".rlike("(?i).*htm$") ||
$"url".rlike("(?i).*html$")
Expand Down Expand Up @@ -131,7 +131,7 @@ package object archivesunleashed {
/** Filters detected URLs.
*
* @param urls a list of urls
*/
*/
def discardUrlsDF(urls: Set[String]): DataFrame = {
val filteredUrls = udf((url: String) => !urls.contains(url))
df.filter(filteredUrls($"url"))
Expand Down
2 changes: 1 addition & 1 deletion src/test/scala/io/archivesunleashed/ArcTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
package io.archivesunleashed

import com.google.common.io.Resources
import io.archivesunleashed.matchbox.{DetectLanguageRDD, DetectMimeTypeTika, ExtractLinksRDD, RemoveHTMLRDD, RemoveHTTPHeaderRDD}
import io.archivesunleashed.matchbox.ExtractDateRDD.DateComponent
import io.archivesunleashed.matchbox.{DetectLanguageRDD, DetectMimeTypeTika, ExtractLinksRDD, RemoveHTMLRDD, RemoveHTTPHeaderRDD}
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@
package io.archivesunleashed

import com.google.common.io.Resources
import org.apache.commons.io.FilenameUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}
import org.apache.commons.io.FilenameUtils

@RunWith(classOf[JUnitRunner])
class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
Expand Down
2 changes: 1 addition & 1 deletion src/test/scala/io/archivesunleashed/RecordDFTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,4 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
sc.stop()
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ import java.io.File
import java.nio.file.{Files, Paths}

import com.google.common.io.Resources
import io.archivesunleashed.{ArchiveRecord, RecordLoader}
import io.archivesunleashed.app.ExtractGraphX.{EdgeData, VertexData, VertexDataPR}
import io.archivesunleashed.matchbox.{ExtractDomainRDD, ExtractLinksRDD, WWWLink}
import io.archivesunleashed.{ArchiveRecord, RecordLoader}
import org.apache.commons.io.FileUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ package io.archivesunleashed.app

import java.io.File
import java.nio.file.{Files, Paths}
import org.apache.spark.sql.Row
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}
import org.apache.spark.sql.Row
import scala.io.Source

@RunWith(classOf[JUnitRunner])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ package io.archivesunleashed.app
import java.io.File
import java.nio.file.{Files, Paths}

import org.apache.spark.sql.Row
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}
import org.apache.spark.sql.Row

import scala.io.Source

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
*/
package io.archivesunleashed.df

import io.archivesunleashed.DataFrameLoader
import com.google.common.io.Resources
import io.archivesunleashed.DataFrameLoader
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
package io.archivesunleashed

import com.google.common.io.Resources
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
Expand Down
Loading

0 comments on commit d5c7bf7

Please sign in to comment.