Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scala imports cleanup. #398

Merged
merged 1 commit into from
Jan 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/main/scala/io/archivesunleashed/ArchiveRecord.scala
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,18 @@

package io.archivesunleashed

import java.text.SimpleDateFormat
import java.io.ByteArrayInputStream
import java.security.MessageDigest
import java.text.SimpleDateFormat

import io.archivesunleashed.data.{ArcRecordUtils, WarcRecordUtils, ArchiveRecordWritable}
import io.archivesunleashed.matchbox.{ComputeMD5RDD, ExtractDateRDD, ExtractDomainRDD, RemoveHTTPHeaderRDD}
import org.apache.commons.httpclient.{Header, HttpParser, StatusLine}
import org.apache.spark.SerializableWritable
import org.archive.io.arc.ARCRecord
import org.archive.io.warc.WARCRecord
import org.archive.util.ArchiveUtils
import scala.util.Try
import org.apache.commons.httpclient.{Header, HttpParser, StatusLine}

/** Trait for a record in a web archive. */
trait ArchiveRecord extends Serializable {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.{SparkConf, SparkContext}
import org.rogach.scallop.ScallopConf
import org.rogach.scallop.exceptions.ScallopException
import org.rogach.scallop.ScallopConf

/* Usage:
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@

package io.archivesunleashed.app

import io.archivesunleashed.{ArchiveRecord, DataFrameLoader, CountableRDD}
import io.archivesunleashed.matchbox
import io.archivesunleashed.df
import io.archivesunleashed.matchbox
import io.archivesunleashed.{ArchiveRecord, DataFrameLoader, CountableRDD}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@

package io.archivesunleashed.app

import io.archivesunleashed.{ArchiveRecord, DataFrameLoader, CountableRDD}
import io.archivesunleashed.matchbox.{ExtractDomainRDD, ExtractLinksRDD}
import io.archivesunleashed.df
import io.archivesunleashed.matchbox.{ExtractDomainRDD, ExtractLinksRDD}
import io.archivesunleashed.{ArchiveRecord, DataFrameLoader, CountableRDD}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
Expand Down
4 changes: 2 additions & 2 deletions src/main/scala/io/archivesunleashed/app/ExtractEntities.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
*/
package io.archivesunleashed.app

import io.archivesunleashed.RecordLoader
import io.archivesunleashed.matchbox.{ComputeMD5RDD, NERClassifier, RemoveHTMLRDD}
import org.apache.spark.SparkContext
import io.archivesunleashed.RecordLoader
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext

/** Performs Named Entity Recognition (NER) on a WARC or ARC file.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@
package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.{RangePartitioner, SparkContext}
import org.apache.spark.sql.functions.{desc,first}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.{RangePartitioner, SparkContext}

/** Extract most popular images from a Data Frame. */
object ExtractPopularImagesDF {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@

package io.archivesunleashed.app

import io.archivesunleashed.{ArchiveRecord, df}
import io.archivesunleashed.matchbox.RemoveHTMLRDD
import io.archivesunleashed.{ArchiveRecord, df}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

Expand Down
4 changes: 2 additions & 2 deletions src/main/scala/io/archivesunleashed/app/WriteGraphXML.scala
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@
* limitations under the License.
*/
package io.archivesunleashed.app
import org.apache.spark.graphx.Graph
import io.archivesunleashed.app.ExtractGraphX.{EdgeData, VertexData, VertexDataPR}
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import io.archivesunleashed.app.ExtractGraphX.{EdgeData, VertexData, VertexDataPR}
import org.apache.spark.graphx.Graph

import org.apache.spark.rdd.RDD

Expand Down
6 changes: 3 additions & 3 deletions src/main/scala/io/archivesunleashed/df/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@
*/
package io.archivesunleashed

import org.apache.commons.io.IOUtils
import io.archivesunleashed.matchbox.{ComputeMD5RDD}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.DataFrame
import java.io.ByteArrayInputStream
import java.io.FileOutputStream
import java.util.Base64
import org.apache.commons.io.IOUtils
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.udf

/**
* UDFs for data frames.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ package io.archivesunleashed.matchbox
// scalastyle:off underscore.import
import scala.collection.JavaConverters._
// scalastyle:on underscore.import
import org.apache.tika.Tika
import org.apache.tika.detect.DefaultDetector
import org.apache.tika.io.TikaInputStream
import org.apache.tika.mime.MimeTypes
import org.apache.tika.parser.AutoDetectParser
import org.apache.tika.Tika

/** Detect MIME type using Apache Tika. */
object DetectMimeTypeTika {
Expand Down
18 changes: 9 additions & 9 deletions src/main/scala/io/archivesunleashed/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,25 @@ package io
import java.security.MessageDigest
import java.util.Base64

import io.archivesunleashed.data.ArchiveRecordWritable.ArchiveFormat
import io.archivesunleashed.data.{ArchiveRecordInputFormat, ArchiveRecordWritable}
import ArchiveRecordWritable.ArchiveFormat
import io.archivesunleashed.df.{ExtractDateDF,ExtractDomainDF}
import io.archivesunleashed.matchbox.{DetectLanguageRDD, DetectMimeTypeTika, ExtractDateRDD,
ExtractDomainRDD, ExtractImageDetails, ExtractImageLinksRDD,
ExtractLinksRDD, GetExtensionMimeRDD, RemoveHTMLRDD}
import io.archivesunleashed.matchbox.ExtractDateRDD.DateComponent
import org.apache.commons.codec.binary.Hex
import org.apache.commons.io.FilenameUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import io.archivesunleashed.matchbox.ExtractDateRDD.DateComponent.DateComponent
import java.net.URI
import java.net.URL
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.commons.codec.binary.Hex
import org.apache.commons.io.FilenameUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.LongWritable
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.{lit, udf}
import org.apache.spark.sql.types.{BinaryType, IntegerType, StringType, StructField, StructType}
import org.apache.hadoop.io.LongWritable
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.{RangePartitioner, SerializableWritable, SparkContext}
import org.apache.spark.rdd.RDD
import scala.reflect.ClassTag
import scala.util.matching.Regex

Expand Down Expand Up @@ -101,7 +101,7 @@ package object archivesunleashed {
def keepValidPagesDF(): DataFrame = {
df.filter($"crawl_date" isNotNull)
.filter(!($"url".rlike(".*robots\\.txt$")) &&
( $"mime_type_web_server".rlike("text/html") ||
( $"mime_type_web_server".rlike("text/html") ||
$"mime_type_web_server".rlike("application/xhtml+xml") ||
$"url".rlike("(?i).*htm$") ||
$"url".rlike("(?i).*html$")
Expand Down Expand Up @@ -131,7 +131,7 @@ package object archivesunleashed {
/** Filters detected URLs.
*
* @param urls a list of urls
*/
*/
def discardUrlsDF(urls: Set[String]): DataFrame = {
val filteredUrls = udf((url: String) => !urls.contains(url))
df.filter(filteredUrls($"url"))
Expand Down
2 changes: 1 addition & 1 deletion src/test/scala/io/archivesunleashed/ArcTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
package io.archivesunleashed

import com.google.common.io.Resources
import io.archivesunleashed.matchbox.{DetectLanguageRDD, DetectMimeTypeTika, ExtractLinksRDD, RemoveHTMLRDD, RemoveHTTPHeaderRDD}
import io.archivesunleashed.matchbox.ExtractDateRDD.DateComponent
import io.archivesunleashed.matchbox.{DetectLanguageRDD, DetectMimeTypeTika, ExtractLinksRDD, RemoveHTMLRDD, RemoveHTTPHeaderRDD}
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@
package io.archivesunleashed

import com.google.common.io.Resources
import org.apache.commons.io.FilenameUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}
import org.apache.commons.io.FilenameUtils

@RunWith(classOf[JUnitRunner])
class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
Expand Down
2 changes: 1 addition & 1 deletion src/test/scala/io/archivesunleashed/RecordDFTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,4 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
sc.stop()
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ import java.io.File
import java.nio.file.{Files, Paths}

import com.google.common.io.Resources
import io.archivesunleashed.{ArchiveRecord, RecordLoader}
import io.archivesunleashed.app.ExtractGraphX.{EdgeData, VertexData, VertexDataPR}
import io.archivesunleashed.matchbox.{ExtractDomainRDD, ExtractLinksRDD, WWWLink}
import io.archivesunleashed.{ArchiveRecord, RecordLoader}
import org.apache.commons.io.FileUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ package io.archivesunleashed.app

import java.io.File
import java.nio.file.{Files, Paths}
import org.apache.spark.sql.Row
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}
import org.apache.spark.sql.Row
import scala.io.Source

@RunWith(classOf[JUnitRunner])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ package io.archivesunleashed.app
import java.io.File
import java.nio.file.{Files, Paths}

import org.apache.spark.sql.Row
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}
import org.apache.spark.sql.Row

import scala.io.Source

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
*/
package io.archivesunleashed.df

import io.archivesunleashed.DataFrameLoader
import com.google.common.io.Resources
import io.archivesunleashed.DataFrameLoader
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
package io.archivesunleashed

import com.google.common.io.Resources
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
Expand Down
Loading