diff --git a/src/main/scala/io/archivesunleashed/app/ExtractGraph.scala b/src/main/scala/io/archivesunleashed/app/ExtractGraph.scala
deleted file mode 100644
index 3d93ab76..00000000
--- a/src/main/scala/io/archivesunleashed/app/ExtractGraph.scala
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Archives Unleashed Toolkit (AUT):
- * An open-source toolkit for analyzing web archives.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package io.archivesunleashed.app
-
-import io.archivesunleashed.ArchiveRecord
-import io.archivesunleashed.matchbox.{ExtractLinks, ExtractDomain, WWWLink}
-import io.archivesunleashed.util.JsonUtils
-// scalastyle:off underscore.import
-import org.apache.spark.graphx._
-// scalastyle:on underscore.import
-import org.apache.spark.rdd.RDD
-
-/** Extracts a network graph using Spark's GraphX utility. */
-@deprecated("Use ExtractGraphX instead.", "0.16.1")
-object ExtractGraph {
-  val TOLERANCE: Double = 0.005
-  val NUM_ITER: Int = 20
-
-  /** Creates a hashcode from a url to use as a unique id.
-    *
-    * @param url
-    * @return unique id as long integer.
-    */
-  def pageHash(url: String): VertexId = {
-    url.hashCode.toLong
-  }
-
-  case class VertexData(domain: String, pageRank: Double, inDegree: Int, outDegree: Int)
-  case class EdgeData(date: String, src: String, dst: String)
-
-  /** Creates a network graph from loaded Archive Records with optional pageRank calculations.
-    *
-    * @param records an RDD of archive records
-    * @param dynamic whether to calculate PageRank (an O(n^2) calculation, so not
-    *                recommended for very large graphs)
-    * @param tolerance the percentage of the time the PR algorithm "jumps" to
-    *                  a random location in its random walks
-    * @param numIter the number of iterations applied to the PR algorithm
-    * @return a Graph object containing data for vertices and edges as extracted.
-    */
-  def apply(records: RDD[ArchiveRecord], dynamic: Boolean = false,
-            tolerance: Double = TOLERANCE, numIter: Int = NUM_ITER): Graph[VertexData, EdgeData] = {
-    val extractedLinks = records.keepValidPages()
-      .map(r => (r.getCrawlDate, ExtractLinks(r.getUrl, r.getContentString)))
-      .flatMap(r => r._2.map(f => (r._1, ExtractDomain(f._1).removePrefixWWW(), ExtractDomain(f._2).removePrefixWWW())))
-      .filter(r => r._2 != "" && r._3 != "")
-      .persist()
-
-    val vertices: RDD[(VertexId, VertexData)] = extractedLinks
-      .flatMap(r => List(r._2, r._3))
-      .distinct
-      .map(r => (pageHash(r), VertexData(r, 0.0, 0, 0)))
-
-    val edges: RDD[Edge[EdgeData]] = extractedLinks
-      .map(r => Edge(pageHash(r._2), pageHash(r._3), EdgeData(r._1, r._2, r._3)))
-
-    val graph = Graph(vertices, edges)
-
-    val graphInOut = graph.outerJoinVertices(graph.inDegrees) {
-      case (vid, rv, inDegOpt) => VertexData(rv.domain, rv.pageRank, inDegOpt.getOrElse(0), rv.outDegree)
-    }.outerJoinVertices(graph.outDegrees) {
-      case (vid, rv, outDegOpt) => VertexData(rv.domain, rv.pageRank, rv.inDegree, outDegOpt.getOrElse(0))
-    }
-
-    if (dynamic) {
-      graphInOut.outerJoinVertices(graph.pageRank(tolerance).vertices) {
-        case (vid, rv, pageRankOpt) => VertexData(rv.domain, pageRankOpt.getOrElse(0.0), rv.inDegree, rv.outDegree)
-      }
-    } else {
-      graphInOut.outerJoinVertices(graph.staticPageRank(numIter).vertices) {
-        case (vid, rv, pageRankOpt) => VertexData(rv.domain, pageRankOpt.getOrElse(0.0), rv.inDegree, rv.outDegree)
-      }
-    }
-  }
-
-  /** Writes a Graph object to a Json file.
-    *
-    * @constructor graph - a SparkX graph object containing vertex and edge data
-    * @return Unit().
-    */
-  implicit class GraphWriter(graph: Graph[VertexData, EdgeData]) {
-    /** Writes a graph object to json files containing vertex and edge data.
-      *
-      * @param verticesPath Filepath for vertices output
-      * @param edgesPath Filepath for edges output
-      * @return Unit().
-      */
-    def writeAsJson(verticesPath: String, edgesPath: String): Unit = {
-      // Combine edges of a given (date, src, dst) combination into single record with count value.
-      val edgesCounted = graph.edges.countItems().map {
-        r => Map("date" -> r._1.attr.date,
-                 "src" -> r._1.attr.src,
-                 "dst" -> r._1.attr.dst,
-                 "count" -> r._2)
-      }
-      edgesCounted.map(r => JsonUtils.toJson(r)).saveAsTextFile(edgesPath)
-      graph.vertices.map(r => JsonUtils.toJson(r._2)).saveAsTextFile(verticesPath)
-    }
-  }
-}
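`ExtractGraph` is removed in favour of `ExtractGraphX`. For readers following along, here is a minimal, self-contained sketch of the GraphX pattern the deleted `apply` method implemented: hash each domain string to a `VertexId`, assemble a `Graph` from vertex and edge RDDs, run PageRank, and join the scores back onto the vertices. The object name and sample link pairs are hypothetical, not part of AUT; only standard Spark GraphX calls (`Graph`, `staticPageRank`, `outerJoinVertices`) are used.

```scala
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical object name; a sketch only, not part of AUT.
object PageRankSketch {
  // Hash a domain string to a VertexId, mirroring the deleted pageHash helper.
  def pageHash(url: String): VertexId = url.hashCode.toLong

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[4]").setAppName("pagerank-sketch"))

    // Stand-in (src, dst) domain pairs; the deleted code derived these from
    // archive records via ExtractLinks and ExtractDomain.
    val links = sc.parallelize(Seq(("a.com", "b.com"), ("b.com", "c.com"), ("a.com", "c.com")))

    // One vertex per distinct domain, one edge per extracted link.
    val vertices = links.flatMap { case (src, dst) => Seq(src, dst) }
      .distinct()
      .map(domain => (pageHash(domain), domain))
    val edges = links.map { case (src, dst) => Edge(pageHash(src), pageHash(dst), 1) }
    val graph = Graph(vertices, edges)

    // Fixed-iteration PageRank, then join the scores back onto the domain
    // names, as the deleted apply did in its non-dynamic branch.
    val ranks = graph.staticPageRank(20).vertices
    val ranked = graph.outerJoinVertices(ranks) {
      case (_, domain, rankOpt) => (domain, rankOpt.getOrElse(0.0))
    }
    ranked.vertices.values.collect().foreach(println)

    sc.stop()
  }
}
```

As in the deleted code, `staticPageRank` runs a fixed number of iterations, while `pageRank(tolerance)` iterates until convergence; the static variant is the cheaper and more predictable default for large graphs.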
diff --git a/src/test/scala/io/archivesunleashed/app/ExtractGraphTest.scala b/src/test/scala/io/archivesunleashed/app/ExtractGraphTest.scala
deleted file mode 100644
index 65c1d6cb..00000000
--- a/src/test/scala/io/archivesunleashed/app/ExtractGraphTest.scala
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Archives Unleashed Toolkit (AUT):
- * An open-source toolkit for analyzing web archives.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.archivesunleashed.app
-
-import java.io.File
-import java.nio.file.{Files, Paths}
-
-import com.google.common.io.Resources
-// scalastyle:off underscore.import
-import io.archivesunleashed._
-// scalastyle:on underscore.import
-import org.apache.commons.io.FileUtils
-import org.apache.spark.{SparkConf, SparkContext}
-import org.junit.runner.RunWith
-import org.scalatest.junit.JUnitRunner
-import org.scalatest.{BeforeAndAfter, FunSuite}
-
-import scala.util.Try
-
-// See: https://github.com/archivesunleashed/aut/pull/204/files#diff-4541b9834513985c360b64093fd45073
-// @RunWith(classOf[JUnitRunner])
-@deprecated("Replaced with ExtractGraphX", "0.16.1")
-class ExtractGraphTest extends FunSuite with BeforeAndAfter {
-  private val arcPath = Resources.getResource("arc/example.arc.gz").getPath
-  private var sc: SparkContext = _
-  private val master = "local[4]"
-  private val appName = "example-spark"
-  private val testVertexFile = "temporaryTestVertexDir"
-  private val testEdgesFile = "temporaryTestEdgesDir"
-  private val dateTest = "20080430"
-  private val urlPrimaryTest = "deadlists.com"
-  private val urlSecondaryTest = "epic.org"
-
-  before {
-    val conf = new SparkConf()
-      .setMaster(master)
-      .setAppName(appName)
-    conf.set("spark.driver.allowMultipleContexts", "true");
-    sc = new SparkContext(conf)
-  }
-
-  test("creates a network with pagerank scores") {
-    val examplerdd = RecordLoader.loadArchives(arcPath, sc)
-    val graph = ExtractGraph(examplerdd, dynamic=true)
-    val testVertexArray = Array(ExtractGraph.VertexData(urlSecondaryTest,0.15144580026750323,3,0),
-      ExtractGraph.VertexData("fepproject.org",0.15048193342250107,1,0),
-      ExtractGraph.VertexData("jou.ufl.edu",0.15048193342250107,1,0))
-    val testEdgeArray = Array(ExtractGraph.EdgeData(dateTest,urlPrimaryTest,urlPrimaryTest),
-      ExtractGraph.EdgeData(dateTest, urlPrimaryTest, urlPrimaryTest),
-      ExtractGraph.EdgeData(dateTest, urlPrimaryTest,"psilo.com"))
-    val testCount = 1000
-    assert(graph.vertices.map( r => r._2).take(3).deep == testVertexArray.deep)
-    assert(graph.edges.map( r => r.attr ).take(3).deep == testEdgeArray.deep)
-    assert(ExtractGraph.VertexData(urlSecondaryTest, 0.0, 0,0).domain == urlSecondaryTest)
-    assert(ExtractGraph.EdgeData(dateTest, urlPrimaryTest, urlPrimaryTest).date == dateTest)
-  }
-
-  test("creates a network without pagerank scores") {
-    val examplerdd = RecordLoader.loadArchives(arcPath, sc)
-    val graph = ExtractGraph(examplerdd)
-    val testVertexArray = Array(ExtractGraph.VertexData(urlSecondaryTest,0.1514714083714221,3,0),
-      ExtractGraph.VertexData("fepproject.org",0.1504904694571407,1,0),
-      ExtractGraph.VertexData("jou.ufl.edu",0.1504904694571407,1,0))
-    val testEdgeArray = Array(ExtractGraph.EdgeData(dateTest, urlPrimaryTest, urlPrimaryTest),
-      ExtractGraph.EdgeData(dateTest, urlPrimaryTest, urlPrimaryTest),
-      ExtractGraph.EdgeData(dateTest, urlPrimaryTest, "psilo.com"))
-    assert(graph.vertices.map( r => r._2).take(3).deep == testVertexArray.deep)
-    assert(graph.edges.map( r => r.attr ).take(3).deep == testEdgeArray.deep)
-
-  }
-
-  test("writes a json file") {
-    val examplerdd = RecordLoader.loadArchives(arcPath, sc)
-    val graph = ExtractGraph(examplerdd)
-    graph.writeAsJson(testVertexFile, testEdgesFile)
-    assert (Files.exists(Paths.get(testVertexFile)))
-    assert (Files.exists(Paths.get(testEdgesFile)))
-  }
-
-  after {
-    if (sc != null) {
-      sc.stop()
-    }
-    if (Files.exists(Paths.get(testVertexFile))) {
-      Try (FileUtils.deleteDirectory(new File(testVertexFile)))
-    }
-    if (Files.exists(Paths.get(testEdgesFile))) {
-      Try(FileUtils.deleteDirectory(new File(testEdgesFile)));
-    }
-  }
-}
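The deleted suite also illustrates a reusable Spark test lifecycle: build a local `SparkContext` before each test, then stop it and delete any temporary output directories afterwards. Below is a stripped-down sketch of that pattern under the same ScalaTest and commons-io dependencies the deleted file imported; the suite name, output path, and trivial assertion are hypothetical.

```scala
import java.io.File
import java.nio.file.{Files, Paths}

import org.apache.commons.io.FileUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfter, FunSuite}

import scala.util.Try

// Hypothetical suite name; a sketch of the lifecycle only.
class SparkLifecycleSketchTest extends FunSuite with BeforeAndAfter {
  private var sc: SparkContext = _
  private val outputDir = "temporaryTestDir" // hypothetical output path

  before {
    // Fresh local context per test, as the deleted suite did.
    sc = new SparkContext(new SparkConf().setMaster("local[4]").setAppName("sketch"))
  }

  test("counts a small RDD") {
    assert(sc.parallelize(Seq(1, 2, 3)).count() == 3)
  }

  after {
    // Stop the context and sweep any output directory a test produced.
    if (sc != null) {
      sc.stop()
    }
    if (Files.exists(Paths.get(outputDir))) {
      Try(FileUtils.deleteDirectory(new File(outputDir)))
    }
  }
}
```

Wrapping the directory deletion in `Try` keeps teardown from failing the suite when a test never wrote output, which is why the deleted `after` block used the same guard.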