-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add ExtractGraphX including algorithms for PageRank and Components. Issue 203 (#245) * pom.xml change for GraphX * Changes for GraphXSLS * Changes for SLS graph * Changes for GraphX * Changes for converting WARC RDD to GraphX object * Rename extractor to ExtractGraphX * Various lint fixes (usually Magic Numbers) * Remove illegal imports from scala style (we use wildcard imports a lot) * Add WriteGraphXMLTest.
- Loading branch information
1 parent
290b6aa
commit afe9254
Showing
16 changed files
with
421 additions
and
38 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
95 changes: 95 additions & 0 deletions
95
src/main/scala/io/archivesunleashed/app/ExtractGraphX.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
/* | ||
* Archives Unleashed Toolkit (AUT): | ||
* An open-source platform for analyzing web archives. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package io.archivesunleashed.app | ||
|
||
import io.archivesunleashed._ | ||
import io.archivesunleashed.matchbox._ | ||
import org.apache.spark.graphx._ | ||
import org.apache.spark.rdd.RDD | ||
|
||
/** Extracts a site link structure using Spark's GraphX utility. */
object ExtractGraphX {

  /** Default convergence tolerance for dynamic PageRank. */
  val DEF_TOLERANCE = 0.005
  /** Default iteration count for static PageRank and strongly connected components. */
  val DEF_NUMITER = 20
  /** Default reset probability (chance the random walk restarts at a random vertex). */
  val DEF_RESET = 0.15

  /* convergence tolerance for dynamic page rank */
  var tolerance: Double = DEF_TOLERANCE
  /* number of iterations for static page rank and strongly connected components */
  var numIter: Int = DEF_NUMITER
  /* probability page rank "walk" will start from a random position */
  var resetProbability: Double = DEF_RESET
  /* whether to calculate dynamic (run-until-convergence) page rank */
  var dynamic: Boolean = false
  /* page rank value used for vertices absent from the PageRank result */
  val defaultPR: Double = 0.0
  /* component id used for vertices absent from a components result */
  val defaultComponent: Long = 0

  /** Vertex payload before graph algorithms run: the page URL. */
  case class VertexData(url: String)
  /** Edge payload: number of links collapsed into this edge. */
  case class EdgeData(edgeCount: Int)
  /** Vertex payload after algorithms run: URL, PageRank score, and
    * weakly / strongly connected component ids.
    */
  case class VertexDataPR(url: String, pageRank: Double, weak: Long, strong: Long)

  /** Creates a hashcode from a url to use as a unique id.
    *
    * NOTE(review): String.hashCode is 32-bit, so distinct URLs can collide
    * and be merged into one vertex — acceptable for approximate link graphs,
    * but worth confirming for exact analyses.
    *
    * @param url the url to hash
    * @return unique id as long integer.
    */
  def pageHash(url: String): VertexId = {
    url.hashCode.toLong
  }

  /** Creates a GraphX object from link pairs.
    *
    * Parallel edges between the same pair of pages are merged into a single
    * edge whose edgeCount is the number of original links.
    *
    * @param records an RDD of tuples (source, destination)
    * @return a GraphX object
    */
  def extractGraphX(records: RDD[(String, String)]): Graph[VertexData, EdgeData] = {
    // Persist: the link RDD is traversed twice (vertices and edges).
    val extractedLinks = records.persist()
    val vertices: RDD[(VertexId, VertexData)] = extractedLinks
      .flatMap(r => List(r._1, r._2))
      .distinct
      .map(r => (pageHash(r), VertexData(r)))
    val edges: RDD[Edge[EdgeData]] = extractedLinks
      .map(r => Edge(pageHash(r._1), pageHash(r._2), EdgeData(1)))
    // groupEdges requires co-partitioned edges, hence the partitionBy first.
    Graph(vertices, edges)
      .partitionBy(PartitionStrategy.RandomVertexCut)
      .groupEdges((e1, e2) => EdgeData(e1.edgeCount + e2.edgeCount))
  }

  /** Calculates basic graph data (PageRank, weak and strong components) for a graph.
    *
    * Vertices missing from an algorithm's result fall back to defaultPR /
    * defaultComponent.
    *
    * @param graph GraphX object
    * @return new graph object with additional attributes
    */
  def runPageRankAlgorithm(graph: Graph[VertexData, EdgeData]): Graph[VertexDataPR, EdgeData] = {
    if (dynamic) {
      // BUG FIX: GraphX's dynamic pageRank(tol, resetProb) takes a convergence
      // tolerance as its first argument. The previous code passed numIter here
      // (silently widened Int -> Double), so the declared `tolerance` setting
      // was never used and the run iterated until an error of ~numIter (i.e.
      // effectively a single pass). Pass `tolerance` as intended.
      graph.outerJoinVertices(graph.pageRank(tolerance, resetProbability).vertices) {
        case (id, vd, pr) => (vd, pr)
      }.outerJoinVertices(graph.connectedComponents().vertices) {
        case (id, (vd, pr), cc) => (vd, pr, cc)
      }.outerJoinVertices(graph.stronglyConnectedComponents(numIter).vertices) {
        case (id, (vd, pr, cc), scc) =>
          VertexDataPR(vd.url, pr.getOrElse(defaultPR), cc.getOrElse(defaultComponent), scc.getOrElse(defaultComponent))
      }
    } else {
      graph.outerJoinVertices(graph.staticPageRank(numIter, resetProbability).vertices) {
        case (id, vd, pr) => (vd, pr)
      }.outerJoinVertices(graph.connectedComponents().vertices) {
        case (id, (vd, pr), cc) => (vd, pr, cc)
      }.outerJoinVertices(graph.stronglyConnectedComponents(numIter).vertices) {
        case (id, (vd, pr, cc), scc) =>
          VertexDataPR(vd.url, pr.getOrElse(defaultPR), cc.getOrElse(defaultComponent), scc.getOrElse(defaultComponent))
      }
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,6 +71,6 @@ object WriteGraphML { | |
outFile.write("</graph>\n" + | ||
"</graphml>") | ||
outFile.close() | ||
return true | ||
true | ||
} | ||
} |
95 changes: 95 additions & 0 deletions
95
src/main/scala/io/archivesunleashed/app/WriteGraphXML.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
/* | ||
* Archives Unleashed Toolkit (AUT): | ||
* An open-source platform for analyzing web archives. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package io.archivesunleashed.app | ||
|
||
import io.archivesunleashed.matchbox._ | ||
|
||
import java.nio.charset.StandardCharsets | ||
import java.nio.file.{Files, Paths} | ||
import io.archivesunleashed.app.ExtractGraphX.{VertexData,EdgeData,VertexDataPR} | ||
import org.apache.spark.graphx._ | ||
import org.apache.spark.rdd.RDD | ||
|
||
/**
  * UDF for exporting a GraphX object representing a collection of links to a GraphML file.
  */
object WriteGraphXML {
  /** Writes graphX object nodes and edges to file.
    *
    * @param graph GraphX object of type Graph[VertexDataPR, EdgeData]
    * @param graphmlPath output file
    * @return true on successful write, false if graphmlPath is empty.
    */
  def apply(graph: Graph[VertexDataPR, EdgeData], graphmlPath: String): Boolean = {
    if (graphmlPath.isEmpty()) {
      false
    } else {
      makeFile(graph, graphmlPath)
    }
  }

  /** Produces the GraphML output from a GraphX object and outputs it to graphmlPath.
    *
    * NOTE(review): vertex URLs are written into XML without escaping; URLs
    * containing &, < or " will produce malformed XML — confirm whether an
    * escape helper from matchbox should be applied here.
    *
    * @param graph GraphX object of type Graph[VertexDataPR, EdgeData]
    * @param graphmlPath output file
    * @return true on successful run.
    */
  def makeFile(graph: Graph[VertexDataPR, EdgeData], graphmlPath: String): Boolean = {
    val outFile = Files.newBufferedWriter(Paths.get(graphmlPath), StandardCharsets.UTF_8)

    val END_DATA_XML: String = "</data>\n"

    // BUG FIX: nodes were previously generated per-triplet (source and
    // destination of every edge), and the .distinct only removed identical
    // triplets — so any vertex shared by two or more edges was written
    // multiple times, yielding duplicate <node id=...> entries (invalid
    // GraphML). Derive each node exactly once from the vertex RDD instead.
    val nodes: Array[String] = graph.vertices.map { case (id, vd) =>
      "<node id=\"" + id + "\">\n" +
        "<data key=\"pageRank\">" + vd.pageRank + END_DATA_XML +
        "<data key=\"connectedComponent\">" + vd.weak + END_DATA_XML +
        "<data key=\"stronglyConnectedComponent\">" + vd.strong + END_DATA_XML +
        "<data key=\"label\">" + vd.url + END_DATA_XML + "</node>\n"
    }.collect

    val edges: Array[String] = graph.edges.map { e =>
      "<edge source=\"" + e.srcId + "\" target=\"" + e.dstId + "\" type=\"directed\">\n" +
        "<data key=\"weight\">" + e.attr.edgeCount + END_DATA_XML +
        "</edge>\n"
    }.collect

    // Close the writer even if a write fails, so no file handle leaks.
    try {
      outFile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
        "<graphml xmlns=\"http://graphml.graphdrawing.org/xmlns\"\n" +
        " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n" +
        " xsi:schemaLocation=\"http://graphml.graphdrawing.org/xmlns\n" +
        " http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd\">\n" +
        "<key id=\"label\" for=\"node\" attr.name=\"label\" attr.type=\"string\" />\n" +
        "<key id=\"weight\" for=\"edge\" attr.name=\"weight\" attr.type=\"double\">\n" +
        "<default>0.0</default>\n" +
        "</key>\n" +
        "<key id=\"pageRank\" for=\"node\" attr.name=\"pageRank\" " +
        "attr.type=\"double\" />\n" +
        "<key id=\"stronglyConnectedComponent\" for=\"node\" " +
        "attr.name=\"stronglyConnectedComponent\" attr.type=\"int\" />\n" +
        "<key id=\"connectedComponent\" for=\"node\" " +
        "attr.name=\"connectedComponent\" attr.type=\"int\" />\n" +
        "<graph mode=\"static\" edgedefault=\"directed\">\n")
      outFile.write("<nodes>\n")
      nodes.foreach(outFile.write)
      outFile.write("\n</nodes>\n<edges>\n")
      edges.foreach(outFile.write)
      outFile.write("\n</edges>\n")
      outFile.write("</graph>\n" +
        "</graphml>")
      true
    } finally {
      outFile.close()
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.