Skip to content

Commit

Permalink
Add doc comments for webpages and webgraph; resolves #392. (#394)
Browse files Browse the repository at this point in the history
  • Loading branch information
ruebot authored and ianmilligan1 committed Dec 18, 2019
1 parent 8eb43ff commit 99e9d06
Showing 1 changed file with 2 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/main/scala/io/archivesunleashed/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ package object archivesunleashed {
&& r.getHttpStatus == "200")
}

/** Extracts webpages with columns for crawl date, url, MIME type, and content. */
def webpages(): DataFrame = {
val records = rdd.keepValidPages()
.map(r => Row(r.getCrawlDate, r.getUrl, r.getMimeType,
Expand All @@ -201,6 +202,7 @@ package object archivesunleashed {
sqlContext.getOrCreate().createDataFrame(records, schema)
}

/** Extracts a webgraph with columns for crawl date, source url, destination url, and anchor text. */
def webgraph(): DataFrame = {
val records = rdd
.keepValidPages()
Expand Down

0 comments on commit 99e9d06

Please sign in to comment.