From 71ee62e03afe42e4cc387d5c08aa294f6599d834 Mon Sep 17 00:00:00 2001 From: nruest Date: Sat, 11 Jun 2022 13:02:18 -0400 Subject: [PATCH] 1.0.0 --- .../version-1.0.0/auk-derivatives.md | 279 ++++ .../version-1.0.0/aut-at-scale.md | 124 ++ .../version-1.0.0/aut-spark-submit-app.md | 415 ++++++ .../version-1.0.0/binary-analysis.md | 1123 +++++++++++++++++ .../version-1.0.0/dataframe-schemas.md | 156 +++ .../version-1.0.0/extract-binary-info.md | 177 +++ .../version-1.0.0/extract-binary.md | 117 ++ .../version-1.0.0/filters-df.md | 328 +++++ .../version-1.0.0/image-analysis.md | 357 ++++++ .../version-1.0.0/link-analysis.md | 593 +++++++++ .../version-1.0.0/text-analysis.md | 597 +++++++++ .../version-1.0.0/toolkit-walkthrough.md | 424 +++++++ website/versioned_docs/version-1.0.0/usage.md | 149 +++ website/versions.json | 1 + 14 files changed, 4840 insertions(+) create mode 100644 website/versioned_docs/version-1.0.0/auk-derivatives.md create mode 100644 website/versioned_docs/version-1.0.0/aut-at-scale.md create mode 100644 website/versioned_docs/version-1.0.0/aut-spark-submit-app.md create mode 100644 website/versioned_docs/version-1.0.0/binary-analysis.md create mode 100644 website/versioned_docs/version-1.0.0/dataframe-schemas.md create mode 100644 website/versioned_docs/version-1.0.0/extract-binary-info.md create mode 100644 website/versioned_docs/version-1.0.0/extract-binary.md create mode 100644 website/versioned_docs/version-1.0.0/filters-df.md create mode 100644 website/versioned_docs/version-1.0.0/image-analysis.md create mode 100644 website/versioned_docs/version-1.0.0/link-analysis.md create mode 100644 website/versioned_docs/version-1.0.0/text-analysis.md create mode 100644 website/versioned_docs/version-1.0.0/toolkit-walkthrough.md create mode 100644 website/versioned_docs/version-1.0.0/usage.md diff --git a/website/versioned_docs/version-1.0.0/auk-derivatives.md b/website/versioned_docs/version-1.0.0/auk-derivatives.md new file mode 100644 index 00000000..f6673f8e --- /dev/null +++ b/website/versioned_docs/version-1.0.0/auk-derivatives.md @@ -0,0 +1,279 @@ +--- +id: version-1.0.0-auk-derivatives +title: ARCH Derivatives +original_id: auk-derivatives +--- + +How do I create the Toolkit generated derivatives that the Archives +Research Compute Hub creates on my own web archive collection? + +## Scala RDD + +**Will not be implemented.** + +## Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val warcs = "/path/to/warcs/*" +val results = "/path/to/results/" + +val webpages = RecordLoader.loadArchives(warcs, sc).webpages() +val webgraph = RecordLoader.loadArchives(warcs, sc).webgraph() + +// Domain frequency. +webpages.groupBy($"domain") + .count() + .sort($"count".desc) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "domains") + +// Domain graph. +webgraph.groupBy( + $"crawl_date", + removePrefixWWW(extractDomain($"src")).as("src_domain"), + removePrefixWWW(extractDomain($"dest")).as("dest_domain") + ) + .count() + .filter(!($"dest_domain" === "")) + .filter(!($"src_domain" === "")) + .filter($"count" > 5) + .orderBy(desc("count")) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "domain-graph") + +// Image graph. 
+RecordLoader.loadArchives(warcs, sc) + .imagegraph() + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "image-graph") + +// Web graph. +webgraph.write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "web-graph") + +// Web pages. +webpages.write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "webpages") + +// Binary information. +RecordLoader.loadArchives(warcs, sc) + .audio() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "audio") + +RecordLoader.loadArchives(warcs, sc) + .images() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"width", $"height", $"md5", $"sha1") + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "image") + +RecordLoader.loadArchives(warcs, sc) + .pdfs() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "pdf") + +RecordLoader.loadArchives(warcs, sc) + .presentationProgramFiles() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "presentation-program") + +RecordLoader.loadArchives(warcs, sc) + .spreadsheets() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "spreadsheet") + +RecordLoader.loadArchives(warcs, sc) + .videos() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "video") + +RecordLoader.loadArchives(warcs, sc) + .wordProcessorFiles() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "word-processor") + +sys.exit +``` + +## Python DF + +```python +from aut import * +from pyspark.sql.functions import col, desc + +warcs = "/path/to/warcs/*" +results = "/path/to/results/" + +webpages = WebArchive(sc, sqlContext, warcs).webpages() +webgraph = WebArchive(sc, sqlContext, warcs).webgraph() + +# Domain frequency. 
+webpages.groupBy("domain") \ + .count() \ + .sort(col("count")\ + .desc()) \ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "domains") + +# Domain graph. +webgraph.groupBy("crawl_date", remove_prefix_www(extract_domain("src")).alias("src_domain"), remove_prefix_www(extract_domain("dest")).alias("dest_domain"))\ + .count()\ + .filter((col("dest_domain").isNotNull()) & (col("dest_domain") !=""))\ + .filter((col("src_domain").isNotNull()) & (col("src_domain") !=""))\ + .filter(col("count") > 5)\ + .orderBy(desc("count"))\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "domain-graph") + +# Image graph. +WebArchive(sc, sqlContext, warcs).imagegraph()\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "image-graph") + +# Web graph. +webgraph.write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "web-graph") + +# Web pages. +webpages.write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "webpages") + +# Binary information. +WebArchive(sc, sqlContext, warcs).audio()\ + .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1")\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "audio") + +WebArchive(sc, sqlContext, warcs).images()\ + .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1")\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "images") + +WebArchive(sc, sqlContext, warcs).pdfs()\ + .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1")\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "pdfs") + +WebArchive(sc, sqlContext, warcs).presentation_program()\ + .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1")\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "presentation_program") + +WebArchive(sc, sqlContext, warcs).spreadsheets()\ + .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1")\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "spreadsheets") + +WebArchive(sc, sqlContext, warcs).video()\ + .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1")\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "videos") + +WebArchive(sc, sqlContext, 
warcs).word_processor()\ + .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1")\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "word_processor") +``` diff --git a/website/versioned_docs/version-1.0.0/aut-at-scale.md b/website/versioned_docs/version-1.0.0/aut-at-scale.md new file mode 100644 index 00000000..59f41947 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/aut-at-scale.md @@ -0,0 +1,124 @@ +--- +id: version-1.0.0-aut-at-scale +title: The Toolkit at Scale +original_id: aut-at-scale +--- + +As your collections grow, you may need to provide more resources, and adjust +Apache Spark configuration options. Apache Spark has great +[Configuration](https://spark.apache.org/docs/latest/configuration.html) and +[Tuning](https://spark.apache.org/docs/latest/tuning.html) guides that are +worth checking out. If you're not sure where to start with scaling, join us in +[Slack](slack.archivesunleashed.org) in the `#aut` channel, and we might be +able to provide some guidance. + +## A Note on Memory and Cores + +As your datasets grow, you may need to provide more memory to Apache Spark. +You'll know this if you get an error saying that you have run out of "Java Heap +Space." + +You can add a +[configuration](https://spark.apache.org/docs/latest/configuration.html) option +for adjusting available memory like so: + +```shell +spark-shell --driver-memory 4G --jars /path/to/aut-1.0.0-fatjar.jar +``` + +In the above case, you give Apache Spark 4GB of memory to execute the program. + +In some other cases, despite giving AUT sufficient memory, you may still +encounter Java Heap Space issues. In those cases, it is worth trying to lower +the number of worker threads. When running locally (i.e. on a single laptop, +desktop, or server), by default AUT runs a number of threads equivalent to the +number of cores in your machine. + +On a 16-core machine, you may want to drop to 12 cores if you are having memory +issues. This will increase stability but decrease performance a bit. + +You can do so like this (the example is using 12 threads on a 16-core machine): + +```shell +spark-shell --master local[12] --driver-memory 4G --jars /path/to/aut-1.0.0-fatjar.jar +``` + +If you continue to have errors, look at your output and logs. They will usually +point you in the right direction. For instance, you may also need to increase +the network timeout value. Once in a while, AUT might get stuck on an odd +record and take longer than normal to process it. The `--conf +spark.network.timeout=10000000` will ensure that AUT continues to work on +material, although it may take a while to process. This command then works: + +```shell +spark-shell --master local[12] --driver-memory 90G --conf spark.network.timeout=10000000 --jars /path/to/aut-1.0.0-fatjar.jar +``` + +## Reading Data from AWS S3 + +We also support loading data stored in [Amazon S3](https://aws.amazon.com/s3/). +This advanced functionality requires that you provide Spark shell with your AWS +Access Key and AWS Secret Key, which you will get when creating your AWS +credentials ([read more +here](https://aws.amazon.com/blogs/security/wheres-my-secret-access-key/)). + +This script, for example, will find the top ten domains from a set of WARCs +found in an s3 bucket. 
+ +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +sc.hadoopConfiguration.set("fs.s3a.access.key", "") +sc.hadoopConfiguration.set("fs.s3a.secret.key", "") + +RecordLoader.loadArchives("s3a:///*.gz", sc) + .keepValidPages() + .map(r => ExtractDomain(r.getUrl)) + .countItems() + .take(10) +``` + +### Reading Data from an S3-like Endpoint + +We also support loading data stored in an Amazon S3-like system such as [Ceph +RADOS](https://docs.ceph.com/docs/master/rados/). Similar to the above example, +you'll need an access key and secret, and additionally, you'll need to define +your endpoint. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +sc.hadoopConfiguration.set("fs.s3a.access.key", "") +sc.hadoopConfiguration.set("fs.s3a.secret.key", "") +sc.hadoopConfiguration.set("fs.s3a.endpoint", "") + +RecordLoader.loadArchives("s3a:///*.gz", sc) + .keepValidPages() + .map(r => ExtractDomain(r.getUrl)) + .countItems() + .take(10) +``` + +### Troubleshooting S3 + +If you run into this `AmazonHttpClient` timeout error: + +```shell +19/10/24 11:12:51 INFO AmazonHttpClient: Unable to execute HTTP request: Timeout waiting for connection from pool +org.apache.http.conn.ConnectionPoolTimeoutException: Timeout waiting for connection from pool + at org.apache.http.impl.conn.PoolingClientConnectionManager.leaseConnection(PoolingClientConnectionManager.java:231) + at org.apache.http.impl.conn.PoolingClientConnectionManager$1.getConnection(PoolingClientConnectionManager.java:200) + at sun.reflect.GeneratedMethodAccessor7.invoke(Unknown Source) + at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + at java.lang.reflect.Method.invoke(Method.java:498) + at com.amazonaws.http.conn.ClientConnectionRequestFactory$Handler.invoke(ClientConnectionRequestFactory.java:70) +``` + +You can add the following two configuration lines to your script: + +```scala +sc.hadoopConfiguration.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") +sc.hadoopConfiguration.setInt("fs.s3a.connection.maximum", 100) +``` diff --git a/website/versioned_docs/version-1.0.0/aut-spark-submit-app.md b/website/versioned_docs/version-1.0.0/aut-spark-submit-app.md new file mode 100644 index 00000000..ee5d7719 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/aut-spark-submit-app.md @@ -0,0 +1,415 @@ +--- +id: version-1.0.0-aut-spark-submit-app +title: The Toolkit with spark-submit +original_id: aut-spark-submit-app +--- + +The Toolkit offers a variety of extraction jobs with +[`spark-submit`](https://spark.apache.org/docs/latest/submitting-applications.html) +. These extraction jobs have a few configuration options. + +The extraction jobs have a basic outline of: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner PATH_TO_AUT_JAR --extractor EXTRACTOR --input INPUT DIRECTORY --output OUTPUT DIRECTORY +``` + +Additional flags include: + +* `--output-format FORMAT` (`csv` (default) or `parquet`. + `DomainGraphExtractor` has two additional output options + `graphml` or `gexf`.) +* `--split` (The extractor will put results for each input file in its own + directory. Each directory name will be the name of the ARC/WARC file parsed.) +* `--partition N` (The extractor will partition the DataFrame according to N + before writing results. The is useful to combine all the results to a single + file.) 
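+
+For example, these flags can be combined with any of the extractors below. The
+following sketch (reusing the placeholder paths from the examples that follow)
+runs the `DomainFrequencyExtractor` with `--split`, so that the results for
+each input ARC/WARC file are written to their own directory:
+
+```shell
+spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainFrequencyExtractor --input /path/to/warcs/* --output output/path --split
+```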
+ +## Audio Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `url`, `filename`, `extension`, +`mime_type_web_server`, `mime_type_tika`, `md5`, and `sha1`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor AudioInformationExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor AudioInformationExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor AudioInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor AudioInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Domain Frequency + +This extractor outputs a directory of files, or a single file with the +following columns: `domain`, and `count`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainFrequencyExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainFrequencyExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainFrequencyExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainFrequencyExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Domain Graph + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `src_domain`, `dest_domain`, and `count`. In +addition to the standard text output, an additional flag `--output-format` can +output [GraphML](https://en.wikipedia.org/wiki/GraphML), or +[GEXF](https://gephi.org/gexf/format/). 
+ +CSV output: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainGraphExtractor --input /path/to/warcs/* --output output/path --output-format csv +``` + +Parquet output: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainGraphExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +GEXF output: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainGraphExtractor --input /path/to/warcs/* --output output/path --output-format gexf +``` + +GraphML output: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainGraphExtractor --input /path/to/warcs/* --output output/path --output-format graphml +``` + +## Image Graph + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `src`, `image_url`, and `alt_text`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageGraphExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +``` shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageGraphExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageGraphExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +``` shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageGraphExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Image Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `url`, `filename`, `extension`, +`mime_type_web_server`, `mime_type_tika`, `width`, `height`, `md5`, and `sha1`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageInformationExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageInformationExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## PDF Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `url`, `filename`, `extension`, +`mime_type_web_server`, `mime_type_tika`, `md5`, and `sha1`. 
+ +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PDFInformationExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PDFInformationExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PDFInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PDFInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Plain Text + +This extractor outputs a directory of files, or a single file with the +following columns: `content` (Boilerplate, HTTP headers, and HTML removed). + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PlainTextExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PlainTextExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PlainTextExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PlainTextExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Presentation Program Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `url`, `filename`, `extension`, +`mime_type_web_server`, `mime_type_tika`, `md5`, and `sha1`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PresentationProgramInformationExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PresentationProgramInformationExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PresentationProgramInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PresentationProgramInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Spreadsheet Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `url`, `filename`, `extension`, +`mime_type_web_server`, `mime_type_tika`, `md5`, and `sha1`. 
+ +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor SpreadsheetInformationExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor SpreadsheetInformationExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor SpreadsheetInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor SpreadsheetInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Video Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `url`, `filename`, `extension`, +`mime_type_web_server`, `mime_type_tika`, `md5`, and `sha1`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor VideoInformationExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor VideoInformationExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor VideoInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor VideoInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Web Graph Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `src`, `dest`, and `anchor`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebGraphExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebGraphExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebGraphExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebGraphExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Web Pages + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `domain`, `url`, +`mime_type_web_server`, `mime_type_tika`, and `content` +(HTTP headers, and HTML removed). 
+ +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebPagesExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebPagesExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebPagesExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebPagesExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Word Processor Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `url`, `filename`, `extension`, +`mime_type_web_server`, `mime_type_tika`, `md5`, and `sha1`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WordProcessorInformationExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WordProcessorInformationExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WordProcessorInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WordProcessorInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` diff --git a/website/versioned_docs/version-1.0.0/binary-analysis.md b/website/versioned_docs/version-1.0.0/binary-analysis.md new file mode 100644 index 00000000..de37c468 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/binary-analysis.md @@ -0,0 +1,1123 @@ +--- +id: version-1.0.0-binary-analysis +title: Binary Analysis +original_id: binary-analysis +--- + +## Extract Audio Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).audio(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from audio files in a web collection: + +- audio url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server|mime_type_tika| md5| sha1| bytes| 
++--------------------+--------------------+---------+--------------------+--------------+--------------------+--------------------+--------------------+ +|http://geocities....| capasoligero.mp3| mp3| audio/mpeg| audio/mpeg|fffd1aa802392be0f...|88e254b4cab7848a9...|//MozAAAAAAAAAAAA...| +|http://www.geocit...| colorwnd.mid| mid| audio/midi| audio/midi|fff3f4e8a473f7c9a...|aea92a6f32dd1a1f4...|TVRoZAAAAAYAAQAGA...| +|http://geocities....|santana_rob_thoma...| mid| audio/midi| audio/midi|ffd4a24d4e4722d94...|28576c271898a1de5...|TVRoZAAAAAYAAQASA...| +|http://geocities....| music.mid| mid| audio/midi| audio/midi|ffcbe35e28b553481...|cf1ebdbe1a070d4f6...|TVRoZAAAAAYAAAABA...| +|http://geocities....| evrythng.mid| mid| audio/midi| audio/midi|ff751c86728ff09b5...|d22fc0911d3ceb17a...|TVRoZAAAAAYAAAABA...| +|http://geocities....| evrythn2.mid| mid| audio/midi| audio/midi|ff751c86728ff09b5...|d22fc0911d3ceb17a...|TVRoZAAAAAYAAAABA...| +|http://geocities....| picket.mid| mid| audio/midi| audio/midi|ff4d225a602630584...|ecef0a851cc028853...|TVRoZAAAAAYAAQAHA...| +|http://geocities....| simpsons.mid| mid| audio/midi| audio/midi|ff3bc375860979f2f...|9c1204dad686ddeea...|TVRoZAAAAAYAAQAPA...| +|http://www.geocit...| simpsons.mid| mid| audio/midi| audio/midi|ff3bc375860979f2f...|9c1204dad686ddeea...|TVRoZAAAAAYAAQAPA...| +|http://geocities....| mypretty.wav| wav| audio/x-wav|audio/vnd.wave|ff1a5015d3a380955...|113de5c1bb2f7ddb4...|UklGRvz8AABXQVZFZ...| +|http://geocities....| song37.mid| mid| audio/midi| audio/midi|fee0a67ff7c71e35c...|ccd4fdfa0483d1058...|TVRoZAAAAAYAAAABA...| +|http://geocities....| holdyourhand.mid| mid| audio/midi| audio/midi|fed14ecd7099e3fb9...|24fe5c097db5d506a...|TVRoZAAAAAYAAQANA...| +|http://geocities....| es_tu_sangre.mid| mid| audio/midi| audio/midi|fec196e8086d868f2...|eccb1551d1e7b236e...|TVRoZAAAAAYAAQASA...| +|http://www.geocit...| virgin.mid| mid| audio/midi| audio/midi|fec0ce795723b1287...|cc651312b1d57fe64...|TVRoZAAAAAYAAQAMA...| +|http://www.geocit...|tonibraxtonunbrea...| wav| audio/x-wav|audio/vnd.wave|feb7e31a8edb0a484...|9420bdeece0f23b78...|UklGRtQoCgBXQVZFZ...| +|http://geocities....| comeandsee.mid| mid| audio/midi| audio/midi|feb513cd7b6fab9cc...|51b4c2bb113cb43aa...|TVRoZAAAAAYAAAABA...| +|http://geocities....| song186t.mid| mid| audio/midi| audio/midi|fead61a5a439675a3...|c652eda8a4ec5d197...|TVRoZAAAAAYAAAABA...| +|http://geocities....| be_magnified.mid| mid| audio/midi| audio/midi|feac0e996e1555d84...|f51ec1e62a166fa82...|TVRoZAAAAAYAAQAPA...| +|http://geocities....| EVERYBOD.MID| mid| audio/midi| audio/midi|fea911b19f0cf709d...|58bcd1b3c0288cbe0...|TVRoZAAAAAYAAQAUA...| +|http://www.geocit...| ff9waltz.mid| mid| audio/midi| audio/midi|fe9eb1ea6d4b53a9f...|72e2467bfea6240b8...|TVRoZAAAAAYAAQAKA...| ++--------------------+--------------------+---------+--------------------+--------------+--------------------+--------------------+--------------------+ +``` + +If you wanted to work with all the audio files in a collection, you could +extract them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).audio(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.audio() +df.show() +``` + +Will extract all following 
information from audio files in a web collection: + +- audio url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server|mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------+--------------------+--------------------+--------------------+ +|http://www.geocit...| hc-tibet.wav| wav| audio/x-wav|audio/vnd.wave|416ad26133f63dc3e...|dfb764d759187d102...|UklGRg6eAABXQVZFZ...| +|http://geocities....|bookmarkthissite.wav| wav| audio/x-wav|audio/vnd.wave|7897ff71780a903ca...|cfb942aeb3bc881cd...|UklGRppkAABXQVZFZ...| +|http://geocities....| NeilYoung-Hey.mp3| mp3| audio/mpeg| audio/mpeg|40869eb3181e6035b...|19fa693521cd8125c...|//uQRAAAAcAAsNUEA...| +|http://geocities....| misty1.mp3| mp3| audio/mpeg| audio/mpeg|d8cb3ce54072a7d4b...|43b92e16932c13a43...|//uQBAAAAsJl22mBE...| +|http://geocities....| sale.mid| mid| audio/midi| audio/midi|5dfc0c3dd884e50c7...|071840b4822ae5e80...|TVRoZAAAAAYAAQALA...| +|http://geocities....| swaplink.mid| mid| audio/midi| audio/midi|f32117ce2bffa9902...|0346223861c87acc1...|TVRoZAAAAAYAAQALA...| +|http://geocities....| m5.mid| mid| audio/midi| audio/midi|7e5eedebafecd26c4...|393dfbc00c49fcdc9...|TVRoZAAAAAYAAQAJA...| +|http://geocities....| morder.mid| mid| audio/midi| audio/midi|6cec0785377f5bbaf...|a94f0a75c0c3b3cf5...|TVRoZAAAAAYAAQAMA...| +|http://geocities....| m2.mid| mid| audio/midi| audio/midi|58b0102f997e689a2...|51ad469ebc931e160...|TVRoZAAAAAYAAQALA...| +|http://geocities....| music.mid| mid| audio/midi| audio/midi|7917a5a9d6ddfb8dd...|009db9df73cdf5247...|TVRoZAAAAAYAAQALA...| +|http://www.geocit...| hcpopeye.wav| wav| audio/x-wav|audio/vnd.wave|04d7b45c70e0a496e...|9db0e61c16554af88...|UklGRrbAAABXQVZFZ...| +|http://geocities....| m7.mid| mid| audio/midi| audio/midi|3906ecaba32ba15a8...|e0d6e9f1c86b6204e...|TVRoZAAAAAYAAQAHA...| +|http://geocities....| words.mid| mid| audio/midi| audio/midi|30da01a4ed42ae469...|160b2e5aaa9b95641...|TVRoZAAAAAYAAQAIA...| +|http://geocities....| brock5.mp3| mp3| audio/mpeg| audio/mpeg|17f4e1c7a007983a5...|3bbdb27fafa4e8b12...|//MozAANkCLE/gjGA...| +|http://geocities....| brock1.mp3| mp3| audio/mpeg| audio/mpeg|67db65825afc326ed...|2ec4ac110cff19134...|//MozAAMyX7VmBjGl...| +|http://geocities....| funkytown.wav| wav| audio/x-wav|audio/vnd.wave|6f841bcffe4bbb61d...|ab1fdb143d5752cf1...|UklGRlLOCQBXQVZFZ...| +|http://geocities....| welcomemyworld.mid| mid| audio/midi| audio/midi|c546eac675e2dd974...|cb4f1fa32aa1e3205...|TVRoZAAAAAYAAQAMA...| +|http://www.geocit...| irisheye.mid| mid| audio/midi| audio/midi|d906f32953742fdef...|f3ca7449483b0ea65...|TVRoZAAAAAYAAQAFA...| +|http://geocities....| mission21.mid| mid| audio/midi| audio/midi|c507304afe6cddba1...|72a74c1914044746f...|TVRoZAAAAAYAAQAVA...| +|http://geocities....| tellit1.mid| mid| audio/midi| audio/midi|a604ae85251d55504...|95096668900a76dc8...|TVRoZAAAAAYAAQAQA...| ++--------------------+--------------------+---------+--------------------+--------------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` + +## Extract Image Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following 
script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).images(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"width", $"height", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from images in a web collection: + +- image url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- Width +- Height +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server|mime_type_tika|width|height| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +|http://www.archiv...|mediatype_movies.gif| gif| image/gif| image/gif| 21| 21|ff05f9b408519079c...|194800d702aab9b87...|R0lGODlhFQAVAKUpA...| +|http://www.archiv...| LOCLogoSmall.jpg| jpg| image/jpeg| image/jpeg| 275| 300|fbf1aec668101b960...|564c1a07152c12cea...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...| archive.small.jpg| jpg| image/jpeg| image/jpeg| 300| 225|f611b554b9a44757d...|e9bf7ef0ae3fc50f5...|/9j/4RpBRXhpZgAAT...| +|http://tsunami.ar...| tsunamiweb1_02.jpg| jpg| image/jpeg| image/jpeg| 384| 229|f02005e29ffb485ca...|9eeb9c3c67d7efc51...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...|alexa_websearch_l...| gif| image/gif| image/gif| 301| 47|eecc909992272ce0d...|ea18e226f3cf40005...|R0lGODlhLQEvAPcAA...| +|http://www.archiv...| lizardtech.gif| gif| image/gif| image/gif| 140| 37|e7166743861126e51...|cf26e9ffc27be133f...|R0lGODlhjAAlANUwA...| +|http://www.archiv...| half_star.png| png| image/png| image/png| 14| 12|e1e101f116d9f8251...|736abd06a978e2fd2...|iVBORw0KGgoAAAANS...| +|http://www.archiv...| hewlett.jpg| jpg| image/jpeg| image/jpeg| 300| 116|e1da27028b81db60e...|eb418c17901b1b313...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...|prelinger-header-...| jpg| image/jpeg| image/jpeg| 84| 72|d39cce8b2f3aaa783...|1c41a644123e8f861...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...| arrow.gif| gif| image/gif| image/gif| 13| 11|c7ee6d7c17045495e...|7013764e619066e60...|R0lGODlhDQALALMAA...| +|http://www.archiv...| folder.png| png| image/png| image/png| 20| 15|c1905fb5f16232525...|ff7b8c60e8397cb5d...|iVBORw0KGgoAAAANS...| +|http://www.archiv...| wayback-wtc.gif| gif| image/gif| image/gif| 35| 35|c15ec074d95fe7e1e...|f45425406600b136d...|R0lGODlhIwAjANUAA...| +|http://www.archiv...| clicktoplay.png| png| image/png| image/png| 320| 240|b148d9544a1a65ae4...|477105e3a93b60dd8...|iVBORw0KGgoAAAANS...| +|http://www.archiv...| orange_arrow.gif| gif| image/gif| image/gif| 8| 11|a820ac93e2a000c9d...|850b9daeef06bee6e...|R0lGODlhCAALAJECA...| +|http://www.archiv...| arc-it-tagline.gif| gif| image/gif| image/gif| 385| 30|9f70e6cc21ac55878...|4601e2f642d8e55ac...|R0lGODlhgQEeALMPA...| +|http://www.archiv...| guitar.jpg| jpg| image/jpeg| image/jpeg| 140| 171|9ed163df5065418db...|f6c9475009ae2416c...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...| blendbar.jpg| jpg| image/jpeg| image/jpeg| 1800| 89|9e41e4d6bdd53cd9d...|dc780bf80720c87c9...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...|alexalogo-archive...| gif| image/gif| image/gif| 304| 
36|9da73cf504be0eb70...|03e530ef04e4b68f7...|R0lGODlhMAEkAOYAA...| +|http://www.archiv...| lma.jpg| jpg| image/jpeg| image/jpeg| 215| 71|97ebd3441323f9b5d...|ff9485b26300721b2...|/9j/4AAQSkZJRgABA...| +|http://i.creative...| 88x31.png| png| image/png| image/png| 88| 31|9772d34b683f8af83...|689bef4ffb8918612...|iVBORw0KGgoAAAANS...| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ + +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [url: string, filename: string ... 7 more fields] +``` + +If you wanted to work with all the images in a collection, you could extract +them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).images(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.images() +df.show() +``` + +Will extract all following information from images in a web collection: + +- image url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- Width +- Height +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server|mime_type_tika|width|height| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +|http://farm3.stat...|4047878934_ef12ba...| jpg| image/jpeg| image/jpeg| 100| 75|e1a376f170b815f49...|2165fd2908950e9f6...|/9j/4AAQSkZJRgABA...| +|http://farm3.stat...|4047881126_fc6777...| jpg| image/jpeg| image/jpeg| 75| 100|371a2a5142c611405...|933f937c949826696...|/9j/4AAQSkZJRgABA...| +|http://farm3.stat...|4047879492_a72dd8...| jpg| image/jpeg| image/jpeg| 100| 75|8877679361cde970d...|31dbaaed2f7194c95...|/9j/4AAQSkZJRgABA...| +|http://farm3.stat...|4047877728_c6c118...| jpg| image/jpeg| image/jpeg| 75| 100|8f009a568d47e1888...|7b83e7d6c78ed65cf...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|96d9290d060547781...|2d3005bd6e09ca064...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|c69d65d4880445b31...|abe40cb96bfc79095...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|cb11c08d43e25ec3b...|2060857d6cf41b141...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|756b5a0a83a621eb7...|d4625efc80efb985e...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|0b60007c3e3d9d63f...|a154035590a01efb4...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|97fdea388e1245691...|e415a77a4369ecef8...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|05c2d43f687f40b60...|ed3f6ca2f3d7e9569...|/9j/4AAQSkZJRgABA...| +|http://www.canadi...| 
WebResource.axd| gif| image/gif| image/gif| 1| 1|325472601571f31e1...|2daeaa8b5f19f0bc2...|R0lGODlhAQABAIAAA...| +|http://www.davids...|footprint-carbon.jpg| jpg| image/jpeg| image/jpeg| 200| 200|51f57de92e76f3edc...|c970137cd3bfdbbba...|/9j/4AAQSkZJRgABA...| +|http://www.gca.ca...| 15.jpg| jpg| image/jpeg| image/jpeg| 300| 230|8b3c192b9a0cc82d6...|851377ed11c9cd153...|/9j/4AAQSkZJRgABA...| +|http://www.equalv...|loadingAnimation.gif| gif| image/gif| image/gif| 208| 13|c33734a1bf58bec32...|2bb50e01775289c24...|R0lGODlh0AANAMQAA...| +|http://www.davids...|Keep-greening-gre...| jpg| image/jpeg| image/jpeg| 166| 252|4763383a8be13c735...|a42b963e18dc1e7d4...|/9j/4AAQSkZJRgABA...| +|http://www.davids...|Keep-greening-don...| jpg| image/jpeg| image/jpeg| 146| 252|515bd44bea759e169...|75abeb65cc4f54c7d...|/9j/4AAQSkZJRgABA...| +|http://www.davids...|Keep-greening-eca...| jpg| image/jpeg| image/jpeg| 158| 252|345f71df9702e99a0...|b6637ac654f6e2073...|/9j/4AAQSkZJRgABA...| +|http://www.davids...|Keep-greening-tit...| jpg| image/jpeg| image/jpeg| 470| 45|385522fde90ac7e96...|b42151cf8c3ce14e0...|/9j/4AAQSkZJRgABA...| +|http://www.davids...| last_minute2.jpg| jpg| image/jpeg| image/jpeg| 265| 33|3defee897d4c553fc...|37c790bbc23c369d8...|/9j/4AAQSkZJRgABA...| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` + +## Extract Most Frequent Image URLs + +### Scala RDD + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .flatMap(r => ExtractImageLinks(r.getUrl, r.getContentString)) + .countItems() + .take(10) +``` + +Will extract the top ten URLs of images found within a collection, in an array +like so: + +```bash +links: Array[(String, Int)] = Array((http://www.archive.org/images/star.png,408), (http://www.archive.org/images/no_star.png,122), (http://www.archive.org/images/logo.jpg,118), (http://www.archive.org/images/main-header.jpg,84), (http://www.archive.org/images/rss.png,20), (http://www.archive.org/images/mail.gif,13), (http://www.archive.org/images/half_star.png,10), (http://www.archive.org/images/arrow.gif,7), (http://ia300142.us.archive.org/3/items/americana/am_libraries.gif?cnt=0,3), (http://ia310121.us.archive.org/2/items/GratefulDead/gratefuldead.gif?cnt=0,3), (http://www.archive.org/images/wayback.gif,2), (http://www.archive.org/images/wayback-election2000.gif,2), (http://www.archive.org/images/wayback-wt... +``` + +If you wanted to work with the images, you could download them from the +Internet Archive. + +Let's use the top-ranked example. [This +link](http://web.archive.org/web/*/http://archive.org/images/star.png), for +example, will show you the temporal distribution of the image. For a snapshot +from September 2007, this URL would work: + + + +To do analysis on all images, you could thus prepend +`http://web.archive.org/web/20070913051458/` to each URL and `wget` them en +masse. + +For more information on `wget`, please consult [this lesson available on the +Programming Historian +website](http://programminghistorian.org/lessons/automated-downloading-with-wget). 
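+
+As a minimal sketch, assuming the extracted image URLs have been saved to a
+plain-text file (here called `image-urls.txt`, one URL per line; the filename
+is only illustrative), you could prepend the snapshot prefix with `sed` and
+download the results with `wget`:
+
+```shell
+# Prepend the Wayback Machine snapshot prefix to each extracted image URL,
+# then download the rewritten URLs in bulk. The one-second wait is simply to
+# be polite to the Wayback Machine servers.
+sed 's|^|http://web.archive.org/web/20070913051458/|' image-urls.txt > wayback-image-urls.txt
+wget --input-file=wayback-image-urls.txt --wait=1
+```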
+ +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).imagegraph(); + +df.groupBy($"image_url") + .count() + .orderBy($"count".desc) + .show(10) +``` + +Will extract the top ten URLs of images found within a collection, in a +DataFrame like so: + +```dataframe ++--------------------+-----+ +| image_url|count| ++--------------------+-----+ +|http://www.archiv...| 408| +|http://www.archiv...| 122| +|http://www.archiv...| 83| +|http://www.archiv...| 49| +|http://www.archiv...| 20| +|http://www.archiv...| 13| +|http://www.archiv...| 10| +|http://www.archiv...| 7| +|http://ia300142.u...| 3| +|http://ia310121.u...| 3| ++--------------------+-----+ +only showing top 10 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [src: string, image_url: string] +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.imagegraph() + +df.groupBy("image_url") + .count() + .orderBy("count", ascending=False) + .show(10) +``` + +Will extract the top ten URLs of images found within a collection, in a +DataFrame like so: + +```dataframe ++--------------------+-----+ +| image_url|count| ++--------------------+-----+ +|http://www.archiv...| 408| +|http://www.archiv...| 122| +|http://www.archiv...| 83| +|http://www.archiv...| 49| +|http://www.archiv...| 20| +|http://www.archiv...| 13| +|http://www.archiv...| 10| +|http://www.archiv...| 7| +|http://ia300142.u...| 3| +|http://ia310121.u...| 3| ++--------------------+-----+ +``` + +## Extract Most Frequent Images MD5 Hash + +Some images may be the same, but have different URLs. This UDF finds the +popular images by calculating the MD5 hash of each and presents the most +frequent images based on that metric. This script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.app._ +import io.archivesunleashed.matchbox._ + +val r = RecordLoader.loadArchives("/path/to/warcs",sc).persist() +ExtractPopularImages(r, 500, sc).saveAsTextFile("500-Popular-Images") +``` + +Will save the 500 most popular URLs to an output directory. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.app._ + +val df = RecordLoader.loadArchives("/path/to/warcs",sc).images() + +ExtractPopularImagesDF(df,10,30,30).show() +``` + +### Python DF + +```python +from aut import * + +images = WebArchive(sc, sqlContext, "/path/to/warcs").images() + +popular_images = ExtractPopularImages(images, 20, 10, 10) + +popular_images.show() +``` + +## Find Images Shared Between Domains + +How to find images shared between domains that appear more than once _in more +than one domain_. 
+ +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val images = RecordLoader.loadArchives("/path/to/warcs", sc) + .images() + .select(removePrefixWWW(extractDomain($"url")).as("domain"), $"url", $"md5") + +val links = images.groupBy("md5").count().where(countDistinct("domain")>=2) + +val result = images.join(links, "md5") + .groupBy("domain", "md5") + .agg(first("url").as("image_url")) + .orderBy(asc("md5")) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("/path/to/output") +``` + +### PythonDF + +```python +from aut import * +from pyspark.sql.functions import asc, countDistinct, first + +images = WebArchive(sc, sqlContext, "/path/to/warcs") \ + .images() \ + .select(remove_prefix_www(extract_domain("url")).alias("domain"), "url", "md5") + +links = images.groupBy("md5") \ + .count() \ + .where(countDistinct("domain")>=2) + +result = images.join(links, "md5") \ + .groupBy("domain", "md5") \ + .agg(first("url").alias("image_url")) \ + .orderBy(asc("md5")) \ + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("/path/to/output") +``` + +## Extract PDF Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).pdfs(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from PDF files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +|http://geocities....|adicec_sopar_2009...| pdf|application/octet...|application/pdf|ffc2ccc373b8ffd39...|3831b0f228af1701e...|JVBERi0xLjMNJeLjz...| +|http://www.geocit...| IntSt_2301.pdf| pdf|application/octet...|application/pdf|ffa638c418dac2e19...|84dbaccde1ace4b24...|JVBERi0xLjQNJeLjz...| +|http://www.geocit...| lotg.pdf| pdf|application/octet...|application/pdf|ff871ef64d3739b03...|95a777f0b4c7703c6...|JVBERi0xLjINJeLjz...| +|http://geocities....| ebad.pdf| pdf|application/octet...|application/pdf|fe8feece5d08dc2ce...|0c01cc31b40a286da...|JVBERi0xLjMNJeLjz...| +|http://geocities....| regulament.pdf| pdf|application/octet...|application/pdf|fe8018451633fd76c...|9c7cc720e29cad6e8...|JVBERi0xLjMKJcfsj...| +|http://geocities....|dmatias_letterfor...| pdf|application/octet...|application/pdf|fe7dbc89e664ba790...|dbe965e7a288cce59...|JVBERi0xLjYNJeLjz...| +|http://geocities....|overcome_the_fear...| pdf|application/octet...|application/pdf|fe3ec0805564cd3fc...|d0d30ba4f7f40434d...|JVBERi0xLjMKJcfsj...| +|http://geocities....| CIM_marks.pdf| pdf|application/octet...|application/pdf|fe1622ac08b47cf60...|b97b57b3c77887324...|JVBERi0xLjMKJcTl8...| 
+|http://geocities....| board.PDF| pdf|application/octet...|application/pdf|fd969b57508d3b135...|fc121c07fefbb722b...|JVBERi0xLjIgDQol4...| +|http://geocities....| cowell.pdf| pdf|application/octet...|application/pdf|fbacc01cbe01aa0b4...|f9e9eba1b281ad800...|JVBERi0xLjMKJeLjz...| +|http://geocities....| gdbrasil.pdf| pdf|application/octet...|application/pdf|fadc9b9b2408a1112...|247671acb971ddc21...|JVBERi0xLjQNJeLjz...| +|http://www.geocit...| EBOrder.pdf| pdf|application/octet...|application/pdf|fa4a83d96441324b3...|5f6870832d035a5a9...|JVBERi0xLjINJeLjz...| +|http://geocities....| butlleta.pdf| pdf|application/octet...|application/pdf|fa13dfbf62acb5083...|9a8ec0c0e8a190f46...|JVBERi0xLjQNJeLjz...| +|http://www.geocit...|ALABAMAUNDERWOODM...| pdf|application/octet...|application/pdf|f9791c7df35d9092a...|3e4c0ca1031152d24...|JVBERi0xLjIgDQol4...| +|http://geocities....| chimera.pdf| pdf|application/octet...|application/pdf|f92a40f58cffcdc8e...|ba038d0146b0171f2...|JVBERi0xLjMKJcfsj...| +|http://geocities....| icarus.pdf| pdf|application/octet...|application/pdf|f8da963b714e684b3...|4444f5a12c9dbb1df...|JVBERi0xLjMKJcfsj...| +|http://geocities....|2008_ClubFinances...| pdf|application/octet...|application/pdf|f878c0373edbc89f9...|700393c7b6aaf93df...|JVBERi0xLjQNJeLjz...| +|http://geocities....| WILLOWSTScene5.pdf| pdf|application/octet...|application/pdf|f84fc521602fdf163...|5f03b19201536cbc8...|JVBERi0xLjQKJcfsj...| +|http://geocities....| isrherb2.pdf| pdf|application/octet...|application/pdf|f83390642e9fe6313...|60befa2b5913bb19d...|JVBERi0xLjMNJeLjz...| +|http://geocities....| joel.pdf| pdf|application/octet...|application/pdf|f828e4b447c085fdd...|2e3308c1a52f2f75a...|JVBERi0xLjQKJcOkw...| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [url: string, filename: string ... 
6 more fields] +``` + +If you wanted to work with all the PDF files in a collection, you could extract +them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).pdfs(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.pdfs() +df.show() +``` + +Will extract all following information from PDF files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +|http://geocities....|20080304ordinance...| pdf|application/octet...|application/pdf|ebbf9bf99b363493b...|f0b9a6788cbc1f8ab...|JVBERi0xLjMNJeLjz...| +|http://geocities....|FACTSHEET2008ADOP...| pdf|application/octet...|application/pdf|4fe261c2210189a52...|a91180b9170ff757f...|JVBERi0xLjQNJeLjz...| +|http://geocities....| Menu.pdf| pdf|application/octet...|application/pdf|75e4d587589a1d85d...|d18724100d4616a45...|JVBERi0xLjMNJeLjz...| +|http://geocities....|DSTC2009ContestFl...| pdf|application/octet...|application/pdf|c80f38f96480aab0c...|369c4415ed9c2476d...|JVBERi0xLjQNJeLjz...| +|http://geocities....| ebad.pdf| pdf|application/octet...|application/pdf|fe8feece5d08dc2ce...|0c01cc31b40a286da...|JVBERi0xLjMNJeLjz...| +|http://geocities....|FACTSHEET2008APPR...| pdf|application/octet...|application/pdf|8747971e78acb768b...|770f97a95c7e2ee16...|JVBERi0xLjQNJeLjz...| +|http://geocities....|FACTSHEET2008APPE...| pdf|application/octet...|application/pdf|32f57bbe5b28f4ab1...|d4f63b8d29f4c5dc5...|JVBERi0xLjQNJeLjz...| +|http://geocities....|FACTSHEET2008ADOP...| pdf|application/octet...|application/pdf|e9189eea563fde074...|f14b1846499dd4bd0...|JVBERi0xLjQNJeLjz...| +|http://geocities....| sharar.pdf| pdf|application/octet...|application/pdf|771f5bd1b72b8e324...|9cef1f6af9e5c127e...|JVBERi0xLjMNJeLjz...| +|http://geocities....|FACTSHEET2008UTIL...| pdf|application/octet...|application/pdf|7f45c93d16823e852...|b3a2d3b95efd77bd6...|JVBERi0xLjQNJeLjz...| +|http://geocities....|BakweriMarginalis...| pdf|application/octet...|application/pdf|d25863303ba46a872...|bbd6c9bce4c523f0f...|JVBERi0xLjINJeLjz...| +|http://geocities....|McCallaFoodSecuri...| pdf|application/octet...|application/pdf|1291b633f49f7e51d...|622144ed0fd56bae3...|JVBERi0xLjMNJeLjz...| +|http://geocities....|PovertyAndIncome.pdf| pdf|application/octet...|application/pdf|278e1f281905d419d...|9bc00a54147a4b350...|JVBERi0xLjIgDSXi4...| +|http://geocities....| behold.pdf| pdf|application/octet...|application/pdf|9fc1e4e1e0f567477...|63d324984d34eb168...|JVBERi0xLjMKJcfsj...| +|http://geocities....|overcome_the_fear...| pdf|application/octet...|application/pdf|fe3ec0805564cd3fc...|d0d30ba4f7f40434d...|JVBERi0xLjMKJcfsj...| +|http://geocities....| raven.pdf| 
pdf|application/octet...|application/pdf|acabc7f7dba954f99...|1ddf3e53813a805a1...|JVBERi0xLjMKJcfsj...| +|http://geocities....| sunset.pdf| pdf|application/octet...|application/pdf|1dc037712d47b11d9...|f502ca5cc2de2483b...|JVBERi0xLjMKJcfsj...| +|http://geocities....|night_lasts_less_...| pdf|application/octet...|application/pdf|1cda3dfab3bedaf04...|ad0f6e6fd53e4eb5f...|JVBERi0xLjMKJcfsj...| +|http://geocities....| angel_dust.pdf| pdf|application/octet...|application/pdf|92d14676e34dfcb7e...|1588b870928d56667...|JVBERi0xLjMKJcfsj...| +|http://geocities....| vampire.pdf| pdf|application/octet...|application/pdf|f1730689d52b9524e...|bf377a4e2580b8a29...|JVBERi0xLjMKJcfsj...| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` + +## Extract Presentation Program Files Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).presentationProgramFiles(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from presentation program files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +|http://geocities....| index.pps| pps|application/mspow...|application/vnd.m...|fbaed5a1df163270a...|afa4c82593ea5bfd6...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|MathForEveryoneCa...| ppt|application/mspow...|application/vnd.m...|f5fde5813a5aef2f3...|e791212ac91243f39...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|AD1-GE-Quiz4-Samp...| ppt|application/mspow...|application/vnd.m...|f5824d64bb74b1377...|aaea2a38d11682753...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| Agrarianism1.ppt| ppt|application/mspow...|application/vnd.m...|f581932d9e4c57dc0...|3fbce2d175be293a8...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| lego.pps| pps|application/mspow...|application/vnd.m...|f0da5c58e7abbf102...|78bc45da68c6784be...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| HPIB.ppt| ppt|application/mspow...|application/vnd.m...|ef09c31bd8079d40b...|875a96d8b8dd3bf18...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|learningdisabilit...| ppt|application/mspow...|application/vnd.m...|e6bb4f98761839a3a...|5a4dcc8bab2ee15f3...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|assessmentsummer.ppt| ppt|application/mspow...|application/vnd.m...|e116a443b9031ec01...|141563f2f32687587...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|CommonlyConfusedW...| ppt|application/mspow...|application/vnd.m...|dde43870e0da8ebf6...|7a94bf766d931a046...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|AD1-Unit5-Achieve...| 
ppt|application/mspow...|application/vnd.m...|d4530e506c2e41f8f...|6c89c0e3d28ecceed...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| Schwind.ppt| ppt|application/mspow...|application/vnd.m...|cfdd4bb6e7b04f24a...|9c26a8ac091f88a35...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| cpphtp4_PPT_07.ppt| ppt|application/mspow...|application/vnd.m...|cd98e6e18c3b0ada0...|b3651507f61bafa4d...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| mylife.ppt| ppt|application/mspow...|application/vnd.m...|cb146894f8a544ace...|0129cfdfd2f196346...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| refinterview.ppt| ppt|application/mspow...|application/vnd.m...|ca6fd4ec5fcb8237d...|8312ca4c0dbeb6008...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|MathForEveryoneAl...| ppt|application/mspow...|application/vnd.m...|c887f45fa58f273b0...|b253b732f8502f357...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| ch2-DataTypes.ppt| ppt|application/mspow...|application/vnd.m...|c74caee72b5ee6684...|f3bf878c775e2f72a...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|geographyofnortha...| ppt|application/mspow...|application/vnd.m...|c35b93ac59f2eb5af...|b5de05a856838328c...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| people1st.ppt| ppt|application/mspow...|application/vnd.m...|bf19cdc1ff3ad82fd...|99f14fe81d8a9587f...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| AD1-Reading.ppt| ppt|application/mspow...|application/vnd.m...|be020b4564f972218...|0761a2fd5c176ce1c...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| majalah.ppt| ppt|application/mspow...|application/vnd.m...|b6f219693ef1df49f...|1039013624cf8de35...|0M8R4KGxGuEAAAAAA...| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [url: string, filename: string ... 
6 more fields] +``` + +If you wanted to work with all the presentation program files in a collection, +you could extract them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).presentationProgramFiles(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.presentation_program() +df.show() +``` + +Will extract all following information from presentation program files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +|http://geocities....| wincvs.ppt| ppt|application/mspow...|application/vnd.m...|52ac23b58493234b2...|a2206af9847cceb06...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| index.pps| pps|application/mspow...|application/vnd.m...|fbaed5a1df163270a...|afa4c82593ea5bfd6...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|MathForEveryoneCa...| ppt|application/mspow...|application/vnd.m...|f5fde5813a5aef2f3...|e791212ac91243f39...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|MathForEveryone7t...| ppt|application/mspow...|application/vnd.m...|9893643e1cb87af0c...|2fa8301893ad21b2b...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|MathForEveryoneGe...| ppt|application/mspow...|application/vnd.m...|2a914a95a61b227dd...|5d783c1beaffc0b57...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|MathForEveryoneAl...| ppt|application/mspow...|application/vnd.m...|c887f45fa58f273b0...|b253b732f8502f357...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|MathForEveryone7t...| ppt|application/mspow...|application/vnd.m...|034906471a0c0b997...|16142a0aa69b2fb1f...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| tiago.ppt| ppt|application/mspow...|application/vnd.m...|6871786192c187783...|e5a91a65ef9a4bade...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| energypp.ppt| ppt|application/mspow...|application/vnd.m...|94f9384ec57d8849c...|e943c5cf509f8f816...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| lego.pps| pps|application/mspow...|application/vnd.m...|f0da5c58e7abbf102...|78bc45da68c6784be...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| celtiberos.pps| pps|application/mspow...|application/vnd.m...|af897525acd31d359...|9c018a80253c38a57...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| porque.pps| pps|application/mspow...|application/vnd.m...|9c2cba37c64fd0ac8...|6f11733ddec0abc2d...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|SoftHandoffbyPara...| pps|application/mspow...|application/vnd.m...|0c5ef732ea466574f...|dc7dfe545b401aeab...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|A_Land_Remembered...| ppt|application/mspow...|application/vnd.m...|5b7273d03f8490490...|2d8721e7876cb6697...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| DANCE.ppt| 
ppt|application/mspow...|application/vnd.m...|5aa3308433666a30a...|4a23bd20768501dac...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| unit.ppt| ppt|application/mspow...|application/vnd.m...|6736886864069ee66...|e92031e6e0293cb73...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| majalah.ppt| ppt|application/mspow...|application/vnd.m...|b6f219693ef1df49f...|1039013624cf8de35...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|esos_si_son_probl...| pps|application/mspow...|application/vnd.m...|932221045b6154d7e...|b23a0238c852d28bb...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| refinterview.ppt| ppt|application/mspow...|application/vnd.m...|ca6fd4ec5fcb8237d...|8312ca4c0dbeb6008...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| Schwind.ppt| ppt|application/mspow...|application/vnd.m...|cfdd4bb6e7b04f24a...|9c26a8ac091f88a35...|0M8R4KGxGuEAAAAAA...| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` + +## Extract Spreadsheet Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).spreadsheets(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from spreadsheet files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +|http://geocities....| statuscarib.xls| xls|application/vnd.m...|application/vnd.m...|f9fd18b158df52ff2...|0d606f25ac3c9abc4...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| timesheet.xls| xls|application/vnd.m...|application/vnd.m...|f9549db15de69bc21...|e9c239d812705842f...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|statusccusspring0...| xls|application/vnd.m...|application/vnd.m...|ef99704e5a734f386...|f265fc5c581ad1762...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| Laboratorio_05.xls| xls|application/vnd.m...|application/vnd.m...|eb0e39898ba513234...|976f69da07122d285...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|110_Laboratorio_6...| xls|application/vnd.m...|application/vnd.m...|e5b7fee6d4c45e171...|befd9670be70a4fdb...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| Pakuan.xls| xls|application/vnd.m...|application/vnd.m...|e386f85a7bd74b1ab...|5b2b142de2c57ec68...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|Spring08_statusre...| xls|application/vnd.m...|application/vnd.m...|df2d6792fb55c4e26...|6f4d2aef711aff4e1...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CTtimetable2.xls| xls|application/vnd.m...|application/vnd.m...|dc987d3e996677ce9...|40bb63a4c0038a6ae...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|MalibuTrailChalle...| 
xls|application/vnd.m...|application/vnd.m...|dbba76ead82576178...|ffbe099441053b47b...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CTtimetable.xls| xls|application/vnd.m...|application/vnd.m...|d9ee9117e70df43b5...|596c4c6d5cdc7ddb5...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| 1071_Parcial_2.xls| xls|application/vnd.m...|application/vnd.m...|d90dc210138676a2c...|6e3ed07f50393815c...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|excelsubtractione...| xls|application/vnd.m...|application/vnd.m...|d6c8314e52f22e4aa...|1b1ebce0f85628921...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|Fall2008_statusre...| xls|application/vnd.m...|application/vnd.m...|cd9974430477b75ce...|0e756bbc38608cb51...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| report01.xls| xls|application/vnd.m...|application/vnd.m...|cd947fe4099df4fe3...|0f11d17d38a72977b...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|TrackRecords20010...| xls|application/vnd.m...|application/vnd.m...|c8aa0122443efa0e5...|fa9cdb4a329f926bf...|0M8R4KGxGuEAAAAAA...| +|http://br.geociti...| mycoinsforswap.xls| xls|application/vnd.m...|application/vnd.m...|c665c83bc2b54292f...|18f1f3a4559d5c40a...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| AAtimetable.xls| xls|application/vnd.m...|application/vnd.m...|c66201762bf5e473e...|4e9bac4f217b0605d...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| carwashroster.xls| xls|application/vnd.m...|application/vnd.m...|c495d1b7dc954b975...|062167485baf9aa5d...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| RSL_MDP.xls| xls|application/vnd.m...|application/vnd.m...|bf6479bacbb758b52...|4d7ea33849447853d...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| status_report4.xls| xls|application/vnd.m...|application/vnd.m...|bc4d18e022522d185...|fc7b9fc64116c9ad1...|0M8R4KGxGuEAAAAAA...| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [url: string, filename: string ... 
6 more fields] +``` + +If you wanted to work with all the spreadsheet files in a collection, you could extract +them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).spreadsheets(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.spreadsheets() +df.show() +``` + +Will extract all following information from spreadsheet files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +|http://geocities....| tkadrosu.xls| xls|application/vnd.m...|application/vnd.m...|8033532f88da42ad6...|a52b24bc760c5265b...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| cal_counter.xls| xls|application/vnd.m...|application/vnd.m...|56ad6c2f84fdd4a88...|ad0db35f2d7ff2cca...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CTtimetable2.xls| xls|application/vnd.m...|application/vnd.m...|dc987d3e996677ce9...|40bb63a4c0038a6ae...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| AAtimetable.xls| xls|application/vnd.m...|application/vnd.m...|c66201762bf5e473e...|4e9bac4f217b0605d...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CTtimetable.xls| xls|application/vnd.m...|application/vnd.m...|d9ee9117e70df43b5...|596c4c6d5cdc7ddb5...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CTtimetable2.xls| xls|application/vnd.m...|application/vnd.m...|a4ed4330d5c18f1b2...|d8ce479596d49679d...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| teams.xls| xls|application/vnd.m...|application/vnd.m...|334fa42776cef7f81...|aa57fda7fb634c931...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| collection.xls| xls|application/vnd.m...|application/vnd.m...|30d7a67de8150f712...|841ba91f009d48b7a...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|music-collection.xls| xls|application/vnd.m...|application/vnd.m...|4def75fa96bae579d...|090a95923c9599454...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| 020103.xls| xls|application/vnd.m...|application/vnd.m...|48651a7592ca1b0f0...|1e2438c8247d33870...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| 011803.xls| xls|application/vnd.m...|application/vnd.m...|0aab8ed40f91c1c76...|8e02e408fe1ce40b9...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| RSL_TorOton.xls| xls|application/vnd.m...|application/vnd.m...|1d9c13c6407a2b696...|007010ecf5b208453...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| members.xls| xls|application/vnd.m...|application/vnd.m...|b045a6b118981c6eb...|3ae096d6602b7cb36...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| round309.xls| xls|application/vnd.m...|application/vnd.m...|50bed4b3e9facb278...|f26e0c38082141598...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| result109.xls| 
xls|application/vnd.m...|application/vnd.m...|2235d094897f10c3b...|6ed0b65fd43502a2b...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|TrackRecords20010...| xls|application/vnd.m...|application/vnd.m...|c8aa0122443efa0e5...|fa9cdb4a329f926bf...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| Digox.xls| xls|application/vnd.m...|application/vnd.m...|182d08821797269c7...|80e7ce8ecc1ecf389...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| RSL_SemBA.xls| xls|application/vnd.m...|application/vnd.m...|59613700fbf08b795...|44eac99a514141520...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| RSL_MDP.xls| xls|application/vnd.m...|application/vnd.m...|bf6479bacbb758b52...|4d7ea33849447853d...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| RSL_ARG99.xls| xls|application/vnd.m...|application/vnd.m...|a2f2fd063dd5689a7...|61568e0f4139ec568...|0M8R4KGxGuEAAAAAA...| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` + +## Extract Video Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).videos(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from videos in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +|http://geocities....| videohead.avi| avi| video/x-msvideo|video/x-msvideo|fa9852748ba7b4829...|0be56f200f8e1cb83...|UklGRjoMIQBBVkkgT...| +|http://www.geocit...| HandWrap2.avi| avi| video/x-msvideo|video/x-msvideo|f680cb463e7cb291e...|1d2ea1df3f5af2599...|UklGRrBrAgBBVkkgT...| +|http://geocities....| 1kungfu.avi| avi| video/x-msvideo|video/x-msvideo|f4429277ed4b48efb...|5c542e8990efd484b...|UklGRkoSFwBBVkkgT...| +|http://geocities....| Vol_III_sample.mpg| mpg| video/mpeg| video/mpeg|f2bc34f7294edc376...|a939dc619c123f81b...|AAABuiEAAdLxgA7xA...| +|http://geocities....| wherego.avi| avi| video/x-msvideo|video/x-msvideo|f23976ddeb6f08810...|714a9a548f9b2a156...|UklGRkq4HgBBVkkgT...| +|http://geocities....| couch100k.wmv| asf| video/x-ms-wmv| video/x-ms-asf|ee316d5871acb7859...|0593ebb8e450a6c3e...|MCaydY5mzxGm2QCqA...| +|http://geocities....| Mitwa_Lagaan.mp3| qt| audio/mpeg|video/quicktime|ebc5db8d30edd0135...|d3ebdd6da2c732481...|AAAE6W1vb3YAAABsb...| +|http://www.geocit...| tydunking.mpg| mpg| video/mpeg| video/mpeg|eaa0d14dc05bdab98...|05d4ff2301d2d3818...|AAABuiEAAQALgBcdA...| +|http://geocities....| bigjleroy.avi| avi| video/x-msvideo|video/x-msvideo|e93538f0d76b86cca...|ebeb89fc2fa8f7cd6...|UklGRrjUCgBBVkkgT...| +|http://geocities....| NollieBs180.mov| mov| video/quicktime|video/quicktime|e7b97c287329340d5...|138fb8b0dea4c8e16...|AAAGwm1vb3YAAABsb...| 
+|http://www.geocit...| shirt.avi| avi| video/x-msvideo|video/x-msvideo|e36119d3c78225cbf...|11af72475ca754639...|UklGRvhdHQBBVkkgT...| +|http://geocities....| atdawn.wma| asf| audio/x-ms-wma| video/x-ms-asf|e1a85a79ea3ba5d96...|1be05aecdff99298c...|MCaydY5mzxGm2QCqA...| +|http://www.geocit...|non_will_go_to_wa...| mov| video/quicktime|video/quicktime|de6cc975363c4076b...|0c00d0be9c89f9e97...|AAs4JW1kYXQAAA70A...| +|http://geocities....| Movies_20.mpeg| mpeg| video/mpeg| video/mpeg|dd9d2af0c1318b5ff...|9d06f09744fe93408...|AAABuiEAV+PlgAU7A...| +|http://www.geocit...| artilery.mpg| mpg| video/mpeg| video/mpeg|dcecbdfe46448bffb...|0b292aab1078d9bfa...|AAABswsAkBP//+CkA...| +|http://geocities....| tancfigurakbbpl.wmv| wmv| video/x-ms-wmv| video/x-ms-wmv|dca4991392572dbc0...|cb349bdc35484d976...|MCaydY5mzxGm2QCqA...| +|http://www.geocit...| Trevi2.mov| mov| video/quicktime|video/quicktime|dc882205f5cae38f5...|c9dd804e1ee140221...|AAAEvG1vb3YAAAS0Y...| +|http://www.geocit...|skillful_driving_...| mpg| video/mpeg| video/mpeg|db8a767b00884e426...|f5a70cf5f091b530f...|AAABuiEAAQABgAORA...| +|http://geocities....| jeremy100k.wmv| asf| video/x-ms-wmv| video/x-ms-asf|dafba744438ae0110...|d3a217ce25507ae90...|MCaydY5mzxGm2QCqA...| +|http://www.geocit...| mbrl2.mpg| mpg| video/mpeg| video/mpeg|d8eb5a12f0da99ca0...|8686002a444cc9dce...|AAABswsAkBP//+CkA...| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [url: string, filename: string ... 6 more fields] +``` + +If you wanted to work with all the video files in a collection, you could extract +them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).videos(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.video() +df.show() +``` + +Will extract all following information from videos in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +|http://www.geocit...|Sea_Dawgs_2008_Ha...| wmv| video/x-ms-wmv| video/x-ms-wmv|7b35e4cf60a3cfa67...|b35ad7242e8135326...|MCaydY5mzxGm2QCqA...| +|http://www.geocit...| Excedrine.wmv| wmv| video/x-ms-wmv| video/x-ms-wmv|0aaf1d81ab6f2b354...|0b52af5f5facfd30f...|MCaydY5mzxGm2QCqA...| +|http://geocities....| homework.avi| avi| video/x-msvideo|video/x-msvideo|4e06cbd11764cd2ac...|770a8849375965b20...|UklGRsrLAgBBVkkgT...| +|http://geocities....| macarenababy.avi| avi| video/x-msvideo|video/x-msvideo|600084bbd732c0fda...|f99b2e31374d4ea18...|UklGRrC0AwBBVkkgT...| 
+|http://geocities....|orlando_viggokiss...| wmv| video/x-ms-asf| video/x-ms-wmv|79d093eb6184dba74...|395eaf6dcb29a66d2...|MCaydY5mzxGm2QCqA...| +|http://www.geocit...|skillful_driving_...| mpg| video/mpeg| video/mpeg|db8a767b00884e426...|f5a70cf5f091b530f...|AAABuiEAAQABgAORA...| +|http://www.geocit...|gray_havens_2.35.MPG| mpg| video/mpeg| video/mpeg|af71353d69af0b42f...|f3625b897339b0f23...|AAABuiEAAQABgAORA...| +|http://geocities....| movie2.mpeg| mpeg| video/mpeg| video/mpeg|3f6c7c48d2a990cf2...|760e6752bfd9e8a84...|AAABsxQA8MMCcSClE...| +|http://www.geocit...| Sequence1.mov| mov| video/quicktime|video/quicktime|931fc4dee8aa260f9...|5a5cf58e2a50cf942...|AAELrG1vb3YAAABsb...| +|http://www.geocit...| santa.mov| mov| video/quicktime|video/quicktime|8b9b98d0c567c4381...|49f49dd23c3bad61b...|AAAAIGZ0eXBxdCAgI...| +|http://geocities....| 0602-2.avi| avi| video/x-msvideo|video/x-msvideo|92d04dbe7f1bdc109...|65ed7327aece11bac...|UklGRkauOABBVkkgT...| +|http://geocities....| movie.mpg| mpg| video/mpeg| video/mpeg|a0e86539e5eb9bd35...|82eb4680a9f65ed1b...|AAABuiEAAdLxgASfA...| +|http://geocities....| misshawaii.mpeg| mpeg| video/mpeg| video/mpeg|45cbfc4d03547861b...|44c93f871ea602112...|AAABuiEAAQABgAORA...| +|http://geocities....| Explosions.wmv| wmv| video/x-ms-wmv| video/x-ms-wmv|22cb24bffbd7eabf9...|a44d261ef5d7e7993...|MCaydY5mzxGm2QCqA...| +|http://geocities....| couch100k.wmv| asf| video/x-ms-wmv| video/x-ms-asf|ee316d5871acb7859...|0593ebb8e450a6c3e...|MCaydY5mzxGm2QCqA...| +|http://geocities....| jeremy100k.wmv| asf| video/x-ms-wmv| video/x-ms-asf|dafba744438ae0110...|d3a217ce25507ae90...|MCaydY5mzxGm2QCqA...| +|http://geocities....| jedi_wade.mov| mov| video/quicktime|video/quicktime|674688fd09bf18d29...|cd21c3a5b9e2f18b6...|AAAFB21vb3YAAAT/Y...| +|http://geocities....|ylagallinanonosga...| asf| audio/x-ms-wma| video/x-ms-asf|9aac473134d7f2e7a...|3af7fbab238772f48...|MCaydY5mzxGm2QCqA...| +|http://geocities....|Chris-5050NollieS...| mov| video/quicktime|video/quicktime|93aa2ce07e01f90ad...|f066f29e5faf0cee1...|AAAHRG1vb3YAAABsb...| +|http://geocities....| floursack_jump2.avi| avi| video/x-msvideo|video/x-msvideo|a922441c0a7f0018d...|b82ca6fe1d46e16dc...|UklGRgjlAwBBVkkgT...| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` + +## Extract Word Processor Files Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).wordProcessorFiles(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from word processor files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+------------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| 
++--------------------+--------------------+---------+--------------------+------------------+--------------------+--------------------+--------------------+ +|http://geocities....|infiniteproducts.doc| doc| application/msword|application/msword|ffa1ea83af6cb9508...|7a3ae86a7a22d2682...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| Everything.doc| doc| application/msword|application/msword|ff7216edf86fe196c...|082a889c27640fc9a...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| survey.doc| doc| application/msword|application/msword|ff48df5e64bd5adeb...|383ab6ead48795ff3...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| iepWrkshpFall01.doc| doc| application/msword|application/msword|ff421feb87b826d39...|ec60a48d393642629...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|24_reproduction_s...| doc| application/msword|application/msword|fec21eb30fac4588e...|36b41ba66801b10b9...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|Descendit_ad_Infe...| doc| application/msword|application/msword|fe66eeb7c04942c8b...|14f207787abef983e...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|Anthropology21FEB...| doc| application/msword|application/msword|fe079d498bd5e91f2...|ca54e6be7c0618ecc...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| Senses.doc| doc| application/msword|application/msword|fdf881ef998c227f7...|04d6e72132537053a...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|hopewel-loudon-cl...| doc| application/msword|application/msword|fddffbabcaf1976c9...|b7ade5d661dd597a1...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| billmprev9899.doc| doc| application/msword|application/msword|fdcc8b65cfb0a18c9...|602f323278c9fb726...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|approachesProject...| doc| application/msword|application/msword|fd4df7f89efe9cea7...|4e7be7664bfe992f3...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| batayan.doc| doc| application/msword|application/msword|fc6f45fdfce72d4a3...|e614c9b9e95d64aa6...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| VisitUnitPacket.doc| doc| application/msword|application/msword|fc2a0e45b627c3d4a...|dc7ba874b7b13d548...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|vc3c3ppstudyguide...| doc| application/msword|application/msword|fc293bbddb906615f...|538aa0d5e2f554258...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|30_chordates_fish...| doc| application/msword|application/msword|fc053770a82822f69...|9df86863983889373...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|c6artposterexampl...| doc| application/msword|application/msword|fbe2427b48f32d1d9...|47de792202dc3a059...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| kun20509.doc| doc| application/msword|application/msword|fb8d1ae5e3db45131...|6b13d73759a956e62...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| kun20509| doc| application/msword|application/msword|fb8d1ae5e3db45131...|6b13d73759a956e62...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| Fishing.doc| doc| application/msword|application/msword|fb7df7ac80aa2cc8a...|eb4bb266226349bac...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| resumedoAw.doc| doc| application/msword|application/msword|fb6d5bf501b9b97b3...|1e0d6500192d4ee21...|0M8R4KGxGuEAAAAAA...| ++--------------------+--------------------+---------+--------------------+------------------+--------------------+--------------------+--------------------+ +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [url: string, filename: string ... 
6 more fields] +``` + +If you wanted to work with all the word processor files in a collection, +you could extract them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).wordProcessorFiles(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.word_processor() +df.show() +``` + +Will extract all following information from word processor files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+------------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+------------------+--------------------+--------------------+--------------------+ +|http://geocities....| Doc2.doc| doc| application/msword|application/msword|09159efbefff59f64...|5412d6c55c2c8bec7...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CV-ITjob.doc| doc| application/msword|application/msword|7f2b7540e558de24e...|96a6ece7202ab309b...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CV-Teach.doc| doc| application/msword|application/msword|637bb22eff4bc5be5...|76130b6ffeac5c678...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CV-covlet.doc| doc| application/msword|application/msword|466c06bfa5a47d5cb...|dc763126cbdb589eb...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CV-extra.doc| doc| application/msword|application/msword|ab0fa931229c02a4b...|4c2a8200e6eaaafb2...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|020410Indonesia_N...| doc| application/msword|application/msword|b195e90841347be61...|6d2845902ad15a9a2...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| Chapter1.doc| doc| application/msword|application/msword|65383c8c0cf5b6a4f...|fcf3008e9478b773c...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|CathyKoning_resum...| doc| application/msword|application/msword|924ad3f9f66d3c6bd...|2d0887c93ffd3e78b...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| Greek_colonels.doc| doc| application/msword|application/msword|ee4b9db827086d0db...|94e5569e064195db5...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| resume2.doc| doc| application/msword|application/msword|c39fa601733093268...|108563de6ba6102a5...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| eta_writeup.doc| doc| application/msword|application/msword|661328d76ce3aa340...|debadb248da4dfbd3...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|Before_Night_Fall...| doc| application/msword|application/msword|a40371b35b4bf0838...|8f1dba8a46ea297b8...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|Membership_Form_2...| doc| application/msword|application/msword|bf3a3b8cc86b371c3...|472810e93a2245fb1...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| walkthroughff1.doc| doc| application/msword|application/msword|c97de6941c3fb4aed...|16851a5445bdce07d...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| Encyclopedia.doc| doc| application/msword|application/msword|26a94e8f3358c878c...|07f9b2ce6342f73bc...|0M8R4KGxGuEAAAAAA...| 
+|http://geocities....| Y.Kurulu.doc| doc| application/msword|application/msword|8e0ebe7c4f27b1841...|ebb5ce328f717f8e6...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| fifty_eggs.doc| doc| application/msword|application/msword|2c1cdd4f75030650e...|d022311b2fc399750...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| 1pitagoras2.doc| doc| application/msword|application/msword|e07ff47cb8ebc4356...|97d46d781458f5a82...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| constitution.doc| doc| application/msword|application/msword|e38dc3e5d553d8799...|d50096b5208146ce9...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| feasibility.doc| doc| application/msword|application/msword|5574bf82d65935191...|53de74880c9ea2e2b...|0M8R4KGxGuEAAAAAA...| ++--------------------+--------------------+---------+--------------------+------------------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` diff --git a/website/versioned_docs/version-1.0.0/dataframe-schemas.md b/website/versioned_docs/version-1.0.0/dataframe-schemas.md new file mode 100644 index 00000000..9b3c7f73 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/dataframe-schemas.md @@ -0,0 +1,156 @@ +--- +id: version-1.0.0-dataframe-schemas +title: DataFrame Schemas +original_id: dataframe-schemas +--- + +Below you can find all of the DataFrame schemas available in the Toolkit. For +example, you can use `.all()` to extract the overall content from a web archive +record. Some of the most popular ones include `.all()` (which includes raw +content (HTTP headers & HTML), URLs, and file types); `.webpages()` (which +includes full-text content and language); and `.webgraph()` which includes +hyperlink information. + +## All + +**`.all()`** + +- `crawl_date` (string) +- `domain` (string) +- `url` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `raw_content` (string) +- `bytes` (binary) +- `http_status_code` (string) +- `archive_filename` (string) + +## Web Pages + +**`.webpages()`** + +- `crawl_date` (string) +- `domain` (string) +- `url` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `language` (string) +- `content` + +## Web Graph + +**`.webgraph()`** + +- `crawl_date` (string) +- `src` (string) +- `dest` (string) +- `anchor` (string) + +## Image Graph + +**`.imagegraph()`** + +- `crawl_date` (string) +- `src` (string) +- `image_url` (string) +- `alt_text` (string) + +## Images + +**`.images()`** + +- `crawl_date` (string) +- `url` (string) +- `filename` (string) +- `extension` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `width` (string) +- `height` (string) +- `md5` (string) +- `sha1` (string) +- `bytes` (binary) + +## PDFs + +**`.pdfs()`** + +- `crawl_date` (string) +- `url` (string) +- `filename` (string) +- `extension` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `md5` (string) +- `sha1` (string) +- `bytes` (binary) + +## Audio + +**`.audio()`** + +- `crawl_date` (string) +- `url` (string) +- `filename` (string) +- `extension` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `md5` (string) +- `sha1` (string) +- `bytes` (binary) + +## Videos + +**`.videos()`** + +- `crawl_date` (string) +- `url` (string) +- `filename` (string) +- `extension` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `md5` (string) +- `sha1` (string) +- `bytes` (binary) + +## Spreadsheets + +**`.spreadsheets()`** + +- `crawl_date` (string) +- `url` (string) +- 
`filename` (string) +- `extension` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `md5` (string) +- `sha1` (string) +- `bytes` (binary) + +## Presentation Program Files + +**`.presentationProgramFiles()`** + +- `crawl_date` (string) +- `url` (string) +- `filename` (string) +- `extension` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `md5` (string) +- `sha1` (string) +- `bytes` (binary) + +## Word Processor Files + +**`.wordProcessorFiles()`** + +- `crawl_date` (string) +- `url` (string) +- `filename` (string) +- `extension` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `md5` (string) +- `sha1` (string) +- `bytes` (binary) diff --git a/website/versioned_docs/version-1.0.0/extract-binary-info.md b/website/versioned_docs/version-1.0.0/extract-binary-info.md new file mode 100644 index 00000000..9bc38434 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/extract-binary-info.md @@ -0,0 +1,177 @@ +--- +id: version-1.0.0-extract-binary-info +title: Extract Binary Info +original_id: extract-binary-info +--- + +How do I extract the binary information of PDFs, audio files, video files, word +processor files, spreadsheet files, and presentation program files to a CSV +file, or into the [Apache Parquet](https://parquet.apache.org/) format +to [work with later](df-results.md#what-to-do-with-dataframe-results)? + +You can also read and write to Amazon S3 by supplying your AWS credentials, and +using `s3a`. + +## Scala RDD + +**Will not be implemented.** + +## Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +sc.setLogLevel("INFO") + +sc.hadoopConfiguration.set("fs.s3a.access.key", "YOUR ACCESS KEY") +sc.hadoopConfiguration.set("fs.s3a.secret.key", "YOUR SECRET KEY ") + +// Local web archive collection. +val warcs = RecordLoader.loadArchives("/local/path/to/warcs", sc) + +// S3 hosted web archive collection. +val warcsS3 = RecordLoader.loadArchives("s3a://your-data-bucket/", sc) + +// Choose your format: CSV or Parquet. + +// For CSV: +// .write.csv("/path/to/derivatives/csv/audio") +// .write.csv("s3a://your-derivatives-bucket/parquet/pages") + +// For Parquet: +// .write.parquet("/path/to/derivatives/parquet/pages/") +// .write.parquet("s3a://your-derivatives-bucket/parquet/pages") + +// Audio Files. +warcs.audio() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("/path/to/derivatives/csv/audio") + +// Images. +warcsS3.images() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"width", $"height", $"md5", $"sha1") + .write + .parquet("/path/to/derivatives/parquet/image") + +// PDFs. +warcs.pdfs() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("s3a://your-derivatives-bucket/csv/pdf") + +// Presentation Program Files. +warcs.presentationProgramFiles() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write + .parquet("s3a://your-derivatives-bucket/parquet/presentation-program") + +// Spreadsheets. 
warcs.spreadsheets()
  .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1")
  .write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/path/to/derivatives/csv/spreadsheet")

// Videos.
warcs.videos()
  .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1")
  .write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/path/to/derivatives/csv/video")

// Word Processor Files.
warcs.wordProcessorFiles()
  .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1")
  .write
  .parquet("/path/to/derivatives/parquet/word-processor")

sys.exit
```

## Python DF

```python
from aut import *

# Web archive collection (dataset).
warcs = WebArchive(sc, sqlContext, "/path/to/aut-resources-master/Sample-Data/*gz")

# Choose your format: CSV or Parquet.

# For CSV:
# .write.csv('/path/to/derivatives/csv/audio')
# Include header='true' if you want headers.

# For Parquet:
# .write.parquet("/path/to/derivatives/parquet/pages/")

# Audio Files.
warcs.audio() \
    .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1") \
    .write \
    .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
    .format("csv") \
    .option("escape", "\"") \
    .option("encoding", "utf-8") \
    .save('/path/to/derivatives/csv/audio')

# Images.
warcs.images() \
    .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "width", "height", "md5", "sha1") \
    .write \
    .parquet('/path/to/derivatives/parquet/images')

# PDFs.
warcs.pdfs() \
    .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1") \
    .write \
    .parquet('/path/to/derivatives/csv/pdfs')

# Spreadsheets.
warcs.spreadsheets() \
    .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1") \
    .write \
    .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
    .format("csv") \
    .option("escape", "\"") \
    .option("encoding", "utf-8") \
    .save('/path/to/derivatives/csv/spreadsheets')

# Presentation Program Files.
warcs.presentation_program() \
    .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1") \
    .write \
    .parquet('/path/to/derivatives/csv/presentation_program')

# Videos.
warcs.video() \
    .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1") \
    .write \
    .parquet('/path/to/derivatives/csv/video')

# Word Processor Files.
warcs.word_processor() \
    .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1") \
    .write \
    .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
    .format("csv") \
    .option("escape", "\"") \
    .option("encoding", "utf-8") \
    .save('/path/to/derivatives/csv/word_processor')
```
diff --git a/website/versioned_docs/version-1.0.0/extract-binary.md b/website/versioned_docs/version-1.0.0/extract-binary.md
new file mode 100644
index 00000000..7025078e
--- /dev/null
+++ b/website/versioned_docs/version-1.0.0/extract-binary.md
@@ -0,0 +1,117 @@
---
id: version-1.0.0-extract-binary
title: Extract Binaries to Disk
original_id: extract-binary
---

How do I extract all the binary files of PDFs, audio files, video files, word processor
files, spreadsheet files, and presentation program files to disk?

## Scala RDD

**Will not be implemented.**

## Scala DF

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

// Web archive collection.
val warcs = RecordLoader.loadArchives("/path/to/warcs", sc)

// Audio Files.
warcs.audio()
  .select($"bytes", $"extension")
  .saveToDisk("bytes", "/path/to/extract/binaries/audio/your-prefix-audio", "extension")

// Images.
warcs.images()
  .select($"bytes", $"extension")
  .saveToDisk("bytes", "/path/to/extract/binaries/image/your-prefix-image", "extension")

// PDFs
warcs.pdfs()
  .select($"bytes", $"extension")
  .saveToDisk("bytes", "/path/to/extract/binaries/pdf/your-prefix-pdf", "extension")

// Presentation Program Files.
warcs.presentationProgramFiles()
  .select($"bytes", $"extension")
  .saveToDisk("bytes", "/path/to/extract/binaries/presentation-program/your-prefix-presentation-program", "extension")

// Spreadsheets.
warcs.spreadsheets()
  .select($"bytes", $"extension")
  .saveToDisk("bytes", "/path/to/extract/binaries/spreadsheet/your-prefix-spreadsheet", "extension")

// Videos.
warcs.videos()
  .select($"bytes", $"extension")
  .saveToDisk("bytes", "/path/to/extract/binaries/video/your-prefix-video", "extension")

// Word Processor Files.
warcs.wordProcessorFiles()
  .select($"bytes", $"extension")
  .saveToDisk("bytes", "/path/to/extract/binaries/word-processor/your-prefix-word-processor", "extension")

sys.exit
```

## Python DF

```python
from aut import *

# Web archive collection.
warcs = WebArchive(sc, sqlContext, "/path/to/warcs")

# Audio Files.
audio = warcs.audio()\
    .select("extension", "bytes")\
    .collect()

SaveBytes(audio, "/path/to/extract/binaries/audio")

# Images.
images = warcs.images()\
    .select("extension", "bytes")\
    .collect()

SaveBytes(images, "/path/to/extract/binaries/image")

# PDFs
pdfs = warcs.pdfs()\
    .select("extension", "bytes")\
    .collect()

SaveBytes(pdfs, "/path/to/extract/binaries/pdf")

# Presentation Program Files.
pp_files = warcs.presentation_program()\
    .select("extension", "bytes")\
    .collect()

SaveBytes(pp_files, "/path/to/extract/binaries/presentation_program")

# Spreadsheets.
spreadsheets = warcs.spreadsheets()\
    .select("extension", "bytes")\
    .collect()

SaveBytes(spreadsheets, "/path/to/extract/binaries/spreadsheet")

# Videos.
videos = warcs.video()\
    .select("extension", "bytes")\
    .collect()

SaveBytes(videos, "/path/to/extract/binaries/video")

# Word Processor Files.
wp_files = warcs.word_processor()\
    .select("extension", "bytes")\
    .collect()

SaveBytes(wp_files, "/path/to/extract/binaries/word_processor")
```
diff --git a/website/versioned_docs/version-1.0.0/filters-df.md b/website/versioned_docs/version-1.0.0/filters-df.md
new file mode 100644
index 00000000..49aa6847
--- /dev/null
+++ b/website/versioned_docs/version-1.0.0/filters-df.md
@@ -0,0 +1,328 @@
---
id: version-1.0.0-filters-df
title: DataFrame Filters
original_id: filters-df
---

## Has Content

Filters or removes all data that does or does not pass a specified regular
expression test on content.

### Scala DF

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

val content = Array("Content-Length: [0-9]{4}")

RecordLoader.loadArchives("/path/to/warcs", sc)
  .all()
  .select("url", "raw_content")
  .filter(!hasContent($"raw_content", lit(content)))
```

### Python DF

```python
from aut import *
from pyspark.sql.functions import col

content = "Content-Length: [0-9]{4}"

WebArchive(sc, sqlContext, "/path/to/warcs") \
  .all() \
  .select("url", "raw_content") \
  .filter(col("raw_content").rlike(content))
```

## Has Dates

Filters or keeps all data that does or does not match the timestamps or
date patterns specified.

### Scala DF

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

val dates = Array("2008.*", "200908.*", "20070502231159")

RecordLoader.loadArchives("/path/to/warcs",sc)
  .all()
  .select($"url", $"crawl_date")
  .filter(!hasDate($"crawl_date", lit(dates)))
```

### Python DF

```python
from aut import *
from pyspark.sql.functions import col

dates = ["2008.*", "200908.*", "20070502231159"]

WebArchive(sc, sqlContext, "/path/to/warcs") \
  .all() \
  .select("url", "crawl_date") \
  .filter(~col("crawl_date").isin(dates))
```

## Has Domain(s)

Filters or keeps all data that does or does not match the source domain(s) specified.

### Scala DF

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

val domains = Array("archive.org", "sloan.org")

RecordLoader.loadArchives("/path/to/warcs",sc)
  .webpages()
  .select($"url")
  .filter(!hasDomains(extractDomain($"url"), lit(domains)))
```

### Python DF

```python
from aut import *
from pyspark.sql.functions import col

domains = ["archive.org", "sloan.org"]

WebArchive(sc, sqlContext, "/path/to/warcs") \
  .webpages() \
  .select("url") \
  .filter(~(extract_domain("url").isin(domains)))
```

## Has HTTP Status

Filters or keeps all data that does or does not match the status codes specified.

### Scala DF

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

val statusCodes = Array("200","000")

RecordLoader.loadArchives("/path/to/warcs",sc)
  .all()
  .select($"url", $"http_status_code")
  .filter(!hasHTTPStatus($"http_status_code", lit(statusCodes)))
```

### Python DF

```python
from aut import *
from pyspark.sql.functions import col

status_codes = ["200", "000"]

WebArchive(sc, sqlContext, "/path/to/warcs") \
  .all() \
  .select("url", "http_status_code") \
  .filter(~col("http_status_code").isin(status_codes))
```

## Has Images

Filters or keeps all data that does or does not contain images.
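
The Scala example below keeps only records that contain images. To do the opposite and drop them, the same predicate can be negated; this is a sketch using the same placeholder path and the same `hasImages` and `detectMimeTypeTika` UDFs as the example that follows, filtering before projecting so every referenced column is still available:

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

// Keep only records that do *not* contain images by negating the predicate.
RecordLoader.loadArchives("/path/to/warcs", sc)
  .all()
  .filter(!hasImages($"crawl_date", detectMimeTypeTika($"bytes")))
  .select($"mime_type_tika", $"mime_type_web_server", $"url")
```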
+ +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +RecordLoader.loadArchives("/path/to/warcs",sc) + .all() + .select($"mime_type_tika", $"mime_type_web_server", $"url") + .filter(hasImages($"crawl_date", detectMimeTypeTika($"bytes"))) +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .select("mime_type_tika", "mime_type_web_server", "url") \ + .filter(col("mime_type_tika").like("image/%") | col("mime_type_web_server").like("image/%")) +``` + +## Has Languages + +Filters or keeps all data that does or does not match the language(s) ([ISO +639-2 codes](https://www.loc.gov/standards/iso639-2/php/code_list.php)) +specified. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val languages = Array("th","de","ht") + +RecordLoader.loadArchives("/path/to/warcs",sc) + .webpages() + .select($"language", $"url", $"content") + .filter(!hasContent($"language", lit(languages))) +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +languages = ["th","de","ht"] + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("language", "url", "content") \ + .filter(~col("language").isin(languages)) +``` + +## Keep MIME Types (Apache Tika) + +Filters or keeps all data that does or does not match the MIME type(s) +(identified by [Apache Tika](https://tika.apache.org/)) specified. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val mimeTypes = Array("text/html", "text/plain") + +RecordLoader.loadArchives("/path/to/warcs",sc) + .all() + .select($"url", $"mime_type_tika") + .filter(!hasMIMETypesTika($"mime_type_tika", lit(mimeTypes))) +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +mime_types = ["text/html", "text/plain"] + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .select("url", "mime_type_tika") \ + .filter(~col("mime_type_tika").isin(mime_types)) +``` + +## Keep MIME Types (web server) + +Filters or keeps all data that does or does not match the MIME type(s) +(identified by the web server) specified. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val mimeTypes = Array("text/html", "text/plain") + +RecordLoader.loadArchives("/path/to/warcs",sc) + .all() + .select($"url", $"mime_type_web_server") + .filter(!hasMIMETypes($"mime_type_web_server", lit(mimeTypes))) +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +mime_types = ["text/html", "text/plain"] + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .select("url", "mime_type_web_server") \ + .filter(~col("mime_type_web_server").isin(mime_types)) +``` + +## Has URL Patterns + +Filters or removes all data that does or does not pass a specified regular +expression test on URL patterns. 
+ +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val urlsPattern = Array(".*images.*") + +RecordLoader.loadArchives("/path/to/warcs",sc) + .all() + .select($"url", $"raw_content") + .filter(hasUrlPatterns($"url", lit(urlsPattern))) +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +url_pattern = ".*images.*" + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .select("url", "raw_content") \ + .filter(~col("url").rlike(url_pattern)) +``` + +## Has URLs + +Filters or keeps all data that does or does not match the URL(s) specified. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val urls = Array("archive.org") + +RecordLoader.loadArchives("/path/to/warcs",sc) + .all() + .select($"url", $"raw_content") + .filter(hasUrls($"url", lit(urls))) +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +urls = ["archive.org"] + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .select("url", "raw_content") \ + .filter(~col("url").isin(urls)) +``` diff --git a/website/versioned_docs/version-1.0.0/image-analysis.md b/website/versioned_docs/version-1.0.0/image-analysis.md new file mode 100644 index 00000000..48b8bf91 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/image-analysis.md @@ -0,0 +1,357 @@ +--- +id: version-1.0.0-image-analysis +title: Image Analysis +original_id: image-analysis +--- + +The Archives Unleashed Toolkit supports image analysis, a growing area of +interest within web archives. + +## Extract Image Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).images(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"width", $"height", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from images in a web collection: + +- image url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- Width +- Height +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server|mime_type_tika|width|height| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +|http://www.archiv...|mediatype_movies.gif| gif| image/gif| image/gif| 21| 21|ff05f9b408519079c...|194800d702aab9b87...|R0lGODlhFQAVAKUpA...| +|http://www.archiv...| LOCLogoSmall.jpg| jpg| image/jpeg| image/jpeg| 275| 300|fbf1aec668101b960...|564c1a07152c12cea...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...| archive.small.jpg| jpg| image/jpeg| image/jpeg| 300| 225|f611b554b9a44757d...|e9bf7ef0ae3fc50f5...|/9j/4RpBRXhpZgAAT...| +|http://tsunami.ar...| tsunamiweb1_02.jpg| jpg| image/jpeg| image/jpeg| 384| 229|f02005e29ffb485ca...|9eeb9c3c67d7efc51...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...|alexa_websearch_l...| gif| image/gif| image/gif| 301| 47|eecc909992272ce0d...|ea18e226f3cf40005...|R0lGODlhLQEvAPcAA...| +|http://www.archiv...| 
lizardtech.gif| gif| image/gif| image/gif| 140| 37|e7166743861126e51...|cf26e9ffc27be133f...|R0lGODlhjAAlANUwA...| +|http://www.archiv...| half_star.png| png| image/png| image/png| 14| 12|e1e101f116d9f8251...|736abd06a978e2fd2...|iVBORw0KGgoAAAANS...| +|http://www.archiv...| hewlett.jpg| jpg| image/jpeg| image/jpeg| 300| 116|e1da27028b81db60e...|eb418c17901b1b313...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...|prelinger-header-...| jpg| image/jpeg| image/jpeg| 84| 72|d39cce8b2f3aaa783...|1c41a644123e8f861...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...| arrow.gif| gif| image/gif| image/gif| 13| 11|c7ee6d7c17045495e...|7013764e619066e60...|R0lGODlhDQALALMAA...| +|http://www.archiv...| folder.png| png| image/png| image/png| 20| 15|c1905fb5f16232525...|ff7b8c60e8397cb5d...|iVBORw0KGgoAAAANS...| +|http://www.archiv...| wayback-wtc.gif| gif| image/gif| image/gif| 35| 35|c15ec074d95fe7e1e...|f45425406600b136d...|R0lGODlhIwAjANUAA...| +|http://www.archiv...| clicktoplay.png| png| image/png| image/png| 320| 240|b148d9544a1a65ae4...|477105e3a93b60dd8...|iVBORw0KGgoAAAANS...| +|http://www.archiv...| orange_arrow.gif| gif| image/gif| image/gif| 8| 11|a820ac93e2a000c9d...|850b9daeef06bee6e...|R0lGODlhCAALAJECA...| +|http://www.archiv...| arc-it-tagline.gif| gif| image/gif| image/gif| 385| 30|9f70e6cc21ac55878...|4601e2f642d8e55ac...|R0lGODlhgQEeALMPA...| +|http://www.archiv...| guitar.jpg| jpg| image/jpeg| image/jpeg| 140| 171|9ed163df5065418db...|f6c9475009ae2416c...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...| blendbar.jpg| jpg| image/jpeg| image/jpeg| 1800| 89|9e41e4d6bdd53cd9d...|dc780bf80720c87c9...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...|alexalogo-archive...| gif| image/gif| image/gif| 304| 36|9da73cf504be0eb70...|03e530ef04e4b68f7...|R0lGODlhMAEkAOYAA...| +|http://www.archiv...| lma.jpg| jpg| image/jpeg| image/jpeg| 215| 71|97ebd3441323f9b5d...|ff9485b26300721b2...|/9j/4AAQSkZJRgABA...| +|http://i.creative...| 88x31.png| png| image/png| image/png| 88| 31|9772d34b683f8af83...|689bef4ffb8918612...|iVBORw0KGgoAAAANS...| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ + +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [url: string, filename: string ... 
7 more fields] +``` + +If you wanted to work with all the images in a collection, you could extract +them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).images(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.images() +df.show() +``` + +Will extract all following information from images in a web collection: + +- image url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- Width +- Height +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server|mime_type_tika|width|height| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +|http://farm3.stat...|4047878934_ef12ba...| jpg| image/jpeg| image/jpeg| 100| 75|e1a376f170b815f49...|2165fd2908950e9f6...|/9j/4AAQSkZJRgABA...| +|http://farm3.stat...|4047881126_fc6777...| jpg| image/jpeg| image/jpeg| 75| 100|371a2a5142c611405...|933f937c949826696...|/9j/4AAQSkZJRgABA...| +|http://farm3.stat...|4047879492_a72dd8...| jpg| image/jpeg| image/jpeg| 100| 75|8877679361cde970d...|31dbaaed2f7194c95...|/9j/4AAQSkZJRgABA...| +|http://farm3.stat...|4047877728_c6c118...| jpg| image/jpeg| image/jpeg| 75| 100|8f009a568d47e1888...|7b83e7d6c78ed65cf...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|96d9290d060547781...|2d3005bd6e09ca064...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|c69d65d4880445b31...|abe40cb96bfc79095...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|cb11c08d43e25ec3b...|2060857d6cf41b141...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|756b5a0a83a621eb7...|d4625efc80efb985e...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|0b60007c3e3d9d63f...|a154035590a01efb4...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|97fdea388e1245691...|e415a77a4369ecef8...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|05c2d43f687f40b60...|ed3f6ca2f3d7e9569...|/9j/4AAQSkZJRgABA...| +|http://www.canadi...| WebResource.axd| gif| image/gif| image/gif| 1| 1|325472601571f31e1...|2daeaa8b5f19f0bc2...|R0lGODlhAQABAIAAA...| +|http://www.davids...|footprint-carbon.jpg| jpg| image/jpeg| image/jpeg| 200| 200|51f57de92e76f3edc...|c970137cd3bfdbbba...|/9j/4AAQSkZJRgABA...| +|http://www.gca.ca...| 15.jpg| jpg| image/jpeg| image/jpeg| 300| 230|8b3c192b9a0cc82d6...|851377ed11c9cd153...|/9j/4AAQSkZJRgABA...| +|http://www.equalv...|loadingAnimation.gif| gif| image/gif| image/gif| 208| 13|c33734a1bf58bec32...|2bb50e01775289c24...|R0lGODlh0AANAMQAA...| +|http://www.davids...|Keep-greening-gre...| jpg| image/jpeg| image/jpeg| 166| 252|4763383a8be13c735...|a42b963e18dc1e7d4...|/9j/4AAQSkZJRgABA...| 
+|http://www.davids...|Keep-greening-don...| jpg| image/jpeg| image/jpeg| 146| 252|515bd44bea759e169...|75abeb65cc4f54c7d...|/9j/4AAQSkZJRgABA...| +|http://www.davids...|Keep-greening-eca...| jpg| image/jpeg| image/jpeg| 158| 252|345f71df9702e99a0...|b6637ac654f6e2073...|/9j/4AAQSkZJRgABA...| +|http://www.davids...|Keep-greening-tit...| jpg| image/jpeg| image/jpeg| 470| 45|385522fde90ac7e96...|b42151cf8c3ce14e0...|/9j/4AAQSkZJRgABA...| +|http://www.davids...| last_minute2.jpg| jpg| image/jpeg| image/jpeg| 265| 33|3defee897d4c553fc...|37c790bbc23c369d8...|/9j/4AAQSkZJRgABA...| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` + +## Extract Most Frequent Image URLs + +### Scala RDD + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .flatMap(r => ExtractImageLinks(r.getUrl, r.getContentString)) + .countItems() + .take(10) +``` + +Will extract the top ten URLs of images found within a collection, in an array +like so: + +```scala +links: Array[(String, Int)] = Array((http://www.archive.org/images/star.png,408), (http://www.archive.org/images/no_star.png,122), (http://www.archive.org/images/logo.jpg,118), (http://www.archive.org/images/main-header.jpg,84), (http://www.archive.org/images/rss.png,20), (http://www.archive.org/images/mail.gif,13), (http://www.archive.org/images/half_star.png,10), (http://www.archive.org/images/arrow.gif,7), (http://ia300142.us.archive.org/3/items/americana/am_libraries.gif?cnt=0,3), (http://ia310121.us.archive.org/2/items/GratefulDead/gratefuldead.gif?cnt=0,3), (http://www.archive.org/images/wayback.gif,2), (http://www.archive.org/images/wayback-election2000.gif,2), (http://www.archive.org/images/wayback-wt... +``` + +If you wanted to work with the images, you could download them from the +Internet Archive. + +Let's use the top-ranked example. [This +link](http://web.archive.org/web/*/http://archive.org/images/star.png), for +example, will show you the temporal distribution of the image. For a snapshot +from September 2007, this URL would work: + + + +To do analysis on all images, you could thus prepend +`http://web.archive.org/web/20070913051458/` to each URL and `wget` them en +masse. + +For more information on `wget`, please consult [this lesson available on the +Programming Historian +website](http://programminghistorian.org/lessons/automated-downloading-with-wget). 
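+
+If you would rather stay in Scala than shell out to `wget`, the same idea can
+be sketched with plain JDK I/O. This is a minimal sketch only: the snapshot
+timestamp, the example URLs, and the output directory below are placeholders
+you will want to replace with your own values.
+
+```scala
+import java.net.URL
+import java.nio.file.{Files, Paths}
+
+// Wayback Machine snapshot prefix (placeholder timestamp from the example above).
+val prefix = "http://web.archive.org/web/20070913051458/"
+
+// A few of the frequently linked image URLs; swap in your own list.
+val imageUrls = Seq(
+  "http://www.archive.org/images/star.png",
+  "http://www.archive.org/images/no_star.png",
+  "http://www.archive.org/images/logo.jpg"
+)
+
+// Local directory that will hold the downloaded files.
+val outDir = Paths.get("/path/to/downloaded-images")
+Files.createDirectories(outDir)
+
+imageUrls.foreach { u =>
+  val target = outDir.resolve(u.substring(u.lastIndexOf('/') + 1))
+  val in = new URL(prefix + u).openStream()
+  try Files.copy(in, target) finally in.close()
+}
+```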
+ +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).imagegraph(); + +df.groupBy($"image_url") + .count() + .orderBy($"count".desc) + .show(10) +``` + +Will extract the top ten URLs of images found within a collection, in a +DataFrame like so: + +```dataframe ++--------------------+-----+ +| image_url|count| ++--------------------+-----+ +|http://www.archiv...| 408| +|http://www.archiv...| 122| +|http://www.archiv...| 83| +|http://www.archiv...| 49| +|http://www.archiv...| 20| +|http://www.archiv...| 13| +|http://www.archiv...| 10| +|http://www.archiv...| 7| +|http://ia300142.u...| 3| +|http://ia310121.u...| 3| ++--------------------+-----+ +only showing top 10 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [src: string, image_url: string] +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.imagegraph() + +df.groupBy("image_url") + .count() + .orderBy("count", ascending=False) + .show(10) +``` + +Will extract the top ten URLs of images found within a collection, in a +DataFrame like so: + +```dataframe ++--------------------+-----+ +| image_url|count| ++--------------------+-----+ +|http://www.archiv...| 408| +|http://www.archiv...| 122| +|http://www.archiv...| 83| +|http://www.archiv...| 49| +|http://www.archiv...| 20| +|http://www.archiv...| 13| +|http://www.archiv...| 10| +|http://www.archiv...| 7| +|http://ia300142.u...| 3| +|http://ia310121.u...| 3| ++--------------------+-----+ +``` + +## Extract Most Frequent Images MD5 Hash + +Some images may be the same, but have different URLs. This UDF finds the +popular images by calculating the MD5 hash of each and presents the most +frequent images based on that metric. This script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.app._ +import io.archivesunleashed.matchbox._ + +val r = RecordLoader.loadArchives("/path/to/warcs",sc).persist() +ExtractPopularImages(r, 500, sc).saveAsTextFile("500-Popular-Images") +``` + +Will save the 500 most popular URLs to an output directory. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.app._ + +val df = RecordLoader.loadArchives("/path/to/warcs",sc).images() + +ExtractPopularImagesDF(df,10,30,30).show() +``` + +### Python DF + +```python +from aut import * + +images = WebArchive(sc, sqlContext, "/path/to/warcs").images() + +popular_images = ExtractPopularImages(images, 20, 10, 10) + +popular_images.show() +``` + +## Find Images Shared Between Domains + +How to find images shared between domains that appear more than once _in more +than one domain_. 
+ +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val images = RecordLoader.loadArchives("/path/to/warcs", sc) + .images() + .select(removePrefixWWW(extractDomain($"url")).as("domain"), $"url", $"md5") + +val links = images.groupBy("md5").count().where(countDistinct("domain")>=2) + +val result = images.join(links, "md5") + .groupBy("domain", "md5") + .agg(first("url").as("image_url")) + .orderBy(asc("md5")) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("/path/to/output") +``` + +### PythonDF + +```python +from aut import * +from pyspark.sql.functions import asc, countDistinct, first + +images = WebArchive(sc, sqlContext, "/path/to/warcs") \ + .images() \ + .select(remove_prefix_www(extract_domain("url")).alias("domain"), "url", "md5") + +links = images.groupBy("md5") \ + .count() \ + .where(countDistinct("domain")>=2) + +result = images.join(links, "md5") \ + .groupBy("domain", "md5") \ + .agg(first("url").alias("image_url")) \ + .orderBy(asc("md5")) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("/path/to/output") +``` diff --git a/website/versioned_docs/version-1.0.0/link-analysis.md b/website/versioned_docs/version-1.0.0/link-analysis.md new file mode 100644 index 00000000..a92434ef --- /dev/null +++ b/website/versioned_docs/version-1.0.0/link-analysis.md @@ -0,0 +1,593 @@ +--- +id: version-1.0.0-link-analysis +title: Link Analysis +original_id: link-analysis +--- + +Site link structures can be very useful, allowing you to learn such things as: + +- what websites were the most linked to; +- what websites had the most outbound links; +- what paths could be taken through the network to connect pages; +- what communities existed within the link structure? + +Most of the following examples show the **domain** to **domain** links. For +example, you discover how many times that `liberal.ca` linked to `twitter.com`, +rather than learning that `http://liberal.ca/contact` linked to +`http://twitter.com/liberal_party`. The reason we do that is that in general, +if you are working with any data at scale, the sheer number of raw URLs can +become overwhelming. That said, we do provide one example below that provides +raw data. + +## Extract Simple Site Link Structure + +### Scala RDD + +If your web archive does not have a temporal component, the following Spark +script will generate the site-level link structure. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .flatMap(r => ExtractLinks(r.getUrl, r.getContentString)) + .map(r => (ExtractDomain(r._1).removePrefixWWW(), ExtractDomain(r._2).removePrefixWWW())) + .filter(r => r._1 != "" && r._2 != "") + .countItems() + .filter(r => r._2 > 5) + .saveAsTextFile("links-all-rdd/") +``` + +Note how you can add filters. In this case, we add a filter which +will result in a network graph of pages containing the phrase "apple." Filters +can be applied immediately after `.keepValidPages()`. 
+ +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepContent(Set("apple".r)) + .flatMap(r => ExtractLinks(r.getUrl, r.getContentString)) + .map(r => (ExtractDomain(r._1).removePrefixWWW(), ExtractDomain(r._2).removePrefixWWW())) + .filter(r => r._1 != "" && r._2 != "") + .countItems() + .filter(r => r._2 > 5) + .saveAsTextFile("links-all-apple-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webgraph() + .groupBy(removePrefixWWW(extractDomain($"src")).as("src"), removePrefixWWW(extractDomain($"dest")).as("dest")) + .count() + .filter($"count" > 5) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("links-all-df/") +``` + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val content = Array("radio") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .all() + .keepValidPagesDF() + .filter(hasContent($"raw_content", lit(content))) + .select(explode(extractLinks($"url", $"raw_content")).as("links")) + .select(removePrefixWWW(extractDomain(col("links._1"))).as("src"), removePrefixWWW(extractDomain(col("links._2"))).as("dest")) + .groupBy("src", "dest") + .count() + .filter($"count" > 5) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("links-all-apple-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col, explode + +content = "%radio%" + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .filter("crawl_date is not NULL")\ + .filter(~(col("url").rlike(".*robots\\.txt$")) & (col("mime_type_web_server").rlike("text/html") | col("mime_type_web_server").rlike("application/xhtml+xml") | col("url").rlike("(?i).*htm$") | col("url").rlike("(?i).*html$")))\ + .filter(col("http_status_code") == 200) + .filter(col("raw_content").like(content)) \ + .select(explode(extract_links("url", "raw_content")).alias("links")) \ + .select(remove_prefix_www(extract_domain(col("links._1"))).alias("src"), remove_prefix_www(extract_domain(col("links._2"))).alias("dest")) \ + .groupBy("src", "dest") \ + .count() \ + .filter(col("count") > 5) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("links-all-apple-df/") +``` + +## Extract Raw URL Link Structure + +### Scala RDD + +This following script extracts all of the hyperlink relationships between +sites, using the full URL pattern. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .flatMap(r => ExtractLinks(r.getUrl, r.getContentString)) + .filter(r => r._1 != "" && r._2 != "") + .countItems() + .saveAsTextFile("full-links-all-rdd/") +``` + +You can see that the above was achieved by removing the following line: + +```scala + .map(r => (ExtractDomain(r._1).removePrefixWWW(), ExtractDomain(r._2).removePrefixWWW())) +``` + +In a larger collection, you might want to add the following line: + +```scala +.filter(r => r._2 > 5) +``` + +before `.countItems()` to find just the documents that are linked to more than +five times. As you can imagine, raw URLs are very numerous! 
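+
+For reference, here is a sketch of the full raw-URL pipeline with that count
+filter applied to the output of `.countItems()` (the output path below is only
+a placeholder):
+
+```scala
+import io.archivesunleashed._
+import io.archivesunleashed.matchbox._
+
+RecordLoader.loadArchives("/path/to/warcs", sc)
+  .keepValidPages()
+  .flatMap(r => ExtractLinks(r.getUrl, r.getContentString))
+  .filter(r => r._1 != "" && r._2 != "")
+  .countItems()
+  // Keep only source/target URL pairs that occur more than five times.
+  .filter(r => r._2 > 5)
+  .saveAsTextFile("full-links-all-filtered-rdd/")
+```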
+ +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webgraph() + .groupBy(extractDomain($"src"), extractDomain($"dest")) + .count() + .filter($"count" > 5) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("full-links-all-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webgraph() \ + .groupBy(extract_domain("src"), extract_domain("dest")) \ + .count() \ + .filter(col("count") > 5) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("full-links-all-df/") +``` + +## Organize Links by URL Pattern + +### Scala RDD + +In this following example, we run the same script but only extract links coming +from URLs matching the pattern `http://www.archive.org/details/*`. We do so by +using the `keepUrlPatterns` command. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepUrlPatterns(Set("(?i)http://www.archive.org/details/.*".r)) + .flatMap(r => ExtractLinks(r.getUrl, r.getContentString)) + .map(r => (ExtractDomain(r._1).removePrefixWWW(), ExtractDomain(r._2).removePrefixWWW())) + .filter(r => r._1 != "" && r._2 != "") + .countItems() + .filter(r => r._2 > 5) + .saveAsTextFile("details-links-all-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val urlPattern = Array("(?i)http://www.archive.org/details/.*") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .all() + .keepValidPagesDF() + .filter(hasUrlPatterns($"url", lit(urlPattern))) + .select(explode(extractLinks($"url", $"raw_content")).as("links")) + .select(removePrefixWWW(extractDomain(col("links._1"))).as("src"), removePrefixWWW(extractDomain(col("links._2"))).as("dest")) + .groupBy("src", "dest") + .count() + .filter($"count" > 5) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("details-links-all-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col, explode + +url_pattern = "%http://www.archive.org/details/%" + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .filter("crawl_date is not NULL")\ + .filter(~(col("url").rlike(".*robots\\.txt$")) & (col("mime_type_web_server").rlike("text/html") | col("mime_type_web_server").rlike("application/xhtml+xml") | col("url").rlike("(?i).*htm$") | col("url").rlike("(?i).*html$")))\ + .filter(col("http_status_code") == 200) + .filter(col("url").like(url_pattern)) \ + .select(explode(extract_links("url", "raw_content").alias("links"))) \ + .select(remove_prefix_www(extract_domain(col("links._1"))).alias("src"), remove_prefix_www(extract_domain("links._2")).alias("dest")) \ + .groupBy("src", "dest") \ + .count() \ + .filter(col("count") > 5) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("details-links-all-df/") +``` + +## Organize Links by Crawl Date + +### Scala RDD + +The following Spark script generates the aggregated site-level link structure, +grouped by crawl date (YYYYMMDD). 
It +makes use of the `ExtractLinks` and `ExtractToLevelDomain` functions. + +If you prefer to group by crawl month (YYYMM), replace `getCrawlDate` with +`getCrawlMonth` below. If you prefer to group by simply crawl year (YYYY), +replace `getCrawlDate` with `getCrawlYear` below. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc).keepValidPages() + .map(r => (r.getCrawlDate, ExtractLinks(r.getUrl, r.getContentString))) + .flatMap(r => r._2.map(f => (r._1, ExtractDomain(f._1).replaceAll("^\\s*www\\.", ""), ExtractDomain(f._2).replaceAll("^\\s*www\\.", "")))) + .filter(r => r._2 != "" && r._3 != "") + .countItems() + .filter(r => r._2 > 5) + .saveAsTextFile("sitelinks-by-date-rdd/") +``` + +The format of this output is: + +- Field one: Crawldate, `yyyyMMdd` +- Field two: Source domain (i.e. liberal.ca) +- Field three: Target domain of link (i.e. ndp.ca) +- Field four: number of links. + +```scala +((20080612,liberal.ca,liberal.ca),1832983) +((20060326,ndp.ca,ndp.ca),1801775) +((20060426,ndp.ca,ndp.ca),1771993) +((20060325,policyalternatives.ca,policyalternatives.ca),1735154) +``` + +In the above example, you are seeing links within the same domain. + +Note also that `ExtractLinksRDD` takes an optional third parameter of a base +URL. If you set this – typically to the source URL – `ExtractLinksRDD` will +resolve a relative path to its absolute location. For example, if `val url = +"http://mysite.com/some/dirs/here/index.html"` and `val html = "... Contact ..."`, and we call `ExtractLinks(url, html, +url)`, the list it returns will include the item +`(http://mysite.com/a/b/c/index.html, http://mysite.com/a/b/contact/, +Contact)`. It may be useful to have this absolute URL if you intend to call +`ExtractDomainRDD` on the link and wish it to be counted. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webgraph() + .groupBy($"crawl_date", removePrefixWWW(extractDomain($"src")), removePrefixWWW(extractDomain($"dest"))) + .count() + .filter($"count" > 5) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("sitelinks-by-date-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webgraph() \ + .groupBy("crawl_date", remove_prefix_www(extract_domain("src")).alias("src"), remove_prefix_www(extract_domain("dest")).alias("dest")) \ + .count() \ + .filter(col("count") > 5) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("sitelinks-by-date-df/") +``` + +## Filter by URL + +### Scala RDD + +In this case, you would only receive links coming from websites in matching the +URL pattern listed under `keepUrlPatterns`. 
+ +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +val links = RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepUrlPatterns(Set("http://www.archive.org/details/.*".r)) + .map(r => (r.getCrawlDate, ExtractLinks(r.getUrl, r.getContentString))) + .flatMap(r => r._2.map(f => (r._1, ExtractDomain(f._1).replaceAll("^\\s*www\\.", ""), ExtractDomain(f._2).replaceAll("^\\s*www\\.", "")))) + .filter(r => r._2 != "" && r._3 != "") + .countItems() + .filter(r => r._2 > 5) + .saveAsTextFile("sitelinks-details-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val urlPattern = Array("http://www.archive.org/details/.*") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .all() + .keepValidPagesDF() + .filter(hasUrlPatterns($"url", lit(urlPattern))) + .select(explode(extractLinks($"url", $"raw_content")).as("links")) + .select(removePrefixWWW(extractDomain(col("links._1"))).as("src"), removePrefixWWW(extractDomain(col("links._2"))).as("dest")) + .groupBy("src", "dest") + .count() + .filter($"count" > 5) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("sitelinks-details-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col, explode + +url_pattern = "http://www.archive.org/details/.*" + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .filter("crawl_date is not NULL")\ + .filter(~(col("url").rlike(".*robots\\.txt$")) & (col("mime_type_web_server").rlike("text/html") | col("mime_type_web_server").rlike("application/xhtml+xml") | col("url").rlike("(?i).*htm$") | col("url").rlike("(?i).*html$")))\ + .filter(col("http_status_code") == 200) + .filter(col("url").rlike(url_pattern)) \ + .select(explode(extract_links("url", "raw_content")).alias("links")) \ + .select(remove_prefix_www(extract_domain(col("links._1"))).alias("src"), remove_prefix_www(extract_domain(col("links._2"))).alias("dest")) \ + .groupBy("src", "dest") \ + .count() \ + .filter(col("count") > 5) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("sitelinks-details-df/") +``` + +## Export to Gephi + +You may want to export your data directly to the [Gephi software +suite](http://gephi.github.io/), an open-source network analysis project. The +following code writes to the GEXF format: + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +import io.archivesunleashed.app._ + +val graph = RecordLoader.loadArchives("/path/to/warcs",sc) + .webgraph.groupBy( + $"crawl_date", + removePrefixWWW(extractDomain($"src")).as("src_domain"), + removePrefixWWW(extractDomain($"dest")).as("dest_domain")) + .count() + .filter(!($"dest_domain"==="")) + .filter(!($"src_domain"==="")) + .filter($"count" > 5) + .orderBy(desc("count")) + .collect() + +WriteGEXF(graph, "links-for-gephi.gexf") +``` + +We also support exporting to the +[GraphML](https://en.wikipedia.org/wiki/GraphML) format. 
To do so, use +the `WriteGraphml` method: + +```scala +WriteGraphML(graph, "links-for-gephi.graphml") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col, desc + +graph = WebArchive(sc, sqlContext, "/path/to/data") \ + .webgraph() \ + .groupBy("crawl_date", remove_prefix_www(extract_domain("src")).alias("src_domain"), remove_prefix_www(extract_domain("dest")).alias("dest_domain")) \ + .count() \ + .filter((col("dest_domain").isNotNull()) & (col("dest_domain") !="")) \ + .filter((col("src_domain").isNotNull()) & (col("src_domain") !="")) \ + .filter(col("count") > 5) \ + .orderBy(desc("count")) \ + .collect() + +WriteGEXF(graph, "links-for-gephi.gexf") +``` + +We also support exporting to the +[GraphML](https://en.wikipedia.org/wiki/GraphML) format. To do so, use +the `WriteGraphml` method: + +```python +WriteGraphML(graph, "links-for-gephi.graphml") +``` + +## Finding Hyperlinks within Collection on Pages with Certain Keyword + +The following script will extract a DataFrame with the following columns, +`domain`, `url`, `crawl date`, `origin page`, and `destination page`, given a +search term `Keystone` of the content (full-text). The example uses the sample +data in +[`aut-resources`](https://github.com/archivesunleashed/aut-resources/tree/master/Sample-Data). + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val result = udf((vs: Seq[Any]) => vs(0) + .toString + .split(",")(1)) + +val df = RecordLoader.loadArchives("/path/to/warcs", sc) + .all() + .keepValidPagesDF() + .select($"domain", + $"url", + $"crawl_date", + explode_outer(extractLinks($"url", $"raw_content")) + .as("link")) + .filter($"raw_content".contains("keystone")) + +df.select($"url", $"domain", $"crawl_date", result(array($"link")) + .as("destination_page")) + .show() + +// Exiting paste mode, now interpreting. 
+ ++--------------------+---------------+----------+--------------------+ +| url| domain|crawl_date| destination_page| ++--------------------+---------------+----------+--------------------+ +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| ++--------------------+---------------+----------+--------------------+ +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +result: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(,StringType,None) +df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Domain: string, url: string ... 2 more fields] +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col, explode_outer + +webpages = WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .filter("crawl_date is not NULL")\ + .filter(~(col("url").rlike(".*robots\\.txt$")) & (col("mime_type_web_server").rlike("text/html") | col("mime_type_web_server").rlike("application/xhtml+xml") | col("url").rlike("(?i).*htm$") | col("url").rlike("(?i).*html$")))\ + .filter(col("http_status_code") == 200) + .select("domain", "url", "crawl_date", explode_outer(extract_links("url", "raw_content")).alias("link")) \ + .filter(col("raw_content").like("%food%")) \ + .select("url", "domain", "crawl_date", col("link._1").alias("destination_page")) \ + .show() +``` diff --git a/website/versioned_docs/version-1.0.0/text-analysis.md b/website/versioned_docs/version-1.0.0/text-analysis.md new file mode 100644 index 00000000..544b9ee1 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/text-analysis.md @@ -0,0 +1,597 @@ +--- +id: version-1.0.0-text-analysis +title: Text Analysis +original_id: text-analysis +--- + +## Extract All Plain Text + +### Scala RDD + +This script extracts the crawl date, domain, URL, and plain text from HTML +files in the sample ARC data (and saves the output to out/). By default, HTTP +headers are included in the plain text that is extracted. 
+ +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(r.getContentString))) + .saveAsTextFile("plain-text-rdd/") +``` + +Note that this will create a new directory to store the output, which cannot +already exist. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .select($"crawl_date", $"domain", $"url", removeHTML($"content")) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-df/") +``` + +### Python DF + +```python +from aut import * + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", remove_html("content")) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("plain-text-df/") +``` + +## Extract Plain Text Without HTTP Headers + +### Scala RDD + +If you want to remove HTTP headers, you can add one more command: +`RemoveHTTPHeader`. The script would then look like: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-noheaders-rdd/") +``` + +As most plain text use cases do not require HTTP headers to be in the output, +we are removing headers in the following examples. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .select(removeHTML(removeHTTPHeader($"content"))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-noheaders-df/") +``` + +### Python DF + +```python +from aut import * + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select(remove_html(remove_http_header("content"))) \ + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save("plain-text-noheaders-df/") +``` + +## Extract Plain Text By Domain + +### Scala RDD + +The following Spark script generates plain text renderings for all the web +pages in a collection with a URL matching a filter string. In the example case, +it will go through the collection and find all of the URLs within the +"archive.org" domain. 
+ +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepDomains(Set("archive.org")) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-domain-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val domains = Array("archive.org", "geocities.org") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .select($"crawl_date", $"domain", $"url", removeHTML(removeHTTPHeader($"content"))) + .filter(hasDomains($"domain", lit(domains))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-domain-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +domains = ["archive.org"] + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", remove_html(remove_http_header("content"))) \ + .filter(col("domain").isin(domains)) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("plain-text-domain-df/") +``` + +## Extract Plain Text by URL Pattern + +### Scala RDD + +The following Spark script generates plain text renderings for all the web +pages in a collection with a URL matching a regular expression pattern. In the +example case, it will go through a WARC file and find all of the URLs beginning +with `http://archive.org/details/`, and save the text of those URLs. + +The `(?i)` makes this query case insensitive. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepUrlPatterns(Set("(?i)http://www.archive.org/details/.*".r)) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("details-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val urlPattern = Array("(?i)http://www.archive.org/details/.*") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .select($"crawl_date", $"domain", $"url", removeHTML(removeHTTPHeader($"content"))) + .filter(hasUrlPatterns($"url", lit(urlPattern))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("details-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +url_pattern = "%http://www.archive.org/details/%" + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", remove_html(remove_http_header("content"))) \ + .filter(col("url").like(url_pattern)) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("details-df/") +``` + +## Extract Plain Text Minus Boilerplate + +### Scala RDD + +The following Spark script generates plain text renderings for all the web +pages in a collection, minus "boilerplate" content: advertisements, +navigational elements, and elements of the website template. Boilerplate requires +HTML, so it needs to run on `.all()` raw content. Not `.webpages()` content. 
For +more information on the boilerplate removal library we are using, [please see +this website and paper](http://www.l3s.de/~kohlschuetter/boilerplate/). + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepDomains(Set("archive.org")) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, ExtractBoilerpipeText(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-no-boilerplate-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val domains = Array("archive.org") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .all() + .select($"crawl_date", $"domain", $"url", extractBoilerpipeText(removeHTTPHeader($"content"))) + .filter(hasDomains($"domain", lit(domains))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-no-boilerplate-df/") +``` + +### Python DF + +```python +from aut import * + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", extract_boilerplate(remove_http_header("content")).alias("content")) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("plain-text-no-boilerplate-df/") +``` + +## Extract Plain Text Filtered by Date + +### Scala RDD + +AUT permits you to filter records by a list of full or partial date strings. It +conceives of the date string as a `DateComponent`. Use `keepDate` to specify +the year (`YYYY`), month (`MM`), day (`DD`), year and month (`YYYYMM`), or a +particular year-month-day (`YYYYMMDD`). + +The following Spark script extracts plain text for a given collection by date +(in this case, April 2008). + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepDate(List("200804"), ExtractDate.DateComponent.YYYYMM) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-date-filtered-200804/") +``` + +The following script extracts plain text for a given collection by year (in +this case, 2008). + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepDate(List("2008"), ExtractDate.DateComponent.YYYY) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-date-filtered-2008/") +``` + +Finally, you can also extract multiple dates or years. In this case, we would +extract pages from both 2008 and 2015. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepDate(List("2008","2015"), ExtractDate.DateComponent.YYYY) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-date-filtered-2008-2015-rdd/") +``` + +Note: if you created a dump of plain text using another one of the earlier +commands, you do not need to go back and run this. You can instead use bash to +extract a sample of text. 
For example, running this command on a dump of all +plain text stored in `alberta_education_curriculum.txt`: + +```bash +sed -n -e '/^(201204/p' alberta_education_curriculum.txt > alberta_education_curriculum-201204.txt +``` + +would select just the lines beginning with `(201204`, or April 2012. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val dates = Array("2008", "2015") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .select($"crawl_date", $"domain", $"url", removeHTML(removeHTTPHeader($"content"))) + .filter(hasDate($"crawl_date", lit(dates))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-date-filtered-2008-2015-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +dates = "2009[10][09]\d\d" + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", remove_html(remove_http_header("content"))) \ + .filter(col("crawl_date").rlike(dates)) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("plain-text-date-filtered-2008-2015-df/") +``` + +## Extract Plain Text Filtered by Language + +### Scala RDD + +The following Spark script keeps only French language pages from a certain +top-level domain. It uses the [ISO 639.2 language +codes](https://www.loc.gov/standards/iso639-2/php/code_list.php). + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepDomains(Set("archive.org")) + .keepLanguages(Set("fr")) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-fr-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val domains = Array("archive.org") +val languages = Array("fr") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .select($"crawl_date", $"domain", $"url", $"language", removeHTML(removeHTTPHeader($"content"))) + .filter(hasDomains($"domain", lit(domains))) + .filter(hasLanguages($"language", lit(languages))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-fr-df/") +``` + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val domains = Array("archive.org") +val languages = Array("fr") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .filter(hasDomains($"domain", lit(domains))) + .filter(hasLanguages($"language", lit(languages))) + .select($"crawl_date", $"domain", $"url", $"language", removeHTML(removeHTTPHeader($"content"))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-fr-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +domains = ["geocities.com"] +languages = ["fr"] + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", remove_html(remove_http_header("content"))) \ + .filter(col("domain").isin(domains)) \ + .filter(col("language").isin(languages)) \ + .write \ + 
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("plain-text-fr-df/") +``` + +## Extract Plain text Filtered by Keyword + +### Scala RDD + +The following Spark script keeps only pages containing a certain keyword, which +also stacks on the other scripts. + +For example, the following script takes all pages containing the keyword +"radio" in a collection. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs",sc) + .keepValidPages() + .keepContent(Set("radio".r)) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-radio-rdd/") +``` + +There is also `discardContent` which does the opposite, and can be used in +cases where, for example, you have a frequent keyword you are not interested +in. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val content = Array("radio") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .select($"crawl_date", $"domain", $"url", removeHTML(removeHTTPHeader($"content"))) + .filter(hasContent($"content", lit(content))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-radio-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +content = "%radio%" + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", remove_html(remove_http_header("content"))) \ + .filter(col("content").like(content)) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("plain-text-radio-df/") +``` + +## Extract Raw HTML + +### Scala RDD + +In most cases, users will be interested in working with plain text. In some +cases, however, you may want to work with the actual HTML of the pages +themselves (for example, looking for specific tags or HTML content). + +The following script will produce the raw HTML of a WARC file. You can use the +filters from above to filter it down accordingly by domain, language, etc. 
+ +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTTPHeader(r.getContentString))) + .saveAsTextFile("plain-html-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +RecordLoader.loadArchives("example.warc.gz", sc) + .webpages() + .select($"crawl_date", extractDomain($"url"), $"url", removeHTTPHeader($"content")) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-html-df/") +``` + +### Python DF + +```python +from aut import * + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", remove_http_header("content")) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("plain-html-df/") +``` diff --git a/website/versioned_docs/version-1.0.0/toolkit-walkthrough.md b/website/versioned_docs/version-1.0.0/toolkit-walkthrough.md new file mode 100644 index 00000000..c7124469 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/toolkit-walkthrough.md @@ -0,0 +1,424 @@ +--- +id: version-1.0.0-toolkit-walkthrough +title: Toolkit Walkthrough +original_id: toolkit-walkthrough +--- + +Welcome to the Archives Unleashed Toolkit hands-on walkthrough! + +![Spark Terminal](https://user-images.githubusercontent.com/218561/73990154-4d1bd800-4916-11ea-9b6e-10e4503dfa38.png) + +The reality of any hands-on workshop is that things will break. We've tried our +best to provide a robust environment that can let you walk through the basics +of the Archives Unleashed Toolkit alongside us. + +If you have any questions, let us know in [Slack](http://slack.archivesunleashed.org/)! + +## Table of Contents + +- [Installation and Use](#installation-and-use) + - [Hello World: Our First Script](#hello-world-our-first-script) +- [Extracting some Text](#extracting-some-text) + - [Ouch: Our First Error](#ouch-our-first-error) + - [Other Text Analysis Filters](#other-text-analysis-filters) +- [Web of Links: Network Analysis](#web-of-links-network-analysis) +- [Working with the Data](#working-with-the-data) +- [Acknowledgements and Final Notes](#acknowledgements-and-final-notes) + +## Installation and Use + +**Got Docker?** +This lesson requires that you install +[Docker](https://www.docker.com/get-docker), and use `docker-aut`. We have +instructions on how to install Docker +[here](https://github.com/archivesunleashed/docker-aut/wiki/Docker-Install-Instructions) +, as well as instructions on how to build and use `docker-aut` +[here](https://github.com/archivesunleashed/docker-aut#build-and-run). + +Later in this lesson, we use the networking tool [Gephi](https://gephi.org/). + +Make sure that Docker is running! If it isn't, you might see an error like +`docker: Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is +the docker daemon running?` – make sure to run it (on Mac, for example, you +need to run the Docker application itself). + +Make a directory in your userspace, somewhere where you can find it, on your desktop +perhaps. Call it `data`. In my case, I will create it on my desktop +and it will have a path like `/Users/ianmilligan1/desktop/data`. + +Use the following command, replacing `/path/to/your/data` with the directory. 
+**If you want to use your own ARC or WARC files, please put them in this +directory**. + +`docker run --rm -it -v "/path/to/your/data:/data" aut` + +For example, if your files are in `/Users/ianmilligan1/desktop/data` you would +run the above command like: + +`docker run --rm -it -v "/Users/ianmilligan1/desktop/data:/data" aut` + +

**Troubleshooting Tips**

The above commands are important, as they make the rest of the lesson possible!

Remember that you need to have the second `:/data` in the above example. This
maps the directory called "data" on my desktop to a directory inside the
Docker container called `/data`.

Also, if you are using Windows, you will need to provide the path as it appears
in your file system. For example: `C:\Users\ianmilligan1\data`.
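
Putting the Windows advice together with the earlier command, the launch would
look something like the sketch below (the username and folder are hypothetical,
and depending on your Docker setup you may need to adjust the path format):

```shell
docker run --rm -it -v "C:\Users\yourname\data:/data" aut
```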

Once you run this command, you will have to wait a few minutes while data is
downloaded and AUT builds. Once it is all working, you should see:

```shell
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/

Using Scala version 2.11.12 (OpenJDK 64-Bit Server VM, Java 1.8.0_212)
Type in expressions to have them evaluated.
Type :help for more information.

scala>
```

## Hello World: Our First Script

Now that we are at the prompt, let's get used to running commands. The easiest
way to use the Spark Shell is to _copy and paste_ in scripts that you've
written somewhere else.

Fortunately, the Spark Shell supports this functionality!

At the `scala>` prompt, type the following command and press Enter.

```shell
:paste
```

Now cut and paste the following script:

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
  .all()
  .keepValidPagesDF()
  .groupBy($"domain")
  .count()
  .sort($"count".desc)
  .show(10, false)
```

Let's take a moment to look at this script. It:

- begins by importing the AUT libraries;
- tells the program where it can find the data (in this case, the sample data
  that we have included in this Docker image);
- tells it only to keep the
  "[valid](https://aut.docs.archivesunleashed.org/docs/filters-rdd#scala-df)"
  pages, in this case HTML data;
- groups the pages by their base domain - i.e. for `www.google.com/cats` we
  are interested just in the domain, `google.com`;
- counts them - how many times does `google.com` appear in this collection,
  for example;
- and displays a DataFrame of the top ten!

Once it is pasted in, let's run it.

You run pasted scripts by pressing `ctrl` + `d`. Try that now.

You should see:

```dataframe
// Exiting paste mode, now interpreting.

+-------------------------+-----+
|domain                   |count|
+-------------------------+-----+
|equalvoice.ca            |4274 |
|liberal.ca               |1968 |
|policyalternatives.ca    |588  |
|greenparty.ca            |535  |
|fairvote.ca              |442  |
|ndp.ca                   |416  |
|davidsuzuki.org          |348  |
|canadiancrc.com          |88   |
|communist-party.ca       |39   |
|ccsd.ca                  |22   |
+-------------------------+-----+
only showing top 10 rows

import io.archivesunleashed._
import io.archivesunleashed.udfs._
```

We like to use this example for two reasons:

- It is fairly simple and lets us know that AUT is working;
- and it tells us what we can expect to find in the web archives! In this case,
  we have a lot of the Liberal Party of Canada, Equal Voice Canada, and the
  Green Party of Canada.

**If you loaded your own data above**, you can access that directory by
substituting the directory in the `loadArchives` command. Try it again!
Remember to type `:paste`, paste the following command in, and then `ctrl` +
`d` to execute.

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

RecordLoader.loadArchives("/data/*.gz", sc)
  .all()
  .keepValidPagesDF()
  .groupBy($"domain")
  .count()
  .sort($"count".desc)
  .show(10, false)
```

## Extracting some Text

Now that we know what we might find in a web archive, let us try extracting
some text. You might want to get just the text of a given website or domain,
for example.

Above we learned that the Liberal Party of Canada's website has 1,968 captures
in the sample files we provided. Let's try to just extract that text.
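
If you would like to double-check the capture count for a single domain before
extracting its text, a quick variation on the earlier counting script works;
this is a sketch that assumes the same sample data path and the `domain` column
we grouped on above.

```scala
import io.archivesunleashed._

// Count how many captures the sample data holds for a single domain.
RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
  .webpages()
  .filter($"domain" === "liberal.ca")
  .count()
```

The number printed should match the figure in the domain table above (1,968 for
liberal.ca in the sample data).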

To load the extraction script below, remember to type `:paste`, copy and paste
it into the shell, and then press `ctrl` + `d`.

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._
import org.apache.spark.sql.functions.lit

val domains = Array("liberal.ca")

RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
  .webpages()
  .select($"crawl_date", $"domain", $"url", $"content")
  .filter(hasDomains($"domain", lit(domains)))
  .write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/data/liberal-party-text")
```

**If you're using your own data, that's why the domain count was key!** Swap
out "liberal.ca" above for a domain that you want to look at in your own data.

Now let's look at the ensuing data. Go to the folder you provided in the very
first startup – remember, in my case it was `/Users/ianmilligan1/desktop/data`,
and you will now have a folder called `liberal-party-text`. Open up the files
with your text editor and check it out!

## Ouch: Our First Error

One of the vexing parts of this interface is that it creates output
directories, and if the directory already exists, the script comes tumbling
down.

As this is one of the most common errors, let's see it and then learn how to
get around it.

Try running the **exact same script** that you did above.

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._
import org.apache.spark.sql.functions.lit

val domains = Array("liberal.ca")

RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
  .webpages()
  .select($"crawl_date", $"domain", $"url", $"content")
  .filter(hasDomains($"domain", lit(domains)))
  .write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/data/liberal-party-text")
```

Instead of a nice crisp feeling of success, you will see a long dump of text
beginning with:

```scala
20/02/06 23:43:05 WARN SparkSession$Builder: Using an existing SparkSession; some configuration may not take effect.
org.apache.spark.sql.AnalysisException: path file:/data/liberal-party-text already exists.;
```

To get around this, you can do two things:

- Delete the existing directory that you created;
- Change the output path - to `/data/liberal-party-text-2`, for example.

Good luck!

## Other Text Analysis Filters

Take some time to explore the various filters that you can use. Check out the
[documentation](https://aut.docs.archivesunleashed.org/docs/filters-df)
for some ideas.

Some options:

- **Keep URL Patterns**: Instead of domains, what if you wanted to have text
  relating to just a certain pattern? Substitute `hasDomains` for
  `hasUrlPatterns`, for example:
  `.filter(hasUrlPatterns($"url", lit(Array(".*liberal.*", ".*ndp.*"))))`
- **Filter by Date**: What if we just wanted data from 2006? You could add the
  following command after `.webpages()`: `.filter(hasDate($"crawl_date", lit(Array("2006"))))`
- **Filter by Language**: What if you just want French-language pages? Add
  another filter: `.filter(hasLanguages($"language", lit(Array("fr"))))`.

For example, if we just wanted the French-language Liberal pages, we would run:

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._
import org.apache.spark.sql.functions.lit

val domains = Array("liberal.ca")
val languages = Array("fr")

RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
  .webpages()
  .filter(hasDomains($"domain", lit(domains)))
  .filter(hasLanguages($"language", lit(languages)))
  .select($"crawl_date", $"domain", $"url", $"content")
  .write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/data/liberal-party-french-text")
```

Or if we wanted to just have pages from 2006, we would run:

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._
import org.apache.spark.sql.functions.lit

val dates = Array("2006")

RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
  .webpages()
  .filter(hasDate($"crawl_date", lit(dates)))
  .select($"crawl_date", $"domain", $"url", $"content")
  .write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/data/2006-text")
```

Finally, if we want to remove the HTTP headers – let's say if we want to create
some nice word clouds – we can wrap the content column in the `removeHTTPHeader`
UDF:

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
  .webpages()
  .select($"crawl_date", $"domain", $"url", removeHTTPHeader($"content"))
  .write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/data/text-no-headers")
```

You could now try uploading one of the plain text files using a website like
[Voyant Tools](https://voyant-tools.org).

## Web of Links: Network Analysis

One other thing we can do is a network analysis. By now you are probably
getting good at running code.

Let's extract all of the links from the sample data and export them to a file
format that the popular network analysis program Gephi can use.

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._
import io.archivesunleashed.app._
import org.apache.spark.sql.functions.desc

val webgraph = RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc).webgraph()

val graph = webgraph.groupBy(
    $"crawl_date",
    removePrefixWWW(extractDomain($"src")).as("src_domain"),
    removePrefixWWW(extractDomain($"dest")).as("dest_domain"))
  .count()
  .filter(!($"dest_domain" === ""))
  .filter(!($"src_domain" === ""))
  .filter($"count" > 5)
  .orderBy(desc("count"))

WriteGEXF(graph.collect(), "/data/links-for-gephi.gexf")
```

By now this should seem pretty straightforward! (Remember to keep using
`:paste` to enter this code.)

## Working with the Data

The first step should be to work with this network diagram so you can make a
beautiful visualization yourself.

![Gephi visualization](https://archivesunleashed.org/images/gephi.png)

First, let's use these instructions to [work with Gephi](https://cloud.archivesunleashed.org/derivatives/gephi).

Second, we can begin to think about how to work with the plain text file.
See the following documents from our "learning guides":

- [**Filtering the Full-Text Derivative
  File**](https://cloud.archivesunleashed.org/derivatives/text-filtering): This
  tutorial explores the use of the "grep" command line tool to filter out
  dates, domains, and keywords from plain text.
- [**Text Analysis Part One: Beyond the Keyword Search: Using
  AntConc**](https://cloud.archivesunleashed.org/derivatives/text-antconc):
  This tutorial shows how you can explore text within a web archive using
  the AntConc tool.
- [**Text Analysis Part Two: Sentiment Analysis With the Natural Language
  Toolkit**](https://cloud.archivesunleashed.org/derivatives/text-sentiment):
  This tutorial explores how you can calculate the positivity or negativity (in
  an emotional sense) of web archive text.

Good luck and thanks for joining us on this lesson plan.

## Acknowledgements and Final Notes

The ARC and WARC files are drawn from the [Canadian Political Parties &
Political Interest Groups Archive-It
Collection](https://archive-it.org/collections/227), collected by the
University of Toronto. We are grateful that they've provided this material to
us.

If you use their material, please cite it along the following lines:

- University of Toronto Libraries, Canadian Political Parties and Interest
  Groups, Archive-It Collection 227, Canadian Action Party,

You can find more information about this collection at [WebArchives.ca](http://webarchives.ca/).
diff --git a/website/versioned_docs/version-1.0.0/usage.md b/website/versioned_docs/version-1.0.0/usage.md
new file mode 100644
index 00000000..ba1742e1
--- /dev/null
+++ b/website/versioned_docs/version-1.0.0/usage.md
@@ -0,0 +1,149 @@
---
id: version-1.0.0-usage
title: Usage
original_id: usage
---

## The Toolkit with Spark Submit

The Toolkit offers a variety of extraction jobs with
[`spark-submit`](https://spark.apache.org/docs/latest/submitting-applications.html).
These extraction jobs have a few configuration options.

The extraction jobs follow this basic outline:

```shell
spark-submit --class io.archivesunleashed.app.CommandLineAppRunner PATH_TO_AUT_JAR --extractor EXTRACTOR --input INPUT DIRECTORY --output OUTPUT DIRECTORY
```

More information on using the Toolkit with `spark-submit` can be found in the
[The Toolkit with spark-submit](aut-spark-submit-app.md) section of the documentation.

## The Toolkit with Spark Shell

To load the Toolkit in the Spark Shell, use the `--jars` option:

```shell
spark-shell --help

  --jars JARS                 Comma-separated list of jars to include on the driver
                              and executor classpaths.
```

### With an UberJar

Release version:

```shell
spark-shell --jars /path/to/aut-1.0.0-fatjar.jar
```

HEAD (built locally):

```shell
spark-shell --jars /path/to/aut/target/aut-1.0.1-SNAPSHOT-fatjar.jar
```

## The Toolkit with PySpark

To run PySpark with the Toolkit loaded, you will need to
provide PySpark with the Java/Scala artifact, as well as the Python bindings.
The Java/Scala artifact can be provided with `--jars` as
described above. The Python bindings can be
[downloaded](https://github.com/archivesunleashed/aut/releases/download/aut-1.0.0/aut-1.0.0.zip),
or [built locally](#building-locally) (the zip file will be found in
the `target` directory).

In each of the examples below, `/path/to/python` is listed. If you are unsure
where your Python is, it can be found with `which python`.
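
For example, the output might look like the following hypothetical session;
whatever path it prints is what you would substitute for `/path/to/python` in
the commands below.

```shell
# Hypothetical session; your path will differ.
$ which python
/home/nruest/bin/anaconda3/bin/python
```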

### With an UberJar

Release version:

```shell
export PYSPARK_PYTHON=/path/to/python; export PYSPARK_DRIVER_PYTHON=/path/to/python; /path/to/spark/bin/pyspark --py-files aut-1.0.0.zip --jars /path/to/aut-1.0.0-fatjar.jar
```

HEAD (built locally):

```shell
export PYSPARK_PYTHON=/path/to/python; export PYSPARK_DRIVER_PYTHON=/path/to/python; /path/to/spark/bin/pyspark --py-files aut.zip --jars /path/to/aut-1.0.1-SNAPSHOT-fatjar.jar
```

## The Toolkit with Jupyter

To run a [Jupyter Notebook](https://jupyter.org/install) with the Archives
Unleashed Toolkit loaded, you will need to provide PySpark with the Java/Scala
artifact and the Python bindings. The Java/Scala artifact can be provided
with `--jars` as described above. The Python bindings can be
[downloaded](https://github.com/archivesunleashed/aut/releases/download/aut-1.0.0/aut-1.0.0.zip),
or [built locally](#building-locally) (the zip file will be found in
the `target` directory).

### With an UberJar

Release version:

```shell
export PYSPARK_DRIVER_PYTHON=jupyter; export PYSPARK_DRIVER_PYTHON_OPTS=notebook; /path/to/spark/bin/pyspark --py-files aut-1.0.0.zip --jars /path/to/aut-1.0.0-fatjar.jar
```

HEAD (built locally):

```shell
export PYSPARK_DRIVER_PYTHON=jupyter; export PYSPARK_DRIVER_PYTHON_OPTS=notebook; /path/to/spark/bin/pyspark --py-files aut.zip --jars /path/to/aut-1.0.1-SNAPSHOT-fatjar.jar
```

A Jupyter Notebook _should_ automatically load in your browser at
`http://localhost:8888`. You may be asked for a token upon first launch, which
just offers a bit of security. The token is available in the load screen and
will look something like this:

```shell
[I 19:18:30.893 NotebookApp] Writing notebook server cookie secret to /run/user/1001/jupyter/notebook_cookie_secret
[I 19:18:31.111 NotebookApp] JupyterLab extension loaded from /home/nruest/bin/anaconda3/lib/python3.7/site-packages/jupyterlab
[I 19:18:31.111 NotebookApp] JupyterLab application directory is /home/nruest/bin/anaconda3/share/jupyter/lab
[I 19:18:31.112 NotebookApp] Serving notebooks from local directory: /home/nruest/Projects/au/aut
[I 19:18:31.112 NotebookApp] The Jupyter Notebook is running at:
[I 19:18:31.112 NotebookApp] http://localhost:8888/?token=87e7a47c5a015cb2b846c368722ec05c1100988fd9dcfe04
[I 19:18:31.112 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
[C 19:18:31.140 NotebookApp]

    To access the notebook, open this file in a browser:
        file:///run/user/1001/jupyter/nbserver-9702-open.html
    Or copy and paste one of these URLs:
        http://localhost:8888/?token=87e7a47c5a015cb2b846c368722ec05c1100988fd9dcfe04
```

Create a new notebook by clicking "New" (near the top right of the Jupyter
homepage) and select "Python 3" from the drop-down list.

The notebook will open in a new window. In the first cell enter:

```python
from aut import *

archive = WebArchive(sc, sqlContext, "src/test/resources/warc/")

webpages = archive.webpages()
webpages.printSchema()
```

Then hit Shift+Enter, or press the play button.

If you receive no errors, and see the following, you are ready to begin working
with your web archives!

![](https://user-images.githubusercontent.com/218561/63203995-42684080-c061-11e9-9361-f5e6177705ff.png)

## The Toolkit with Docker

The Toolkit offers a Docker container that can be used with Spark and PySpark.
The container is great for learning how the Toolkit works, and for quick
prototyping.
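
For example, once you have built the image following the `docker-aut`
instructions (assumed here to be tagged `aut`, as in the Toolkit walkthrough),
you can drop into an interactive Spark shell with a local directory mounted at
`/data`:

```shell
docker run --rm -it -v "/path/to/your/data:/data" aut
```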
+Containers are available for [each +release](https://github.com/archivesunleashed/docker-aut/branches), as well as +the `main` branch. + +More information on using the `docker-aut` can be found +[here](https://github.com/archivesunleashed/docker-aut). diff --git a/website/versions.json b/website/versions.json index e8a6fd72..8321bc29 100644 --- a/website/versions.json +++ b/website/versions.json @@ -1,4 +1,5 @@ [ + "1.0.0", "0.91.0", "0.90.4", "0.90.3",