From 71ee62e03afe42e4cc387d5c08aa294f6599d834 Mon Sep 17 00:00:00 2001 From: nruest Date: Sat, 11 Jun 2022 13:02:18 -0400 Subject: [PATCH] 1.0.0 --- .../version-1.0.0/auk-derivatives.md | 279 ++++ .../version-1.0.0/aut-at-scale.md | 124 ++ .../version-1.0.0/aut-spark-submit-app.md | 415 ++++++ .../version-1.0.0/binary-analysis.md | 1123 +++++++++++++++++ .../version-1.0.0/dataframe-schemas.md | 156 +++ .../version-1.0.0/extract-binary-info.md | 177 +++ .../version-1.0.0/extract-binary.md | 117 ++ .../version-1.0.0/filters-df.md | 328 +++++ .../version-1.0.0/image-analysis.md | 357 ++++++ .../version-1.0.0/link-analysis.md | 593 +++++++++ .../version-1.0.0/text-analysis.md | 597 +++++++++ .../version-1.0.0/toolkit-walkthrough.md | 424 +++++++ website/versioned_docs/version-1.0.0/usage.md | 149 +++ website/versions.json | 1 + 14 files changed, 4840 insertions(+) create mode 100644 website/versioned_docs/version-1.0.0/auk-derivatives.md create mode 100644 website/versioned_docs/version-1.0.0/aut-at-scale.md create mode 100644 website/versioned_docs/version-1.0.0/aut-spark-submit-app.md create mode 100644 website/versioned_docs/version-1.0.0/binary-analysis.md create mode 100644 website/versioned_docs/version-1.0.0/dataframe-schemas.md create mode 100644 website/versioned_docs/version-1.0.0/extract-binary-info.md create mode 100644 website/versioned_docs/version-1.0.0/extract-binary.md create mode 100644 website/versioned_docs/version-1.0.0/filters-df.md create mode 100644 website/versioned_docs/version-1.0.0/image-analysis.md create mode 100644 website/versioned_docs/version-1.0.0/link-analysis.md create mode 100644 website/versioned_docs/version-1.0.0/text-analysis.md create mode 100644 website/versioned_docs/version-1.0.0/toolkit-walkthrough.md create mode 100644 website/versioned_docs/version-1.0.0/usage.md diff --git a/website/versioned_docs/version-1.0.0/auk-derivatives.md b/website/versioned_docs/version-1.0.0/auk-derivatives.md new file mode 100644 index 00000000..f6673f8e --- /dev/null +++ b/website/versioned_docs/version-1.0.0/auk-derivatives.md @@ -0,0 +1,279 @@ +--- +id: version-1.0.0-auk-derivatives +title: ARCH Derivatives +original_id: auk-derivatives +--- + +How do I create the Toolkit generated derivatives that the Archives +Research Compute Hub creates on my own web archive collection? + +## Scala RDD + +**Will not be implemented.** + +## Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val warcs = "/path/to/warcs/*" +val results = "/path/to/results/" + +val webpages = RecordLoader.loadArchives(warcs, sc).webpages() +val webgraph = RecordLoader.loadArchives(warcs, sc).webgraph() + +// Domain frequency. +webpages.groupBy($"domain") + .count() + .sort($"count".desc) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "domains") + +// Domain graph. +webgraph.groupBy( + $"crawl_date", + removePrefixWWW(extractDomain($"src")).as("src_domain"), + removePrefixWWW(extractDomain($"dest")).as("dest_domain") + ) + .count() + .filter(!($"dest_domain" === "")) + .filter(!($"src_domain" === "")) + .filter($"count" > 5) + .orderBy(desc("count")) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "domain-graph") + +// Image graph. 
+RecordLoader.loadArchives(warcs, sc) + .imagegraph() + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "image-graph") + +// Web graph. +webgraph.write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "web-graph") + +// Web pages. +webpages.write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "webpages") + +// Binary information. +RecordLoader.loadArchives(warcs, sc) + .audio() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "audio") + +RecordLoader.loadArchives(warcs, sc) + .images() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"width", $"height", $"md5", $"sha1") + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "image") + +RecordLoader.loadArchives(warcs, sc) + .pdfs() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "pdf") + +RecordLoader.loadArchives(warcs, sc) + .presentationProgramFiles() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "presentation-program") + +RecordLoader.loadArchives(warcs, sc) + .spreadsheets() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "spreadsheet") + +RecordLoader.loadArchives(warcs, sc) + .videos() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "video") + +RecordLoader.loadArchives(warcs, sc) + .wordProcessorFiles() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save(results + "word-processor") + +sys.exit +``` + +## Python DF + +```python +from aut import * +from pyspark.sql.functions import col, desc + +warcs = "/path/to/warcs/*" +results = "/path/to/results/" + +webpages = WebArchive(sc, sqlContext, warcs).webpages() +webgraph = WebArchive(sc, sqlContext, warcs).webgraph() + +# Domain frequency. 
+webpages.groupBy("domain") \ + .count() \ + .sort(col("count")\ + .desc()) \ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "domains") + +# Domain graph. +webgraph.groupBy("crawl_date", remove_prefix_www(extract_domain("src")).alias("src_domain"), remove_prefix_www(extract_domain("dest")).alias("dest_domain"))\ + .count()\ + .filter((col("dest_domain").isNotNull()) & (col("dest_domain") !=""))\ + .filter((col("src_domain").isNotNull()) & (col("src_domain") !=""))\ + .filter(col("count") > 5)\ + .orderBy(desc("count"))\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "domain-graph") + +# Image graph. +WebArchive(sc, sqlContext, warcs).imagegraph()\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "image-graph") + +# Web graph. +webgraph.write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "web-graph") + +# Web pages. +webpages.write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "webpages") + +# Binary information. +WebArchive(sc, sqlContext, warcs).audio()\ + .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1")\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "audio") + +WebArchive(sc, sqlContext, warcs).images()\ + .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1")\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "images") + +WebArchive(sc, sqlContext, warcs).pdfs()\ + .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1")\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "pdfs") + +WebArchive(sc, sqlContext, warcs).presentation_program()\ + .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1")\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "presentation_program") + +WebArchive(sc, sqlContext, warcs).spreadsheets()\ + .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1")\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "spreadsheets") + +WebArchive(sc, sqlContext, warcs).video()\ + .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1")\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "videos") + +WebArchive(sc, sqlContext, 
warcs).word_processor()\ + .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1")\ + .write\ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save(results + "word_processor") +``` diff --git a/website/versioned_docs/version-1.0.0/aut-at-scale.md b/website/versioned_docs/version-1.0.0/aut-at-scale.md new file mode 100644 index 00000000..59f41947 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/aut-at-scale.md @@ -0,0 +1,124 @@ +--- +id: version-1.0.0-aut-at-scale +title: The Toolkit at Scale +original_id: aut-at-scale +--- + +As your collections grow, you may need to provide more resources, and adjust +Apache Spark configuration options. Apache Spark has great +[Configuration](https://spark.apache.org/docs/latest/configuration.html) and +[Tuning](https://spark.apache.org/docs/latest/tuning.html) guides that are +worth checking out. If you're not sure where to start with scaling, join us in +[Slack](slack.archivesunleashed.org) in the `#aut` channel, and we might be +able to provide some guidance. + +## A Note on Memory and Cores + +As your datasets grow, you may need to provide more memory to Apache Spark. +You'll know this if you get an error saying that you have run out of "Java Heap +Space." + +You can add a +[configuration](https://spark.apache.org/docs/latest/configuration.html) option +for adjusting available memory like so: + +```shell +spark-shell --driver-memory 4G --jars /path/to/aut-1.0.0-fatjar.jar +``` + +In the above case, you give Apache Spark 4GB of memory to execute the program. + +In some other cases, despite giving AUT sufficient memory, you may still +encounter Java Heap Space issues. In those cases, it is worth trying to lower +the number of worker threads. When running locally (i.e. on a single laptop, +desktop, or server), by default AUT runs a number of threads equivalent to the +number of cores in your machine. + +On a 16-core machine, you may want to drop to 12 cores if you are having memory +issues. This will increase stability but decrease performance a bit. + +You can do so like this (the example is using 12 threads on a 16-core machine): + +```shell +spark-shell --master local[12] --driver-memory 4G --jars /path/to/aut-1.0.0-fatjar.jar +``` + +If you continue to have errors, look at your output and logs. They will usually +point you in the right direction. For instance, you may also need to increase +the network timeout value. Once in a while, AUT might get stuck on an odd +record and take longer than normal to process it. The `--conf +spark.network.timeout=10000000` will ensure that AUT continues to work on +material, although it may take a while to process. This command then works: + +```shell +spark-shell --master local[12] --driver-memory 90G --conf spark.network.timeout=10000000 --jars /path/to/aut-1.0.0-fatjar.jar +``` + +## Reading Data from AWS S3 + +We also support loading data stored in [Amazon S3](https://aws.amazon.com/s3/). +This advanced functionality requires that you provide Spark shell with your AWS +Access Key and AWS Secret Key, which you will get when creating your AWS +credentials ([read more +here](https://aws.amazon.com/blogs/security/wheres-my-secret-access-key/)). + +This script, for example, will find the top ten domains from a set of WARCs +found in an s3 bucket. 
+ +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +sc.hadoopConfiguration.set("fs.s3a.access.key", "") +sc.hadoopConfiguration.set("fs.s3a.secret.key", "") + +RecordLoader.loadArchives("s3a:///*.gz", sc) + .keepValidPages() + .map(r => ExtractDomain(r.getUrl)) + .countItems() + .take(10) +``` + +### Reading Data from an S3-like Endpoint + +We also support loading data stored in an Amazon S3-like system such as [Ceph +RADOS](https://docs.ceph.com/docs/master/rados/). Similar to the above example, +you'll need an access key and secret, and additionally, you'll need to define +your endpoint. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +sc.hadoopConfiguration.set("fs.s3a.access.key", "") +sc.hadoopConfiguration.set("fs.s3a.secret.key", "") +sc.hadoopConfiguration.set("fs.s3a.endpoint", "") + +RecordLoader.loadArchives("s3a:///*.gz", sc) + .keepValidPages() + .map(r => ExtractDomain(r.getUrl)) + .countItems() + .take(10) +``` + +### Troubleshooting S3 + +If you run into this `AmazonHttpClient` timeout error: + +```shell +19/10/24 11:12:51 INFO AmazonHttpClient: Unable to execute HTTP request: Timeout waiting for connection from pool +org.apache.http.conn.ConnectionPoolTimeoutException: Timeout waiting for connection from pool + at org.apache.http.impl.conn.PoolingClientConnectionManager.leaseConnection(PoolingClientConnectionManager.java:231) + at org.apache.http.impl.conn.PoolingClientConnectionManager$1.getConnection(PoolingClientConnectionManager.java:200) + at sun.reflect.GeneratedMethodAccessor7.invoke(Unknown Source) + at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + at java.lang.reflect.Method.invoke(Method.java:498) + at com.amazonaws.http.conn.ClientConnectionRequestFactory$Handler.invoke(ClientConnectionRequestFactory.java:70) +``` + +You can add the following two configuration lines to your script: + +```scala +sc.hadoopConfiguration.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") +sc.hadoopConfiguration.setInt("fs.s3a.connection.maximum", 100) +``` diff --git a/website/versioned_docs/version-1.0.0/aut-spark-submit-app.md b/website/versioned_docs/version-1.0.0/aut-spark-submit-app.md new file mode 100644 index 00000000..ee5d7719 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/aut-spark-submit-app.md @@ -0,0 +1,415 @@ +--- +id: version-1.0.0-aut-spark-submit-app +title: The Toolkit with spark-submit +original_id: aut-spark-submit-app +--- + +The Toolkit offers a variety of extraction jobs with +[`spark-submit`](https://spark.apache.org/docs/latest/submitting-applications.html) +. These extraction jobs have a few configuration options. + +The extraction jobs have a basic outline of: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner PATH_TO_AUT_JAR --extractor EXTRACTOR --input INPUT DIRECTORY --output OUTPUT DIRECTORY +``` + +Additional flags include: + +* `--output-format FORMAT` (`csv` (default) or `parquet`. + `DomainGraphExtractor` has two additional output options + `graphml` or `gexf`.) +* `--split` (The extractor will put results for each input file in its own + directory. Each directory name will be the name of the ARC/WARC file parsed.) +* `--partition N` (The extractor will partition the DataFrame according to N + before writing results. The is useful to combine all the results to a single + file.) 
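+
+For example, these flags can be combined with any of the extractors below. The
+following sketch (reusing the placeholder paths from the examples that follow)
+runs the `DomainFrequencyExtractor` with `--split`, so that the results for
+each input ARC/WARC file are written to their own directory:
+
+```shell
+spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainFrequencyExtractor --input /path/to/warcs/* --output output/path --split
+```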
+ +## Audio Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `url`, `filename`, `extension`, +`mime_type_web_server`, `mime_type_tika`, `md5`, and `sha1`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor AudioInformationExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor AudioInformationExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor AudioInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor AudioInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Domain Frequency + +This extractor outputs a directory of files, or a single file with the +following columns: `domain`, and `count`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainFrequencyExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainFrequencyExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainFrequencyExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainFrequencyExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Domain Graph + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `src_domain`, `dest_domain`, and `count`. In +addition to the standard text output, an additional flag `--output-format` can +output [GraphML](https://en.wikipedia.org/wiki/GraphML), or +[GEXF](https://gephi.org/gexf/format/). 
+ +CSV output: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainGraphExtractor --input /path/to/warcs/* --output output/path --output-format csv +``` + +Parquet output: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainGraphExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +GEXF output: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainGraphExtractor --input /path/to/warcs/* --output output/path --output-format gexf +``` + +GraphML output: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor DomainGraphExtractor --input /path/to/warcs/* --output output/path --output-format graphml +``` + +## Image Graph + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `src`, `image_url`, and `alt_text`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageGraphExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +``` shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageGraphExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageGraphExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +``` shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageGraphExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Image Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `url`, `filename`, `extension`, +`mime_type_web_server`, `mime_type_tika`, `width`, `height`, `md5`, and `sha1`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageInformationExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageInformationExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor ImageInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## PDF Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `url`, `filename`, `extension`, +`mime_type_web_server`, `mime_type_tika`, `md5`, and `sha1`. 
+ +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PDFInformationExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PDFInformationExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PDFInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PDFInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Plain Text + +This extractor outputs a directory of files, or a single file with the +following columns: `content` (Boilerplate, HTTP headers, and HTML removed). + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PlainTextExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PlainTextExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PlainTextExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PlainTextExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Presentation Program Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `url`, `filename`, `extension`, +`mime_type_web_server`, `mime_type_tika`, `md5`, and `sha1`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PresentationProgramInformationExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PresentationProgramInformationExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PresentationProgramInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor PresentationProgramInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Spreadsheet Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `url`, `filename`, `extension`, +`mime_type_web_server`, `mime_type_tika`, `md5`, and `sha1`. 
+ +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor SpreadsheetInformationExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor SpreadsheetInformationExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor SpreadsheetInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor SpreadsheetInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Video Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `url`, `filename`, `extension`, +`mime_type_web_server`, `mime_type_tika`, `md5`, and `sha1`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor VideoInformationExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor VideoInformationExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor VideoInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor VideoInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Web Graph Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `src`, `dest`, and `anchor`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebGraphExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebGraphExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebGraphExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebGraphExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Web Pages + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `domain`, `url`, +`mime_type_web_server`, `mime_type_tika`, and `content` +(HTTP headers, and HTML removed). 
+ +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebPagesExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebPagesExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebPagesExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WebPagesExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` + +## Word Processor Information + +This extractor outputs a directory of files, or a single file with the +following columns: `crawl_date`, `url`, `filename`, `extension`, +`mime_type_web_server`, `mime_type_tika`, `md5`, and `sha1`. + +Directory of CSV files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WordProcessorInformationExtractor --input /path/to/warcs/* --output output/path +``` + +A single CSV file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WordProcessorInformationExtractor --input /path/to/warcs/* --output output/path --partition 1 +``` + +Directory of Parquet files: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WordProcessorInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet +``` + +A single Parquet file: + +```shell +spark-submit --class io.archivesunleashed.app.CommandLineAppRunner path/to/aut-fatjar.jar --extractor WordProcessorInformationExtractor --input /path/to/warcs/* --output output/path --output-format parquet --partition 1 +``` diff --git a/website/versioned_docs/version-1.0.0/binary-analysis.md b/website/versioned_docs/version-1.0.0/binary-analysis.md new file mode 100644 index 00000000..de37c468 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/binary-analysis.md @@ -0,0 +1,1123 @@ +--- +id: version-1.0.0-binary-analysis +title: Binary Analysis +original_id: binary-analysis +--- + +## Extract Audio Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).audio(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from audio files in a web collection: + +- audio url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server|mime_type_tika| md5| sha1| bytes| 
++--------------------+--------------------+---------+--------------------+--------------+--------------------+--------------------+--------------------+ +|http://geocities....| capasoligero.mp3| mp3| audio/mpeg| audio/mpeg|fffd1aa802392be0f...|88e254b4cab7848a9...|//MozAAAAAAAAAAAA...| +|http://www.geocit...| colorwnd.mid| mid| audio/midi| audio/midi|fff3f4e8a473f7c9a...|aea92a6f32dd1a1f4...|TVRoZAAAAAYAAQAGA...| +|http://geocities....|santana_rob_thoma...| mid| audio/midi| audio/midi|ffd4a24d4e4722d94...|28576c271898a1de5...|TVRoZAAAAAYAAQASA...| +|http://geocities....| music.mid| mid| audio/midi| audio/midi|ffcbe35e28b553481...|cf1ebdbe1a070d4f6...|TVRoZAAAAAYAAAABA...| +|http://geocities....| evrythng.mid| mid| audio/midi| audio/midi|ff751c86728ff09b5...|d22fc0911d3ceb17a...|TVRoZAAAAAYAAAABA...| +|http://geocities....| evrythn2.mid| mid| audio/midi| audio/midi|ff751c86728ff09b5...|d22fc0911d3ceb17a...|TVRoZAAAAAYAAAABA...| +|http://geocities....| picket.mid| mid| audio/midi| audio/midi|ff4d225a602630584...|ecef0a851cc028853...|TVRoZAAAAAYAAQAHA...| +|http://geocities....| simpsons.mid| mid| audio/midi| audio/midi|ff3bc375860979f2f...|9c1204dad686ddeea...|TVRoZAAAAAYAAQAPA...| +|http://www.geocit...| simpsons.mid| mid| audio/midi| audio/midi|ff3bc375860979f2f...|9c1204dad686ddeea...|TVRoZAAAAAYAAQAPA...| +|http://geocities....| mypretty.wav| wav| audio/x-wav|audio/vnd.wave|ff1a5015d3a380955...|113de5c1bb2f7ddb4...|UklGRvz8AABXQVZFZ...| +|http://geocities....| song37.mid| mid| audio/midi| audio/midi|fee0a67ff7c71e35c...|ccd4fdfa0483d1058...|TVRoZAAAAAYAAAABA...| +|http://geocities....| holdyourhand.mid| mid| audio/midi| audio/midi|fed14ecd7099e3fb9...|24fe5c097db5d506a...|TVRoZAAAAAYAAQANA...| +|http://geocities....| es_tu_sangre.mid| mid| audio/midi| audio/midi|fec196e8086d868f2...|eccb1551d1e7b236e...|TVRoZAAAAAYAAQASA...| +|http://www.geocit...| virgin.mid| mid| audio/midi| audio/midi|fec0ce795723b1287...|cc651312b1d57fe64...|TVRoZAAAAAYAAQAMA...| +|http://www.geocit...|tonibraxtonunbrea...| wav| audio/x-wav|audio/vnd.wave|feb7e31a8edb0a484...|9420bdeece0f23b78...|UklGRtQoCgBXQVZFZ...| +|http://geocities....| comeandsee.mid| mid| audio/midi| audio/midi|feb513cd7b6fab9cc...|51b4c2bb113cb43aa...|TVRoZAAAAAYAAAABA...| +|http://geocities....| song186t.mid| mid| audio/midi| audio/midi|fead61a5a439675a3...|c652eda8a4ec5d197...|TVRoZAAAAAYAAAABA...| +|http://geocities....| be_magnified.mid| mid| audio/midi| audio/midi|feac0e996e1555d84...|f51ec1e62a166fa82...|TVRoZAAAAAYAAQAPA...| +|http://geocities....| EVERYBOD.MID| mid| audio/midi| audio/midi|fea911b19f0cf709d...|58bcd1b3c0288cbe0...|TVRoZAAAAAYAAQAUA...| +|http://www.geocit...| ff9waltz.mid| mid| audio/midi| audio/midi|fe9eb1ea6d4b53a9f...|72e2467bfea6240b8...|TVRoZAAAAAYAAQAKA...| ++--------------------+--------------------+---------+--------------------+--------------+--------------------+--------------------+--------------------+ +``` + +If you wanted to work with all the audio files in a collection, you could +extract them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).audio(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.audio() +df.show() +``` + +Will extract all following 
information from audio files in a web collection: + +- audio url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server|mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------+--------------------+--------------------+--------------------+ +|http://www.geocit...| hc-tibet.wav| wav| audio/x-wav|audio/vnd.wave|416ad26133f63dc3e...|dfb764d759187d102...|UklGRg6eAABXQVZFZ...| +|http://geocities....|bookmarkthissite.wav| wav| audio/x-wav|audio/vnd.wave|7897ff71780a903ca...|cfb942aeb3bc881cd...|UklGRppkAABXQVZFZ...| +|http://geocities....| NeilYoung-Hey.mp3| mp3| audio/mpeg| audio/mpeg|40869eb3181e6035b...|19fa693521cd8125c...|//uQRAAAAcAAsNUEA...| +|http://geocities....| misty1.mp3| mp3| audio/mpeg| audio/mpeg|d8cb3ce54072a7d4b...|43b92e16932c13a43...|//uQBAAAAsJl22mBE...| +|http://geocities....| sale.mid| mid| audio/midi| audio/midi|5dfc0c3dd884e50c7...|071840b4822ae5e80...|TVRoZAAAAAYAAQALA...| +|http://geocities....| swaplink.mid| mid| audio/midi| audio/midi|f32117ce2bffa9902...|0346223861c87acc1...|TVRoZAAAAAYAAQALA...| +|http://geocities....| m5.mid| mid| audio/midi| audio/midi|7e5eedebafecd26c4...|393dfbc00c49fcdc9...|TVRoZAAAAAYAAQAJA...| +|http://geocities....| morder.mid| mid| audio/midi| audio/midi|6cec0785377f5bbaf...|a94f0a75c0c3b3cf5...|TVRoZAAAAAYAAQAMA...| +|http://geocities....| m2.mid| mid| audio/midi| audio/midi|58b0102f997e689a2...|51ad469ebc931e160...|TVRoZAAAAAYAAQALA...| +|http://geocities....| music.mid| mid| audio/midi| audio/midi|7917a5a9d6ddfb8dd...|009db9df73cdf5247...|TVRoZAAAAAYAAQALA...| +|http://www.geocit...| hcpopeye.wav| wav| audio/x-wav|audio/vnd.wave|04d7b45c70e0a496e...|9db0e61c16554af88...|UklGRrbAAABXQVZFZ...| +|http://geocities....| m7.mid| mid| audio/midi| audio/midi|3906ecaba32ba15a8...|e0d6e9f1c86b6204e...|TVRoZAAAAAYAAQAHA...| +|http://geocities....| words.mid| mid| audio/midi| audio/midi|30da01a4ed42ae469...|160b2e5aaa9b95641...|TVRoZAAAAAYAAQAIA...| +|http://geocities....| brock5.mp3| mp3| audio/mpeg| audio/mpeg|17f4e1c7a007983a5...|3bbdb27fafa4e8b12...|//MozAANkCLE/gjGA...| +|http://geocities....| brock1.mp3| mp3| audio/mpeg| audio/mpeg|67db65825afc326ed...|2ec4ac110cff19134...|//MozAAMyX7VmBjGl...| +|http://geocities....| funkytown.wav| wav| audio/x-wav|audio/vnd.wave|6f841bcffe4bbb61d...|ab1fdb143d5752cf1...|UklGRlLOCQBXQVZFZ...| +|http://geocities....| welcomemyworld.mid| mid| audio/midi| audio/midi|c546eac675e2dd974...|cb4f1fa32aa1e3205...|TVRoZAAAAAYAAQAMA...| +|http://www.geocit...| irisheye.mid| mid| audio/midi| audio/midi|d906f32953742fdef...|f3ca7449483b0ea65...|TVRoZAAAAAYAAQAFA...| +|http://geocities....| mission21.mid| mid| audio/midi| audio/midi|c507304afe6cddba1...|72a74c1914044746f...|TVRoZAAAAAYAAQAVA...| +|http://geocities....| tellit1.mid| mid| audio/midi| audio/midi|a604ae85251d55504...|95096668900a76dc8...|TVRoZAAAAAYAAQAQA...| ++--------------------+--------------------+---------+--------------------+--------------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` + +## Extract Image Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following 
script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).images(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"width", $"height", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from images in a web collection: + +- image url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- Width +- Height +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server|mime_type_tika|width|height| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +|http://www.archiv...|mediatype_movies.gif| gif| image/gif| image/gif| 21| 21|ff05f9b408519079c...|194800d702aab9b87...|R0lGODlhFQAVAKUpA...| +|http://www.archiv...| LOCLogoSmall.jpg| jpg| image/jpeg| image/jpeg| 275| 300|fbf1aec668101b960...|564c1a07152c12cea...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...| archive.small.jpg| jpg| image/jpeg| image/jpeg| 300| 225|f611b554b9a44757d...|e9bf7ef0ae3fc50f5...|/9j/4RpBRXhpZgAAT...| +|http://tsunami.ar...| tsunamiweb1_02.jpg| jpg| image/jpeg| image/jpeg| 384| 229|f02005e29ffb485ca...|9eeb9c3c67d7efc51...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...|alexa_websearch_l...| gif| image/gif| image/gif| 301| 47|eecc909992272ce0d...|ea18e226f3cf40005...|R0lGODlhLQEvAPcAA...| +|http://www.archiv...| lizardtech.gif| gif| image/gif| image/gif| 140| 37|e7166743861126e51...|cf26e9ffc27be133f...|R0lGODlhjAAlANUwA...| +|http://www.archiv...| half_star.png| png| image/png| image/png| 14| 12|e1e101f116d9f8251...|736abd06a978e2fd2...|iVBORw0KGgoAAAANS...| +|http://www.archiv...| hewlett.jpg| jpg| image/jpeg| image/jpeg| 300| 116|e1da27028b81db60e...|eb418c17901b1b313...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...|prelinger-header-...| jpg| image/jpeg| image/jpeg| 84| 72|d39cce8b2f3aaa783...|1c41a644123e8f861...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...| arrow.gif| gif| image/gif| image/gif| 13| 11|c7ee6d7c17045495e...|7013764e619066e60...|R0lGODlhDQALALMAA...| +|http://www.archiv...| folder.png| png| image/png| image/png| 20| 15|c1905fb5f16232525...|ff7b8c60e8397cb5d...|iVBORw0KGgoAAAANS...| +|http://www.archiv...| wayback-wtc.gif| gif| image/gif| image/gif| 35| 35|c15ec074d95fe7e1e...|f45425406600b136d...|R0lGODlhIwAjANUAA...| +|http://www.archiv...| clicktoplay.png| png| image/png| image/png| 320| 240|b148d9544a1a65ae4...|477105e3a93b60dd8...|iVBORw0KGgoAAAANS...| +|http://www.archiv...| orange_arrow.gif| gif| image/gif| image/gif| 8| 11|a820ac93e2a000c9d...|850b9daeef06bee6e...|R0lGODlhCAALAJECA...| +|http://www.archiv...| arc-it-tagline.gif| gif| image/gif| image/gif| 385| 30|9f70e6cc21ac55878...|4601e2f642d8e55ac...|R0lGODlhgQEeALMPA...| +|http://www.archiv...| guitar.jpg| jpg| image/jpeg| image/jpeg| 140| 171|9ed163df5065418db...|f6c9475009ae2416c...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...| blendbar.jpg| jpg| image/jpeg| image/jpeg| 1800| 89|9e41e4d6bdd53cd9d...|dc780bf80720c87c9...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...|alexalogo-archive...| gif| image/gif| image/gif| 304| 
36|9da73cf504be0eb70...|03e530ef04e4b68f7...|R0lGODlhMAEkAOYAA...| +|http://www.archiv...| lma.jpg| jpg| image/jpeg| image/jpeg| 215| 71|97ebd3441323f9b5d...|ff9485b26300721b2...|/9j/4AAQSkZJRgABA...| +|http://i.creative...| 88x31.png| png| image/png| image/png| 88| 31|9772d34b683f8af83...|689bef4ffb8918612...|iVBORw0KGgoAAAANS...| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ + +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [url: string, filename: string ... 7 more fields] +``` + +If you wanted to work with all the images in a collection, you could extract +them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).images(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.images() +df.show() +``` + +Will extract all following information from images in a web collection: + +- image url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- Width +- Height +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server|mime_type_tika|width|height| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +|http://farm3.stat...|4047878934_ef12ba...| jpg| image/jpeg| image/jpeg| 100| 75|e1a376f170b815f49...|2165fd2908950e9f6...|/9j/4AAQSkZJRgABA...| +|http://farm3.stat...|4047881126_fc6777...| jpg| image/jpeg| image/jpeg| 75| 100|371a2a5142c611405...|933f937c949826696...|/9j/4AAQSkZJRgABA...| +|http://farm3.stat...|4047879492_a72dd8...| jpg| image/jpeg| image/jpeg| 100| 75|8877679361cde970d...|31dbaaed2f7194c95...|/9j/4AAQSkZJRgABA...| +|http://farm3.stat...|4047877728_c6c118...| jpg| image/jpeg| image/jpeg| 75| 100|8f009a568d47e1888...|7b83e7d6c78ed65cf...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|96d9290d060547781...|2d3005bd6e09ca064...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|c69d65d4880445b31...|abe40cb96bfc79095...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|cb11c08d43e25ec3b...|2060857d6cf41b141...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|756b5a0a83a621eb7...|d4625efc80efb985e...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|0b60007c3e3d9d63f...|a154035590a01efb4...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|97fdea388e1245691...|e415a77a4369ecef8...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|05c2d43f687f40b60...|ed3f6ca2f3d7e9569...|/9j/4AAQSkZJRgABA...| +|http://www.canadi...| 
WebResource.axd| gif| image/gif| image/gif| 1| 1|325472601571f31e1...|2daeaa8b5f19f0bc2...|R0lGODlhAQABAIAAA...| +|http://www.davids...|footprint-carbon.jpg| jpg| image/jpeg| image/jpeg| 200| 200|51f57de92e76f3edc...|c970137cd3bfdbbba...|/9j/4AAQSkZJRgABA...| +|http://www.gca.ca...| 15.jpg| jpg| image/jpeg| image/jpeg| 300| 230|8b3c192b9a0cc82d6...|851377ed11c9cd153...|/9j/4AAQSkZJRgABA...| +|http://www.equalv...|loadingAnimation.gif| gif| image/gif| image/gif| 208| 13|c33734a1bf58bec32...|2bb50e01775289c24...|R0lGODlh0AANAMQAA...| +|http://www.davids...|Keep-greening-gre...| jpg| image/jpeg| image/jpeg| 166| 252|4763383a8be13c735...|a42b963e18dc1e7d4...|/9j/4AAQSkZJRgABA...| +|http://www.davids...|Keep-greening-don...| jpg| image/jpeg| image/jpeg| 146| 252|515bd44bea759e169...|75abeb65cc4f54c7d...|/9j/4AAQSkZJRgABA...| +|http://www.davids...|Keep-greening-eca...| jpg| image/jpeg| image/jpeg| 158| 252|345f71df9702e99a0...|b6637ac654f6e2073...|/9j/4AAQSkZJRgABA...| +|http://www.davids...|Keep-greening-tit...| jpg| image/jpeg| image/jpeg| 470| 45|385522fde90ac7e96...|b42151cf8c3ce14e0...|/9j/4AAQSkZJRgABA...| +|http://www.davids...| last_minute2.jpg| jpg| image/jpeg| image/jpeg| 265| 33|3defee897d4c553fc...|37c790bbc23c369d8...|/9j/4AAQSkZJRgABA...| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` + +## Extract Most Frequent Image URLs + +### Scala RDD + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .flatMap(r => ExtractImageLinks(r.getUrl, r.getContentString)) + .countItems() + .take(10) +``` + +Will extract the top ten URLs of images found within a collection, in an array +like so: + +```bash +links: Array[(String, Int)] = Array((http://www.archive.org/images/star.png,408), (http://www.archive.org/images/no_star.png,122), (http://www.archive.org/images/logo.jpg,118), (http://www.archive.org/images/main-header.jpg,84), (http://www.archive.org/images/rss.png,20), (http://www.archive.org/images/mail.gif,13), (http://www.archive.org/images/half_star.png,10), (http://www.archive.org/images/arrow.gif,7), (http://ia300142.us.archive.org/3/items/americana/am_libraries.gif?cnt=0,3), (http://ia310121.us.archive.org/2/items/GratefulDead/gratefuldead.gif?cnt=0,3), (http://www.archive.org/images/wayback.gif,2), (http://www.archive.org/images/wayback-election2000.gif,2), (http://www.archive.org/images/wayback-wt... +``` + +If you wanted to work with the images, you could download them from the +Internet Archive. + +Let's use the top-ranked example. [This +link](http://web.archive.org/web/*/http://archive.org/images/star.png), for +example, will show you the temporal distribution of the image. For a snapshot +from September 2007, this URL would work: + + + +To do analysis on all images, you could thus prepend +`http://web.archive.org/web/20070913051458/` to each URL and `wget` them en +masse. + +For more information on `wget`, please consult [this lesson available on the +Programming Historian +website](http://programminghistorian.org/lessons/automated-downloading-with-wget). 
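+
+As a minimal sketch, assuming the extracted image URLs have been saved to a
+plain-text file (here called `image-urls.txt`, one URL per line; the filename
+is only illustrative), you could prepend the snapshot prefix with `sed` and
+download the results with `wget`:
+
+```shell
+# Prepend the Wayback Machine snapshot prefix to each extracted image URL,
+# then download the rewritten URLs in bulk. The one-second wait is simply to
+# be polite to the Wayback Machine servers.
+sed 's|^|http://web.archive.org/web/20070913051458/|' image-urls.txt > wayback-image-urls.txt
+wget --input-file=wayback-image-urls.txt --wait=1
+```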
+ +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).imagegraph(); + +df.groupBy($"image_url") + .count() + .orderBy($"count".desc) + .show(10) +``` + +Will extract the top ten URLs of images found within a collection, in a +DataFrame like so: + +```dataframe ++--------------------+-----+ +| image_url|count| ++--------------------+-----+ +|http://www.archiv...| 408| +|http://www.archiv...| 122| +|http://www.archiv...| 83| +|http://www.archiv...| 49| +|http://www.archiv...| 20| +|http://www.archiv...| 13| +|http://www.archiv...| 10| +|http://www.archiv...| 7| +|http://ia300142.u...| 3| +|http://ia310121.u...| 3| ++--------------------+-----+ +only showing top 10 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [src: string, image_url: string] +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.imagegraph() + +df.groupBy("image_url") + .count() + .orderBy("count", ascending=False) + .show(10) +``` + +Will extract the top ten URLs of images found within a collection, in a +DataFrame like so: + +```dataframe ++--------------------+-----+ +| image_url|count| ++--------------------+-----+ +|http://www.archiv...| 408| +|http://www.archiv...| 122| +|http://www.archiv...| 83| +|http://www.archiv...| 49| +|http://www.archiv...| 20| +|http://www.archiv...| 13| +|http://www.archiv...| 10| +|http://www.archiv...| 7| +|http://ia300142.u...| 3| +|http://ia310121.u...| 3| ++--------------------+-----+ +``` + +## Extract Most Frequent Images MD5 Hash + +Some images may be the same, but have different URLs. This UDF finds the +popular images by calculating the MD5 hash of each and presents the most +frequent images based on that metric. This script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.app._ +import io.archivesunleashed.matchbox._ + +val r = RecordLoader.loadArchives("/path/to/warcs",sc).persist() +ExtractPopularImages(r, 500, sc).saveAsTextFile("500-Popular-Images") +``` + +Will save the 500 most popular URLs to an output directory. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.app._ + +val df = RecordLoader.loadArchives("/path/to/warcs",sc).images() + +ExtractPopularImagesDF(df,10,30,30).show() +``` + +### Python DF + +```python +from aut import * + +images = WebArchive(sc, sqlContext, "/path/to/warcs").images() + +popular_images = ExtractPopularImages(images, 20, 10, 10) + +popular_images.show() +``` + +## Find Images Shared Between Domains + +How to find images shared between domains that appear more than once _in more +than one domain_. 
+ +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val images = RecordLoader.loadArchives("/path/to/warcs", sc) + .images() + .select(removePrefixWWW(extractDomain($"url")).as("domain"), $"url", $"md5") + +val links = images.groupBy("md5").count().where(countDistinct("domain")>=2) + +val result = images.join(links, "md5") + .groupBy("domain", "md5") + .agg(first("url").as("image_url")) + .orderBy(asc("md5")) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("/path/to/output") +``` + +### PythonDF + +```python +from aut import * +from pyspark.sql.functions import asc, countDistinct, first + +images = WebArchive(sc, sqlContext, "/path/to/warcs") \ + .images() \ + .select(remove_prefix_www(extract_domain("url")).alias("domain"), "url", "md5") + +links = images.groupBy("md5") \ + .count() \ + .where(countDistinct("domain")>=2) + +result = images.join(links, "md5") \ + .groupBy("domain", "md5") \ + .agg(first("url").alias("image_url")) \ + .orderBy(asc("md5")) \ + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("/path/to/output") +``` + +## Extract PDF Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).pdfs(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from PDF files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +|http://geocities....|adicec_sopar_2009...| pdf|application/octet...|application/pdf|ffc2ccc373b8ffd39...|3831b0f228af1701e...|JVBERi0xLjMNJeLjz...| +|http://www.geocit...| IntSt_2301.pdf| pdf|application/octet...|application/pdf|ffa638c418dac2e19...|84dbaccde1ace4b24...|JVBERi0xLjQNJeLjz...| +|http://www.geocit...| lotg.pdf| pdf|application/octet...|application/pdf|ff871ef64d3739b03...|95a777f0b4c7703c6...|JVBERi0xLjINJeLjz...| +|http://geocities....| ebad.pdf| pdf|application/octet...|application/pdf|fe8feece5d08dc2ce...|0c01cc31b40a286da...|JVBERi0xLjMNJeLjz...| +|http://geocities....| regulament.pdf| pdf|application/octet...|application/pdf|fe8018451633fd76c...|9c7cc720e29cad6e8...|JVBERi0xLjMKJcfsj...| +|http://geocities....|dmatias_letterfor...| pdf|application/octet...|application/pdf|fe7dbc89e664ba790...|dbe965e7a288cce59...|JVBERi0xLjYNJeLjz...| +|http://geocities....|overcome_the_fear...| pdf|application/octet...|application/pdf|fe3ec0805564cd3fc...|d0d30ba4f7f40434d...|JVBERi0xLjMKJcfsj...| +|http://geocities....| CIM_marks.pdf| pdf|application/octet...|application/pdf|fe1622ac08b47cf60...|b97b57b3c77887324...|JVBERi0xLjMKJcTl8...| 
+|http://geocities....| board.PDF| pdf|application/octet...|application/pdf|fd969b57508d3b135...|fc121c07fefbb722b...|JVBERi0xLjIgDQol4...| +|http://geocities....| cowell.pdf| pdf|application/octet...|application/pdf|fbacc01cbe01aa0b4...|f9e9eba1b281ad800...|JVBERi0xLjMKJeLjz...| +|http://geocities....| gdbrasil.pdf| pdf|application/octet...|application/pdf|fadc9b9b2408a1112...|247671acb971ddc21...|JVBERi0xLjQNJeLjz...| +|http://www.geocit...| EBOrder.pdf| pdf|application/octet...|application/pdf|fa4a83d96441324b3...|5f6870832d035a5a9...|JVBERi0xLjINJeLjz...| +|http://geocities....| butlleta.pdf| pdf|application/octet...|application/pdf|fa13dfbf62acb5083...|9a8ec0c0e8a190f46...|JVBERi0xLjQNJeLjz...| +|http://www.geocit...|ALABAMAUNDERWOODM...| pdf|application/octet...|application/pdf|f9791c7df35d9092a...|3e4c0ca1031152d24...|JVBERi0xLjIgDQol4...| +|http://geocities....| chimera.pdf| pdf|application/octet...|application/pdf|f92a40f58cffcdc8e...|ba038d0146b0171f2...|JVBERi0xLjMKJcfsj...| +|http://geocities....| icarus.pdf| pdf|application/octet...|application/pdf|f8da963b714e684b3...|4444f5a12c9dbb1df...|JVBERi0xLjMKJcfsj...| +|http://geocities....|2008_ClubFinances...| pdf|application/octet...|application/pdf|f878c0373edbc89f9...|700393c7b6aaf93df...|JVBERi0xLjQNJeLjz...| +|http://geocities....| WILLOWSTScene5.pdf| pdf|application/octet...|application/pdf|f84fc521602fdf163...|5f03b19201536cbc8...|JVBERi0xLjQKJcfsj...| +|http://geocities....| isrherb2.pdf| pdf|application/octet...|application/pdf|f83390642e9fe6313...|60befa2b5913bb19d...|JVBERi0xLjMNJeLjz...| +|http://geocities....| joel.pdf| pdf|application/octet...|application/pdf|f828e4b447c085fdd...|2e3308c1a52f2f75a...|JVBERi0xLjQKJcOkw...| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [url: string, filename: string ... 
6 more fields] +``` + +If you wanted to work with all the PDF files in a collection, you could extract +them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).pdfs(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.pdfs() +df.show() +``` + +Will extract all following information from PDF files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +|http://geocities....|20080304ordinance...| pdf|application/octet...|application/pdf|ebbf9bf99b363493b...|f0b9a6788cbc1f8ab...|JVBERi0xLjMNJeLjz...| +|http://geocities....|FACTSHEET2008ADOP...| pdf|application/octet...|application/pdf|4fe261c2210189a52...|a91180b9170ff757f...|JVBERi0xLjQNJeLjz...| +|http://geocities....| Menu.pdf| pdf|application/octet...|application/pdf|75e4d587589a1d85d...|d18724100d4616a45...|JVBERi0xLjMNJeLjz...| +|http://geocities....|DSTC2009ContestFl...| pdf|application/octet...|application/pdf|c80f38f96480aab0c...|369c4415ed9c2476d...|JVBERi0xLjQNJeLjz...| +|http://geocities....| ebad.pdf| pdf|application/octet...|application/pdf|fe8feece5d08dc2ce...|0c01cc31b40a286da...|JVBERi0xLjMNJeLjz...| +|http://geocities....|FACTSHEET2008APPR...| pdf|application/octet...|application/pdf|8747971e78acb768b...|770f97a95c7e2ee16...|JVBERi0xLjQNJeLjz...| +|http://geocities....|FACTSHEET2008APPE...| pdf|application/octet...|application/pdf|32f57bbe5b28f4ab1...|d4f63b8d29f4c5dc5...|JVBERi0xLjQNJeLjz...| +|http://geocities....|FACTSHEET2008ADOP...| pdf|application/octet...|application/pdf|e9189eea563fde074...|f14b1846499dd4bd0...|JVBERi0xLjQNJeLjz...| +|http://geocities....| sharar.pdf| pdf|application/octet...|application/pdf|771f5bd1b72b8e324...|9cef1f6af9e5c127e...|JVBERi0xLjMNJeLjz...| +|http://geocities....|FACTSHEET2008UTIL...| pdf|application/octet...|application/pdf|7f45c93d16823e852...|b3a2d3b95efd77bd6...|JVBERi0xLjQNJeLjz...| +|http://geocities....|BakweriMarginalis...| pdf|application/octet...|application/pdf|d25863303ba46a872...|bbd6c9bce4c523f0f...|JVBERi0xLjINJeLjz...| +|http://geocities....|McCallaFoodSecuri...| pdf|application/octet...|application/pdf|1291b633f49f7e51d...|622144ed0fd56bae3...|JVBERi0xLjMNJeLjz...| +|http://geocities....|PovertyAndIncome.pdf| pdf|application/octet...|application/pdf|278e1f281905d419d...|9bc00a54147a4b350...|JVBERi0xLjIgDSXi4...| +|http://geocities....| behold.pdf| pdf|application/octet...|application/pdf|9fc1e4e1e0f567477...|63d324984d34eb168...|JVBERi0xLjMKJcfsj...| +|http://geocities....|overcome_the_fear...| pdf|application/octet...|application/pdf|fe3ec0805564cd3fc...|d0d30ba4f7f40434d...|JVBERi0xLjMKJcfsj...| +|http://geocities....| raven.pdf| 
pdf|application/octet...|application/pdf|acabc7f7dba954f99...|1ddf3e53813a805a1...|JVBERi0xLjMKJcfsj...| +|http://geocities....| sunset.pdf| pdf|application/octet...|application/pdf|1dc037712d47b11d9...|f502ca5cc2de2483b...|JVBERi0xLjMKJcfsj...| +|http://geocities....|night_lasts_less_...| pdf|application/octet...|application/pdf|1cda3dfab3bedaf04...|ad0f6e6fd53e4eb5f...|JVBERi0xLjMKJcfsj...| +|http://geocities....| angel_dust.pdf| pdf|application/octet...|application/pdf|92d14676e34dfcb7e...|1588b870928d56667...|JVBERi0xLjMKJcfsj...| +|http://geocities....| vampire.pdf| pdf|application/octet...|application/pdf|f1730689d52b9524e...|bf377a4e2580b8a29...|JVBERi0xLjMKJcfsj...| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` + +## Extract Presentation Program Files Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).presentationProgramFiles(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from presentation program files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +|http://geocities....| index.pps| pps|application/mspow...|application/vnd.m...|fbaed5a1df163270a...|afa4c82593ea5bfd6...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|MathForEveryoneCa...| ppt|application/mspow...|application/vnd.m...|f5fde5813a5aef2f3...|e791212ac91243f39...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|AD1-GE-Quiz4-Samp...| ppt|application/mspow...|application/vnd.m...|f5824d64bb74b1377...|aaea2a38d11682753...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| Agrarianism1.ppt| ppt|application/mspow...|application/vnd.m...|f581932d9e4c57dc0...|3fbce2d175be293a8...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| lego.pps| pps|application/mspow...|application/vnd.m...|f0da5c58e7abbf102...|78bc45da68c6784be...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| HPIB.ppt| ppt|application/mspow...|application/vnd.m...|ef09c31bd8079d40b...|875a96d8b8dd3bf18...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|learningdisabilit...| ppt|application/mspow...|application/vnd.m...|e6bb4f98761839a3a...|5a4dcc8bab2ee15f3...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|assessmentsummer.ppt| ppt|application/mspow...|application/vnd.m...|e116a443b9031ec01...|141563f2f32687587...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|CommonlyConfusedW...| ppt|application/mspow...|application/vnd.m...|dde43870e0da8ebf6...|7a94bf766d931a046...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|AD1-Unit5-Achieve...| 
ppt|application/mspow...|application/vnd.m...|d4530e506c2e41f8f...|6c89c0e3d28ecceed...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| Schwind.ppt| ppt|application/mspow...|application/vnd.m...|cfdd4bb6e7b04f24a...|9c26a8ac091f88a35...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| cpphtp4_PPT_07.ppt| ppt|application/mspow...|application/vnd.m...|cd98e6e18c3b0ada0...|b3651507f61bafa4d...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| mylife.ppt| ppt|application/mspow...|application/vnd.m...|cb146894f8a544ace...|0129cfdfd2f196346...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| refinterview.ppt| ppt|application/mspow...|application/vnd.m...|ca6fd4ec5fcb8237d...|8312ca4c0dbeb6008...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|MathForEveryoneAl...| ppt|application/mspow...|application/vnd.m...|c887f45fa58f273b0...|b253b732f8502f357...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| ch2-DataTypes.ppt| ppt|application/mspow...|application/vnd.m...|c74caee72b5ee6684...|f3bf878c775e2f72a...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|geographyofnortha...| ppt|application/mspow...|application/vnd.m...|c35b93ac59f2eb5af...|b5de05a856838328c...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| people1st.ppt| ppt|application/mspow...|application/vnd.m...|bf19cdc1ff3ad82fd...|99f14fe81d8a9587f...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| AD1-Reading.ppt| ppt|application/mspow...|application/vnd.m...|be020b4564f972218...|0761a2fd5c176ce1c...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| majalah.ppt| ppt|application/mspow...|application/vnd.m...|b6f219693ef1df49f...|1039013624cf8de35...|0M8R4KGxGuEAAAAAA...| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [url: string, filename: string ... 
6 more fields] +``` + +If you wanted to work with all the presentation program files in a collection, +you could extract them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).presentationProgramFiles(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.presentation_program() +df.show() +``` + +Will extract all following information from presentation program files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +|http://geocities....| wincvs.ppt| ppt|application/mspow...|application/vnd.m...|52ac23b58493234b2...|a2206af9847cceb06...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| index.pps| pps|application/mspow...|application/vnd.m...|fbaed5a1df163270a...|afa4c82593ea5bfd6...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|MathForEveryoneCa...| ppt|application/mspow...|application/vnd.m...|f5fde5813a5aef2f3...|e791212ac91243f39...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|MathForEveryone7t...| ppt|application/mspow...|application/vnd.m...|9893643e1cb87af0c...|2fa8301893ad21b2b...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|MathForEveryoneGe...| ppt|application/mspow...|application/vnd.m...|2a914a95a61b227dd...|5d783c1beaffc0b57...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|MathForEveryoneAl...| ppt|application/mspow...|application/vnd.m...|c887f45fa58f273b0...|b253b732f8502f357...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|MathForEveryone7t...| ppt|application/mspow...|application/vnd.m...|034906471a0c0b997...|16142a0aa69b2fb1f...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| tiago.ppt| ppt|application/mspow...|application/vnd.m...|6871786192c187783...|e5a91a65ef9a4bade...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| energypp.ppt| ppt|application/mspow...|application/vnd.m...|94f9384ec57d8849c...|e943c5cf509f8f816...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| lego.pps| pps|application/mspow...|application/vnd.m...|f0da5c58e7abbf102...|78bc45da68c6784be...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| celtiberos.pps| pps|application/mspow...|application/vnd.m...|af897525acd31d359...|9c018a80253c38a57...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| porque.pps| pps|application/mspow...|application/vnd.m...|9c2cba37c64fd0ac8...|6f11733ddec0abc2d...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|SoftHandoffbyPara...| pps|application/mspow...|application/vnd.m...|0c5ef732ea466574f...|dc7dfe545b401aeab...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|A_Land_Remembered...| ppt|application/mspow...|application/vnd.m...|5b7273d03f8490490...|2d8721e7876cb6697...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| DANCE.ppt| 
ppt|application/mspow...|application/vnd.m...|5aa3308433666a30a...|4a23bd20768501dac...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| unit.ppt| ppt|application/mspow...|application/vnd.m...|6736886864069ee66...|e92031e6e0293cb73...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| majalah.ppt| ppt|application/mspow...|application/vnd.m...|b6f219693ef1df49f...|1039013624cf8de35...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|esos_si_son_probl...| pps|application/mspow...|application/vnd.m...|932221045b6154d7e...|b23a0238c852d28bb...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| refinterview.ppt| ppt|application/mspow...|application/vnd.m...|ca6fd4ec5fcb8237d...|8312ca4c0dbeb6008...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| Schwind.ppt| ppt|application/mspow...|application/vnd.m...|cfdd4bb6e7b04f24a...|9c26a8ac091f88a35...|0M8R4KGxGuEAAAAAA...| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` + +## Extract Spreadsheet Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).spreadsheets(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from spreadsheet files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +|http://geocities....| statuscarib.xls| xls|application/vnd.m...|application/vnd.m...|f9fd18b158df52ff2...|0d606f25ac3c9abc4...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| timesheet.xls| xls|application/vnd.m...|application/vnd.m...|f9549db15de69bc21...|e9c239d812705842f...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|statusccusspring0...| xls|application/vnd.m...|application/vnd.m...|ef99704e5a734f386...|f265fc5c581ad1762...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| Laboratorio_05.xls| xls|application/vnd.m...|application/vnd.m...|eb0e39898ba513234...|976f69da07122d285...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|110_Laboratorio_6...| xls|application/vnd.m...|application/vnd.m...|e5b7fee6d4c45e171...|befd9670be70a4fdb...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| Pakuan.xls| xls|application/vnd.m...|application/vnd.m...|e386f85a7bd74b1ab...|5b2b142de2c57ec68...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|Spring08_statusre...| xls|application/vnd.m...|application/vnd.m...|df2d6792fb55c4e26...|6f4d2aef711aff4e1...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CTtimetable2.xls| xls|application/vnd.m...|application/vnd.m...|dc987d3e996677ce9...|40bb63a4c0038a6ae...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|MalibuTrailChalle...| 
xls|application/vnd.m...|application/vnd.m...|dbba76ead82576178...|ffbe099441053b47b...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CTtimetable.xls| xls|application/vnd.m...|application/vnd.m...|d9ee9117e70df43b5...|596c4c6d5cdc7ddb5...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| 1071_Parcial_2.xls| xls|application/vnd.m...|application/vnd.m...|d90dc210138676a2c...|6e3ed07f50393815c...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|excelsubtractione...| xls|application/vnd.m...|application/vnd.m...|d6c8314e52f22e4aa...|1b1ebce0f85628921...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|Fall2008_statusre...| xls|application/vnd.m...|application/vnd.m...|cd9974430477b75ce...|0e756bbc38608cb51...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| report01.xls| xls|application/vnd.m...|application/vnd.m...|cd947fe4099df4fe3...|0f11d17d38a72977b...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|TrackRecords20010...| xls|application/vnd.m...|application/vnd.m...|c8aa0122443efa0e5...|fa9cdb4a329f926bf...|0M8R4KGxGuEAAAAAA...| +|http://br.geociti...| mycoinsforswap.xls| xls|application/vnd.m...|application/vnd.m...|c665c83bc2b54292f...|18f1f3a4559d5c40a...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| AAtimetable.xls| xls|application/vnd.m...|application/vnd.m...|c66201762bf5e473e...|4e9bac4f217b0605d...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| carwashroster.xls| xls|application/vnd.m...|application/vnd.m...|c495d1b7dc954b975...|062167485baf9aa5d...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| RSL_MDP.xls| xls|application/vnd.m...|application/vnd.m...|bf6479bacbb758b52...|4d7ea33849447853d...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| status_report4.xls| xls|application/vnd.m...|application/vnd.m...|bc4d18e022522d185...|fc7b9fc64116c9ad1...|0M8R4KGxGuEAAAAAA...| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [url: string, filename: string ... 
6 more fields] +``` + +If you wanted to work with all the spreadsheet files in a collection, you could extract +them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).spreadsheets(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.spreadsheets() +df.show() +``` + +Will extract all following information from spreadsheet files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +|http://geocities....| tkadrosu.xls| xls|application/vnd.m...|application/vnd.m...|8033532f88da42ad6...|a52b24bc760c5265b...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| cal_counter.xls| xls|application/vnd.m...|application/vnd.m...|56ad6c2f84fdd4a88...|ad0db35f2d7ff2cca...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CTtimetable2.xls| xls|application/vnd.m...|application/vnd.m...|dc987d3e996677ce9...|40bb63a4c0038a6ae...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| AAtimetable.xls| xls|application/vnd.m...|application/vnd.m...|c66201762bf5e473e...|4e9bac4f217b0605d...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CTtimetable.xls| xls|application/vnd.m...|application/vnd.m...|d9ee9117e70df43b5...|596c4c6d5cdc7ddb5...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CTtimetable2.xls| xls|application/vnd.m...|application/vnd.m...|a4ed4330d5c18f1b2...|d8ce479596d49679d...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| teams.xls| xls|application/vnd.m...|application/vnd.m...|334fa42776cef7f81...|aa57fda7fb634c931...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| collection.xls| xls|application/vnd.m...|application/vnd.m...|30d7a67de8150f712...|841ba91f009d48b7a...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|music-collection.xls| xls|application/vnd.m...|application/vnd.m...|4def75fa96bae579d...|090a95923c9599454...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| 020103.xls| xls|application/vnd.m...|application/vnd.m...|48651a7592ca1b0f0...|1e2438c8247d33870...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| 011803.xls| xls|application/vnd.m...|application/vnd.m...|0aab8ed40f91c1c76...|8e02e408fe1ce40b9...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| RSL_TorOton.xls| xls|application/vnd.m...|application/vnd.m...|1d9c13c6407a2b696...|007010ecf5b208453...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| members.xls| xls|application/vnd.m...|application/vnd.m...|b045a6b118981c6eb...|3ae096d6602b7cb36...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| round309.xls| xls|application/vnd.m...|application/vnd.m...|50bed4b3e9facb278...|f26e0c38082141598...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| result109.xls| 
xls|application/vnd.m...|application/vnd.m...|2235d094897f10c3b...|6ed0b65fd43502a2b...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|TrackRecords20010...| xls|application/vnd.m...|application/vnd.m...|c8aa0122443efa0e5...|fa9cdb4a329f926bf...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| Digox.xls| xls|application/vnd.m...|application/vnd.m...|182d08821797269c7...|80e7ce8ecc1ecf389...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| RSL_SemBA.xls| xls|application/vnd.m...|application/vnd.m...|59613700fbf08b795...|44eac99a514141520...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| RSL_MDP.xls| xls|application/vnd.m...|application/vnd.m...|bf6479bacbb758b52...|4d7ea33849447853d...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| RSL_ARG99.xls| xls|application/vnd.m...|application/vnd.m...|a2f2fd063dd5689a7...|61568e0f4139ec568...|0M8R4KGxGuEAAAAAA...| ++--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` + +## Extract Video Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).videos(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from videos in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +|http://geocities....| videohead.avi| avi| video/x-msvideo|video/x-msvideo|fa9852748ba7b4829...|0be56f200f8e1cb83...|UklGRjoMIQBBVkkgT...| +|http://www.geocit...| HandWrap2.avi| avi| video/x-msvideo|video/x-msvideo|f680cb463e7cb291e...|1d2ea1df3f5af2599...|UklGRrBrAgBBVkkgT...| +|http://geocities....| 1kungfu.avi| avi| video/x-msvideo|video/x-msvideo|f4429277ed4b48efb...|5c542e8990efd484b...|UklGRkoSFwBBVkkgT...| +|http://geocities....| Vol_III_sample.mpg| mpg| video/mpeg| video/mpeg|f2bc34f7294edc376...|a939dc619c123f81b...|AAABuiEAAdLxgA7xA...| +|http://geocities....| wherego.avi| avi| video/x-msvideo|video/x-msvideo|f23976ddeb6f08810...|714a9a548f9b2a156...|UklGRkq4HgBBVkkgT...| +|http://geocities....| couch100k.wmv| asf| video/x-ms-wmv| video/x-ms-asf|ee316d5871acb7859...|0593ebb8e450a6c3e...|MCaydY5mzxGm2QCqA...| +|http://geocities....| Mitwa_Lagaan.mp3| qt| audio/mpeg|video/quicktime|ebc5db8d30edd0135...|d3ebdd6da2c732481...|AAAE6W1vb3YAAABsb...| +|http://www.geocit...| tydunking.mpg| mpg| video/mpeg| video/mpeg|eaa0d14dc05bdab98...|05d4ff2301d2d3818...|AAABuiEAAQALgBcdA...| +|http://geocities....| bigjleroy.avi| avi| video/x-msvideo|video/x-msvideo|e93538f0d76b86cca...|ebeb89fc2fa8f7cd6...|UklGRrjUCgBBVkkgT...| +|http://geocities....| NollieBs180.mov| mov| video/quicktime|video/quicktime|e7b97c287329340d5...|138fb8b0dea4c8e16...|AAAGwm1vb3YAAABsb...| 
+|http://www.geocit...| shirt.avi| avi| video/x-msvideo|video/x-msvideo|e36119d3c78225cbf...|11af72475ca754639...|UklGRvhdHQBBVkkgT...| +|http://geocities....| atdawn.wma| asf| audio/x-ms-wma| video/x-ms-asf|e1a85a79ea3ba5d96...|1be05aecdff99298c...|MCaydY5mzxGm2QCqA...| +|http://www.geocit...|non_will_go_to_wa...| mov| video/quicktime|video/quicktime|de6cc975363c4076b...|0c00d0be9c89f9e97...|AAs4JW1kYXQAAA70A...| +|http://geocities....| Movies_20.mpeg| mpeg| video/mpeg| video/mpeg|dd9d2af0c1318b5ff...|9d06f09744fe93408...|AAABuiEAV+PlgAU7A...| +|http://www.geocit...| artilery.mpg| mpg| video/mpeg| video/mpeg|dcecbdfe46448bffb...|0b292aab1078d9bfa...|AAABswsAkBP//+CkA...| +|http://geocities....| tancfigurakbbpl.wmv| wmv| video/x-ms-wmv| video/x-ms-wmv|dca4991392572dbc0...|cb349bdc35484d976...|MCaydY5mzxGm2QCqA...| +|http://www.geocit...| Trevi2.mov| mov| video/quicktime|video/quicktime|dc882205f5cae38f5...|c9dd804e1ee140221...|AAAEvG1vb3YAAAS0Y...| +|http://www.geocit...|skillful_driving_...| mpg| video/mpeg| video/mpeg|db8a767b00884e426...|f5a70cf5f091b530f...|AAABuiEAAQABgAORA...| +|http://geocities....| jeremy100k.wmv| asf| video/x-ms-wmv| video/x-ms-asf|dafba744438ae0110...|d3a217ce25507ae90...|MCaydY5mzxGm2QCqA...| +|http://www.geocit...| mbrl2.mpg| mpg| video/mpeg| video/mpeg|d8eb5a12f0da99ca0...|8686002a444cc9dce...|AAABswsAkBP//+CkA...| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [url: string, filename: string ... 6 more fields] +``` + +If you wanted to work with all the video files in a collection, you could extract +them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).videos(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.video() +df.show() +``` + +Will extract all following information from videos in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +|http://www.geocit...|Sea_Dawgs_2008_Ha...| wmv| video/x-ms-wmv| video/x-ms-wmv|7b35e4cf60a3cfa67...|b35ad7242e8135326...|MCaydY5mzxGm2QCqA...| +|http://www.geocit...| Excedrine.wmv| wmv| video/x-ms-wmv| video/x-ms-wmv|0aaf1d81ab6f2b354...|0b52af5f5facfd30f...|MCaydY5mzxGm2QCqA...| +|http://geocities....| homework.avi| avi| video/x-msvideo|video/x-msvideo|4e06cbd11764cd2ac...|770a8849375965b20...|UklGRsrLAgBBVkkgT...| +|http://geocities....| macarenababy.avi| avi| video/x-msvideo|video/x-msvideo|600084bbd732c0fda...|f99b2e31374d4ea18...|UklGRrC0AwBBVkkgT...| 
+|http://geocities....|orlando_viggokiss...| wmv| video/x-ms-asf| video/x-ms-wmv|79d093eb6184dba74...|395eaf6dcb29a66d2...|MCaydY5mzxGm2QCqA...| +|http://www.geocit...|skillful_driving_...| mpg| video/mpeg| video/mpeg|db8a767b00884e426...|f5a70cf5f091b530f...|AAABuiEAAQABgAORA...| +|http://www.geocit...|gray_havens_2.35.MPG| mpg| video/mpeg| video/mpeg|af71353d69af0b42f...|f3625b897339b0f23...|AAABuiEAAQABgAORA...| +|http://geocities....| movie2.mpeg| mpeg| video/mpeg| video/mpeg|3f6c7c48d2a990cf2...|760e6752bfd9e8a84...|AAABsxQA8MMCcSClE...| +|http://www.geocit...| Sequence1.mov| mov| video/quicktime|video/quicktime|931fc4dee8aa260f9...|5a5cf58e2a50cf942...|AAELrG1vb3YAAABsb...| +|http://www.geocit...| santa.mov| mov| video/quicktime|video/quicktime|8b9b98d0c567c4381...|49f49dd23c3bad61b...|AAAAIGZ0eXBxdCAgI...| +|http://geocities....| 0602-2.avi| avi| video/x-msvideo|video/x-msvideo|92d04dbe7f1bdc109...|65ed7327aece11bac...|UklGRkauOABBVkkgT...| +|http://geocities....| movie.mpg| mpg| video/mpeg| video/mpeg|a0e86539e5eb9bd35...|82eb4680a9f65ed1b...|AAABuiEAAdLxgASfA...| +|http://geocities....| misshawaii.mpeg| mpeg| video/mpeg| video/mpeg|45cbfc4d03547861b...|44c93f871ea602112...|AAABuiEAAQABgAORA...| +|http://geocities....| Explosions.wmv| wmv| video/x-ms-wmv| video/x-ms-wmv|22cb24bffbd7eabf9...|a44d261ef5d7e7993...|MCaydY5mzxGm2QCqA...| +|http://geocities....| couch100k.wmv| asf| video/x-ms-wmv| video/x-ms-asf|ee316d5871acb7859...|0593ebb8e450a6c3e...|MCaydY5mzxGm2QCqA...| +|http://geocities....| jeremy100k.wmv| asf| video/x-ms-wmv| video/x-ms-asf|dafba744438ae0110...|d3a217ce25507ae90...|MCaydY5mzxGm2QCqA...| +|http://geocities....| jedi_wade.mov| mov| video/quicktime|video/quicktime|674688fd09bf18d29...|cd21c3a5b9e2f18b6...|AAAFB21vb3YAAAT/Y...| +|http://geocities....|ylagallinanonosga...| asf| audio/x-ms-wma| video/x-ms-asf|9aac473134d7f2e7a...|3af7fbab238772f48...|MCaydY5mzxGm2QCqA...| +|http://geocities....|Chris-5050NollieS...| mov| video/quicktime|video/quicktime|93aa2ce07e01f90ad...|f066f29e5faf0cee1...|AAAHRG1vb3YAAABsb...| +|http://geocities....| floursack_jump2.avi| avi| video/x-msvideo|video/x-msvideo|a922441c0a7f0018d...|b82ca6fe1d46e16dc...|UklGRgjlAwBBVkkgT...| ++--------------------+--------------------+---------+--------------------+---------------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` + +## Extract Word Processor Files Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).wordProcessorFiles(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from word processor files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+------------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| 
++--------------------+--------------------+---------+--------------------+------------------+--------------------+--------------------+--------------------+ +|http://geocities....|infiniteproducts.doc| doc| application/msword|application/msword|ffa1ea83af6cb9508...|7a3ae86a7a22d2682...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| Everything.doc| doc| application/msword|application/msword|ff7216edf86fe196c...|082a889c27640fc9a...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| survey.doc| doc| application/msword|application/msword|ff48df5e64bd5adeb...|383ab6ead48795ff3...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| iepWrkshpFall01.doc| doc| application/msword|application/msword|ff421feb87b826d39...|ec60a48d393642629...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|24_reproduction_s...| doc| application/msword|application/msword|fec21eb30fac4588e...|36b41ba66801b10b9...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|Descendit_ad_Infe...| doc| application/msword|application/msword|fe66eeb7c04942c8b...|14f207787abef983e...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|Anthropology21FEB...| doc| application/msword|application/msword|fe079d498bd5e91f2...|ca54e6be7c0618ecc...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| Senses.doc| doc| application/msword|application/msword|fdf881ef998c227f7...|04d6e72132537053a...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|hopewel-loudon-cl...| doc| application/msword|application/msword|fddffbabcaf1976c9...|b7ade5d661dd597a1...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| billmprev9899.doc| doc| application/msword|application/msword|fdcc8b65cfb0a18c9...|602f323278c9fb726...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|approachesProject...| doc| application/msword|application/msword|fd4df7f89efe9cea7...|4e7be7664bfe992f3...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| batayan.doc| doc| application/msword|application/msword|fc6f45fdfce72d4a3...|e614c9b9e95d64aa6...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| VisitUnitPacket.doc| doc| application/msword|application/msword|fc2a0e45b627c3d4a...|dc7ba874b7b13d548...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|vc3c3ppstudyguide...| doc| application/msword|application/msword|fc293bbddb906615f...|538aa0d5e2f554258...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|30_chordates_fish...| doc| application/msword|application/msword|fc053770a82822f69...|9df86863983889373...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...|c6artposterexampl...| doc| application/msword|application/msword|fbe2427b48f32d1d9...|47de792202dc3a059...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| kun20509.doc| doc| application/msword|application/msword|fb8d1ae5e3db45131...|6b13d73759a956e62...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| kun20509| doc| application/msword|application/msword|fb8d1ae5e3db45131...|6b13d73759a956e62...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| Fishing.doc| doc| application/msword|application/msword|fb7df7ac80aa2cc8a...|eb4bb266226349bac...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| resumedoAw.doc| doc| application/msword|application/msword|fb6d5bf501b9b97b3...|1e0d6500192d4ee21...|0M8R4KGxGuEAAAAAA...| ++--------------------+--------------------+---------+--------------------+------------------+--------------------+--------------------+--------------------+ +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [url: string, filename: string ... 
6 more fields] +``` + +If you wanted to work with all the word processor files in a collection, +you could extract them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).wordProcessorFiles(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.word_processor() +df.show() +``` + +Will extract all following information from word processor files in a web collection: + +- file url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+------------------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server| mime_type_tika| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+------------------+--------------------+--------------------+--------------------+ +|http://geocities....| Doc2.doc| doc| application/msword|application/msword|09159efbefff59f64...|5412d6c55c2c8bec7...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CV-ITjob.doc| doc| application/msword|application/msword|7f2b7540e558de24e...|96a6ece7202ab309b...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CV-Teach.doc| doc| application/msword|application/msword|637bb22eff4bc5be5...|76130b6ffeac5c678...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CV-covlet.doc| doc| application/msword|application/msword|466c06bfa5a47d5cb...|dc763126cbdb589eb...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| CV-extra.doc| doc| application/msword|application/msword|ab0fa931229c02a4b...|4c2a8200e6eaaafb2...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|020410Indonesia_N...| doc| application/msword|application/msword|b195e90841347be61...|6d2845902ad15a9a2...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| Chapter1.doc| doc| application/msword|application/msword|65383c8c0cf5b6a4f...|fcf3008e9478b773c...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|CathyKoning_resum...| doc| application/msword|application/msword|924ad3f9f66d3c6bd...|2d0887c93ffd3e78b...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| Greek_colonels.doc| doc| application/msword|application/msword|ee4b9db827086d0db...|94e5569e064195db5...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| resume2.doc| doc| application/msword|application/msword|c39fa601733093268...|108563de6ba6102a5...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| eta_writeup.doc| doc| application/msword|application/msword|661328d76ce3aa340...|debadb248da4dfbd3...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|Before_Night_Fall...| doc| application/msword|application/msword|a40371b35b4bf0838...|8f1dba8a46ea297b8...|0M8R4KGxGuEAAAAAA...| +|http://geocities....|Membership_Form_2...| doc| application/msword|application/msword|bf3a3b8cc86b371c3...|472810e93a2245fb1...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| walkthroughff1.doc| doc| application/msword|application/msword|c97de6941c3fb4aed...|16851a5445bdce07d...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| Encyclopedia.doc| doc| application/msword|application/msword|26a94e8f3358c878c...|07f9b2ce6342f73bc...|0M8R4KGxGuEAAAAAA...| 
+|http://geocities....| Y.Kurulu.doc| doc| application/msword|application/msword|8e0ebe7c4f27b1841...|ebb5ce328f717f8e6...|0M8R4KGxGuEAAAAAA...| +|http://www.geocit...| fifty_eggs.doc| doc| application/msword|application/msword|2c1cdd4f75030650e...|d022311b2fc399750...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| 1pitagoras2.doc| doc| application/msword|application/msword|e07ff47cb8ebc4356...|97d46d781458f5a82...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| constitution.doc| doc| application/msword|application/msword|e38dc3e5d553d8799...|d50096b5208146ce9...|0M8R4KGxGuEAAAAAA...| +|http://geocities....| feasibility.doc| doc| application/msword|application/msword|5574bf82d65935191...|53de74880c9ea2e2b...|0M8R4KGxGuEAAAAAA...| ++--------------------+--------------------+---------+--------------------+------------------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` diff --git a/website/versioned_docs/version-1.0.0/dataframe-schemas.md b/website/versioned_docs/version-1.0.0/dataframe-schemas.md new file mode 100644 index 00000000..9b3c7f73 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/dataframe-schemas.md @@ -0,0 +1,156 @@ +--- +id: version-1.0.0-dataframe-schemas +title: DataFrame Schemas +original_id: dataframe-schemas +--- + +Below you can find all of the DataFrame schemas available in the Toolkit. For +example, you can use `.all()` to extract the overall content from a web archive +record. Some of the most popular ones include `.all()` (which includes raw +content (HTTP headers & HTML), URLs, and file types); `.webpages()` (which +includes full-text content and language); and `.webgraph()` which includes +hyperlink information. + +## All + +**`.all()`** + +- `crawl_date` (string) +- `domain` (string) +- `url` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `raw_content` (string) +- `bytes` (binary) +- `http_status_code` (string) +- `archive_filename` (string) + +## Web Pages + +**`.webpages()`** + +- `crawl_date` (string) +- `domain` (string) +- `url` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `language` (string) +- `content` + +## Web Graph + +**`.webgraph()`** + +- `crawl_date` (string) +- `src` (string) +- `dest` (string) +- `anchor` (string) + +## Image Graph + +**`.imagegraph()`** + +- `crawl_date` (string) +- `src` (string) +- `image_url` (string) +- `alt_text` (string) + +## Images + +**`.images()`** + +- `crawl_date` (string) +- `url` (string) +- `filename` (string) +- `extension` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `width` (string) +- `height` (string) +- `md5` (string) +- `sha1` (string) +- `bytes` (binary) + +## PDFs + +**`.pdfs()`** + +- `crawl_date` (string) +- `url` (string) +- `filename` (string) +- `extension` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `md5` (string) +- `sha1` (string) +- `bytes` (binary) + +## Audio + +**`.audio()`** + +- `crawl_date` (string) +- `url` (string) +- `filename` (string) +- `extension` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `md5` (string) +- `sha1` (string) +- `bytes` (binary) + +## Videos + +**`.videos()`** + +- `crawl_date` (string) +- `url` (string) +- `filename` (string) +- `extension` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `md5` (string) +- `sha1` (string) +- `bytes` (binary) + +## Spreadsheets + +**`.spreadsheets()`** + +- `crawl_date` (string) +- `url` (string) +- 
`filename` (string) +- `extension` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `md5` (string) +- `sha1` (string) +- `bytes` (binary) + +## Presentation Program Files + +**`.presentationProgramFiles()`** + +- `crawl_date` (string) +- `url` (string) +- `filename` (string) +- `extension` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `md5` (string) +- `sha1` (string) +- `bytes` (binary) + +## Word Processor Files + +**`.wordProcessorFiles()`** + +- `crawl_date` (string) +- `url` (string) +- `filename` (string) +- `extension` (string) +- `mime_type_web_server` (string) +- `mime_type_tika` (string) +- `md5` (string) +- `sha1` (string) +- `bytes` (binary) diff --git a/website/versioned_docs/version-1.0.0/extract-binary-info.md b/website/versioned_docs/version-1.0.0/extract-binary-info.md new file mode 100644 index 00000000..9bc38434 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/extract-binary-info.md @@ -0,0 +1,177 @@ +--- +id: version-1.0.0-extract-binary-info +title: Extract Binary Info +original_id: extract-binary-info +--- + +How do I extract the binary information of PDFs, audio files, video files, word +processor files, spreadsheet files, and presentation program files to a CSV +file, or into the [Apache Parquet](https://parquet.apache.org/) format +to [work with later](df-results.md#what-to-do-with-dataframe-results)? + +You can also read and write to Amazon S3 by supplying your AWS credentials, and +using `s3a`. + +## Scala RDD + +**Will not be implemented.** + +## Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +sc.setLogLevel("INFO") + +sc.hadoopConfiguration.set("fs.s3a.access.key", "YOUR ACCESS KEY") +sc.hadoopConfiguration.set("fs.s3a.secret.key", "YOUR SECRET KEY ") + +// Local web archive collection. +val warcs = RecordLoader.loadArchives("/local/path/to/warcs", sc) + +// S3 hosted web archive collection. +val warcsS3 = RecordLoader.loadArchives("s3a://your-data-bucket/", sc) + +// Choose your format: CSV or Parquet. + +// For CSV: +// .write.csv("/path/to/derivatives/csv/audio") +// .write.csv("s3a://your-derivatives-bucket/parquet/pages") + +// For Parquet: +// .write.parquet("/path/to/derivatives/parquet/pages/") +// .write.parquet("s3a://your-derivatives-bucket/parquet/pages") + +// Audio Files. +warcs.audio() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("/path/to/derivatives/csv/audio") + +// Images. +warcsS3.images() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"width", $"height", $"md5", $"sha1") + .write + .parquet("/path/to/derivatives/parquet/image") + +// PDFs. +warcs.pdfs() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("s3a://your-derivatives-bucket/csv/pdf") + +// Presentation Program Files. +warcs.presentationProgramFiles() + .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1") + .write + .parquet("s3a://your-derivatives-bucket/parquet/presentation-program") + +// Spreadsheets. 
warcs.spreadsheets()
  .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1")
  .write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/path/to/derivatives/csv/spreadsheet")

// Videos.
warcs.videos()
  .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1")
  .write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/path/to/derivatives/csv/video")

// Word Processor Files.
warcs.wordProcessorFiles()
  .select($"crawl_date", $"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5", $"sha1")
  .write
  .parquet("/path/to/derivatives/parquet/word-processor")

sys.exit
```

## Python DF

```python
from aut import *

# Web archive collection (dataset).
warcs = WebArchive(sc, sqlContext, "/path/to/aut-resources-master/Sample-Data/*gz")

# Choose your format: CSV or Parquet.

# For CSV:
# .write.csv('/path/to/derivatives/csv/audio')
# Include header='true' if you want headers.

# For Parquet:
# .write.parquet("/path/to/derivatives/parquet/pages/")

# Audio Files.
warcs.audio() \
    .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1") \
    .write \
    .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
    .format("csv") \
    .option("escape", "\"") \
    .option("encoding", "utf-8") \
    .save('/path/to/derivatives/csv/audio')

# Images.
warcs.images() \
    .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "width", "height", "md5", "sha1") \
    .write \
    .parquet('/path/to/derivatives/parquet/images')

# PDFs.
warcs.pdfs() \
    .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1") \
    .write \
    .parquet('/path/to/derivatives/csv/pdfs')

# Spreadsheets.
warcs.spreadsheets() \
    .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1") \
    .write \
    .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
    .format("csv") \
    .option("escape", "\"") \
    .option("encoding", "utf-8") \
    .save('/path/to/derivatives/csv/spreadsheets')

# Presentation Program Files.
warcs.presentation_program() \
    .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1") \
    .write \
    .parquet('/path/to/derivatives/csv/presentation_program')

# Videos.
warcs.video() \
    .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1") \
    .write \
    .parquet('/path/to/derivatives/csv/video')

# Word Processor Files.
warcs.word_processor() \
    .select("crawl_date", "url", "filename", "extension", "mime_type_web_server", "mime_type_tika", "md5", "sha1") \
    .write \
    .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
    .format("csv") \
    .option("escape", "\"") \
    .option("encoding", "utf-8") \
    .save('/path/to/derivatives/csv/word_processor')
```
diff --git a/website/versioned_docs/version-1.0.0/extract-binary.md b/website/versioned_docs/version-1.0.0/extract-binary.md
new file mode 100644
index 00000000..7025078e
--- /dev/null
+++ b/website/versioned_docs/version-1.0.0/extract-binary.md
@@ -0,0 +1,117 @@
---
id: version-1.0.0-extract-binary
title: Extract Binaries to Disk
original_id: extract-binary
---

How do I extract all the binary files of PDFs, audio files, video files, word processor
files, spreadsheet files, and presentation program files to disk?

## Scala RDD

**Will not be implemented.**

## Scala DF

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

// Web archive collection.
val warcs = RecordLoader.loadArchives("/path/to/warcs", sc)

// Audio Files.
warcs.audio()
  .select($"bytes", $"extension")
  .saveToDisk("bytes", "/path/to/extract/binaries/audio/your-prefix-audio", "extension")

// Images.
warcs.images()
  .select($"bytes", $"extension")
  .saveToDisk("bytes", "/path/to/extract/binaries/image/your-prefix-image", "extension")

// PDFs
warcs.pdfs()
  .select($"bytes", $"extension")
  .saveToDisk("bytes", "/path/to/extract/binaries/pdf/your-prefix-pdf", "extension")

// Presentation Program Files.
warcs.presentationProgramFiles()
  .select($"bytes", $"extension")
  .saveToDisk("bytes", "/path/to/extract/binaries/presentation-program/your-prefix-presentation-program", "extension")

// Spreadsheets.
warcs.spreadsheets()
  .select($"bytes", $"extension")
  .saveToDisk("bytes", "/path/to/extract/binaries/spreadsheet/your-prefix-spreadsheet", "extension")

// Videos.
warcs.videos()
  .select($"bytes", $"extension")
  .saveToDisk("bytes", "/path/to/extract/binaries/video/your-prefix-video", "extension")

// Word Processor Files.
warcs.wordProcessorFiles()
  .select($"bytes", $"extension")
  .saveToDisk("bytes", "/path/to/extract/binaries/word-processor/your-prefix-word-processor", "extension")

sys.exit
```

## Python DF

```python
from aut import *

# Web archive collection.
warcs = WebArchive(sc, sqlContext, "/path/to/warcs")

# Audio Files.
audio = warcs.audio()\
    .select("extension", "bytes")\
    .collect()

SaveBytes(audio, "/path/to/extract/binaries/audio")

# Images.
images = warcs.images()\
    .select("extension", "bytes")\
    .collect()

SaveBytes(images, "/path/to/extract/binaries/image")

# PDFs
pdfs = warcs.pdfs()\
    .select("extension", "bytes")\
    .collect()

SaveBytes(pdfs, "/path/to/extract/binaries/pdf")

# Presentation Program Files.
pp_files = warcs.presentation_program()\
    .select("extension", "bytes")\
    .collect()

SaveBytes(pp_files, "/path/to/extract/binaries/presentation_program")

# Spreadsheets.
spreadsheets = warcs.spreadsheets()\
    .select("extension", "bytes")\
    .collect()

SaveBytes(spreadsheets, "/path/to/extract/binaries/spreadsheet")

# Videos.
videos = warcs.video()\
    .select("extension", "bytes")\
    .collect()

SaveBytes(videos, "/path/to/extract/binaries/video")

# Word Processor Files.
wp_files = warcs.word_processor()\
    .select("extension", "bytes")\
    .collect()

SaveBytes(wp_files, "/path/to/extract/binaries/word_processor")
```
diff --git a/website/versioned_docs/version-1.0.0/filters-df.md b/website/versioned_docs/version-1.0.0/filters-df.md
new file mode 100644
index 00000000..49aa6847
--- /dev/null
+++ b/website/versioned_docs/version-1.0.0/filters-df.md
@@ -0,0 +1,328 @@
---
id: version-1.0.0-filters-df
title: DataFrame Filters
original_id: filters-df
---

## Has Content

Filters or removes all data that does or does not pass a specified regular
expression test on content.

### Scala DF

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

val content = Array("Content-Length: [0-9]{4}")

RecordLoader.loadArchives("/path/to/warcs", sc)
  .all()
  .select("url", "raw_content")
  .filter(!hasContent($"raw_content", lit(content)))
```

### Python DF

```python
from aut import *
from pyspark.sql.functions import col

content = "Content-Length: [0-9]{4}"

WebArchive(sc, sqlContext, "/path/to/warcs") \
  .all() \
  .select("url", "raw_content") \
  .filter(col("raw_content").rlike(content))
```

## Has Dates

Filters or keeps all data that does or does not match the timestamps or
date patterns specified.

### Scala DF

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

val dates = Array("2008.*", "200908.*", "20070502231159")

RecordLoader.loadArchives("/path/to/warcs",sc)
  .all()
  .select($"url", $"crawl_date")
  .filter(!hasDate($"crawl_date", lit(dates)))
```

### Python DF

```python
from aut import *
from pyspark.sql.functions import col

dates = ["2008.*", "200908.*", "20070502231159"]

WebArchive(sc, sqlContext, "/path/to/warcs") \
  .all() \
  .select("url", "crawl_date") \
  .filter(~col("crawl_date").isin(dates))
```

## Has Domain(s)

Filters or keeps all data that does or does not match the source domain(s) specified.

### Scala DF

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

val domains = Array("archive.org", "sloan.org")

RecordLoader.loadArchives("/path/to/warcs",sc)
  .webpages()
  .select($"url")
  .filter(!hasDomains(extractDomain($"url"), lit(domains)))
```

### Python DF

```python
from aut import *
from pyspark.sql.functions import col

domains = ["archive.org", "sloan.org"]

WebArchive(sc, sqlContext, "/path/to/warcs") \
  .webpages() \
  .select("url") \
  .filter(~(extract_domain("url").isin(domains)))
```

## Has HTTP Status

Filters or keeps all data that does or does not match the status codes specified.

### Scala DF

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

val statusCodes = Array("200","000")

RecordLoader.loadArchives("/path/to/warcs",sc)
  .all()
  .select($"url", $"http_status_code")
  .filter(!hasHTTPStatus($"http_status_code", lit(statusCodes)))
```

### Python DF

```python
from aut import *
from pyspark.sql.functions import col

status_codes = ["200", "000"]

WebArchive(sc, sqlContext, "/path/to/warcs") \
  .all() \
  .select("url", "http_status_code") \
  .filter(~col("http_status_code").isin(status_codes))
```

## Has Images

Filters or keeps all data that does or does not contain images.
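
The Scala example below keeps only records that contain images. To do the opposite and drop them, the same predicate can be negated; this is a sketch using the same placeholder path and the same `hasImages` and `detectMimeTypeTika` UDFs as the example that follows, filtering before projecting so every referenced column is still available:

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

// Keep only records that do *not* contain images by negating the predicate.
RecordLoader.loadArchives("/path/to/warcs", sc)
  .all()
  .filter(!hasImages($"crawl_date", detectMimeTypeTika($"bytes")))
  .select($"mime_type_tika", $"mime_type_web_server", $"url")
```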
+ +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +RecordLoader.loadArchives("/path/to/warcs",sc) + .all() + .select($"mime_type_tika", $"mime_type_web_server", $"url") + .filter(hasImages($"crawl_date", detectMimeTypeTika($"bytes"))) +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .select("mime_type_tika", "mime_type_web_server", "url") \ + .filter(col("mime_type_tika").like("image/%") | col("mime_type_web_server").like("image/%")) +``` + +## Has Languages + +Filters or keeps all data that does or does not match the language(s) ([ISO +639-2 codes](https://www.loc.gov/standards/iso639-2/php/code_list.php)) +specified. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val languages = Array("th","de","ht") + +RecordLoader.loadArchives("/path/to/warcs",sc) + .webpages() + .select($"language", $"url", $"content") + .filter(!hasContent($"language", lit(languages))) +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +languages = ["th","de","ht"] + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("language", "url", "content") \ + .filter(~col("language").isin(languages)) +``` + +## Keep MIME Types (Apache Tika) + +Filters or keeps all data that does or does not match the MIME type(s) +(identified by [Apache Tika](https://tika.apache.org/)) specified. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val mimeTypes = Array("text/html", "text/plain") + +RecordLoader.loadArchives("/path/to/warcs",sc) + .all() + .select($"url", $"mime_type_tika") + .filter(!hasMIMETypesTika($"mime_type_tika", lit(mimeTypes))) +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +mime_types = ["text/html", "text/plain"] + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .select("url", "mime_type_tika") \ + .filter(~col("mime_type_tika").isin(mime_types)) +``` + +## Keep MIME Types (web server) + +Filters or keeps all data that does or does not match the MIME type(s) +(identified by the web server) specified. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val mimeTypes = Array("text/html", "text/plain") + +RecordLoader.loadArchives("/path/to/warcs",sc) + .all() + .select($"url", $"mime_type_web_server") + .filter(!hasMIMETypes($"mime_type_web_server", lit(mimeTypes))) +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +mime_types = ["text/html", "text/plain"] + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .select("url", "mime_type_web_server") \ + .filter(~col("mime_type_web_server").isin(mime_types)) +``` + +## Has URL Patterns + +Filters or removes all data that does or does not pass a specified regular +expression test on URL patterns. 
+ +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val urlsPattern = Array(".*images.*") + +RecordLoader.loadArchives("/path/to/warcs",sc) + .all() + .select($"url", $"raw_content") + .filter(hasUrlPatterns($"url", lit(urlsPattern))) +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +url_pattern = ".*images.*" + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .select("url", "raw_content") \ + .filter(~col("url").rlike(url_pattern)) +``` + +## Has URLs + +Filters or keeps all data that does or does not match the URL(s) specified. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val urls = Array("archive.org") + +RecordLoader.loadArchives("/path/to/warcs",sc) + .all() + .select($"url", $"raw_content") + .filter(hasUrls($"url", lit(urls))) +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +urls = ["archive.org"] + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .select("url", "raw_content") \ + .filter(~col("url").isin(urls)) +``` diff --git a/website/versioned_docs/version-1.0.0/image-analysis.md b/website/versioned_docs/version-1.0.0/image-analysis.md new file mode 100644 index 00000000..48b8bf91 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/image-analysis.md @@ -0,0 +1,357 @@ +--- +id: version-1.0.0-image-analysis +title: Image Analysis +original_id: image-analysis +--- + +The Archives Unleashed Toolkit supports image analysis, a growing area of +interest within web archives. + +## Extract Image Information + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).images(); + +df.select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"width", $"height", $"md5", $"sha1", $"bytes") + .orderBy(desc("md5")) + .show() +``` + +Will extract all following information from images in a web collection: + +- image url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- Width +- Height +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server|mime_type_tika|width|height| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +|http://www.archiv...|mediatype_movies.gif| gif| image/gif| image/gif| 21| 21|ff05f9b408519079c...|194800d702aab9b87...|R0lGODlhFQAVAKUpA...| +|http://www.archiv...| LOCLogoSmall.jpg| jpg| image/jpeg| image/jpeg| 275| 300|fbf1aec668101b960...|564c1a07152c12cea...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...| archive.small.jpg| jpg| image/jpeg| image/jpeg| 300| 225|f611b554b9a44757d...|e9bf7ef0ae3fc50f5...|/9j/4RpBRXhpZgAAT...| +|http://tsunami.ar...| tsunamiweb1_02.jpg| jpg| image/jpeg| image/jpeg| 384| 229|f02005e29ffb485ca...|9eeb9c3c67d7efc51...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...|alexa_websearch_l...| gif| image/gif| image/gif| 301| 47|eecc909992272ce0d...|ea18e226f3cf40005...|R0lGODlhLQEvAPcAA...| +|http://www.archiv...| 
lizardtech.gif| gif| image/gif| image/gif| 140| 37|e7166743861126e51...|cf26e9ffc27be133f...|R0lGODlhjAAlANUwA...| +|http://www.archiv...| half_star.png| png| image/png| image/png| 14| 12|e1e101f116d9f8251...|736abd06a978e2fd2...|iVBORw0KGgoAAAANS...| +|http://www.archiv...| hewlett.jpg| jpg| image/jpeg| image/jpeg| 300| 116|e1da27028b81db60e...|eb418c17901b1b313...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...|prelinger-header-...| jpg| image/jpeg| image/jpeg| 84| 72|d39cce8b2f3aaa783...|1c41a644123e8f861...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...| arrow.gif| gif| image/gif| image/gif| 13| 11|c7ee6d7c17045495e...|7013764e619066e60...|R0lGODlhDQALALMAA...| +|http://www.archiv...| folder.png| png| image/png| image/png| 20| 15|c1905fb5f16232525...|ff7b8c60e8397cb5d...|iVBORw0KGgoAAAANS...| +|http://www.archiv...| wayback-wtc.gif| gif| image/gif| image/gif| 35| 35|c15ec074d95fe7e1e...|f45425406600b136d...|R0lGODlhIwAjANUAA...| +|http://www.archiv...| clicktoplay.png| png| image/png| image/png| 320| 240|b148d9544a1a65ae4...|477105e3a93b60dd8...|iVBORw0KGgoAAAANS...| +|http://www.archiv...| orange_arrow.gif| gif| image/gif| image/gif| 8| 11|a820ac93e2a000c9d...|850b9daeef06bee6e...|R0lGODlhCAALAJECA...| +|http://www.archiv...| arc-it-tagline.gif| gif| image/gif| image/gif| 385| 30|9f70e6cc21ac55878...|4601e2f642d8e55ac...|R0lGODlhgQEeALMPA...| +|http://www.archiv...| guitar.jpg| jpg| image/jpeg| image/jpeg| 140| 171|9ed163df5065418db...|f6c9475009ae2416c...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...| blendbar.jpg| jpg| image/jpeg| image/jpeg| 1800| 89|9e41e4d6bdd53cd9d...|dc780bf80720c87c9...|/9j/4AAQSkZJRgABA...| +|http://www.archiv...|alexalogo-archive...| gif| image/gif| image/gif| 304| 36|9da73cf504be0eb70...|03e530ef04e4b68f7...|R0lGODlhMAEkAOYAA...| +|http://www.archiv...| lma.jpg| jpg| image/jpeg| image/jpeg| 215| 71|97ebd3441323f9b5d...|ff9485b26300721b2...|/9j/4AAQSkZJRgABA...| +|http://i.creative...| 88x31.png| png| image/png| image/png| 88| 31|9772d34b683f8af83...|689bef4ffb8918612...|iVBORw0KGgoAAAANS...| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ + +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [url: string, filename: string ... 
7 more fields] +``` + +If you wanted to work with all the images in a collection, you could extract +them with the following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).images(); + +df.select($"bytes", $"extension") + .saveToDisk("bytes", "/path/to/export/directory/your-preferred-filename-prefix", $"extension") +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.images() +df.show() +``` + +Will extract all following information from images in a web collection: + +- image url +- filename +- extension +- MimeType as identified by the hosting web server +- MimeType as identified by [Apache Tika](https://tika.apache.org) +- Width +- Height +- md5 hash +- sha1 hash +- bytes + +```dataframe ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +| url| filename|extension|mime_type_web_server|mime_type_tika|width|height| md5| sha1| bytes| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +|http://farm3.stat...|4047878934_ef12ba...| jpg| image/jpeg| image/jpeg| 100| 75|e1a376f170b815f49...|2165fd2908950e9f6...|/9j/4AAQSkZJRgABA...| +|http://farm3.stat...|4047881126_fc6777...| jpg| image/jpeg| image/jpeg| 75| 100|371a2a5142c611405...|933f937c949826696...|/9j/4AAQSkZJRgABA...| +|http://farm3.stat...|4047879492_a72dd8...| jpg| image/jpeg| image/jpeg| 100| 75|8877679361cde970d...|31dbaaed2f7194c95...|/9j/4AAQSkZJRgABA...| +|http://farm3.stat...|4047877728_c6c118...| jpg| image/jpeg| image/jpeg| 75| 100|8f009a568d47e1888...|7b83e7d6c78ed65cf...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|96d9290d060547781...|2d3005bd6e09ca064...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|c69d65d4880445b31...|abe40cb96bfc79095...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|cb11c08d43e25ec3b...|2060857d6cf41b141...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|756b5a0a83a621eb7...|d4625efc80efb985e...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|0b60007c3e3d9d63f...|a154035590a01efb4...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|97fdea388e1245691...|e415a77a4369ecef8...|/9j/4AAQSkZJRgABA...| +|http://img.youtub...| 0.jpg| jpg| image/jpeg| image/jpeg| 480| 360|05c2d43f687f40b60...|ed3f6ca2f3d7e9569...|/9j/4AAQSkZJRgABA...| +|http://www.canadi...| WebResource.axd| gif| image/gif| image/gif| 1| 1|325472601571f31e1...|2daeaa8b5f19f0bc2...|R0lGODlhAQABAIAAA...| +|http://www.davids...|footprint-carbon.jpg| jpg| image/jpeg| image/jpeg| 200| 200|51f57de92e76f3edc...|c970137cd3bfdbbba...|/9j/4AAQSkZJRgABA...| +|http://www.gca.ca...| 15.jpg| jpg| image/jpeg| image/jpeg| 300| 230|8b3c192b9a0cc82d6...|851377ed11c9cd153...|/9j/4AAQSkZJRgABA...| +|http://www.equalv...|loadingAnimation.gif| gif| image/gif| image/gif| 208| 13|c33734a1bf58bec32...|2bb50e01775289c24...|R0lGODlh0AANAMQAA...| +|http://www.davids...|Keep-greening-gre...| jpg| image/jpeg| image/jpeg| 166| 252|4763383a8be13c735...|a42b963e18dc1e7d4...|/9j/4AAQSkZJRgABA...| 
+|http://www.davids...|Keep-greening-don...| jpg| image/jpeg| image/jpeg| 146| 252|515bd44bea759e169...|75abeb65cc4f54c7d...|/9j/4AAQSkZJRgABA...| +|http://www.davids...|Keep-greening-eca...| jpg| image/jpeg| image/jpeg| 158| 252|345f71df9702e99a0...|b6637ac654f6e2073...|/9j/4AAQSkZJRgABA...| +|http://www.davids...|Keep-greening-tit...| jpg| image/jpeg| image/jpeg| 470| 45|385522fde90ac7e96...|b42151cf8c3ce14e0...|/9j/4AAQSkZJRgABA...| +|http://www.davids...| last_minute2.jpg| jpg| image/jpeg| image/jpeg| 265| 33|3defee897d4c553fc...|37c790bbc23c369d8...|/9j/4AAQSkZJRgABA...| ++--------------------+--------------------+---------+--------------------+--------------+-----+------+--------------------+--------------------+--------------------+ +only showing top 20 rows +``` + +## Extract Most Frequent Image URLs + +### Scala RDD + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .flatMap(r => ExtractImageLinks(r.getUrl, r.getContentString)) + .countItems() + .take(10) +``` + +Will extract the top ten URLs of images found within a collection, in an array +like so: + +```scala +links: Array[(String, Int)] = Array((http://www.archive.org/images/star.png,408), (http://www.archive.org/images/no_star.png,122), (http://www.archive.org/images/logo.jpg,118), (http://www.archive.org/images/main-header.jpg,84), (http://www.archive.org/images/rss.png,20), (http://www.archive.org/images/mail.gif,13), (http://www.archive.org/images/half_star.png,10), (http://www.archive.org/images/arrow.gif,7), (http://ia300142.us.archive.org/3/items/americana/am_libraries.gif?cnt=0,3), (http://ia310121.us.archive.org/2/items/GratefulDead/gratefuldead.gif?cnt=0,3), (http://www.archive.org/images/wayback.gif,2), (http://www.archive.org/images/wayback-election2000.gif,2), (http://www.archive.org/images/wayback-wt... +``` + +If you wanted to work with the images, you could download them from the +Internet Archive. + +Let's use the top-ranked example. [This +link](http://web.archive.org/web/*/http://archive.org/images/star.png), for +example, will show you the temporal distribution of the image. For a snapshot +from September 2007, this URL would work: + + + +To do analysis on all images, you could thus prepend +`http://web.archive.org/web/20070913051458/` to each URL and `wget` them en +masse. + +For more information on `wget`, please consult [this lesson available on the +Programming Historian +website](http://programminghistorian.org/lessons/automated-downloading-with-wget). 
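+
+If you would rather stay in Scala than shell out to `wget`, the same idea can
+be sketched with plain JDK I/O. This is a minimal sketch only: the snapshot
+timestamp, the example URLs, and the output directory below are placeholders
+you will want to replace with your own values.
+
+```scala
+import java.net.URL
+import java.nio.file.{Files, Paths}
+
+// Wayback Machine snapshot prefix (placeholder timestamp from the example above).
+val prefix = "http://web.archive.org/web/20070913051458/"
+
+// A few of the frequently linked image URLs; swap in your own list.
+val imageUrls = Seq(
+  "http://www.archive.org/images/star.png",
+  "http://www.archive.org/images/no_star.png",
+  "http://www.archive.org/images/logo.jpg"
+)
+
+// Local directory that will hold the downloaded files.
+val outDir = Paths.get("/path/to/downloaded-images")
+Files.createDirectories(outDir)
+
+imageUrls.foreach { u =>
+  val target = outDir.resolve(u.substring(u.lastIndexOf('/') + 1))
+  val in = new URL(prefix + u).openStream()
+  try Files.copy(in, target) finally in.close()
+}
+```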
+ +### Scala DF + +The following script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val df = RecordLoader.loadArchives("/path/to/warcs", sc).imagegraph(); + +df.groupBy($"image_url") + .count() + .orderBy($"count".desc) + .show(10) +``` + +Will extract the top ten URLs of images found within a collection, in a +DataFrame like so: + +```dataframe ++--------------------+-----+ +| image_url|count| ++--------------------+-----+ +|http://www.archiv...| 408| +|http://www.archiv...| 122| +|http://www.archiv...| 83| +|http://www.archiv...| 49| +|http://www.archiv...| 20| +|http://www.archiv...| 13| +|http://www.archiv...| 10| +|http://www.archiv...| 7| +|http://ia300142.u...| 3| +|http://ia310121.u...| 3| ++--------------------+-----+ +only showing top 10 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +df: org.apache.spark.sql.DataFrame = [src: string, image_url: string] +``` + +### Python DF + +The following script: + +```python +from aut import * + +archive = WebArchive(sc, sqlContext, "/path/to/warcs") + +df = archive.imagegraph() + +df.groupBy("image_url") + .count() + .orderBy("count", ascending=False) + .show(10) +``` + +Will extract the top ten URLs of images found within a collection, in a +DataFrame like so: + +```dataframe ++--------------------+-----+ +| image_url|count| ++--------------------+-----+ +|http://www.archiv...| 408| +|http://www.archiv...| 122| +|http://www.archiv...| 83| +|http://www.archiv...| 49| +|http://www.archiv...| 20| +|http://www.archiv...| 13| +|http://www.archiv...| 10| +|http://www.archiv...| 7| +|http://ia300142.u...| 3| +|http://ia310121.u...| 3| ++--------------------+-----+ +``` + +## Extract Most Frequent Images MD5 Hash + +Some images may be the same, but have different URLs. This UDF finds the +popular images by calculating the MD5 hash of each and presents the most +frequent images based on that metric. This script: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.app._ +import io.archivesunleashed.matchbox._ + +val r = RecordLoader.loadArchives("/path/to/warcs",sc).persist() +ExtractPopularImages(r, 500, sc).saveAsTextFile("500-Popular-Images") +``` + +Will save the 500 most popular URLs to an output directory. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.app._ + +val df = RecordLoader.loadArchives("/path/to/warcs",sc).images() + +ExtractPopularImagesDF(df,10,30,30).show() +``` + +### Python DF + +```python +from aut import * + +images = WebArchive(sc, sqlContext, "/path/to/warcs").images() + +popular_images = ExtractPopularImages(images, 20, 10, 10) + +popular_images.show() +``` + +## Find Images Shared Between Domains + +How to find images shared between domains that appear more than once _in more +than one domain_. 
+ +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val images = RecordLoader.loadArchives("/path/to/warcs", sc) + .images() + .select(removePrefixWWW(extractDomain($"url")).as("domain"), $"url", $"md5") + +val links = images.groupBy("md5").count().where(countDistinct("domain")>=2) + +val result = images.join(links, "md5") + .groupBy("domain", "md5") + .agg(first("url").as("image_url")) + .orderBy(asc("md5")) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("/path/to/output") +``` + +### PythonDF + +```python +from aut import * +from pyspark.sql.functions import asc, countDistinct, first + +images = WebArchive(sc, sqlContext, "/path/to/warcs") \ + .images() \ + .select(remove_prefix_www(extract_domain("url")).alias("domain"), "url", "md5") + +links = images.groupBy("md5") \ + .count() \ + .where(countDistinct("domain")>=2) + +result = images.join(links, "md5") \ + .groupBy("domain", "md5") \ + .agg(first("url").alias("image_url")) \ + .orderBy(asc("md5")) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("/path/to/output") +``` diff --git a/website/versioned_docs/version-1.0.0/link-analysis.md b/website/versioned_docs/version-1.0.0/link-analysis.md new file mode 100644 index 00000000..a92434ef --- /dev/null +++ b/website/versioned_docs/version-1.0.0/link-analysis.md @@ -0,0 +1,593 @@ +--- +id: version-1.0.0-link-analysis +title: Link Analysis +original_id: link-analysis +--- + +Site link structures can be very useful, allowing you to learn such things as: + +- what websites were the most linked to; +- what websites had the most outbound links; +- what paths could be taken through the network to connect pages; +- what communities existed within the link structure? + +Most of the following examples show the **domain** to **domain** links. For +example, you discover how many times that `liberal.ca` linked to `twitter.com`, +rather than learning that `http://liberal.ca/contact` linked to +`http://twitter.com/liberal_party`. The reason we do that is that in general, +if you are working with any data at scale, the sheer number of raw URLs can +become overwhelming. That said, we do provide one example below that provides +raw data. + +## Extract Simple Site Link Structure + +### Scala RDD + +If your web archive does not have a temporal component, the following Spark +script will generate the site-level link structure. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .flatMap(r => ExtractLinks(r.getUrl, r.getContentString)) + .map(r => (ExtractDomain(r._1).removePrefixWWW(), ExtractDomain(r._2).removePrefixWWW())) + .filter(r => r._1 != "" && r._2 != "") + .countItems() + .filter(r => r._2 > 5) + .saveAsTextFile("links-all-rdd/") +``` + +Note how you can add filters. In this case, we add a filter which +will result in a network graph of pages containing the phrase "apple." Filters +can be applied immediately after `.keepValidPages()`. 
+ +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepContent(Set("apple".r)) + .flatMap(r => ExtractLinks(r.getUrl, r.getContentString)) + .map(r => (ExtractDomain(r._1).removePrefixWWW(), ExtractDomain(r._2).removePrefixWWW())) + .filter(r => r._1 != "" && r._2 != "") + .countItems() + .filter(r => r._2 > 5) + .saveAsTextFile("links-all-apple-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webgraph() + .groupBy(removePrefixWWW(extractDomain($"src")).as("src"), removePrefixWWW(extractDomain($"dest")).as("dest")) + .count() + .filter($"count" > 5) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("links-all-df/") +``` + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val content = Array("radio") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .all() + .keepValidPagesDF() + .filter(hasContent($"raw_content", lit(content))) + .select(explode(extractLinks($"url", $"raw_content")).as("links")) + .select(removePrefixWWW(extractDomain(col("links._1"))).as("src"), removePrefixWWW(extractDomain(col("links._2"))).as("dest")) + .groupBy("src", "dest") + .count() + .filter($"count" > 5) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("links-all-apple-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col, explode + +content = "%radio%" + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .filter("crawl_date is not NULL")\ + .filter(~(col("url").rlike(".*robots\\.txt$")) & (col("mime_type_web_server").rlike("text/html") | col("mime_type_web_server").rlike("application/xhtml+xml") | col("url").rlike("(?i).*htm$") | col("url").rlike("(?i).*html$")))\ + .filter(col("http_status_code") == 200) + .filter(col("raw_content").like(content)) \ + .select(explode(extract_links("url", "raw_content")).alias("links")) \ + .select(remove_prefix_www(extract_domain(col("links._1"))).alias("src"), remove_prefix_www(extract_domain(col("links._2"))).alias("dest")) \ + .groupBy("src", "dest") \ + .count() \ + .filter(col("count") > 5) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("links-all-apple-df/") +``` + +## Extract Raw URL Link Structure + +### Scala RDD + +This following script extracts all of the hyperlink relationships between +sites, using the full URL pattern. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .flatMap(r => ExtractLinks(r.getUrl, r.getContentString)) + .filter(r => r._1 != "" && r._2 != "") + .countItems() + .saveAsTextFile("full-links-all-rdd/") +``` + +You can see that the above was achieved by removing the following line: + +```scala + .map(r => (ExtractDomain(r._1).removePrefixWWW(), ExtractDomain(r._2).removePrefixWWW())) +``` + +In a larger collection, you might want to add the following line: + +```scala +.filter(r => r._2 > 5) +``` + +before `.countItems()` to find just the documents that are linked to more than +five times. As you can imagine, raw URLs are very numerous! 
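+
+For reference, here is a sketch of the full raw-URL pipeline with that count
+filter applied to the output of `.countItems()` (the output path below is only
+a placeholder):
+
+```scala
+import io.archivesunleashed._
+import io.archivesunleashed.matchbox._
+
+RecordLoader.loadArchives("/path/to/warcs", sc)
+  .keepValidPages()
+  .flatMap(r => ExtractLinks(r.getUrl, r.getContentString))
+  .filter(r => r._1 != "" && r._2 != "")
+  .countItems()
+  // Keep only source/target URL pairs that occur more than five times.
+  .filter(r => r._2 > 5)
+  .saveAsTextFile("full-links-all-filtered-rdd/")
+```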
+ +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webgraph() + .groupBy(extractDomain($"src"), extractDomain($"dest")) + .count() + .filter($"count" > 5) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("full-links-all-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webgraph() \ + .groupBy(extract_domain("src"), extract_domain("dest")) \ + .count() \ + .filter(col("count") > 5) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("full-links-all-df/") +``` + +## Organize Links by URL Pattern + +### Scala RDD + +In this following example, we run the same script but only extract links coming +from URLs matching the pattern `http://www.archive.org/details/*`. We do so by +using the `keepUrlPatterns` command. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepUrlPatterns(Set("(?i)http://www.archive.org/details/.*".r)) + .flatMap(r => ExtractLinks(r.getUrl, r.getContentString)) + .map(r => (ExtractDomain(r._1).removePrefixWWW(), ExtractDomain(r._2).removePrefixWWW())) + .filter(r => r._1 != "" && r._2 != "") + .countItems() + .filter(r => r._2 > 5) + .saveAsTextFile("details-links-all-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val urlPattern = Array("(?i)http://www.archive.org/details/.*") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .all() + .keepValidPagesDF() + .filter(hasUrlPatterns($"url", lit(urlPattern))) + .select(explode(extractLinks($"url", $"raw_content")).as("links")) + .select(removePrefixWWW(extractDomain(col("links._1"))).as("src"), removePrefixWWW(extractDomain(col("links._2"))).as("dest")) + .groupBy("src", "dest") + .count() + .filter($"count" > 5) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("details-links-all-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col, explode + +url_pattern = "%http://www.archive.org/details/%" + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .filter("crawl_date is not NULL")\ + .filter(~(col("url").rlike(".*robots\\.txt$")) & (col("mime_type_web_server").rlike("text/html") | col("mime_type_web_server").rlike("application/xhtml+xml") | col("url").rlike("(?i).*htm$") | col("url").rlike("(?i).*html$")))\ + .filter(col("http_status_code") == 200) + .filter(col("url").like(url_pattern)) \ + .select(explode(extract_links("url", "raw_content").alias("links"))) \ + .select(remove_prefix_www(extract_domain(col("links._1"))).alias("src"), remove_prefix_www(extract_domain("links._2")).alias("dest")) \ + .groupBy("src", "dest") \ + .count() \ + .filter(col("count") > 5) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("details-links-all-df/") +``` + +## Organize Links by Crawl Date + +### Scala RDD + +The following Spark script generates the aggregated site-level link structure, +grouped by crawl date (YYYYMMDD). 
It +makes use of the `ExtractLinks` and `ExtractToLevelDomain` functions. + +If you prefer to group by crawl month (YYYMM), replace `getCrawlDate` with +`getCrawlMonth` below. If you prefer to group by simply crawl year (YYYY), +replace `getCrawlDate` with `getCrawlYear` below. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc).keepValidPages() + .map(r => (r.getCrawlDate, ExtractLinks(r.getUrl, r.getContentString))) + .flatMap(r => r._2.map(f => (r._1, ExtractDomain(f._1).replaceAll("^\\s*www\\.", ""), ExtractDomain(f._2).replaceAll("^\\s*www\\.", "")))) + .filter(r => r._2 != "" && r._3 != "") + .countItems() + .filter(r => r._2 > 5) + .saveAsTextFile("sitelinks-by-date-rdd/") +``` + +The format of this output is: + +- Field one: Crawldate, `yyyyMMdd` +- Field two: Source domain (i.e. liberal.ca) +- Field three: Target domain of link (i.e. ndp.ca) +- Field four: number of links. + +```scala +((20080612,liberal.ca,liberal.ca),1832983) +((20060326,ndp.ca,ndp.ca),1801775) +((20060426,ndp.ca,ndp.ca),1771993) +((20060325,policyalternatives.ca,policyalternatives.ca),1735154) +``` + +In the above example, you are seeing links within the same domain. + +Note also that `ExtractLinksRDD` takes an optional third parameter of a base +URL. If you set this – typically to the source URL – `ExtractLinksRDD` will +resolve a relative path to its absolute location. For example, if `val url = +"http://mysite.com/some/dirs/here/index.html"` and `val html = "... Contact ..."`, and we call `ExtractLinks(url, html, +url)`, the list it returns will include the item +`(http://mysite.com/a/b/c/index.html, http://mysite.com/a/b/contact/, +Contact)`. It may be useful to have this absolute URL if you intend to call +`ExtractDomainRDD` on the link and wish it to be counted. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webgraph() + .groupBy($"crawl_date", removePrefixWWW(extractDomain($"src")), removePrefixWWW(extractDomain($"dest"))) + .count() + .filter($"count" > 5) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("sitelinks-by-date-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webgraph() \ + .groupBy("crawl_date", remove_prefix_www(extract_domain("src")).alias("src"), remove_prefix_www(extract_domain("dest")).alias("dest")) \ + .count() \ + .filter(col("count") > 5) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("sitelinks-by-date-df/") +``` + +## Filter by URL + +### Scala RDD + +In this case, you would only receive links coming from websites in matching the +URL pattern listed under `keepUrlPatterns`. 
+ +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +val links = RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepUrlPatterns(Set("http://www.archive.org/details/.*".r)) + .map(r => (r.getCrawlDate, ExtractLinks(r.getUrl, r.getContentString))) + .flatMap(r => r._2.map(f => (r._1, ExtractDomain(f._1).replaceAll("^\\s*www\\.", ""), ExtractDomain(f._2).replaceAll("^\\s*www\\.", "")))) + .filter(r => r._2 != "" && r._3 != "") + .countItems() + .filter(r => r._2 > 5) + .saveAsTextFile("sitelinks-details-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val urlPattern = Array("http://www.archive.org/details/.*") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .all() + .keepValidPagesDF() + .filter(hasUrlPatterns($"url", lit(urlPattern))) + .select(explode(extractLinks($"url", $"raw_content")).as("links")) + .select(removePrefixWWW(extractDomain(col("links._1"))).as("src"), removePrefixWWW(extractDomain(col("links._2"))).as("dest")) + .groupBy("src", "dest") + .count() + .filter($"count" > 5) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("sitelinks-details-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col, explode + +url_pattern = "http://www.archive.org/details/.*" + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .filter("crawl_date is not NULL")\ + .filter(~(col("url").rlike(".*robots\\.txt$")) & (col("mime_type_web_server").rlike("text/html") | col("mime_type_web_server").rlike("application/xhtml+xml") | col("url").rlike("(?i).*htm$") | col("url").rlike("(?i).*html$")))\ + .filter(col("http_status_code") == 200) + .filter(col("url").rlike(url_pattern)) \ + .select(explode(extract_links("url", "raw_content")).alias("links")) \ + .select(remove_prefix_www(extract_domain(col("links._1"))).alias("src"), remove_prefix_www(extract_domain(col("links._2"))).alias("dest")) \ + .groupBy("src", "dest") \ + .count() \ + .filter(col("count") > 5) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("sitelinks-details-df/") +``` + +## Export to Gephi + +You may want to export your data directly to the [Gephi software +suite](http://gephi.github.io/), an open-source network analysis project. The +following code writes to the GEXF format: + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +import io.archivesunleashed.app._ + +val graph = RecordLoader.loadArchives("/path/to/warcs",sc) + .webgraph.groupBy( + $"crawl_date", + removePrefixWWW(extractDomain($"src")).as("src_domain"), + removePrefixWWW(extractDomain($"dest")).as("dest_domain")) + .count() + .filter(!($"dest_domain"==="")) + .filter(!($"src_domain"==="")) + .filter($"count" > 5) + .orderBy(desc("count")) + .collect() + +WriteGEXF(graph, "links-for-gephi.gexf") +``` + +We also support exporting to the +[GraphML](https://en.wikipedia.org/wiki/GraphML) format. 
To do so, use +the `WriteGraphml` method: + +```scala +WriteGraphML(graph, "links-for-gephi.graphml") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col, desc + +graph = WebArchive(sc, sqlContext, "/path/to/data") \ + .webgraph() \ + .groupBy("crawl_date", remove_prefix_www(extract_domain("src")).alias("src_domain"), remove_prefix_www(extract_domain("dest")).alias("dest_domain")) \ + .count() \ + .filter((col("dest_domain").isNotNull()) & (col("dest_domain") !="")) \ + .filter((col("src_domain").isNotNull()) & (col("src_domain") !="")) \ + .filter(col("count") > 5) \ + .orderBy(desc("count")) \ + .collect() + +WriteGEXF(graph, "links-for-gephi.gexf") +``` + +We also support exporting to the +[GraphML](https://en.wikipedia.org/wiki/GraphML) format. To do so, use +the `WriteGraphml` method: + +```python +WriteGraphML(graph, "links-for-gephi.graphml") +``` + +## Finding Hyperlinks within Collection on Pages with Certain Keyword + +The following script will extract a DataFrame with the following columns, +`domain`, `url`, `crawl date`, `origin page`, and `destination page`, given a +search term `Keystone` of the content (full-text). The example uses the sample +data in +[`aut-resources`](https://github.com/archivesunleashed/aut-resources/tree/master/Sample-Data). + +### Scala RDD + +**Will not be implemented.** + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val result = udf((vs: Seq[Any]) => vs(0) + .toString + .split(",")(1)) + +val df = RecordLoader.loadArchives("/path/to/warcs", sc) + .all() + .keepValidPagesDF() + .select($"domain", + $"url", + $"crawl_date", + explode_outer(extractLinks($"url", $"raw_content")) + .as("link")) + .filter($"raw_content".contains("keystone")) + +df.select($"url", $"domain", $"crawl_date", result(array($"link")) + .as("destination_page")) + .show() + +// Exiting paste mode, now interpreting. 
+ ++--------------------+---------------+----------+--------------------+ +| url| domain|crawl_date| destination_page| ++--------------------+---------------+----------+--------------------+ +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| +|http://www.davids...|davidsuzuki.org| 20091219|http://www.davids...| ++--------------------+---------------+----------+--------------------+ +only showing top 20 rows + +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ +result: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(,StringType,None) +df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Domain: string, url: string ... 2 more fields] +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col, explode_outer + +webpages = WebArchive(sc, sqlContext, "/path/to/warcs") \ + .all() \ + .filter("crawl_date is not NULL")\ + .filter(~(col("url").rlike(".*robots\\.txt$")) & (col("mime_type_web_server").rlike("text/html") | col("mime_type_web_server").rlike("application/xhtml+xml") | col("url").rlike("(?i).*htm$") | col("url").rlike("(?i).*html$")))\ + .filter(col("http_status_code") == 200) + .select("domain", "url", "crawl_date", explode_outer(extract_links("url", "raw_content")).alias("link")) \ + .filter(col("raw_content").like("%food%")) \ + .select("url", "domain", "crawl_date", col("link._1").alias("destination_page")) \ + .show() +``` diff --git a/website/versioned_docs/version-1.0.0/text-analysis.md b/website/versioned_docs/version-1.0.0/text-analysis.md new file mode 100644 index 00000000..544b9ee1 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/text-analysis.md @@ -0,0 +1,597 @@ +--- +id: version-1.0.0-text-analysis +title: Text Analysis +original_id: text-analysis +--- + +## Extract All Plain Text + +### Scala RDD + +This script extracts the crawl date, domain, URL, and plain text from HTML +files in the sample ARC data (and saves the output to out/). By default, HTTP +headers are included in the plain text that is extracted. 
+ +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(r.getContentString))) + .saveAsTextFile("plain-text-rdd/") +``` + +Note that this will create a new directory to store the output, which cannot +already exist. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .select($"crawl_date", $"domain", $"url", removeHTML($"content")) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-df/") +``` + +### Python DF + +```python +from aut import * + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", remove_html("content")) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("plain-text-df/") +``` + +## Extract Plain Text Without HTTP Headers + +### Scala RDD + +If you want to remove HTTP headers, you can add one more command: +`RemoveHTTPHeader`. The script would then look like: + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-noheaders-rdd/") +``` + +As most plain text use cases do not require HTTP headers to be in the output, +we are removing headers in the following examples. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .select(removeHTML(removeHTTPHeader($"content"))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-noheaders-df/") +``` + +### Python DF + +```python +from aut import * + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select(remove_html(remove_http_header("content"))) \ + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\ + .format("csv")\ + .option("escape", "\"")\ + .option("encoding", "utf-8")\ + .save("plain-text-noheaders-df/") +``` + +## Extract Plain Text By Domain + +### Scala RDD + +The following Spark script generates plain text renderings for all the web +pages in a collection with a URL matching a filter string. In the example case, +it will go through the collection and find all of the URLs within the +"archive.org" domain. 
+ +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepDomains(Set("archive.org")) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-domain-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val domains = Array("archive.org", "geocities.org") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .select($"crawl_date", $"domain", $"url", removeHTML(removeHTTPHeader($"content"))) + .filter(hasDomains($"domain", lit(domains))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-domain-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +domains = ["archive.org"] + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", remove_html(remove_http_header("content"))) \ + .filter(col("domain").isin(domains)) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("plain-text-domain-df/") +``` + +## Extract Plain Text by URL Pattern + +### Scala RDD + +The following Spark script generates plain text renderings for all the web +pages in a collection with a URL matching a regular expression pattern. In the +example case, it will go through a WARC file and find all of the URLs beginning +with `http://archive.org/details/`, and save the text of those URLs. + +The `(?i)` makes this query case insensitive. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepUrlPatterns(Set("(?i)http://www.archive.org/details/.*".r)) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("details-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val urlPattern = Array("(?i)http://www.archive.org/details/.*") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .select($"crawl_date", $"domain", $"url", removeHTML(removeHTTPHeader($"content"))) + .filter(hasUrlPatterns($"url", lit(urlPattern))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("details-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +url_pattern = "%http://www.archive.org/details/%" + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", remove_html(remove_http_header("content"))) \ + .filter(col("url").like(url_pattern)) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("details-df/") +``` + +## Extract Plain Text Minus Boilerplate + +### Scala RDD + +The following Spark script generates plain text renderings for all the web +pages in a collection, minus "boilerplate" content: advertisements, +navigational elements, and elements of the website template. Boilerplate requires +HTML, so it needs to run on `.all()` raw content. Not `.webpages()` content. 
For +more information on the boilerplate removal library we are using, [please see +this website and paper](http://www.l3s.de/~kohlschuetter/boilerplate/). + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepDomains(Set("archive.org")) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, ExtractBoilerpipeText(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-no-boilerplate-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val domains = Array("archive.org") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .all() + .select($"crawl_date", $"domain", $"url", extractBoilerpipeText(removeHTTPHeader($"content"))) + .filter(hasDomains($"domain", lit(domains))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-no-boilerplate-df/") +``` + +### Python DF + +```python +from aut import * + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", extract_boilerplate(remove_http_header("content")).alias("content")) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("plain-text-no-boilerplate-df/") +``` + +## Extract Plain Text Filtered by Date + +### Scala RDD + +AUT permits you to filter records by a list of full or partial date strings. It +conceives of the date string as a `DateComponent`. Use `keepDate` to specify +the year (`YYYY`), month (`MM`), day (`DD`), year and month (`YYYYMM`), or a +particular year-month-day (`YYYYMMDD`). + +The following Spark script extracts plain text for a given collection by date +(in this case, April 2008). + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepDate(List("200804"), ExtractDate.DateComponent.YYYYMM) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-date-filtered-200804/") +``` + +The following script extracts plain text for a given collection by year (in +this case, 2008). + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepDate(List("2008"), ExtractDate.DateComponent.YYYY) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-date-filtered-2008/") +``` + +Finally, you can also extract multiple dates or years. In this case, we would +extract pages from both 2008 and 2015. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepDate(List("2008","2015"), ExtractDate.DateComponent.YYYY) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-date-filtered-2008-2015-rdd/") +``` + +Note: if you created a dump of plain text using another one of the earlier +commands, you do not need to go back and run this. You can instead use bash to +extract a sample of text. 
For example, running this command on a dump of all +plain text stored in `alberta_education_curriculum.txt`: + +```bash +sed -n -e '/^(201204/p' alberta_education_curriculum.txt > alberta_education_curriculum-201204.txt +``` + +would select just the lines beginning with `(201204`, or April 2012. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val dates = Array("2008", "2015") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .select($"crawl_date", $"domain", $"url", removeHTML(removeHTTPHeader($"content"))) + .filter(hasDate($"crawl_date", lit(dates))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-date-filtered-2008-2015-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +dates = "2009[10][09]\d\d" + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", remove_html(remove_http_header("content"))) \ + .filter(col("crawl_date").rlike(dates)) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("plain-text-date-filtered-2008-2015-df/") +``` + +## Extract Plain Text Filtered by Language + +### Scala RDD + +The following Spark script keeps only French language pages from a certain +top-level domain. It uses the [ISO 639.2 language +codes](https://www.loc.gov/standards/iso639-2/php/code_list.php). + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .keepDomains(Set("archive.org")) + .keepLanguages(Set("fr")) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-fr-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val domains = Array("archive.org") +val languages = Array("fr") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .select($"crawl_date", $"domain", $"url", $"language", removeHTML(removeHTTPHeader($"content"))) + .filter(hasDomains($"domain", lit(domains))) + .filter(hasLanguages($"language", lit(languages))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-fr-df/") +``` + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val domains = Array("archive.org") +val languages = Array("fr") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .filter(hasDomains($"domain", lit(domains))) + .filter(hasLanguages($"language", lit(languages))) + .select($"crawl_date", $"domain", $"url", $"language", removeHTML(removeHTTPHeader($"content"))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-fr-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +domains = ["geocities.com"] +languages = ["fr"] + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", remove_html(remove_http_header("content"))) \ + .filter(col("domain").isin(domains)) \ + .filter(col("language").isin(languages)) \ + .write \ + 
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("plain-text-fr-df/") +``` + +## Extract Plain text Filtered by Keyword + +### Scala RDD + +The following Spark script keeps only pages containing a certain keyword, which +also stacks on the other scripts. + +For example, the following script takes all pages containing the keyword +"radio" in a collection. + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs",sc) + .keepValidPages() + .keepContent(Set("radio".r)) + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(RemoveHTTPHeader(r.getContentString)))) + .saveAsTextFile("plain-text-radio-rdd/") +``` + +There is also `discardContent` which does the opposite, and can be used in +cases where, for example, you have a frequent keyword you are not interested +in. + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +val content = Array("radio") + +RecordLoader.loadArchives("/path/to/warcs", sc) + .webpages() + .select($"crawl_date", $"domain", $"url", removeHTML(removeHTTPHeader($"content"))) + .filter(hasContent($"content", lit(content))) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-text-radio-df/") +``` + +### Python DF + +```python +from aut import * +from pyspark.sql.functions import col + +content = "%radio%" + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", remove_html(remove_http_header("content"))) \ + .filter(col("content").like(content)) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("plain-text-radio-df/") +``` + +## Extract Raw HTML + +### Scala RDD + +In most cases, users will be interested in working with plain text. In some +cases, however, you may want to work with the actual HTML of the pages +themselves (for example, looking for specific tags or HTML content). + +The following script will produce the raw HTML of a WARC file. You can use the +filters from above to filter it down accordingly by domain, language, etc. 
+ +```scala +import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ + +RecordLoader.loadArchives("/path/to/warcs", sc) + .keepValidPages() + .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTTPHeader(r.getContentString))) + .saveAsTextFile("plain-html-rdd/") +``` + +### Scala DF + +```scala +import io.archivesunleashed._ +import io.archivesunleashed.udfs._ + +RecordLoader.loadArchives("example.warc.gz", sc) + .webpages() + .select($"crawl_date", extractDomain($"url"), $"url", removeHTTPHeader($"content")) + .write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .format("csv") + .option("escape", "\"") + .option("encoding", "utf-8") + .save("plain-html-df/") +``` + +### Python DF + +```python +from aut import * + +WebArchive(sc, sqlContext, "/path/to/warcs") \ + .webpages() \ + .select("crawl_date", "domain", "url", remove_http_header("content")) \ + .write \ + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \ + .format("csv") \ + .option("escape", "\"") \ + .option("encoding", "utf-8") \ + .save("plain-html-df/") +``` diff --git a/website/versioned_docs/version-1.0.0/toolkit-walkthrough.md b/website/versioned_docs/version-1.0.0/toolkit-walkthrough.md new file mode 100644 index 00000000..c7124469 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/toolkit-walkthrough.md @@ -0,0 +1,424 @@ +--- +id: version-1.0.0-toolkit-walkthrough +title: Toolkit Walkthrough +original_id: toolkit-walkthrough +--- + +Welcome to the Archives Unleashed Toolkit hands-on walkthrough! + +![Spark Terminal](https://user-images.githubusercontent.com/218561/73990154-4d1bd800-4916-11ea-9b6e-10e4503dfa38.png) + +The reality of any hands-on workshop is that things will break. We've tried our +best to provide a robust environment that can let you walk through the basics +of the Archives Unleashed Toolkit alongside us. + +If you have any questions, let us know in [Slack](http://slack.archivesunleashed.org/)! + +## Table of Contents + +- [Installation and Use](#installation-and-use) + - [Hello World: Our First Script](#hello-world-our-first-script) +- [Extracting some Text](#extracting-some-text) + - [Ouch: Our First Error](#ouch-our-first-error) + - [Other Text Analysis Filters](#other-text-analysis-filters) +- [Web of Links: Network Analysis](#web-of-links-network-analysis) +- [Working with the Data](#working-with-the-data) +- [Acknowledgements and Final Notes](#acknowledgements-and-final-notes) + +## Installation and Use + +**Got Docker?** +This lesson requires that you install +[Docker](https://www.docker.com/get-docker), and use `docker-aut`. We have +instructions on how to install Docker +[here](https://github.com/archivesunleashed/docker-aut/wiki/Docker-Install-Instructions) +, as well as instructions on how to build and use `docker-aut` +[here](https://github.com/archivesunleashed/docker-aut#build-and-run). + +Later in this lesson, we use the networking tool [Gephi](https://gephi.org/). + +Make sure that Docker is running! If it isn't, you might see an error like +`docker: Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is +the docker daemon running?` – make sure to run it (on Mac, for example, you +need to run the Docker application itself). + +Make a directory in your userspace, somewhere where you can find it, on your desktop +perhaps. Call it `data`. In my case, I will create it on my desktop +and it will have a path like `/Users/ianmilligan1/desktop/data`. + +Use the following command, replacing `/path/to/your/data` with the directory. 
+**If you want to use your own ARC or WARC files, please put them in this +directory**. + +`docker run --rm -it -v "/path/to/your/data:/data" aut` + +For example, if your files are in `/Users/ianmilligan1/desktop/data` you would +run the above command like: + +`docker run --rm -it -v "/Users/ianmilligan1/desktop/data:/data" aut` + +

**Troubleshooting Tips**

The above commands are important, as they make the rest of the lesson possible!

Remember that you need to have the second `:/data` in the above example. This
maps the directory called "data" on my desktop to a directory inside the
Docker container called `/data`.

Also, if you are using Windows, you will need to provide the path as it appears
in your file system. For example: `C:\Users\ianmilligan1\data`.
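
Putting the Windows advice together with the earlier command, the launch would
look something like the sketch below (the username and folder are hypothetical,
and depending on your Docker setup you may need to adjust the path format):

```shell
docker run --rm -it -v "C:\Users\yourname\data:/data" aut
```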

Once you run this command, you will have to wait a few minutes while data is
downloaded and AUT builds. Once it is all working, you should see:

```shell
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/

Using Scala version 2.11.12 (OpenJDK 64-Bit Server VM, Java 1.8.0_212)
Type in expressions to have them evaluated.
Type :help for more information.

scala>
```

## Hello World: Our First Script

Now that we are at the prompt, let's get used to running commands. The easiest
way to use the Spark Shell is to _copy and paste_ in scripts that you've
written somewhere else.

Fortunately, the Spark Shell supports this functionality!

At the `scala>` prompt, type the following command and press Enter.

```shell
:paste
```

Now cut and paste the following script:

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
  .all()
  .keepValidPagesDF()
  .groupBy($"domain")
  .count()
  .sort($"count".desc)
  .show(10, false)
```

Let's take a moment to look at this script. It:

- begins by importing the AUT libraries;
- tells the program where it can find the data (in this case, the sample data
  that we have included in this Docker image);
- tells it only to keep the
  "[valid](https://aut.docs.archivesunleashed.org/docs/filters-rdd#scala-df)"
  pages, in this case HTML data;
- groups the pages by their base domain - i.e. for `www.google.com/cats` we
  are interested just in the domain, `google.com`;
- counts them - how many times does `google.com` appear in this collection,
  for example;
- and displays a DataFrame of the top ten!

Once it is pasted in, let's run it.

You run pasted scripts by pressing `ctrl` + `d`. Try that now.

You should see:

```dataframe
// Exiting paste mode, now interpreting.

+-------------------------+-----+
|domain                   |count|
+-------------------------+-----+
|equalvoice.ca            |4274 |
|liberal.ca               |1968 |
|policyalternatives.ca    |588  |
|greenparty.ca            |535  |
|fairvote.ca              |442  |
|ndp.ca                   |416  |
|davidsuzuki.org          |348  |
|canadiancrc.com          |88   |
|communist-party.ca       |39   |
|ccsd.ca                  |22   |
+-------------------------+-----+
only showing top 10 rows

import io.archivesunleashed._
import io.archivesunleashed.udfs._
```

We like to use this example for two reasons:

- It is fairly simple and lets us know that AUT is working;
- and it tells us what we can expect to find in the web archives! In this case,
  we have a lot of the Liberal Party of Canada, Equal Voice Canada, and the
  Green Party of Canada.

**If you loaded your own data above**, you can access that directory by
substituting the directory in the `loadArchives` command. Try it again!
Remember to type `:paste`, paste the following command in, and then `ctrl` +
`d` to execute.

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

RecordLoader.loadArchives("/data/*.gz", sc)
  .all()
  .keepValidPagesDF()
  .groupBy($"domain")
  .count()
  .sort($"count".desc)
  .show(10, false)
```

## Extracting some Text

Now that we know what we might find in a web archive, let us try extracting
some text. You might want to get just the text of a given website or domain,
for example.

Above we learned that the Liberal Party of Canada's website has 1,968 captures
in the sample files we provided. Let's try to just extract that text.
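
If you would like to double-check the capture count for a single domain before
extracting its text, a quick variation on the earlier counting script works;
this is a sketch that assumes the same sample data path and the `domain` column
we grouped on above.

```scala
import io.archivesunleashed._

// Count how many captures the sample data holds for a single domain.
RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
  .webpages()
  .filter($"domain" === "liberal.ca")
  .count()
```

The number printed should match the figure in the domain table above (1,968 for
liberal.ca in the sample data).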

To load the extraction script below, remember to type `:paste`, copy and paste
it into the shell, and then press `ctrl` + `d`.

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._
import org.apache.spark.sql.functions.lit

val domains = Array("liberal.ca")

RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
  .webpages()
  .select($"crawl_date", $"domain", $"url", $"content")
  .filter(hasDomains($"domain", lit(domains)))
  .write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/data/liberal-party-text")
```

**If you're using your own data, that's why the domain count was key!** Swap
out "liberal.ca" above for a domain that you want to look at in your own data.

Now let's look at the ensuing data. Go to the folder you provided in the very
first startup – remember, in my case it was `/Users/ianmilligan1/desktop/data`,
and you will now have a folder called `liberal-party-text`. Open up the files
with your text editor and check it out!

## Ouch: Our First Error

One of the vexing parts of this interface is that it creates output
directories, and if the directory already exists, the script comes tumbling
down.

As this is one of the most common errors, let's see it and then learn how to
get around it.

Try running the **exact same script** that you did above.

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._
import org.apache.spark.sql.functions.lit

val domains = Array("liberal.ca")

RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
  .webpages()
  .select($"crawl_date", $"domain", $"url", $"content")
  .filter(hasDomains($"domain", lit(domains)))
  .write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/data/liberal-party-text")
```

Instead of a nice crisp feeling of success, you will see a long dump of text
beginning with:

```scala
20/02/06 23:43:05 WARN SparkSession$Builder: Using an existing SparkSession; some configuration may not take effect.
org.apache.spark.sql.AnalysisException: path file:/data/liberal-party-text already exists.;
```

To get around this, you can do two things:

- Delete the existing directory that you created;
- Change the output path - to `/data/liberal-party-text-2`, for example.

Good luck!

## Other Text Analysis Filters

Take some time to explore the various filters that you can use. Check out the
[documentation](https://aut.docs.archivesunleashed.org/docs/filters-df)
for some ideas.

Some options:

- **Keep URL Patterns**: Instead of domains, what if you wanted to have text
  relating to just a certain pattern? Substitute `hasDomains` for
  `hasUrlPatterns`, for example:
  `.filter(hasUrlPatterns($"url", lit(Array(".*liberal.*", ".*ndp.*"))))`
- **Filter by Date**: What if we just wanted data from 2006? You could add the
  following command after `.webpages()`: `.filter(hasDate($"crawl_date", lit(Array("2006"))))`
- **Filter by Language**: What if you just want French-language pages? Add
  another filter: `.filter(hasLanguages($"language", lit(Array("fr"))))`.

For example, if we just wanted the French-language Liberal pages, we would run:

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._
import org.apache.spark.sql.functions.lit

val domains = Array("liberal.ca")
val languages = Array("fr")

RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
  .webpages()
  .filter(hasDomains($"domain", lit(domains)))
  .filter(hasLanguages($"language", lit(languages)))
  .select($"crawl_date", $"domain", $"url", $"content")
  .write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/data/liberal-party-french-text")
```

Or if we wanted to just have pages from 2006, we would run:

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._
import org.apache.spark.sql.functions.lit

val dates = Array("2006")

RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
  .webpages()
  .filter(hasDate($"crawl_date", lit(dates)))
  .select($"crawl_date", $"domain", $"url", $"content")
  .write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/data/2006-text")
```

Finally, if we want to remove the HTTP headers – let's say if we want to create
some nice word clouds – we can wrap the content column in the `removeHTTPHeader`
UDF:

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._

RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
  .webpages()
  .select($"crawl_date", $"domain", $"url", removeHTTPHeader($"content"))
  .write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/data/text-no-headers")
```

You could now try uploading one of the plain text files using a website like
[Voyant Tools](https://voyant-tools.org).

## Web of Links: Network Analysis

One other thing we can do is a network analysis. By now you are probably
getting good at running code.

Let's extract all of the links from the sample data and export them to a file
format that the popular network analysis program Gephi can use.

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs._
import io.archivesunleashed.app._
import org.apache.spark.sql.functions.desc

val webgraph = RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc).webgraph()

val graph = webgraph.groupBy(
    $"crawl_date",
    removePrefixWWW(extractDomain($"src")).as("src_domain"),
    removePrefixWWW(extractDomain($"dest")).as("dest_domain"))
  .count()
  .filter(!($"dest_domain" === ""))
  .filter(!($"src_domain" === ""))
  .filter($"count" > 5)
  .orderBy(desc("count"))

WriteGEXF(graph.collect(), "/data/links-for-gephi.gexf")
```

By now this should seem pretty straightforward! (Remember to keep using
`:paste` to enter this code.)

## Working with the Data

The first step should be to work with this network diagram so you can make a
beautiful visualization yourself.

![Gephi visualization](https://archivesunleashed.org/images/gephi.png)

First, let's use these instructions to [work with Gephi](https://cloud.archivesunleashed.org/derivatives/gephi).

Second, we can begin to think about how to work with the plain text file.
See the following documents from our "learning guides":

- [**Filtering the Full-Text Derivative
  File**](https://cloud.archivesunleashed.org/derivatives/text-filtering): This
  tutorial explores the use of the "grep" command line tool to filter out
  dates, domains, and keywords from plain text.
- [**Text Analysis Part One: Beyond the Keyword Search: Using
  AntConc**](https://cloud.archivesunleashed.org/derivatives/text-antconc):
  This tutorial shows how you can explore text within a web archive using
  the AntConc tool.
- [**Text Analysis Part Two: Sentiment Analysis With the Natural Language
  Toolkit**](https://cloud.archivesunleashed.org/derivatives/text-sentiment):
  This tutorial explores how you can calculate the positivity or negativity (in
  an emotional sense) of web archive text.

Good luck and thanks for joining us on this lesson plan.

## Acknowledgements and Final Notes

The ARC and WARC files are drawn from the [Canadian Political Parties &
Political Interest Groups Archive-It
Collection](https://archive-it.org/collections/227), collected by the
University of Toronto. We are grateful that they've provided this material to
us.

If you use their material, please cite it along the following lines:

- University of Toronto Libraries, Canadian Political Parties and Interest
  Groups, Archive-It Collection 227, Canadian Action Party,

You can find more information about this collection at [WebArchives.ca](http://webarchives.ca/).
diff --git a/website/versioned_docs/version-1.0.0/usage.md b/website/versioned_docs/version-1.0.0/usage.md
new file mode 100644
index 00000000..ba1742e1
--- /dev/null
+++ b/website/versioned_docs/version-1.0.0/usage.md
@@ -0,0 +1,149 @@
---
id: version-1.0.0-usage
title: Usage
original_id: usage
---

## The Toolkit with Spark Submit

The Toolkit offers a variety of extraction jobs with
[`spark-submit`](https://spark.apache.org/docs/latest/submitting-applications.html).
These extraction jobs have a few configuration options.

The extraction jobs follow this basic outline:

```shell
spark-submit --class io.archivesunleashed.app.CommandLineAppRunner PATH_TO_AUT_JAR --extractor EXTRACTOR --input INPUT DIRECTORY --output OUTPUT DIRECTORY
```

More information on using the Toolkit with `spark-submit` can be found in the
[The Toolkit with spark-submit](aut-spark-submit-app.md) section of the documentation.

## The Toolkit with Spark Shell

To load the Toolkit in the Spark Shell, use the `--jars` option:

```shell
spark-shell --help

  --jars JARS                 Comma-separated list of jars to include on the driver
                              and executor classpaths.
```

### With an UberJar

Release version:

```shell
spark-shell --jars /path/to/aut-1.0.0-fatjar.jar
```

HEAD (built locally):

```shell
spark-shell --jars /path/to/aut/target/aut-1.0.1-SNAPSHOT-fatjar.jar
```

## The Toolkit with PySpark

To run PySpark with the Toolkit loaded, you will need to
provide PySpark with the Java/Scala artifact, as well as the Python bindings.
The Java/Scala artifact can be provided with `--jars` as
described above. The Python bindings can be
[downloaded](https://github.com/archivesunleashed/aut/releases/download/aut-1.0.0/aut-1.0.0.zip),
or [built locally](#building-locally) (the zip file will be found in
the `target` directory).

In each of the examples below, `/path/to/python` is listed. If you are unsure
where your Python is, it can be found with `which python`.
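
For example, the output might look like the following hypothetical session;
whatever path it prints is what you would substitute for `/path/to/python` in
the commands below.

```shell
# Hypothetical session; your path will differ.
$ which python
/home/nruest/bin/anaconda3/bin/python
```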

### With an UberJar

Release version:

```shell
export PYSPARK_PYTHON=/path/to/python; export PYSPARK_DRIVER_PYTHON=/path/to/python; /path/to/spark/bin/pyspark --py-files aut-1.0.0.zip --jars /path/to/aut-1.0.0-fatjar.jar
```

HEAD (built locally):

```shell
export PYSPARK_PYTHON=/path/to/python; export PYSPARK_DRIVER_PYTHON=/path/to/python; /path/to/spark/bin/pyspark --py-files aut.zip --jars /path/to/aut-1.0.1-SNAPSHOT-fatjar.jar
```

## The Toolkit with Jupyter

To run a [Jupyter Notebook](https://jupyter.org/install) with the Archives
Unleashed Toolkit loaded, you will need to provide PySpark with the Java/Scala
artifact and the Python bindings. The Java/Scala artifact can be provided
with `--jars` as described above. The Python bindings can be
[downloaded](https://github.com/archivesunleashed/aut/releases/download/aut-1.0.0/aut-1.0.0.zip),
or [built locally](#building-locally) (the zip file will be found in
the `target` directory).

### With an UberJar

Release version:

```shell
export PYSPARK_DRIVER_PYTHON=jupyter; export PYSPARK_DRIVER_PYTHON_OPTS=notebook; /path/to/spark/bin/pyspark --py-files aut-1.0.0.zip --jars /path/to/aut-1.0.0-fatjar.jar
```

HEAD (built locally):

```shell
export PYSPARK_DRIVER_PYTHON=jupyter; export PYSPARK_DRIVER_PYTHON_OPTS=notebook; /path/to/spark/bin/pyspark --py-files aut.zip --jars /path/to/aut-1.0.1-SNAPSHOT-fatjar.jar
```

A Jupyter Notebook _should_ automatically load in your browser at
`http://localhost:8888`. You may be asked for a token upon first launch, which
just offers a bit of security. The token is available in the load screen and
will look something like this:

```shell
[I 19:18:30.893 NotebookApp] Writing notebook server cookie secret to /run/user/1001/jupyter/notebook_cookie_secret
[I 19:18:31.111 NotebookApp] JupyterLab extension loaded from /home/nruest/bin/anaconda3/lib/python3.7/site-packages/jupyterlab
[I 19:18:31.111 NotebookApp] JupyterLab application directory is /home/nruest/bin/anaconda3/share/jupyter/lab
[I 19:18:31.112 NotebookApp] Serving notebooks from local directory: /home/nruest/Projects/au/aut
[I 19:18:31.112 NotebookApp] The Jupyter Notebook is running at:
[I 19:18:31.112 NotebookApp] http://localhost:8888/?token=87e7a47c5a015cb2b846c368722ec05c1100988fd9dcfe04
[I 19:18:31.112 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
[C 19:18:31.140 NotebookApp]

    To access the notebook, open this file in a browser:
        file:///run/user/1001/jupyter/nbserver-9702-open.html
    Or copy and paste one of these URLs:
        http://localhost:8888/?token=87e7a47c5a015cb2b846c368722ec05c1100988fd9dcfe04
```

Create a new notebook by clicking "New" (near the top right of the Jupyter
homepage) and select "Python 3" from the drop-down list.

The notebook will open in a new window. In the first cell enter:

```python
from aut import *

archive = WebArchive(sc, sqlContext, "src/test/resources/warc/")

webpages = archive.webpages()
webpages.printSchema()
```

Then hit Shift+Enter, or press the play button.

If you receive no errors, and see the following, you are ready to begin working
with your web archives!

![](https://user-images.githubusercontent.com/218561/63203995-42684080-c061-11e9-9361-f5e6177705ff.png)

## The Toolkit with Docker

The Toolkit offers a Docker container that can be used with Spark and PySpark.
The container is great for learning how the Toolkit works, and for quick
prototyping.
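
For example, once you have built the image following the `docker-aut`
instructions (assumed here to be tagged `aut`, as in the Toolkit walkthrough),
you can drop into an interactive Spark shell with a local directory mounted at
`/data`:

```shell
docker run --rm -it -v "/path/to/your/data:/data" aut
```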
+Containers are available for [each +release](https://github.com/archivesunleashed/docker-aut/branches), as well as +the `main` branch. + +More information on using the `docker-aut` can be found +[here](https://github.com/archivesunleashed/docker-aut). diff --git a/website/versions.json b/website/versions.json index e8a6fd72..8321bc29 100644 --- a/website/versions.json +++ b/website/versions.json @@ -1,4 +1,5 @@ [ + "1.0.0", "0.91.0", "0.90.4", "0.90.3",