bin/pdfextract

#!/usr/bin/env ruby

=begin

= Info
    Extracts valuable data from a PDF document. Can extract:
     - decoded streams
     - JavaScript
     - file attachments

= License
    Copyright (C) 2016  Guillaume Delugré.

    Origami is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Origami is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with Origami.  If not, see <http://www.gnu.org/licenses/>.

=end

begin
    require 'origami'
rescue LoadError
    $: << File.join(__dir__, '../lib')
    require 'origami'
end
include Origami

require 'optparse'
require 'rexml/document'

class OptParser
    BANNER = <<USAGE
Usage: #{$0} <PDF-file> [-afjms] [-d <output-directory>]
Extracts various data out of a document (streams, scripts, images, fonts, metadata, attachments).
Bug reports or feature requests at: http://github.com/gdelugre/origami

Options:
USAGE

    def self.parser(options)
        OptionParser.new do |opts|
            opts.banner = BANNER

            opts.on("-d", "--output-dir DIR", "Output directory") do |d|
                options[:output_dir] = d
            end

            opts.on("-s", "--streams", "Extracts all decoded streams") do
                options[:streams] = true
            end

            opts.on("-a", "--attachments", "Extracts file attachments") do
                options[:attachments] = true
            end

            opts.on("-f", "--fonts", "Extracts embedded font files") do
                options[:fonts] = true
            end

            opts.on("-j", "--js", "Extracts JavaScript scripts") do
                options[:javascript] = true
            end

            opts.on("-m", "--metadata", "Extracts metadata streams") do
                options[:metadata] = true
            end

            opts.on("-i", "--images", "Extracts embedded images") do
                options[:images] = true
            end

            opts.on_tail("-h", "--help", "Show this message") do
                puts opts
                exit
            end
        end
    end

    def self.parse(args)
        options = {}

        self.parser(options).parse!(args)

        options
    end
end

begin
    @options = OptParser.parse(ARGV)

    if ARGV.empty?
        abort "Error: No filename was specified. #{$0} --help for details."
    else
        target = ARGV.shift
    end

    unless %i[streams javascript attachments fonts metadata images].any? {|opt| @options[opt]}
        @options[:streams] =
        @options[:javascript] =
        @options[:fonts] =
        @options[:attachments] =
        @options[:images] = true
    end

    if @options[:output_dir].nil?
        @options[:output_dir] = "#{File.basename(target, '.pdf')}.dump"
    end

    # Force data extraction, even for invalid FlateDecode streams.
    Origami::OPTIONS[:ignore_zlib_errors] = true
    Origami::OPTIONS[:ignore_png_errors] = true

    OUTPUT_DIR = @options[:output_dir]
    Dir::mkdir(OUTPUT_DIR) unless File.directory?(OUTPUT_DIR)

    params =
    {
        verbosity: Parser::VERBOSE_QUIET,
    }
    pdf = PDF.read(target, params)

    if @options[:streams]
        nstreams = 0
        stream_dir = File.join(OUTPUT_DIR, "streams")
        Dir::mkdir(stream_dir) unless File.directory?(stream_dir)

        pdf.each_object.select {|obj| obj.is_a?(Stream)}.each do |stream|
            stream_file = File.join(stream_dir, "stream_#{stream.reference.refno}.dmp")
            begin
                File.binwrite(stream_file, stream.data)
            rescue
                STDERR.puts "Cannot decode stream #{stream.reference}: #{$!.message}"
                next
            end

            nstreams += 1
        end

        puts "Extracted #{nstreams} PDF streams to '#{stream_dir}'."
    end

    if @options[:javascript]
        nscripts = 0
        js_dir = File.join(OUTPUT_DIR, "scripts")
        Dir::mkdir(js_dir) unless File.directory?(js_dir)

        pdf.ls(/^JS$/).each do |script|
            script_file = File.join(js_dir, "script_#{script.hash}.js")
            script_data =
                case script
                when Stream then script.data
                else script.value
                end

            File.binwrite(script_file, script_data)
            nscripts += 1
        end

        # Also checking for presence of JavaScript in XML forms.
        if pdf.form? and pdf.Catalog.AcroForm.has_key?(:XFA)
            xfa = pdf.Catalog.AcroForm.XFA

            case xfa
            when Array then
                xml = ""
                i = 0
                xfa.each do |packet|
                    if i % 2 == 1
                        xml << packet.solve.data
                    end

                    i = i + 1
                end
            when Stream then
                xml = xfa.data
            else
                reject("Malformed XFA dictionary")
            end

            xfadoc = REXML::Document.new(xml)
            REXML::XPath.match(xfadoc, "//script").each do |script|
                script_file = File.join(js_dir, "script_#{script.hash}.js")
                File.binwrite(script_file, script.text)
                nscripts += 1
            end
        end

        puts "Extracted #{nscripts} scripts to '#{js_dir}'."
    end

    if @options[:attachments]
        nattach = 0
        attachments_dir = File.join(OUTPUT_DIR, "attachments")
        Dir::mkdir(attachments_dir) unless File.directory?(attachments_dir)

        pdf.each_attachment do |name, attachment|
            name = name.to_utf8.tr("\/\x00", "_")
            attached_file = File.join(attachments_dir, "attached_#{File.basename(name)}")

            if attachment and attachment.EF and attachment.EF.F.is_a?(Stream)
                File.binwrite(attached_file, attachment.EF.F.data)
                nattach += 1
            end
        end

        puts "Extracted #{nattach} attachments to '#{attachments_dir}'."
    end

    if @options[:fonts]
        nfonts = 0
        fonts_dir = File.join(OUTPUT_DIR, "fonts")
        Dir::mkdir(fonts_dir) unless File.directory?(fonts_dir)

        pdf.each_object.select {|obj| obj.is_a?(Stream)}.each do |stream|
            font = stream.xrefs.find{|obj| obj.is_a?(FontDescriptor)}
            if font
                font_file = File.join(fonts_dir, File.basename(font.FontName.value.to_s))
                File.binwrite(font_file, stream.data)
                nfonts += 1
            end
        end

        puts "Extracted #{nfonts} fonts to '#{fonts_dir}'."
    end

    if @options[:metadata]
        nmeta = 0
        metadata_dir = File.join(OUTPUT_DIR, "metadata")
        Dir::mkdir(metadata_dir) unless File.directory?(metadata_dir)

        pdf.each_object.select {|obj| obj.is_a?(MetadataStream)}.each do |stream|
            metadata_file = File.join(metadata_dir, "metadata_#{stream.reference.refno}.xml")
            File.binwrite(metadata_file, stream.data)
            nmeta += 1
        end

        puts "Extracted #{nmeta} metadata streams to '#{metadata_dir}'."
    end

    if @options[:images]
        nimages = 0
        image_dir = File.join(OUTPUT_DIR, "images")
        Dir::mkdir(image_dir) unless File.directory?(image_dir)

        pdf.each_object.select {|obj| obj.is_a?(Graphics::ImageXObject)}.each do |stream|
            begin
                ext, image_data = stream.to_image_file
                image_file = File.join(image_dir, "image_#{stream.reference.refno}.#{ext}")

                if ext != 'png' and stream.ColorSpace == Graphics::Color::Space::DEVICE_CMYK
                    STDERR.puts "Warning: file '#{image_file}' is intended to be viewed in CMYK color space."
                end

                File.binwrite(image_file, image_data)
                nimages += 1
            rescue
                STDERR.puts "Unable to decode image (stream #{stream.reference.refno}). #{$!.message}"
                STDERR.puts $!.backtrace.join($/)
            end
        end

        puts "Extracted #{nimages} images to '#{image_dir}'."
    end

rescue
    STDERR.puts $!.backtrace.join($/)
    abort "#{$!.class}: #{$!.message}"
end