#!/usr/bin/ruby

# Provenance: this script was added by commit
# af81254a4d31690a5dd13355109d3934aa17bac7
# (Raphael Reitzig, Mon, 28 Oct 2013 21:37:59 +0100):
#
#   Adds port of pdfinvert to Ruby, including improvements and new features.
#
#   * $DEBUG controls amount of files left for inspection.
#   * Arbitrarily large embedded images are now supported.
#   * User can specify which embedded images to convert.
#   * Code is cleaner and more robust than old one.
#   * Supports embedded image types other than PNG.

# Copyright 2013, Raphael Reitzig
#
#
# pdfinvert is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# pdfinvert is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with pdfinvert. If not, see <http://www.gnu.org/licenses/>.

# Inverts colors in a PDF, including embedded images. Can use replacement
# tables instead of inverting all colors. Can transform all embedded images
# or use rules to determine which remain unchanged. Can add page numbers (`-pn`).
#
# Color files have one rule per line. Each line has the format
#
#   dd #xxxxxx #xxxxxx
#
# where
#
#  * dd is an integer used for fuzzy color matching in embedded binary
#    images (see imagemagick documentation). Higher numbers mean colors
#    "farther away" from the specified source color are replaced.
#  * xxxxxx is an RGB color in hex. The first color is the source color, the
#    second the replacement color.
#
# So, for instance, the line
#
#   30 #ffffff #000000
#
# will replace white with black, and in embedded images also colors "30-close"
# to white.
# Note that rules are applied in sequence, from top to bottom.
#
# Image rule files have one line per page. Every line contains whitespace-separated
# zeros and ones; if the i-th digit is zero, the i-th embedded image on that
# page is not converted, otherwise it is. Illegal entries and missing numbers are
# considered to be one.
#
# For example, the file
#
#   1 0 0
#
#   1 1 0
#
# Means that images two and three on page one as well as image three on page three
# remain unchanged; all other images will be converted.
#
# Requirements:
#  * ruby
#  * inkscape
#  * imagemagick
#  * pdftk
#  * gs

require 'fileutils'

# When true, intermediate files are kept and a log file is written into the
# temporary directory. Edit this line to switch debugging on.
$DEBUG = false

USAGE = "Usage: pdfinvert [-pn] [-c <color file>] [-i <image rule file>] <input file> [output file]"

# # # # # # # # # # # # # # #
# Init
# # # # # # # # # # # # # # #

if ARGV.empty?
  puts USAGE
  exit
end

$pagenumbers = false
$colors = ""
$images = ""
$input = ""

# Read in command line parameters. The first non-option argument is the input
# file, any later one overrides the output file name.
args = ARGV.dup
until args.empty?
  arg = args.shift
  case arg
  when "-pn"
    $pagenumbers = true
  when "-c", "-i"
    value = args.shift
    if value.nil?
      # Without this check a trailing -c/-i stored nil and crashed later on.
      puts "Option '#{arg}' requires a file name."
      puts USAGE
      exit 1
    end
    if arg == "-c"
      $colors = value
    else
      $images = value
    end
  else
    if $input == ""
      $input = arg
      $filename = File.basename($input, ".pdf")
      $output = "#{$filename}_inverted.pdf"
    else
      $output = arg
    end
  end
end

# Verify that input file exists
# (File.exist?/Dir.exist? are used throughout: the plural aliases no longer
# exist in current Ruby releases.)
if $input == ""
  puts "Please provide an input file."
  exit 1
elsif !File.exist?($input)
  puts "File '#{$input}' does not exist."
  exit 1
end

$tmp = "/tmp/pdfinvert_#{$filename}"
$dir = Dir.pwd

# Ensure that temporary directory exists and is empty
if Dir.exist?($tmp)
  Dir["#{$tmp}/*"].each { |f| File.delete(f) }
else
  Dir.mkdir($tmp)
end

# Preprocess replacement colors file.
# $colorrules maps source color (six hex digits, lower case, no '#') to
# [fuzz, replacement color]; $colororder keeps the order of the rules.
$colorrules = {}
$colororder = []
if $colors != ""
  if File.exist?($colors)
    File.foreach($colors) do |line|
      fuzz, source, target = line.split
      next if fuzz.nil? # blank line

      # Lower-case the source so it can match the lower-case hex colors the
      # SVG pass looks for.
      source = source[1..6].downcase
      $colorrules[source] = [Integer(fuzz), target[1..6]]
      $colororder.push(source)
    end
  else
    puts "Color specification file '#{$colors}' does not exist. Inverting now."
    $colors = ""
  end
end

# Returns the color that should replace `color` (six hex digits, no '#').
# With a color file in effect, colors without a rule are returned unchanged;
# without one, the color is inverted.
def replacecolor(color)
  if $colors != ""
    rule = $colorrules[color]
    rule ? rule[1] : color
  else
    format("%06x", 0xFFFFFF - Integer(color, 16))
  end
end

# Parses one entry of an image rule line; illegal entries count as one.
def imageflag(token)
  Integer(token)
rescue ArgumentError
  1
end

# Preprocess image rule file.
# $imagerules[p][i] is the flag for the i-th image (zero-based) on page p
# (one-based); index 0 is a placeholder.
$imagerules = [[]]
if $images != ""
  if File.exist?($images)
    File.foreach($images).with_index(1) do |line, page|
      $imagerules[page] = line.split.map { |token| imageflag(token) }
    end
  else
    puts "Image rule file '#{$images}' does not exist. Converting all images now."
    $images = ""
  end
end

# Tells whether embedded image number `image` (zero-based) on page `page`
# should be converted. Images without a rule are converted.
def convertimage?(page, image)
  flags = $imagerules[page]
  return true if flags.nil? || image >= flags.size

  flags[image] != 0
end

# Function that returns page number inset for SVG.
# nr is the number to print, x/y the position of the text.
# NOTE(review): the element markup of this inset was not recoverable; this is
# a minimal centered <text> element lifted slightly above y -- confirm styling.
def pagenumber(nr, x, y)
  "  <g>\n" \
  "    <text x=\"#{x}\" y=\"#{y}\" dy=\"-10\" text-anchor=\"middle\" " \
  "font-family=\"sans-serif\" font-size=\"12\">#{nr}</text>\n" \
  "  </g>"
end

# Runs a shell command and returns everything it printed. The block form of
# IO.popen closes the pipe and waits for the child process.
def runcommand(cmd)
  IO.popen(cmd, &:read)
end

# Decodes the base64 image `data` into the file "<stem>.<imgtype>", inverts
# or replaces its colors with imagemagick and encodes it again.
# Returns [new base64 data, log output of the commands that were run].
def convertembeddedimage(stem, imgtype, data)
  b64 = "#{stem}.b64"
  img = "#{stem}.#{imgtype}"
  log = ""

  # Going through files keeps arbitrarily large images off the command line.
  File.write(b64, data)

  # Convert base 64 string to image
  log += runcommand("base64 -d #{b64} > #{img} 2>&1")

  # Invert/replace colors
  if $colors == ""
    log += runcommand("convert #{img} -negate #{img} 2>&1")
  else
    $colororder.each do |color|
      fuzz = $colorrules[color][0]
      log += runcommand("convert #{img} -fuzz #{fuzz}% " \
                        "-fill \"##{replacecolor(color)}\" -opaque \"##{color}\" " \
                        "#{img} 2>&1")
    end
  end

  # Convert back to base 64
  log += runcommand("base64 #{img} > #{b64} 2>&1")
  converted = File.read(b64)

  # Cleanup
  FileUtils.rm(b64) unless $DEBUG
  FileUtils.rm(img) unless $DEBUG

  [converted, log]
end

# This function processes the given file: the single-page PDF `file` is
# converted to SVG, its colors (and those of embedded images) are changed,
# and the result is written back to `file` as an A4 PDF.
# Returns the log output collected on the way.
def invert(file)
  log = "Inverting #{file}...\n"
  basename = File.basename(file, ".pdf")

  # NOTE(review): -l (here) and -A (below) look like the export options of
  # older Inkscape releases; newer ones may need different flags -- confirm.
  log += runcommand("inkscape -l #{basename}.svg #{file} 2>&1")
  FileUtils.rm(file) unless $DEBUG

  # Change size from US Letter to A4 (may want to generalise?):
  #sed -e 's/width="765"/width="210mm"/;s/height="990"/height="297mm"/' ${1%.pdf}.svg \
  #  > ${1%.pdf}a4.svg;
  #mv ${1%.pdf}a4.svg ${1%.pdf}.svg;
  # This does not rescale/fit!

  svg = File.readlines("#{basename}.svg")
  FileUtils.rm("#{basename}.svg") unless $DEBUG

  # The page number is taken from the digits in the file name.
  pnr = file.gsub(/[^0-9]/, "").to_i
  pny = nil
  pnx = nil
  imgctr = -1
  svg.map! do |line|
    # Replace colors of SVG elements as specified
    line.gsub!(/#([0-9a-f]{6})/) { "##{replacecolor(Regexp.last_match(1))}" }

    # Replace colors in binary images as specified
    line.gsub!(/"data:image\/(\w+?);base64,(.*)"/) do
      # Grab the captures right away, before anything else can match.
      imgtype = Regexp.last_match(1)
      data = Regexp.last_match(2)
      imgctr += 1

      if convertimage?(pnr, imgctr)
        data, imglog = convertembeddedimage("#{basename}_#{imgctr}", imgtype, data)
        log += imglog
      end
      # Otherwise the old image is left in place.
      "\"data:image/#{imgtype};base64,#{data}\""
    end

    # Add page number (if requested)
    if $pagenumbers
      # Find out (--> page number position) and change document height.
      # Need room for page number! No worry, we resize later, anyway.
      if pny.nil?
        line.gsub!(/height="(\d+)"/) do
          newheight = Integer(Regexp.last_match(1), 10) + 30
          pny = newheight.to_s
          "height=\"#{newheight}\""
        end
      end

      # Find out document width (--> page number position)
      if pnx.nil? && /width="(\d+)"/ =~ line.strip
        pnx = (Integer(Regexp.last_match(1), 10) / 2).to_s
      end

      # Insert the inset right before the closing tag of the document. (An
      # empty search string would match at every position of every line.)
      line.gsub!("</svg>", "\n#{pagenumber(pnr, pnx, pny)}\n</svg>")
    end
    line
  end

  File.write("#{basename}_inv.svg", svg.join)

  log += runcommand("inkscape -A #{file} #{basename}_inv.svg 2>&1")
  FileUtils.rm("#{basename}_inv.svg") unless $DEBUG

  # Change PDF size to A4. Nasty workaround.
  log += runcommand("gs -sOutputFile=#{basename}a4.pdf -dBATCH -dNOPAUSE -sDEVICE=pdfwrite -sPAPERSIZE=a4 " \
                    "-dFIXEDMEDIA -dPDFFitPage -q -f #{file} 2>&1")
  FileUtils.mv("#{basename}a4.pdf", file)

  log += "Done inverting #{file}.\n"
  log
end

# # # # # # # # # # # # # # #
# Actual Work
# # # # # # # # # # # # # # #

FileUtils.cp($input, $tmp)
Dir.chdir($tmp)
$input = File.basename($input)

# Split the document into one PDF per page. The argument list form bypasses
# the shell, so file names with spaces or shell metacharacters are safe.
system("pdftk", $input, "burst", "output", "#{$tmp}/input_%04d.pdf", out: File::NULL)
FileUtils.rm($input) unless $DEBUG

$log = ""

# Invert all pages
pages = Dir["input_*"].sort
begin
  gem "parallel"
  require 'parallel'

  $log += Parallel.map(pages) { |f| invert(f) }.join("\n")
rescue Gem::LoadError
  # Fall back to sequential processing if gem is not available
  $log += "Hint: install gem 'parallel' to speed up jobs with many pages!\n\n"
  pages.each { |f| $log += invert(f) }
end

# Write log (for debugging)
File.write("log", $log) if $DEBUG

# Join pages together again
`pdftk input_*.pdf cat output output.pdf allow AllFeatures`

# # # # # # # # # # # # # # #
# Wrap-up
# # # # # # # # # # # # # # #

Dir.chdir($dir)
FileUtils.cp("#{$tmp}/output.pdf", $output)