Skip to content

Commit

Permalink
Adds port of pdfinvert to Ruby, including improvements and new features.
Browse files Browse the repository at this point in the history
 * $DEBUG controls amount of files left for inspection.
 * Arbitrarily large embedded images are now supported.
 * User can specify which embedded images to convert.
 * Code is cleaner and more robust than old one.
 * Supports embedded image types other than PNG.
  • Loading branch information
Raphael Reitzig committed Oct 28, 2013
1 parent f519405 commit af81254
Showing 1 changed file with 351 additions and 0 deletions.
351 changes: 351 additions & 0 deletions pdfinvert.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,351 @@
#!/usr/bin/ruby

# Copyright 2013, Raphael Reitzig
# <code@verrech.net>
#
# pdfinvert is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# pdfinvert is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with pdfinvert. If not, see <http://www.gnu.org/licenses/>.

# Inverts colors in a PDF, including embedded images. Can use replacement
# tables instead of inverting all colors. Can transform all embedded images
# or use rules to determine which remain unchanged. Can add page numbers (`-pn`).
#
# Color files have one rule per line. Each line has the format
#
# dd #xxxxxx #xxxxxx
#
# where
#
# * dd is an integer used for fuzzy color matching in embedded binary
# images (see imagemagick documentation). Higher numbers mean colors
# "farther away" from the specified source color are replaced.
# * xxxxxx is an RGB color in hex. The first color is the source color, the
# second the replacement color.
#
# So, for instance, the line
#
# 30 #ffffff #000000
#
# will replace white with black, and in embedded images also colors "30-close"
# to white.
# Note that rules are applied in sequence, from top to bottom.
#
# Image rule files have one line per page. Every line contains whitespace-separated
# zeros and ones; if the i-th digit is zero, the i-th embedded image on that
# page is not converted, otherwise it is. Illegal entries and missing numbers are
# considered to be one.
#
# For example, the file
#
# 1 0 0
#
# 1 1 0
#
# means that images two and three on page one as well as image three on page three
# remain unchanged; all other images will be converted.
#
# Requirements:
# * ruby
# * inkscape
# * imagemagick
# * pdftk
# * gs

require 'fileutils'

$DEBUG = false

# # # # # # # # # # # # # # #
# Init
# # # # # # # # # # # # # # #

# Called without any arguments: print the usage line and quit.
if ARGV.empty?
  puts "Usage: pdfinvert [-pn] [-c <color file>] [-i <image rule file>] <input file> [output file]"
  exit
end

$pagenumbers = false
$colors = ""
$images = ""
$input = ""

# Fills the global option variables from the given argument list.
#
# argv - array of command line arguments (not modified)
#
# Recognized options:
#   -pn         sets $pagenumbers
#   -c <file>   stores <file> in $colors
#   -i <file>   stores <file> in $images
# The first remaining argument becomes $input (and determines $filename and
# the default $output); any later remaining argument overrides $output.
#
# Prints a message and exits with status 1 when -c or -i is the last
# argument, i.e. when its file name is missing.
def parse_arguments(argv)
  pending = argv.dup
  until pending.empty?
    arg = pending.shift
    case arg
    when "-pn"
      $pagenumbers = true
    when "-c", "-i"
      # The option consumes the following argument as its value.
      value = pending.shift
      if value.nil?
        puts "Option '#{arg}' requires a file name."
        exit 1
      end
      if arg == "-c"
        $colors = value
      else
        $images = value
      end
    else
      if $input == ""
        $input = arg
        $filename = File.basename($input, ".pdf")
        $output = "#{$filename}_inverted.pdf"
      else
        $output = arg
      end
    end
  end
end

# Read in command line parameters
parse_arguments(ARGV)

# Verify that input file exists.
# File.exist? is used because File.exists? is no longer available in
# current Ruby versions.
if $input.to_s.empty?
  puts "Please provide an input file."
  Process.exit
elsif !File.exist?($input)
  puts "File '#{$input}' does not exist."
  Process.exit
end

# Scratch directory for this document and the directory we were started in
# (needed later to resolve the output path).
$tmp = "/tmp/pdfinvert_#{$filename}"
$dir = Dir.pwd

# Ensure that temporary directory exists and is empty.
# Entries are listed explicitly (instead of globbing) so that hidden files
# and leftover subdirectories are removed as well, and so that special
# characters in the file name cannot be misread as a glob pattern.
if File.directory?($tmp)
  (Dir.entries($tmp) - %w[. ..]).each { |entry| FileUtils.rm_rf(File.join($tmp, entry)) }
else
  FileUtils.mkdir_p($tmp)
end

# Preprocess replacement colors file
#
# $colorrules maps a source color (six lowercase hex digits, no '#') to
# [fuzz, replacement color]; $colororder lists the source colors in the
# order in which their rules appear in the file.
$colorrules = {}
$colororder = []

# Parses one line of a color file ("dd #xxxxxx #xxxxxx").
#
# Returns [fuzz, source, replacement] with both colors lowercased (the SVG
# color pattern used when inverting only matches lowercase hex digits), or
# nil if the line does not have the expected format.
def parse_color_rule(line)
  match = /\A(\d+)\s+#(\h{6})\s+#(\h{6})(?:\s|\z)/.match(line.strip)
  return nil unless match
  # Explicit base 10 so that a fuzz like "030" is not read as octal.
  [Integer(match[1], 10), match[2].downcase, match[3].downcase]
end

# Reads all rules from the color file at +path+ into $colorrules and
# $colororder. Blank lines are skipped; malformed lines are reported and
# skipped instead of aborting the program.
def load_color_rules(path)
  File.foreach(path) do |line|
    next if line.strip.empty?
    rule = parse_color_rule(line)
    if rule.nil?
      puts "Ignoring malformed color rule '#{line.strip}'."
      next
    end
    fuzz, source, replacement = rule
    $colorrules[source] = [fuzz, replacement]
    $colororder.push(source)
  end
end

# An unset color file is treated like "no color file given".
$colors = "" if $colors.nil?
if $colors != ""
  if File.exist?($colors)
    load_color_rules($colors)
  else
    puts "Color specification file '#{$colors}' does not exist. Inverting now."
    $colors = ""
  end
end

# Maps a color (six hex digits, no leading '#') to its replacement.
#
# Without a color file ($colors is the empty string) the color is inverted
# channel-wise. Otherwise the replacement table is consulted and colors
# without a rule are returned unchanged.
def replacecolor(color)
  if $colors == ""
    return format("%06x", 0xFFFFFF - Integer("0x#{color}", 16))
  end

  $colorrules.include?(color) ? $colorrules[color][1] : color
end

# Preprocess image rule file
#
# $imagerules[p] holds the flags for page p (pages are counted from one, so
# index zero is an unused placeholder); entry i of a page belongs to the
# (i+1)-th embedded image on that page.
$imagerules = [[]]

# Parses one line of an image rule file into an array of integers.
# Entries that are not valid integers count as one.
def parse_image_rule(line)
  line.strip.split(/\s+/).map do |token|
    begin
      Integer(token)
    rescue ArgumentError
      1
    end
  end
end

# Appends the rules of every line in the file at +path+ to $imagerules,
# one entry per line/page.
def load_image_rules(path)
  File.foreach(path) { |line| $imagerules.push(parse_image_rule(line)) }
end

# An unset rule file is treated like "no rule file given".
$images = "" if $images.nil?
if $images != ""
  if File.exist?($images)
    load_image_rules($images)
  else
    puts "Image rule file '#{$images}' does not exist. Converting all images now."
    $images = ""
  end
end

# Tells whether the embedded image with (zero-based) index +image+ on page
# +page+ should be converted. Images without an entry in $imagerules are
# always converted; otherwise any entry other than zero means "convert".
def convertimage?(page, image)
  has_rule = $imagerules.size > page && $imagerules[page].size > image
  return true unless has_rule

  $imagerules[page][image] != 0
end

# Function that returns page number inset for SVG
def pagenumber(nr, x, y)
return " <g id=\"pagenumberg\">\n" +
" <text xml:space=\"preserve\" style=\"font-size:25px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#888888;fill-opacity:1;stroke:none;font-family:Sans;\" " +
"x=\"#{x}\" y=\"#{y}\" id=\"pagenumbert\" >\n" +
" <tspan id=\"pagenumberts\" x=\"#{x}\" y=\"#{y}\" style=\"font-weight:1;font-style:normal;font-stretch:normal;font-variant:normal;font-size:25px;font-family:Sans;\">\n" +
" #{nr}\n" +
" </tspan>\n" +
" </text>\n" +
" </g>";
end

# Runs +command+ through the shell and returns everything it wrote to the
# pipe. The block form of IO.popen closes the pipe and waits for the child
# process, so later steps never work on files that are still being written
# and no pipes are left open.
def run_command(command)
  IO.popen(command) { |io| io.read }
end

# Inverts/recolors a single embedded image.
#
# prefix  - name (without extension) used for the scratch files
# imgtype - image subtype from the data URI; used as file extension
# data    - base64 text of the original image
#
# Returns [base64 text of the converted image, output logged by the tools].
# The scratch files are kept when $DEBUG is set.
def recolor_embedded_image(prefix, imgtype, data)
  b64file = "#{prefix}.b64"
  imgfile = "#{prefix}.#{imgtype}"
  log = String.new

  # The data travels through files, not through command line arguments.
  File.open(b64file, "w") { |f| f.write(data) }

  # Convert base 64 string to image.
  # "2>&1 > file" sends error messages to the log and only the payload
  # to the file.
  log << run_command("base64 -d #{b64file} 2>&1 > #{imgfile}")

  # Invert/replace colors
  if $colors == ""
    log << run_command("convert #{imgfile} -negate #{imgfile} 2>&1")
  else
    $colororder.each do |color|
      fuzz = $colorrules[color][0]
      log << run_command("convert #{imgfile} -fuzz #{fuzz}% " +
                         "-fill \"##{replacecolor(color)}\" -opaque \"##{color}\" " +
                         "#{imgfile} 2>&1")
    end
  end

  # Convert back to base 64
  log << run_command("base64 #{imgfile} 2>&1 > #{b64file}")
  converted = File.read(b64file)

  # Cleanup
  FileUtils.rm(b64file) if !$DEBUG
  FileUtils.rm(imgfile) if !$DEBUG

  [converted, log]
end

# This function processes the given file (a single-page PDF in the current
# directory): the page is converted to SVG with inkscape, colors of SVG
# elements and of embedded images are replaced, a page number is added if
# requested, and the result is written back to +file+ as an A4 PDF.
#
# file - name of the PDF; the digits in the name are taken as page number
#
# Returns the collected output of all external tools as a string.
# Intermediate files are kept when $DEBUG is set.
def invert(file)
  log = "Inverting #{file}...\n"
  basename = File.basename(file, ".pdf")
  svgfile = "#{basename}.svg"
  invfile = "#{basename}_inv.svg"

  log << run_command("inkscape -l #{svgfile} #{file} 2>&1")
  FileUtils.rm(file) if !$DEBUG

  # Change size from US Letter to A4 (may want to generalise?):
  #sed -e 's/width="765"/width="210mm"/;s/height="990"/height="297mm"/' ${1%.pdf}.svg \
  # > ${1%.pdf}a4.svg;
  #mv ${1%.pdf}a4.svg ${1%.pdf}.svg;
  # This does not rescale/fit!

  svg = File.readlines(svgfile)
  FileUtils.rm(svgfile) if !$DEBUG

  pnr = file.gsub(/[^0-9]/, "").to_i
  pny = nil
  pnx = nil
  imgctr = -1
  # All substitutions below modify the lines in place.
  svg.each do |line|
    # Replace colors of SVG elements as specified
    line.gsub!(/#([0-9a-f]{6})/) { "##{replacecolor($~[1])}" }

    # Replace colors in binary images as specified.
    # [^"]* keeps the match inside the attribute value even if further
    # attributes follow on the same line.
    line.gsub!(/"data:image\/(\w+?);base64,([^"]*)"/) do
      # Grab the captures first; they must not be lost to later matches.
      imgtype = $~[1]
      data = $~[2]
      imgctr += 1
      if convertimage?(pnr, imgctr)
        converted, imglog = recolor_embedded_image("#{basename}_#{imgctr}", imgtype, data)
        log << imglog
        "\"data:image/#{imgtype};base64,#{converted}\""
      else
        # Leave old image in place
        "\"data:image/#{imgtype};base64,#{data}\""
      end
    end

    # Add page number (if requested)
    if $pagenumbers
      # Find out (--> page number position) and change document height.
      # Need room for page number! No worry, we resize later, anyway.
      if pny.nil?
        line.gsub!(/height="(\d+)"/) do
          newheight = $~[1].to_i + 30
          pny = newheight.to_s
          "height=\"#{newheight}\""
        end
      end

      # Find out document width (--> page number position)
      if pnx.nil? && /width="(\d+)"/ =~ line.strip
        pnx = ($~[1].to_i / 2).to_s
      end

      line.gsub!("</svg>", "\n#{pagenumber(pnr, pnx, pny)}\n</svg>")
    end
  end

  File.open(invfile, "w") { |f| f.write(svg.join) }

  log << run_command("inkscape -A #{file} #{invfile} 2>&1")
  FileUtils.rm(invfile) if !$DEBUG

  # Change PDF size to A4. Nasty workaround.
  log << run_command("gs -sOutputFile=#{basename}a4.pdf -dBATCH -dNOPAUSE -sDEVICE=pdfwrite -sPAPERSIZE=a4 " +
                     "-dFIXEDMEDIA -dPDFFitPage -q -f #{file} 2>&1")
  FileUtils.mv("#{basename}a4.pdf", file)

  log << "Done inverting #{file}.\n"
  return log
end

# # # # # # # # # # # # # # #
# Actual Work
# # # # # # # # # # # # # # #

FileUtils.cp($input, $tmp)
Dir.chdir($tmp)
$input = File.basename($input)

# Split the document into one PDF per page. The command is passed as an
# argument list, i.e. without a shell in between, so file names containing
# spaces or shell metacharacters are handled correctly.
IO.popen(["pdftk", $input, "burst", "output", "#{$tmp}/input_%04d.pdf"]) { |io| io.read }
FileUtils.rm($input) if !$DEBUG

$log = String.new

# Use gem 'parallel' if it can be loaded
begin
  gem "parallel"
  require 'parallel'
  use_parallel = true
rescue LoadError
  # Also covers Gem::LoadError, which is what `gem` raises.
  use_parallel = false
end

# Invert all pages
pages = Dir["input_*"].sort
if use_parallel
  $log << Parallel.map(pages) { |f| invert(f) }.join("\n")
else
  # Fall back to sequential processing if gem is not available
  $log << "Hint: install gem 'parallel' to speed up jobs with many pages!\n\n"
  pages.each { |f| $log << invert(f) }
end

# Write log (for debugging)
File.open("log", "w") { |f| f.write($log) } if $DEBUG

# Join pages together again (sorted by name, as a shell glob would do)
IO.popen(["pdftk", *Dir["input_*.pdf"].sort, "cat", "output", "output.pdf",
          "allow", "AllFeatures"]) { |io| io.read }

# # # # # # # # # # # # # # #
# Wrap-up
# # # # # # # # # # # # # # #

Dir.chdir($dir)
FileUtils.cp("#{$tmp}/output.pdf", $output)

# Do not leave copies of the document behind, unless they are wanted for
# inspection.
FileUtils.rm_rf($tmp) if !$DEBUG

0 comments on commit af81254

Please sign in to comment.