armaab · zach-m · Jul 18, 2024 · Jul 18, 2024 · Oct 19, 2024
diff --git a/README.md b/README.md
@@ -35,14 +35,49 @@ otherwise it is closed. The exclamation mark indicates the end of those asterisk
 the beginning of the bookmark title. There are one or more spaces, i.e. ' ', after the title,
 then follows the page number. In summary, each line of toc
 file should match the regular expression `(^\**)(1?)!(.+?)\s+(-?[0-9]+)\s*$`.
+
+## Alternative Toc file format (Tab-delimited)
+If preferred, a tab-delimited format can be used instead as the toc file (by passing `--tsv` to the command line).
+It has a fixed number of columns and is hence easier to edit in Excel or other tabular editors. The columns are:
+```
+Level,IsOpen,Title,Page
+```
+
+The tab-delimited equivalent of the above toc would be (ignore the empty "header" line):
+
+|||||
+|-|-|-|-|
+|0||Contents|1|
+|0||0. Introduction|2|
+|0||1. First section|3|
+|1|*|1.1 Subsection|3|
+|2||1.1.1 Subsubsection|3|
+|2||1.1.2 Another subsubsection|5|
+|1||1.2 Another subsection|7|
+|0||2. Second section|8|
+|1||2.1 Subsection|3|
+|1||2.2 Subsection|3|
+|0||3. Third section|8|
+
+Where:
+- `Level` is a non-empty zero-based integer indicating the depth of the current item in the bookmarks
+- `IsOpen`, if not blank, marks this entry as opened by default; it can be any **single character**
+- The file can't contain headers, comments, or empty lines.
+
+Technically, each line of this toc format should match the regular expression `^([0-9]+)\t(.?)\t(.+?)\t(-?[0-9]+)$`.
+
 # Usage
 ```
-$ pdfmark --in <input> --toc <toc-file> --out <output> [--offset <offset>]
+$ pdfmark --in <input> --toc <toc-file> --out <output> [--offset <offset>] [--tsv] [--page <page>] [--fit page|width] [--print-pdfmarks]
 ```
-Where `<input>`, and `<output>` are input PDF and output PDF, `<toc-file>`
-is the toc file as described above, and the option `<offset>` is optional, it
-stands for the offset that should be added to the page numbers in toc file in order
-to get the real page number in the PDF file.
+Where:
+- `<input>` and `<output>` are the input PDF and the output PDF
+- `<toc-file>` is the toc file as described above
+- `<offset>` (optional) stands for the offset that should be added to the page numbers in toc file in order to get the real page number in the PDF file
+- `--tsv` (optional) indicates that the toc file format is tab-delimited
+- `<page>` (optional) sets the default page to display when the PDF opens (defaults to 1)
+- `--fit` (optional) sets the default zoom for when the PDF opens, can be either `page` or `width`
+- `--print-pdfmarks` (optional) is for debugging purposes, prints `pdfmarks` and exits (doesn't create an output PDF)
 
 [1]: http://blog.tremily.us/posts/PDF_bookmarks_with_Ghostscript/
 [2]: http://ghostscript.com/
diff --git a/pdfmark.py b/pdfmark.py
@@ -17,7 +17,13 @@ def tounicode(s):
             s = s.replace(x, y)
         return '({})'.format(s)
 
-def parsetoc(s):
+def unquote(s: str):
+    """Return s without its enclosing double quotes, if it both starts and ends with one (Excel adds such quotes around titles with commas when editing tab-delimited files)."""
+    if s.startswith('"') and s.endswith('"'):
+        return s[1:-1]
+    return s
+
+def parsetoc(s, legacy_format=True):
     '''Parse toc file.
 
     Args:
@@ -29,7 +35,7 @@ def parsetoc(s):
 
         {'count': 1,
          'flag': '',
-          'title': 'Some title',
+         'title': 'Some title',
          'page': 10}
 
          If there is an error in the toc file, then a tuple is
@@ -42,7 +48,10 @@ def parsetoc(s):
          the content of that line is 'Contents 4'.
     '''
     import re
-    regexp = re.compile(r'(^\**)(1*)!(.+?)\s+(-?[0-9]+)\s*$')
+    if legacy_format:
+        regexp = re.compile(r'(^\**)(1*)!(.+?)\s+(-?[0-9]+)\s*$')
+    else:
+        regexp = re.compile(r'^([0-9]+)\t(.?)\t(.+?)\t(-?[0-9]+)$')
     lastlevel = 0
     res, lines = [], []
     i = 0
@@ -51,9 +60,9 @@ def parsetoc(s):
         m = regexp.match(l)
         if m is None:
             return (j, l)
-        level = len(m.group(1))
+        level = len(m.group(1)) if legacy_format else int(m.group(1))
         res.append({'count': 0, 'flag': '' if m.group(2) else '-',
-            'title': m.group(3), 'page': int(m.group(4))})
+            'title': unquote(m.group(3)), 'page': int(m.group(4))})
 
         if level > lastlevel + 1:
             return (j, l)
@@ -117,22 +126,39 @@ def gen_pdfmarks(infos, offset=0):
             help='path to toc file')
     parser.add_argument('--offset', dest='offset', type=int, default=0,
             help='offset of page numbers')
-    parser.add_argument('--gs', dest='gs', default=GS,
+    parser.add_argument('--tsv', action='store_true',
+            help='use tab-delimited format for TOC file')
+    parser.add_argument('--page', type=int, default=1,
+            help='default page to show when pdf opens')
+    parser.add_argument('--fit', choices=["page", "width"],
+            help='default zoom when pdf opens')
+    parser.add_argument('--gs', default=GS,
             help='path to the gs (ghostscript) excutable')
     parser.add_argument('--print-pdfmarks', dest='marks', action='store_true',
-            help='print pdfmarks to the standard output')
+            help='print pdfmarks to the standard output and exit')
 
     args = parser.parse_args()
     s = []
     with open(args.toc, 'r') as f:
-        infos = parsetoc(f)
+        infos = parsetoc(f, legacy_format=not args.tsv)
     if isinstance(infos, tuple):
         print('Error on line {} in {}:\n{}'.format(infos[0]+1, args.toc,infos[1]))
         exit(1)
-    marks = '\n'.join(row for row in gen_pdfmarks(infos, args.offset))
+
+    page_str = " /Page " + str(args.page)
+    fit_str = " /View [/Fit] " if args.fit == "page" else " /View [/FitH -32768] " if args.fit == "width" else ""
+    marks = (
+        "[/PageMode /UseOutlines"
+        + page_str
+        + fit_str
+        + " /DOCVIEW pdfmark\n"
+        + "\n".join(row for row in gen_pdfmarks(infos, args.offset))
+    )
     if args.marks:
-        print(marks)
+        for mark in marks.split("\n"):
+            print(mark)
         exit()
+
     marks = '/pdfmark { originalpdfmark } bind def' + marks
     marks = marks.encode()