forked from jdmonaco/pdf-title-rename
-
Notifications
You must be signed in to change notification settings - Fork 2
/
pdf-rename-batch.py
165 lines (138 loc) · 5.18 KB
/
pdf-rename-batch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env python
"""
Extract title from PDF file.
Depends on: PyPDF2, PDFMiner.
Usage:
find . -name "*.pdf" | xargs -I{} pdf-rename-batch.py -d tmp --rename {}
"""
import io # cStringIO in python2
import getopt
import os
import re
import string
import sys
from PyPDF2 import PdfFileReader
from PyPDF2.utils import PdfReadError
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf, PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFSyntaxError
# Public API: pdf_title is the only name exported by a star-import of this module.
__all__ = ['pdf_title']
def check_contain_chinese(check_str):
    """Tell whether ``check_str`` holds at least one Chinese character.

    A character counts when its code point lies in the inclusive range
    U+4E00..U+9FFF.
    """
    for symbol in check_str:
        if '\u4e00' <= symbol <= '\u9fff':
            return True
    return False
def check_contain_number(check_str):
    """Tell whether ``check_str`` holds at least one digit character.

    Uses ``str.isdigit`` per character, so non-ASCII digits count as well.
    """
    for symbol in check_str:
        if symbol.isdigit():
            return True
    return False
def sanitize(filename):
    """Reduce a string to characters that are safe in a file name.

    Only ASCII letters, digits, space and the punctuation ``-_.()`` survive;
    every other character is dropped (not replaced).
    """
    allowed = "-_.() " + string.ascii_letters + string.digits
    kept = filter(lambda symbol: symbol in allowed, filename)
    return "".join(kept)
# Characters removed from Chinese titles: the punctuation ? . 。 ! / ; : * > < ~
# ( ) [ ] plus every ASCII letter and digit.  Written as a raw-string character
# class so the pattern holds no invalid string escapes (which warn on modern
# Python) and is compiled only once.
_CHINESE_TITLE_JUNK = re.compile(r'[?.。!/;:*><~()\[\]A-Za-z0-9]')


def sanitize_chinese(filename):
    """Strip punctuation and ASCII letters/digits from a Chinese title.

    Every other character (Chinese text, underscores, spaces, ...) is kept
    unchanged and in order.
    """
    return _CHINESE_TITLE_JUNK.sub('', filename)
def meta_title(filename):
    """Title from pdf metadata.

    Opens ``filename`` in binary mode and asks PyPDF2 for the document
    information.  Returns the metadata title, or "" when the document has no
    information dictionary, the title is missing/empty, or PyPDF2 raises
    PdfReadError.
    """
    try:
        # The context manager closes the handle even when PyPDF2 raises.
        with open(filename, 'rb') as fp:
            docinfo = PdfFileReader(fp).getDocumentInfo()
            # getDocumentInfo() may return None (no info dictionary); read the
            # title while the handle is still open in case PyPDF2 resolves
            # the value lazily from the stream.
            title = docinfo.title if docinfo is not None else None
    except PdfReadError:
        print(">>>>>Can't not read doc meta info")
        return ""
    print('===docinfo===', title)
    return title if title else ""
def copyright_line(line):
    """Judge if a line is copyright/venue boilerplate rather than a title.

    The line is lower-cased and searched for one of the marker phrases.
    Returns the ``re`` match object (truthy) when a marker is found,
    otherwise None.
    """
    markers = re.compile(r'technical\s+report|proceedings|preprint|to\s+appear|submission')
    lowered = line.lower()
    return markers.search(lowered)
def empty_str(s):
    """Return True when ``s`` is empty or consists only of whitespace."""
    return not s.strip()
def pdf_text(filename):
    """Extract text from a PDF via PDFMiner.

    ``process_pdf`` is called with ``maxpages=1`` and an empty password, and
    the converted text is collected in an in-memory buffer.  Returns that
    text, or "" when PDFSyntaxError, PDFTextExtractionNotAllowed or
    UnicodeEncodeError is raised along the way.
    """
    try:
        text = io.StringIO()
        rsrc = PDFResourceManager()
        device = TextConverter(rsrc, text, laparams=LAParams())  # no codec='utf-8' in TextConverter
        try:
            # Close the input file as soon as processing ends, success or not.
            with open(filename, 'rb') as fp:
                process_pdf(rsrc, device, fp, None, maxpages=1, password='')
        finally:
            # Release the converter even when processing fails.
            device.close()
        content = text.getvalue()
        # Kept inside the try: printing can itself raise UnicodeEncodeError
        # on consoles that cannot encode the extracted text.
        print('===text.getvalue===', content)
        return content
    except (PDFSyntaxError, PDFTextExtractionNotAllowed, UnicodeEncodeError):
        print(">>>>>Can't read doc's text info")
        return ""
def title_start(lines):
    """Index of the first line that is neither blank nor copyright boilerplate.

    Falls back to 0 when no line qualifies (including an empty ``lines``).
    """
    candidates = (
        idx
        for idx, text in enumerate(lines)
        if not (empty_str(text) or copyright_line(text))
    )
    return next(candidates, 0)
def title_end(lines, start, max_lines=2):
    """Index just past the title that begins at ``start``.

    Looks at up to ``max_lines`` lines following ``start`` and returns the
    index of the first blank one; when none of them is blank the title is
    taken to be the single line at ``start`` and ``start + 1`` is returned.
    """
    first = start + 1
    window = lines[first:first + max_lines]
    for offset, text in enumerate(window):
        if empty_str(text):
            return first + offset
    return first
def title_start_end(lines, max_lines=100, line_range=4):
    """Guess the slice ``[start, end)`` of ``lines`` that holds the title.

    Only the first ``max_lines`` lines are scanned.  The first acceptable
    line becomes ``start``:

    * a line containing Chinese is accepted when it has no space and no
      digit; the slice then spans ``line_range`` lines;
    * any other line is accepted when it is not blank, not copyright
      boilerplate and has no digit; the slice then spans
      ``line_range - 1`` lines.

    When nothing is accepted the result is
    ``(max_lines - line_range, max_lines)``.  The chosen bounds are also
    printed for debugging.
    """
    first, last = max_lines - line_range, max_lines
    for idx, text in enumerate(lines[:max_lines]):
        if check_contain_chinese(text):
            # Chinese title line: must be one unbroken run without digits.
            if " " not in text and not check_contain_number(text):
                first, last = idx, idx + line_range
                break
        elif not (empty_str(text) or copyright_line(text) or check_contain_number(text)):
            # Other languages: first meaningful, digit-free line.
            first, last = idx, idx + line_range - 1
            break
    print('=start=%s==end=%s' % (first, last))
    return first, last
def text_title(filename):
    """Extract title from PDF's text.

    The text returned by ``pdf_text`` is split into lines, the title range is
    located with ``title_start_end`` (scanning 20 lines, range 5), and the
    stripped lines of that range are joined with single spaces.  The result
    is printed for debugging and returned.
    """
    rows = pdf_text(filename).strip().split('\n')
    begin, stop = title_start_end(rows, max_lines=20, line_range=5)
    guess = ' '.join(row.strip() for row in rows[begin:stop])
    print('====return===', guess)
    return guess
def valid_title(title):
    """Accept a title that is non-blank and has no file-extension-like suffix.

    The suffix is whatever ``os.path.splitext`` reports as the extension, so
    a candidate such as ``"paper.doc"`` is rejected.
    """
    if empty_str(title):
        return False
    extension = os.path.splitext(title)[1]
    return empty_str(extension)
def pdf_title(filename):
    """Best available title for the PDF at ``filename``.

    Tries the metadata title first, then the title guessed from the text;
    the first one accepted by ``valid_title`` wins.  If neither is valid the
    file's base name without its extension is returned.
    """
    for extract in (meta_title, text_title):
        candidate = extract(filename)
        if valid_title(candidate):
            return candidate
    stem, _extension = os.path.splitext(filename)
    return os.path.basename(stem)
def main(argv=None):
    """Command-line entry point: derive a title for each PDF and rename it.

    Options:
        -n, --dry-run   rename nothing; print only the extracted title.  A
                        later --rename switches to printing the planned
                        "old => new" mapping, still without renaming.
        --rename        rename the files (this is the default).
        -d DIR          directory the renamed files are placed in; without
                        it the new name is relative to the current directory.

    ``argv`` defaults to ``sys.argv[1:]``.  Returns the process exit status:
    0 on success, 1 for a usage error or a missing output directory.
    """
    if argv is None:
        argv = sys.argv[1:]
    usage = "Usage: %s [-d output] [--dry-run] [--rename] filenames" % sys.argv[0]

    try:
        opts, args = getopt.getopt(argv, 'nd:', ['dry-run', 'rename'])
    except getopt.GetoptError as err:
        # Unknown option or missing option argument: report it, don't traceback.
        print(err)
        print(usage)
        return 1

    dry_run = False
    rename = True
    out_dir = ""  # empty => os.path.join leaves the new name relative to cwd
    for opt, arg in opts:
        if opt in ('-n', '--dry-run'):
            dry_run = True
            rename = False
        elif opt == '--rename':
            rename = True
        elif opt == '-d':
            out_dir = arg

    if not args:
        print(usage)
        return 1

    if rename and out_dir and not os.path.isdir(out_dir):
        # Fail before touching any file instead of crashing inside os.rename.
        print("*** Output directory %s does not exist! ***" % out_dir)
        return 1

    for filename in args:
        title = pdf_title(filename)
        if not rename:
            print(title)
            continue

        if check_contain_chinese(title):
            title = sanitize_chinese('_'.join(title.split()))  # for Chinese
        else:
            title = sanitize(' '.join(title.split()))  # for other languages
        if not title.strip():
            # Sanitizing removed everything; keep the original base name
            # rather than renaming the file to a bare ".pdf".
            title = os.path.basename(os.path.splitext(filename)[0])

        # Honour -d: the option used to be parsed but never applied.
        new_name = os.path.join(out_dir, title + ".pdf")
        print("%s => %s" % (filename, new_name))
        if dry_run:
            continue
        if os.path.exists(new_name):
            print("*** Target %s already exists! ***" % new_name)
        else:
            os.rename(filename, new_name)
    return 0


if __name__ == "__main__":
    sys.exit(main())