forked from jdmonaco/pdf-title-rename
-
Notifications
You must be signed in to change notification settings - Fork 2
/
docx2txt.py
112 lines (94 loc) · 3.49 KB
/
docx2txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#! /usr/bin/env python
import argparse
import re
import xml.etree.ElementTree as ET
import zipfile
import os
import sys
# Namespace prefix -> namespace URI table. qn() looks prefixes up here to
# build fully qualified (Clark-notation) tag names; only the 'w' prefix is
# registered.
nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
def process_args():
    """
    Parse and validate the command-line arguments.

    Recognised arguments:
      docx            path of the docx file (required, positional)
      -i / --img_dir  directory to extract images into (optional); it is
                      created, including parents, when it does not exist

    Returns the ``argparse.Namespace`` with ``docx`` and ``img_dir``.

    Terminates the process with exit status 1 (``SystemExit``) when the docx
    path does not exist or when ``img_dir`` cannot be used as a directory.
    Diagnostics are written to stderr so they never mix with the extracted
    text that the script emits on stdout.
    """
    parser = argparse.ArgumentParser(description='A pure python-based utility '
                                                 'to extract text and images '
                                                 'from docx files.')
    parser.add_argument("docx", help="path of the docx file")
    parser.add_argument('-i', '--img_dir', help='path of directory '
                                                'to extract images')
    args = parser.parse_args()

    if not os.path.exists(args.docx):
        sys.stderr.write('File {} does not exist.\n'.format(args.docx))
        sys.exit(1)

    if args.img_dir is not None:
        try:
            os.makedirs(args.img_dir)
        except OSError:
            # makedirs also raises when the path already exists. That is
            # fine for an existing directory, but a regular file (or any
            # other failure) leaves us without a usable target directory.
            if not os.path.isdir(args.img_dir):
                sys.stderr.write(
                    "Unable to create img_dir {}\n".format(args.img_dir))
                sys.exit(1)
    return args
def qn(tag):
    """
    Expand a namespace-prefixed tag name into its qualified (Clark-notation)
    form, i.e. ``'{namespace-uri}localname'``, which is the form element
    tags are compared against elsewhere in this module.

    ``tag`` must have the shape ``'prefix:localname'`` and the prefix must be
    a key of the module-level ``nsmap``. For example ``qn('w:p')`` returns
    ``'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p'``.

    Raises ``ValueError`` when ``tag`` does not contain exactly one ``':'``
    and ``KeyError`` when the prefix is not registered in ``nsmap``.

    Source: https://github.com/python-openxml/python-docx/
    """
    ns_prefix, local_name = tag.split(':')
    return '{%s}%s' % (nsmap[ns_prefix], local_name)
def xml2text(xml):
    """
    Return the plain text contained in one WordprocessingML XML part.

    ``xml`` is the serialized XML document (as accepted by
    ``ET.fromstring``). Every element is visited in document order and
    translated as follows:

      ``<w:t>``            its text content (nothing when it is empty)
      ``<w:tab/>``         a tab character
      ``<w:br/>, <w:cr/>`` a single newline
      ``<w:p>``            two newlines, emitted where the paragraph starts

    All other elements contribute nothing. Returns the concatenated string.

    Adapted from: https://github.com/python-openxml/python-docx/
    """
    # Resolve the qualified tag names once instead of once per element.
    text_tag = qn('w:t')
    literal_for = {
        qn('w:tab'): '\t',
        qn('w:br'): '\n',
        qn('w:cr'): '\n',
        qn('w:p'): '\n\n',
    }

    pieces = []
    for element in ET.fromstring(xml).iter():
        if element.tag == text_tag:
            if element.text is not None:
                pieces.append(element.text)
            continue
        literal = literal_for.get(element.tag)
        if literal is not None:
            pieces.append(literal)
    # Single join instead of repeated string concatenation in the loop.
    return u''.join(pieces)
def process(docx, img_dir=None):
    """
    Extract the text of a docx file and, optionally, its embedded images.

    Parameters:
      docx     path (or file-like object) handed to ``zipfile.ZipFile``
      img_dir  when not ``None``, every archive member ending in .jpg,
               .jpeg, .png or .bmp (any letter case) is written into this
               directory under its base name; the directory must exist

    The text is assembled from the header parts, then
    ``word/document.xml``, then the footer parts (each converted with
    ``xml2text``), stripped of surrounding whitespace, and finally split on
    single spaces and newlines.

    Returns the resulting list of string tokens. Consecutive separators
    yield empty-string tokens, and an empty document yields ``['']``.

    Raises whatever ``zipfile``/``xml2text``/``open`` raise, e.g.
    ``zipfile.BadZipfile`` for a non-zip input or ``KeyError`` when the
    archive has no ``word/document.xml``. The archive is closed in every
    case.
    """
    # A docx can carry several header/footer parts (header1.xml, ...).
    # The dot is escaped and the pattern end-anchored so that only the real
    # parts match, not names that merely start the same way.
    header_re = re.compile(r'word/header[0-9]*\.xml$')
    footer_re = re.compile(r'word/footer[0-9]*\.xml$')
    image_extensions = (".jpg", ".jpeg", ".png", ".bmp")

    # The context manager closes the archive even if a read, the XML
    # parsing or an image write fails part-way through.
    with zipfile.ZipFile(docx) as zipf:
        filelist = zipf.namelist()

        pieces = [xml2text(zipf.read(fname))
                  for fname in filelist if header_re.match(fname)]
        pieces.append(xml2text(zipf.read('word/document.xml')))
        pieces.extend(xml2text(zipf.read(fname))
                      for fname in filelist if footer_re.match(fname))

        if img_dir is not None:
            for fname in filelist:
                _, extension = os.path.splitext(fname)
                if extension.lower() in image_extensions:
                    dst_fname = os.path.join(img_dir,
                                             os.path.basename(fname))
                    with open(dst_fname, "wb") as dst_f:
                        dst_f.write(zipf.read(fname))

    text = u''.join(pieces).strip()
    # Tokenizing rule: split on a single space or a newline. Adjust the
    # pattern here if a different tokenization is needed.
    return re.split(r' |\n', text)
if __name__ == '__main__':
    args = process_args()
    # process() returns a list of tokens, not a single string, so it has to
    # be joined before it can be encoded and written out.
    tokens = process(args.docx, args.img_dir)
    output = u' '.join(tokens).encode('utf-8')
    # Write the UTF-8 bytes to the binary stream: Python 3 exposes it as
    # sys.stdout.buffer, while Python 2's sys.stdout accepts bytes directly.
    getattr(sys.stdout, 'buffer', sys.stdout).write(output)