-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
226 lines (182 loc) · 5.73 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#!/bin/python3.4
# -*- coding: utf-8 -*-
"""A Python app that makes epub files with pinyin in them."""
import sys
import html as htmlLib
import os
from pathlib import Path
from dragonmapper import transcriptions as trans
from dragonmapper import hanzi
from dragonmapper import html
import hanzidentifier as ziid
from ebooklib import epub
import jieba
from bs4 import BeautifulSoup
import bs4
# Change These #
# Print diagnostic output through debug() when True.
debug_is_on = False
# Produce numbered pinyin instead of accented pinyin when True.
numbered_pinyin = False
# User dictionary handed to jieba before segmentation starts.
dict_file = './dict.txt.big'
# \Change These #

# Path of the book, taken from the first command-line argument.
FULL_FILE_NAME = sys.argv[1]
# Split on the LAST dot only.  A plain split('.') cannot be unpacked into
# two names for inputs such as './book.epub' or 'my.book.epub', which
# contain more than one dot.
FILE_NAME, _DOTTED_EXT = os.path.splitext(FULL_FILE_NAME)
# FILE_EXT keeps its historical shape: the extension without the dot.
FILE_EXT = _DOTTED_EXT[1:]
# Directory that is scanned for the (X)HTML files to annotate.
NEW_DIR = FILE_NAME + '-with-pinyin'
# NOTE(review): not referenced by the code visible in this file.
EPUB_BLACKLIST_KEYWORDS = ['pagenav']
# Tags whose direct text children receive pinyin annotations.
GET_TAG_NAMES = ['p', 'strong', 'h1',
                 'h2', 'h3', 'h4',
                 'h5', 'h6', 'i',
                 'b', 'u', 'a', 'em']
def is_entirely_chinese(s):
    """
    Check if every character in the string is Chinese.

    Stops at the first character that fails the check.  An empty string
    yields True because no character fails.
    *s* string to search in
    """
    debug('TEST(IEC): '+str(s))
    for c in s:
        if not ziid.has_chinese(c):
            # Report the offending character before returning; a statement
            # placed after the return would never run.
            debug('TEST(IEC): '+c+' is not chinese.')
            return False
    return True
def debug(s):
    """
    Emit a diagnostic message, but only while debugging is switched on.

    Does nothing when the module-level debug_is_on flag is false.
    *s* object to print
    """
    if not debug_is_on:
        return
    print(s)
def find(name, path):
    """
    Search a directory tree for a file and return the path of the first hit.

    Returns None when no directory under *path* contains the file.
    *name* name of the file
    *path* directory to look in
    """
    for current_dir, sub_dirs, file_names in os.walk(path):
        debug('ROOT: {0}\nDIRS: {1}\nFILES: {2}'.format(
            current_dir, sub_dirs, file_names))
        if name not in file_names:
            continue
        return os.path.join(current_dir, name)
    return None
def find_epub_files(f):
    """
    Get list of files needed to be converted.

    Recursively collects every regular file below *f* whose name ends in
    "html", so both .html and .xhtml documents match.  Directories are
    left out: the glob pattern also matches folder names such as "html",
    and those cannot be opened as documents.  The result is sorted so that
    the processing order is the same on every run.
    *f* directory to search
    """
    return sorted(p for p in Path(f).rglob("*html") if p.is_file())
def get_all_tags_text(pt):
    """
    Collect the text nodes that are direct children of a tag.

    Each entry is a dict with the text under 'str' and the node itself
    under 'tag'.  Children that are not NavigableString instances are
    skipped and not descended into.
    *pt* parent tag
    """
    return [
        {'str': str(child), 'tag': child}
        for child in pt.contents
        if isinstance(child, bs4.element.NavigableString)
    ]
def generate_new_html_for_words(words):
    """
    Build the annotated HTML for a list of segmented words.

    A word made up only of Chinese characters is converted to zhuyin and
    then to pinyin (accented unless numbered_pinyin is set), which becomes
    the top row of its markup.  Any other word gets empty annotations.
    The per-word fragments are concatenated in input order.
    *words* a list of chinese words
    """
    fragments = []
    for word in words:
        debug("{0} : {1}".format(word, "".join(words)))
        debug('CHARACTER: {0}'.format(word))
        if is_entirely_chinese(word):
            zh = hanzi.to_zhuyin(word)
            debug('ZHUYIN: {0}'.format(zh))
            pinyin_text = trans.zhuyin_to_pinyin(
                zh,
                accented=not numbered_pinyin)
            pi = pinyin_text.split(' ')
            debug('PINYIN: {0}'.format(pi))
        else:
            # Same list that splitting len(word) spaces on ' ' produces:
            # len(word) + 1 empty strings.
            pi = [''] * (len(word) + 1)
        debug("{0}={1}".format(word, "".join(pi)))
        fragments.append(html.to_html(word, top=pi, minified=True))
    return "".join(fragments)
def add_js_link(bs_file):
    """
    Add a script tag for js/functions.js to the document head.

    The document is modified in place and also returned.
    NOTE(review): the tag is inserted at index -1, which presumably places
    it before the head's last child rather than at the very end - confirm
    that this position is intended.
    *bs_file* Beautiful Soup file object.
    """
    script_tag = bs_file.new_tag(
        "script",
        type="text/javascript",
        src="./js/functions.js",
    )
    head = bs_file.head
    head.insert(-1, script_tag)
    return bs_file
def get_pinyin_bs(file_name):
    """
    Return file + Ruby characters.

    Parses the document with XML rules, replaces every text node that sits
    directly inside one of the GET_TAG_NAMES tags with pinyin-annotated
    markup, adds the script link and returns the serialised document.
    *file_name* file name to run on
    """
    debug(file_name)
    # Read as UTF-8 explicitly: the caller writes the result back as UTF-8,
    # so decoding with the platform default could corrupt the text.
    with open(file_name, 'r', encoding='utf-8') as f:
        source = f.read()
    bs = BeautifulSoup(source, 'xml')

    # Collect all text nodes first and replace them afterwards, so the tree
    # is not modified while it is still being searched.
    all_of_the_tags = []
    for tn in GET_TAG_NAMES:
        for t in bs.find_all(tn):
            # `"href" in t` would test the tag's children, not its
            # attributes, so has_attr() is required here.  The value is
            # escaped by hand because the document is serialised below with
            # formatter=None, which performs no escaping on output.
            if t.has_attr('href'):
                t['href'] = htmlLib.escape(t['href'])
            all_of_the_tags.extend(get_all_tags_text(t))

    for p in all_of_the_tags:
        debug(p)
        p_el = p['tag']
        p_text = p['str']
        debug(p_text)
        debug('WORKING ON: ')
        debug("#######{0}#######".format(p_text))
        # Plain segmentation is enough; part-of-speech tagging is not
        # needed to look up pinyin.
        words = list(jieba.cut(p_text))
        debug("WORDS: {0}".format(words))
        new_p_str = generate_new_html_for_words(words)
        debug("Parent Element: " + str(p_el.parent))
        debug("{0} will be replaced by {1}".format(p_el, new_p_str))
        p_el.replace_with(bs.new_string(new_p_str))

    # formatter=None keeps the generated markup from being entity-escaped.
    return str(add_js_link(bs).decode(formatter=None))
# Script driver: annotate every document found under NEW_DIR and overwrite
# each one in place with the annotated version.
files = find_epub_files(NEW_DIR)
debug(files)
jieba.load_userdict(dict_file)
total_files = len(files)
current_file_num = 1
for fn in files:
    # Entries equal to the empty string are skipped and not counted.
    if fn == "":
        continue
    print("{0}({1})/{2} starting... ".format(
        current_file_num,
        fn,
        total_files))
    new_text = get_pinyin_bs(fn)
    debug(new_text)
    with open("{0}".format(fn), encoding='utf-8', mode='w') as new_f:
        new_f.write(new_text)
    current_file_num += 1