forked from empenguinxh/Anki-CreateImportFile
-
Notifications
You must be signed in to change notification settings - Fork 0
/
my_helpers.py
280 lines (275 loc) · 12.1 KB
/
my_helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
# coding:utf-8
import re
import json
import codecs
import functools
import os.path
from random import random
from random import randint
from pprint import pprint
from copy import deepcopy
def strF2H(ustring):
'''
convert full width character to half width
input: a unicode object
return: a unicode object
'''
h_ustring = u""
assert isinstance(ustring, unicode)
for uchar in ustring:
inside_code = ord(uchar)
if inside_code == 12288:
# white space
inside_code = 32
elif 65281 <= inside_code <= 65374:
# other characters
inside_code -= 65248
h_ustring += unichr(inside_code)
return h_ustring
# pretty print the embedded unicode of list or dict
def iter_print(obj_iter, indent=0, increment=2, max_top_level_print=None,
top_level=True, top_level_extra_line_feed=False, print_list_index=True):
if not hasattr(obj_iter, '__iter__'):
if isinstance(obj_iter, basestring):
if obj_iter == u'':
pass
elif '\n' in obj_iter:
for line in obj_iter.split('\n'):
if line:
print ' '*indent, line
else:
print ' '*indent, obj_iter
else:
print ' '*indent, obj_iter
return
print_count = 0
if isinstance(obj_iter, dict):
for key, iter_sub_obj in obj_iter.iteritems():
print ' '*indent, key
iter_print(iter_sub_obj, indent+increment, increment, None, False, False, print_list_index)
if top_level:
print_count += 1
if max_top_level_print:
if print_count >= max_top_level_print:
break
if top_level_extra_line_feed:
print '\n'
else:
for list_index, sub_obj_iter in enumerate(obj_iter):
if print_list_index:
print ' '*indent, list_index
iter_print(sub_obj_iter, indent+increment, increment, None, False, False, print_list_index)
if top_level:
print_count += 1
if max_top_level_print:
if print_count >= max_top_level_print:
break
if top_level_extra_line_feed:
print '\n'
def extract_content_between(obj_str, match_re, return_str_before_first_match=False):
'''
extract content between the start of two equal pattern found in a str,
also extract the content after the last match
input: obj_str, the string to extract content from, must be a unicode object
match_re, the pattern to be matched
return: a list of str
return_str_before_first_match: whether to return the str before the first match of the given patter
'''
assert isinstance(obj_str, unicode)
retype = type(re.compile(r'a str'))
assert isinstance(match_re, retype)
match_results_iter = match_re.finditer(obj_str)
returned_str_l = []
start_index = None
end_index = None
first_start_index = None
for match_result in match_results_iter:
if first_start_index is None:
first_start_index = match_result.start()
if not (start_index is None):
end_index = match_result.start()
returned_str_l.append(obj_str[start_index:end_index])
start_index = match_result.start()
returned_str_l.append(obj_str[start_index:])
if return_str_before_first_match:
returned_str_l = [obj_str[:first_start_index]] + returned_str_l
return returned_str_l
def iter_value_of_key_through_d_l_d_d(obj_d_l_d_d, key_2nd_level, key_4th_level,
expected_draw=1.0, yield_top_key=False, yield_list_index=False):
'''
a function that return a generator
it will iter through all the values of the first level dict with every value being themself a dict
for every such value dict,
a key specified by key_2nd_level is used to access a list
for every elment of the list
a key specified by key_4th_level is used to access the corresponding value
so in total it is a two level nested loop
key_2nd_level: what it points to must be a list
expected_draw: roughly control the proportion of the innermost values to be sampled
can be an integar, which will be converted to the corresponding probability
yield_top_key: whether to include the top key
yield_list_index: whether to include the list index
note that (yield_top_key=False, yield_list_index=True) is a useless combination, so raise an ValueError
'''
if isinstance(expected_draw, int):
expected_draw = float(expected_draw)/len(obj_d_l_d_d)
assert isinstance(expected_draw, float)
for top_key, value_d_l_d in obj_d_l_d_d.iteritems():
assert isinstance(value_d_l_d[key_2nd_level], list)
for _list_index, value_d in enumerate(value_d_l_d[key_2nd_level]):
if random() <= expected_draw:
if (not yield_top_key) and (not yield_list_index):
yield value_d[key_4th_level]
elif yield_top_key and (not yield_list_index):
yield top_key, value_d[key_4th_level]
elif yield_top_key and yield_list_index:
yield top_key, _list_index, value_d[key_4th_level]
else:
raise ValueError('Invalid Combination of yield_top_key and yield_list_index')
def iter_through_general(obj_iter, path, yield_flags=True, final_yield_object=None):
'''
iter through an object following the given path
yield_flags: control whether to yield the flags indicating the path at the global level
final_yield_object: internal parameter, don't modify
obj_iter: an iterable variable
path: a sequence, each element has the following structure
(how_to_iter, what_to_iter, yield_flag)
how_to_iter: a str, accept the following values
'all' or 'all_values': iter through key-value pair for dict, and all elements for other type
if yield_flag is True, attach key or index to the final yield object
'all_keys', only iter through the keys of a dict
obj_iter must be a dict
'key', iter through the value of a given key
what_to_iter must be a str representing a key in obj_iter
if yield_flag is True, attach key to the final yield object
ignored when obj_iter is not dict
'keys', iter through the values of a given set of keys
what_to_iter must be a tuple with elements reprenting keys in obj_iter
if yield_flag is True, attach key to the final yield object
ignored when obj_iter is not dict
'index', iter through a given element
what_to_iter must be an int within bound
if yield_flag is True, attach index to the final yield object
ignored when obj_iter is dict
'indexes', iter through the elements with given indexes
what_to_iter must be an list of int within bound
if yield_flag is True, attach key to the final yield object
ignored when obj_iter is dict
what_to_iter: content decided by how_to_iter
ignored for the following values of how_to_iter
all, all_values, all_keys
yield_flag: True or False
True: depending on how_to_iter, attch different flags to the final result
False: no flag wil be yield
ignored for the following values of how_to_iter
all_keys
'''
is_dict = isinstance(obj_iter, dict)
if final_yield_object is None:
final_yield_object = []
if len(path) == 0:
if yield_flags:
final_yield_object.append(obj_iter)
yield final_yield_object
else:
yield obj_iter
else:
how_to_iter, what_to_iter, yield_flag = path.pop(0)
assert isinstance(how_to_iter, basestring)
if how_to_iter in [u'all', u'all_values', u'keys', u'indexes']:
if how_to_iter in [u'keys', u'indexes']:
assert hasattr(what_to_iter, '__iter__')
for item in what_to_iter:
if is_dict:
assert how_to_iter == u'keys'
assert isinstance(item, basestring)
assert item in obj_iter
else:
assert how_to_iter == u'indexes'
assert isinstance(item, int)
assert item < len(obj_iter)
temp_iterator = ((item, obj_iter[item]) for item in what_to_iter)
else:
temp_iterator = obj_iter.iteritems() if is_dict else enumerate(obj_iter)
for flag, sub_obj_iter in temp_iterator:
final_yield_object_copy = deepcopy(final_yield_object)
if yield_flag:
final_yield_object_copy.append(flag)
for value in iter_through_general(sub_obj_iter, deepcopy(path), yield_flags, final_yield_object_copy):
yield value
elif how_to_iter == u'all_keys':
assert is_dict
for key in obj_iter.iterkeys():
if yield_flags:
final_yield_object.append(key)
yield final_yield_object
else:
yield key
elif how_to_iter in [u'key', u'index']:
if is_dict:
assert how_to_iter == u'key'
assert isinstance(what_to_iter, basestring)
assert what_to_iter in obj_iter
else:
assert how_to_iter == u'index'
assert isinstance(what_to_iter, int)
assert what_to_iter < len(obj_iter)
sub_obj_iter = obj_iter[what_to_iter]
if yield_flag:
final_yield_object.append(what_to_iter)
for value in iter_through_general(sub_obj_iter, deepcopy(path), yield_flags, final_yield_object):
yield value
else:
raise ValueError('Invalid path')
def reservoir_sample_k(obj_iter, k):
assert isinstance(k, int)
assert hasattr(obj_iter, '__iter__')
# fit into k items
sampled_l = []
for _ in range(k):
sampled_l.append(obj_iter.next())
i = k
for item in obj_iter:
i += 1
j = randint(1, i)
if j <= k:
sampled_l[j-1] = item
return sampled_l
def iter_through_and_sample_k(obj_iter, k, path):
obj_iter_follow_path = iter_through_general(obj_iter, path)
return reservoir_sample_k(obj_iter_follow_path, k)
strip_white_space = lambda _str: _str.replace(' ', '')
new_line_join = lambda str_l: '\n'.join(str_l)
def codecs_open_r_utf8(file_path):
with codecs.open(file_path, 'r', 'utf-8') as f:
returned_str = f.read()
return returned_str
# merge blank lines
def collapse_blank_line(base_str):
match_double_line_feed_re = re.compile(r'\n\n')
while match_double_line_feed_re.search(base_str):
base_str = match_double_line_feed_re.sub(r'\n', base_str)
return base_str
def custom_html_element(_str):
"""
convert the markdown notations in a string to html tags
currently, only two kinds of markdown notation exist in all the strings
** and *
"""
formatted_str = _str
# format double asterisk
match_double_asterisk_re = re.compile(u'\*\*(.*?)\*\*')
# replace **...** with <strong>...</strong>
#formatted_str = match_double_asterisk_re.sub(r'<strong>\1</strong>', formatted_str)
# replace **...** with <ins>...</ins>
formatted_str = match_double_asterisk_re.sub(r'<ins>\1</ins>', formatted_str)
# format single asterisk
# replace *...* with <i>...</i>
match_single_asterisk_re = re.compile(u'\*(.*?)\*')
formatted_str = match_single_asterisk_re.sub(r'<i>\1</i>', formatted_str)
return formatted_str
def is_file_and_json_load(file_name_str):
if os.path.isfile(file_name_str):
with codecs.open(file_name_str, 'r', encoding='utf-8') as f:
json_d = json.load(f)
return json_d