-
Notifications
You must be signed in to change notification settings - Fork 8
/
preprocessors.py
70 lines (54 loc) · 2 KB
/
preprocessors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import re
class Binarizer(object):
    '''Callable that collapses any value to 1 (truthy) or 0 (falsey).
    '''

    def __init__(self, **kwargs):
        '''Accept and ignore arbitrary keyword options.'''

    def __call__(self, value):
        '''Return the integer 1 when `value` is truthy, otherwise 0.'''
        return 1 if value else 0
class FlatMap(object):
    '''Callable for transforming a nested structure composed of dictionaries
    and lists into a single flat list of path strings.

    Each leaf (any value that is neither a dict nor a list) produces one
    entry of the form "key1.key2:value", where the dotted prefix is the chain
    of dictionary keys leading to the leaf and the value is rendered with
    str(). Lists do not contribute a path segment; every element of a list
    shares the path of the list itself. Entries are emitted in traversal
    order and are not de-duplicated.

    The return value is always a list; an input that is a primitive (string
    or numeric) value is transformed into a list containing the string form
    of that value. Empty-string keys and empty-string values are omitted
    from the entry they would otherwise appear in.
    '''

    def __init__(self, **kwargs):
        '''Accept and ignore arbitrary keyword options.'''
        pass

    def __call__(self, document):
        '''Flatten `document` and return the list of "path:value" strings.'''
        state = []

        def recur(key, value):
            # Unpacks the `value` object to recursively add keys to the state
            if isinstance(value, dict):
                for child, new_value in value.items():
                    # Keys are coerced to text: str.join() rejects non-string
                    # items, and filter(None, ...) would otherwise discard
                    # falsy keys such as 0 or None, merging their subtree
                    # into the parent path.
                    new_key = '.'.join(filter(None, [key, str(child)]))
                    recur(new_key, new_value)
            elif isinstance(value, list):
                for child in value:
                    recur(key, child)
            else:
                # filter(None, ...) drops the empty root key so a top-level
                # primitive yields just its value with no leading ':'.
                state.append(':'.join(filter(None, [key, str(value)])))

        recur('', document)
        return state
class Tokenizer(object):
    '''Callable that turns a string, or an arbitrarily nested list of
    strings, into a flat list of lower-cased tokens. Text is lower-cased
    first and then every match of one configurable regex is extracted.
    Items that are neither strings nor lists are skipped.
    '''

    def __init__(self, regex='\\S+', **kwargs):
        '''Compile `regex` (default: runs of non-whitespace characters);
        any other keyword options are accepted and ignored.
        '''
        self._regex = re.compile(regex)

    def __call__(self, document):
        '''Return the non-empty regex matches found in `document`, in
        depth-first, left-to-right order.
        '''
        find_tokens = self._regex.findall
        collected = []

        def walk(node):
            # Lists are descended into; strings are tokenized; anything else
            # contributes nothing.
            if isinstance(node, list):
                for member in node:
                    walk(member)
            elif isinstance(node, str):
                collected.extend(find_tokens(node.lower()))

        walk(document)
        # A pattern that can match the empty string yields '' entries; drop
        # every falsy match.
        return [token for token in collected if token]
class PassThrough(object):
    '''Identity transform: hands back exactly what it is given.'''

    def __init__(self, **kwargs):
        '''Accept and ignore arbitrary keyword options.'''

    def __call__(self, document):
        '''Return `document` itself, unmodified and uncopied.'''
        return document