#!/usr/bin/env python
'''
A library of parsing functions for fixlets.
'''
import json
import cgi
import re
'''
The regular expression for matching the format of an attribute in a directory
entry.
'''
ATTR_REGEX = r'\w+: (.*)'
'''
The attributes present in a file's entry on the gather site, in order.
See the function 'parse_directory(text)'.
'''
FILE_ATTRS = ('url', 'name', 'modified', 'size', 'type', 'hash', 'hashinfo')
'''
Same as FILE_ATTRS but without 'hashinfo', which is missing on some sites.
'''
FILE_ATTRS0 = ('url', 'name', 'modified', 'size', 'type', 'hash')
'''
The regular expression for matching a multipart boundary in a fixfile.
'''
BOUND_REGEX = '.*boundary="(.*)"'
'''
Regular expression for an HTML comment.
'''
COMMENT_REGEX = '<!--.*?-->'
class FixletParsingException(Exception):
pass
def parse_directory(text):
'''
Returns a 'directory': a list of file entries.
A file entry is a dictionary representing a mapping from attributes in
'FILE_ATTRS' to their actual values, parsed.
'''
directory = []
lines = text.split('\n')
lines = map(lambda x: x.strip(), lines)
    # go to first entry (guard the index so short or empty input returns [])
    while len(lines) >= 3 and not lines[2].startswith('URL: '):
        lines = lines[1:]
    if len(lines) < 3:
        return []
    # figure out whether the 'hashinfo' attribute exists in this file entry:
    # probe whether all seven candidate lines match the attribute format
    try:
        map(lambda x: re.findall(ATTR_REGEX, x)[0], lines[2:9])
        target_attrs = FILE_ATTRS
    except IndexError:
        # the seventh attribute line is blank, so 'hashinfo' is absent
        assert lines[8] == ''
        target_attrs = FILE_ATTRS0
while lines[2].startswith('URL: '):
content = lines[2:2+len(target_attrs)]
content = map(lambda x: re.findall(ATTR_REGEX, x)[0], content)
attrs = dict(zip(target_attrs, content))
directory.append(attrs)
lines = lines[len(target_attrs)+3:]
assert len(lines) >= 3
return directory
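# A sketch of the directory format parse_directory() expects, reconstructed
# from the parsing logic above; every name and value below is illustrative,
# not taken from a real gather site. Two lines precede each entry, seven (or
# six, without HASHINFO) attribute lines follow, and three lines separate
# consecutive entries:
#
#   URL: http://example.com/bfsites/sitename_1/file.exe
#   NAME: file.exe
#   MODIFIED: Mon, 01 Jan 2018 00:00:00 +0000
#   SIZE: 12345
#   TYPE: FILE
#   HASH: 0123456789abcdef
#   HASHINFO: sha256,0123456789abcdef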
def parse_directory_metadata(text):
    '''
    Returns the MIME headers of the first document in a gather-site listing,
    as a list of (key, value) pairs.
    '''
    properties = [] # not a dictionary because duplicate keys exist (e.g. relevance)
lines = text.split('\n')
lines = map(lambda x: x.strip(), lines)
try:
# find boundary of document
while not lines[0].startswith('Content-Type:'):
lines = lines[1:]
# go to header
header = re.findall(BOUND_REGEX, lines[0])[0]
lines = lines[1:]
        while header not in lines[0]:
lines = lines[1:]
# parse MIME-properties
lines = lines[1:]
while not lines[0] == '':
separator = lines[0].find(":")
key = lines[0][:separator]
value = lines[0][separator+2:]
properties.append((key, value))
lines = lines[1:]
    except IndexError:
        pass # ran off the end of the input; return what we have so far
    return properties
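# A sketch of the header block parse_directory_metadata() walks (header names
# and values are illustrative assumptions): it locates the multipart boundary,
# skips to the first boundary marker, then collects 'Key: value' pairs up to
# the next blank line.
#
#   Content-Type: multipart/signed; boundary="unique-boundary"
#
#   --unique-boundary
#   X-Fixlet-Site-Gather-URL: http://example.com/cgi-bin/bfgather/sitename
#   X-Relevant-When: true
#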
def flatten(lists):
'''
Flatten nested lists into one list.
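
    For example:

    >>> flatten([1, [2, [3]], 4])
    [1, 2, 3, 4]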
'''
flattened = []
    for item in lists:
        if not isinstance(item, list): # cannot flatten further
            flattened.append(item)
        else:
            flattened += flatten(item)
return flattened
def extract_site_name(url):
    '''
    Returns the final path component of a gather URL: the site name.
    '''
    return re.findall('/([^/]*)$', url)[0]
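# For example, extract_site_name('http://example.com/cgi-bin/bfgather/sitename')
# returns 'sitename' (the URL is illustrative).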
def parse_fixlet(sections):
'''
Parse the smallest partition of a fixfile (contains ActionScript
or the text description).
TODO we currently only parse fixlets - add support for analyses, tasks, etc.
'''
text = None
actions = []
for section in sections:
lines = section.split('\n')
while not lines[0].startswith('Content-Type: '):
lines = lines[1:]
if (lines[0].split('Content-Type: ')[1].strip()
== 'text/html; charset=us-ascii'):
lines = lines[1:]
section = '\n'.join(lines)
text = re.sub(COMMENT_REGEX, '', section).strip()
elif (lines[0].split('Content-Type: ')[1].strip()
== 'application/x-Fixlet-Windows-Shell'):
lines = lines[1:]
section = '\n'.join(lines)
actions.append(section.strip())
        elif (lines[0].split('Content-Type: ')[1].strip() in
              ('application/x-bigfix-analysis-template',
               'application/x-bigfix-itclient-property')):
# TODO add analysis parsing later (look for x-fixlet-type)
raise FixletParsingException("not fixlet, is analysis")
elif (lines[0].split('Content-Type: ')[1].strip()
== 'application/x-Task-Windows-Shell'):
# TODO add task parsing later (look for x-fixlet-type)
raise FixletParsingException("not fixlet, is task")
        else:
            raise FixletParsingException(
                "couldn't recognize " + lines[0].split('Content-Type: ')[1].strip())
return (text, actions)
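# A sketch of one section as parse_fixlet() expects it, with an assumed
# ActionScript body (both the headers and the body are illustrative):
#
#   Content-Type: application/x-Fixlet-Windows-Shell
#
#   download http://example.com/patch.exe
#   wait __Download\patch.exe /quiet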
class Relevance:
    '''
    A set of relevance clauses, chained to the relevance of the enclosing
    (parent) section of the fixfile.
    '''
    def __init__(self, clauses, parent):
        self.clauses = clauses
        self.parent = parent
def compressed_str_list(self):
c = []
if self.parent:
c += self.parent.compressed_str_list()
return c + self.clauses
def _to_dict(self):
info = {'clauses': self.clauses}
if self.parent:
info['parent'] = self.parent._to_dict()
return info
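# Clauses accumulate from the outermost parent down. For example (the clause
# strings are illustrative relevance expressions):
#   outer = Relevance(['windows of operating system'], None)
#   inner = Relevance(['exists regapp "winword.exe"'], outer)
#   inner.compressed_str_list()
#   # -> ['windows of operating system', 'exists regapp "winword.exe"']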
class Fixlet:
    '''
    A parsed fixlet: its ID, relevance, title, modification time, description
    text and ActionScript actions.
    '''
def __init__(self, fid, relevance, title, modified, text, actions):
self.fid = fid
self.relevance = relevance
self.title = title
self.modified = modified
self.text = text
self.actions = actions
    @property
    def contents(self):
        '''
        The fixlet's relevance, text and actions as an HTML-escaped JSON blob.
        '''
        escape = lambda text: cgi.escape(text, True) if text is not None else None
r = list(map(escape, self.relevance.compressed_str_list()))
a = list(map(escape, self.actions))
d = {'relevance': r, 'text': [escape(self.text)], 'actions': a}
return json.dumps(d)
def rsplit_fixfile(text, parent=None):
'''
    Recursively split a fixfile between boundaries, scraping relevance and
    ActionScript as we go.
'''
    relevance = []
    fid = None # filled in from the X-Fixlet-ID header below
    title = None # filled in from the Subject header below
    modified = 'unknown'
lines = text.split('\n')
while not lines[0].startswith('Content-Type: multipart/'):
if lines[0].startswith('X-Relevant-When: '):
relevance.append(lines[0].split('X-Relevant-When: ')[1].strip() + '\n')
elif lines[0].startswith('X-Fixlet-ID: '):
fid = int(lines[0].split('X-Fixlet-ID: ')[1].strip())
elif lines[0].startswith('Subject: '):
title = lines[0].split('Subject: ')[1].strip()
elif lines[0].startswith('X-Fixlet-Modification-Time: '):
modified = lines[0].split('X-Fixlet-Modification-Time: ')[1].strip()
lines = lines[1:]
splitter = re.findall(BOUND_REGEX, lines[0])[0]
assert len(text.split('--{}--'.format(splitter))) == 2
main_section = text.split('--{}--'.format(splitter))[0]
subsections = main_section.split('--{}'.format(splitter))[1:] # remove header
relevance = Relevance(relevance, parent)
if lines[0].startswith('Content-Type: multipart/digest'):
return filter(lambda x: x is not None,
map(lambda x: rsplit_fixfile(x, relevance), subsections))
elif lines[0].startswith('Content-Type: multipart/related'):
try:
text, actions = parse_fixlet(subsections)
return Fixlet(fid, relevance, title, modified, text, actions)
        except FixletParsingException as e: # TODO handle properly
            try:
                print 'skipped parsing fixlet {} (id {}): {}'.format(title, fid, e)
            except UnicodeEncodeError:
                print 'skipped parsing fixlet <UnicodeEncodeError> (id {}): {}'.format(fid, e)
return None
else:
assert False
def parse_fxffile(text):
    '''
    Parses a full fixfile into a dictionary mapping fixlet IDs to 'Fixlet'
    objects.
    '''
    fixlets = flatten(filter(lambda x: x is not None, rsplit_fixfile(text)))
return dict(map(lambda fxf: (fxf.fid, fxf), fixlets))
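

if __name__ == '__main__':
    # Minimal usage sketch (an assumed invocation, not part of the original
    # tooling): parse a locally saved fixfile and list the fixlets it holds.
    import sys
    with open(sys.argv[1]) as f:
        fixlets = parse_fxffile(f.read())
    for fid in sorted(fixlets):
        print '{}: {}'.format(fid, fixlets[fid].title)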