-
Notifications
You must be signed in to change notification settings - Fork 0
/
bdfparse.py
223 lines (171 loc) · 7.79 KB
/
bdfparse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
"""bdfparse
Abuse the email.feedparser class to read the structure of a BDF binary file.
We remember where the binary blobs are so we can mmap them into numpy arrays
later.
f = open ('/path/to/bdf_file')
bdf = BDFData (f).parse ()
for i in xrange (bdf.n_integrations):
c = bdf.get_data ('crossData.bin', i)
nbl, nchan, npol = c.shape
....
The following arrays can be retrieved with the get_data() function:
crossData.bin: the basic cross-correlation data
dtype: complex64
shape: (nbaselines, nchans, npol)
autoData.bin: the basic auto-correlation data
dtype: complex64
shape: (nantennas, nchans, nfeeds=2)
flags.bin: flags on the auto and cross correlations
dtype: uint32
shape: (nbaselines + nantennas, nchans, npol)
FIXME: I don't know how the baseline/antenna dimension should be indexed!
Consult the BDF spec. Also, this will break if the BDF does not
contain auto+cross data.
Shortcomings:
We hardcode the array axis orderings and which axes have non-unity size. This
could theoretically all change under us.
The BDF spec makes it sound like the different binary blobs are allowed to
have differing sizes -- the "size" attribute in the header is a maximum. If
this ever happens in practice, this parser will fail -- I don't see how we
can guess the datachunk size without just reading it in. Let's hope that
never happens.
BDF is little-endian as are x86 processors, so we ignore endianness issues.
"""
__all__ = ['BDFData']
import numpy as np
from email.feedparser import FeedParser
from email.message import Message
from xml.etree import ElementTree
import mmap
# Maps each binary attachment kind (the last path component of a MIME part's
# Content-Location header) to the numpy dtype that BDFData.get_data() uses to
# interpret the raw bytes of that attachment.
_datatypes = {
'autoData.bin': np.complex64,
'crossData.bin': np.complex64,
'flags.bin': np.uint32,
}
# Un-prefixed names of header XML elements looked up by BDFData.parse(): the
# element whose text is the antenna count, and the per-baseband elements found
# under the dataStruct element. The namespace prefix is prepended at lookup.
nanttag = 'numAntenna'
basebandtag = 'baseband'
class BDFData (object):
    """Reader for the MIME structure of a BDF binary file.

    Construct with an open, seekable file object, then call parse() once to
    locate the binary attachments. After parse() the following attributes
    are available:

    mimemsg        -- the email.message.Message built from the MIME headers
    headxml        -- ElementTree root element of the XML header part
    sizeinfo       -- dict mapping data kind -> attachment size in bytes
    binarychunks   -- dict mapping Content-Location -> file offset of the data
    n_integrations -- number of MIME parts following the header part
    n_antennas, n_baselines, n_basebands, n_spws, n_channels -- dimensions
    crosspols      -- list of cross-polarization product names

    Individual attachments are then retrieved with get_data().
    """

    def __init__ (self, fp):
        """fp is an open, seekable filestream.

        The whole file is also memory-mapped read-only (private mapping) so
        that get_data() can slice attachments out of it by offset.
        """
        self.fp = fp
        self.mmdata = mmap.mmap (fp.fileno (), 0, mmap.MAP_PRIVATE, mmap.PROT_READ)

    def parse (self):
        """Parse the BDF mime structure and record the locations of the binary
        blobs. Sets up various data fields in the BDFData object.

        Returns self, for convenience.

        Raises RuntimeError if the file contains no binary attachment, and
        Exception if the spectral windows do not all share the same
        'crossPolProducts' value. Internal consistency checks on the MIME
        layout are expressed as assertions.
        """
        feedparser = FeedParser (Message)
        binarychunks = {}
        # All three are filled in together when the first binary chunk is
        # seen. They must be pre-initialized so that a file without any binary
        # chunk reaches the RuntimeError below instead of an UnboundLocalError.
        headxml = None
        sizeinfo = None
        tagpfx = None

        self.fp.seek (0, 0)

        # NOTE(review): the comparisons against '\n' and '--MIME' below assume
        # readline()/read() return str objects; a stream yielding a different
        # string type would never match -- confirm how callers open the file.
        while True:
            data = self.fp.readline ()
            if not data:
                break

            feedparser.feed (data)

            # A blank line while the parser is three messages deep and the
            # innermost one is an octet-stream marks the end of the headers
            # of a binary attachment.
            skip = (data == '\n' and
                    len (feedparser._msgstack) == 3 and
                    feedparser._msgstack[-1].get_content_type () in ('application/octet-stream',
                                                                      'binary/octet-stream'))
            if skip:
                # We just finished reading the headers for a huge binary blob.
                # Time to remember where the data chunk is and pretend it doesn't
                # exist.
                msg = feedparser._msgstack[-1]
                ident = msg['Content-Location']
                assert ident.endswith ('.bin'), 'confusion #1 in hacky MIME parsing!'
                binarychunks[ident] = self.fp.tell ()

                if sizeinfo is None:
                    # The XML header precedes every attachment, so it is
                    # complete by the time the first binary chunk shows up.
                    headxml, sizeinfo, tagpfx = _extract_size_info (feedparser)

                kind = ident.split ('/')[-1]
                assert kind in sizeinfo, 'no size info for binary chunk kind %s in MIME!' % kind
                # Skip ahead by the data chunk size (plus one extra byte --
                # presumably the line terminator after the blob; TODO confirm).
                self.fp.seek (sizeinfo[kind] + 1, 1)
                # Peek at what follows to verify we landed on a MIME boundary.
                sample = self.fp.read (16)
                assert sample.startswith ('--MIME'), 'crap, unexpected chunk size in MIME parsing: %r' % sample
                # Go back by what was actually read; read() may return fewer
                # than 16 characters close to the end of the file.
                self.fp.seek (-len (sample), 1)

        if headxml is None:
            raise RuntimeError ('never found any binary data')

        self.mimemsg = feedparser.close ()
        self.headxml = headxml
        self.sizeinfo = sizeinfo
        self.binarychunks = binarychunks

        # Compute some miscellaneous parameters that we'll need.

        # The first MIME part is the XML header; the others are integrations.
        self.n_integrations = len (self.mimemsg.get_payload ()) - 1
        self.n_antennas = int (headxml.find (tagpfx + nanttag).text)
        self.n_baselines = (self.n_antennas * (self.n_antennas - 1)) // 2

        ds = headxml.find (tagpfx + dstag)
        nbb = 0
        nspw = 0
        nchan = 0
        crosspolstr = None

        for bb in ds.findall (tagpfx + basebandtag):
            nbb += 1

            # Iterating an Element yields its direct children; this replaces
            # Element.getchildren(), which newer Python versions removed.
            for spw in bb:
                nspw += 1
                nchan += int (spw.get ('numSpectralPoint'))

                if crosspolstr is None:
                    crosspolstr = spw.get ('crossPolProducts')
                elif spw.get ('crossPolProducts') != crosspolstr:
                    raise Exception ('can only handle spectral windows with identical cross pol products')

        self.n_basebands = nbb
        self.n_spws = nspw
        self.n_channels = nchan
        # No spectral window declared cross-pol products (or there were no
        # spectral windows at all): record an empty list rather than failing
        # with an AttributeError on None.
        if crosspolstr is None:
            self.crosspols = []
        else:
            self.crosspols = crosspolstr.split ()

        return self # convenience

    def get_data (self, datakind, integnum):
        """Given an integration number (0 <= integnum < self.n_integrations) and a
        data kind ('crossData.bin', 'autoData.bin', 'flags.bin'), read the
        corresponding bytes out of the memory-mapped file and return them as a
        numpy array.

        The returned array is an independent, writable copy of the file data,
        reshaped as follows:

        crossData.bin -- (n_baselines, n_channels, len (crosspols))
        autoData.bin  -- (n_antennas, n_channels, 2)
        flags.bin     -- (n_baselines + n_antennas, n_channels, len (crosspols))

        Raises ValueError if integnum is out of range, if datakind has no
        recorded size, or if no matching attachment was found by parse().
        """
        if integnum < 0 or integnum >= self.n_integrations:
            raise ValueError ('illegal integration number %d' % integnum)

        size = self.sizeinfo.get (datakind)
        if size is None:
            raise ValueError ('unrecognized data kind "%s"' % datakind)

        dtype = _datatypes[datakind]
        key = '/%d/%s' % (integnum + 1, datakind) # numbers are 1-based here

        # items() rather than the Python-2-only iteritems().
        for ident, offset in self.binarychunks.items ():
            if ident.endswith (key):
                break
        else:
            # Gets executed if we don't break out of the loop.
            raise ValueError ('can\'t find integration #%d of kind %s in BDF'
                              % (integnum, datakind))

        dslice = self.mmdata[offset:offset+size]
        # frombuffer() replaces the deprecated binary mode of fromstring();
        # the copy keeps the result writable and independent of the buffer,
        # as it was with fromstring().
        data = np.frombuffer (dslice, dtype=dtype).copy ()

        if datakind == 'crossData.bin':
            data = data.reshape ((self.n_baselines, self.n_channels, len (self.crosspols)))
        elif datakind == 'autoData.bin':
            data = data.reshape ((self.n_antennas, self.n_channels, 2))
        elif datakind == 'flags.bin':
            data = data.reshape ((self.n_baselines + self.n_antennas, self.n_channels,
                                  len (self.crosspols)))

        return data
# Candidate XML namespace prefixes for the header tags, tried in order by
# _extract_size_info(); the empty string covers headers without a namespace.
tagprefixes = ['{http://Alma/XASDM/sdmbin}', '']
# Un-prefixed names of the header XML elements: the dataStruct container and
# its children whose 'size' attributes give the binary attachment sizes.
dstag = 'dataStruct'
cdtag = 'crossData'
adtag = 'autoData'
fgtag = 'flags'
def _extract_size_info (feedparser):
    """Read the XML header out of a partially-fed FeedParser.

    The header is taken to be the first sub-part of the outermost message on
    the parser's stack. Returns a tuple (headxml, sizeinfo, tagprefix):

    headxml   -- root Element of the parsed header
    sizeinfo  -- dict mapping 'crossData.bin' / 'autoData.bin' / 'flags.bin'
                 to a size in bytes (not data elements), for each kind that
                 the dataStruct element declares
    tagprefix -- the namespace prefix under which dataStruct was found

    Raises RuntimeError if dataStruct is not found under any known prefix.
    """
    header_part = feedparser._msgstack[0].get_payload ()[0]
    headxml = ElementTree.fromstring (header_part.get_payload ())

    # The XML may or may not carry an xmlns attribute, which shows up as a
    # prefix on every tag name; probe each known prefix in turn.
    dselem = None
    for tp in tagprefixes:
        dselem = headxml.find (tp + dstag)
        if dselem is not None:
            break

    if dselem is None:
        raise RuntimeError ('cannot find dataStruct item in any known XML namespace')

    # Pull the chunk sizes out of the dataStruct children. Each 'size'
    # attribute is scaled by 4 to yield a byte count.
    # ... more kinds could be added here if needed ...
    wanted = (
        (cdtag, 'crossData.bin'),
        (adtag, 'autoData.bin'),
        (fgtag, 'flags.bin'),
    )
    sizeinfo = {}

    for tag, kind in wanted:
        elem = dselem.find (tp + tag)
        if elem is not None:
            sizeinfo[kind] = 4 * int (elem.attrib['size'])

    return headxml, sizeinfo, tp