This repository has been archived by the owner on Nov 5, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
lazyreader.py
44 lines (37 loc) · 1.51 KB
/
lazyreader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# -*- encoding: utf-8
def lazyread(f, delimiter, chunk_size=1024):
    """
    Generator which continually reads ``f`` to the next instance
    of ``delimiter``.

    This allows you to do batch processing on the contents of ``f`` without
    loading the entire file into memory.

    Every yielded chunk except the last ends with ``delimiter``.  The last
    chunk is whatever follows the final delimiter; it is always yielded,
    so it is empty when ``f`` is empty or ends with ``delimiter``.

    :param f: Any file-like object which has a ``.read()`` method.
    :param delimiter: Delimiter on which to split up the file.  Must be
        non-empty and of the same type (text or bytes) as the data
        returned by ``f.read()``.
    :param chunk_size: Size passed to each ``f.read()`` call.  Must be
        at least 1.  Defaults to 1024.
    :raises ValueError: If ``delimiter`` is empty or ``chunk_size`` is
        less than 1.  Because this is a generator, the error surfaces
        when iteration starts, not when ``lazyread`` is called.
    """
    # An empty delimiter can never be split on, and a read size below 1
    # would either look like end-of-file (0) or slurp the whole file
    # (negative), so reject both before touching the file.
    if not delimiter:
        raise ValueError('delimiter must not be empty')
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1')

    # Get an empty string to start with.  We need to make sure that if the
    # file is opened in binary mode, we're using byte strings, and similar
    # for Unicode.  Otherwise trying to update the running string will
    # hit a TypeError.
    try:
        running = f.read(0)
    except Exception as e:
        # The boto3 APIs don't let you read zero bytes from an S3 object, but
        # they always return bytestrings, so in this case we know what to
        # start with.  The class is matched by name so that botocore does
        # not have to be imported here.
        if e.__class__.__name__ == 'IncompleteReadError':
            running = b''
        else:
            raise

    while True:
        new_data = f.read(chunk_size)

        # When a call to read() returns nothing, we're at the end of the
        # file.  Whatever is left over (possibly nothing) is the last chunk.
        if not new_data:
            yield running
            return

        # Otherwise, update the running stream and look for instances of
        # the delimiter.  Remember we might have read more than one delimiter
        # since the last time we checked.
        running += new_data
        while True:
            # partition() finds and splits in one pass; an empty ``found``
            # means the delimiter is not in the buffer yet.
            head, found, tail = running.partition(delimiter)
            if not found:
                break
            yield head + found
            running = tail