From fac9f5dc426ad8a6650a41d06d64d8400e8a9cf7 Mon Sep 17 00:00:00 2001 From: Alexander Shorin Date: Sat, 14 Feb 2015 16:02:54 +0300 Subject: [PATCH 1/5] Fix unfinished code for handling extended params in content disposition --- aiohttp/multipart.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/aiohttp/multipart.py b/aiohttp/multipart.py index c4da1802dd2..ea2575bd2ca 100644 --- a/aiohttp/multipart.py +++ b/aiohttp/multipart.py @@ -53,8 +53,7 @@ def is_quoted(string): return string[0] == string[-1] == '"' def is_rfc5987(string): - # this isn't very correct - return "''" in string + return is_token(string) and string.count("'") == 2 def is_extended_param(string): return string.endswith('*') @@ -103,20 +102,12 @@ def unescape(text, *, chars=''.join(map(re.escape, CHAR))): continue elif is_extended_param(key): - if is_quoted(value): - warnings.warn(BadContentDispositionParam(item)) - continue - elif is_rfc5987(value): + if is_rfc5987(value): encoding, _, value = value.split("'", 2) encoding = encoding or 'utf-8' - elif "'": - warnings.warn(BadContentDispositionParam(item)) - continue - elif not is_token(value): + else: warnings.warn(BadContentDispositionParam(item)) continue - else: - encoding = 'utf-8' try: value = unquote(value, encoding, 'strict') From 0783d154db85202573dde7730cd9e7fa430490e2 Mon Sep 17 00:00:00 2001 From: Alexander Shorin Date: Sat, 14 Feb 2015 16:36:00 +0300 Subject: [PATCH 2/5] Resolve body part content type before content length This will allow us to apply more smart rules on determining content length. 
--- aiohttp/multipart.py | 11 +++++------ tests/test_multipart.py | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/aiohttp/multipart.py b/aiohttp/multipart.py index ea2575bd2ca..fdad53d8e48 100644 --- a/aiohttp/multipart.py +++ b/aiohttp/multipart.py @@ -560,17 +560,16 @@ def __init__(self, obj, headers=None, *, chunk_size=8192): } def _fill_headers_with_defaults(self): - """Updates part headers by """ - if CONTENT_LENGTH not in self.headers: - content_length = self._guess_content_length(self.obj) - if content_length is not None: - self.headers[CONTENT_LENGTH] = str(content_length) - if CONTENT_TYPE not in self.headers: content_type = self._guess_content_type(self.obj) if content_type is not None: self.headers[CONTENT_TYPE] = content_type + if CONTENT_LENGTH not in self.headers: + content_length = self._guess_content_length(self.obj) + if content_length is not None: + self.headers[CONTENT_LENGTH] = str(content_length) + if CONTENT_DISPOSITION not in self.headers: filename = self._guess_filename(self.obj) if filename is not None: diff --git a/tests/test_multipart.py b/tests/test_multipart.py index e16072cdda3..4681e1f02bd 100644 --- a/tests/test_multipart.py +++ b/tests/test_multipart.py @@ -699,8 +699,8 @@ def test_serialize_with_content_encoding_identity(self): thing, {CONTENT_ENCODING: 'identity'}) stream = part.serialize() self.assertEqual(b'CONTENT-ENCODING: identity\r\n' - b'CONTENT-LENGTH: 16\r\n' - b'CONTENT-TYPE: application/octet-stream', + b'CONTENT-TYPE: application/octet-stream\r\n' + b'CONTENT-LENGTH: 16', next(stream)) self.assertEqual(b'\r\n\r\n', next(stream)) From 29d672df825bdbf6f59aaa79346fa2ca521d708d Mon Sep 17 00:00:00 2001 From: Alexander Shorin Date: Sat, 14 Feb 2015 16:50:43 +0300 Subject: [PATCH 3/5] Extract body part payload serialization into own method --- aiohttp/multipart.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/aiohttp/multipart.py b/aiohttp/multipart.py index 
fdad53d8e48..a3442528388 100644 --- a/aiohttp/multipart.py +++ b/aiohttp/multipart.py @@ -625,21 +625,20 @@ def serialize(self): for item in self.headers.items() ) yield b'\r\n\r\n' + yield from self._maybe_encode_stream(self._serialize_obj()) + yield b'\r\n' + def _serialize_obj(self): obj = self.obj mtype, stype, *_ = parse_mimetype(self.headers.get(CONTENT_TYPE)) serializer = self._serialize_map.get((mtype, stype)) if serializer is not None: - stream = serializer(obj) - else: - for key in self._serialize_map: - if not isinstance(key, tuple) and isinstance(obj, key): - stream = self._serialize_map[key](obj) - break - else: - stream = self._serialize_default(obj) - yield from self._maybe_encode_stream(stream) - yield b'\r\n' + return serializer(obj) + + for key in self._serialize_map: + if not isinstance(key, tuple) and isinstance(obj, key): + return self._serialize_map[key](obj) + return self._serialize_default(obj) def _serialize_bytes(self, obj): yield obj From 85006722b76591b83edc5f8443b25864e8a8248d Mon Sep 17 00:00:00 2001 From: Alexander Shorin Date: Sun, 15 Feb 2015 15:36:08 +0300 Subject: [PATCH 4/5] Guess content length for strings and StringIO --- aiohttp/multipart.py | 14 +++++++++++--- tests/test_multipart.py | 10 ++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/aiohttp/multipart.py b/aiohttp/multipart.py index a3442528388..78898108818 100644 --- a/aiohttp/multipart.py +++ b/aiohttp/multipart.py @@ -578,12 +578,20 @@ def _fill_headers_with_defaults(self): def _guess_content_length(self, obj): if isinstance(obj, bytes): return len(obj) + elif isinstance(obj, str): + *_, params = parse_mimetype(self.headers.get(CONTENT_TYPE)) + charset = params.get('charset', 'us-ascii') + return len(obj.encode(charset)) + elif isinstance(obj, io.StringIO): + *_, params = parse_mimetype(self.headers.get(CONTENT_TYPE)) + charset = params.get('charset', 'us-ascii') + return len(obj.getvalue().encode(charset)) - obj.tell() + elif isinstance(obj, 
io.BytesIO): + return len(obj.getvalue()) - obj.tell() elif isinstance(obj, io.IOBase): try: return os.fstat(obj.fileno()).st_size - obj.tell() except (AttributeError, OSError): - if isinstance(obj, io.BytesIO): - return len(obj.getvalue()) - obj.tell() return None else: return None @@ -592,7 +600,7 @@ def _guess_content_type(self, obj, default='application/octet-stream'): if hasattr(obj, 'name'): name = getattr(obj, 'name') return mimetypes.guess_type(name)[0] - elif isinstance(obj, str): + elif isinstance(obj, (str, io.StringIO)): return 'text/plain; charset=utf-8' else: return default diff --git a/tests/test_multipart.py b/tests/test_multipart.py index 4681e1f02bd..73b3f052f2e 100644 --- a/tests/test_multipart.py +++ b/tests/test_multipart.py @@ -533,12 +533,17 @@ def setUp(self): self.part = aiohttp.multipart.BodyPartWriter(b'') def test_guess_content_length(self): + self.part.headers[CONTENT_TYPE] = 'text/plain; charset=utf-8' self.assertIsNone(self.part._guess_content_length({})) self.assertIsNone(self.part._guess_content_length(object())) self.assertEqual(3, self.part._guess_content_length(io.BytesIO(b'foo'))) - self.assertIsNone(self.part._guess_content_length(io.StringIO('foo'))) + self.assertEqual(3, + self.part._guess_content_length(io.StringIO('foo'))) + self.assertEqual(6, + self.part._guess_content_length(io.StringIO('мяу'))) self.assertEqual(3, self.part._guess_content_length(b'bar')) + self.assertEqual(12, self.part._guess_content_length('пассед')) with open(__file__, 'rb') as f: self.assertEqual(os.fstat(f.fileno()).st_size, self.part._guess_content_length(f)) @@ -644,7 +649,8 @@ def test_serialize_multipart(self): multipart.append_json({'test': 'passed'}) self.assertEqual( [b'--:\r\n', - b'CONTENT-TYPE: text/plain; charset=utf-8', + b'CONTENT-TYPE: text/plain; charset=utf-8\r\n' + b'CONTENT-LENGTH: 11', b'\r\n\r\n', b'foo-bar-baz', b'\r\n', From 51df89564924e0b6e3750e8289f76224699dd479 Mon Sep 17 00:00:00 2001 From: Alexander Shorin Date: Thu, 12 
Mar 2015 10:41:02 +0300 Subject: [PATCH 5/5] Add multipart docs --- docs/api.rst | 12 +- docs/client.rst | 1 + docs/index.rst | 1 + docs/multipart.rst | 331 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 344 insertions(+), 1 deletion(-) create mode 100644 docs/multipart.rst diff --git a/docs/api.rst b/docs/api.rst index 2a72415f8ae..859a83ef744 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,8 +1,10 @@ +.. _aiohttp-api: + Helpers API =========== All public names from submodules ``client``, ``connector``, -``errors``, ``parsers``, ``protocol``, ``server``, ``utils``, +``errors``, ``multipart``, ``parsers``, ``protocol``, ``server``, ``utils``, ``websocket`` and ``wsgi`` are exported into ``aiohttp`` namespace. @@ -38,6 +40,14 @@ aiohttp.helpers module :undoc-members: :show-inheritance: +aiohttp.multipart module +------------------------ + +.. automodule:: aiohttp.multipart + :members: + :undoc-members: + :show-inheritance: + aiohttp.parsers module ---------------------- diff --git a/docs/client.rst b/docs/client.rst index ffb26db810d..90d5708c698 100644 --- a/docs/client.rst +++ b/docs/client.rst @@ -251,6 +251,7 @@ If you pass file object as data parameter, aiohttp will stream it to server automatically. Check :class:`aiohttp.stream.StreamReader` for supported format information. +.. seealso:: :ref:`aiohttp-multipart` Streaming uploads ----------------- diff --git a/docs/index.rst b/docs/index.rst index 86d80a6b746..344de72b3b8 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -129,6 +129,7 @@ Contents: web_reference server multidict + multipart api contributing changes diff --git a/docs/multipart.rst b/docs/multipart.rst new file mode 100644 index 00000000000..e3aab8503f3 --- /dev/null +++ b/docs/multipart.rst @@ -0,0 +1,331 @@ +.. highlight:: python + +.. module:: aiohttp.multipart + +.. _aiohttp-multipart: + +Working with Multipart +====================== + +`aiohttp` supports full featured multipart reader and writer. 
Both are designed +with streaming processing in mind to avoid an unwanted footprint which may be +significant if you're dealing with large payloads, but this also means that +most I/O operations can only be executed a single time. + +Reading Multipart Responses +--------------------------- + +Assume you made a request, as usual, and want to process the response +multipart data:: + + >>> resp = yield from aiohttp.request(...) + +First, you need to wrap the response with a +:meth:`MultipartReader.from_response`. This keeps the implementation of +:class:`MultipartReader` separated from the response and connection routines, which +makes it more portable:: + + >>> reader = aiohttp.MultipartReader.from_response(resp) + +Let's assume that with this response you received a JSON document and multiple +files for it, but you don't need all of them, just a specific one. + +So first you need to enter a loop where the multipart body will be processed:: + + >>> metadata = None + >>> filedata = None + >>> while True: + ... part = yield from reader.next() + +The returned type depends on what the next part is: if it's a simple body part +then you'll get a :class:`BodyPartReader` instance here; otherwise, it will +be another :class:`MultipartReader` instance for the nested multipart. Remember +that the multipart format is recursive and supports multiple levels of nested body +parts. When there are no more parts left to fetch, a ``None`` value will be +returned - that's our signal to break the loop:: + + ... if part is None: + ... break + +Both :class:`BodyPartReader` and :class:`MultipartReader` provide access to +body part headers: this allows you to filter parts by their attributes:: + + ... if part.headers[aiohttp.hdrs.CONTENT_TYPE] == 'application/json': + ... metadata = yield from part.json() + ... continue + +Neither :class:`BodyPartReader` nor :class:`MultipartReader` instances read +the whole body part data without being explicitly asked to. 
:class:`BodyPartReader` +provides a set of helpers to fetch popular content types in a friendly way: + +- :meth:`BodyPartReader.text` for plaintext data; +- :meth:`BodyPartReader.json` for JSON; +- :meth:`BodyPartReader.form` for `application/x-www-form-urlencoded` + +Each of these helpers automagically recognizes if content is compressed by +using `gzip` or `deflate` encoding (while it respects the `identity` one), or if +the transfer encoding is base64 or `quoted-printable` - in each case the result +will get automagically decoded. But in case you need access to the raw binary +data as it is, there are :meth:`BodyPartReader.read` and +:meth:`BodyPartReader.read_chunk` coroutine methods as well to read the raw binary +data all in a single shot or by chunks respectively. + +When you have to deal with multipart files, the :attr:`BodyPartReader.filename` +property comes to the aid. It's a very smart helper which handles the +`Content-Disposition` header right and extracts the right filename attribute +from it:: + + ... if part.filename != 'secret.txt': + ... continue + +If the current body part doesn't match your expectation and you want to skip it +- just continue the loop to start its next iteration. Here the magic happens. +Before fetching the next body part, ``yield from reader.next()`` ensures that the previous +one was read completely. If it wasn't even started to be read, all of its content +is sent to the void in order to fetch the next part. So you don't have to care +about cleanup routines while you're within the loop. + +Once you've found the part for the file you searched for, just read it. Let's +handle it as it is without applying any decoding magic:: + + ... filedata = yield from part.read(decode=False) + +Later you may decide to decode the data. It's still simple and possible +to do:: + + ... filedata = part.decode(filedata) + +Once you are done with multipart processing, just break the loop:: + + ... break + +And release the connection to not let it hold a response in the middle of the data:: + + ... 
yield from resp.release() # or yield from reader.release() + + +Sending Multipart Requests +-------------------------- + +:class:`MultipartWriter` provides an interface to build a multipart payload from +Python data and serialize it into a chunked binary stream. Since the multipart +format is recursive and supports deep nesting, you can use a ``with`` statement +to design your multipart data closer to how it will be:: + + >>> with aiohttp.MultipartWriter('mixed') as mpwriter: + ... ... + ... with aiohttp.MultipartWriter('related') as subwriter: + ... ... + ... mpwriter.append(subwriter) + ... + ... with aiohttp.MultipartWriter('related') as subwriter: + ... ... + ... with aiohttp.MultipartWriter('related') as subsubwriter: + ... ... + ... subwriter.append(subsubwriter) + ... mpwriter.append(subwriter) + ... + ... with aiohttp.MultipartWriter('related') as subwriter: + ... ... + ... mpwriter.append(subwriter) + +The :meth:`MultipartWriter.append` is used to join new body parts into a +single stream. It accepts various inputs and determines which default headers +should be used for each. + +For text data the default `Content-Type` is :mimetype:`text/plain; charset=utf-8`:: + + ... mpwriter.append('hello') + +For binary data :mimetype:`application/octet-stream` is used:: + + ... mpwriter.append(b'aiohttp') + +You can always override these defaults by passing your own headers with the second +argument:: + + ... mpwriter.append(io.BytesIO(b'GIF89a...'), + {'CONTENT-TYPE': 'image/gif'}) + +For file objects `Content-Type` will be determined by using Python's +`mimetypes`_ module and additionally the `Content-Disposition` header will include +the file's basename:: + + ... part = root.append(open(__file__, 'rb')) + +If you want to send a file with a different name, just handle the +:class:`BodyPartWriter` instance which :meth:`MultipartWriter.append` always +returns and set `Content-Disposition` explicitly by using the +:meth:`BodyPartWriter.set_content_disposition` helper:: + + ... 
part.set_content_disposition('attachment', filename='secret.txt') + +Additionally, you may set other headers here:: + + ... part.headers[aiohttp.hdrs.CONTENT_ID] = 'X-12345' + +If you set `Content-Encoding`, it will be automatically applied to the +data on serialization (see below):: + + ... part.headers[aiohttp.hdrs.CONTENT_ENCODING] = 'gzip' + +There are also :meth:`MultipartWriter.append_json` and +:meth:`MultipartWriter.append_form` helpers which are useful to work with JSON +and form urlencoded data, so you don't have to encode it manually every time:: + + ... mpwriter.append_json({'test': 'passed'}) + ... mpwriter.append_form([('key', 'value')]) + +When it's done, to make a request just pass the root :class:`MultipartWriter` +instance as the :func:`aiohttp.client.request` `data` argument:: + + >>> yield from aiohttp.request('POST', 'http://example.com', data=mpwriter) + +Behind the scenes :meth:`MultipartWriter.serialize` will yield every +part by chunks and if a body part has `Content-Encoding` or `Content-Transfer-Encoding` +they will be applied to the streamed content. + +Please note that on :meth:`MultipartWriter.serialize` all the file objects +will be read to the end and there is no way to repeat the request without rewinding +their pointers to the start. + +Hacking Multipart +----------------- + +The Internet is full of terror and sometimes you may find a server which +implements multipart support in strange ways where an obvious solution +doesn't work. + +For instance, if the server uses `cgi.FieldStorage`_ then you have to ensure that +no body part contains a `Content-Length` header:: + + for part in mpwriter: + part.headers.pop(aiohttp.hdrs.CONTENT_LENGTH, None) + +On the other hand, some servers may require you to specify `Content-Length` for the +whole multipart request. `aiohttp` doesn't do that since it sends multipart +using chunked transfer encoding by default. 
To overcome this issue, you have +to serialize a :class:`MultipartWriter` on your own in order to calculate its +size:: + + body = b''.join(mpwriter.serialize()) + yield from aiohttp.request('POST', 'http://example.com', + data=body, headers=mpwriter.headers) + +Sometimes the server response may not be well structured: it may or may not +contain nested parts. For instance, we are requesting a resource which returns +JSON documents with files attached to them. If a document has any attachments, +they are returned as a nested multipart part. If it has none, it comes as a plain +body part:: + + CONTENT-TYPE: multipart/mixed; boundary=--: + + --: + CONTENT-TYPE: application/json + + {"_id": "foo"} + --: + CONTENT-TYPE: multipart/related; boundary=----: + + ----: + CONTENT-TYPE: application/json + + {"_id": "bar"} + ----: + CONTENT-TYPE: text/plain + CONTENT-DISPOSITION: attachment; filename=bar.txt + + bar! bar! bar! + ----:-- + --: + CONTENT-TYPE: application/json + + {"_id": "boo"} + --: + CONTENT-TYPE: multipart/related; boundary=----: + + ----: + CONTENT-TYPE: application/json + + {"_id": "baz"} + ----: + CONTENT-TYPE: text/plain + CONTENT-DISPOSITION: attachment; filename=baz.txt + + baz! baz! baz! 
+ ----:-- + --:-- + +Reading such kind of data in a single stream is possible, but not very clean:: + + result = [] + while True: + part = yield from reader.next() + + if part is None: + break + + if isinstance(part, aiohttp.MultipartReader): + # Fetching files + while True: + filepart = yield from part.next() + if filepart is None: + break + result[-1].append((yield from filepart.read())) + + else: + # Fetching document + result.append([(yield from part.json())]) + +Let's hack the reader so that it returns pairs of a document and a reader of the +related files on each iteration:: + + class PairsMultipartReader(aiohttp.MultipartReader): + + # keep reference on the original reader + multipart_reader_cls = aiohttp.MultipartReader + + @asyncio.coroutine + def next(self): + """Emits a tuple of document object (:class:`dict`) and multipart + reader of the following attachments (if any). + + :rtype: tuple + """ + reader = yield from super().next() + + if self._at_eof: + return None, None + + if isinstance(reader, self.multipart_reader_cls): + part = yield from reader.next() + doc = yield from part.json() + else: + doc = yield from reader.json() + + return doc, reader + +And this gives us a much cleaner solution:: + + reader = PairsMultipartReader.from_response(resp) + result = [] + while True: + doc, files_reader = yield from reader.next() + + if doc is None: + break + + files = [] + while True: + filepart = yield from files_reader.next() + if filepart is None: + break + files.append((yield from filepart.read())) + + result.append((doc, files)) + +.. seealso:: Multipart API in :ref:`aiohttp-api` section. + + +.. _cgi.FieldStorage: https://docs.python.org/3.4/library/cgi.html +.. _mimetypes: https://docs.python.org/3.4/library/mimetypes.html