minor refactoring of compression.py, add documentation
mpenkov committed May 25, 2021
1 parent 56cee00 commit 930c897
Showing 5 changed files with 82 additions and 47 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,10 @@
# Unreleased

This release introduces a new top-level parameter: `compression`.
It controls compression behavior and partially overlaps with the old `ignore_ext` parameter.
For details, see the README.rst file.
You may continue to use the `ignore_ext` parameter for now, but it will be deprecated in the next major release.

- Add warning for recently deprecated s3 parameters (PR [#618](https://github.com/RaRe-Technologies/smart_open/pull/618), [@mpenkov](https://github.com/mpenkov))
- Add new top-level compression parameter (PR [#609](https://github.com/RaRe-Technologies/smart_open/pull/609), [@dmcguire81](https://github.com/dmcguire81))
- Drop mock dependency; standardize on unittest.mock (PR [#621](https://github.com/RaRe-Technologies/smart_open/pull/621), [@musicinmybrain](https://github.com/musicinmybrain))
56 changes: 46 additions & 10 deletions README.rst
@@ -63,6 +63,10 @@ How?
    ...     with open('smart_open/tests/test_data/1984.txt.bz2', 'w') as fout:
    ...         for line in fin:
    ...             fout.write(line)
    74
    80
    78
    79
    >>> # can use any IOBase operations, like seek
    >>> with open('s3://commoncrawl/robots.txt', 'rb') as fin:
@@ -212,12 +216,46 @@ For the sake of simplicity, the examples below assume you have all the dependenc
    with open('azure://mycontainer/my_file.txt', 'wb', transport_params=transport_params) as fout:
        fout.write(b'hello world')

Supported Compression Formats
-----------------------------
Compression Handling
--------------------

``smart_open`` allows reading and writing gzip and bzip2 files.
They are transparently handled over HTTP, S3, and other protocols, too, based on the extension of the file being opened.
You can easily add support for other file extensions and compression formats.
The top-level ``compression`` parameter controls compression/decompression behavior when reading and writing.
The supported values for this parameter are:

- ``infer_from_extension`` (default behavior)
- ``disable``
- ``.gz``
- ``.bz2``

By default, ``smart_open`` determines the compression algorithm to use based on the file extension.

.. code-block:: python

    >>> from smart_open import open, register_compressor
    >>> with open('smart_open/tests/test_data/1984.txt.gz') as fin:
    ...     print(fin.read(32))
    It was a bright cold day in Apri

You can override this behavior to either disable compression, or explicitly specify the algorithm to use.
To disable compression:

.. code-block:: python

    >>> from smart_open import open, register_compressor
    >>> with open('smart_open/tests/test_data/1984.txt.gz', 'rb', compression='disable') as fin:
    ...     print(fin.read(32))
    b'1234'

To specify the algorithm explicitly (e.g. for non-standard file extensions):

.. code-block:: python

    >>> from smart_open import open, register_compressor
    >>> with open('smart_open/tests/test_data/1984.txt.gzip', compression='.gz') as fin:
    ...     print(fin.read(32))
    It was a bright cold day in Apri

You can also easily add support for other file extensions and compression formats.
For example, to open xz-compressed files:

.. code-block:: python
@@ -230,17 +268,15 @@ For example, to open xz-compressed files:
    >>> register_compressor('.xz', _handle_xz)
    >>> with open('smart_open/tests/test_data/crime-and-punishment.txt.xz') as fin:
    ...     text = fin.read()
    >>> print(len(text))
    1696
    >>> with open('smart_open/tests/test_data/1984.txt.xz') as fin:
    ...     print(fin.read(32))
    It was a bright cold day in Apri
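
The ``_handle_xz`` callback referenced above is defined in the collapsed part of this hunk. A minimal sketch of such a handler, assuming the standard-library ``lzma`` module, could look like this:

.. code-block:: python

    import lzma

    def _handle_xz(file_obj, mode):
        # Wrap the underlying file object so reads and writes pass through xz (de)compression.
        return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ)
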
``lzma`` is in the standard library in Python 3.3 and greater.
For 2.7, use `backports.lzma`_.

.. _backports.lzma: https://pypi.org/project/backports.lzma/


Transport-specific Options
--------------------------

40 changes: 20 additions & 20 deletions smart_open/compression.py
@@ -11,13 +11,11 @@

logger = logging.getLogger(__name__)


_COMPRESSOR_REGISTRY = {}


NO_COMPRESSION = 'none'
NO_COMPRESSION = 'disable'
"""Use no compression. Read/write the data as-is."""
INFER_FROM_EXTENSION = 'extension'
INFER_FROM_EXTENSION = 'infer_from_extension'
"""Determine the compression to use from the file extension.
See get_supported_extensions().
@@ -29,7 +27,7 @@ def get_supported_compression_types():
    See the compression parameter to smart_open.open().
    """
    return [NO_COMPRESSION, INFER_FROM_EXTENSION] + [ext[1:] for ext in get_supported_extensions()]
    return [NO_COMPRESSION, INFER_FROM_EXTENSION] + get_supported_extensions()
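
With this change, extensions keep their leading dot in the returned list. A quick interactive check (a sketch; it assumes only the built-in gzip and bzip2 handlers are registered) would look roughly like:

.. code-block:: python

    from smart_open.compression import get_supported_compression_types

    # Expected to contain 'disable', 'infer_from_extension' and the dotted
    # extensions such as '.gz' and '.bz2'; the extension order may vary.
    print(get_supported_compression_types())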


def get_supported_extensions():
@@ -79,34 +77,36 @@ def _handle_gzip(file_obj, mode):
    return gzip.GzipFile(fileobj=file_obj, mode=mode)


def compression_wrapper(file_obj, mode, filename=None):
def compression_wrapper(file_obj, mode, compression):
"""
This function will wrap the file_obj with an appropriate
[de]compression mechanism based on the extension of the filename.
[de]compression mechanism based on the specified extension.
file_obj must either be a filehandle object, or a class which behaves
like one. It must have a .name attribute unless ``filename`` is given.
like one. It must have a .name attribute.
If the filename extension isn't recognized, will simply return the original
file_obj.
"""
    try:
        if filename is None:
            filename = file_obj.name
        _, ext = os.path.splitext(filename)
    except (AttributeError, TypeError):
        logger.warning(
            'unable to transparently decompress %r because it '
            'seems to lack a string-like .name', file_obj
        )
    if compression == NO_COMPRESSION:
        return file_obj

    if ext in _COMPRESSOR_REGISTRY and mode.endswith('+'):
    elif compression == INFER_FROM_EXTENSION:
        try:
            filename = file_obj.name
        except (AttributeError, TypeError):
            logger.warning(
                'unable to transparently decompress %r because it '
                'seems to lack a string-like .name', file_obj
            )
            return file_obj
        _, compression = os.path.splitext(filename)

    if compression in _COMPRESSOR_REGISTRY and mode.endswith('+'):
        raise ValueError('transparent (de)compression unsupported for mode %r' % mode)

    try:
        callback = _COMPRESSOR_REGISTRY[ext]
        callback = _COMPRESSOR_REGISTRY[compression]
    except KeyError:
        return file_obj
    else:
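
A usage sketch of the rewritten dispatch (not part of the commit; the ``so_compression`` alias mirrors the import used in ``smart_open_lib.py`` below, and the file name is hypothetical):

.. code-block:: python

    from smart_open import compression as so_compression

    with open('example.txt.gz', 'rb') as raw:
        # 'disable' would return raw untouched, an explicit '.gz' would force the gzip
        # handler, and 'infer_from_extension' picks the handler from raw.name:
        wrapped = so_compression.compression_wrapper(raw, 'rb', so_compression.INFER_FROM_EXTENSION)
        print(wrapped.read(32))  # decompressed bytes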
8 changes: 1 addition & 7 deletions smart_open/smart_open_lib.py
@@ -233,13 +233,7 @@ def open(
        raise NotImplementedError(ve.args[0])

    binary = _open_binary_stream(uri, binary_mode, transport_params)
    if compression == so_compression.NO_COMPRESSION:
        decompressed = binary
    elif compression == so_compression.INFER_FROM_EXTENSION:
        decompressed = so_compression.compression_wrapper(binary, binary_mode)
    else:
        faked_extension = f"{binary.name}.{compression.lower()}"
        decompressed = so_compression.compression_wrapper(binary, binary_mode, filename=faked_extension)
    decompressed = so_compression.compression_wrapper(binary, binary_mode, compression)

    if 'b' not in mode or explicit_encoding is not None:
        decoded = _encoding_wrapper(
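
For callers, nothing changes except that the branching now lives in ``compression_wrapper``. A hedged usage sketch of the public API (hypothetical file names):

.. code-block:: python

    from smart_open import open

    # Force gzip decompression even though the extension does not suggest it.
    with open('example.dat', 'rb', compression='.gz') as fin:
        data = fin.read()

    # Disable decompression and read the raw, still-compressed bytes.
    with open('example.txt.gz', 'rb', compression='disable') as fin:
        raw = fin.read()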
20 changes: 10 additions & 10 deletions smart_open/tests/test_smart_open.py
@@ -1876,8 +1876,8 @@ def test_gzip_compress_sanity(self):
    @parameterizedtestcase.ParameterizedTestCase.parameterize(
        ("url", "_compression"),
        [
            ("s3://bucket/gzipped", "gz"),
            ("s3://bucket/bzipped", "bz2"),
            ("s3://bucket/gzipped", ".gz"),
            ("s3://bucket/bzipped", ".bz2"),
        ]
    )
    def test_read_explicit(self, url, _compression):
@@ -1888,8 +1888,8 @@ def test_read_explicit(self, url, _compression):
    @parameterizedtestcase.ParameterizedTestCase.parameterize(
        ("_compression", "expected"),
        [
            ("gz", gzip_compress(_DECOMPRESSED_DATA, 'key')),
            ("bz2", bz2.compress(_DECOMPRESSED_DATA)),
            (".gz", gzip_compress(_DECOMPRESSED_DATA, 'key')),
            (".bz2", bz2.compress(_DECOMPRESSED_DATA)),
        ],
    )
    def test_write_explicit(self, _compression, expected):
@@ -1903,8 +1903,8 @@ def test_write_explicit(self, _compression, expected):
    @parameterizedtestcase.ParameterizedTestCase.parameterize(
        ("url", "_compression", "expected"),
        [
            ("s3://bucket/key.gz", "gz", gzip_compress(_DECOMPRESSED_DATA, 'key.gz')),
            ("s3://bucket/key.bz2", "bz2", bz2.compress(_DECOMPRESSED_DATA)),
            ("s3://bucket/key.gz", ".gz", gzip_compress(_DECOMPRESSED_DATA, 'key.gz')),
            ("s3://bucket/key.bz2", ".bz2", bz2.compress(_DECOMPRESSED_DATA)),
        ],
    )
    def test_write_implicit(self, url, _compression, expected):
@@ -1918,8 +1918,8 @@ def test_write_implicit(self, url, _compression, expected):
    @parameterizedtestcase.ParameterizedTestCase.parameterize(
        ("url", "_compression", "expected"),
        [
            ("s3://bucket/key.gz", "gz", gzip_compress(_DECOMPRESSED_DATA, 'key.gz')),
            ("s3://bucket/key.bz2", "bz2", bz2.compress(_DECOMPRESSED_DATA)),
            ("s3://bucket/key.gz", ".gz", gzip_compress(_DECOMPRESSED_DATA, 'key.gz')),
            ("s3://bucket/key.bz2", ".bz2", bz2.compress(_DECOMPRESSED_DATA)),
        ],
    )
    def test_ignore_ext(self, url, _compression, expected):
@@ -1946,8 +1946,8 @@ def test_ignore_ext(self, url, _compression, expected):
                dict(compression=INFER_FROM_EXTENSION, ignore_ext=True),
                ValueError,
            ),
            ("", dict(compression="gz", ignore_ext=True), ValueError),
            ("", dict(compression="bz2", ignore_ext=True), ValueError),
            ("", dict(compression=".gz", ignore_ext=True), ValueError),
            ("", dict(compression=".bz2", ignore_ext=True), ValueError),
        ],
    )
    def test_compression_invalid(self, extension, kwargs, error):
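
The last parameterization above asserts that the new ``compression`` parameter cannot be combined with the deprecated ``ignore_ext`` flag. A standalone sketch of the same check (a hypothetical pytest test, not part of this commit):

.. code-block:: python

    import pytest
    import smart_open

    def test_compression_conflicts_with_ignore_ext(tmp_path):
        path = tmp_path / 'example.txt.gz'
        path.write_bytes(b'not really gzip')
        # Mixing the new parameter with ignore_ext=True is rejected with ValueError.
        with pytest.raises(ValueError):
            smart_open.open(str(path), 'rb', compression='.gz', ignore_ext=True)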
