minor refactoring of compression.py, add documentation
mpenkov committed May 25, 2021
1 parent 56cee00 commit 930c897
Showing 5 changed files with 82 additions and 47 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,10 @@
# Unreleased

This release introduces a new top-level parameter: `compression`.
It controls compression behavior and partially overlaps with the old `ignore_ext` parameter.
For details, see the README.rst file.
You may continue to use the `ignore_ext` parameter for now, but it will be deprecated in the next major release.

- Add warning for recently deprecated s3 parameters (PR [#618](https://github.com/RaRe-Technologies/smart_open/pull/618), [@mpenkov](https://github.com/mpenkov))
- Add new top-level compression parameter (PR [#609](https://github.com/RaRe-Technologies/smart_open/pull/609), [@dmcguire81](https://github.com/dmcguire81))
- Drop mock dependency; standardize on unittest.mock (PR [#621](https://github.com/RaRe-Technologies/smart_open/pull/621), [@musicinmybrain](https://github.com/musicinmybrain))
56 changes: 46 additions & 10 deletions README.rst
@@ -63,6 +63,10 @@ How?
    ...     with open('smart_open/tests/test_data/1984.txt.bz2', 'w') as fout:
    ...         for line in fin:
    ...             fout.write(line)
    74
    80
    78
    79
    >>> # can use any IOBase operations, like seek
    >>> with open('s3://commoncrawl/robots.txt', 'rb') as fin:
@@ -212,12 +216,46 @@ For the sake of simplicity, the examples below assume you have all the dependenc
    with open('azure://mycontainer/my_file.txt', 'wb', transport_params=transport_params) as fout:
        fout.write(b'hello world')

Supported Compression Formats
-----------------------------
Compression Handling
--------------------

``smart_open`` allows reading and writing gzip and bzip2 files.
They are transparently handled over HTTP, S3, and other protocols, too, based on the extension of the file being opened.
You can easily add support for other file extensions and compression formats.
The top-level ``compression`` parameter controls compression/decompression behavior when reading and writing.
The supported values for this parameter are:

- ``infer_from_extension`` (default behavior)
- ``disable``
- ``.gz``
- ``.bz2``

By default, ``smart_open`` determines the compression algorithm to use based on the file extension.

.. code-block:: python

    >>> from smart_open import open, register_compressor
    >>> with open('smart_open/tests/test_data/1984.txt.gz') as fin:
    ...     print(fin.read(32))
    It was a bright cold day in Apri

You can override this behavior to either disable compression, or explicitly specify the algorithm to use.
To disable compression:

.. code-block:: python

    >>> from smart_open import open, register_compressor
    >>> with open('smart_open/tests/test_data/1984.txt.gz', 'rb', compression='disable') as fin:
    ...     print(fin.read(32))
    b'1234'

To specify the algorithm explicitly (e.g. for non-standard file extensions):

.. code-block:: python

    >>> from smart_open import open, register_compressor
    >>> with open('smart_open/tests/test_data/1984.txt.gzip', compression='.gz') as fin:
    ...     print(fin.read(32))
    It was a bright cold day in Apri

You can also easily add support for other file extensions and compression formats.
For example, to open xz-compressed files:

.. code-block:: python
@@ -230,17 +268,15 @@ For example, to open xz-compressed files:
    >>> register_compressor('.xz', _handle_xz)
    >>> with open('smart_open/tests/test_data/crime-and-punishment.txt.xz') as fin:
    ...     text = fin.read()
    >>> print(len(text))
    1696
    >>> with open('smart_open/tests/test_data/1984.txt.xz') as fin:
    ...     print(fin.read(32))
    It was a bright cold day in Apri
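
The ``_handle_xz`` callback referenced above is defined in the collapsed part of this hunk. A minimal sketch of such a handler, assuming the standard-library ``lzma`` module, could look like this:

.. code-block:: python

    import lzma

    def _handle_xz(file_obj, mode):
        # Wrap the underlying file object so reads and writes pass through xz (de)compression.
        return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ)
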
``lzma`` is in the standard library in Python 3.3 and greater.
For 2.7, use `backports.lzma`_.

.. _backports.lzma: https://pypi.org/project/backports.lzma/


Transport-specific Options
--------------------------

40 changes: 20 additions & 20 deletions smart_open/compression.py
@@ -11,13 +11,11 @@

logger = logging.getLogger(__name__)


_COMPRESSOR_REGISTRY = {}


NO_COMPRESSION = 'none'
NO_COMPRESSION = 'disable'
"""Use no compression. Read/write the data as-is."""
INFER_FROM_EXTENSION = 'extension'
INFER_FROM_EXTENSION = 'infer_from_extension'
"""Determine the compression to use from the file extension.
See get_supported_extensions().
@@ -29,7 +27,7 @@ def get_supported_compression_types():
    See the compression parameter to smart_open.open().
    """
    return [NO_COMPRESSION, INFER_FROM_EXTENSION] + [ext[1:] for ext in get_supported_extensions()]
    return [NO_COMPRESSION, INFER_FROM_EXTENSION] + get_supported_extensions()
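
With this change, extensions keep their leading dot in the returned list. A quick interactive check (a sketch; it assumes only the built-in gzip and bzip2 handlers are registered) would look roughly like:

.. code-block:: python

    from smart_open.compression import get_supported_compression_types

    # Expected to contain 'disable', 'infer_from_extension' and the dotted
    # extensions such as '.gz' and '.bz2'; the extension order may vary.
    print(get_supported_compression_types())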


def get_supported_extensions():
@@ -79,34 +77,36 @@ def _handle_gzip(file_obj, mode):
    return gzip.GzipFile(fileobj=file_obj, mode=mode)


def compression_wrapper(file_obj, mode, filename=None):
def compression_wrapper(file_obj, mode, compression):
"""
This function will wrap the file_obj with an appropriate
[de]compression mechanism based on the extension of the filename.
[de]compression mechanism based on the specified extension.
file_obj must either be a filehandle object, or a class which behaves
like one. It must have a .name attribute unless ``filename`` is given.
like one. It must have a .name attribute.
If the filename extension isn't recognized, will simply return the original
file_obj.
"""
    try:
        if filename is None:
            filename = file_obj.name
        _, ext = os.path.splitext(filename)
    except (AttributeError, TypeError):
        logger.warning(
            'unable to transparently decompress %r because it '
            'seems to lack a string-like .name', file_obj
        )
    if compression == NO_COMPRESSION:
        return file_obj

    if ext in _COMPRESSOR_REGISTRY and mode.endswith('+'):
    elif compression == INFER_FROM_EXTENSION:
        try:
            filename = file_obj.name
        except (AttributeError, TypeError):
            logger.warning(
                'unable to transparently decompress %r because it '
                'seems to lack a string-like .name', file_obj
            )
            return file_obj
        _, compression = os.path.splitext(filename)

    if compression in _COMPRESSOR_REGISTRY and mode.endswith('+'):
        raise ValueError('transparent (de)compression unsupported for mode %r' % mode)

    try:
        callback = _COMPRESSOR_REGISTRY[ext]
        callback = _COMPRESSOR_REGISTRY[compression]
    except KeyError:
        return file_obj
    else:
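
A usage sketch of the rewritten dispatch (not part of the commit; the ``so_compression`` alias mirrors the import used in ``smart_open_lib.py`` below, and the file name is hypothetical):

.. code-block:: python

    from smart_open import compression as so_compression

    with open('example.txt.gz', 'rb') as raw:
        # 'disable' would return raw untouched, an explicit '.gz' would force the gzip
        # handler, and 'infer_from_extension' picks the handler from raw.name:
        wrapped = so_compression.compression_wrapper(raw, 'rb', so_compression.INFER_FROM_EXTENSION)
        print(wrapped.read(32))  # decompressed bytes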
8 changes: 1 addition & 7 deletions smart_open/smart_open_lib.py
@@ -233,13 +233,7 @@ def open(
        raise NotImplementedError(ve.args[0])

    binary = _open_binary_stream(uri, binary_mode, transport_params)
    if compression == so_compression.NO_COMPRESSION:
        decompressed = binary
    elif compression == so_compression.INFER_FROM_EXTENSION:
        decompressed = so_compression.compression_wrapper(binary, binary_mode)
    else:
        faked_extension = f"{binary.name}.{compression.lower()}"
        decompressed = so_compression.compression_wrapper(binary, binary_mode, filename=faked_extension)
    decompressed = so_compression.compression_wrapper(binary, binary_mode, compression)

    if 'b' not in mode or explicit_encoding is not None:
        decoded = _encoding_wrapper(
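
For callers, nothing changes except that the branching now lives in ``compression_wrapper``. A hedged usage sketch of the public API (hypothetical file names):

.. code-block:: python

    from smart_open import open

    # Force gzip decompression even though the extension does not suggest it.
    with open('example.dat', 'rb', compression='.gz') as fin:
        data = fin.read()

    # Disable decompression and read the raw, still-compressed bytes.
    with open('example.txt.gz', 'rb', compression='disable') as fin:
        raw = fin.read()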
20 changes: 10 additions & 10 deletions smart_open/tests/test_smart_open.py
@@ -1876,8 +1876,8 @@ def test_gzip_compress_sanity(self):
    @parameterizedtestcase.ParameterizedTestCase.parameterize(
        ("url", "_compression"),
        [
            ("s3://bucket/gzipped", "gz"),
            ("s3://bucket/bzipped", "bz2"),
            ("s3://bucket/gzipped", ".gz"),
            ("s3://bucket/bzipped", ".bz2"),
        ]
    )
    def test_read_explicit(self, url, _compression):
@@ -1888,8 +1888,8 @@ def test_read_explicit(self, url, _compression):
    @parameterizedtestcase.ParameterizedTestCase.parameterize(
        ("_compression", "expected"),
        [
            ("gz", gzip_compress(_DECOMPRESSED_DATA, 'key')),
            ("bz2", bz2.compress(_DECOMPRESSED_DATA)),
            (".gz", gzip_compress(_DECOMPRESSED_DATA, 'key')),
            (".bz2", bz2.compress(_DECOMPRESSED_DATA)),
        ],
    )
    def test_write_explicit(self, _compression, expected):
@@ -1903,8 +1903,8 @@ def test_write_explicit(self, _compression, expected):
    @parameterizedtestcase.ParameterizedTestCase.parameterize(
        ("url", "_compression", "expected"),
        [
            ("s3://bucket/key.gz", "gz", gzip_compress(_DECOMPRESSED_DATA, 'key.gz')),
            ("s3://bucket/key.bz2", "bz2", bz2.compress(_DECOMPRESSED_DATA)),
            ("s3://bucket/key.gz", ".gz", gzip_compress(_DECOMPRESSED_DATA, 'key.gz')),
            ("s3://bucket/key.bz2", ".bz2", bz2.compress(_DECOMPRESSED_DATA)),
        ],
    )
    def test_write_implicit(self, url, _compression, expected):
@@ -1918,8 +1918,8 @@ def test_write_implicit(self, url, _compression, expected):
    @parameterizedtestcase.ParameterizedTestCase.parameterize(
        ("url", "_compression", "expected"),
        [
            ("s3://bucket/key.gz", "gz", gzip_compress(_DECOMPRESSED_DATA, 'key.gz')),
            ("s3://bucket/key.bz2", "bz2", bz2.compress(_DECOMPRESSED_DATA)),
            ("s3://bucket/key.gz", ".gz", gzip_compress(_DECOMPRESSED_DATA, 'key.gz')),
            ("s3://bucket/key.bz2", ".bz2", bz2.compress(_DECOMPRESSED_DATA)),
        ],
    )
    def test_ignore_ext(self, url, _compression, expected):
@@ -1946,8 +1946,8 @@ def test_ignore_ext(self, url, _compression, expected):
                dict(compression=INFER_FROM_EXTENSION, ignore_ext=True),
                ValueError,
            ),
            ("", dict(compression="gz", ignore_ext=True), ValueError),
            ("", dict(compression="bz2", ignore_ext=True), ValueError),
            ("", dict(compression=".gz", ignore_ext=True), ValueError),
            ("", dict(compression=".bz2", ignore_ext=True), ValueError),
        ],
    )
    def test_compression_invalid(self, extension, kwargs, error):
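
The last parameterization above asserts that the new ``compression`` parameter cannot be combined with the deprecated ``ignore_ext`` flag. A standalone sketch of the same check (a hypothetical pytest test, not part of this commit):

.. code-block:: python

    import pytest
    import smart_open

    def test_compression_conflicts_with_ignore_ext(tmp_path):
        path = tmp_path / 'example.txt.gz'
        path.write_bytes(b'not really gzip')
        # Mixing the new parameter with ignore_ext=True is rejected with ValueError.
        with pytest.raises(ValueError):
            smart_open.open(str(path), 'rb', compression='.gz', ignore_ext=True)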
