uktrade · frederikaalund · Apr 12, 2024 · michalc · May 6, 2024 · ferdnyc
diff --git a/docs/get-started.md b/docs/get-started.md
@@ -157,6 +157,21 @@ Each member file is compressed with a method that must be specified in client co
 In general, not all valid ZIP files are possible to be stream unzipped. However, all files generated by stream-zip are suitable for stream unzipping, for example by [stream-unzip](https://stream-unzip.docs.trade.gov.uk/).
 
 
+## Timestamps
+
+File timestamps (e.g., "modified at") have to fit into the ZIP file format.
+Therefore, stream-zip both rounds and clamps timestamps to make them fit the ZIP file format.
+
+If `extended_timestamps=True` (the default):
+
+ * Timestamps are clamped between 1970-1-1 and 2038-1-19 (both inclusive)
+ * Timestamps are rounded down with 1-second precision
+
+If `extended_timestamps=False`:
+
+ * Timestamps are clamped between 1980-1-1 and 2107-12-31 (both inclusive)
+ * Timestamps are rounded down with 2-second precision
+
 ## Limitations
 
 The `NO_COMPRESSION_32` and `NO_COMPRESSION_64` methods do not stream - they buffer the entire binary contents of the file in memory before output. They do this to calculate the length and CRC 32 to output them before the binary contents in the ZIP. This is required in order for ZIP to be stream unzippable.

diff --git a/stream_zip.py b/stream_zip.py
@@ -1,4 +1,5 @@
 from collections import deque
+from datetime import datetime
 from struct import Struct
 import asyncio
 import secrets
@@ -21,6 +22,17 @@
 _AUTO_UPGRADE_CENTRAL_DIRECTORY = object()
 _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY = object()
 
+_MS_DOS_DATE_BEGIN = datetime(1980, 1, 1)
+_MS_DOS_DATE_END = datetime(
+    # Max year since 1980 representable in a 7-bit unsigned integer
+    year=_MS_DOS_DATE_BEGIN.year + 2**7-1,
+    month=12,
+    day=31,
+    hour=23,
+    minute=59,
+    second=59,
+)
+
 def __NO_COMPRESSION_BUFFERED_32(offset, default_get_compressobj):
     return _NO_COMPRESSION_BUFFERED_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, None, None
 
@@ -612,19 +624,25 @@ def _no_compression_streamed_data(chunks, uncompressed_size, crc_32, maximum_siz
             name_encoded = name.encode('utf-8')
             _raise_if_beyond(len(name_encoded), maximum=0xffff, exception_class=NameLengthOverflowError)
 
+            # Remove time zone information (if any) during clamp
+            mod_datetime_ms_dos = min(max(modified_at.replace(tzinfo=None), _MS_DOS_DATE_BEGIN), _MS_DOS_DATE_END)
             mod_at_ms_dos = modified_at_struct.pack(
-                int(modified_at.second / 2) | \
-                (modified_at.minute << 5) | \
-                (modified_at.hour << 11),
-                modified_at.day | \
-                (modified_at.month << 5) | \
-                (modified_at.year - 1980) << 9,
+                (mod_datetime_ms_dos.second // 2) | \
+                (mod_datetime_ms_dos.minute << 5) | \
+                (mod_datetime_ms_dos.hour << 11),
+                mod_datetime_ms_dos.day | \
+                (mod_datetime_ms_dos.month << 5) | \
+                (mod_datetime_ms_dos.year - 1980) << 9,
             )
             mod_at_unix_extra = mod_at_unix_extra_struct.pack(
                 mod_at_unix_extra_signature,
                 5,        # Size of extra
                 b'\x01',  # Only modification time (as opposed to also other times)
-                int(modified_at.timestamp()),
+                # Clamp timestamp to fit the field size (4-byte signed integer)
+                # In principle, the lower limit should be `-2**31` but we set it
+                # to zero to avoid issues with common zip utilities like `unzip`.
+                # Said tools do not correctly interpret negative timestamps.
+                max(min(int(modified_at.timestamp()), 2**31 - 1), 0),
             ) if extended_timestamps else b''
             external_attr = \
                 (mode << 16) | \

diff --git a/test_stream_zip.py b/test_stream_zip.py
@@ -989,6 +989,25 @@ def test_bsdio_empty_directory(method, trailing_slash, mode, expected_mode):
 @pytest.mark.parametrize(
     "modified_at,expected_time",
     [
+        # Datetimes near the 1980 epoch used in the MS-DOS header.
+        # Note the 2-second precision and the cutoff of everything before the epoch.
+        (datetime(1979, 12, 31, 23, 59, 58), (1980, 1, 1, 0, 0, 0)),
+        (datetime(1979, 12, 31, 23, 59, 59), (1980, 1, 1, 0, 0, 0)),
+        (datetime(1980,  1,  1,  0,  0,  0),  (1980, 1, 1, 0, 0, 0)),
+        (datetime(1980,  1,  1,  0,  0,  1),  (1980, 1, 1, 0, 0, 0)),
+        (datetime(1980,  1,  1,  0,  0,  2),  (1980, 1, 1, 0, 0, 2)),
+        (datetime(1980,  1,  1,  0,  0,  3),  (1980, 1, 1, 0, 0, 2)),
+        (datetime(1980,  1,  1,  0,  0,  4),  (1980, 1, 1, 0, 0, 4)),
+        # Datetimes near year 2108 test the maximum datetime that the MS-DOS
+        # header can store. Again, note the 2-second precision.
+        (datetime(2107, 12, 31, 23, 59, 56), (2107, 12, 31, 23, 59, 56)),
+        (datetime(2107, 12, 31, 23, 59, 57), (2107, 12, 31, 23, 59, 56)),
+        (datetime(2107, 12, 31, 23, 59, 58), (2107, 12, 31, 23, 59, 58)),
+        (datetime(2107, 12, 31, 23, 59, 59), (2107, 12, 31, 23, 59, 58)),
+        (datetime(2108,  1,  1,  0,  0,  0),  (2107, 12, 31, 23, 59, 58)),
+        (datetime(2108,  1,  1,  0,  0,  1),  (2107, 12, 31, 23, 59, 58)),
+        (datetime(2108,  1,  1,  0,  0,  2),  (2107, 12, 31, 23, 59, 58)),
+        # Miscellaneous
         (datetime(2011, 1, 1, 1, 2, 3, 123), (2011, 1, 1, 1, 2, 2)),
         (datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), (2011, 1, 1, 1, 2, 2)),
         (datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), (2011, 1, 1, 1, 2, 2)),
@@ -1027,27 +1046,40 @@ def extracted():
     ],
 )
 @pytest.mark.parametrize(
-    "timezone,modified_at",
+    "timezone,modified_at,expected_modified_at",
     [
-        ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123)),
-        ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))),
-        ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))),
-        ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))),
-        ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123)),
-        ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))),
-        ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))),
-        ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))),
-        ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123)),
-        ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))),
-        ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))),
-        ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))),
+        # Datetimes near the UNIX epoch (1970)
+        ('UTC+0', datetime(1969, 12, 31, 23, 59, 58), datetime(1970,  1,  1,  0,  0,  0)),
+        ('UTC+0', datetime(1969, 12, 31, 23, 59, 59), datetime(1970,  1,  1,  0,  0,  0)),
+        ('UTC+0', datetime(1970,  1,  1,  0,  0,  0), None),
+        # Datetimes near the maximum representable datetime in the UNIX timestamp header 
+        # (4-byte signed integer counting the number of seconds since 1970)
+        ('UTC+0', datetime(2038,  1, 19,  3, 14,  7), None),
+        ('UTC+0', datetime(2038,  1, 19,  3, 14,  8), datetime(2038,  1, 19,  3, 14,  7)),
+        ('UTC+0', datetime(2038,  1, 19,  3, 14,  9), datetime(2038,  1, 19,  3, 14,  7)),
+        ('UTC+0', datetime(2038,  1, 19,  3, 14, 10), datetime(2038,  1, 19,  3, 14,  7)),
+        # Miscellaneous
+        ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123), None),
+        ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), None),
+        ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), None),
+        ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1))), None),
+        ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123), None),
+        ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), None),
+        ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), None),
+        ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1))), None),
+        ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123), None),
+        ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), None),
+        ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), None),
+        ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1))), None),
     ],
 )
-def test_unzip_modification_time(method, timezone, modified_at):
+def test_unzip_modification_time(method, timezone, modified_at, expected_modified_at):
     member_files = (
         ('my_file', modified_at, stat.S_IFREG | 0o600, method, ()),
     )
     zipped_chunks = stream_zip(member_files)
+    if expected_modified_at is None:
+        expected_modified_at = modified_at
 
     with \
             TemporaryDirectory() as d, \
@@ -1059,7 +1091,7 @@ def test_unzip_modification_time(method, timezone, modified_at):
 
         subprocess.run(['unzip', f'{d}/test.zip', '-d', d], env={'TZ': timezone})
 
-        assert os.path.getmtime('my_file') == int(modified_at.timestamp())
+        assert os.path.getmtime('my_file') == int(expected_modified_at.timestamp())
 
 
 @pytest.mark.parametrize(
@@ -1074,6 +1106,28 @@ def test_unzip_modification_time(method, timezone, modified_at):
 @pytest.mark.parametrize(
     "timezone,modified_at,expected_modified_at",
     [
+        # Datetimes near the 1980 epoch used in the MS-DOS header.
+        # Note the 2-second precision and the cutoff of everything before the epoch.
+        ("UTC+0", datetime(1979, 12, 31, 23, 59, 58), datetime(1980,  1,  1,  0,  0,  0)),
+        ("UTC+0", datetime(1979, 12, 31, 23, 59, 59), datetime(1980,  1,  1,  0,  0,  0)),
+        ("UTC+0", datetime(1980,  1,  1,  0,  0,  0), datetime(1980,  1,  1,  0,  0,  0)),
+        ("UTC+0", datetime(1980,  1,  1,  0,  0,  1), datetime(1980,  1,  1,  0,  0,  0)),
+        ("UTC+0", datetime(1980,  1,  1,  0,  0,  2), datetime(1980,  1,  1,  0,  0,  2)),
+        ("UTC+0", datetime(1980,  1,  1,  0,  0,  3), datetime(1980,  1,  1,  0,  0,  2)),
+        ("UTC+0", datetime(1980,  1,  1,  0,  0,  4), datetime(1980,  1,  1,  0,  0,  4)),
+        # Datetimes near the end of year 2100 (again, note the 2-second precision). The
+        # MS-DOS header can store dates up to 2107, but `unzip` misbehaves after 2100 (see below).
+        ("UTC+0", datetime(2100, 12, 31, 23, 59, 56), datetime(2100, 12, 31, 23, 59, 56)),
+        ("UTC+0", datetime(2100, 12, 31, 23, 59, 57), datetime(2100, 12, 31, 23, 59, 56)),
+        ("UTC+0", datetime(2100, 12, 31, 23, 59, 58), datetime(2100, 12, 31, 23, 59, 58)),
+        ("UTC+0", datetime(2100, 12, 31, 23, 59, 59), datetime(2100, 12, 31, 23, 59, 58)),
+        # The upper limit for the datetime field is supposed to be the end of year 2107.
+        # In practice, however, we see very strange behaviour from `unzip` after year 2100.
+        # It seems that there is an off-by-one bug in `unzip` for dates after year 2100.
+        ("UTC+0", datetime(2101,  1,  1,  0,  0,  0), datetime(2101,  1,  2,  0,  0,  0)),
+        ("UTC+0", datetime(2101,  1,  2,  0,  0,  0), datetime(2101,  1,  3,  0,  0,  0)),
+        ("UTC+0", datetime(2101,  1,  3,  0,  0,  0), datetime(2101,  1,  4,  0,  0,  0)),
+        # Miscellaneous
         ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123), datetime(2011, 1, 1, 2, 2, 2, 0)),
     ],
 )