-
Notifications
You must be signed in to change notification settings - Fork 9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add strict_timestamps option #117
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
from collections import deque | ||
from datetime import datetime | ||
from struct import Struct | ||
import asyncio | ||
import secrets | ||
|
@@ -21,6 +22,17 @@ | |
_AUTO_UPGRADE_CENTRAL_DIRECTORY = object() | ||
_NO_AUTO_UPGRADE_CENTRAL_DIRECTORY = object() | ||
|
||
_MS_DOS_DATE_BEGIN = datetime(1980, 1, 1) | ||
_MS_DOS_DATE_END = datetime( | ||
# Max year since 1980 repesentable in a 7-bit unsigned integer | ||
year=_MS_DOS_DATE_BEGIN.year + 2**7-1, | ||
month=12, | ||
day=31, | ||
hour=23, | ||
minute=59, | ||
second=59, | ||
) | ||
|
||
def __NO_COMPRESSION_BUFFERED_32(offset, default_get_compressobj): | ||
return _NO_COMPRESSION_BUFFERED_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, None, None | ||
|
||
|
@@ -612,19 +624,25 @@ def _no_compression_streamed_data(chunks, uncompressed_size, crc_32, maximum_siz | |
name_encoded = name.encode('utf-8') | ||
_raise_if_beyond(len(name_encoded), maximum=0xffff, exception_class=NameLengthOverflowError) | ||
|
||
# Remove time zone information (if any) during clamp | ||
mod_datetime_ms_dos = min(max(modified_at.replace(tzinfo=None), _MS_DOS_DATE_BEGIN), _MS_DOS_DATE_END) | ||
mod_at_ms_dos = modified_at_struct.pack( | ||
int(modified_at.second / 2) | \ | ||
(modified_at.minute << 5) | \ | ||
(modified_at.hour << 11), | ||
modified_at.day | \ | ||
(modified_at.month << 5) | \ | ||
(modified_at.year - 1980) << 9, | ||
(mod_datetime_ms_dos.second // 2) | \ | ||
(mod_datetime_ms_dos.minute << 5) | \ | ||
(mod_datetime_ms_dos.hour << 11), | ||
mod_datetime_ms_dos.day | \ | ||
(mod_datetime_ms_dos.month << 5) | \ | ||
(mod_datetime_ms_dos.year - 1980) << 9, | ||
) | ||
mod_at_unix_extra = mod_at_unix_extra_struct.pack( | ||
mod_at_unix_extra_signature, | ||
5, # Size of extra | ||
b'\x01', # Only modification time (as opposed to also other times) | ||
int(modified_at.timestamp()), | ||
# Clamp timestamp to fit the field size (4-byte signed integer) | ||
# In principle, we the lower limit should be `-2**31` but we set it | ||
# to zero to avoid issues with common zip utilities like `unzip`. | ||
# Said tools do not correctly interpret negative timestamps. | ||
max(min(int(modified_at.timestamp()), 2**31 - 1), 0), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From (admittedly only brief tests) it looks like once times go below 1970 with extended timestamps, then unzip seems to clamp to 1980 (so probably using the MS DOS timestamp?). While odd certainly, I guess I'm leaning to going for the full range in the zip file, because the timestamp before 1970 will be wrong in unzip no matter what, but we can at least make timestamps before 1970 right for other clients. |
||
) if extended_timestamps else b'' | ||
external_attr = \ | ||
(mode << 16) | \ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -989,6 +989,25 @@ def test_bsdio_empty_directory(method, trailing_slash, mode, expected_mode): | |
@pytest.mark.parametrize( | ||
"modified_at,expected_time", | ||
[ | ||
# Datetimes near the 1980 epoch used in the MS-DOS header. | ||
# Note the 2-second precision and the cutoff of everything before the epoch. | ||
(datetime(1979, 12, 31, 23, 59, 58), (1980, 1, 1, 0, 0, 0)), | ||
(datetime(1979, 12, 31, 23, 59, 59), (1980, 1, 1, 0, 0, 0)), | ||
(datetime(1980, 1, 1, 0, 0, 0), (1980, 1, 1, 0, 0, 0)), | ||
(datetime(1980, 1, 1, 0, 0, 1), (1980, 1, 1, 0, 0, 0)), | ||
(datetime(1980, 1, 1, 0, 0, 2), (1980, 1, 1, 0, 0, 2)), | ||
(datetime(1980, 1, 1, 0, 0, 3), (1980, 1, 1, 0, 0, 2)), | ||
(datetime(1980, 1, 1, 0, 0, 4), (1980, 1, 1, 0, 0, 4)), | ||
# Datetimes near year 2108 test the maximum datetime that the MS-DOS | ||
# header can store. Again, note the 2-second precision. | ||
(datetime(2107, 12, 31, 23, 59, 56), (2107, 12, 31, 23, 59, 56)), | ||
(datetime(2107, 12, 31, 23, 59, 57), (2107, 12, 31, 23, 59, 56)), | ||
(datetime(2107, 12, 31, 23, 59, 58), (2107, 12, 31, 23, 59, 58)), | ||
(datetime(2107, 12, 31, 23, 59, 59), (2107, 12, 31, 23, 59, 58)), | ||
(datetime(2108, 1, 1, 0, 0, 0), (2107, 12, 31, 23, 59, 58)), | ||
(datetime(2108, 1, 1, 0, 0, 1), (2107, 12, 31, 23, 59, 58)), | ||
(datetime(2108, 1, 1, 0, 0, 2), (2107, 12, 31, 23, 59, 58)), | ||
# Miscellaneous | ||
(datetime(2011, 1, 1, 1, 2, 3, 123), (2011, 1, 1, 1, 2, 2)), | ||
(datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), (2011, 1, 1, 1, 2, 2)), | ||
(datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), (2011, 1, 1, 1, 2, 2)), | ||
|
@@ -1027,27 +1046,40 @@ def extracted(): | |
], | ||
) | ||
@pytest.mark.parametrize( | ||
"timezone,modified_at", | ||
"timezone,modified_at,expected_modified_at", | ||
[ | ||
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123)), | ||
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))), | ||
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))), | ||
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))), | ||
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123)), | ||
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))), | ||
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))), | ||
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))), | ||
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123)), | ||
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))), | ||
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))), | ||
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))), | ||
# Datetimes near the UNIX epoch (1970) | ||
('UTC+0', datetime(1969, 12, 31, 23, 59, 58), datetime(1970, 1, 1, 0, 0, 0)), | ||
('UTC+0', datetime(1969, 12, 31, 23, 59, 59), datetime(1970, 1, 1, 0, 0, 0)), | ||
('UTC+0', datetime(1970, 1, 1, 0, 0, 0), None), | ||
# Datetimes near the maximum representable datetime in the UNIX timestamp header | ||
# (4-byte signed integer counting the number of seconds since 1970) | ||
('UTC+0', datetime(2038, 1, 19, 3, 14, 7), None), | ||
('UTC+0', datetime(2038, 1, 19, 3, 14, 8), datetime(2038, 1, 19, 3, 14, 7)), | ||
('UTC+0', datetime(2038, 1, 19, 3, 14, 9), datetime(2038, 1, 19, 3, 14, 7)), | ||
('UTC+0', datetime(2038, 1, 19, 3, 14, 10), datetime(2038, 1, 19, 3, 14, 7)), | ||
# Miscellaneous | ||
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123), None), | ||
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), None), | ||
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), None), | ||
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1))), None), | ||
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123), None), | ||
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), None), | ||
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), None), | ||
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1))), None), | ||
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123), None), | ||
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), None), | ||
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), None), | ||
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1))), None), | ||
], | ||
) | ||
def test_unzip_modification_time(method, timezone, modified_at): | ||
def test_unzip_modification_time(method, timezone, modified_at, expected_modified_at): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm just running the tests locally, and I'm getting some errors... I wonder if it's because I'm currently in British Summer Time, 1 our off from UTC... pytest 'test_stream_zip.py::test_unzip_modification_time[UTC+0-modified_at0-expected_modified_at0-ZIP_32]' Results in > assert os.path.getmtime('my_file') == int(expected_modified_at.timestamp())
E AssertionError: assert 0.0 == -3600
E + where 0.0 = <function getmtime at 0x101338040>('my_file')
E + where <function getmtime at 0x101338040> = <module 'posixpath' (frozen)>.getmtime
E + where <module 'posixpath' (frozen)> = os.path
E + and -3600 = int(-3600.0)
E + where -3600.0 = <built-in method timestamp of datetime.datetime object at 0x1040fb990>()
E + where <built-in method timestamp of datetime.datetime object at 0x1040fb990> = datetime.datetime(1970, 1, 1, 0, 0).timestamp The tests Not quite sure why right now... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to this jszip issue, convention is that MS-DOS timestamps are in local time, whereas extended timestamps use UNIX conventions which means all timestamps are values in UTC. That's backed up by a note in the unzip(1) man page, as well: -f freshen existing files, i.e., extract only those files that al‐ ready exist on disk and that are newer than the disk copies. By default unzip queries before overwriting, but the -o option may be used to suppress the queries. Note that under many operating systems, the TZ (timezone) environment variable must be set cor‐ rectly in order for -f and -u to work properly (under Unix the variable is usually set automatically). The reasons for this are somewhat subtle but have to do with the differences between DOS-format file times (always local time) and Unix-format times (always in GMT/UTC) and the necessity to compare the two. A typical TZ value is ``PST8PDT'' (US Pacific time with automatic adjustment for Daylight Savings Time or ``summer time''). Both # These produce the same value
datetime.datetime.now().timestamp()
datetime.datetime.now(datetime.UTC).timestamp()
# as do these
datetime.datetime.fromtimestamp(os.path.getmtime('some-file')).timestamp()
datetime.datetime.fromtimestamp(os.path.getmtime('some-file'), datetime.UTC).timestamp() But a >>> dt = datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=None)
>>> dt.timestamp()
18000.0
>>> dtutc = datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=datetime.UTC)
>>> dtutc.timestamp()
0.0 |
||
member_files = ( | ||
('my_file', modified_at, stat.S_IFREG | 0o600, method, ()), | ||
) | ||
zipped_chunks = stream_zip(member_files) | ||
if expected_modified_at is None: | ||
expected_modified_at = modified_at | ||
|
||
with \ | ||
TemporaryDirectory() as d, \ | ||
|
@@ -1059,7 +1091,7 @@ def test_unzip_modification_time(method, timezone, modified_at): | |
|
||
subprocess.run(['unzip', f'{d}/test.zip', '-d', d], env={'TZ': timezone}) | ||
|
||
assert os.path.getmtime('my_file') == int(modified_at.timestamp()) | ||
assert os.path.getmtime('my_file') == int(expected_modified_at.timestamp()) | ||
Comment on lines
1092
to
+1094
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The test failures seem to be caused by the |
||
|
||
|
||
@pytest.mark.parametrize( | ||
|
@@ -1074,6 +1106,28 @@ def test_unzip_modification_time(method, timezone, modified_at): | |
@pytest.mark.parametrize( | ||
"timezone,modified_at,expected_modified_at", | ||
[ | ||
# Datetimes near the 1980 epoch used in the MS-DOS header. | ||
# Note the 2-second precision and the cutoff of everything before the epoch. | ||
("UTC+0", datetime(1979, 12, 31, 23, 59, 58), datetime(1980, 1, 1, 0, 0, 0)), | ||
("UTC+0", datetime(1979, 12, 31, 23, 59, 59), datetime(1980, 1, 1, 0, 0, 0)), | ||
("UTC+0", datetime(1980, 1, 1, 0, 0, 0), datetime(1980, 1, 1, 0, 0, 0)), | ||
("UTC+0", datetime(1980, 1, 1, 0, 0, 1), datetime(1980, 1, 1, 0, 0, 0)), | ||
("UTC+0", datetime(1980, 1, 1, 0, 0, 2), datetime(1980, 1, 1, 0, 0, 2)), | ||
("UTC+0", datetime(1980, 1, 1, 0, 0, 3), datetime(1980, 1, 1, 0, 0, 2)), | ||
("UTC+0", datetime(1980, 1, 1, 0, 0, 4), datetime(1980, 1, 1, 0, 0, 4)), | ||
# Datetimes near year 2108 test the maximum datetime that the MS-DOS | ||
# header can store. Again, note the 2-second precision. | ||
("UTC+0", datetime(2100, 12, 31, 23, 59, 56), datetime(2100, 12, 31, 23, 59, 56)), | ||
("UTC+0", datetime(2100, 12, 31, 23, 59, 57), datetime(2100, 12, 31, 23, 59, 56)), | ||
("UTC+0", datetime(2100, 12, 31, 23, 59, 58), datetime(2100, 12, 31, 23, 59, 58)), | ||
("UTC+0", datetime(2100, 12, 31, 23, 59, 59), datetime(2100, 12, 31, 23, 59, 58)), | ||
# The upper limit for the datetime field is supposed to be the end of year 2107. | ||
# In practice, however, we see very strange behaviour from `unzip` after year 2100. | ||
# It seems that there is an off-by-one bug in `unzip` for dates after year 2100. | ||
("UTC+0", datetime(2101, 1, 1, 0, 0, 0), datetime(2101, 1, 2, 0, 0, 0)), | ||
("UTC+0", datetime(2101, 1, 2, 0, 0, 0), datetime(2101, 1, 3, 0, 0, 0)), | ||
("UTC+0", datetime(2101, 1, 3, 0, 0, 0), datetime(2101, 1, 4, 0, 0, 0)), | ||
# Miscellaneous | ||
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123), datetime(2011, 1, 1, 2, 2, 2, 0)), | ||
], | ||
) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the situation a bit more complex than this? If extended_timestamps=True, then it depends on the client - ones that read the extended timestamps will use the greater range, but older ones will still use the smaller range.
I wonder if it's better if the documentation should be split into "older clients" and "newer clients" (or something like that). The older clients in all cases use the smaller range, but the newer ones depend on extended_timestamps?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The data in the file (er, in the stream) doesn't depend on the client, and I think that's all you can reasonably make statements about. What the client will do with it depends on the client, but that doesn't change anything about the stream or the values encoded in it. And documenting what will be done with that data by all possible clients seems both out of scope and impossible.
I guess you could say something like this, to be more precise:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(Perhaps first documenting the standard timestamps without the "if
extended_timestamps=False
" qualifier, since those timestamps will always be there.)