Support mirroring when download_hash has no 'sha1' entry.

metadata.py: build the mirror filename from a string.Template and take the
8-character prefix from the SHA1 hash, falling back to SHA256.
mirrorer.py: mirrored() compares against the metadata SHA1 or, when that is
absent, a SHA1 computed from the opened download. The computed digest is
lowercased because mirrored() compares it with lowercased archive hashes
(large_file_sha1 returns uppercase), and the download handle is closed once
it has been hashed.

diff --git a/netkan/netkan/metadata.py b/netkan/netkan/metadata.py
index a64368c..f9b1e5b 100644
--- a/netkan/netkan/metadata.py
+++ b/netkan/netkan/metadata.py
@@ -5,6 +5,7 @@
 from hashlib import sha1
 import uuid
 import urllib.parse
+from string import Template
 from typing import Optional, List, Tuple, Union, Any, Dict, TYPE_CHECKING
 from ruamel.yaml import YAML
 import dateutil.parser
@@ -320,6 +321,7 @@ def __str__(self) -> str:
     ISODATETIME_PROPERTIES = [
         'release_date'
     ]
+    MIRROR_FILENAME_TEMPLATE = Template('$prefix-$identifier-$version.$extension')
 
     def __init__(self, filename: Optional[Union[str, Path]] = None, contents: Optional[str] = None) -> None:
         if filename:
@@ -438,7 +440,11 @@ def redistributable(self) -> bool:
     def mirror_filename(self, with_epoch: bool = True) -> Optional[str]:
         if 'download_hash' not in self._raw:
             return None
-        return f'{self.download_hash["sha1"][0:8]}-{self.identifier}-{self._format_version(with_epoch)}.{Ckan.MIME_TO_EXTENSION[self.download_content_type]}'
+        return self.MIRROR_FILENAME_TEMPLATE.safe_substitute(
+            prefix=self._mirror_prefix(),
+            identifier=self.identifier,
+            version=self._format_version(with_epoch),
+            extension=Ckan.MIME_TO_EXTENSION[self.download_content_type])
 
     def mirror_download(self, with_epoch: bool = True) -> Optional[str]:
         filename = self.mirror_filename(with_epoch)
@@ -450,6 +456,13 @@ def mirror_item(self, with_epoch: bool = True) -> str:
         return self._ia_bucket_sanitize(
             f'{self.identifier}-{self._format_version(with_epoch)}')
 
+    def _mirror_prefix(self) -> str:
+        """Return the first 8 characters of the SHA1 hash, or of the SHA256 if no SHA1 is set."""
+        return (self.download_hash['sha1']
+                if 'sha1' in self.download_hash
+                else self.download_hash['sha256']
+                )[0:8]
+
     # InternetArchive says:
     # Bucket names should be valid archive identifiers;
     # try someting matching this regular expression:
diff --git a/netkan/netkan/mirrorer.py b/netkan/netkan/mirrorer.py
index 975c5b0..7ff51fe 100644
--- a/netkan/netkan/mirrorer.py
+++ b/netkan/netkan/mirrorer.py
@@ -124,8 +124,24 @@ def mirrored(self, iarchive: internetarchive.session.ArchiveSession) -> bool:
             return False
         if not item.exists:
             return False
-        sha1 = self.download_hash['sha1'].lower()
-        return any(file['sha1'].lower() == sha1 for file in item.files if 'sha1' in file)
+        sha1 = self._sha1()
+        if sha1 is None:
+            return False
+        return any(file['sha1'].lower() == sha1
+                   for file in item.files
+                   if 'sha1' in file)
+
+    def _sha1(self) -> Optional[str]:
+        """Return the lowercase SHA1 from the metadata or the download file, else None."""
+        if 'sha1' in self.download_hash:
+            # Use hash from metadata if set
+            return self.download_hash['sha1'].lower()
+        dl_io = self.open_download()
+        if dl_io is not None:
+            # Calculate hash from file if found; lowercase it to match mirrored()
+            with dl_io:
+                return self.large_file_sha1(dl_io).lower()
+        return None
 
     def license_urls(self) -> List[str]:
         return [self.LICENSE_URLS[lic]
@@ -158,6 +174,14 @@ def large_file_sha256(file: BinaryIO, block_size: int = 8192) -> str:
             sha.update(block)
         return sha.hexdigest().upper()
 
+    @staticmethod
+    def large_file_sha1(file: BinaryIO, block_size: int = 8192) -> str:
+        """Return the uppercase hex SHA1 digest of file, read in block_size chunks."""
+        sha = hashlib.sha1()
+        for block in iter(lambda: file.read(block_size), b''):
+            sha.update(block)
+        return sha.hexdigest().upper()
+
     def open_if_hash_match(self, path: Path) -> Optional[BinaryIO]:
         """Check whether the file located at the given path matches our sha256.