-
Notifications
You must be signed in to change notification settings - Fork 0
/
catalogue.py
56 lines (50 loc) · 2.25 KB
/
catalogue.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import argparse
import csv
import hashlib
import os
import struct
import sys
from pathlib import Path

from config import BIGFILE_GALLERY_DIRECTORY_PATH, EXTENSION_TO_ENDIAN
def catalogue(
    gallery_directory=None,
    extension_to_endian=None,
    output_csv_path="bigfile_hashes.csv",
):
    """Hash every file in the gallery, write the hashes to a CSV, print a summary.

    The gallery is walked recursively. Files sitting directly in the gallery
    root are ignored, and so is everything inside the top-level ``Rubbish``
    directory (the directory itself as well as all of its descendants).

    For every remaining file:

    * its SHA-256 is recorded under its path relative to the gallery;
    * the third component of that relative path is recorded as its version
      (a file that is fewer than three components deep has no such component
      and is skipped with a warning on stderr);
    * if its extension (without the dot) is a key of the extension map, three
      unsigned 32-bit integers are read from byte offset 0x114 using the
      mapped ``struct`` byte-order prefix and stored as a "version triple"
      for that version.  A file too short to hold them keeps its hash but
      contributes no triple, and a warning is printed to stderr.

    Args:
        gallery_directory: Directory to scan.  Defaults to
            ``BIGFILE_GALLERY_DIRECTORY_PATH``.
        extension_to_endian: Mapping of file extension to ``struct``
            byte-order prefix.  Defaults to ``EXTENSION_TO_ENDIAN``.
        output_csv_path: Where the ``path,sha256`` CSV is written.

    Returns:
        None.  Results go to the CSV file and to stdout.
    """
    gallery = Path(
        BIGFILE_GALLERY_DIRECTORY_PATH
        if gallery_directory is None
        else gallery_directory
    )
    endian_by_extension = (
        EXTENSION_TO_ENDIAN if extension_to_endian is None else extension_to_endian
    )
    rubbish = gallery / "Rubbish"
    version_offset = 0x00000114
    chunk_size = 1 << 20  # hash in 1 MiB pieces instead of loading whole files

    hashes = {}
    versions = set()
    version_triples = {}

    for root, _dirs, files in os.walk(gallery):
        root_path = Path(root)
        # ``parents`` never contains the path itself, so the Rubbish directory
        # has to be tested explicitly in addition to its descendants.
        if (
            root_path == gallery
            or root_path == rubbish
            or rubbish in root_path.parents
        ):
            continue

        for file_name in files:
            bigfile_path = root_path / file_name
            bigfile_path_relative = bigfile_path.relative_to(gallery)
            if len(bigfile_path_relative.parts) < 3:
                print(
                    f"Skipping {bigfile_path_relative}: "
                    "path is too shallow to contain a version component",
                    file=sys.stderr,
                )
                continue
            version = bigfile_path_relative.parts[2]
            extension = bigfile_path_relative.suffix[1:]

            with open(bigfile_path, "rb") as f:
                sha256 = hashlib.sha256()
                while True:
                    chunk = f.read(chunk_size)
                    if not chunk:
                        break
                    sha256.update(chunk)
                hashes[bigfile_path_relative] = sha256.hexdigest()
                versions.add(version)

                if extension not in endian_by_extension:
                    continue

                triple_format = struct.Struct(endian_by_extension[extension] + "III")
                f.seek(version_offset)
                raw_triple = f.read(triple_format.size)
                if len(raw_triple) < triple_format.size:
                    # Truncated file: keep the hash, but there is no triple to read.
                    print(
                        f"No version triple in {bigfile_path_relative}: "
                        "file ends before the version fields",
                        file=sys.stderr,
                    )
                    continue
                version_triples.setdefault(version, set()).add(
                    triple_format.unpack(raw_triple)
                )

    with open(output_csv_path, "w", newline="") as f:
        writer = csv.writer(f, dialect="excel")
        writer.writerow(["path", "sha256"])
        writer.writerows(sorted(hashes.items()))

    print(f"Number of unique BigFiles: {len(set(hashes.values()))}")
    print(f"Number of unique versions: {len(versions)}")
    print("\n".join(sorted(versions)))
    print("\n".join(map(str, sorted(version_triples.items()))))
def main():
    """Command-line entry point: parse arguments, then rebuild the catalogue.

    The parser defines no options of its own.  It is still run so that
    ``--help`` works and unexpected arguments are rejected before any
    scanning starts.
    """
    parser = argparse.ArgumentParser(
        prog="BigFile Catalogue", description="Update the CSV with known BigFiles"
    )
    # Nothing is read from the parsed namespace, so it is not kept.
    parser.parse_args()
    catalogue()


# Run the catalogue only when executed as a script, not when imported.
if __name__ == "__main__":
    main()