From aa7625103502e1bb5041c8348a3c2cf58e32f38e Mon Sep 17 00:00:00 2001 From: Brian Lester Date: Wed, 10 Apr 2024 13:23:18 -0400 Subject: [PATCH 1/2] add ci for linting --- .github/workflows/lint.yml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .github/workflows/lint.yml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..d4b40b2 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,32 @@ +name: Lint + +on: + push: + branches: + - main + pull_request: + branches: + - main + workflow_dispatch: + +jobs: + black: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: psf/black@stable + with: + version: 23.1.0 + isort: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + # Install package and deps so third-party packages are sorted + - name: Install Dependencies and Package + run: | + python -m pip install --upgrade pip + python -m pip install . + - uses: isort/isort-action@master From 5485376da3e5b8575ea03201806a64c360f16b87 Mon Sep 17 00:00:00 2001 From: Brian Lester Date: Wed, 10 Apr 2024 13:33:00 -0400 Subject: [PATCH 2/2] run linters over all files --- bhl/build-index.py | 37 ++++++++++++++------ bhl/extract-files.py | 66 ++++++++++++++++++++++++++---------- bhl/to-dolma.py | 30 +++++++++++----- gutenberg/possible-rights.py | 1 + 4 files changed, 98 insertions(+), 36 deletions(-) diff --git a/bhl/build-index.py b/bhl/build-index.py index 48489fa..e36d7df 100644 --- a/bhl/build-index.py +++ b/bhl/build-index.py @@ -1,16 +1,18 @@ """Build index of Biodiversity Heritage Library books""" import argparse -import xml.etree.ElementTree as ET -import os -import logging import json +import logging +import os +import xml.etree.ElementTree as ET from collections import defaultdict from tqdm.auto import tqdm - -logging.basicConfig(level=logging.INFO, format="build-index: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s") +logging.basicConfig( + level=logging.INFO, + format="build-index: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s", +) SOURCE_NAME = "biodiversity-heritage-library" @@ -18,8 +20,16 @@ def parse_args(): parser = argparse.ArgumentParser("Biodiversity Heritage Library index builder") - parser.add_argument("--metadata-file", default=f"data/{SOURCE_NAME}/raw/metadata/bhlitem.mods.xml", help="Path to XML metadata file") - parser.add_argument("--output-dir", default=f"data/{SOURCE_NAME}/raw/", help="Path to output directory") + parser.add_argument( + "--metadata-file", + default=f"data/{SOURCE_NAME}/raw/metadata/bhlitem.mods.xml", + help="Path to XML metadata file", + ) + parser.add_argument( + "--output-dir", + default=f"data/{SOURCE_NAME}/raw/", + help="Path to output directory", + ) return parser.parse_args() @@ -28,7 +38,7 @@ def main(args): logging.info(f"Loading metadata file from {args.metadata_file}") metadata = ET.parse(args.metadata_file).getroot() - + num_entries = 0 pbar = tqdm(metadata) for entry in pbar: @@ -45,12 +55,17 @@ def main(args): break pbar.set_postfix({"Entries w/ License Info": num_entries}) - + logging.info("Computing summary statistics") counts = {license: len(uris) for license, uris in index.items()} print("\nLicense Summary Statistics:") - print(json.dumps(dict(sorted(counts.items(), reverse=True, key=lambda entry: entry[1])), indent=4)) - + print( + json.dumps( + dict(sorted(counts.items(), reverse=True, key=lambda entry: entry[1])), + indent=4, + ) + ) + logging.info(f"Saving index to {args.output_dir}") os.makedirs(args.output_dir, exist_ok=True) with open(os.path.join(args.output_dir, "index.json"), "w") as f: diff --git a/bhl/extract-files.py b/bhl/extract-files.py index 6f14f14..43791cf 100644 --- a/bhl/extract-files.py +++ b/bhl/extract-files.py @@ -1,26 +1,45 @@ """Build index of Biodiversity Heritage Library books""" import argparse -import tarfile -import os -import logging import json +import logging +import os +import tarfile from collections import defaultdict from tqdm.auto import tqdm - -logging.basicConfig(level=logging.INFO, format="extract-files: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s") +logging.basicConfig( + level=logging.INFO, + format="extract-files: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s", +) SOURCE_NAME = "biodiversity-heritage-library" + def parse_args(): parser = argparse.ArgumentParser("Biodiversity Heritage Library file extractor") - parser.add_argument("--index-file", default=f"data/{SOURCE_NAME}/raw/index.json", help="Path to JSON index") - parser.add_argument("--whitelist-file", default="bhl/license_whitelist.json", help="Path to JSON file of whitelisted license strings") - parser.add_argument("--content-file", default=f"data/{SOURCE_NAME}/raw/data/bhl-ocr-20230823.tar.bz2", help="Path to tar-ed and bz2 compressed content file") - parser.add_argument("--output-dir", default=f"data/{SOURCE_NAME}/raw/extracted_data", help="Path to output directory") + parser.add_argument( + "--index-file", + default=f"data/{SOURCE_NAME}/raw/index.json", + help="Path to JSON index", + ) + parser.add_argument( + "--whitelist-file", + default="bhl/license_whitelist.json", + help="Path to JSON file of whitelisted license strings", + ) + parser.add_argument( + "--content-file", + default=f"data/{SOURCE_NAME}/raw/data/bhl-ocr-20230823.tar.bz2", + help="Path to tar-ed and bz2 compressed content file", + ) + parser.add_argument( + "--output-dir", + default=f"data/{SOURCE_NAME}/raw/extracted_data", + help="Path to output directory", + ) return parser.parse_args() @@ -30,18 +49,26 @@ def main(args): logging.info(f"Loading index file from {args.index_file}") with open(args.index_file, "r") as f: index = json.load(f) - + logging.info(f"Loading license whitelist file from {args.whitelist_file}") with open(args.whitelist_file, "r") as f: whitelist = json.load(f) - + logging.info(f"Loading content file from {args.content_file}") content_file = tarfile.open(args.content_file, "r:bz2") logging.info("Constructing list of all whitelisted items") - whitelisted_items = set(sum([[uri.split("/")[-1].zfill(6) for uri in index[license]] for license in whitelist], start=[])) + whitelisted_items = set( + sum( + [ + [uri.split("/")[-1].zfill(6) for uri in index[license]] + for license in whitelist + ], + start=[], + ) + ) logging.info(f"Found {len(whitelisted_items)} whitelisted items") - + num_extracted_files = 0 extracted_size = 0 pbar = tqdm(content_file) @@ -50,12 +77,17 @@ def main(args): continue item_id = item_info.path.split("/")[2] if item_id in whitelisted_items: - content_file.extract(item_info, path=args.output_dir) + content_file.extract(item_info, path=args.output_dir) num_extracted_files += 1 extracted_size += item_info.size - - pbar.set_postfix({"Extracted Files": num_extracted_files, "Extracted Size": f"{extracted_size / 2**30:.3f} GB"}) - + + pbar.set_postfix( + { + "Extracted Files": num_extracted_files, + "Extracted Size": f"{extracted_size / 2**30:.3f} GB", + } + ) + if __name__ == "__main__": args = parse_args() diff --git a/bhl/to-dolma.py b/bhl/to-dolma.py index 9e81663..69e478b 100644 --- a/bhl/to-dolma.py +++ b/bhl/to-dolma.py @@ -3,15 +3,17 @@ import argparse import datetime import glob -import logging import json +import logging import os from licensed_pile.licenses import PermissiveLicenses from licensed_pile.write import to_dolma - -logging.basicConfig(level=logging.INFO, format="to-dolma: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s") +logging.basicConfig( + level=logging.INFO, + format="to-dolma: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s", +) BASE_URL = "https://www.biodiversitylibrary.org/page" @@ -19,12 +21,14 @@ parser = argparse.ArgumentParser(description="Convert data to dolma.") parser.add_argument( - "--data", default=f"data/{SOURCE_NAME}/extracted_data", help="Path to the directory containing BHL data." + "--data", + default=f"data/{SOURCE_NAME}/extracted_data", + help="Path to the directory containing BHL data.", ) parser.add_argument( "--output_dir", default=f"data/{SOURCE_NAME}/v0", - help="Where the dolma formatted data goes." + help="Where the dolma formatted data goes.", ) parser.add_argument( "--filename", default="bhl.jsonl.gz", help="The base filename for the BHL data" @@ -34,8 +38,12 @@ ) -def format_dolma(content_file: str, source_name: str = SOURCE_NAME, base_url: str = BASE_URL): - item_id, page_id, page_num = os.path.splitext(os.path.basename(content_file))[0].split("-") +def format_dolma( + content_file: str, source_name: str = SOURCE_NAME, base_url: str = BASE_URL +): + item_id, page_id, page_num = os.path.splitext(os.path.basename(content_file))[ + 0 + ].split("-") with open(content_file) as f: try: text = f.read() @@ -60,7 +68,13 @@ def format_dolma(content_file: str, source_name: str = SOURCE_NAME, base_url: st def main(args): # Use iterators so we don't have to load the whole dataset in memory. - content_pages = filter(lambda x: x is not None, map(format_dolma, glob.iglob(os.path.join(args.data, "**", "*.txt"), recursive=True))) + content_pages = filter( + lambda x: x is not None, + map( + format_dolma, + glob.iglob(os.path.join(args.data, "**", "*.txt"), recursive=True), + ), + ) to_dolma(content_pages, args.output_dir, args.filename, args.shard_size) diff --git a/gutenberg/possible-rights.py b/gutenberg/possible-rights.py index a159a48..df8d8b1 100644 --- a/gutenberg/possible-rights.py +++ b/gutenberg/possible-rights.py @@ -8,6 +8,7 @@ import argparse import glob + import tqdm from rdflib import Graph