Skip to content

Commit

Permalink
Merge pull request r-three#63 from r-three/chore/ci
Browse files Browse the repository at this point in the history
add ci for linting
  • Loading branch information
blester125 authored Apr 11, 2024
2 parents c1c145c + 5485376 commit a71426f
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 36 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# CI: formatting checks (black + isort) on pushes/PRs targeting main.
name: Lint

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:

jobs:
  black:
    runs-on: ubuntu-latest
    steps:
      # checkout@v2 runs on the deprecated Node 12 runtime; v4 is the
      # supported release and is a drop-in replacement here.
      - uses: actions/checkout@v4
      - uses: psf/black@stable
        with:
          version: 23.1.0
  isort:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          # Quoted so YAML keeps it a string (unquoted 3.10 would parse as 3.1).
          python-version: "3.8"
      # Install package and deps so third-party packages are sorted
      - name: Install Dependencies and Package
        run: |
          python -m pip install --upgrade pip
          python -m pip install .
      - uses: isort/isort-action@master
37 changes: 26 additions & 11 deletions bhl/build-index.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,35 @@
"""Build index of Biodiversity Heritage Library books"""

import argparse
import xml.etree.ElementTree as ET
import os
import logging
import json
import logging
import os
import xml.etree.ElementTree as ET
from collections import defaultdict

from tqdm.auto import tqdm


# Configure root logging once at import time; every record is tagged with the
# script name, timestamp, and originating function. (The scrape showed this
# call duplicated pre/post formatting; only one call is needed — a second
# basicConfig would be a no-op once handlers exist.)
logging.basicConfig(
    level=logging.INFO,
    format="build-index: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s",
)


SOURCE_NAME = "biodiversity-heritage-library"


def parse_args():
    """Parse command-line arguments for the BHL index builder.

    Returns:
        argparse.Namespace with `metadata_file` (path to the XML metadata
        dump) and `output_dir` (directory the index is written to).
    """
    parser = argparse.ArgumentParser("Biodiversity Heritage Library index builder")
    # Each option is registered exactly once — registering the same option
    # string twice raises argparse.ArgumentError at import time.
    parser.add_argument(
        "--metadata-file",
        default=f"data/{SOURCE_NAME}/raw/metadata/bhlitem.mods.xml",
        help="Path to XML metadata file",
    )
    parser.add_argument(
        "--output-dir",
        default=f"data/{SOURCE_NAME}/raw/",
        help="Path to output directory",
    )
    return parser.parse_args()


Expand All @@ -28,7 +38,7 @@ def main(args):

logging.info(f"Loading metadata file from {args.metadata_file}")
metadata = ET.parse(args.metadata_file).getroot()

num_entries = 0
pbar = tqdm(metadata)
for entry in pbar:
Expand All @@ -45,12 +55,17 @@ def main(args):
break

pbar.set_postfix({"Entries w/ License Info": num_entries})

logging.info("Computing summary statistics")
counts = {license: len(uris) for license, uris in index.items()}
print("\nLicense Summary Statistics:")
print(json.dumps(dict(sorted(counts.items(), reverse=True, key=lambda entry: entry[1])), indent=4))

print(
json.dumps(
dict(sorted(counts.items(), reverse=True, key=lambda entry: entry[1])),
indent=4,
)
)

logging.info(f"Saving index to {args.output_dir}")
os.makedirs(args.output_dir, exist_ok=True)
with open(os.path.join(args.output_dir, "index.json"), "w") as f:
Expand Down
66 changes: 49 additions & 17 deletions bhl/extract-files.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,45 @@
"""Build index of Biodiversity Heritage Library books"""

import argparse
import tarfile
import os
import logging
import json
import logging
import os
import tarfile
from collections import defaultdict

from tqdm.auto import tqdm


# Configure root logging once at import time; records carry the script name,
# timestamp, and originating function. (Deduplicated: the diff view showed the
# pre- and post-format calls side by side; only one is needed.)
logging.basicConfig(
    level=logging.INFO,
    format="extract-files: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s",
)


SOURCE_NAME = "biodiversity-heritage-library"


def parse_args():
    """Parse command-line arguments for the BHL file extractor.

    Returns:
        argparse.Namespace with `index_file`, `whitelist_file`,
        `content_file`, and `output_dir`.
    """
    parser = argparse.ArgumentParser("Biodiversity Heritage Library file extractor")
    # Each option is registered exactly once — a duplicate option string
    # makes argparse raise ArgumentError before parsing even starts.
    parser.add_argument(
        "--index-file",
        default=f"data/{SOURCE_NAME}/raw/index.json",
        help="Path to JSON index",
    )
    parser.add_argument(
        "--whitelist-file",
        default="bhl/license_whitelist.json",
        help="Path to JSON file of whitelisted license strings",
    )
    parser.add_argument(
        "--content-file",
        default=f"data/{SOURCE_NAME}/raw/data/bhl-ocr-20230823.tar.bz2",
        help="Path to tar-ed and bz2 compressed content file",
    )
    parser.add_argument(
        "--output-dir",
        default=f"data/{SOURCE_NAME}/raw/extracted_data",
        help="Path to output directory",
    )
    return parser.parse_args()


Expand All @@ -30,18 +49,26 @@ def main(args):
logging.info(f"Loading index file from {args.index_file}")
with open(args.index_file, "r") as f:
index = json.load(f)

logging.info(f"Loading license whitelist file from {args.whitelist_file}")
with open(args.whitelist_file, "r") as f:
whitelist = json.load(f)

logging.info(f"Loading content file from {args.content_file}")
content_file = tarfile.open(args.content_file, "r:bz2")

logging.info("Constructing list of all whitelisted items")
whitelisted_items = set(sum([[uri.split("/")[-1].zfill(6) for uri in index[license]] for license in whitelist], start=[]))
whitelisted_items = set(
sum(
[
[uri.split("/")[-1].zfill(6) for uri in index[license]]
for license in whitelist
],
start=[],
)
)
logging.info(f"Found {len(whitelisted_items)} whitelisted items")

num_extracted_files = 0
extracted_size = 0
pbar = tqdm(content_file)
Expand All @@ -50,12 +77,17 @@ def main(args):
continue
item_id = item_info.path.split("/")[2]
if item_id in whitelisted_items:
content_file.extract(item_info, path=args.output_dir)
content_file.extract(item_info, path=args.output_dir)
num_extracted_files += 1
extracted_size += item_info.size

pbar.set_postfix({"Extracted Files": num_extracted_files, "Extracted Size": f"{extracted_size / 2**30:.3f} GB"})


pbar.set_postfix(
{
"Extracted Files": num_extracted_files,
"Extracted Size": f"{extracted_size / 2**30:.3f} GB",
}
)


if __name__ == "__main__":
args = parse_args()
Expand Down
30 changes: 22 additions & 8 deletions bhl/to-dolma.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,32 @@
import argparse
import datetime
import glob
import logging
import json
import logging
import os

from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.write import to_dolma


# Configure root logging once at import time; records carry the script name,
# timestamp, and originating function. (Deduplicated: the diff view showed the
# pre- and post-format calls side by side; only one is needed.)
logging.basicConfig(
    level=logging.INFO,
    format="to-dolma: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s",
)


BASE_URL = "https://www.biodiversitylibrary.org/page"
SOURCE_NAME = "biodiversity-heritage-library"

parser = argparse.ArgumentParser(description="Convert data to dolma.")
parser.add_argument(
"--data", default=f"data/{SOURCE_NAME}/extracted_data", help="Path to the directory containing BHL data."
"--data",
default=f"data/{SOURCE_NAME}/extracted_data",
help="Path to the directory containing BHL data.",
)
parser.add_argument(
"--output_dir",
default=f"data/{SOURCE_NAME}/v0",
help="Where the dolma formatted data goes."
help="Where the dolma formatted data goes.",
)
parser.add_argument(
"--filename", default="bhl.jsonl.gz", help="The base filename for the BHL data"
Expand All @@ -34,8 +38,12 @@
)


def format_dolma(content_file: str, source_name: str = SOURCE_NAME, base_url: str = BASE_URL):
item_id, page_id, page_num = os.path.splitext(os.path.basename(content_file))[0].split("-")
def format_dolma(
content_file: str, source_name: str = SOURCE_NAME, base_url: str = BASE_URL
):
item_id, page_id, page_num = os.path.splitext(os.path.basename(content_file))[
0
].split("-")
with open(content_file) as f:
try:
text = f.read()
Expand All @@ -60,7 +68,13 @@ def format_dolma(content_file: str, source_name: str = SOURCE_NAME, base_url: st

def main(args):
    """Convert extracted BHL text pages to the dolma format.

    Args:
        args: Namespace with `data` (root directory of extracted .txt pages),
            `output_dir`, `filename`, and `shard_size` — forwarded to
            `to_dolma`.
    """
    # Use iterators so we don't have to load the whole dataset in memory:
    # iglob yields paths lazily and map/filter keep the pipeline lazy.
    # format_dolma may yield None (the filter drops those entries).
    # NOTE(review): the diff artifact that assigned `content_pages` twice
    # (pre-format one-liner + formatted version) is reduced to one assignment.
    content_pages = filter(
        lambda x: x is not None,
        map(
            format_dolma,
            glob.iglob(os.path.join(args.data, "**", "*.txt"), recursive=True),
        ),
    )
    to_dolma(content_pages, args.output_dir, args.filename, args.shard_size)


Expand Down
1 change: 1 addition & 0 deletions gutenberg/possible-rights.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import argparse
import glob

import tqdm
from rdflib import Graph

Expand Down

0 comments on commit a71426f

Please sign in to comment.