Skip to content

Commit

Permalink
Add YALTAi- and Download- oriented tests
Browse files Browse the repository at this point in the history
  • Loading branch information
PonteIneptique committed Jul 30, 2024
1 parent a29123e commit ae0cb06
Show file tree
Hide file tree
Showing 23 changed files with 497 additions and 43 deletions.
6 changes: 6 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
*.jpg filter=lfs diff=lfs merge=lfs -text
*.pdf filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
tests/assets/page1.xml filter=lfs diff=lfs merge=lfs -text
tests/assets/page2.xml filter=lfs diff=lfs merge=lfs -text
12 changes: 10 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,25 @@ jobs:
python-version: ["3.10"]
steps:
- uses: actions/checkout@v4
with:
lfs: true
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install libvips
sudo apt-get install libvips libvips-dev
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install pytest pytest-cov coveralls pytest-sugar
pip install pytest pytest-cov coveralls pytest-sugar requests-mock
- name: Creating environments
run: |
python3 -m venv yaltaienv
yaltaienv/bin/pip install YALTAi --extra-index-url https://download.pytorch.org/whl/cpu
python3 -m venv krakenv
krakenv/bin/pip install kraken --extra-index-url https://download.pytorch.org/whl/cpu
- name: Test
run: |
pytest --doctest-modules --cov=app --verbose
Expand Down
3 changes: 1 addition & 2 deletions example-manifests.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,7 @@ def kebab(s):
dl.output_files,
binary="yaltaienv/bin/yaltai",
device="cuda:0",
yoloV5_model="GallicorporaSegmentation.pt",
verbose=True,
yolo_model="GallicorporaSegmentation.pt",
raise_on_error=True,
allow_failure=False,
multiprocess=4, # GPU Memory // 5gb
Expand Down
2 changes: 1 addition & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ addopts = --ignore-glob=./example-*.py --doctest-modules --cov=rtk --verbose
testpaths =
tests
pythonpath = .
doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL
doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL
8 changes: 4 additions & 4 deletions quickyaltaiinstall.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
virtualenv -p python3.8 yaltaienv
yaltaienv/bin/pip install yaltai==1.0.0 --extra-index-url https://download.pytorch.org/whl/cu113
virtualenv -p python3.10 yaltaienv
yaltaienv/bin/pip install YALTAi --extra-index-url https://download.pytorch.org/whl/cpu

#virtualenv -p python3.8 krakenv
#krakenv/bin/pip install kraken== --extra-index-url https://download.pytorch.org/whl/cu113
virtualenv -p python3.10 krakenv
krakenv/bin/pip install kraken --extra-index-url https://download.pytorch.org/whl/cpu
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ lxml==4.9.2
PyYAML==6.0
requests==2.28.1
tqdm==4.64.1
pyvips==2.2.1
pyvips==2.2.3
60 changes: 43 additions & 17 deletions rtk/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,23 +73,32 @@ class DownloadIIIFImageTask(Task):
Downloads an image and takes a first input string (URI) and a second one (Directory) [Optional]
:param input_files: List of tuples, where the first value is a URI to download an image, and the second is a folder
"""
def __init__(
self,
input_files: List[Tuple[str, str]],
*args,
output_prefix: Optional[str] = None,
downstream_check: DownstreamCheck = None,
max_height: Optional[int] = None,
max_width: Optional[int] = None,
custom_headers: Optional[Dict[str, str]] = None,
**kwargs):
super(DownloadIIIFImageTask, self).__init__(*args, **kwargs)
super(DownloadIIIFImageTask, self).__init__(input_files=input_files, *args, **kwargs)
self.downstream_check = downstream_check
self.output_prefix: str = output_prefix
self._output_files = []
self._max_h: int = max_height
self._max_w: int = max_width
self._custom_headers: Dict[str, str] = custom_headers or {}
if self._max_h and self._max_w:
raise Exception("Only one parameter max height / max width is accepted")
if self.output_prefix:
self.input_files = [
(uri, os.path.join(output_prefix, target))
for (uri, target) in self.input_files
]

@staticmethod
def rename_download(file: InputType) -> str:
Expand Down Expand Up @@ -303,11 +312,13 @@ def _process(self, inputs: InputListType) -> bool:


class DownloadIIIFManifestTask(Task):
""" Downloads IIIF manifests
Download task takes a first input string (URI)
""" Downloads IIIF manifests (list of URIs as input) and outputs (obj.output_files) a list of
tuples such as [(uri_image_1, folder_manuscript1), (uri_image_2, folder_manuscript1),
(uri_image_last, folder_manuscript_last)]
:param input_files: List of manifests
:param manifest_as_directory: Boolean that uses the manifest filename (can be a function) as a directory container
"""
def __init__(
self,
Expand Down Expand Up @@ -435,24 +446,30 @@ def work(input_list: List[str], pbar) -> List[str]:
if x != "R":
cmd.append(x)
else:
cmd.extend([element for mapped_list in map(self.input_format, input_list) for element in mapped_list])
cmd.extend([
element
for mapped_list in map(self.input_format, input_list)
for element in mapped_list
])

# This allows to control the number of threads used in a subprocess
my_env = os.environ.copy()
my_env["OMP_NUM_THREADS"] = "1"
# The following values are necessary for parsing output
my_env["LINES"] = "40"
my_env["COLUMNS"] = "300"

out = []

proc = subprocess.Popen(
cmd,
# capture_output = True,
text = True,
text=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=my_env,
preexec_fn=lambda: signal.alarm(len(input_list)*self.max_time_per_op),
)


try:
for line in iter(proc.stdout.readline, ""):
for element in self.pbar_parsing(line):
Expand All @@ -471,11 +488,10 @@ def work(input_list: List[str], pbar) -> List[str]:
print("Stopped process")
if not self.allow_failure:
raise InterruptedError
return out
except subprocess.TimeoutExpired as te:
try:
print(proc.stderr.read())
process.kill()
proc.kill()
except Exception as E:
return out
return out
Expand All @@ -487,7 +503,6 @@ def work(input_list: List[str], pbar) -> List[str]:

tp = ThreadPoolExecutor(len([batches for batches in inputs if len(batches)]))
bar = tqdm.tqdm(desc=_sbmsg(f"Processing {self.desc} command"), total=total_texts)

for gen in tp.map(work, inputs, repeat(bar)):
for elem in gen:
if isinstance(elem, str):
Expand All @@ -498,30 +513,41 @@ def input_format(self, inp: str) -> List[str]:
return ["-i", inp, self.rename(inp)]



class YALTAiCommand(KrakenLikeCommand):
""" Runs the YALTAi segmenter (YOLO zone detection combined with Kraken line segmentation)
KrakenLikeCommand expect `$out` in its command
:param input_list: List of images to process
:type input_list: List[str]
:param yolo_model: Path to a YOLOv8 model
:param line_model: [Optional] Path to a custom kraken line segmentation model
:param device: Device to run inference on
:param allow_failure: Continues to run despite errors
:param binary: Path to the YALTAi binary. If the same environment as RTK can be used, simply `yaltai`
:param raise_on_error: Raise an exception on error
:type raise_on_error: bool
"""
def __init__(
self,
*args,
yoloV5_model: Union[str, pathlib.Path],
yolo_model: Union[str, pathlib.Path],
line_model: Optional[Union[str, pathlib.Path]] = None,
device: str = "cpu",
allow_failure: bool = False,
check_content: bool = False,
binary: str = "yaltai", # Environment can be env/bin/yaltai
**kwargs):
if not os.path.exists(yoloV5_model):
raise ValueError(f"Unknown YOLOv5 model `{yoloV5_model}`")
if not os.path.exists(yolo_model):
raise ValueError(f"Unknown YOLOv8 model `{yolo_model}`")

cmd = f"{binary} kraken {' --verbose ' if kwargs.get('verbose') else ''} {' --raise-on-error ' if kwargs.get('raise-on-error') else ''} --device {device} R segment -y {yoloV5_model}".split(" ")
cmd = (f"{binary} kraken --verbose "
f"{' --raise-on-error ' if kwargs.get('raise_on_error') else ''} --device {device} R "
f"segment -y {yolo_model}").split(" ")

if line_model:
if not os.path.exists(line_model):
raise ValueError(f"Unknown YOLOv5 model `{line_model}`")
raise ValueError(f"Unknown Kraken model `{line_model}`")
cmd.extend(f"-i {line_model}".split(" "))
else:
print("Using default Kraken line segmenter.")
Expand Down
6 changes: 4 additions & 2 deletions rtk/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,8 @@ def get_name_before_manifest_json(url):


def string_to_hash(url: str) -> str:
result = hashlib.sha256(url.encode())
return result.digest().decode()
result = hashlib.sha256(url.encode("utf-8"))
return result.hexdigest()[:10]


def alto_zone_extraction(
Expand All @@ -224,6 +224,7 @@ def alto_zone_extraction(
return None
ns = dict(namespaces={"a": "http://www.loc.gov/standards/alto/ns-v4#"})
# <OtherTag ID="TYPE_35" LABEL="Adresse"/>
allowed_tags = []
if zones:
allowed_tags = [
str(otherTag.attrib["ID"])
Expand Down Expand Up @@ -296,6 +297,7 @@ def pdf_name_scheme(pdf_path: str, output_dir: Optional[str] = None, page_prefix
'output/check/f{}.jpg'
>>> pdf_name_scheme("check.pdf", page_prefix='p')
'check/p{}.jpg'
"""
path = Path(pdf_path)
if output_dir:
Expand Down
7 changes: 7 additions & 0 deletions tests/assets/LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Manifests.json

The manifest JSON files are under the ETALAB license and were provided by the BnF. Some data were removed.

# Pages from JPGs

Archive.org, Public Domain, Google PDF : https://archive.org/details/letheatredelana00fouggoog
Binary file modified tests/assets/lorem.pdf
Binary file not shown.
111 changes: 111 additions & 0 deletions tests/assets/manifest1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
{
"@id" : "https://gallica.bnf.fr/iiif/ark:/12148/bpt6k12401693/manifest.json",
"label" : "BnF, département Estampes et photographie, YD-1 (1880-04-05)-8",
"attribution" : "Bibliothèque nationale de France",
"license" : "https://gallica.bnf.fr/html/und/conditions-dutilisation-des-contenus-de-gallica",
"logo" : "https://gallica.bnf.fr/mbImage/logos/logo-bnf.png",
"related" : "https://gallica.bnf.fr/ark:/12148/bpt6k12401693",
"seeAlso" : [ "http://oai.bnf.fr/oai2/OAIHandler?verb=GetRecord&metadataPrefix=oai_dc&identifier=oai:bnf.fr:gallica/ark:/12148/bpt6k12401693" ],
"description" : "Catalogue de livres précieux ornés de reliures anciennes avec armoiries...",
"metadata" : [ {
"label" : "Repository",
"value" : "Bibliothèque nationale de France"
}, {
"label" : "Digitised by",
"value" : "Bibliothèque nationale de France"
}, {
"label" : "Source Images",
"value" : "https://gallica.bnf.fr/ark:/12148/bpt6k12401693"
}, {
"label" : "Metadata Source",
"value" : "http://oai.bnf.fr/oai2/OAIHandler?verb=GetRecord&metadataPrefix=oai_dc&identifier=oai:bnf.fr:gallica/ark:/12148/bpt6k12401693"
}, {
"label" : "Shelfmark",
"value" : "Bibliothèque nationale de France, département Estampes et photographie, YD-1 (1880-04-05)-8"
}, {
"label" : "Title",
"value" : "Catalogue de livres précieux ornés de reliures anciennes avec armoiries..."
}, {
"label" : "Date",
"value" : "1880"
}, {
"label" : "Language",
"value" : "french"
}, {
"label" : "Format",
"value" : [ {
"@value" : "XII-162 p. ; 25 cm"
}, {
"@value" : "Nombre total de vues : 187"
} ]
}, {
"label" : "Relation",
"value" : "Notice du catalogue : http://catalogue.bnf.fr/ark:/12148/cb415297616"
}, {
"label" : "Type",
"value" : "Book"
} ],
"sequences" : [ {
"canvases" : [ {
"@id" : "https://gallica.bnf.fr/iiif/ark:/12148/bpt6k12401693/canvas/f1",
"label" : "NP",
"height" : 1542,
"width" : 1285,
"images" : [ {
"motivation" : "sc:painting",
"on" : "https://gallica.bnf.fr/iiif/ark:/12148/bpt6k12401693/canvas/f1",
"resource" : {
"format" : "image/jpeg",
"service" : {
"profile" : "http://library.stanford.edu/iiif/image-api/1.1/compliance.html#level2",
"@context" : "http://iiif.io/api/image/1/context.json",
"@id" : "https://gallica.bnf.fr/iiif/ark:/12148/bpt6k12401693/f1"
},
"height" : 1542,
"width" : 1285,
"@id" : "https://gallica.bnf.fr/iiif/ark:/12148/bpt6k12401693/f1/full/full/0/native.jpg",
"@type" : "dctypes:Image"
},
"@type" : "oa:Annotation"
} ],
"thumbnail" : {
"@id" : "https://gallica.bnf.fr/ark:/12148/bpt6k12401693/f1.thumbnail"
},
"@type" : "sc:Canvas"
}, {
"@id" : "https://gallica.bnf.fr/iiif/ark:/12148/bpt6k12401693/canvas/f2",
"label" : "NP",
"height" : 3582,
"width" : 2470,
"images" : [ {
"motivation" : "sc:painting",
"on" : "https://gallica.bnf.fr/iiif/ark:/12148/bpt6k12401693/canvas/f2",
"resource" : {
"format" : "image/jpeg",
"service" : {
"profile" : "http://library.stanford.edu/iiif/image-api/1.1/compliance.html#level2",
"@context" : "http://iiif.io/api/image/1/context.json",
"@id" : "https://gallica.bnf.fr/iiif/ark:/12148/bpt6k12401693/f2"
},
"height" : 3582,
"width" : 2470,
"@id" : "https://gallica.bnf.fr/iiif/ark:/12148/bpt6k12401693/f2/full/full/0/native.jpg",
"@type" : "dctypes:Image"
},
"@type" : "oa:Annotation"
} ],
"thumbnail" : {
"@id" : "https://gallica.bnf.fr/ark:/12148/bpt6k12401693/f2.thumbnail"
},
"@type" : "sc:Canvas"
}],
"label" : "Current Page Order",
"@type" : "sc:Sequence",
"@id" : "https://gallica.bnf.fr/iiif/ark:/12148/bpt6k12401693/sequence/default"
} ],
"thumbnail" : {
"@id" : "https://gallica.bnf.fr/ark:/12148/bpt6k12401693.thumbnail"
},
"@type" : "sc:Manifest",
"@context" : "http://iiif.io/api/presentation/2/context.json"
}
Loading

0 comments on commit ae0cb06

Please sign in to comment.