Skip to content

Commit

Permalink
import: oton converter code and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
MehmedGIT committed Oct 28, 2024
1 parent 46306e2 commit 5eac08f
Show file tree
Hide file tree
Showing 29 changed files with 5,974 additions and 1 deletion.
Empty file.
31 changes: 31 additions & 0 deletions src/utils/operandi_utils/oton/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import click
from .converter import Converter
from .validators.ocrd_validator import OCRDValidator


@click.group()
def cli():
    # Entry-point group for the OtoN (OCR-D to Nextflow) command line interface.
    # NOTE: intentionally no docstring — click would use it as the group help text.
    pass


@cli.command("convert", help="Convert an OCR-D workflow to a Nextflow workflow script.")
@click.option('-I', '--input_path', required=True,
              type=click.Path(dir_okay=False, exists=True, readable=True),
              help='Path to the OCR-D workflow file to be converted.')
@click.option('-O', '--output_path', required=True,
              type=click.Path(dir_okay=False, writable=True),
              help='Path of the Nextflow workflow script to be generated.')
@click.option('-D', '--dockerized', is_flag=True,
              help='If set, then the dockerized variant of the Nextflow script is generated.')
def convert(input_path: str, output_path: str, dockerized: bool):
    # Both paths are required=True so click reports a clean usage error instead
    # of passing None into the Converter and failing with an obscure traceback.
    print(f"Converting from: {input_path}")
    print(f"Converting to: {output_path}")
    Converter().convert_OtoN(input_path, output_path, dockerized)
    print("Conversion was successful!")


@cli.command("validate", help="Validate an OCR-D workflow txt file.")
@click.option('-I', '--input_path', required=True,
              type=click.Path(dir_okay=False, exists=True, readable=True),
              help='Path to the OCR-D workflow file to be validated.')
def validate(input_path: str):
    # Announce the target BEFORE validating, so the path is visible even when
    # validation raises an error (previously it was printed only on success).
    print(f"Validating: {input_path}")
    OCRDValidator().validate(input_path)
    print("Validation was successful!")
101 changes: 101 additions & 0 deletions src/utils/operandi_utils/oton/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from json import load
from os import environ
from pkg_resources import resource_filename


# Public API of this constants module (used by `from .constants import *`).
__all__ = [
    "DIR_IN",
    "DIR_OUT",
    "METS_FILE",

    "OCRD_ALL_JSON",
    "OTON_LOG_LEVEL",
    "OTON_LOG_FORMAT",

    "PARAMS_KEY_DOCKER_PWD",
    "PARAMS_KEY_DOCKER_VOLUME",
    "PARAMS_KEY_DOCKER_MODELS",
    "PARAMS_KEY_DOCKER_MODELS_DIR",
    "PARAMS_KEY_DOCKER_IMAGE",
    "PARAMS_KEY_DOCKER_COMMAND",
    "PARAMS_KEY_INPUT_FILE_GRP",
    "PARAMS_KEY_METS_PATH",
    "PARAMS_KEY_MODELS_PATH",
    "PARAMS_KEY_WORKSPACE_PATH",

    "PH_DOCKER_COMMAND",
    "PH_DIR_IN",
    "PH_DIR_OUT",
    "PH_METS_FILE",

    "REPR_DOCKER_COMMAND",
    "REPR_DOCKER_IMAGE",
    "REPR_DOCKER_MODELS",
    "REPR_DOCKER_MODELS_DIR",
    "REPR_DOCKER_PWD",
    "REPR_DOCKER_VOLUME",
    "REPR_INPUT_FILE_GRP",
    "REPR_METS_PATH",
    "REPR_MODELS_PATH",
    "REPR_WORKSPACE_PATH",

    "SPACES"
]

# Bundled JSON describing the known OCR-D processors, loaded once at import time.
# NOTE(review): pkg_resources is deprecated upstream; consider importlib.resources.
OCRD_ALL_JSON_FILE = resource_filename(__name__, 'ocrd_all_tool.json')
with open(OCRD_ALL_JSON_FILE) as f:
    OCRD_ALL_JSON = load(f)

# Logging configuration, overridable through the OTON_LOG_LEVEL environment variable.
OTON_LOG_LEVEL = environ.get("OTON_LOG_LEVEL", "INFO")
OTON_LOG_FORMAT = '%(asctime)s %(levelname)s %(name)s:%(funcName)s: %(lineno)s: %(message)s'

# Parameter keys — the Nextflow `params.*` names used in generated scripts.
PARAMS_KEY_DOCKER_COMMAND: str = 'params.docker_command'
PARAMS_KEY_DOCKER_IMAGE: str = 'params.docker_image'
PARAMS_KEY_DOCKER_PWD: str = 'params.docker_pwd'
PARAMS_KEY_DOCKER_VOLUME: str = 'params.docker_volume'
PARAMS_KEY_DOCKER_MODELS: str = 'params.docker_models'
PARAMS_KEY_DOCKER_MODELS_DIR: str = 'params.docker_models_dir'
PARAMS_KEY_METS_PATH: str = 'params.mets_path'
PARAMS_KEY_INPUT_FILE_GRP: str = 'params.input_file_grp'
PARAMS_KEY_MODELS_PATH: str = 'params.models_path'
PARAMS_KEY_WORKSPACE_PATH: str = 'params.workspace_path'


def __build_docker_command():
    # Assemble the `docker run` prefix used by dockerized Nextflow processes.
    # The backslash before `$(id -u)` keeps the command substitution for bash,
    # while the unescaped `$params.*` references are resolved by Nextflow.
    parts = (
        'docker run --rm',
        '-u \\$(id -u)',
        f'-v ${PARAMS_KEY_DOCKER_VOLUME}',
        f'-v ${PARAMS_KEY_DOCKER_MODELS}',
        f'-w ${PARAMS_KEY_DOCKER_PWD}',
        f'-- ${PARAMS_KEY_DOCKER_IMAGE}',
    )
    return ' '.join(parts)


def __build_repr(parameter, value):
    # Render a single Nextflow parameter assignment line, e.g.
    # params.mets_path = "null"
    return '{} = "{}"'.format(parameter, value)


# Parameters - file representation (the literal lines written to the script header)
REPR_DOCKER_COMMAND: str = __build_repr(PARAMS_KEY_DOCKER_COMMAND, __build_docker_command())
REPR_DOCKER_IMAGE: str = __build_repr(PARAMS_KEY_DOCKER_IMAGE, "null")
REPR_DOCKER_MODELS: str = __build_repr(PARAMS_KEY_DOCKER_MODELS,
                                       f'${PARAMS_KEY_MODELS_PATH}:${PARAMS_KEY_DOCKER_MODELS_DIR}')
REPR_DOCKER_MODELS_DIR: str = __build_repr(PARAMS_KEY_DOCKER_MODELS_DIR, "null")
REPR_DOCKER_PWD: str = __build_repr(PARAMS_KEY_DOCKER_PWD, "null")
REPR_DOCKER_VOLUME: str = __build_repr(PARAMS_KEY_DOCKER_VOLUME,
                                       f'${PARAMS_KEY_WORKSPACE_PATH}:${PARAMS_KEY_DOCKER_PWD}')
REPR_METS_PATH: str = __build_repr(PARAMS_KEY_METS_PATH, "null")
REPR_INPUT_FILE_GRP: str = __build_repr(PARAMS_KEY_INPUT_FILE_GRP, "null")
REPR_MODELS_PATH: str = __build_repr(PARAMS_KEY_MODELS_PATH, "null")
REPR_WORKSPACE_PATH: str = __build_repr(PARAMS_KEY_WORKSPACE_PATH, "null")

# Names of the variables used inside generated Nextflow process blocks
DIR_IN: str = 'input_file_grp'
DIR_OUT: str = 'output_file_grp'
METS_FILE: str = 'mets_file'

# Placeholders resolved by Nextflow at runtime, e.g. '${params.docker_command}'.
# Literal braces are produced with standard f-string escaping ('{{' and '}}')
# instead of the previous BS-indexing workaround.
BS: str = '{}'  # retained for backward compatibility; no longer used internally
PH_DOCKER_COMMAND: str = f'${{{PARAMS_KEY_DOCKER_COMMAND}}}'
PH_DIR_IN: str = f'${{{DIR_IN}}}'
PH_DIR_OUT: str = f'${{{DIR_OUT}}}'
PH_METS_FILE: str = f'${{{METS_FILE}}}'
SPACES = ' '
22 changes: 22 additions & 0 deletions src/utils/operandi_utils/oton/converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from .models import NextflowFileExecutable
from .validator import OCRDValidator


class Converter:
    """Converts an OCR-D process workflow file into a Nextflow workflow script."""

    @staticmethod
    def convert_OtoN(input_path: str, output_path: str, dockerized: bool = False):
        """Validate the OCR-D workflow at input_path and write the generated
        Nextflow script to output_path.

        :param input_path: path of the OCR-D workflow txt file to convert
        :param output_path: path of the Nextflow script to produce
        :param dockerized: when True, generate the dockerized script variant
        """
        validator = OCRDValidator()
        validator.validate(input_path)

        nf_file_executable = NextflowFileExecutable()
        nf_file_executable.build_parameters(dockerized)
        nf_processes, first_file_grps = nf_file_executable.build_nextflow_processes(validator.processors, dockerized)
        # Replace the default "null" value of the pre-built input_file_grp
        # parameter line with the first processor's input file group. The line
        # is located by content rather than by a hard-coded list index, which
        # was fragile whenever build_parameters changed.
        for index, line in enumerate(nf_file_executable.nf_lines_parameters):
            if 'input_file_grp' in line:
                nf_file_executable.nf_lines_parameters[index] = line.replace("null", f"{first_file_grps}")
                break
        nf_file_executable.build_main_workflow(nf_processes)
        nf_file_executable.produce_nextflow_file(output_path)
5 changes: 5 additions & 0 deletions src/utils/operandi_utils/oton/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
__all__ = ["NextflowBlockProcess", "NextflowBlockWorkflow", "NextflowFileExecutable"]

from .nf_block_process import NextflowBlockProcess
from .nf_block_workflow import NextflowBlockWorkflow
from .nf_file_executable import NextflowFileExecutable
66 changes: 66 additions & 0 deletions src/utils/operandi_utils/oton/models/nf_block_process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import logging
from ..validator import ProcessorCallArguments
from ..constants import OTON_LOG_LEVEL, OTON_LOG_FORMAT, PH_DIR_IN, PH_DIR_OUT, PH_METS_FILE, PH_DOCKER_COMMAND, SPACES


class NextflowBlockProcess:
    """Represents a single `process` block of a generated Nextflow script,
    wrapping one OCR-D processor call."""

    def __init__(self, processor_call_arguments: ProcessorCallArguments, index_pos: int, dockerized: bool = False):
        """
        :param processor_call_arguments: parsed OCR-D processor call; NOTE:
            this object is mutated in place below (file groups and mets path
            are replaced with Nextflow placeholders), which is visible to the
            caller.
        :param index_pos: position of the processor in the workflow; used to
            make the generated process name unique.
        :param dockerized: when True, the rendered script line is prefixed
            with the docker command placeholder.
        """
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.getLevelName(OTON_LOG_LEVEL))
        logging.basicConfig(format=OTON_LOG_FORMAT)

        self.dockerized = dockerized
        # Nextflow process names cannot contain dashes, hence the replacement.
        self.nf_process_name = processor_call_arguments.executable.replace('-', '_') + "_" + str(index_pos)
        # [process name, quoted input file groups, quoted output file groups] —
        # captured BEFORE the placeholder substitution below so the real file
        # groups are preserved for the workflow block.
        self.repr_in_workflow = [
            self.nf_process_name,
            f'"{processor_call_arguments.input_file_grps}"',
            f'"{processor_call_arguments.output_file_grps}"'
        ]

        # In-place substitution: the rendered command must reference the
        # Nextflow placeholders, not the concrete file groups / mets path.
        processor_call_arguments.input_file_grps = PH_DIR_IN
        processor_call_arguments.output_file_grps = PH_DIR_OUT
        processor_call_arguments.mets_file_path = PH_METS_FILE

        # Presumably ProcessorCallArguments.__str__ renders the full ocrd CLI
        # call — TODO confirm against the validator module.
        self.ocrd_command_bash = f'{processor_call_arguments}'
        self.directives = []
        self.input_params = []
        self.output_params = []

    def file_representation(self) -> str:
        """Render this process block as Nextflow source text."""
        representation = f'process {self.nf_process_name}' + ' {\n'

        for directive in self.directives:
            representation += f'{SPACES}{directive}\n'
        representation += '\n'

        representation += f'{SPACES}input:\n'
        for input_param in self.input_params:
            representation += f'{SPACES}{SPACES}{input_param}\n'
        representation += '\n'

        representation += f'{SPACES}output:\n'
        for output_param in self.output_params:
            representation += f'{SPACES}{SPACES}{output_param}\n'
        representation += '\n'

        representation += f'{SPACES}script:\n'
        representation += f'{SPACES}{SPACES}"""\n'
        if self.dockerized:
            representation += f'{SPACES}{SPACES}{PH_DOCKER_COMMAND} {self.ocrd_command_bash}\n'
        else:
            representation += f'{SPACES}{SPACES}{self.ocrd_command_bash}\n'
        representation += f'{SPACES}{SPACES}"""\n'

        representation += '}\n'

        self.logger.debug(f"\n{representation}")
        return representation

    def add_directive(self, directive: str) -> None:
        """Append a process directive line, e.g. 'maxForks 1'."""
        self.directives.append(directive)

    def add_input_param(self, parameter: str) -> None:
        """Append an input declaration line, e.g. 'path mets_file'."""
        self.input_params.append(parameter)

    def add_output_param(self, parameter: str) -> None:
        """Append an output declaration line, e.g. 'path mets_file'."""
        self.output_params.append(parameter)
34 changes: 34 additions & 0 deletions src/utils/operandi_utils/oton/models/nf_block_workflow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import logging
from typing import List
from ..constants import OTON_LOG_LEVEL, OTON_LOG_FORMAT, PARAMS_KEY_METS_PATH, PARAMS_KEY_INPUT_FILE_GRP, SPACES


class NextflowBlockWorkflow:
    """Represents the `workflow { }` block of a generated Nextflow script,
    chaining the process calls so each consumes the previous output."""

    def __init__(self, workflow_name: str, nf_processes: List[List[str]]):
        """
        :param workflow_name: label of the workflow block, e.g. "main"
        :param nf_processes: triples of [process_name, quoted input file
            groups, quoted output file groups], as produced by
            NextflowBlockProcess.repr_in_workflow. (The previous annotation
            `List[str]` was wrong — each item is itself a 3-element list.)
        """
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.getLevelName(OTON_LOG_LEVEL))
        logging.basicConfig(format=OTON_LOG_FORMAT)

        self.workflow_name = workflow_name
        self.nf_processes: List[List[str]] = nf_processes

    def file_representation(self) -> str:
        """Render the workflow block as Nextflow source text."""
        representation = 'workflow {\n'
        representation += f'{SPACES}{self.workflow_name}:\n'

        previous_process = None
        for process_name, input_file_grps, output_file_grps in self.nf_processes:
            if previous_process is None:
                # The first process receives the workflow-level parameters.
                representation += f'{SPACES}{SPACES}{process_name}({PARAMS_KEY_METS_PATH}, {PARAMS_KEY_INPUT_FILE_GRP}, {output_file_grps})\n'
            else:
                # Subsequent processes consume the previous process' output.
                representation += f'{SPACES}{SPACES}{process_name}({previous_process}.out, {input_file_grps}, {output_file_grps})\n'
            previous_process = process_name

        representation += '}'

        self.logger.debug(f"\n{representation}")
        self.logger.info(f"Successfully created Nextflow Workflow: {self.workflow_name}")
        return representation
98 changes: 98 additions & 0 deletions src/utils/operandi_utils/oton/models/nf_file_executable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import logging
from typing import List, Tuple

from ..validator import ProcessorCallArguments
from ..constants import (
DIR_IN,
DIR_OUT,
METS_FILE,

OTON_LOG_FORMAT,
OTON_LOG_LEVEL,

REPR_DOCKER_COMMAND,
REPR_DOCKER_IMAGE,
REPR_DOCKER_MODELS,
REPR_DOCKER_MODELS_DIR,
REPR_DOCKER_PWD,
REPR_DOCKER_VOLUME,
REPR_METS_PATH,
REPR_INPUT_FILE_GRP,
REPR_MODELS_PATH,
REPR_WORKSPACE_PATH
)
from .nf_block_process import NextflowBlockProcess
from .nf_block_workflow import NextflowBlockWorkflow


class NextflowFileExecutable:
    """Builder that assembles the three sections of an executable Nextflow
    script (parameters, process blocks, main workflow) and writes them out."""

    def __init__(self):
        # Script sections — each is a list of text lines, written in order.
        self.nf_lines_parameters = []
        self.nf_lines_processes = []
        self.nf_lines_workflow = []
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.getLevelName(OTON_LOG_LEVEL))
        logging.basicConfig(format=OTON_LOG_FORMAT)

    def build_parameters(self, dockerized: bool = False):
        """Append the pipeline parameter lines (`params.*`) to the
        parameters section.

        :param dockerized: when True, also emit the docker-related parameters.
        """
        self.nf_lines_parameters.append('nextflow.enable.dsl = 2')
        self.nf_lines_parameters.append('')

        self.nf_lines_parameters.append(REPR_METS_PATH)
        self.nf_lines_parameters.append(REPR_INPUT_FILE_GRP)

        if dockerized:
            self.nf_lines_parameters.append(REPR_WORKSPACE_PATH)
            self.nf_lines_parameters.append(REPR_DOCKER_PWD)
            self.nf_lines_parameters.append(REPR_DOCKER_VOLUME)
            self.nf_lines_parameters.append(REPR_DOCKER_MODELS_DIR)
            self.nf_lines_parameters.append(REPR_MODELS_PATH)
            self.nf_lines_parameters.append(REPR_DOCKER_MODELS)
            self.nf_lines_parameters.append(REPR_DOCKER_IMAGE)
            self.nf_lines_parameters.append(REPR_DOCKER_COMMAND)

        self.nf_lines_parameters.append('')

    def build_nextflow_processes(
        self,
        ocrd_processor: List[ProcessorCallArguments],
        dockerized: bool = False
    ) -> Tuple[List[List[str]], str]:
        """Create one Nextflow process block per OCR-D processor call.

        :param ocrd_processor: parsed processor calls, in workflow order
        :param dockerized: forwarded to each NextflowBlockProcess
        :return: (list of [name, in_grps, out_grps] triples for the workflow
            block, input file group of the FIRST processor — used by the
            caller as the default value of the input_file_grp parameter)
        """
        nf_processes: List[List[str]] = []
        first_file_grps = "DEFAULT"
        for index, processor in enumerate(ocrd_processor):
            nf_process_block = NextflowBlockProcess(processor, index, dockerized)
            nf_process_block.add_directive('maxForks 1')
            nf_process_block.add_input_param(f'path {METS_FILE}')
            nf_process_block.add_input_param(f'val {DIR_IN}')
            nf_process_block.add_input_param(f'val {DIR_OUT}')
            nf_process_block.add_output_param(f'path {METS_FILE}')
            self.nf_lines_processes.append(nf_process_block.file_representation())

            # Remember the first processor's input file group; the quoting
            # added by repr_in_workflow is stripped again here.
            if index == 0:
                first_file_grps = nf_process_block.repr_in_workflow[1].strip('"')

            # This list is used when building the workflow block.
            nf_processes.append(nf_process_block.repr_in_workflow)
            self.logger.info(f"Successfully created Nextflow Process: {nf_process_block.nf_process_name}")

        return nf_processes, first_file_grps

    def build_main_workflow(self, nf_processes: List[List[str]]):
        """Append the 'main' workflow block chaining all processes together."""
        nf_workflow_block = NextflowBlockWorkflow("main", nf_processes)
        self.nf_lines_workflow.append(nf_workflow_block.file_representation())

    def produce_nextflow_file(self, output_path: str):
        """Write all collected sections, in order, to the output script file."""
        with open(output_path, mode='w', encoding='utf-8') as nextflow_file:
            for nextflow_line in self.nf_lines_parameters + self.nf_lines_processes + self.nf_lines_workflow:
                nextflow_file.write(f'{nextflow_line}\n')
Loading

0 comments on commit 5eac08f

Please sign in to comment.