Skip to content

Commit

Permalink
Merge pull request #57 from slub/workflow-volume
Browse files Browse the repository at this point in the history
Workflow volume
  • Loading branch information
markusweigelt authored Apr 12, 2023
2 parents a42880b + f49398b commit 57e863f
Show file tree
Hide file tree
Showing 8 changed files with 46 additions and 20 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ EXPOSE 22

WORKDIR /data
VOLUME /data
VOLUME /workflows

# simulate a virtual env for the makefile,
# coinciding with the Python system prefix
Expand Down
10 changes: 8 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ Variables:
currently: "$(PRIVATE)"
- DATA host directory to mount into `/data`
currently: "$(DATA)"
- WORKFLOWS host directory to mount into `/workflows`
currently: "$(WORKFLOWS)"
- UID user id to use in logins
currently: $(UID)
- GID group id to use in logins
Expand All @@ -47,6 +49,7 @@ help: ; @eval "$$HELP"
KEYS ?= $(firstword $(wildcard $(HOME)/.ssh/authorized_keys* $(HOME)/.ssh/id_*.pub))
PRIVATE ?= $(firstword $(filter-out %.pub,$(wildcard $(HOME)/.ssh/id_*)))
DATA ?= $(CURDIR)
WORKFLOWS ?= $(CURDIR)/workflows
UID ?= $(shell id -u)
GID ?= $(shell id -g)
UMASK ?= 0002
Expand All @@ -62,6 +65,7 @@ run: $(DATA)
--name ocrd_manager \
--network=$(NETWORK) \
-v $(DATA):/data \
-v $(WORKFLOWS):/workflows \
--mount type=bind,source=$(KEYS),target=/authorized_keys \
--mount type=bind,source=$(PRIVATE),target=/id_rsa \
-e UID=$(UID) -e GID=$(GID) -e UMASK=$(UMASK) \
Expand All @@ -88,7 +92,8 @@ test-production: $(DATA)/testdata-production
ifeq ($(NETWORK),bridge)
ssh -i $(PRIVATE) -Tn -p $(PORT) ocrd@localhost $(SCRIPT) $(<F)
else
docker exec -t -u ocrd `docker container ls -qf name=ocrd-manager` $(SCRIPT) $(<F)
if test -t 0 -a -t 1; then TTY=-i; fi; \
docker exec $$TTY -t -u ocrd `docker container ls -n1 -qf name=ocrd-manager` $(SCRIPT) $(<F)
endif
test -d $</ocr/alto
test -s $</ocr/alto/00000009.tif.original.xml
Expand All @@ -100,7 +105,8 @@ test-presentation:
ifeq ($(NETWORK),bridge)
ssh -i $(PRIVATE) -Tn -p $(PORT) ocrd@localhost $(SCRIPT) $(<F)/mets.xml
else
docker exec -t -u ocrd `docker container ls -qf name=ocrd-manager` $(SCRIPT) $(<F)/mets.xml
if test -t 0 -a -t 1; then TTY=-i; fi; \
docker exec $$TTY -t -u ocrd `docker container ls -n1 -qf name=ocrd-manager` $(SCRIPT) $(<F)/mets.xml
endif
diff -u <(docker run --rm -v $(DATA):/data $(TAGNAME) ocrd workspace -d $(<F) find -G FULLTEXT -g PHYS_0017..PHYS_0021) <(for file in FULLTEXT/FULLTEXT_PHYS_00{17..21}.xml; do echo $(PREFIX)/$$file; done)

Expand Down
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,12 @@ Build or pull the Docker image:

### Starting and mounting

Then run the container – providing a **host-side directory** for the volume
Then run the container – providing a **host-side directory** for the volumes

* `DATA`: directory for data processing (including images or existing workspaces),
defaults to current working directory
* `WORKFLOWS`: directory for scripts (preconfigured workflows),
defaults to `./workflows` in current working directory

… but also files …

Expand All @@ -73,7 +75,7 @@ Then run the container – providing a **host-side directory** for the volume

… thus, for **example**:

make run DATA=/mnt/workspaces MODELS=~/.local/share KEYS=~/.ssh/id_rsa.pub PORT=9022 PRIVATE=~/.ssh/id_rsa
make run DATA=/mnt/workspaces WORKFLOWS=/mnt/workflows KEYS=~/.ssh/id_rsa.pub PORT=9022 PRIVATE=~/.ssh/id_rsa

(You can also run the service via `docker-compose` manually – just `cp .env.example .env` and edit to your needs.)

Expand Down Expand Up @@ -140,7 +142,7 @@ which contains a trivial workflow:
- preprocessing, layout analysis and text recognition with a single Tesseract processor call
- format conversion of the result from PAGE-XML to ALTO-XML

It can be replaced with the (path) name of any workflow script mounted under `/data`.
It can be replaced with the (path) name of any workflow script mounted under `/workflows` or `/data`.

For example (assuming `testdata` is a directory with image files mounted under `/data`):

Expand Down Expand Up @@ -180,7 +182,7 @@ ENVIRONMENT VARIABLES:
CONTROLLER: host name and port of OCR-D Controller for processing
```

The same goes here for the `workflow parameter`.
For the `workflow` parameter, the same goes here as [above](#from-image-to-alto-files).

For example (assuming `testdata` is a directory with image files mounted under `/data`):

Expand Down
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ services:
source: ${MANAGER_KEY} # Manager private key (for access to Controller)
target: /id_rsa
- ${MANAGER_DATA}:/data # metadata directory
- ${MANAGER_WORKFLOWS}:/workflows
- shared:/run/lock/ocrd.jobs

tty: true # docker run -t
Expand Down
40 changes: 28 additions & 12 deletions ocrd_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,17 @@ logerr() {
logger -p user.info -t $TASK "terminating with error \$?=$? from ${BASH_COMMAND} on line $(caller)"
}

stopbg() {
logger -p user.crit -t $TASK "passing SIGKILL to child $!"
# pass signal on to children
kill -KILL $!
}

# initialize variables, create ord-d work directory and exit if something is missing
init() {
trap logerr ERR

trap stopbg INT TERM KILL

PID=$$

cd /data
Expand All @@ -29,13 +36,28 @@ init() {
exit 2
fi

WORKDIR=ocr-d/"$PROCESS_DIR" # use subdirectory of same volume so --reflink CoW still possible
if ! mkdir -p "$WORKDIR"; then
logger -p user.error -t $TASK "insufficient permissions on /data volume"
exit 5
fi
# try to be unique here (to avoid clashes)
REMOTEDIR="KitodoJob_${PID}_$(basename $PROCESS_DIR)"

WORKFLOW=$(command -v "$WORKFLOW" || realpath "$WORKFLOW")
if ! test -f "$WORKFLOW"; then
logger -p user.error -t $TASK "invalid workflow '$WORKFLOW'"
exit 3
fi
logger -p user.notice -t $TASK "using workflow '$WORKFLOW':"
ocrd_format_workflow | logger -p user.notice -t $TASK
if test "${WORKFLOW#/workflows/}" = "$WORKFLOW"; then
# full path does not start with /workflows/
# this is not a standard workflow - so make a copy
# in the workspace and use that path instead
cp -p "$WORKFLOW" "$WORKDIR/workflow.sh"
WORKFLOW="$WORKDIR/workflow.sh"
fi

if test -z "$CONTROLLER" -o "$CONTROLLER" = "${CONTROLLER#*:}"; then
logger -p user.error -t $TASK "envvar CONTROLLER='$CONTROLLER' must contain host:port"
Expand All @@ -44,18 +66,11 @@ init() {
CONTROLLERHOST=${CONTROLLER%:*}
CONTROLLERPORT=${CONTROLLER#*:}

WORKDIR=ocr-d/"$PROCESS_DIR" # will use other mount-point than /data soon
if ! mkdir -p "$WORKDIR"; then
logger -p user.error -t $TASK "insufficient permissions on /data volume"
exit 5
fi
# try to be unique here (to avoid clashes)
REMOTEDIR="KitodoJob_${PID}_$(basename $PROCESS_DIR)"

# create stats for monitor
mkdir -p /run/lock/ocrd.jobs/
{
echo PID=$PID
echo TIME_CREATED=$(date --rfc-3339=seconds)
echo PROCESS_ID=$PROCESS_ID
echo TASK_ID=$TASK_ID
echo PROCESS_DIR=$PROCESS_DIR
Expand All @@ -68,7 +83,8 @@ init() {
}

logret() {
sed -i 1s/.*/RETVAL=$?/ /run/lock/ocrd.jobs/$REMOTEDIR
sed -i "1s/PID=.*/RETVAL=$?/" /run/lock/ocrd.jobs/$REMOTEDIR
sed -i "2a TIME_TERMINATED=$(date --rfc-3339=seconds)" /run/lock/ocrd.jobs/$REMOTEDIR
}

init_task() {
Expand All @@ -83,15 +99,15 @@ ocrd_format_workflow() {

# ocrd import from workdir
ocrd_import_workdir() {
echo "echo \$\$ > $REMOTEDIR/ocrd.pid"
echo "if test -f '$REMOTEDIR/mets.xml'; then OV=--overwrite; else OV=; ocrd-import -i '$REMOTEDIR'; fi"
echo "cd '$REMOTEDIR'"
echo 'echo $$ > ocrd.pid'
}

ocrd_enter_workdir() {
echo "echo \$\$ > $REMOTEDIR/ocrd.pid"
echo "if test -f '$REMOTEDIR/mets.xml'; then OV=--overwrite; else OV=; fi"
echo "cd '$REMOTEDIR'"
echo 'echo $$ > ocrd.pid'
}

ocrd_process_workflow() {
Expand Down
2 changes: 1 addition & 1 deletion process_images.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ parse_args() {
SCRIPT=
PROCESS_ID=
TASK_ID=
WORKFLOW=ocr-workflow-default.sh
WORKFLOW=/workflows/ocr-workflow-default.sh
IMAGES_SUBDIR=images
RESULT_SUBDIR=ocr/alto
while (($#)); do
Expand Down
2 changes: 1 addition & 1 deletion process_mets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ parse_args() {
SCRIPT=
PROCESS_ID=
TASK_ID=
WORKFLOW=ocr-workflow-default.sh
WORKFLOW=/workflows/ocr-workflow-default.sh
PAGES=
IMAGES_GRP=DEFAULT
RESULT_GRP=FULLTEXT
Expand Down
File renamed without changes.

0 comments on commit 57e863f

Please sign in to comment.