Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Workflow volume #57

Merged
merged 7 commits into from
Apr 12, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ EXPOSE 22

WORKDIR /data
VOLUME /data
VOLUME /workflows

# simulate a virtual env for the makefile,
# coinciding with the Python system prefix
Expand Down
10 changes: 8 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ Variables:
currently: "$(PRIVATE)"
- DATA host directory to mount into `/data`
currently: "$(DATA)"
- WORKFLOWS host directory to mount into `/workflows`
currently: "$(WORKFLOWS)"
- UID user id to use in logins
currently: $(UID)
- GID group id to use in logins
Expand All @@ -47,6 +49,7 @@ help: ; @eval "$$HELP"
KEYS ?= $(firstword $(wildcard $(HOME)/.ssh/authorized_keys* $(HOME)/.ssh/id_*.pub))
PRIVATE ?= $(firstword $(filter-out %.pub,$(wildcard $(HOME)/.ssh/id_*)))
DATA ?= $(CURDIR)
WORKFLOWS ?= $(CURDIR)/workflows
UID ?= $(shell id -u)
GID ?= $(shell id -g)
UMASK ?= 0002
Expand All @@ -62,6 +65,7 @@ run: $(DATA)
--name ocrd_manager \
--network=$(NETWORK) \
-v $(DATA):/data \
-v $(WORKFLOWS):/workflows \
--mount type=bind,source=$(KEYS),target=/authorized_keys \
--mount type=bind,source=$(PRIVATE),target=/id_rsa \
-e UID=$(UID) -e GID=$(GID) -e UMASK=$(UMASK) \
Expand All @@ -88,7 +92,8 @@ test-production: $(DATA)/testdata-production
ifeq ($(NETWORK),bridge)
ssh -i $(PRIVATE) -Tn -p $(PORT) ocrd@localhost $(SCRIPT) $(<F)
else
docker exec -t -u ocrd `docker container ls -qf name=ocrd-manager` $(SCRIPT) $(<F)
if test -t 0 -a -t 1; then TTY=-i; fi; \
docker exec $$TTY -t -u ocrd `docker container ls -n1 -qf name=ocrd-manager` $(SCRIPT) $(<F)
endif
test -d $</ocr/alto
test -s $</ocr/alto/00000009.tif.original.xml
Expand All @@ -100,7 +105,8 @@ test-presentation:
ifeq ($(NETWORK),bridge)
ssh -i $(PRIVATE) -Tn -p $(PORT) ocrd@localhost $(SCRIPT) $(<F)/mets.xml
else
docker exec -t -u ocrd `docker container ls -qf name=ocrd-manager` $(SCRIPT) $(<F)/mets.xml
if test -t 0 -a -t 1; then TTY=-i; fi; \
docker exec $$TTY -t -u ocrd `docker container ls -n1 -qf name=ocrd-manager` $(SCRIPT) $(<F)/mets.xml
endif
diff -u <(docker run --rm -v $(DATA):/data $(TAGNAME) ocrd workspace -d $(<F) find -G FULLTEXT -g PHYS_0017..PHYS_0021) <(for file in FULLTEXT/FULLTEXT_PHYS_00{17..21}.xml; do echo $(PREFIX)/$$file; done)

Expand Down
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,12 @@ Build or pull the Docker image:

### Starting and mounting

Then run the container – providing a **host-side directory** for the volume
Then run the container – providing a **host-side directory** for the volumes

* `DATA`: directory for data processing (including images or existing workspaces),
defaults to current working directory
* `WORKFLOWS`: directory for scripts (preconfigured workflows),
defaults to `./workflows` in current working directory

… but also files …

Expand All @@ -73,7 +75,7 @@ Then run the container – providing a **host-side directory** for the volume

… thus, for **example**:

make run DATA=/mnt/workspaces MODELS=~/.local/share KEYS=~/.ssh/id_rsa.pub PORT=9022 PRIVATE=~/.ssh/id_rsa
make run DATA=/mnt/workspaces WORKFLOWS=/mnt/workflows KEYS=~/.ssh/id_rsa.pub PORT=9022 PRIVATE=~/.ssh/id_rsa

(You can also run the service via `docker-compose` manually – just `cp .env.example .env` and edit to your needs.)

Expand Down Expand Up @@ -140,7 +142,7 @@ which contains a trivial workflow:
- preprocessing, layout analysis and text recognition with a single Tesseract processor call
- format conversion of the result from PAGE-XML to ALTO-XML

It can be replaced with the (path) name of any workflow script mounted under `/data`.
It can be replaced with the (path) name of any workflow script mounted under `/workflows` or `/data`.

For example (assuming `testdata` is a directory with image files mounted under `/data`):

Expand Down Expand Up @@ -180,7 +182,7 @@ ENVIRONMENT VARIABLES:
CONTROLLER: host name and port of OCR-D Controller for processing
```

The same goes here for the `workflow parameter`.
For the `workflow` parameter, the same goes here as [above](#from-image-to-alto-files).

For example (assuming `testdata` is a directory with image files mounted under `/data`):

Expand Down
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ services:
source: ${MANAGER_KEY} # Manager private key (for access to Controller)
target: /id_rsa
- ${MANAGER_DATA}:/data # metadata directory
- ${MANAGER_WORKFLOWS}:/workflows
- shared:/run/lock/ocrd.jobs

tty: true # docker run -t
Expand Down
37 changes: 25 additions & 12 deletions ocrd_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,17 @@ logerr() {
logger -p user.info -t $TASK "terminating with error \$?=$? from ${BASH_COMMAND} on line $(caller)"
}

stopbg() {
logger -p user.crit -t $TASK "passing SIGKILL to child $!"
# pass signal on to children
kill -KILL $!
}

# initialize variables, create ord-d work directory and exit if something is missing
init() {
trap logerr ERR

trap stopbg INT TERM KILL

PID=$$

cd /data
Expand All @@ -29,13 +36,25 @@ init() {
exit 2
fi

WORKDIR=ocr-d/"$PROCESS_DIR" # use subdirectory of same volume so --reflink CoW still possible
if ! mkdir -p "$WORKDIR"; then
logger -p user.error -t $TASK "insufficient permissions on /data volume"
exit 5
fi
# try to be unique here (to avoid clashes)
REMOTEDIR="KitodoJob_${PID}_$(basename $PROCESS_DIR)"

WORKFLOW=$(command -v "$WORKFLOW" || realpath "$WORKFLOW")
if ! test -f "$WORKFLOW"; then
logger -p user.error -t $TASK "invalid workflow '$WORKFLOW'"
exit 3
fi
logger -p user.notice -t $TASK "using workflow '$WORKFLOW':"
ocrd_format_workflow | logger -p user.notice -t $TASK
if test "${WORKFLOW#/workflows/}" = "$WORKFLOW"; then
Copy link
Collaborator

@markusweigelt markusweigelt Apr 11, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not understand the check here cause of Line 48 and i think that needs a comment.

So with this parameter substitution it looks like the file test Line 48 will fail or this check will not be successful.

IMO for example:

With /workflows/ocr-workflow-default.sh as $WORKFLOW Line 48 will successful.

Line 54 "ocr-workflow-default.sh" = "/workflows/ocr-workflow-default.sh" will fail.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's precisely the point. The check trivially looks whether the realpath of the workflow starts with /workflows and thus, needs no further action. If it does not, then it gets copied into the workdir under a default name, to ensure it will be accessible.

Copy link
Collaborator

@markusweigelt markusweigelt Apr 11, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alone that already the inquiry came from me would be worth to write comment there cause it is not easy to read (for none linux power users those for whom parameter substitution syntax has not yet become flesh and blood 😉 ) and understand why you do that here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is that by this logic, I would have to second every other statement in the script with an explanatory comment. Code is also documentation in itself. Only complex / non-trivial code bits should be commented.

If you encounter an unknown construct in a (new) language, it's up to you to visit the documentation first. This expansion test is very common in shell scripts.

Feel free to make a suggestion with a better formulation (preferbly inside the condition, i.e. from the perspective after the test succeeded).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feel free to make a suggestion with a better formulation (preferbly inside the condition, i.e. from the perspective after the test succeeded).

Maybe a little bit simpler otherwise i would separate the variable or leave a small comment.

Suggested change
if test "${WORKFLOW#/workflows/}" = "$WORKFLOW"; then
if test "${WORKFLOW:0:11}" != "/workflow/"; then

IMO not only knowledge was required, you also had to think outside the box. I think code should be clearly visible without thinking outside the box. But I understand your points and I can also live with the current state.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know what you mean by thinking outside the box here. This substitution test is straightforward and very common (e.g. you can see tons of these tests in every autoconf script). Your offeset-length test would also work (with plural s), but is less idiomatic.

I just added a comment – happy now?

(You still need to unrequest changes, otherwise the PR is still blocked.)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thx sorry for the circumstances 🤐

markusweigelt marked this conversation as resolved.
Show resolved Hide resolved
cp -p "$WORKFLOW" "$WORKDIR/workflow.sh"
WORKFLOW="$WORKDIR/workflow.sh"
fi

if test -z "$CONTROLLER" -o "$CONTROLLER" = "${CONTROLLER#*:}"; then
logger -p user.error -t $TASK "envvar CONTROLLER='$CONTROLLER' must contain host:port"
Expand All @@ -44,18 +63,11 @@ init() {
CONTROLLERHOST=${CONTROLLER%:*}
CONTROLLERPORT=${CONTROLLER#*:}

WORKDIR=ocr-d/"$PROCESS_DIR" # will use other mount-point than /data soon
if ! mkdir -p "$WORKDIR"; then
logger -p user.error -t $TASK "insufficient permissions on /data volume"
exit 5
fi
# try to be unique here (to avoid clashes)
REMOTEDIR="KitodoJob_${PID}_$(basename $PROCESS_DIR)"

# create stats for monitor
mkdir -p /run/lock/ocrd.jobs/
{
echo PID=$PID
echo TIME_CREATED=$(date --rfc-3339=seconds)
echo PROCESS_ID=$PROCESS_ID
echo TASK_ID=$TASK_ID
echo PROCESS_DIR=$PROCESS_DIR
Expand All @@ -68,7 +80,8 @@ init() {
}

logret() {
sed -i 1s/.*/RETVAL=$?/ /run/lock/ocrd.jobs/$REMOTEDIR
sed -i "1s/PID=.*/RETVAL=$?/" /run/lock/ocrd.jobs/$REMOTEDIR
sed -i "2a TIME_TERMINATED=$(date --rfc-3339=seconds)" /run/lock/ocrd.jobs/$REMOTEDIR
}

init_task() {
Expand All @@ -83,15 +96,15 @@ ocrd_format_workflow() {

# ocrd import from workdir
ocrd_import_workdir() {
echo "echo \$\$ > $REMOTEDIR/ocrd.pid"
echo "if test -f '$REMOTEDIR/mets.xml'; then OV=--overwrite; else OV=; ocrd-import -i '$REMOTEDIR'; fi"
echo "cd '$REMOTEDIR'"
echo 'echo $$ > ocrd.pid'
}

ocrd_enter_workdir() {
echo "echo \$\$ > $REMOTEDIR/ocrd.pid"
echo "if test -f '$REMOTEDIR/mets.xml'; then OV=--overwrite; else OV=; fi"
echo "cd '$REMOTEDIR'"
echo 'echo $$ > ocrd.pid'
}

ocrd_process_workflow() {
Expand Down
2 changes: 1 addition & 1 deletion process_images.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ parse_args() {
SCRIPT=
PROCESS_ID=
TASK_ID=
WORKFLOW=ocr-workflow-default.sh
WORKFLOW=/workflows/ocr-workflow-default.sh
IMAGES_SUBDIR=images
RESULT_SUBDIR=ocr/alto
while (($#)); do
Expand Down
2 changes: 1 addition & 1 deletion process_mets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ parse_args() {
SCRIPT=
PROCESS_ID=
TASK_ID=
WORKFLOW=ocr-workflow-default.sh
WORKFLOW=/workflows/ocr-workflow-default.sh
PAGES=
IMAGES_GRP=DEFAULT
RESULT_GRP=FULLTEXT
Expand Down
File renamed without changes.