Skip to content

Commit

Permalink
OCR: Tesseract 4.1.1 / Ghostscript 9.54.0
Browse files Browse the repository at this point in the history
With tesseract v4.0.0-beta.3 we often observe crashes with:

```
contains_unichar_id(unichar_id):Error:Assert failed:in file ../../src/ccutil/unicharset.h, line 511
```

This seems to have been fixed by tesseract-ocr/tesseract#1954

Still, even after updating to 4.1.1, text recognition from PDF in ERP5 is too expensive. We also update Ghostscript to 9.54.0 because this version has built-in OCR, which avoids converting the PDF to PNG and then to TIFF as we currently do in ERP5.

See merge request nexedi/slapos!985
  • Loading branch information
perrinjerome committed Jun 4, 2021
2 parents 582b0b0 + 1b29141 commit ec129b7
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 64 deletions.
27 changes: 16 additions & 11 deletions component/ghostscript/buildout.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,41 @@
extends =
../fontconfig/buildout.cfg
../freetype/buildout.cfg
../libjpeg/buildout.cfg
../libtiff/buildout.cfg
../libxml2/buildout.cfg
../pkgconfig/buildout.cfg
../tesseract/buildout.cfg
../xz-utils/buildout.cfg

parts = ghostscript

[ghostscript-common]
[ghostscript]
recipe = slapos.recipe.cmmi
shared = true
pkg_config_depends = ${libtiff:location}/lib/pkgconfig:${fontconfig:location}/lib/pkgconfig:${fontconfig:pkg_config_depends}
url = https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs9540/ghostscript-9.54.0.tar.gz
md5sum = 5d571792a8eb826c9f618fb69918d9fc
pkg_config_depends = ${libtiff:location}/lib/pkgconfig:${libjpeg:location}/lib/pkgconfig:${fontconfig:location}/lib/pkgconfig:${fontconfig:pkg_config_depends}
# XXX --with-tessdata works around a slaprunner bug where software is installed in a path containing //
configure-options =
--disable-cups
--disable-threadsafe
--with-system-libtiff
--without-libidn
--without-x
--with-drivers=FILES
# it seems that parallel build sometimes fails for ghostscript.
make-options = -j1
--with-tessdata=$(python -c 'print("""${:tessdata-location}""".replace("//", "/"))')
environment =
PATH=${pkgconfig:location}/bin:${xz-utils:location}/bin:%(PATH)s
PKG_CONFIG_PATH=${:pkg_config_depends}
LDFLAGS=-Wl,-rpath=${fontconfig:location}/lib -Wl,-rpath=${freetype:location}/lib -Wl,-rpath=${libtiff:location}/lib
CFLAGS=-I${libjpeg:location}/include
LDFLAGS=-Wl,-rpath=${fontconfig:location}/lib -Wl,-rpath=${freetype:location}/lib -Wl,-rpath=${libtiff:location}/lib -L${libjpeg:location}/lib -Wl,-rpath=${libjpeg:location}/lib
LD_LIBRARY_PATH=${fontconfig:location}/lib:${freetype:location}/lib:${libtiff:location}/lib:${libxml2:location}/lib

[ghostscript]
<= ghostscript-9
# configure gives priority to the bundled jpeg library and refuses to mix the bundled libjpeg with a "system" libtiff.
# We remove the bundled jpeg source folder so that configure picks up the slapos versions of these libraries.
pre-configure = rm -r jpeg

[ghostscript-9]
<= ghostscript-common
url = https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs950/ghostscript-9.50.tar.xz
md5sum = 6cea6bae4a7cdfac6ccb09f07f0caf8c
post-make-hook = ${tesseract-download-traineddata:post-make-hook}
tessdata-location = @@LOCATION@@/share/tessdata/
tessdata-urls = ${tesseract-download-traineddata:urls}
7 changes: 2 additions & 5 deletions component/leptonica/buildout.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,14 @@ extends =
../libtiff/buildout.cfg
../webp/buildout.cfg
../giflib/buildout.cfg
../patch/buildout.cfg

[leptonica]
recipe = slapos.recipe.cmmi
url = http://www.leptonica.com/source/leptonica-1.76.0.tar.gz
md5sum = a263a5e4f7e8f8a661fb121a265d2d20
shared = true
url = http://www.leptonica.org/source/leptonica-1.80.0.tar.gz
md5sum = d640d684234442a84c9e8902f0b3ff36
configure-options =
--disable-static

environment =
CPPFLAGS=-I${zlib:location}/include -I${libjpeg:location}/include -I${libpng:location}/include -I${libtiff:location}/include -I${webp:location}/include -I${giflib:location}/include
LDFLAGS=-L${zlib:location}/lib -Wl,-rpath=${zlib:location}/lib -L${libjpeg:location}/lib -Wl,-rpath=${libjpeg:location}/lib -L${libpng:location}/lib -Wl,-rpath=${libpng:location}/lib -L${libtiff:location}/lib -Wl,-rpath=${libtiff:location}/lib -L${webp:location}/lib -Wl,-rpath=${webp:location}/lib -L${giflib:location}/lib -Wl,-rpath=${giflib:location}/lib
PATH=${patch:location}/bin:%(PATH)s
22 changes: 0 additions & 22 deletions component/leptonica/leptonica-1.69-zlib-include.patch

This file was deleted.

39 changes: 15 additions & 24 deletions component/tesseract/buildout.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -10,43 +10,34 @@ extends =
../fontconfig/buildout.cfg
../lcms/buildout.cfg
../pkgconfig/buildout.cfg

./buildout.hash.cfg
parts =
tesseract
tesseract-traineddata
tesseract-eng-traineddata
tesseract-osd-traineddata

[tesseract]
recipe = slapos.recipe.cmmi
url = https://github.com/tesseract-ocr/tesseract/archive/6b250b58121a9858d3e3019a78a6f7d421bd0fc7.tar.gz
md5sum = fdc38148ad8eb1bd0485a217503dd6d5
shared = true
url = https://github.com/tesseract-ocr/tesseract/archive/refs/tags/4.1.1.tar.gz
md5sum = 51fe2bcbff1bbce77a25d180fd247f7d
pkg_config_depends = ${leptonica:location}/lib/pkgconfig:${fontconfig:location}/lib/pkgconfig:${fontconfig:pkg_config_depends}:${lcms2:location}/lib/pkgconfig:${xz-utils:location}/lib/pkgconfig
pre-configure =
autoreconf -ivf -I${pkgconfig:location}/share/aclocal -I${libtool:location}/share/aclocal -Wno-portability

configure-options =
--disable-static
--datarootdir=${tesseract-traineddata:location}
# XXX: tesseract seems not easily configurable at runtime about where to find
# its trained data, so we set its datarootdir above to a controlled location

environment =
PATH=${pkgconfig:location}/bin:${autoconf:location}/bin:${automake:location}/bin:${libtool:location}/bin:${m4:location}/bin:${patch:location}/bin:%(PATH)s
PKG_CONFIG_PATH=${:pkg_config_depends}
LDFLAGS=-L${leptonica:location}/lib -Wl,-rpath=${leptonica:location}/lib -L${jbigkit:location}/lib -Wl,-rpath=${jbigkit:location}/lib -L${zlib:location}/lib -Wl,-rpath=${zlib:location}/lib

[tesseract-traineddata]
location = ${buildout:parts-directory}/${:_buildout_section_name_}
post-make-hook = ${tesseract-download-traineddata:post-make-hook}
tessdata-urls = ${tesseract-download-traineddata:urls}
tessdata-location = @@LOCATION@@/share/tessdata/

[tesseract-eng-traineddata]
recipe = slapos.recipe.build:download
destination = ${tesseract-traineddata:location}/tessdata/eng.traineddata
url = https://github.com/tesseract-ocr/tessdata/raw/590567f20dc044f6948a8e2c61afc714c360ad0e/eng.traineddata
md5sum = 57e0df3d84fed9fbf8c7a8e589f8f012

[tesseract-osd-traineddata]
recipe = slapos.recipe.build:download
destination = ${tesseract-traineddata:location}/tessdata/osd.traineddata
url = https://github.com/tesseract-ocr/tessdata/raw/590567f20dc044f6948a8e2c61afc714c360ad0e/osd.traineddata
md5sum = 7611737524efd1ce2dde67eff629bbcf
[tesseract-download-traineddata]
post-make-hook = ${:_profile_base_location_}/${download-tessdata.py:filename}#${download-tessdata.py:md5sum}:post_make_hook
urls =
https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/eng.traineddata#57e0df3d84fed9fbf8c7a8e589f8f012
https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/osd.traineddata#7611737524efd1ce2dde67eff629bbcf
https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/fra.traineddata#a73e70c872f262895d93976febeb1638
https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/jpn.traineddata#af3a30a9bec904e106aa8521e7caaeca
https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/chi_sim.traineddata#6965cb3213edd961cb16264e2ea45f5c
3 changes: 3 additions & 0 deletions component/tesseract/buildout.hash.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[download-tessdata.py]
filename = download-tessdata.py
md5sum = 2d283a6d8662d6bb8c9de7b26162b702
27 changes: 27 additions & 0 deletions component/tesseract/download-tessdata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# This is a post-make hook script to download tesseract training data.
#
# This script uses the following buildout options:
# - tessdata-urls: list of URLs and their expected md5sum as URL fragments
# - tessdata-location: path where to install the data.

import zc.buildout
import os


def post_make_hook(options, buildout, env):
    """Download the tesseract trained data files listed in the part options.

    Post-make hook entry point. The following options are read:

    - ``tessdata-location``: directory the files are installed into; it is
      created when missing.
    - ``tessdata-urls``: one URL per line, each optionally followed by
      ``#<md5sum>`` giving the expected checksum of the file.

    ``buildout['buildout']`` is handed to the downloader as its
    configuration. ``env`` is accepted for the hook signature but not used.
    Each file is stored under ``tessdata-location`` using the last path
    component of its URL as file name.
    """
    # The file only does `import zc.buildout`; load the submodule explicitly
    # so this hook does not depend on another module having imported it.
    import zc.buildout.download

    tessdata_location = options['tessdata-location']
    # Create-then-check instead of check-then-create: avoids a race with a
    # concurrent creator and works on both Python 2 and Python 3.
    try:
        os.makedirs(tessdata_location)
    except OSError:
        if not os.path.isdir(tessdata_location):
            raise

    download = zc.buildout.download.Download(
        buildout['buildout'],
        hash_name=True,
    )
    for line in options['tessdata-urls'].splitlines():
        # Tolerate indentation / trailing blanks around each entry.
        url, _, md5sum = line.strip().partition('#')
        if not url:
            continue
        download(
            url,
            # A URL without a fragment yields '' here; pass None instead so
            # the downloader skips verification rather than comparing the
            # file's digest against an empty string.
            md5sum=md5sum or None,
            path=os.path.join(tessdata_location, os.path.basename(url)),
        )
4 changes: 2 additions & 2 deletions stack/erp5/buildout.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ extends =
buildout.hash.cfg
../../component/fonts/buildout.cfg
../../component/git/buildout.cfg
../../component/ghostscript/buildout.cfg
../../component/graphviz/buildout.cfg
../../component/gzip/buildout.cfg
../../component/xz-utils/buildout.cfg
Expand Down Expand Up @@ -65,8 +66,6 @@ parts +=
slapos-cookbook
mroonga-mariadb
tesseract
tesseract-eng-traineddata
tesseract-osd-traineddata
zabbix-agent

# Buildoutish
Expand Down Expand Up @@ -252,6 +251,7 @@ link-binary =
${graphviz:location}/bin/dot
${grep:location}/bin/grep
${imagemagick:location}/bin/convert
${ghostscript:location}/bin/gs
${imagemagick:location}/bin/identify
${jpegoptim:location}/bin/jpegoptim
${jsl:location}/bin/jsl
Expand Down

0 comments on commit ec129b7

Please sign in to comment.