From 3f1e459d6747d1853b1995bb6df9c21a59df022d Mon Sep 17 00:00:00 2001 From: breezedeus Date: Tue, 18 Jun 2024 10:44:14 +0800 Subject: [PATCH] update docs --- README.md | 24 ++++++++++----------- docs/RELEASE.md | 8 +++++++ docs/buymeacoffee.md | 8 +++---- docs/command.md | 4 ++-- docs/contact.md | 8 +++---- docs/demo.md | 2 +- docs/examples.md | 30 +++++++++++++-------------- docs/examples_en.md | 30 +++++++++++++-------------- docs/index.md | 12 +++++------ docs/index_en.md | 8 ++++--- docs/models.md | 38 +++++++++++++++++++--------------- docs/requirements.txt | 2 +- docs/usage.md | 12 +++++------ pix2text/text_formula_ocr.py | 6 ------ tests/test_latex_ocr.py | 2 +- tests/test_pix2text.py | 27 +++++++++++------------- tests/test_text_formula_ocr.py | 18 +++++++--------- 17 files changed, 121 insertions(+), 118 deletions(-) diff --git a/README.md b/README.md index 9c26ebc..55236c4 100644 --- a/README.md +++ b/README.md @@ -34,8 +34,8 @@ Major changes: -* Added layout analysis and table recognition models, supporting the conversion of images with complex layouts into Markdown format. See examples: [Pix2Text Online Documentation / Examples](https://pix2text.readthedocs.io/zh/latest/examples_en/). -* Added support for converting entire PDF files to Markdown format. See examples: [Pix2Text Online Documentation / Examples](https://pix2text.readthedocs.io/zh/latest/examples_en/). +* Added layout analysis and table recognition models, supporting the conversion of images with complex layouts into Markdown format. See examples: [Pix2Text Online Documentation / Examples](https://pix2text.readthedocs.io/zh/stable/examples_en/). +* Added support for converting entire PDF files to Markdown format. See examples: [Pix2Text Online Documentation / Examples](https://pix2text.readthedocs.io/zh/stable/examples_en/). * Enhanced the interface with more features, including adjustments to existing interface parameters. 
* Launched the [Pix2Text Online Documentation](https://pix2text.readthedocs.io). @@ -56,7 +56,7 @@ See more at: [RELEASE.md](docs/RELEASE.md) . - **Layout Analysis Model**: [breezedeus/pix2text-layout](https://huggingface.co/breezedeus/pix2text-layout) ([Mirror](https://hf-mirror.com/breezedeus/pix2text-layout)). - **Table Recognition Model**: [breezedeus/pix2text-table-rec](https://huggingface.co/breezedeus/pix2text-table-rec) ([Mirror](https://hf-mirror.com/breezedeus/pix2text-table-rec)). - **Text Recognition Engine**: Supports **80+ languages** such as **English, Simplified Chinese, Traditional Chinese, Vietnamese**, etc. For English and Simplified Chinese recognition, it uses the open-source OCR tool [CnOCR](https://github.com/breezedeus/cnocr), while for other languages, it uses the open-source OCR tool [EasyOCR](https://github.com/JaidedAI/EasyOCR). -- **Mathematical Formula Detection Model (MFD)**: Mathematical formula detection model (MFD) from [CnSTD](https://github.com/breezedeus/cnstd). +- **Mathematical Formula Detection Model (MFD)**: [breezedeus/pix2text-mfd](https://huggingface.co/breezedeus/pix2text-mfd) ([Mirror](https://hf-mirror.com/breezedeus/pix2text-mfd)). Implemented based on [CnSTD](https://github.com/breezedeus/cnstd). - **Mathematical Formula Recognition Model (MFR)**: [breezedeus/pix2text-mfr](https://huggingface.co/breezedeus/pix2text-mfr) ([Mirror](https://hf-mirror.com/breezedeus/pix2text-mfr)). Several models are contributed by other open-source authors, and their contributions are highly appreciated. @@ -65,7 +65,7 @@ Several models are contributed by other open-source authors, and their contribut Pix2Text Arch Flow -For detailed explanations, please refer to the [Pix2Text Online Documentation/Models](https://pix2text.readthedocs.io/zh/latest/models/). +For detailed explanations, please refer to the [Pix2Text Online Documentation/Models](https://pix2text.readthedocs.io/zh/stable/models/).
@@ -74,12 +74,12 @@ As a Python3 toolkit, P2T may not be very user-friendly for those who are not fa If you're interested, feel free to add the assistant as a friend by scanning the QR code and mentioning `p2t`. The assistant will regularly invite everyone to join the group where the latest developments related to P2T tools will be announced:
- Wechat-QRCode + Wechat-QRCode
The author also maintains a **Knowledge Planet** [**P2T/CnOCR/CnSTD Private Group**](https://t.zsxq.com/FEYZRJQ), where questions are answered promptly. You're welcome to join. The **knowledge planet private group** will also gradually release some private materials related to P2T/CnOCR/CnSTD, including **some unreleased models**, **discounts on purchasing premium models**, **code snippets for different application scenarios**, and answers to difficult problems encountered during use. The planet will also publish the latest research materials related to P2T/OCR/STD. -For more contact method, please refer to [Contact](https://pix2text.readthedocs.io/zh/latest/contact/). +For more contact methods, please refer to [Contact](https://pix2text.readthedocs.io/zh/stable/contact/). ## List of Supported Languages @@ -196,15 +196,15 @@ You can also try the **[Online Demo](https://huggingface.co/spaces/breezedeus/Pi ## Examples -See: [Pix2Text Online Documentation/Examples](https://pix2text.readthedocs.io/zh/latest/examples_en/). +See: [Pix2Text Online Documentation/Examples](https://pix2text.readthedocs.io/zh/stable/examples_en/). ## Usage -See: [Pix2Text Online Documentation/Usage](https://pix2text.readthedocs.io/zh/latest/usage/). +See: [Pix2Text Online Documentation/Usage](https://pix2text.readthedocs.io/zh/stable/usage/). ## Models -See: [Pix2Text Online Documentation/Models](https://pix2text.readthedocs.io/zh/latest/models/). +See: [Pix2Text Online Documentation/Models](https://pix2text.readthedocs.io/zh/stable/models/). ## Install @@ -226,15 +226,15 @@ If the installation is slow, you can specify an installation source, such as usi pip install pix2text -i https://mirrors.aliyun.com/pypi/simple ``` -For more information, please refer to: [Pix2Text Online Documentation/Install](https://pix2text.readthedocs.io/zh/latest/install/). +For more information, please refer to: [Pix2Text Online Documentation/Install](https://pix2text.readthedocs.io/zh/stable/install/). 
## Command Line Tool -See: [Pix2Text Online Documentation/Command Tool](https://pix2text.readthedocs.io/zh/latest/command/). +See: [Pix2Text Online Documentation/Command Tool](https://pix2text.readthedocs.io/zh/stable/command/). ## HTTP Service -See: [Pix2Text Online Documentation/Command Tool/Start Service](https://pix2text.readthedocs.io/zh/latest/command/). +See: [Pix2Text Online Documentation/Command Tool/Start Service](https://pix2text.readthedocs.io/zh/stable/command/). ## MacOS Desktop Application diff --git a/docs/RELEASE.md b/docs/RELEASE.md index a199654..dd1ac95 100644 --- a/docs/RELEASE.md +++ b/docs/RELEASE.md @@ -1,5 +1,13 @@ # Release Notes +## Update 2024.06.17:**V1.1.1** Released + +主要变更: + +* 数学公式检测模型(MFD)更新,检测精度获得较大提升。 +* 修复 bugs。 + + ## Update 2024.06.17:**V1.1.0.7** Released Major changes: diff --git a/docs/buymeacoffee.md b/docs/buymeacoffee.md index dfb7ced..308d452 100644 --- a/docs/buymeacoffee.md +++ b/docs/buymeacoffee.md @@ -9,11 +9,11 @@ By supporting my projects through a donation, you can be a part of this journey ## 1. 知识星球 -欢迎加入**知识星球** **[CnOCR/CnSTD私享群](https://t.zsxq.com/FEYZRJQ)**。**知识星球私享群**会陆续发布一些 CnOCR/CnSTD/P2T 相关的私有资料。 -关于星球的更详细说明请参考:[知识星球 | Breezedeus.com](https://www.breezedeus.com/article/zsxq)。 +欢迎加入**知识星球** **[P2T/CnOCR/CnSTD私享群](https://t.zsxq.com/FEYZRJQ)**。**知识星球私享群**会陆续发布一些 CnOCR/CnSTD/P2T 相关的私有资料。 +关于星球会员享受福利的更详细说明请参考:[知识星球 | Breezedeus.com](https://www.breezedeus.com/article/zsxq)。
-![知识星球二维码](https://cnocr.readthedocs.io/zh/latest/cnocr-zsxq.jpeg){: style="width:280px"} +![知识星球二维码](https://cnocr.readthedocs.io/zh/stable/cnocr-zsxq.jpeg){: style="width:280px"}
@@ -23,7 +23,7 @@ By supporting my projects through a donation, you can be a part of this journey Give the author a reward through Alipay.
-![支付宝收款码](https://cnocr.readthedocs.io/zh/latest/cnocr-zfb.jpg){: style="width:280px"} +![支付宝收款码](https://cnocr.readthedocs.io/zh/stable/cnocr-zfb.jpg){: style="width:280px"}
diff --git a/docs/command.md b/docs/command.md index de027de..b33197c 100644 --- a/docs/command.md +++ b/docs/command.md @@ -53,7 +53,7 @@ p2t predict -l en,ch_sim --resized-shape 768 --file-type pdf -i docs/examples/te 预测时也支持使用自定义的参数或模型。例如,使用自定义的模型进行预测: ```bash -p2t predict -l en,ch_sim --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --rec-kwargs '{"page_numbers": [0, 1]}' --resized-shape 768 --file-type pdf -i docs/examples/test-doc.pdf -o output-md --save-debug-res output-debug +p2t predict -l en,ch_sim --mfd-config '{"model_name": "mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --rec-kwargs '{"page_numbers": [0, 1]}' --resized-shape 768 --file-type pdf -i docs/examples/test-doc.pdf -o output-md --save-debug-res output-debug ``` @@ -99,7 +99,7 @@ p2t serve -l en,ch_sim -H 0.0.0.0 -p 8503 服务开启时也支持使用自定义的参数或模型。例如,使用自定义的模型进行预测: ```bash -p2t serve -l en,ch_sim --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' -H 0.0.0.0 -p 8503 +p2t serve -l en,ch_sim --mfd-config '{"model_name": "mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' -H 0.0.0.0 -p 8503 ``` ### 服务调用 diff --git a/docs/contact.md b/docs/contact.md index 1f28a1f..2000a8a 100644 --- a/docs/contact.md +++ b/docs/contact.md @@ -5,7 +5,7 @@ ## 一、知识星球 [**P2T/CnOCR/CnSTD私享群**](https://t.zsxq.com/FEYZRJQ) 作者维护 **知识星球** 
[**P2T/CnOCR/CnSTD私享群**](https://t.zsxq.com/FEYZRJQ) ,欢迎加入。**知识星球私享群**会陆续发布一些 P2T/CnOCR/CnSTD 相关的私有资料。 -关于星球的更详细说明请参考:[知识星球 | Breezedeus.com](https://www.breezedeus.com/article/zsxq)。 +关于星球会员享受福利的更详细说明请参考:[知识星球 | Breezedeus.com](https://www.breezedeus.com/article/zsxq)。
![知识星球二维码](https://cnocr.readthedocs.io/zh/latest/cnocr-zsxq.jpeg){: style="width:280px"} @@ -20,14 +20,14 @@ ![微信交流群](https://huggingface.co/datasets/breezedeus/cnocr-wx-qr-code/resolve/main/wx-qr-code.JPG){: style="width:270px"}
-正常情况小助手会定期邀请入群,但无法保证时间。如果期望尽快得到答复,可以加入上面的知识星球 [**CnOCR/CnSTD私享群**](https://t.zsxq.com/FEYZRJQ) 。 +正常情况小助手会定期邀请入群,但无法保证时间。如果期望尽快得到答复,可以加入上面的知识星球 [**P2T/CnOCR/CnSTD私享群**](https://t.zsxq.com/FEYZRJQ) 。 ## 三、Discord -欢迎加入 [**我的Discord 服务器**](https://discord.gg/GgD87WM8Tf) 。 +欢迎加入 [**Pix2Text Discord 服务器**](https://discord.gg/GgD87WM8Tf) 。 -Welcome to join [**my Discord Server**](https://discord.gg/GgD87WM8Tf) . +Welcome to join [**Pix2Text Discord Server**](https://discord.gg/GgD87WM8Tf) . ## 四、邮件 / Email diff --git a/docs/demo.md b/docs/demo.md index f55b70e..95b43e7 100644 --- a/docs/demo.md +++ b/docs/demo.md @@ -11,7 +11,7 @@ ## 在线 Demo 🤗 -也可以使用 **[在线 Demo](https://huggingface.co/spaces/breezedeus/Pix2Text-Demo)**(无法科学上网可以使用 [国内 Demo](https://hf-mirror.com/spaces/breezedeus/Pix2Text-Demo)) 尝试 **P2T** 在不同语言上的效果。但在线 Demo 使用的硬件配置较低,速度会较慢。如果是简体中文或者英文图片,建议使用 **[P2T网页版](https://p2t.breezedeus.com)**。 +也可以使用 **[在线 Demo](https://huggingface.co/spaces/breezedeus/Pix2Text-Demo)**(无法科学上网可以使用 [国内镜像](https://hf.qhduan.com/spaces/breezedeus/Pix2Text-Demo)) 尝试 **P2T** 在不同语言上的效果。但在线 Demo 使用的硬件配置较低,速度会较慢。如果是简体中文或者英文图片,建议使用 **[P2T网页版](https://p2t.breezedeus.com)**。
![在线 Demo](https://pic3.zhimg.com/80/v2-ebe8d3d955a580a297aabcd27439604e_720w.webp) diff --git a/docs/examples.md b/docs/examples.md index 21a5d32..c9104ed 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -23,7 +23,7 @@ doc.to_markdown('output-md') # 导出的 Markdown 信息保存在 output-md 目 也可以使用命令行完成一样的功能,如下面命令使用了付费版模型(MFD + MFR + CnOCR 三个付费模型)进行识别: ```bash -p2t predict -l en,ch_sim --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --rec-kwargs '{"page_numbers": [0, 1]}' --resized-shape 768 --file-type pdf -i docs/examples/test-doc.pdf -o output-md --save-debug-res output-debug +p2t predict -l en,ch_sim --mfd-config '{"model_name": "mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --rec-kwargs '{"page_numbers": [0, 1]}' --resized-shape 768 --file-type pdf -i docs/examples/test-doc.pdf -o output-md --save-debug-res output-debug ``` 识别结果见 [output-md/output.md](output-md/output.md)。 @@ -36,7 +36,7 @@ p2t predict -l en,ch_sim --mfd-config '{"model_type": "yolov7", "model_fp": "/Us 可以使用函数 `.recognize_page()` 识别图片中的文字和数学公式。如针对以下图片 ([examples/page2.png](examples/page2.png)):
- Page-image + Page-image
调用方式如下: @@ -53,7 +53,7 @@ page.to_markdown('output-page') # 导出的 Markdown 信息保存在 output-pag 也可以使用命令行完成一样的功能,如下面命令使用了付费版模型(MFD + MFR + CnOCR 三个付费模型)进行识别: ```bash -p2t predict -l en,ch_sim --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --file-type page -i docs/examples/page2.png -o output-page --save-debug-res output-debug-page +p2t predict -l en,ch_sim --mfd-config '{"model_name": "mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --file-type page -i docs/examples/page2.png -o output-page --save-debug-res output-debug-page ``` 识别结果和 [output-md/output.md](output-md/output.md) 类似。 @@ -64,7 +64,7 @@ p2t predict -l en,ch_sim --mfd-config '{"model_type": "yolov7", "model_fp": "/Us 可以使用函数 `.recognize_text_formula()` 识别图片中的文字和数学公式。如针对以下图片 ([examples/en1.jpg](examples/en1.jpg)):
- English-mixed-image + English-mixed-image
调用方式如下: @@ -83,13 +83,13 @@ print(outs) 也可以使用命令行完成一样的功能,如下面命令使用了付费版模型(MFD + MFR + CnOCR 三个付费模型)进行识别: ```bash -p2t predict -l en,ch_sim --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --file-type text_formula -i docs/examples/en1.jpg +p2t predict -l en,ch_sim --mfd-config '{"model_name": "mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --file-type text_formula -i docs/examples/en1.jpg --save-debug-res out-debug-en1.jpg ``` 或者使用免费开源模型进行识别: ```bash -p2t predict -l en,ch_sim --resized-shape 768 --file-type text_formula -i docs/examples/en1.jpg +p2t predict -l en,ch_sim --resized-shape 768 --file-type text_formula -i docs/examples/en1.jpg --save-debug-res out-debug-en1.jpg ``` ## 识别纯公式图片 @@ -97,7 +97,7 @@ p2t predict -l en,ch_sim --resized-shape 768 --file-type text_formula -i docs/ex 对于只包含数学公式的图片,使用函数 `.recognize_formula()` 可以把数学公式识别为 LaTeX 表达式。如针对以下图片 ([examples/math-formula-42.png](examples/math-formula-42.png)):
- Pure-Math-Formula-image + Pure-Math-Formula-image
@@ -123,7 +123,7 @@ p2t predict -l en,ch_sim --formula-ocr-config '{"model_name":"mfr-pro","model_ba 或者使用免费开源模型进行识别: ```bash -p2t predict -l en,ch_sim --file-type textformula -i docs/examples/math-formula-42.png +p2t predict -l en,ch_sim --file-type formula -i docs/examples/math-formula-42.png ``` ## 识别纯文字图片 @@ -131,7 +131,7 @@ p2t predict -l en,ch_sim --file-type textformula -i docs/examples/math-formula-4 对于只包含文字不包含数学公式的图片,使用函数 `.recognize_text()` 可以识别出图片中的文字。此时 Pix2Text 相当于一般的文字 OCR 引擎。如针对以下图片 ([examples/general.jpg](examples/general.jpg)):
- Pure-Math-Formula-image + Pure-Math-Formula-image
@@ -151,13 +151,13 @@ print(outs) 也可以使用命令行完成一样的功能,如下面命令使用了付费版模型(CnOCR 一个付费模型)进行识别: ```bash -p2t predict -l en,ch_sim --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --file-type text -i docs/examples/general.jpg +p2t predict -l en,ch_sim --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --file-type text --no-return-text -i docs/examples/general.jpg --save-debug-res out-debug-general.jpg ``` 或者使用免费开源模型进行识别: ```bash -p2t predict -l en,ch_sim --file-type text -i docs/examples/general.jpg +p2t predict -l en,ch_sim --file-type text --no-return-text -i docs/examples/general.jpg --save-debug-res out-debug-general.jpg ``` @@ -172,7 +172,7 @@ p2t predict -l en,ch_sim --file-type text -i docs/examples/general.jpg **识别命令**: ```bash -p2t predict -l en --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --file-type text_formula -i docs/examples/en1.jpg +p2t predict -l en --mfd-config '{"model_name": "mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --file-type text_formula -i docs/examples/en1.jpg ``` ### 简体中文 @@ -184,7 +184,7 @@ p2t predict -l en --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/kin **识别命令**: ```bash -p2t predict -l en,ch_sim --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --auto-line-break --file-type text_formula -i docs/examples/mixed.jpg +p2t predict -l en,ch_sim --mfd-config '{"model_name": 
"mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --auto-line-break --file-type text_formula -i docs/examples/mixed.jpg --save-debug-res out-debug-mixed.jpg ``` ### 繁体中文 @@ -196,7 +196,7 @@ p2t predict -l en,ch_sim --mfd-config '{"model_type": "yolov7", "model_fp": "/Us **识别命令**: ```bash -p2t predict -l en,ch_tra --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --resized-shape 768 --auto-line-break --file-type text_formula -i docs/examples/ch_tra.jpg +p2t predict -l en,ch_tra --mfd-config '{"model_name": "mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --resized-shape 768 --auto-line-break --file-type text_formula -i docs/examples/ch_tra.jpg --save-debug-res out-debug-tra.jpg ``` > 注意 ⚠️ :请通过以下命令安装 pix2text 的多语言版本: @@ -213,7 +213,7 @@ p2t predict -l en,ch_tra --mfd-config '{"model_type": "yolov7", "model_fp": "/Us **识别命令**: ```bash -p2t predict -l en,vi --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --resized-shape 768 --no-auto-line-break --file-type text_formula -i docs/examples/vietnamese.jpg +p2t predict -l en,vi --mfd-config '{"model_name": "mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --resized-shape 608 --no-auto-line-break --file-type text_formula -i docs/examples/vietnamese.jpg --save-debug-res out-debug-vi.jpg ``` > 注意 ⚠️ :请通过以下命令安装 pix2text 的多语言版本: diff --git a/docs/examples_en.md b/docs/examples_en.md index 9ee94a0..ba67e18 100644 --- a/docs/examples_en.md +++ b/docs/examples_en.md @@ -23,7 +23,7 @@ 
doc.to_markdown('output-md') # The exported Markdown information is saved in th You can also achieve the same functionality using the command line. Below is a command that uses the premium models (MFD + MFR + CnOCR) for recognition: ```bash -p2t predict -l en,ch_sim --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --rec-kwargs '{"page_numbers": [0, 1]}' --resized-shape 768 --file-type pdf -i docs/examples/test-doc.pdf -o output-md --save-debug-res output-debug +p2t predict -l en,ch_sim --mfd-config '{"model_name": "mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --rec-kwargs '{"page_numbers": [0, 1]}' --resized-shape 768 --file-type pdf -i docs/examples/test-doc.pdf -o output-md --save-debug-res output-debug ``` You can find the recognition result in [output-md/output.md](output-md/output.md). @@ -37,7 +37,7 @@ You can find the recognition result in [output-md/output.md](output-md/output.md You can use the `.recognize_page()` function to recognize text and mathematical formulas in images. For example, for the following image ([examples/page2.png](examples/page2.png)):
- Page-image + Page-image
You can call the function like this: @@ -54,7 +54,7 @@ page.to_markdown('output-page') # The exported Markdown information is saved in You can also achieve the same functionality using the command line. Below is a command that uses the premium models (MFD + MFR + CnOCR) for recognition: ```bash -p2t predict -l en,ch_sim --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --file-type page -i docs/examples/page2.png -o output-page --save-debug-res output-debug-page +p2t predict -l en,ch_sim --mfd-config '{"model_name": "mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --file-type page -i docs/examples/page2.png -o output-page --save-debug-res output-debug-page ``` The recognition result is similar to [output-md/output.md](output-md/output.md). @@ -65,7 +65,7 @@ The recognition result is similar to [output-md/output.md](output-md/output.md). For paragraph images containing both formulas and texts, you don't need to use the layout analysis model. You can use the `.recognize_text_formula()` function to recognize both texts and mathematical formulas in the image. For example, for the following image ([examples/en1.jpg](examples/en1.jpg)):
- English-mixed-image + English-mixed-image
You can call the function like this: @@ -84,13 +84,13 @@ The returned result `outs` is a dictionary, where the key `position` represents You can also achieve the same functionality using the command line. Below is a command that uses the premium models (MFD + MFR + CnOCR) for recognition: ```bash -p2t predict -l en,ch_sim --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --file-type text_formula -i docs/examples/en1.jpg +p2t predict -l en,ch_sim --mfd-config '{"model_name": "mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --file-type text_formula -i docs/examples/en1.jpg --save-debug-res out-debug-en1.jpg ``` Or use the free open-source models for recognition: ```bash -p2t predict -l en,ch_sim --resized-shape 768 --file-type text_formula -i docs/examples/en1.jpg +p2t predict -l en,ch_sim --resized-shape 768 --file-type text_formula -i docs/examples/en1.jpg --save-debug-res out-debug-en1.jpg ``` ## Recognize Pure Formula Images @@ -98,7 +98,7 @@ p2t predict -l en,ch_sim --resized-shape 768 --file-type text_formula -i docs/ex For images containing only mathematical formulas, you can use the `.recognize_formula()` function to recognize the formulas as LaTeX expressions. For example, for the following image ([examples/math-formula-42.png](examples/math-formula-42.png)):
- Pure-Math-Formula-image + Pure-Math-Formula-image
You can call the function like this: @@ -123,7 +123,7 @@ p2t predict -l en,ch_sim --formula-ocr-config '{"model_name":"mfr-pro","model_ba Or use the free open-source model for recognition: ```bash -p2t predict -l en,ch_sim --file-type textformula -i docs/examples/math-formula-42.png +p2t predict -l en,ch_sim --file-type formula -i docs/examples/math-formula-42.png ``` ## Recognize Pure Text Images @@ -131,7 +131,7 @@ p2t predict -l en,ch_sim --file-type textformula -i docs/examples/math-formula-4 For images containing only text without mathematical formulas, you can use the `.recognize_text()` function to recognize the text in the image. In this case, Pix2Text acts as a general text OCR engine. For example, for the following image ([examples/general.jpg](examples/general.jpg)):
- Pure-Math-Formula-image + Pure-Math-Formula-image
You can call the function like this: @@ -150,13 +150,13 @@ The returned result is a string representing the corresponding text sequence. Fo You can also achieve the same functionality using the command line. Below is a command that uses the premium model (CnOCR) for recognition: ```bash -p2t predict -l en,ch_sim --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --file-type text -i docs/examples/general.jpg +p2t predict -l en,ch_sim --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --file-type text --no-return-text -i docs/examples/general.jpg --save-debug-res out-debug-general.jpg ``` Or use the free open-source model for recognition: ```bash -p2t predict -l en,ch_sim --file-type text -i docs/examples/general.jpg +p2t predict -l en,ch_sim --file-type text --no-return-text -i docs/examples/general.jpg --save-debug-res out-debug-general.jpg ``` ## For Different Languages @@ -170,7 +170,7 @@ p2t predict -l en,ch_sim --file-type text -i docs/examples/general.jpg **Recognition Command**: ```bash -p2t predict -l en --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --file-type text_formula -i docs/examples/en1.jpg +p2t predict -l en --mfd-config '{"model_name": "mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --file-type text_formula -i docs/examples/en1.jpg ``` ### Simplified Chinese @@ -182,7 +182,7 @@ p2t predict -l en --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/kin **Recognition Command**: ```bash -p2t predict -l en,ch_sim --mfd-config '{"model_type": "yolov7", "model_fp": 
"/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --auto-line-break --file-type text_formula -i docs/examples/mixed.jpg +p2t predict -l en,ch_sim --mfd-config '{"model_name": "mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --text-ocr-config '{"rec_model_name": "doc-densenet_lite_666-gru_large"}' --resized-shape 768 --auto-line-break --file-type text_formula -i docs/examples/mixed.jpg --save-debug-res out-debug-mixed.jpg ``` ### Traditional Chinese @@ -194,7 +194,7 @@ p2t predict -l en,ch_sim --mfd-config '{"model_type": "yolov7", "model_fp": "/Us **Recognition Command**: ```bash -p2t predict -l en,ch_tra --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --resized-shape 768 --auto-line-break --file-type text_formula -i docs/examples/ch_tra.jpg +p2t predict -l en,ch_tra --mfd-config '{"model_name": "mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --resized-shape 768 --auto-line-break --file-type text_formula -i docs/examples/ch_tra.jpg --save-debug-res out-debug-tra.jpg ``` > Note ⚠️: Please install the multilingual version of pix2text using the following command: @@ -211,7 +211,7 @@ p2t predict -l en,ch_tra --mfd-config '{"model_type": "yolov7", "model_fp": "/Us **Recognition Command**: ```bash -p2t predict -l en,vi --mfd-config '{"model_type": "yolov7", "model_fp": "/Users/king/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --resized-shape 768 --no-auto-line-break --file-type text_formula -i docs/examples/vietnamese.jpg +p2t predict -l en,vi --mfd-config 
'{"model_name": "mfd-pro", "model_backend": "onnx"}' --formula-ocr-config '{"model_name":"mfr-pro","model_backend":"onnx"}' --resized-shape 608 --no-auto-line-break --file-type text_formula -i docs/examples/vietnamese.jpg --save-debug-res out-debug-vi.jpg ``` > Note ⚠️: Please install the multilingual version of pix2text using the following command: diff --git a/docs/index.md b/docs/index.md index bf1b224..3428fd3 100644 --- a/docs/index.md +++ b/docs/index.md @@ -29,11 +29,11 @@ **Pix2Text (P2T)** 整合了以下模型: -- **版面分析模型**:[breezedeus/pix2text-layout](https://huggingface.co/breezedeus/pix2text-layout) ([国内地址](https://hf-mirror.com/breezedeus/pix2text-layout))。 -- **表格识别模型**:[breezedeus/pix2text-table-rec](https://huggingface.co/breezedeus/pix2text-table-rec) ([国内地址](https://hf-mirror.com/breezedeus/pix2text-table-rec))。 +- **版面分析模型**:[breezedeus/pix2text-layout](https://huggingface.co/breezedeus/pix2text-layout) ([国内镜像](https://hf-mirror.com/breezedeus/pix2text-layout))。 +- **表格识别模型**:[breezedeus/pix2text-table-rec](https://huggingface.co/breezedeus/pix2text-table-rec) ([国内镜像](https://hf-mirror.com/breezedeus/pix2text-table-rec))。 - **文字识别引擎**:支持 **`80+` 种语言**,如**英文、简体中文、繁体中文、越南语**等。其中,**英文**和**简体中文**识别使用的是开源 OCR 工具 [CnOCR](https://github.com/breezedeus/cnocr) ,其他语言的识别使用的是开源 OCR 工具 [EasyOCR](https://github.com/JaidedAI/EasyOCR) 。 -- **数学公式检测模型(MFD)**:来自 [CnSTD](https://github.com/breezedeus/cnstd) 的数学公式检测模型(MFD)。 -- **数学公式识别模型(MFR)**:[breezedeus/pix2text-mfr](https://huggingface.co/breezedeus/pix2text-mfr) ([国内地址](https://hf-mirror.com/breezedeus/pix2text-mfr))。 +- **数学公式检测模型(MFD)**:[breezedeus/pix2text-mfd](https://huggingface.co/breezedeus/pix2text-mfd) ([国内镜像](https://hf-mirror.com/breezedeus/pix2text-mfd))。基于 [CnSTD](https://github.com/breezedeus/cnstd) 实现。 +- **数学公式识别模型(MFR)**:[breezedeus/pix2text-mfr](https://huggingface.co/breezedeus/pix2text-mfr) ([国内镜像](https://hf-mirror.com/breezedeus/pix2text-mfr))。 其中多个模型来自其他开源作者, 非常感谢他们的贡献。 @@ -167,7 +167,7 @@ Pix2Text 
的文字识别引擎支持 **`80+` 种语言**,如**英文、简体 ## 在线 Demo 🤗 -也可以使用 **[在线 Demo](https://huggingface.co/spaces/breezedeus/Pix2Text-Demo)**(无法科学上网可以使用 [在线 Demo](https://hf-mirror.com/spaces/breezedeus/Pix2Text-Demo)) 尝试 **P2T** 在不同语言上的效果。但在线 Demo 使用的硬件配置较低,速度会较慢。如果是简体中文或者英文图片,建议使用 **[P2T网页版](https://p2t.breezedeus.com)**。 +也可以使用 **[在线 Demo](https://huggingface.co/spaces/breezedeus/Pix2Text-Demo)**(无法科学上网可以使用 [在线 Demo](https://hf.qhduan.com/spaces/breezedeus/Pix2Text-Demo)) 尝试 **P2T** 在不同语言上的效果。但在线 Demo 使用的硬件配置较低,速度会较慢。如果是简体中文或者英文图片,建议使用 **[P2T网页版](https://p2t.breezedeus.com)**。 ## 安装 @@ -192,7 +192,7 @@ pip install pix2text -i https://mirrors.aliyun.com/pypi/simple 如果是初次使用**OpenCV**,那估计安装都不会很顺利,bless。 -**Pix2Text** 主要依赖 [**CnSTD>=1.2.1**](https://github.com/breezedeus/cnstd)、[**CnOCR>=2.2.2.1**](https://github.com/breezedeus/cnocr) ,以及 [**transformers>=4.37.0**](https://github.com/huggingface/transformers) 。如果安装过程遇到问题,也可参考它们的安装说明文档。 +**Pix2Text** 主要依赖 [**CnSTD>=1.2.4**](https://github.com/breezedeus/cnstd)、[**CnOCR>=2.3**](https://github.com/breezedeus/cnocr) ,以及 [**transformers>=4.37.0**](https://github.com/huggingface/transformers) 。如果安装过程遇到问题,也可参考它们的安装说明文档。 > **Warning** > diff --git a/docs/index_en.md b/docs/index_en.md index 7659176..7f0577c 100644 --- a/docs/index_en.md +++ b/docs/index_en.md @@ -31,7 +31,7 @@ - **Layout Analysis Model**: [breezedeus/pix2text-layout](https://huggingface.co/breezedeus/pix2text-layout) ([Mirror](https://hf-mirror.com/breezedeus/pix2text-layout)). - **Table Recognition Model**: [breezedeus/pix2text-table-rec](https://huggingface.co/breezedeus/pix2text-table-rec) ([Mirror](https://hf-mirror.com/breezedeus/pix2text-table-rec)). - **Text Recognition Engine**: Supports **80+ languages** such as **English, Simplified Chinese, Traditional Chinese, Vietnamese**, etc. 
For English and Simplified Chinese recognition, it uses the open-source OCR tool [CnOCR](https://github.com/breezedeus/cnocr), while for other languages, it uses the open-source OCR tool [EasyOCR](https://github.com/JaidedAI/EasyOCR). -- **Mathematical Formula Detection Model (MFD)**: Mathematical formula detection model (MFD) from [CnSTD](https://github.com/breezedeus/cnstd). +- **Mathematical Formula Detection Model (MFD)**: [breezedeus/pix2text-mfd](https://huggingface.co/breezedeus/pix2text-mfd) ([Mirror](https://hf-mirror.com/breezedeus/pix2text-mfd)). Implemented based on [CnSTD](https://github.com/breezedeus/cnstd). - **Mathematical Formula Recognition Model (MFR)**: [breezedeus/pix2text-mfr](https://huggingface.co/breezedeus/pix2text-mfr) ([Mirror](https://hf-mirror.com/breezedeus/pix2text-mfr)). Several models are contributed by other open-source authors, and their contributions are highly appreciated. @@ -44,10 +44,12 @@ For detailed explanations, please refer to the [Models](models.md). As a Python3 toolkit, P2T may not be very user-friendly for those who are not familiar with Python. Therefore, we also provide a **[free-to-use P2T Online Web](https://p2t.breezedeus.com)**, where you can directly upload images and get P2T parsing results. The web version uses the latest models, resulting in better performance compared to the open-source models. -If you're interested, feel free to add the assistant as a friend by scanning the QR code and mentioning `p2t`. The assistant will regularly invite everyone to join the group where the latest developments related to P2T tools will be announced: +You are welcome to join the [**Pix2Text Discord Server**](https://discord.gg/GgD87WM8Tf) if you have any questions or suggestions. + +If you're interested, feel free to add the WeChat assistant as a friend by scanning the QR code and mentioning `p2t`. The assistant will regularly invite everyone to join the group where the latest developments related to P2T tools will be announced:
- Wechat-QRCode + Wechat-QRCode
The author also maintains a **Knowledge Planet** [**P2T/CnOCR/CnSTD Private Group**](https://t.zsxq.com/FEYZRJQ), where questions are answered promptly. You're welcome to join. The **knowledge planet private group** will also gradually release some private materials related to P2T/CnOCR/CnSTD, including **some unreleased models**, **discounts on purchasing premium models**, **code snippets for different application scenarios**, and answers to difficult problems encountered during use. The planet will also publish the latest research materials related to P2T/OCR/STD. diff --git a/docs/models.md b/docs/models.md index bdc2042..08ddbe3 100644 --- a/docs/models.md +++ b/docs/models.md @@ -2,11 +2,11 @@ **Pix2Text (P2T)** 整合了很多不同功能的模型,主要包括: -- **版面分析模型**:[breezedeus/pix2text-layout](https://huggingface.co/breezedeus/pix2text-layout) ([国内地址](https://hf-mirror.com/breezedeus/pix2text-layout))。 -- **表格识别模型**:[breezedeus/pix2text-table-rec](https://huggingface.co/breezedeus/pix2text-table-rec) ([国内地址](https://hf-mirror.com/breezedeus/pix2text-table-rec))。 +- **版面分析模型**:[breezedeus/pix2text-layout](https://huggingface.co/breezedeus/pix2text-layout) ([国内镜像](https://hf-mirror.com/breezedeus/pix2text-layout))。 +- **表格识别模型**:[breezedeus/pix2text-table-rec](https://huggingface.co/breezedeus/pix2text-table-rec) ([国内镜像](https://hf-mirror.com/breezedeus/pix2text-table-rec))。 - **文字识别引擎**:支持 **`80+` 种语言**,如**英文、简体中文、繁体中文、越南语**等。其中,**英文**和**简体中文**识别使用的是开源 OCR 工具 [CnOCR](https://github.com/breezedeus/cnocr) ,其他语言的识别使用的是开源 OCR 工具 [EasyOCR](https://github.com/JaidedAI/EasyOCR) 。 -- **数学公式检测模型(MFD)**:来自 [CnSTD](https://github.com/breezedeus/cnstd) 的数学公式检测模型(MFD)。 -- **数学公式识别模型(MFR)**:[breezedeus/pix2text-mfr](https://huggingface.co/breezedeus/pix2text-mfr) ([国内地址](https://hf-mirror.com/breezedeus/pix2text-mfr))。 +- **数学公式检测模型(MFD)**:[breezedeus/pix2text-mfd](https://huggingface.co/breezedeus/pix2text-mfd) ([国内镜像](https://hf-mirror.com/breezedeus/pix2text-mfd))。基于 
[CnSTD](https://github.com/breezedeus/cnstd) 实现。 +- **数学公式识别模型(MFR)**:[breezedeus/pix2text-mfr](https://huggingface.co/breezedeus/pix2text-mfr) ([国内镜像](https://hf-mirror.com/breezedeus/pix2text-mfr))。 其中多个模型来自其他开源作者, 非常感谢他们的贡献。 @@ -22,24 +22,28 @@ 下面的说明主要针对免费的基础模型。 ## 版面分析模型 -**版面分析模型** 下载地址:[breezedeus/pix2text-layout](https://huggingface.co/breezedeus/pix2text-layout) (不能科学上网请使用 [国内地址](https://hf-mirror.com/breezedeus/pix2text-layout))。 +**版面分析模型** 下载地址:[breezedeus/pix2text-layout](https://huggingface.co/breezedeus/pix2text-layout) (不能科学上网请使用 [国内镜像](https://hf-mirror.com/breezedeus/pix2text-layout))。 把这里面的所有文件都下载到 `~/.pix2text/1.1/layout-parser` (Windows 系统放在 `C:\Users\\AppData\Roaming\pix2text\1.1\layout-parser`)目录下即可,目录不存在的话请自己创建。 > 注:上面路径的 `1.1` 是 pix2text 的版本号,`1.1.*` 都对应 `1.1`。如果是其他版本请自行替换。 ## 表格识别模型 -**表格识别模型** 下载地址:[breezedeus/pix2text-table-rec](https://huggingface.co/breezedeus/pix2text-table-rec) (不能科学上网请使用 [国内地址](https://hf-mirror.com/breezedeus/pix2text-table-rec))。 +**表格识别模型** 下载地址:[breezedeus/pix2text-table-rec](https://huggingface.co/breezedeus/pix2text-table-rec) (不能科学上网请使用 [国内镜像](https://hf-mirror.com/breezedeus/pix2text-table-rec))。 把这里面的所有文件都下载到 `~/.pix2text/1.1/table-rec` (Windows 系统放在 `C:\Users\\AppData\Roaming\pix2text\1.1\table-rec`)目录下即可,目录不存在的话请自己创建。 > 注:上面路径的 `1.1` 是 pix2text 的版本号,`1.1.*` 都对应 `1.1`。如果是其他版本请自行替换。 -## 数学公式检测模型 +## 数学公式检测模型(MFD) +### `pix2text >= 1.1.1` +Pix2Text 自 **V1.1.1** 开始,**数学公式检测模型** 下载地址:[breezedeus/pix2text-mfd](https://huggingface.co/breezedeus/pix2text-mfd) (不能科学上网请使用 [国内镜像](https://hf-mirror.com/breezedeus/pix2text-mfd))。 + +### `pix2text < 1.1.1` **数学公式检测模型**(MFD)来自 [CnSTD](https://github.com/breezedeus/cnstd) 的数学公式检测模型(MFD),请参考其代码库说明。 如果系统无法自动成功下载模型文件,则需要手动从 [**cnstd-cnocr-models**](https://huggingface.co/breezedeus/cnstd-cnocr-models) ([国内镜像](https://hf-mirror.com/breezedeus/cnstd-cnocr-models))项目中下载,或者从[百度云盘](https://pan.baidu.com/s/1zDMzArCDrrXHWL0AWxwYQQ?pwd=nstd)(提取码为 `nstd`)下载对应的zip文件并把它存放于 
`~/.cnstd/1.2`(Windows下为 `C:\Users\\AppData\Roaming\cnstd\1.2`)目录中。 -## 数学公式识别模型 -**数学公式识别模型** 下载地址:[breezedeus/pix2text-mfr](https://huggingface.co/breezedeus/pix2text-mfr) (不能科学上网请使用 [国内地址](https://hf-mirror.com/breezedeus/pix2text-mfr))。 +## 数学公式识别模型(MFR) +**数学公式识别模型** 下载地址:[breezedeus/pix2text-mfr](https://huggingface.co/breezedeus/pix2text-mfr) (不能科学上网请使用 [国内镜像](https://hf-mirror.com/breezedeus/pix2text-mfr))。 把这里面的所有文件都下载到 `~/.pix2text/1.1/mfr-onnx` (Windows 系统放在 `C:\Users\\AppData\Roaming\pix2text\1.1\mfr-onnx`)目录下即可,目录不存在的话请自己创建。 > 注:上面路径的 `1.1` 是 pix2text 的版本号,`1.1.*` 都对应 `1.1`。如果是其他版本请自行替换。 @@ -77,14 +81,14 @@ EasyOCR 模型下载请参考 [EasyOCR](https://github.com/JaidedAI/EasyOCR)。 **模型购买地址**: -| 模型名称 | 购买地址 | 说明 -|--------------|-----------------------------------------------|-----------------------------------------------------------------------------------| -| MFD pro 模型 | [Lemon Squeezy](https://ocr.lemonsqueezy.com) | 包含企业版和个人版,可开发票。具体说明见:[P2T详细资料](https://www.breezedeus.com/article/pix2text_cn) | -| MFD pro 模型 | [B站工房](https://gf.bilibili.com/item/detail/1102870055) | 仅包含个人版,不可商用,不能开发票。具体说明见:[P2T详细资料](https://www.breezedeus.com/article/pix2text_cn) | -| MFR pro 模型 | [Lemon Squeezy](https://ocr.lemonsqueezy.com) | 包含企业版和个人版,可开发票。具体说明见:[P2T详细资料](https://www.breezedeus.com/article/pix2text_cn) | -| MFR pro 模型 | [B站工房](https://gf.bilibili.com/item/detail/1103052055) | 仅包含个人版,不可商用,不能开发票。具体说明见:[P2T详细资料](https://www.breezedeus.com/article/pix2text_cn) | -| CnOCR pro 模型 | [Lemon Squeezy](https://ocr.lemonsqueezy.com) | 包含企业版和个人版,可开发票。具体说明见:[P2T详细资料](https://www.breezedeus.com/article/pix2text_cn) 和 [CnOCR详细资料](https://www.breezedeus.com/article/cnocr) | -| CnOCR pro 模型 | [B站工房](https://gf.bilibili.com/item/detail/1104820055) | 仅包含个人版,不可商用,不能开发票。具体说明见:[P2T详细资料](https://www.breezedeus.com/article/pix2text_cn) 和 [CnOCR详细资料](https://www.breezedeus.com/article/cnocr) | +| 模型名称 | 购买地址 | 说明 
+|--------------|------------------------------------------------------------|-----------------------------------------------------------------------------------| +| MFD pro 模型 | ~~[Lemon Squeezy](https://ocr.lemonsqueezy.com)~~ | 包含企业版和个人版,可开发票。具体说明见:[P2T详细资料](https://www.breezedeus.com/article/pix2text_cn) | +| MFD pro 模型 | ~~[B站工房](https://gf.bilibili.com/item/detail/1102870055)~~ | 仅包含个人版,不可商用,不能开发票。具体说明见:[P2T详细资料](https://www.breezedeus.com/article/pix2text_cn) | +| MFR pro 模型 | [Lemon Squeezy](https://ocr.lemonsqueezy.com) | 包含企业版和个人版,可开发票。具体说明见:[P2T详细资料](https://www.breezedeus.com/article/pix2text_cn) | +| MFR pro 模型 | [B站工房](https://gf.bilibili.com/item/detail/1103052055) | 仅包含个人版,不可商用,不能开发票。具体说明见:[P2T详细资料](https://www.breezedeus.com/article/pix2text_cn) | +| CnOCR pro 模型 | [Lemon Squeezy](https://ocr.lemonsqueezy.com) | 包含企业版和个人版,可开发票。具体说明见:[P2T详细资料](https://www.breezedeus.com/article/pix2text_cn) 和 [CnOCR详细资料](https://www.breezedeus.com/article/cnocr) | +| CnOCR pro 模型 | [B站工房](https://gf.bilibili.com/item/detail/1104820055) | 仅包含个人版,不可商用,不能开发票。具体说明见:[P2T详细资料](https://www.breezedeus.com/article/pix2text_cn) 和 [CnOCR详细资料](https://www.breezedeus.com/article/cnocr) | 购买过程遇到问题可以扫码加小助手为好友进行沟通,备注 `p2t`,小助手会尽快答复: diff --git a/docs/requirements.txt b/docs/requirements.txt index b65b954..76a0f10 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -36,7 +36,7 @@ cnocr[ort-cpu]==2.3.0.2 # via # -r requirements.in # cnocr -cnstd==1.2.3.6 +cnstd==1.2.4.1 # via # -r requirements.in # cnocr diff --git a/docs/usage.md b/docs/usage.md index c718291..796b686 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -7,7 +7,8 @@ CnOCR 和 CnSTD 中的模型分别存于 `~/.cnocr` 和 `~/.cnstd` 中(Windows 下载过程请耐心等待,无法科学上网时系统会自动尝试其他可用站点进行下载,所以可能需要等待较长时间。 对于没有网络连接的机器,可以先把模型下载到其他机器上,然后拷贝到对应目录。 -如果系统无法自动成功下载模型文件,则需要手动下载模型文件,可以参考 [huggingface.co/breezedeus](https://huggingface.co/breezedeus) ([国内链接](https://hf-mirror.com/breezedeus))自己手动下载。 +如果系统无法自动成功下载模型文件,则需要手动下载模型文件,可以参考 
[huggingface.co/breezedeus](https://huggingface.co/breezedeus) ([国内镜像](https://hf-mirror.com/breezedeus))自己手动下载。 + 具体说明见 [模型下载](models.md)。 @@ -105,17 +106,16 @@ from pix2text import Pix2Text text_formula_config = dict( languages=('en', 'ch_sim'), # 设置识别的语言 - mfd=dict( # 声明 LayoutAnalyzer 的初始化参数 - model_type='yolov7', # 表示使用的是 YoloV7 模型,而不是 YoloV7_Tiny 模型 - model_fp=os.path.expanduser( - '~/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt' + mfd=dict( # 声明 MFD 的初始化参数 + model_path=os.path.expanduser( + '~/.pix2text/1.1/mfd-onnx/mfd-v20240618.onnx' ), # 注:修改成你的模型文件所存储的路径 ), formula=dict( model_name='mfr-pro', model_backend='onnx', model_dir=os.path.expanduser( - '~/.pix2text/1.0/mfr-pro-onnx' + '~/.pix2text/1.1/mfr-pro-onnx' ), # 注:修改成你的模型文件所存储的路径 ), text=dict( diff --git a/pix2text/text_formula_ocr.py b/pix2text/text_formula_ocr.py index ee350a4..66d0b89 100644 --- a/pix2text/text_formula_ocr.py +++ b/pix2text/text_formula_ocr.py @@ -12,7 +12,6 @@ from PIL import Image import numpy as np import torch -# from cnstd import LayoutAnalyzer from cnstd.utils import box_partial_overlap from spellchecker import SpellChecker @@ -130,9 +129,6 @@ def from_config( text_ocr = prepare_ocr_engine(languages, text_config) if enable_formula: - # if 'model_name' in mfd_config: - # mfd_config.pop('model_name') - # mfd = LayoutAnalyzer(model_name='mfd', **mfd_config) mfd = MathFormulaDetector(**mfd_config) latex_ocr = LatexOCR(**formula_config) else: @@ -219,8 +215,6 @@ def recognize( * `line_number`: The line number of the box (first line `line_number==0`), boxes with the same value indicate they are on the same line """ - # 对于大图片,把图片宽度resize到此大小;宽度比此小的图片,其实不会放大到此大小, - # 具体参考:cnstd.yolov7.layout_analyzer.LayoutAnalyzer._preprocess_images 中的 `letterbox` 行 resized_shape = kwargs.get('resized_shape', 768) if isinstance(img, Image.Image): img0 = img.convert('RGB') diff --git a/tests/test_latex_ocr.py b/tests/test_latex_ocr.py index 60d50a4..fc228d1 100644 --- a/tests/test_latex_ocr.py 
+++ b/tests/test_latex_ocr.py @@ -28,7 +28,7 @@ def test_infer_with_transformers(): from transformers import TrOCRProcessor from optimum.onnxruntime import ORTModelForVision2Seq - model_dir = os.path.expanduser('~/.pix2text/1.0/mfr-pro-onnx') + model_dir = os.path.expanduser('~/.pix2text/1.1/mfr-pro-onnx') processor = TrOCRProcessor.from_pretrained(model_dir) model = ORTModelForVision2Seq.from_pretrained(model_dir, use_cache=False) diff --git a/tests/test_pix2text.py b/tests/test_pix2text.py index 3fdce07..7482eff 100644 --- a/tests/test_pix2text.py +++ b/tests/test_pix2text.py @@ -12,17 +12,16 @@ def test_recognize_pdf(): img_fp = f'./docs/examples/{pdf_fn}.pdf' text_formula_config = dict( languages=('en', 'ch_sim'), - mfd=dict( # 声明 LayoutAnalyzer 的初始化参数 - model_type='yolov7', # 表示使用的是 YoloV7 模型,而不是 YoloV7_Tiny 模型 - model_fp=os.path.expanduser( - '~/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt' + mfd=dict( # 声明 MFD 的初始化参数 + model_path=os.path.expanduser( + '~/.pix2text/1.1/mfd-onnx/mfd-v20240618.onnx' ), # 注:修改成你的模型文件所存储的路径 ), formula=dict( model_name='mfr-pro', model_backend='onnx', model_dir=os.path.expanduser( - '~/.pix2text/1.0/mfr-pro-onnx' + '~/.pix2text/1.1/mfr-pro-onnx' ), # 注:修改成你的模型文件所存储的路径 ), text=dict( @@ -96,17 +95,16 @@ def test_blog_example(): img_fp = './docs/examples/mixed.jpg' text_formula_config = dict( - mfd=dict( # 声明 LayoutAnalyzer 的初始化参数 - model_type='yolov7', # 表示使用的是 YoloV7 模型,而不是 YoloV7_Tiny 模型 - model_fp=os.path.expanduser( - '~/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt' + mfd=dict( # 声明 MFD 的初始化参数 + model_path=os.path.expanduser( + '~/.pix2text/1.1/mfd-onnx/mfd-v20240618.onnx' ), # 注:修改成你的模型文件所存储的路径 ), formula=dict( model_name='mfr-pro', model_backend='onnx', model_dir=os.path.expanduser( - '~/.pix2text/1.0/mfr-pro-onnx' + '~/.pix2text/1.1/mfr-pro-onnx' ), # 注:修改成你的模型文件所存储的路径 ), ) @@ -129,17 +127,16 @@ def test_blog_pro_example(): text_formula_config = dict( languages=('en', 'ch_sim'), - mfd=dict( # 声明 LayoutAnalyzer 
的初始化参数 - model_type='yolov7', # 表示使用的是 YoloV7 模型,而不是 YoloV7_Tiny 模型 - model_fp=os.path.expanduser( - '~/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt' + mfd=dict( # 声明 MFD 的初始化参数 + model_path=os.path.expanduser( + '~/.pix2text/1.1/mfd-onnx/mfd-v20240618.onnx' ), # 注:修改成你的模型文件所存储的路径 ), formula=dict( model_name='mfr-pro', model_backend='onnx', model_dir=os.path.expanduser( - '~/.pix2text/1.0/mfr-pro-onnx' + '~/.pix2text/1.1/mfr-pro-onnx' ), # 注:修改成你的模型文件所存储的路径 ), text=dict( diff --git a/tests/test_text_formula_ocr.py b/tests/test_text_formula_ocr.py index cb63592..21c12fc 100644 --- a/tests/test_text_formula_ocr.py +++ b/tests/test_text_formula_ocr.py @@ -36,17 +36,16 @@ def test_blog_example(): img_fp = './docs/examples/mixed.jpg' total_config = dict( - mfd=dict( # 声明 LayoutAnalyzer 的初始化参数 - model_type='yolov7', # 表示使用的是 YoloV7 模型,而不是 YoloV7_Tiny 模型 - model_fp=os.path.expanduser( - '~/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt' + mfd=dict( # 声明 MFD 的初始化参数 + model_path=os.path.expanduser( + '~/.pix2text/1.1/mfd-onnx/mfd-v20240618.onnx' ), # 注:修改成你的模型文件所存储的路径 ), formula=dict( model_name='mfr-pro', model_backend='onnx', model_dir=os.path.expanduser( - '~/.pix2text/1.0/mfr-pro-onnx' + '~/.pix2text/1.1/mfr-pro-onnx' ), # 注:修改成你的模型文件所存储的路径 ), ) @@ -65,17 +64,16 @@ def test_blog_pro_example(): total_config = dict( languages=('en', 'ch_sim'), - mfd=dict( # 声明 LayoutAnalyzer 的初始化参数 - model_type='yolov7', # 表示使用的是 YoloV7 模型,而不是 YoloV7_Tiny 模型 - model_fp=os.path.expanduser( - '~/.cnstd/1.2/analysis/mfd-yolov7-epoch224-20230613.pt' + mfd=dict( # 声明 MFD 的初始化参数 + model_path=os.path.expanduser( + '~/.pix2text/1.1/mfd-onnx/mfd-v20240618.onnx' ), # 注:修改成你的模型文件所存储的路径 ), formula=dict( model_name='mfr-pro', model_backend='onnx', model_dir=os.path.expanduser( - '~/.pix2text/1.0/mfr-pro-onnx' + '~/.pix2text/1.1/mfr-pro-onnx' ), # 注:修改成你的模型文件所存储的路径 ), text=dict(