forked from datawhalechina/llm-cookbook
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from SQ-AMD/main
为 LLM 应用预处理非结构化数据
- Loading branch information
Showing
11 changed files
with
3,857 additions
and
0 deletions.
There are no files selected for viewing
3,505 changes: 3,505 additions & 0 deletions
3,505
...eprocessing Unstructured Data for LLM Applications/3. 规范化内容 Normalizing the Content.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
69 changes: 69 additions & 0 deletions
69
content/选修-Preprocessing Unstructured Data for LLM Applications/Utils_Ch3.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import os | ||
import panel as pn | ||
from dotenv import load_dotenv | ||
|
||
|
||
# 确保加载面板扩展以实现基于 Web 的可视化。 | ||
pn.extension() | ||
|
||
|
||
class Utils:
    """Helper for reading DLAI configuration from environment variables.

    Creating an instance calls ``load_dotenv()`` so that the lookups below
    (plain ``os.getenv`` calls) can see values supplied that way.
    """

    def __init__(self):
        # Load the environment variables once, when the instance is created.
        load_dotenv()

    def get_dlai_api_key(self):
        """Return the DLAI API key from the environment.

        Returns:
            The value of ``DLAI_API_KEY`` if it is set; otherwise None.
        """
        return os.getenv("DLAI_API_KEY")

    def get_dlai_url(self):
        """Return the DLAI API URL from the environment.

        The lookup has no side effects: nothing is written to stdout.

        Returns:
            The value of ``DLAI_API_URL`` if it is set; otherwise None.
        """
        return os.getenv("DLAI_API_URL")
|
||
|
||
class UploadFile:
    """Handle a single-file upload through a Panel ``FileInput`` widget.

    The widget accepts only ``.pdf``, ``.ppt``, ``.png`` and ``.html`` files.
    Whenever the widget's ``filename`` parameter changes, ``save_filename``
    is invoked and stores the uploaded bytes under ``OUTPUT_DIR``.
    """

    # Largest upload that will be written to disk, in bytes (2 MB).
    MAX_FILE_SIZE = 2 * 1024 * 1024
    # Directory, relative to the current working directory, for saved uploads.
    OUTPUT_DIR = './example_files'

    def __init__(self):
        self.widget_file_upload = pn.widgets.FileInput(accept='.pdf,.ppt,.png,.html', multiple=False)
        # Watch the "filename" parameter so that save_filename runs on change.
        self.widget_file_upload.param.watch(self.save_filename, 'filename')

    def save_filename(self, event):
        """Save the uploaded file if it is within the 2 MB size limit.

        Args:
            event: Details of the change on the file-input widget. Not used
                here, but required by the watcher callback signature.

        Behavior:
            * Nothing happens when the widget holds no data or no file name.
            * A payload larger than ``MAX_FILE_SIZE`` is not saved; a message
              is printed instead.
            * Otherwise the bytes are written to ``OUTPUT_DIR`` under the
              base name of the uploaded file. Directory components in the
              supplied name are discarded so the write cannot leave
              ``OUTPUT_DIR``.
        """
        payload = self.widget_file_upload.value
        raw_name = self.widget_file_upload.filename

        # An empty widget has no value to measure or write.
        if payload is None or not raw_name:
            return

        if len(payload) > self.MAX_FILE_SIZE:
            print("文件过大。2 MB 限制!")
            return

        # The name comes from the client: strip any directory part (either
        # separator style) so that "../x" cannot escape the output directory.
        safe_name = os.path.basename(raw_name.replace('\\', '/'))
        if safe_name in ('', '.', '..'):
            return

        # Make sure the target directory exists before writing.
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)

        with open(os.path.join(self.OUTPUT_DIR, safe_name), 'wb') as f:
            f.write(payload)
Binary file added
BIN
+141 KB
content/选修-Preprocessing Unstructured Data for LLM Applications/example_files/CoT.pdf
Binary file not shown.
184 changes: 184 additions & 0 deletions
184
content/选修-Preprocessing Unstructured Data for LLM Applications/example_files/el_nino.html
Large diffs are not rendered by default.
Oops, something went wrong.
75 changes: 75 additions & 0 deletions
75
...nt/选修-Preprocessing Unstructured Data for LLM Applications/example_files/medium_blog.html
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file added
BIN
+3.25 MB
...nt/选修-Preprocessing Unstructured Data for LLM Applications/example_files/msft_openai.pptx
Binary file not shown.
Binary file added
BIN
+28.9 KB
...-Preprocessing Unstructured Data for LLM Applications/example_files/大语言模型原理简介及高校训练实践.docx
Binary file not shown.
Binary file added
BIN
+106 KB
content/选修-Preprocessing Unstructured Data for LLM Applications/images/3-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+158 KB
content/选修-Preprocessing Unstructured Data for LLM Applications/images/3-2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+520 KB
content/选修-Preprocessing Unstructured Data for LLM Applications/images/3-3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
24 changes: 24 additions & 0 deletions
24
content/选修-Preprocessing Unstructured Data for LLM Applications/requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
chromadb==0.4.22 | ||
langchain==0.1.5 | ||
langchain-community==0.0.17 | ||
langchain-core==0.1.19 | ||
langchain-openai==0.0.5 | ||
openai==1.30.2 | ||
tiktoken==0.5.2 | ||
unstructured-client==0.14.0 | ||
unstructured==0.11.8 | ||
unstructured-inference==0.7.23 | ||
unstructured.pytesseract==0.3.12 | ||
urllib3==1.26.18 | ||
python-dotenv==1.0.1 | ||
panel==1.3.0a8 | ||
ipython==8.12.3 | ||
python-pptx==0.6.23 | ||
pdf2image==1.17.0 | ||
pdfminer==20191125 | ||
opencv-python==4.9.0.80 | ||
pikepdf==8.13.0 | ||
pypdf==4.0.1 | ||
protobuf==4.22 | ||
requests==2.29.0 | ||
urllib3==1.25.11 |