Skip to content

Commit

Permalink
🐛 fix normalization lang detect
Browse files Browse the repository at this point in the history
- 移除 langdetect 依赖
- 修复 test unit
  • Loading branch information
zhzLuke96 committed Jun 24, 2024
1 parent 650a668 commit bd5e6eb
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 37 deletions.
8 changes: 3 additions & 5 deletions modules/SentenceSplitter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import re

import langdetect
import zhon

from modules.utils.detect_lang import guess_lang


def split_zhon_sentence(text):
result = []
Expand Down Expand Up @@ -37,10 +38,7 @@ def split_en_sentence(text):


def is_eng_sentence(text):
    """Return True when ``text`` is classified as English.

    Classification is delegated to ``guess_lang`` (imported from
    ``modules.utils.detect_lang``), which never raises on undetectable
    input, so no exception handling is needed here.

    Args:
        text: The sentence to classify.

    Returns:
        bool: True if ``guess_lang(text)`` is ``"en"``, False otherwise.
    """
    # The former langdetect-based try/except always returned first, which
    # left this call unreachable; it is now the only code path.
    return guess_lang(text) == "en"


def split_zhon_paragraph(text):
Expand Down
32 changes: 2 additions & 30 deletions modules/normalization.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import html
import re
from functools import lru_cache
from typing import Literal

import emojiswitch
import ftfy
import langdetect

from modules import models
from modules.utils.detect_lang import guess_lang
from modules.utils.HomophonesReplacer import HomophonesReplacer
from modules.utils.html import remove_html_tags as _remove_html_tags
from modules.utils.markdown import markdown_to_text
Expand All @@ -18,33 +16,6 @@
DISABLE_UNK_TOKEN_CHECK = False


# Legacy langdetect-based check: returns True when langdetect.detect(text)
# yields "zh", "zh-cn" or "zh-tw" (compared case-insensitively), and False
# otherwise, including when detection raises LangDetectException.
# Results are memoized for up to 64 distinct inputs.
# NOTE(review): this appears to be the removed side of the diff; a
# character-range replacement with the same name is shown under
# modules/utils/detect_lang.py -- confirm before relying on this version.
@lru_cache(maxsize=64)
def is_chinese(text):
try:
lang = langdetect.detect(text)
return lang.lower() in ["zh", "zh-cn", "zh-tw"]
except langdetect.LangDetectException:
return False


# Legacy langdetect-based check: returns True when langdetect.detect(text)
# yields "en" (compared case-insensitively), and False otherwise, including
# when detection raises LangDetectException.
# Results are memoized for up to 64 distinct inputs.
# NOTE(review): this appears to be the removed side of the diff; a
# character-range replacement with the same name is shown under
# modules/utils/detect_lang.py -- confirm before relying on this version.
@lru_cache(maxsize=64)
def is_eng(text):
try:
lang = langdetect.detect(text)
return lang.lower() in ["en"]
except langdetect.LangDetectException:
return False


# Legacy language guess: returns "zh" when is_chinese(text) is true, "en"
# when is_eng(text) is true, and falls back to "zh" when neither matches.
# Results are memoized for up to 64 distinct inputs.
# NOTE(review): this local definition appears to be the removed side of the
# diff; the same name is imported from modules.utils.detect_lang above, so
# keeping both would shadow the import -- confirm which one is intended.
@lru_cache(maxsize=64)
def guess_lang(text) -> Literal["zh", "en"]:
if is_chinese(text):
return "zh"
if is_eng(text):
return "en"
return "zh"


post_normalize_pipeline = []
pre_normalize_pipeline = []

Expand Down Expand Up @@ -332,6 +303,7 @@ def text_normalize(text, is_end=False):
" [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149",
" 明天有62%的概率降雨",
"大🍌,一条大🍌,嘿,你的感觉真的很奇妙 [lbreak]",
"I like eating 🍏",
"""
# 你好,世界
```js
Expand Down
27 changes: 27 additions & 0 deletions modules/utils/detect_lang.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from functools import lru_cache
from typing import Literal


@lru_cache(maxsize=64)
def is_chinese(text):
    """Report whether ``text`` contains any character in U+4E00..U+9FFF.

    The scan stops at the first matching character. Results are memoized
    for up to 64 distinct inputs.

    Args:
        text: The string to inspect.

    Returns:
        bool: True if at least one character lies in the inclusive range
        U+4E00..U+9FFF, False otherwise (including for an empty string).
    """
    return any("\u4e00" <= ch <= "\u9fff" for ch in text)


@lru_cache(maxsize=64)
def is_eng(text):
    """Report whether ``text`` contains at least one ASCII letter.

    Only ``A``-``Z`` and ``a``-``z`` count. Results are memoized for up to
    64 distinct inputs.

    Args:
        text: The string to inspect.

    Returns:
        bool: True if any character is an ASCII letter, False otherwise
        (including for an empty string).
    """
    # Test the character itself rather than comparing char.lower() against
    # "a".."z": some non-ASCII characters lowercase into that range
    # (U+212A KELVIN SIGN -> "k", U+0130 -> "i" + combining dot) and would
    # otherwise be counted as English letters.
    return any(ch.isascii() and ch.isalpha() for ch in text)


@lru_cache(maxsize=64)
def guess_lang(text) -> Literal["zh", "en"]:
    """Classify ``text`` as Chinese (``"zh"``) or English (``"en"``).

    ``is_chinese`` is consulted first and takes priority; ``is_eng`` is only
    consulted when the Chinese check fails. Text matching neither check
    defaults to ``"zh"``. Results are memoized for up to 64 distinct inputs.

    Args:
        text: The string to classify.

    Returns:
        ``"en"`` only when the text is not Chinese and is English per the
        two helper checks; ``"zh"`` in every other case.
    """
    # Short-circuiting keeps the original evaluation order: is_eng is never
    # called once is_chinese has matched.
    if not is_chinese(text) and is_eng(text):
        return "en"
    return "zh"
1 change: 1 addition & 0 deletions modules/utils/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def list(self, text: str, ordered: bool, **attrs) -> str:
return html + "\n" + text + "\n"
return "\n" + text + "\n"

# FIXME: the current list conversion cannot preserve item numbering
def list_item(self, text):
    """Render one list item as its text followed by a newline."""
    pieces = (text, "\n")
    return "".join(pieces)

Expand Down
4 changes: 2 additions & 2 deletions tests/test_normalize/test_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"input_text, expected_output",
[
# 测试标题
("# 你好,世界", "你好世界"),
("# 你好,世界", "你好,世界"),
# 测试代码块
("```js\nconsole.log('1')\n```", ""),
# 测试加粗
Expand Down Expand Up @@ -53,7 +53,7 @@
---
""".strip(),
# FIXME: 有序list现在没有序列号...
"你好世界\n加粗\n一条文本\nGoogle\n项目一\n项目二\n第一项\n第二项\n这是一段引用",
"你好,世界\n加粗\n一条文本\nGoogle\n项目一\n项目二\n第一项\n第二项\n这是一段引用",
),
],
)
Expand Down

0 comments on commit bd5e6eb

Please sign in to comment.