🐛 fix normalization lang detect

- 移除 langdetect 依赖
- 修复 unit test
lenML · Jun 24, 2024 · bd5e6eb · bd5e6eb
1 parent 650a668
commit bd5e6eb
Show file tree

Hide file tree

Showing 5 changed files with 35 additions and 37 deletions.
diff --git a/modules/SentenceSplitter.py b/modules/SentenceSplitter.py
@@ -1,8 +1,9 @@
 import re
 
-import langdetect
 import zhon
 
+from modules.utils.detect_lang import guess_lang
+
 
 def split_zhon_sentence(text):
     result = []
@@ -37,10 +38,7 @@ def split_en_sentence(text):
 
 
 def is_eng_sentence(text):
-    try:
-        return langdetect.detect(text) == "en"
-    except langdetect.LangDetectException:
-        return False
+    return guess_lang(text) == "en"
 
 
 def split_zhon_paragraph(text):

diff --git a/modules/normalization.py b/modules/normalization.py
@@ -1,13 +1,11 @@
 import html
 import re
-from functools import lru_cache
-from typing import Literal
 
 import emojiswitch
 import ftfy
-import langdetect
 
 from modules import models
+from modules.utils.detect_lang import guess_lang
 from modules.utils.HomophonesReplacer import HomophonesReplacer
 from modules.utils.html import remove_html_tags as _remove_html_tags
 from modules.utils.markdown import markdown_to_text
@@ -18,33 +16,6 @@
 DISABLE_UNK_TOKEN_CHECK = False
 
 
-@lru_cache(maxsize=64)
-def is_chinese(text):
-    try:
-        lang = langdetect.detect(text)
-        return lang.lower() in ["zh", "zh-cn", "zh-tw"]
-    except langdetect.LangDetectException:
-        return False
-
-
-@lru_cache(maxsize=64)
-def is_eng(text):
-    try:
-        lang = langdetect.detect(text)
-        return lang.lower() in ["en"]
-    except langdetect.LangDetectException:
-        return False
-
-
-@lru_cache(maxsize=64)
-def guess_lang(text) -> Literal["zh", "en"]:
-    if is_chinese(text):
-        return "zh"
-    if is_eng(text):
-        return "en"
-    return "zh"
-
-
 post_normalize_pipeline = []
 pre_normalize_pipeline = []
 
@@ -332,6 +303,7 @@ def text_normalize(text, is_end=False):
         " [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149",
         " 明天有62％的概率降雨",
         "大🍌，一条大🍌，嘿，你的感觉真的很奇妙  [lbreak]",
+        "I like eating 🍏",
         """
 # 你好，世界
 ```js

diff --git a/modules/utils/detect_lang.py b/modules/utils/detect_lang.py
@@ -0,0 +1,27 @@
+from functools import lru_cache
+from typing import Literal
+
+
+@lru_cache(maxsize=64)
+def is_chinese(text):
+    """Return True if ``text`` has at least one character in U+4E00..U+9FFF.
+
+    Only that single code-point range (the CJK Unified Ideographs block) is
+    checked; an empty string yields False. Results are memoized per distinct
+    ``text`` value, keeping up to 64 entries.
+    """
+    return any("\u4e00" <= ch <= "\u9fff" for ch in text)
+
+
+@lru_cache(maxsize=64)
+def is_eng(text):
+    """Return True if ``text`` contains at least one ASCII letter (A-Z, a-z).
+
+    An empty string yields False. Results are memoized per distinct ``text``
+    value, keeping up to 64 entries.
+    """
+    # Compare the raw character against both ASCII ranges instead of
+    # lower-casing it first: ``str.lower()`` can expand a non-ASCII character
+    # into a string that starts with an ASCII letter (U+0130 -> "i" + U+0307)
+    # or fold it onto one (U+212A KELVIN SIGN -> "k"), which would wrongly
+    # classify such text as English.
+    return any("a" <= ch <= "z" or "A" <= ch <= "Z" for ch in text)
+
+
+@lru_cache(maxsize=64)
+def guess_lang(text) -> Literal["zh", "en"]:
+    """Classify ``text`` as ``"zh"`` or ``"en"``.
+
+    ``is_chinese`` is consulted first and wins when it matches; ``"en"`` is
+    returned only when ``is_chinese`` is false and ``is_eng`` is true. Text
+    matching neither check falls back to ``"zh"``.
+    """
+    # Short-circuit keeps the original order: is_eng is evaluated only when
+    # is_chinese has already returned False.
+    if not is_chinese(text) and is_eng(text):
+        return "en"
+    return "zh"
diff --git a/modules/utils/markdown.py b/modules/utils/markdown.py
@@ -36,6 +36,7 @@ def list(self, text: str, ordered: bool, **attrs) -> str:
             return html + "\n" + text + "\n"
         return "\n" + text + "\n"
 
+    # FIXME: 现在的 list 转换没法保留序号
     def list_item(self, text):
         return "" + text + "\n"
 

diff --git a/tests/test_normalize/test_md.py b/tests/test_normalize/test_md.py
@@ -10,7 +10,7 @@
     "input_text, expected_output",
     [
         # 测试标题
-        ("# 你好，世界", "你好，世界"),
+        ("# 你好，世界", "你好,世界"),
         # 测试代码块
         ("```js\nconsole.log('1')\n```", ""),
         # 测试加粗
@@ -53,7 +53,7 @@
 ---
             """.strip(),
             # FIXME: 有序list现在没有序列号...
-            "你好，世界\n加粗\n一条文本\nGoogle\n项目一\n项目二\n第一项\n第二项\n这是一段引用",
+            "你好,世界\n加粗\n一条文本\nGoogle\n项目一\n项目二\n第一项\n第二项\n这是一段引用",
         ),
     ],
 )