✨ SentenceSplitter use tokenizer
zhzLuke96 committed Jun 26, 2024
1 parent 6af9e24 commit d8b8596
Showing 2 changed files with 16 additions and 0 deletions.
modules/SentenceSplitter.py (9 additions, 0 deletions)
@@ -2,6 +2,7 @@
 
 import zhon
 
+from modules.models import get_tokenizer
 from modules.utils.detect_lang import guess_lang
 
 
@@ -11,7 +12,15 @@ class SentenceSplitter:
     SEP_TOKEN = " "
 
     def __init__(self, threshold=100):
+        assert (
+            isinstance(threshold, int) and threshold > 0
+        ), "Threshold must be greater than 0."
+
         self.sentence_threshold = threshold
+        self.tokenizer = get_tokenizer()
+
+    def count_tokens(self, text: str):
+        return len(self.tokenizer.tokenize(text))
 
     def parse(self, text: str):
         sentences = self.split_paragraph(text)
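For orientation, a minimal usage sketch (not part of the commit): it assumes the ChatTTS pretrained models are available locally so that get_tokenizer() can load them, and the threshold value and sample text are purely illustrative.

from modules.SentenceSplitter import SentenceSplitter

# Hypothetical usage: the constructor now rejects non-int or non-positive thresholds,
# and count_tokens returns len(tokenizer.tokenize(text)) for the given text.
splitter = SentenceSplitter(threshold=100)
print(splitter.count_tokens("你好，世界。Hello, world."))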
modules/models.py (7 additions, 0 deletions)
@@ -3,6 +3,7 @@
 import threading
 
 import torch
+from transformers import LlamaTokenizer
 
 from modules import config
 from modules.ChatTTS import ChatTTS
@@ -76,3 +77,9 @@ def reload_chat_tts():
     instance = load_chat_tts()
     logger.info("ChatTTS models reloaded")
     return instance
+
+
+def get_tokenizer() -> LlamaTokenizer:
+    chat_tts = load_chat_tts()
+    tokenizer = chat_tts.pretrain_models["tokenizer"]
+    return tokenizer
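A second hedged sketch, showing the new module-level accessor on its own; again this assumes load_chat_tts() can locate the ChatTTS pretrained models, tokenizer included.

from modules.models import get_tokenizer

# get_tokenizer() returns the LlamaTokenizer stored under pretrain_models["tokenizer"],
# so token counts here match what SentenceSplitter.count_tokens reports.
tokenizer = get_tokenizer()
tokens = tokenizer.tokenize("Hello, world.")
print(len(tokens))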
