✨ SentenceSplitter use tokenizer
zhzLuke96 committed Jun 26, 2024
1 parent 6af9e24 commit d8b8596
Showing 2 changed files with 16 additions and 0 deletions.
modules/SentenceSplitter.py (9 additions, 0 deletions)
@@ -2,6 +2,7 @@
 
 import zhon
 
+from modules.models import get_tokenizer
 from modules.utils.detect_lang import guess_lang
 
 
@@ -11,7 +12,15 @@ class SentenceSplitter:
     SEP_TOKEN = " "
 
     def __init__(self, threshold=100):
+        assert (
+            isinstance(threshold, int) and threshold > 0
+        ), "Threshold must be greater than 0."
+
         self.sentence_threshold = threshold
+        self.tokenizer = get_tokenizer()
+
+    def count_tokens(self, text: str):
+        return len(self.tokenizer.tokenize(text))
 
     def parse(self, text: str):
         sentences = self.split_paragraph(text)
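For orientation, a minimal usage sketch (not part of the commit): it assumes the ChatTTS pretrained models are available locally so that get_tokenizer() can load them, and the threshold value and sample text are purely illustrative.

from modules.SentenceSplitter import SentenceSplitter

# Hypothetical usage: the constructor now rejects non-int or non-positive thresholds,
# and count_tokens returns len(tokenizer.tokenize(text)) for the given text.
splitter = SentenceSplitter(threshold=100)
print(splitter.count_tokens("你好，世界。Hello, world."))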
modules/models.py (7 additions, 0 deletions)
@@ -3,6 +3,7 @@
 import threading
 
 import torch
+from transformers import LlamaTokenizer
 
 from modules import config
 from modules.ChatTTS import ChatTTS
@@ -76,3 +77,9 @@ def reload_chat_tts():
     instance = load_chat_tts()
     logger.info("ChatTTS models reloaded")
     return instance
+
+
+def get_tokenizer() -> LlamaTokenizer:
+    chat_tts = load_chat_tts()
+    tokenizer = chat_tts.pretrain_models["tokenizer"]
+    return tokenizer
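A second hedged sketch, showing the new module-level accessor on its own; again this assumes load_chat_tts() can locate the ChatTTS pretrained models, tokenizer included.

from modules.models import get_tokenizer

# get_tokenizer() returns the LlamaTokenizer stored under pretrain_models["tokenizer"],
# so token counts here match what SentenceSplitter.count_tokens reports.
tokenizer = get_tokenizer()
tokens = tokenizer.tokenize("Hello, world.")
print(len(tokens))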
