document adds spacy multi-language processing support and uses langde… #302

Open · wants to merge 1 commit into base: main
118 changes: 110 additions & 8 deletions goldenverba/components/document.py
@@ -5,6 +5,95 @@
import spacy
import json

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

SUPPORTED_LANGUAGES = {
    "en": "English",
    "zh": "Simplified Chinese",
    "zh-hant": "Traditional Chinese",
    "fr": "French",
    "de": "German",
    "nl": "Dutch",
}


def load_nlp_for_language(language: str):
    """Load a blank SpaCy pipeline for the given language."""
    if language not in SUPPORTED_LANGUAGES:
        raise ValueError(f"Unsupported language: {language}")

    # SpaCy has no separate "zh-hant" language code: both Simplified
    # and Traditional Chinese use the "zh" blank pipeline.
    spacy_code = "zh" if language == "zh-hant" else language
    nlp = spacy.blank(spacy_code)

    # Rule-based sentence segmentation; the sentencizer's default
    # punctuation set covers all of the supported languages.
    nlp.add_pipe("sentencizer")
    return nlp
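

# Illustrative usage sketch (hypothetical helper name, added for
# clarity): a blank pipeline plus sentencizer splits sentences with
# punctuation rules alone, so no trained model download is needed.
def _example_sentence_split():
    nlp = load_nlp_for_language("de")
    doc = nlp("Das ist ein Satz. Hier ist noch einer.")
    return [sent.text for sent in doc.sents]  # two sentences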


def detect_language(text: str) -> str:
    """Automatically detect the language of a text."""
    try:
        detected_lang = detect(text)
        # Fold langdetect's region-specific Chinese codes into the keys
        # used by SUPPORTED_LANGUAGES. langdetect itself emits only
        # "zh-cn"/"zh-tw"; "zh-hk" is kept as a defensive extra.
        if detected_lang == "zh-cn":
            return "zh"
        elif detected_lang in ("zh-tw", "zh-hk"):
            return "zh-hant"
        return detected_lang
    except LangDetectException:
        # Raised when no language features can be extracted
        return "unknown"


def split_text_by_language(text: str):
    """Separate text into language parts based on character ranges.

    Simplified and Traditional Chinese share the CJK Unified Ideographs
    block (U+4E00-U+9FFF), so codepoint ranges cannot reliably tell them
    apart; the Extension A block (U+3400-U+4DBF) is used as a rough
    heuristic so that no character lands in more than one bucket.
    """
    chinese_simplified = ''.join(char for char in text if '\u4e00' <= char <= '\u9fff')
    chinese_traditional = ''.join(char for char in text if '\u3400' <= char <= '\u4dbf')
    english_part = ''.join(char for char in text if char.isascii())
    other_text = ''.join(
        char for char in text
        if not (char.isascii() or '\u3400' <= char <= '\u4dbf' or '\u4e00' <= char <= '\u9fff')
    )

    return chinese_simplified, chinese_traditional, english_part, other_text
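

# Illustrative sketch (hypothetical helper name) of how the buckets
# come out for one mixed string:
def _example_split():
    simplified, traditional, english, other = split_text_by_language("Verba 支持 größe")
    # simplified == "支持", traditional == "" (no Extension A chars),
    # english == "Verba  gre" (all ASCII, both spaces collected),
    # other == "öß"
    return simplified, traditional, english, other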


def process_mixed_language(content: str):
    """Process mixed-language text."""
    chinese_simplified, chinese_traditional, english_text, other_text = split_text_by_language(content)

    docs = []

    if chinese_simplified:
        nlp_zh = load_nlp_for_language("zh")
        docs.append(nlp_zh(chinese_simplified))

    if chinese_traditional:
        nlp_zh_hant = load_nlp_for_language("zh-hant")
        docs.append(nlp_zh_hant(chinese_traditional))

    if english_text:
        nlp_en = load_nlp_for_language("en")
        docs.append(nlp_en(english_text))

    if other_text:
        detected_lang = detect_language(other_text)
        if detected_lang in SUPPORTED_LANGUAGES:
            nlp_other = load_nlp_for_language(detected_lang)
            docs.append(nlp_other(other_text))

    # Merge all processed documents. Note that Doc.from_docs returns
    # None when docs is empty, e.g. for unsupported-language-only text.
    doc = Doc.from_docs(docs)
    return doc
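

# End-to-end sketch (hypothetical helper name): a mixed English/Chinese
# string is split per language, run through the matching blank
# pipelines, and merged back into one Doc.
def _example_mixed():
    doc = process_mixed_language("Verba is great. 这个项目很好用。")
    return doc.text  # per-language segments concatenated by Doc.from_docs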


class Document:
    def __init__(
@@ -30,20 +119,33 @@ def __init__(
 
         MAX_BATCH_SIZE = 500000
 
-        nlp = spacy.blank("en")
-        nlp.add_pipe("sentencizer", config={"punct_chars": None})
-
-        if nlp and len(content) > MAX_BATCH_SIZE:
+        if len(content) > MAX_BATCH_SIZE:
             # Process content in batches
             docs = []
             for i in range(0, len(content), MAX_BATCH_SIZE):
                 batch = content[i : i + MAX_BATCH_SIZE]
-                docs.append(nlp(batch))
+
+                # Detect the language of each batch
+                detected_language = detect_language(batch)
+
+                if detected_language in SUPPORTED_LANGUAGES:
+                    nlp = load_nlp_for_language(detected_language)
+                    docs.append(nlp(batch))
+                else:
+                    # Fall back to mixed-language processing
+                    docs.append(process_mixed_language(batch))
 
             # Merge all processed docs
             doc = Doc.from_docs(docs)
         else:
-            doc = nlp(content) if nlp else None
+            # Smaller content is processed directly by detected language
+            detected_language = detect_language(content)
+            if detected_language in SUPPORTED_LANGUAGES:
+                nlp = load_nlp_for_language(detected_language)
+                doc = nlp(content)
+            else:
+                # Mixed-language content
+                doc = process_mixed_language(content)
 
         self.spacy_doc = doc

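Taken together, the new helpers give a self-contained path from raw text to a segmented spaCy Doc. A minimal smoke sketch of the dispatch the constructor now performs (illustrative only; it uses nothing but the functions added above, and the sample strings are made up):

english = "Verba indexes documents. It now detects languages too."
mixed = "Verba indexes documents. 现在也支持中文了。"

for sample in (english, mixed):
    lang = detect_language(sample)
    # Each sample takes whichever branch detect_language selects
    if lang in SUPPORTED_LANGUAGES:
        doc = load_nlp_for_language(lang)(sample)
    else:
        doc = process_mixed_language(sample)
    print(lang, doc.text[:40])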
1 change: 1 addition & 0 deletions setup.py
@@ -44,6 +44,7 @@
"aiofiles==24.1.0",
"assemblyai==0.33.0",
"beautifulsoup4==4.12.3",
"langdetect==1.0.9",
],
extras_require={
"dev": ["pytest", "wheel", "twine", "black>=23.7.0", "setuptools"],
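One caveat with the new langdetect dependency: its detector is seeded randomly, so results on short or ambiguous inputs can vary between runs. A common mitigation (not part of this PR) is to pin the factory seed once at import time:

from langdetect import DetectorFactory

DetectorFactory.seed = 0  # makes detect() deterministic across runs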