Move language detection to language module and language to table attributes
PrimozGodec committed Jul 18, 2022
1 parent 8c7b9d0 commit 4b93fb2
Showing 2 changed files with 31 additions and 33 deletions.
39 changes: 6 additions & 33 deletions orangecontrib/text/corpus.py
@@ -11,9 +11,6 @@
import scipy.sparse as sp
from gensim import corpora

from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

from Orange.data import (
Variable,
ContinuousVariable,
@@ -35,9 +32,6 @@
except ImportError:
summarize, PartialSummary = None, None

DetectorFactory.seed = 0
MAX_DOC_REC = 50 # a largest number of documents considered for language inference


def get_sample_corpora_dir():
path = os.path.dirname(__file__)
@@ -100,16 +94,13 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None:
self.__used_preprocessor = PreprocessorList([]) # required for compute values
self._titles: Optional[np.ndarray] = None
self._pp_documents = None # preprocessed documents
self._recommended_language = None
self.language = None

if text_features is None:
self._infer_text_features()
else:
self.set_text_features(text_features)

self._set_unique_titles()
self._set_recommended_language()

@property
def used_preprocessor(self):
@@ -149,23 +140,6 @@ def _find_identical_feature(self, feature: Variable) -> Optional[Variable]:
return var
return None

def _set_recommended_language(self):
texts = [
' '.join(t.replace('\n', ' ').split(' ')[:2000])
for t in self.documents[:MAX_DOC_REC]
]
languages = list()
for text in texts:
try:
languages.append(detect(text))
except LangDetectException:
languages.append('unknown')
self._recommended_language = sorted(
Counter(languages).items(), key=lambda x: x[1], reverse=True
)[0][0] if len(languages) > 0 else 'unknown'
if self._recommended_language == 'unknown':
self._recommended_language = 'en'

def set_text_features(self, feats: Optional[List[Variable]]) -> None:
"""
Select which meta-attributes to include when mining text.
@@ -504,7 +478,7 @@ def copy(self):
"""Return a copy of the table."""
c = super().copy()
# since tokens and dictionary are considered immutable copies are not needed
c._setup_corpus(copy(self.text_features))
c._setup_corpus(text_features=copy(self.text_features))
c._tokens = self._tokens
c._dictionary = self._dictionary
c.ngram_range = self.ngram_range
@@ -514,12 +488,11 @@ def copy(self):
c._titles = self._titles
c._pp_documents = self._pp_documents
c._ngrams_corpus = self._ngrams_corpus
c.language = self.language
return c

@staticmethod
def from_documents(documents, name, attributes=None, class_vars=None, metas=None,
title_indices=None):
title_indices=None, language=None):
"""
Create corpus from documents.
@@ -568,6 +541,7 @@ def to_val(attr, val):
domain=domain, X=X, Y=Y, metas=metas, text_features=[]
)
corpus.name = name
corpus.attributes["language"] = language
return corpus

def __getitem__(self, key):
@@ -601,14 +575,15 @@ def from_numpy(
)
# t is corpus but corpus specific attributes were not set yet
t._setup_corpus(text_features=text_features)
t.language = language
t.attributes["language"] = language
return t

@classmethod
def from_list(cls, domain, rows, weights=None):
def from_list(cls, domain, rows, weights=None, language=None):
t = super().from_list(domain, rows, weights)
# t is corpus but corpus specific attributes were not set yet
t._setup_corpus()
t.attributes["language"] = language
return t

@classmethod
@@ -682,8 +657,6 @@ def retain_preprocessing(orig, new, key=...):
new.used_preprocessor = orig.used_preprocessor
if orig._ngrams_corpus is not None:
new.ngrams_corpus = orig._ngrams_corpus[key]

new.language = orig.language
else: # orig is not Corpus
new._set_unique_titles()
new._infer_text_features()
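The net effect of the corpus.py changes is that the corpus language no longer lives on a dedicated `language` / `_recommended_language` member, but in the table-level `attributes` dict, so it is stored the same way as any other table attribute. A hedged sketch of what this looks like from user code follows; `Corpus.from_file` and the "book-excerpts" sample name are assumptions based on the add-on's usual API and are not part of this diff.

```python
# Hedged sketch (not part of the commit): the convention after this change is
# to keep the language under corpus.attributes["language"], so it travels with
# the table instead of needing corpus-specific handling such as the removed
# c.language = self.language line in copy().
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.language import detect_language

corpus = Corpus.from_file("book-excerpts")          # assumed sample corpus name
corpus.attributes["language"] = detect_language(list(corpus.documents))
print(corpus.attributes.get("language"))            # e.g. "en"
```

Downstream code can then read the language from the table attributes rather than from a Corpus-specific field.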
25 changes: 25 additions & 0 deletions orangecontrib/text/language.py
@@ -1,3 +1,10 @@
from collections import Counter
from typing import List

from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException


iso2lang = {
"af": "Afrikaans",
"am": "Amharic",
@@ -76,3 +83,21 @@
"zh-tw": "Traditional Chinese",
}
lang2iso = {lang: code for code, lang in iso2lang.items()}


DetectorFactory.seed = 0
MAX_DOCS = 50 # max number of documents considered for language detection
MAX_WORDS = 2000 # max number of words in document considered for lang detection


def detect_language(texts: List[str]):
texts = [
" ".join(t.replace("\n", " ").split(" ")[:MAX_WORDS]) for t in texts[:MAX_DOCS]
]
languages = list()
for text in texts:
try:
languages.append(detect(text))
except LangDetectException:
languages.append(None)
return Counter(languages).most_common(1)[0][0] if len(languages) > 0 else None
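
A minimal usage sketch of the new helper is given below. It assumes the module is importable as `orangecontrib.text.language` (matching the file path in this diff) and that `langdetect` is installed; the input texts and printed output are illustrative only.

```python
# Minimal sketch (assumptions noted above): detect_language() returns the most
# common language code reported by langdetect across the given texts, or None
# when nothing could be detected; iso2lang maps the code to a readable name.
from orangecontrib.text.language import detect_language, iso2lang

texts = [
    "Orange is a component-based data mining suite.",
    "The Text add-on extends it with corpus handling.",
]
code = detect_language(texts)                 # e.g. "en"
print(code, iso2lang.get(code, "unknown"))
```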
