Skip to content

Commit

Permalink
Corpus: language detection functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
djukicn committed Jun 30, 2022
1 parent 76457e2 commit 6f8ab87
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 0 deletions.
31 changes: 31 additions & 0 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
import scipy.sparse as sp
from gensim import corpora

from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

from Orange.data import (
Variable,
ContinuousVariable,
Expand All @@ -32,6 +35,9 @@
except ImportError:
summarize, PartialSummary = None, None

DetectorFactory.seed = 0  # fix langdetect's RNG seed so repeated detections are deterministic
MAX_DOC_REC = 50  # maximum number of documents considered for language inference


def get_sample_corpora_dir():
path = os.path.dirname(__file__)
Expand Down Expand Up @@ -94,13 +100,16 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None:
self.__used_preprocessor = PreprocessorList([]) # required for compute values
self._titles: Optional[np.ndarray] = None
self._pp_documents = None # preprocessed documents
self._recommended_language = None
self.language = None

if text_features is None:
self._infer_text_features()
else:
self.set_text_features(text_features)

self._set_unique_titles()
self._set_recommended_language()

@property
def used_preprocessor(self):
Expand Down Expand Up @@ -140,6 +149,23 @@ def _find_identical_feature(self, feature: Variable) -> Optional[Variable]:
return var
return None

def _set_recommended_language(self):
    """
    Detect the most likely language of the corpus.

    Runs langdetect over at most MAX_DOC_REC documents, each truncated
    to its first 2000 whitespace-separated tokens (keeps detection fast
    on very long documents), and stores the most frequent detection in
    ``self._recommended_language``. Falls back to 'en' when the corpus
    is empty or no language could be detected.
    """
    texts = [
        ' '.join(doc.replace('\n', ' ').split(' ')[:2000])
        for doc in self.documents[:MAX_DOC_REC]
    ]
    languages = []
    for text in texts:
        try:
            languages.append(detect(text))
        except LangDetectException:
            # raised for texts without usable features, e.g. empty or
            # punctuation/number-only documents
            languages.append('unknown')
    if languages:
        # most frequent detection wins; most_common's sort is stable,
        # so ties keep the first-seen language (same as the manual
        # sorted(...)[0][0] it replaces)
        self._recommended_language = Counter(languages).most_common(1)[0][0]
    else:
        self._recommended_language = 'unknown'
    if self._recommended_language == 'unknown':
        # langdetect could not decide; default to English
        self._recommended_language = 'en'

def set_text_features(self, feats: Optional[List[Variable]]) -> None:
"""
Select which meta-attributes to include when mining text.
Expand Down Expand Up @@ -488,6 +514,7 @@ def copy(self):
c._titles = self._titles
c._pp_documents = self._pp_documents
c._ngrams_corpus = self._ngrams_corpus
c.language = self.language
return c

@staticmethod
Expand Down Expand Up @@ -567,12 +594,14 @@ def from_numpy(
attributes=None,
ids=None,
text_features=None,
language=None
):
t = super().from_numpy(
domain, X, Y=Y, metas=metas, W=W, attributes=attributes, ids=ids
)
# t is corpus but corpus specific attributes were not set yet
t._setup_corpus(text_features=text_features)
t.language = language
return t

@classmethod
Expand Down Expand Up @@ -653,6 +682,8 @@ def retain_preprocessing(orig, new, key=...):
new.used_preprocessor = orig.used_preprocessor
if orig._ngrams_corpus is not None:
new.ngrams_corpus = orig._ngrams_corpus[key]

new.language = orig.language
else: # orig is not Corpus
new._set_unique_titles()
new._infer_text_features()
Expand Down
48 changes: 48 additions & 0 deletions orangecontrib/text/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,51 @@ def __getitem__(
"""
sparse = self.sparse.__getitem__((slice(None, None, None), key))
return Sparse2CorpusSliceable(sparse)


# ISO language codes from https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
# LANGUAGES and ISO are parallel lists (183 entries each): LANGUAGES[i] is the
# English name for the two-letter code ISO[i].
LANGUAGES = [
    'Abkhazian', 'Afar', 'Afrikaans', 'Akan', 'Albanian', 'Amharic', 'Arabic',
    'Aragonese', 'Armenian', 'Assamese', 'Avaric', 'Avestan', 'Aymara',
    'Azerbaijani', 'Bambara', 'Bashkir', 'Basque', 'Belarusian', 'Bengali',
    'Bislama', 'Bosnian', 'Breton', 'Bulgarian', 'Burmese', 'Catalan', 'Chamorro',
    'Chechen', 'Chichewa', 'Chinese', 'Church Slavic', 'Chuvash', 'Cornish', 'Corsican',
    'Cree', 'Croatian', 'Czech', 'Danish', 'Divehi', 'Dutch', 'Dzongkha',
    'English', 'Esperanto', 'Estonian', 'Ewe', 'Faroese', 'Fijian', 'Finnish',
    'French', 'Western Frisian', 'Fulah', 'Gaelic', 'Galician', 'Ganda', 'Georgian', 'German',
    'Greek', 'Kalaallisut', 'Guarani', 'Gujarati', 'Haitian', 'Hausa', 'Hebrew', 'Herero',
    'Hindi', 'Hiri Motu', 'Hungarian', 'Icelandic', 'Ido', 'Igbo', 'Indonesian', 'Interlingua',
    'Interlingue', 'Inuktitut', 'Inupiaq', 'Irish', 'Italian', 'Japanese', 'Javanese',
    'Kannada', 'Kanuri', 'Kashmiri', 'Kazakh', 'Central Khmer', 'Kikuyu', 'Kinyarwanda',
    'Kirghiz', 'Komi', 'Kongo', 'Korean', 'Kuanyama', 'Kurdish', 'Lao', 'Latin',
    'Latvian', 'Limburgan', 'Lingala', 'Lithuanian', 'Luba-Katanga', 'Luxembourgish',
    'Macedonian', 'Malagasy', 'Malay', 'Malayalam', 'Maltese', 'Manx', 'Maori',
    'Marathi', 'Marshallese', 'Mongolian', 'Nauru', 'Navajo', 'North Ndebele',
    'South Ndebele', 'Ndonga', 'Nepali', 'Norwegian', 'Norwegian Bokmål',
    'Norwegian Nynorsk', 'Sichuan Yi', 'Occitan', 'Ojibwa', 'Oriya',
    'Oromo', 'Ossetian', 'Pali', 'Pashto', 'Persian', 'Polish', 'Portuguese', 'Punjabi',
    'Quechua', 'Romanian', 'Romansh', 'Rundi', 'Russian', 'Northern Sami', 'Samoan', 'Sango',
    'Sanskrit', 'Sardinian', 'Serbian', 'Shona', 'Sindhi', 'Sinhala', 'Slovak', 'Slovenian',
    'Somali', 'Southern Sotho', 'Spanish', 'Sundanese', 'Swahili', 'Swati', 'Swedish', 'Tagalog',
    'Tahitian', 'Tajik', 'Tamil', 'Tatar', 'Telugu', 'Thai', 'Tibetan', 'Tigrinya', 'Tonga',
    'Tsonga', 'Tswana', 'Turkish', 'Turkmen', 'Twi', 'Uighur', 'Ukrainian', 'Urdu', 'Uzbek',
    'Venda', 'Vietnamese', 'Volapük', 'Walloon', 'Welsh', 'Wolof', 'Xhosa', 'Yiddish', 'Yoruba',
    'Zhuang', 'Zulu'
]
ISO = [
    'ab', 'aa', 'af', 'ak', 'sq', 'am', 'ar', 'an', 'hy', 'as', 'av', 'ae', 'ay',
    'az', 'bm', 'ba', 'eu', 'be', 'bn', 'bi', 'bs', 'br', 'bg', 'my', 'ca', 'ch',
    'ce', 'ny', 'zh', 'cu', 'cv', 'kw', 'co', 'cr', 'hr', 'cs', 'da', 'dv', 'nl',
    'dz', 'en', 'eo', 'et', 'ee', 'fo', 'fj', 'fi', 'fr', 'fy', 'ff', 'gd', 'gl',
    'lg', 'ka', 'de', 'el', 'kl', 'gn', 'gu', 'ht', 'ha', 'he', 'hz', 'hi', 'ho',
    'hu', 'is', 'io', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv',
    'kn', 'kr', 'ks', 'kk', 'km', 'ki', 'rw', 'ky', 'kv', 'kg', 'ko', 'kj', 'ku',
    'lo', 'la', 'lv', 'li', 'ln', 'lt', 'lu', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt',
    'gv', 'mi', 'mr', 'mh', 'mn', 'na', 'nv', 'nd', 'nr', 'ng', 'ne', 'no', 'nb',
    'nn', 'ii', 'oc', 'oj', 'or', 'om', 'os', 'pi', 'ps', 'fa', 'pl', 'pt', 'pa',
    'qu', 'ro', 'rm', 'rn', 'ru', 'se', 'sm', 'sg', 'sa', 'sc', 'sr', 'sn', 'sd',
    'si', 'sk', 'sl', 'so', 'st', 'es', 'su', 'sw', 'ss', 'sv', 'tl', 'ty', 'tg',
    'ta', 'tt', 'te', 'th', 'bo', 'ti', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'ug',
    'uk', 'ur', 'uz', 've', 'vi', 'vo', 'wa', 'cy', 'wo', 'xh', 'yi', 'yo', 'za',
    # 'zu' previously had a trailing space, which made ISO2LANG lookups for
    # langdetect's 'zu' result fail
    'zu'
]
ISO2LANG = dict(zip(ISO, LANGUAGES))
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ tweepy >=4.0.0
ufal.udpipe >=1.2.0.3
wikipedia
yake
langdetect

0 comments on commit 6f8ab87

Please sign in to comment.