diff --git a/.gitignore b/.gitignore
index 18dff633d..d6f0b0a65 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,6 +58,7 @@ target/
 
 # Jupyter Notebook
 .ipynb_checkpoints
+Untitled*.ipynb
 
 # IDE files
 .idea
diff --git a/examples/spell.py b/examples/spell.py
index 0d39ff07f..773a122ae 100644
--- a/examples/spell.py
+++ b/examples/spell.py
@@ -1,8 +1,21 @@
 # -*- coding: utf-8 -*-
 
 from pythainlp.spell import spell
+from pythainlp.spell.pn import spell as pn_tnc_spell
+from pythainlp.spell.pn import correct as pn_tnc_correct
+from pythainlp.spell.pn import NorvigSpellChecker
+from pythainlp.corpus import ttc
 
-a = spell("สี่เหลียม")
-print(a)  # ['สี่เหลี่ยม']
+# checker from pythainlp.spell module (generic)
+spell("สี่เหลียม")  # ['สี่เหลี่ยม']
+# spell("สี่เหลียม", engine="hunspell")  # available in some Linux systems
 
-# a = spell("สี่เหลียม", engine="hunspell")  # available in some Linux systems
+# checker from pythainlp.spell.pn module (specified algorithm - Peter Norvig's)
+pn_tnc_spell("เหลืยม")
+pn_tnc_correct("เหลืยม")
+
+# checker from pythainlp.spell.pn module (specified algorithm, custom dictionary)
+ttc_word_freqs = ttc.get_word_frequency_all()
+pn_ttc_spell_checker = NorvigSpellChecker(custom_dict=ttc_word_freqs)
+pn_ttc_spell_checker.spell("เหลืยม")
+pn_ttc_spell_checker.correct("เหลืยม")
diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py
index 62b71194f..8310894e7 100644
--- a/pythainlp/corpus/__init__.py
+++ b/pythainlp/corpus/__init__.py
@@ -3,16 +3,16 @@
 import os
 
 import requests
-from future.moves.urllib.request import urlopen
 from pythainlp.tools import get_path_data, get_path_db
 from tinydb import Query, TinyDB
 from tqdm import tqdm
+from urllib.request import urlopen
 
 CORPUS_DB_URL = (
     "https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json"
 )
 
-# __all__ = ["thaipos", "thaiword","alphabet","tone","country","wordnet"]
+# __all__ = ["thaipos", "thaiword", "alphabet", "tone", "country", "wordnet"]
 
 path_db_ = get_path_db()
diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py
index 5b9a15438..28c2c0780 100644
--- a/pythainlp/corpus/tnc.py
+++ b/pythainlp/corpus/tnc.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-Word frequency from Thai National Corpus
+Thai National Corpus word frequency
+
 Credit: Korakot Chaovavanich
 https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
 """
@@ -57,6 +58,6 @@ def get_word_frequency_all():
     listword = []
     for line in lines:
         listindata = line.split("	")
-        listword.append((listindata[0], listindata[1]))
+        listword.append((listindata[0], int(listindata[1])))
 
     return listword
diff --git a/pythainlp/corpus/ttc.py b/pythainlp/corpus/ttc.py
index 015b8e2ae..fbf82d9a5 100644
--- a/pythainlp/corpus/ttc.py
+++ b/pythainlp/corpus/ttc.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-TTC Thai word frequency
+Thai Textbook Corpus (TTC) word frequency
+
 Credit: Korakot Chaovavanich
 https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
 """
@@ -13,7 +14,7 @@
 def get_word_frequency_all():
     """
-    ดึงข้อมูลความถี่คำของ TTC มาใช้งาน
+    ดึงข้อมูลความถี่คำของ Thai Textbook Corpus (TTC) มาใช้งาน
     โดยมีรูปแบบข้อมูลเป็น List[Tuple]
     [(word, frequency), ...]
""" path = os.path.join(os.path.expanduser("~"), "pythainlp-data") @@ -34,6 +35,6 @@ def get_word_frequency_all(): listword = [] for line in lines: listindata = line.split(" ") - listword.append((listindata[0], listindata[1])) + listword.append((listindata[0], int(listindata[1]))) return listword diff --git a/pythainlp/ner/__init__.py b/pythainlp/ner/__init__.py index 12089b927..b73ef4402 100644 --- a/pythainlp/ner/__init__.py +++ b/pythainlp/ner/__init__.py @@ -5,6 +5,7 @@ from pythainlp.corpus import download, get_file, stopwords from pythainlp.tag import pos_tag from pythainlp.tokenize import word_tokenize +from pythainlp.util import is_thaiword try: import sklearn_crfsuite @@ -22,20 +23,6 @@ _STOPWORDS = stopwords.words("thai") -def _is_thaichar(ch): # เป็นอักษรไทยหรือไม่ - ch_val = ord(ch) - if ch_val >= 3584 and ch_val <= 3711: - return True - return False - - -def _is_thaiword(word): # เป็นคำที่มีแต่อักษรไทยหรือไม่ - for ch in word: - if ch != "." and not _is_thaichar(ch): - return False - return True - - def _is_stopword(word): # เช็คว่าเป็นคำฟุ่มเฟือย return word in _STOPWORDS @@ -43,41 +30,48 @@ def _is_stopword(word): # เช็คว่าเป็นคำฟุ่ม def _doc2features(doc, i): word = doc[i][0] postag = doc[i][1] + # Features from current word features = { "word.word": word, "word.stopword": _is_stopword(word), - "word.isthai": _is_thaiword(word), + "word.isthai": is_thaiword(word), "word.isspace": word.isspace(), "postag": postag, "word.isdigit()": word.isdigit(), } - if word.isdigit() and len(word) == 5: features["word.islen5"] = True + # Features from previous word if i > 0: prevword = doc[i - 1][0] - postag1 = doc[i - 1][1] - features["word.prevword"] = prevword - features["word.previsspace"] = prevword.isspace() - features["word.previsthai"] = _is_thaiword(prevword) - features["word.prevstopword"] = _is_stopword(prevword) - features["word.prepostag"] = postag1 - features["word.prevwordisdigit"] = prevword.isdigit() + prevpostag = doc[i - 1][1] + prev_features = { + "word.prevword": prevword, + "word.previsspace": prevword.isspace(), + "word.previsthai": is_thaiword(prevword), + "word.prevstopword": _is_stopword(prevword), + "word.prevpostag": prevpostag, + "word.prevwordisdigit": prevword.isdigit(), + } + features.update(prev_features) else: features["BOS"] = True # Special "Beginning of Sequence" tag # Features from next word if i < len(doc) - 1: nextword = doc[i + 1][0] - postag1 = doc[i + 1][1] - features["word.nextword"] = nextword - features["word.nextisspace"] = nextword.isspace() - features["word.nextpostag"] = postag1 - features["word.nextisthai"] = _is_thaiword(nextword) - features["word.nextstopword"] = _is_stopword(nextword) - features["word.nextwordisdigit"] = nextword.isdigit() + nextpostag = doc[i + 1][1] + next_features = { + "word.nextword": nextword, + "word.nextisspace": nextword.isspace(), + "word.nextpostag": nextpostag, + "word.nextisthai": is_thaiword(nextword), + "word.nextstopword": _is_stopword(nextword), + "word.nextwordisdigit": nextword.isdigit(), + } + features.update(next_features) else: features["EOS"] = True # Special "End of Sequence" tag diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index fe3ff225e..5099ca30f 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -1,122 +1,278 @@ # -*- coding: utf-8 -*- """ -Spell checker +Spell checker, using Peter Norvig algorithm. +Spelling dictionary can be customized. +Default spelling dictionary is based on Thai National Corpus. 
-Based on Peter Norvig's Python code at http://norvig.com/spell-correct.html
+Based on Peter Norvig's Python code from http://norvig.com/spell-correct.html
 """
 from collections import Counter
 
-from pythainlp.corpus.thaiword import get_data
-WORDS = Counter(get_data())
+from pythainlp.corpus import tnc
+from pythainlp.util import is_thaichar
 
+_THAI_CHARS = [
+    "ก",
+    "ข",
+    "ฃ",
+    "ค",
+    "ฅ",
+    "ฆ",
+    "ง",
+    "จ",
+    "ฉ",
+    "ช",
+    "ซ",
+    "ฌ",
+    "ญ",
+    "ฎ",
+    "ฏ",
+    "ฐ",
+    "ฑ",
+    "ฒ",
+    "ณ",
+    "ด",
+    "ต",
+    "ถ",
+    "ท",
+    "ธ",
+    "น",
+    "บ",
+    "ป",
+    "ผ",
+    "ฝ",
+    "พ",
+    "ฟ",
+    "ภ",
+    "ม",
+    "ย",
+    "ร",
+    "ฤ",
+    "ล",
+    "ฦ",
+    "ว",
+    "ศ",
+    "ษ",
+    "ส",
+    "ห",
+    "ฬ",
+    "อ",
+    "ฮ",
+    "ฯ",
+    "ะ",
+    "ั",
+    "า",
+    "ำ",
+    "ิ",
+    "ี",
+    "ึ",
+    "ื",
+    "ุ",
+    "ู",
+    "ฺ",
+    "\u0e3b",
+    "\u0e3c",
+    "\u0e3d",
+    "\u0e3e",
+    "฿",
+    "เ",
+    "แ",
+    "โ",
+    "ใ",
+    "ไ",
+    "ๅ",
+    "ๆ",
+    "็",
+    "่",
+    "้",
+    "๊",
+    "๋",
+    "์",
+]
 
 
-def prob(word, n=sum(WORDS.values())):
-    "Probability of `word`."
-    return WORDS[word] / n
+def _no_filter(word):
+    return True
 
 
-def correction(word):
-    "แสดงคำที่เป็นไปได้มากที่สุด"
-    return max(spell(word), key=prob)
+def _is_thai_and_not_num(word):
+    for ch in word:
+        if ch != "." and not is_thaichar(ch):
+            return False
+        if ch in "๐๑๒๓๔๕๖๗๘๙0123456789":
+            return False
+    return True
 
 
-def known(words):
-    return list(w for w in words if w in WORDS)
-
-
-def edits1(word):
-    letters = [
-        "ก",
-        "ข",
-        "ฃ",
-        "ค",
-        "ฅ",
-        "ฆ",
-        "ง",
-        "จ",
-        "ฉ",
-        "ช",
-        "ซ",
-        "ฌ",
-        "ญ",
-        "ฎ",
-        "ฏ",
-        "ฐ",
-        "ฑ",
-        "ฒ",
-        "ณ",
-        "ด",
-        "ต",
-        "ถ",
-        "ท",
-        "ธ",
-        "น",
-        "บ",
-        "ป",
-        "ผ",
-        "ฝ",
-        "พ",
-        "ฟ",
-        "ภ",
-        "ม",
-        "ย",
-        "ร",
-        "ฤ",
-        "ล",
-        "ฦ",
-        "ว",
-        "ศ",
-        "ษ",
-        "ส",
-        "ห",
-        "ฬ",
-        "อ",
-        "ฮ",
-        "ฯ",
-        "ะ",
-        "ั",
-        "า",
-        "ำ",
-        "ิ",
-        "ี",
-        "ึ",
-        "ื",
-        "ุ",
-        "ู",
-        "ฺ",
-        "\u0e3b",
-        "\u0e3c",
-        "\u0e3d",
-        "\u0e3e",
-        "฿",
-        "เ",
-        "แ",
-        "โ",
-        "ใ",
-        "ไ",
-        "ๅ",
-        "ๆ",
-        "็",
-        "่",
-        "้",
-        "๊",
-        "๋",
-        "์",
-    ]
+
+def _keep(word_freq, min_freq, min_len, max_len, dict_filter):
+    """
+    Keep only Thai words with at least min_freq frequency
+    and a length between min_len and max_len characters
+    """
+    if not word_freq or word_freq[1] < min_freq:
+        return False
+
+    word = word_freq[0]
+    if not word or len(word) < min_len or len(word) > max_len or word[0] == ".":
+        return False
+
+    return dict_filter(word)
+
+
+def _edits1(word):
+    """
+    Return a set of words with edit distance of 1 from the input word
+    """
     splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
     deletes = [L + R[1:] for L, R in splits if R]
     transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
-    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
-    inserts = [L + c + R for L, R in splits for c in letters]
+    replaces = [L + c + R[1:] for L, R in splits if R for c in _THAI_CHARS]
+    inserts = [L + c + R for L, R in splits for c in _THAI_CHARS]
+
     return set(deletes + transposes + replaces + inserts)
 
 
-def edits2(word):
-    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
+def _edits2(word):
+    """
+    Return a set of words with edit distance of 2 from the input word
+    """
+    return set(e2 for e1 in _edits1(word) for e2 in _edits1(e1))
+
+
+class NorvigSpellChecker:
+    def __init__(
+        self,
+        custom_dict=None,
+        min_freq=2,
+        min_len=2,
+        max_len=40,
+        dict_filter=_is_thai_and_not_num,
+    ):
+        """
+        Initialize Peter Norvig's spell checker object
+
+        :param list custom_dict: A list of tuples (word, frequency) to create a spelling dictionary. Default is from Thai National Corpus (around 40,000 words).
+        :param int min_freq: Minimum frequency of a word to keep (default = 2)
+        :param int min_len: Minimum length (in characters) of a word to keep (default = 2)
+        :param int max_len: Maximum length (in characters) of a word to keep (default = 40)
+        :param func dict_filter: A function to filter the dictionary. The default filter removes any word with numbers or non-Thai characters. If no filter is required, use None.
+        """
+        if not custom_dict:  # default, use Thai National Corpus
+            custom_dict = tnc.get_word_frequency_all()
+
+        if not dict_filter:
+            dict_filter = _no_filter
+
+        # filter word list
+        custom_dict = [
+            word_freq
+            for word_freq in custom_dict
+            if _keep(word_freq, min_freq, min_len, max_len, dict_filter)
+        ]
+
+        self.__WORDS = Counter(dict(custom_dict))
+        self.__WORDS_TOTAL = sum(self.__WORDS.values())
+        if self.__WORDS_TOTAL < 1:
+            self.__WORDS_TOTAL = 0
+
+    def dictionary(self):
+        """
+        Return the spelling dictionary currently used by this spell checker
+        """
+        return self.__WORDS.items()
+
+    def known(self, words):
+        """
+        Return a list of given words that are found in the spelling dictionary
+
+        :param list words: A list of words to check if they are in the spelling dictionary
+        """
+        return list(w for w in words if w in self.__WORDS)
+
+    def prob(self, word):
+        """
+        Return probability of an input word, according to the spelling dictionary
+
+        :param str word: A word to check its probability of occurrence
+        """
+        return self.__WORDS[word] / self.__WORDS_TOTAL
+
+    def spell(self, word):
+        """
+        Return a list of possible words, according to edit distance of 1 and 2,
+        sorted by probability of word occurrence in the spelling dictionary
+
+        :param str word: A word to check its spelling
+        """
+        if not word:
+            return ""
+
+        candidates = (
+            self.known([word])
+            or self.known(_edits1(word))
+            or self.known(_edits2(word))
+            or [word]
+        )
+        candidates.sort(key=self.prob, reverse=True)
+
+        return candidates
+
+    def correct(self, word):
+        """
+        Return the most probable word, using the probability from the spelling dictionary
+
+        :param str word: A word to correct its spelling
+        """
+        if not word:
+            return ""
+
+        return self.spell(word)[0]
+
+
+DEFAULT_SPELL_CHECKER = NorvigSpellChecker()
+
+
+def dictionary():
+    """
+    Return the spelling dictionary currently used by this spell checker.
+    The spelling dictionary is based on words found in the Thai National Corpus.
+    """
+    return DEFAULT_SPELL_CHECKER.dictionary()
+
+
+def known(words):
+    """
+    Return a list of given words that are found in the spelling dictionary.
+    The spelling dictionary is based on words found in the Thai National Corpus.
+
+    :param list words: A list of words to check if they are in the spelling dictionary
+    """
+    return DEFAULT_SPELL_CHECKER.known(words)
+
+
+def prob(word):
+    """
+    Return probability of an input word, according to the Thai National Corpus
+
+    :param str word: A word to check its probability of occurrence
+    """
+    return DEFAULT_SPELL_CHECKER.prob(word)
 
 
 def spell(word):
-    if not word:
-        return ""
-    else:
-        return known([word]) or known(edits1(word)) or known(edits2(word)) or [word]
+    """
+    Return a list of possible words, according to edit distance of 1 and 2,
+    sorted by probability of word occurrence in the Thai National Corpus.
+
+    :param str word: A word to check its spelling
+    """
+    return DEFAULT_SPELL_CHECKER.spell(word)
+
+
+def correct(word):
+    """
+    Return the most probable word, according to probability from the Thai National Corpus
+
+    :param str word: A word to correct its spelling
+    """
+    return DEFAULT_SPELL_CHECKER.correct(word)
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index 7566d83a3..f129fe5ad 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -7,6 +7,20 @@
 from nltk.util import ngrams as ngramsdata
 
 
+def is_thaichar(ch):  # เป็นอักษรไทยหรือไม่
+    ch_val = ord(ch)
+    if ch_val >= 3584 and ch_val <= 3711:
+        return True
+    return False
+
+
+def is_thaiword(word):  # เป็นคำที่มีแต่อักษรไทยหรือไม่
+    for ch in word:
+        if ch != "." and not is_thaichar(ch):
+            return False
+    return True
+
+
 def ngrams(token, num):
     """
     ngrams สร้าง ngrams
@@ -34,7 +48,7 @@ def trigram(token):
     return ngrams(token, 3)
 
 
-RULE1 = [
+_NORMALIZE_RULE1 = [
     "ะ",
     "ั",
     "็",
@@ -61,7 +75,7 @@
 ]  # เก็บพวกสระ วรรณยุกต์ที่ซ้ำกันแล้วมีปัญหา
 
 
-RULE2 = [
+_NORMALIZE_RULE2 = [
     ("เเ", "แ"),  # เ เ -> แ
     ("ํ(t)า", "\\1ำ"),
     ("ํา(t)", "\\1ำ"),
@@ -81,9 +95,9 @@ def normalize(text):
     >>> print(normalize("เเปลก")=="แปลก") # เ เ ป ล ก กับ แปลก
     True
     """
-    for data in RULE2:
+    for data in _NORMALIZE_RULE2:
         text = re.sub(data[0].replace("t", "[่้๊๋]"), data[1], text)
-    for data in list(zip(RULE1, RULE1)):
+    for data in list(zip(_NORMALIZE_RULE1, _NORMALIZE_RULE1)):
         text = re.sub(data[0].replace("t", "[่้๊๋]") + "+", data[1], text)
     return text
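A short usage sketch of the NorvigSpellChecker API added in pythainlp/spell/pn.py above. Class, function, and parameter names come from the patch itself; keep_all is a hypothetical filter written only for this example, and the printed values depend on the corpus data downloaded at runtime.

# -*- coding: utf-8 -*-
# Usage sketch (assumes this patch is applied); keep_all is a hypothetical example filter.
from pythainlp.corpus import ttc
from pythainlp.spell.pn import NorvigSpellChecker, correct, spell

# Module-level helpers use the default checker built from the Thai National Corpus.
print(spell("เหลืยม"))    # candidate list, most probable first
print(correct("เหลืยม"))  # single best candidate


def keep_all(word):
    """Example filter that keeps every word (same effect as dict_filter=None)."""
    return True


# A checker built from Thai Textbook Corpus word frequencies instead of the default.
ttc_checker = NorvigSpellChecker(
    custom_dict=ttc.get_word_frequency_all(),  # list of (word, frequency) tuples
    min_freq=2,       # drop words seen fewer than 2 times
    min_len=2,        # drop 1-character entries
    max_len=40,       # drop very long entries
    dict_filter=keep_all,
)
print(len(ttc_checker.dictionary()))             # size of the filtered dictionary
print(ttc_checker.known(["เหลี่ยม", "เหลืยม"]))   # which of these are in the dictionary
print(ttc_checker.prob("เหลี่ยม"))                # relative frequency of the word
print(ttc_checker.correct("เหลืยม"))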
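The _doc2features() rewrite above builds one CRF feature dict per token from the current, previous, and next (word, POS) pairs, with BOS/EOS markers at the sequence edges. Below is a simplified, self-contained sketch of that feature template for illustration only; it re-implements a subset rather than importing the private pythainlp.ner helpers, is_thai() stands in for pythainlp.util.is_thaiword, and the sample POS tags are illustrative.

# Simplified sketch of the per-token CRF feature template used in pythainlp/ner above.
def is_thai(word):
    # Thai block is U+0E00-U+0E7F (3584-3711); "." is allowed, as in the patch.
    return all(ch == "." or 3584 <= ord(ch) <= 3711 for ch in word)


def doc2features(doc, i):
    """doc is a list of (word, pos) pairs; return the feature dict for token i."""
    word, postag = doc[i]
    features = {
        "word.word": word,
        "word.isthai": is_thai(word),
        "word.isspace": word.isspace(),
        "postag": postag,
        "word.isdigit()": word.isdigit(),
    }

    if i > 0:  # features from the previous token
        prevword, prevpostag = doc[i - 1]
        features.update({
            "word.prevword": prevword,
            "word.prevpostag": prevpostag,
            "word.prevwordisdigit": prevword.isdigit(),
        })
    else:
        features["BOS"] = True  # beginning of sequence

    if i < len(doc) - 1:  # features from the next token
        nextword, nextpostag = doc[i + 1]
        features.update({
            "word.nextword": nextword,
            "word.nextpostag": nextpostag,
            "word.nextwordisdigit": nextword.isdigit(),
        })
    else:
        features["EOS"] = True  # end of sequence

    return features


doc = [("นายก", "NCMN"), ("ชื่อ", "VACT"), ("สมชาย", "NPRP")]  # (word, POS); tags illustrative
print([doc2features(doc, i) for i in range(len(doc))])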