Consistent naming and consolidate similar codes #146

Merged
merged 20 commits into from
Nov 4, 2018
Changes from 11 commits
89 changes: 29 additions & 60 deletions docs/pythainlp-dev-thai.md
@@ -393,36 +393,25 @@ from pythainlp.change import *

### soundex

Credit: Korakot Chaovavanich https://gist.github.com/korakot/0b772e09340cac2f493868da035597e8

Supported rules
- LK82 - soundex encoding rules by วิชิต หล่อจีระชุณห์กุล and เจริญ คุวินทร์พันธุ์
- Udom83 - soundex encoding rules by วรรณี อุดมพาณิชย์

**Usage**
- lk82 - soundex encoding rules by วิชิต หล่อจีระชุณห์กุล and เจริญ คุวินทร์พันธุ์
- udom83 - soundex encoding rules by วรรณี อุดมพาณิชย์

```python
from pythainlp.soundex import LK82, Udom83

print(LK82("รถ")) # ร3000
print(LK82("รด")) # ร3000
print(LK82("จัน")) # จ4000
print(LK82("จันทร์")) # จ4000
print(Udom83("รถ")) # ร800000
```

### MetaSound for Thai

```
Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical Ontology for Analysing Names Given in Accordance with Thai Astrology. Retrieved from https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
```
Credits
- Korakot Chaovavanich https://gist.github.com/korakot/0b772e09340cac2f493868da035597e8
- Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical Ontology for Analysing Names Given in Accordance with Thai Astrology. Retrieved from https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf

**Usage**

```python
from pythainlp.metasound import metasound
from pythainlp.soundex import lk82, metasound, udom83

metasound("รัก") # 'ร100'
print(lk82("รถ")) # ร3000
print(lk82("รด")) # ร3000
print(lk82("จัน")) # จ4000
print(lk82("จันทร์")) # จ4000
print(udom83("รถ")) # ร800000
print(metasound("รัก")) # 'ร100'
```
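
The value of these codes is that different spellings of the same sound compare equal. Below is a minimal sketch based only on the example outputs shown above:

```python
from pythainlp.soundex import lk82

# "จัน" and "จันทร์" encode to the same LK82 code (จ4000 in the examples above),
# so comparing codes gives a rough "sounds alike" test.
print(lk82("จัน") == lk82("จันทร์"))  # True
```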

### sentiment
@@ -447,20 +436,9 @@ sentiment(str)
from pythainlp.util import *
```

#### ngrams

For creating n-grams

```python
ngrams(token, num)
```

- token is a list
- num is the n-gram size

#### bigrams

For creating bigrams
Creates bigrams

```python
bigrams(token)
Expand All @@ -473,7 +451,7 @@ bigrams(token)
For creating trigrams

```python
trigram(token)
trigrams(token)
```

- token is a list
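
As a small illustration of the renamed helpers, here is a sketch that assumes bigrams and trigrams accept a token list as documented above (the exact return type is not specified here and may differ):

```python
from pythainlp.util import bigrams, trigrams

tokens = ["ผม", "รัก", "ภาษา", "ไทย"]
print(list(bigrams(tokens)))   # adjacent token pairs
print(list(trigrams(tokens)))  # adjacent token triples
```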
@@ -559,61 +537,52 @@ print(wordnet.synset("spy.n.01").lemma_names("tha"))
#### Thai stopwords

```python
from pythainlp.corpus import stopwords
stopwords = stopwords.words("thai")
from pythainlp.corpus import thai_stopwords
stopwords = thai_stopwords()
```
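
A typical use of thai_stopwords() is filtering a token list. A minimal sketch, assuming the input has already been word-segmented:

```python
from pythainlp.corpus import thai_stopwords

stops = thai_stopwords()  # frozenset of Thai stopwords
tokens = ["ฉัน", "จะ", "ไป", "ทะเล"]
content = [t for t in tokens if t not in stops]
print(content)  # function words such as "จะ" are dropped if they appear in the list
```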

#### Country names in Thai

```python
from pythainlp.corpus import country
country.get_data()
from pythainlp.corpus import countries
countries()
```

#### Tone mark characters in Thai
#### Tone marks in Thai

```python
from pythainlp.corpus import tone
tone.get_data()
from pythainlp.corpus import THAI_TONEMARKS
```

#### Consonant characters in Thai
#### Consonants in Thai

```python
from pythainlp.corpus import alphabet
alphabet.get_data()
from pythainlp.corpus import THAI_ALPHABETS
```
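
Since these corpus constants are plain strings, membership tests and length checks work directly. A small sketch using the constants introduced in this pull request:

```python
from pythainlp.corpus import THAI_ALPHABETS, THAI_TONEMARKS

print(len(THAI_ALPHABETS))         # 44 consonants
print("ก" in THAI_ALPHABETS)       # True
print("\u0e48" in THAI_TONEMARKS)  # True (mai ek tone mark)
```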

#### Thai word list

```python
from pythainlp.corpus.thaiword import get_data # old data
get_data()

from pythainlp.corpus.newthaiword import get_data # new data
get_data()
from pythainlp.corpus import thai_words
thai_words()
```
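
Because thai_words() returns a frozenset, it can be used directly as a dictionary for membership tests. A brief sketch:

```python
from pythainlp.corpus import thai_words

words = thai_words()
print(len(words))       # size of the bundled word list
print("ภาษา" in words)  # True if the word is in the bundled list
```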

#### provinces

Names of provinces in Thailand

##### get_data

Gets the names of provinces in Thailand

```python
get_data()
from pythainlp.corpus import provinces
provinces()
```

Returns a list

##### parsed_docs
##### tag_provinces

For tagging the names of provinces in Thailand

```python
parsed_docs(text_list)
from pythainlp.ner.locations import tag_provinces
tag_provinces(text_list)
```

text_list is a list of Thai text that has already been word-tokenized
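
For instance, a hypothetical call on a pre-tokenized sentence might look like the sketch below; the exact tag labels returned by tag_provinces are an assumption here:

```python
from pythainlp.ner.locations import tag_provinces
from pythainlp.tokenize import word_tokenize

text_list = word_tokenize("ผมไปเชียงใหม่")  # tokenize first, as noted above
print(tag_provinces(text_list))
# Expected shape: a list of (word, tag) pairs in which province names
# such as "เชียงใหม่" are marked; the precise labels depend on the implementation.
```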
2 changes: 1 addition & 1 deletion pythainlp/__init__.py
@@ -10,6 +10,6 @@
from pythainlp.spell import spell
from pythainlp.tag import pos_tag
from pythainlp.tokenize import etcc, sent_tokenize, tcc, word_tokenize
from pythainlp.util import bigrams, ngrams, trigram
from pythainlp.util import bigrams, trigrams

__version__ = 1.7
3 changes: 0 additions & 3 deletions pythainlp/chunk/__init__.py

This file was deleted.

91 changes: 81 additions & 10 deletions pythainlp/corpus/__init__.py
@@ -2,25 +2,96 @@

import os

from pythainlp.tools import get_full_data_path, get_pythainlp_path
import requests
from pythainlp.tools import get_path_data, get_path_db
from tinydb import Query, TinyDB
from tqdm import tqdm
from urllib.request import urlopen

CORPUS_DB_URL = (
_CORPUS_DIRNAME = "corpus"
CORPUS_PATH = os.path.join(get_pythainlp_path(), _CORPUS_DIRNAME)

_CORPUS_DB_URL = (
"https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json"
)
_CORPUS_DB_FILENAME = "db.json"
CORPUS_DB_PATH = get_full_data_path(_CORPUS_DB_FILENAME)
if not os.path.exists(CORPUS_DB_PATH):
    TinyDB(CORPUS_DB_PATH)

_THAI_COUNTRIES_FILENAME = "countries_th.txt"
_THAI_THAILAND_PROVINCES_FILENAME = "thailand_provinces_th.txt"
_THAI_SYLLABLES_FILENAME = "syllables_th.txt"
_THAI_WORDS_FILENAME = "words_th.txt"
_THAI_STOPWORDS_FILENAME = "stopwords_th.txt"

_THAI_NEGATIONS = frozenset(["ไม่", "แต่"])

THAI_NUMBERS = "๐๑๒๓๔๕๖๗๘๙" # 10
THAI_ALPHABETS = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44
THAI_VOWELS = "ฤฦะ\u0e31าำ\u0e34\u0e35\u0e36\u0e37\u0e38\u0e39เแโใไ\u0e45" # 18
THAI_SYMBOLS = "ฯ\u0e3a฿ๆ\u0e47\u0e4c\u0e4d\u0e4e\u0e4f\u0e5a\u0e5b" # 11
THAI_TONEMARKS = "\u0e48\u0e49\u0e4a\u0e4b" # 4
THAI_LETTERS = "".join(
    [THAI_ALPHABETS, THAI_VOWELS, THAI_TONEMARKS, THAI_SYMBOLS]
) # 77


def get_corpus(filename):
"""
Read corpus from file and return a frozenset
"""
lines = []
with open(os.path.join(CORPUS_PATH, filename), "r", encoding="utf8") as fh:
lines = fh.read().splitlines()
return frozenset(lines)


def countries():
"""
Return a frozenset of country names in Thai
"""
return get_corpus(_THAI_COUNTRIES_FILENAME)


def provinces():
"""
Return a frozenset of Thailand province names in Thai
"""
return get_corpus(_THAI_THAILAND_PROVINCES_FILENAME)


def thai_syllables():
"""
Return a frozenset of Thai syllables
"""
return get_corpus(_THAI_SYLLABLES_FILENAME)


def thai_words():
"""
Return a frozenset of Thai words
"""
return get_corpus(_THAI_WORDS_FILENAME)


def thai_stopwords():
"""
Return a frozenset of Thai stopwords
"""
# TODO: Cache? Not reading the disk everytime
return get_corpus(_THAI_STOPWORDS_FILENAME)


# __all__ = ["thaipos", "thaiword", "alphabet", "tone", "country", "wordnet"]
path_db_ = get_path_db()
def thai_negations():
    return _THAI_NEGATIONS


def get_file(name):
    db = TinyDB(path_db_)
    db = TinyDB(CORPUS_DB_PATH)
    temp = Query()
    if len(db.search(temp.name == name)) > 0:
        path = get_path_data(db.search(temp.name == name)[0]["file"])
        path = get_full_data_path(db.search(temp.name == name)[0]["file"])
        db.close()
        if not os.path.exists(path):
            download(name)
@@ -48,7 +119,7 @@ def download_(url, dst):
        desc=url.split("/")[-1],
    )
    req = requests.get(url, headers=header, stream=True)
    with (open(get_path_data(dst), "wb")) as f:
    with (open(get_full_data_path(dst), "wb")) as f:
        for chunk in req.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
@@ -58,9 +129,9 @@ def download_(url, dst):


def download(name, force=False):
    db = TinyDB(path_db_)
    db = TinyDB(CORPUS_DB_PATH)
    temp = Query()
    data = requests.get(CORPUS_DB_URL)
    data = requests.get(_CORPUS_DB_URL)
    data_json = data.json()
    if name in list(data_json.keys()):
        temp_name = data_json[name]
@@ -119,7 +190,7 @@ def download(name, force=False):


def remove(name):
    db = TinyDB(path_db_)
    db = TinyDB(CORPUS_DB_PATH)
    temp = Query()
    data = db.search(temp.name == name)
    if len(data) > 0:
57 changes: 0 additions & 57 deletions pythainlp/corpus/alphabet.py

This file was deleted.

2 changes: 1 addition & 1 deletion pythainlp/corpus/corpus_license.md
@@ -42,7 +42,7 @@ Technology and LICENSEE agrees to preserve same.

## Dictionaries and Word Lists

thaiword.txt, new-thaidict.txt, stopwords-th.txt, stopwords-th1.txt, stopwords-th2.txt, stopwords-th3.txt, stopwords-th4.txt, stopwords-th-old.txt, and คำมูล-คำอ่าน.db use Creative Commons Attribution-ShareAlike 4.0 International Public License
words_th.txt, stopwords_th.txt, and คำมูล-คำอ่าน.db use Creative Commons Attribution-ShareAlike 4.0 International Public License

## Creative Commons
