Init.

IDDT · Jan 21, 2021 · ad75b6a · ad75b6a
commit ad75b6a
Show file tree

Hide file tree

Showing 21 changed files with 7,801 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+__pycache__/
+/temp/*
+!/temp/.keep
+!/temp/bpe_merges_*
+/build/
+/dist/
+/thai_tokenizer.egg-info/
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Kirill Orlov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/readme.md b/readme.md
@@ -0,0 +1,46 @@
+# Thai Tokenizer
+Fast and accurate Thai tokenization library using supervised BPE training designed for full-text search applications.
+
+
+
+## Installation
+```bash
+pip3 install thai_tokenizer
+```
+
+
+
+## Usage
+The default set of pairs is optimized for short Thai-English product descriptions.
+```python
+from thai_tokenizer import Tokenizer, bpe_merges
+tokenizer = Tokenizer(bpe_merges)
+tokenizer('iPad Mini 256GB เครื่องไทย') #> 'iPad Mini 256GB เครื่อง ไทย'
+tokenizer.split('เครื่องไทย') #> ['เครื่อง', 'ไทย']
+```
+
+
+
+## Training
+It may be desirable to train your own pairs to capture the specifics of the dataset at hand. Deciding which pairs should be merged can be highly subjective. As a general guideline, here are some of the pairs that the original author deemed separate:
+```
+"ส่ง" + "ฟรี" | free + delivery = freedelivery
+"เจ้า" + "หญิง" | royal + woman = royalwoman (princess)
+"งาน" + "แท้" | real + work = realwork (genuine)
+"รอง" + "เท้า" | support + foot = supportfoot (shoe)
+```
+```bash
+git clone https://github.com/IDDT/thai-tokenizer thai_tokenizer
+cd thai_tokenizer
+python3 -m thai_tokenizer --help
+```
+
+
+
+## Contributing
+Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
+
+
+
+## License
+[MIT](https://choosealicense.com/licenses/mit/)
diff --git a/sample_bpe.py b/sample_bpe.py
@@ -0,0 +1,9 @@
+import json
+from thai_tokenizer import segment, ThaiTokenizer
+
+
+with open('temp/bpe_merges.jsonl', 'rt', encoding='utf-8') as f:  # explicit UTF-8 instead of the locale default
+    tokenizer = ThaiTokenizer((json.loads(x)[0] for x in f))  # feed the first element of each JSON line
+
+
+tokenizer.tokenize('ศูนย์รวมล้อเดิมป้ายแดง')
diff --git a/setup.py b/setup.py
@@ -0,0 +1,32 @@
+import setuptools
+
+
+
+setuptools.setup(  # package metadata and build configuration for thai_tokenizer
+    name='thai_tokenizer',
+    version='0.1.0',
+    description='Fast and accurate Thai tokenization library.',
+    url='https://github.com/IDDT/thai-tokenizer',
+    author='Kirill Orlov',
+    author_email='IDDT@users.noreply.github.com',
+    license='MIT',
+    classifiers=[
+        'Development Status :: 2 - Pre-Alpha',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: MIT License',
+        'Natural Language :: Thai',
+        'Operating System :: OS Independent',
+        'Topic :: Text Processing :: Linguistic'
+    ],
+    keywords=['thai', 'tokenizer'],
+    packages=setuptools.find_packages(exclude=['tests']),  # every discovered package except 'tests'
+    python_requires='>=3.6',
+    include_package_data=True,
+    package_data={'': ['data/bpe_merges.jsonl']},  # '' key: pattern applies to all packages
+    test_suite='tests',
+    zip_safe=True
+)
diff --git a/sort_pairs.py b/sort_pairs.py
@@ -0,0 +1,16 @@
+import sys
+import json
+
+
+
+pairs = []  # one tuple per non-blank JSON line of the file named by argv[1]
+with open(sys.argv[1], 'rt', encoding='utf-8') as f:  # don't rely on the locale's default encoding
+    for line in (x.strip() for x in f):
+        if line:  # skip blank lines
+            pairs.append(tuple(json.loads(line)))  # tuple: hashable, so set() can dedupe below
+
+with open(sys.argv[1], 'wt', encoding='utf-8') as f:  # overwrite the input file in place
+    for pair in sorted(set(pairs)):  # drop duplicates, write in sorted order
+        f.write(json.dumps(pair, ensure_ascii=False) + '\n')  # raw non-ASCII output needs UTF-8
+
+print('Done.')
diff --git a/temp/.keep b/temp/.keep