-
Notifications
You must be signed in to change notification settings - Fork 5
/
CharLevelLanguageModel.py
54 lines (44 loc) · 1.68 KB
/
CharLevelLanguageModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from collections import defaultdict, Counter
from pathlib import Path
from random import random
class CharLevelLanguageModel:
    """Character-level n-gram language model.

    Training counts which character follows every ``ngram``-character
    context in a corpus; generation samples text one character at a time
    from those conditional distributions.
    """

    def __init__(self, ngram=10):
        """Create an untrained model.

        Args:
            ngram: Number of preceding characters used as the context.

        Raises:
            ValueError: If ``ngram`` is smaller than 1 (the context slicing
                ``history[-ngram:]`` is only meaningful for positive sizes).
        """
        if ngram < 1:
            raise ValueError('ngram must be a positive integer')
        self.ngram = ngram
        # Maps a context string to a list of (char, probability) pairs.
        self.__lm = {}
        # Padding symbol that fills the context before the first real char.
        self.__start = '~'

    def train(self, path_to_corpus_file, encoding=None):
        """Fit the model on a text file, replacing any earlier training.

        Args:
            path_to_corpus_file: Path of the corpus to read.
            encoding: Text encoding passed to ``Path.open``; ``None`` keeps
                the platform default.

        Returns:
            The model itself, so calls can be chained.
        """
        print('Get data from the file...')
        with Path(path_to_corpus_file).open(encoding=encoding) as f:
            print('Collecting of letters\' probabilities...')
            text = f.read()

        # Count into a local table so that train() can be called repeatedly
        # without touching the finished probability table mid-way.
        counts = defaultdict(Counter)
        context = self.__start * self.ngram
        for char in text:
            counts[context][char] += 1
            context = context[1:] + char

        self.__lm = {hist: self.__normalize(chars)
                     for hist, chars in counts.items()}
        return self

    @staticmethod
    def __normalize(counter):
        """Turn raw counts into a list of ``(char, probability)`` pairs."""
        total = float(sum(counter.values()))
        return [(c, cnt / total) for c, cnt in counter.items()]

    def __generate_letter(self, history):
        """Sample the next character for the last ``ngram`` chars of *history*.

        Raises:
            RuntimeError: If the model holds no distribution for the context
                (the model is untrained or the corpus was empty).
        """
        dist = self.__lm.get(history[-self.ngram:])
        if not dist:
            raise RuntimeError(
                'the model has no data to generate from; '
                'call train() with a non-empty corpus first')
        x = random()
        c = None
        for c, v in dist:
            x -= v
            if x <= 0:
                return c
        # Float round-off can leave a tiny positive remainder after the last
        # candidate; fall back to that candidate instead of returning None.
        return c

    def __generate_tail(self, history, n_letters):
        """Yield *n_letters* sampled characters, continuing from *history*."""
        start = self.__start * self.ngram
        for _ in range(n_letters):
            context = history[-self.ngram:]
            if context not in self.__lm:
                # Dead end: this context only occurs at the very end of the
                # corpus, so nothing is known to follow it. Restart from the
                # initial padding rather than failing.
                context = start
            c = self.__generate_letter(context)
            history = context + c
            yield c

    def generate_text(self, n_letters=1000):
        """Return a string of *n_letters* characters sampled from the model.

        Raises:
            RuntimeError: If ``n_letters`` is positive and the model has not
                been trained on a non-empty corpus.
        """
        print('Generating of an example of text...')
        history = self.__start * self.ngram
        return "".join(self.__generate_tail(history, n_letters))
if __name__ == '__main__':
    # Demo run: fit a model with a 15-character context on the bundled
    # corpus and print one generated sample (default length).
    corpus_path = 'resources/corpus/Dostoevsky.txt'
    print(
        CharLevelLanguageModel(ngram=15)
        .train(corpus_path)
        .generate_text()
    )