-
Notifications
You must be signed in to change notification settings - Fork 0
/
005.py
executable file
·35 lines (28 loc) · 1.07 KB
/
005.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
# String types recognised by n_gram; resolved once so the function runs on
# both Python 2 (str and unicode) and Python 3 (str only).
try:
    _STRING_TYPES = (str, unicode)
except NameError:
    _STRING_TYPES = (str,)


def n_gram(seq, n=1, mode='word', nested=False):
    """Build the n-grams of a sequence.

    Args:
        seq: A string, or any sequence that supports ``len()`` and slicing
            (for example a list of tokens).
        n: Size of each n-gram; must be at least 1.
        mode: ``'word'`` or ``'char'``. For string input it selects the
            tokenisation: ``'word'`` splits on spaces, commas and periods
            (dropping empty tokens), ``'char'`` splits into single
            characters. For non-nested output it also selects the joiner:
            a space for ``'word'``, the empty string for ``'char'``.
        nested: If true, return the raw slices of the tokenised sequence
            instead of joined strings.

    Returns:
        A list with one entry per window of exactly ``n`` consecutive
        items. The list is empty when the sequence has fewer than ``n``
        items.

    Raises:
        ValueError: If ``n`` is smaller than 1, or if ``nested`` is false
            and ``mode`` is neither ``'word'`` nor ``'char'``.
    """
    if n < 1:
        raise ValueError('n must be at least 1, got %r' % (n,))

    if isinstance(seq, _STRING_TYPES):
        if mode == 'word':
            seq = [w for w in re.split(r' |,|\.', seq) if w]
        elif mode == 'char':
            # A slice of a string is itself a string, so convert the
            # string to a list of characters first.
            seq = list(seq)

    # Stop at len(seq) - n so that every window holds exactly n items;
    # iterating over every index would append truncated trailing windows.
    grams = [seq[i:i + n] for i in range(len(seq) - n + 1)]

    if nested:
        return grams
    if mode == 'word':
        return [' '.join(gram) for gram in grams]
    if mode == 'char':
        return [''.join(gram) for gram in grams]
    raise ValueError("mode must be 'word' or 'char', got %r" % (mode,))
def main():
    """05. n-gram

    Write a function that builds n-grams from a given sequence (a string,
    a list, etc.). Use that function to obtain the word bi-grams and the
    character bi-grams of the sentence "I am an NLPer".

    Prints four results for the sentence: word bi-grams and character
    bi-grams, each in joined and in nested form.
    """
    s = 'I am an NLPer'
    # print with a single parenthesised argument produces the same output
    # as the Python 2 print statement and is also valid on Python 3.
    print(n_gram(s, 2))
    print(n_gram(s, 2, nested=True))
    print(n_gram(s, 2, mode='char'))
    print(n_gram(s, 2, mode='char', nested=True))
# Run the exercise only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()