-
Notifications
You must be signed in to change notification settings - Fork 1
/
Stemming.py
66 lines (51 loc) · 1.84 KB
/
Stemming.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# -*- coding: utf-8 -*-
"""
@author:sourabh garg
"""
import Trie
import words_tagging
from words_tagging import all_noun
from Declension import Declension_noun
from Tokenize import complete_tokenize,join
def search_noun(word, noun):
    """
    Check whether ``word`` occurs among the declined forms of ``noun``.

    The table returned by ``Declension_noun(noun)`` is scanned row by row and
    every entry is compared with ``word`` for exact equality.

    Args:
        word: the form to look for.
        noun: the noun whose declension table is searched.

    Returns:
        True if some entry of the declension equals ``word``, otherwise False.
    """
    for row in Declension_noun(noun):
        for form in row:
            if word == form:
                return True
    # Return an explicit False on a miss so the predicate always yields a
    # bool instead of implicitly falling through to None.
    return False
def initialize():
    """
    Build and return a trie containing every known noun.

    ``words_tagging.all_noun`` is walked as a collection of noun groups; each
    noun is tokenized with ``complete_tokenize`` and the resulting token
    sequence is inserted into a fresh ``Trie.Trie``.
    """
    noun_trie = Trie.Trie()
    # Flatten the groups lazily, preserving the original insertion order.
    every_noun = (
        entry
        for noun_group in words_tagging.all_noun
        for entry in noun_group
    )
    for entry in every_noun:
        noun_trie.insert(complete_tokenize(entry))
    return noun_trie
# Module-level trie consulted by stem(); built once when the module is loaded.
# (An empty Trie used to be created here first and was immediately replaced,
# so that redundant construction has been dropped.)
mytrie = initialize()
def stem(word):
    """
    Return the stem of an inflected word, or None when no stem is found.

    The word is tokenized with ``complete_tokenize`` and progressively shorter
    prefixes of the token list are looked up in the module-level trie
    ``mytrie``. Each candidate returned by the trie is joined back into a
    word with ``join``; the first candidate whose declension contains
    ``word`` (as checked by ``search_noun``) is returned.

    Args:
        word: the inflected word to stem. If it contains a space it is
            treated as a vocative form and its first two tokens are skipped
            for the trie lookup.

    Returns:
        The matching noun, or None if no prefix of the word leads to a noun
        whose declension contains ``word``.
    """
    tokens = complete_tokenize(word)
    if ' ' in word:
        # Vocative forms carry a leading 'हे' which must be removed before
        # searching the trie. NOTE(review): dropping exactly two tokens
        # presumably covers 'हे' plus the space -- confirm against
        # complete_tokenize.
        tokens = tokens[2:]

    # Try the full token list first, then truncate one token at a time.
    for end in range(len(tokens), 0, -1):
        for candidate in mytrie.find(tokens[:end]):
            joined_word = join(candidate)
            # The original, untruncated word is what gets matched against
            # the candidate's declension.
            if search_noun(word, joined_word):
                return joined_word
    return None
if __name__ == '__main__':
    # Demo run: print the stem found for a few sample inflected words.
    for sample in ('नद्योः', 'पोत्रोः', 'हे क्षुधः'):
        print(stem(sample))