-
Notifications
You must be signed in to change notification settings - Fork 9
/
main.py
89 lines (70 loc) · 2.45 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import sys
import nltk
import glob
from nltk.tokenize import RegexpTokenizer
from nltk import PorterStemmer
def combine_indexes(words_list_stemmed, files_list):
    """Find the files that contain every indexed query term.

    Builds the inverted index for ``files_list`` and intersects the file
    sets of the query terms that appear in it. Query terms that are absent
    from the index are skipped; they do not force an empty result.

    Args:
        words_list_stemmed: stemmed query terms to look up.
        files_list: file paths handed to ``create_inverse_index``.

    Returns:
        A tuple ``(files, sum_freq)``. ``files`` is a list (in arbitrary
        order, possibly empty) of the paths shared by all matched terms and
        ``sum_freq`` is the summed occurrence count of those terms. When no
        query term is in the index the result is
        ``(["No results found"], 0)``.
    """
    index, freq_word = create_inverse_index(files_list)
    sum_freq = 0
    index_list = []
    print(words_list_stemmed)
    for term in words_list_stemmed:
        # Test membership on the dict itself: a hash lookup, whereas
        # ``index.keys()`` first builds a full key list on Python 2.
        if term in index:
            print("Term is " + str(term))
            print("Index term " + str(index[term]))
            index_list.append(index[term])
            sum_freq = sum_freq + freq_word[term]
    print("Index list " + str(index_list))
    # Every matched term contributes at least one occurrence, so a zero
    # total means nothing matched and there are no sets to intersect.
    if sum_freq:
        index_result = list(set.intersection(*index_list))
        print("Index result is " + str(index_result))
        return index_result, sum_freq
    return ["No results found"], 0
def parse_input(word):
    """Split a raw query string into its individual terms.

    Surrounding whitespace is stripped, then the query is split on the
    first delimiter found when checking, in order: comma, space, semicolon,
    colon. Only that one delimiter is used for splitting. A query without
    any of them is returned as a single-element list.
    """
    query = word.strip()
    # Delimiters in priority order; the first one present wins.
    for delimiter in (',', ' ', ';', ':'):
        if delimiter in query:
            return query.split(delimiter)
    return [query]
# Built once and reused: stemming() runs for every token while indexing, so
# constructing a new stemmer on each call is wasted work.
_PORTER_STEMMER = PorterStemmer()


def stemming(word):
    """Return the Porter stem of *word*, lower-cased first.

    Calls the public ``PorterStemmer.stem`` method instead of the internal
    ``stem_word`` helper, which recent NLTK releases no longer provide.
    """
    return _PORTER_STEMMER.stem(word.lower())
def create_inverse_index(files_list):
    r"""Build an inverted index and occurrence counts for a set of files.

    Each file is read in full, split into ``\w+`` tokens, and every token
    is normalised with ``stemming`` before it is recorded.

    Args:
        files_list: iterable of file paths to read.

    Returns:
        A tuple ``(index, freq_word)``. ``index`` maps each stemmed token
        to the set of paths in which it occurs; ``freq_word`` maps each
        stemmed token to its total number of occurrences over all files.

    Raises:
        IOError / OSError: propagated from ``open`` when a file cannot be
            opened or read.
    """
    index = {}
    freq_word = {}
    # The pattern never changes, so one tokenizer serves every file.
    tokenizer = RegexpTokenizer(r'\w+')
    for f in files_list:
        # Close the handle as soon as the content is read instead of
        # leaving it to the garbage collector.
        with open(f) as handle:
            file_content = handle.read()
        for token in tokenizer.tokenize(file_content):
            word = stemming(token)
            # Count every occurrence, but store each path only once per
            # word; dict lookups avoid scanning a key list per token.
            freq_word[word] = freq_word.get(word, 0) + 1
            index.setdefault(word, set()).add(f)
    return index, freq_word
def search(term, files_list):
    """Run a query string against the given files.

    The query is split with ``parse_input``, each piece is stripped and
    stemmed with ``stemming``, and the stemmed terms are passed to
    ``combine_indexes`` together with ``files_list``.

    Returns:
        The ``(index_result, sum_freq)`` tuple produced by
        ``combine_indexes``.
    """
    raw_terms = parse_input(term)
    print("WOrds list is " + str(raw_terms))
    stemmed_terms = []
    for raw in raw_terms:
        stemmed_terms.append(stemming(raw.strip()))
    return combine_indexes(stemmed_terms, files_list)
if __name__ == '__main__':
    # Demo run: look up the term 'html' in three hard-coded documents.
    # The tuple returned by search() is discarded; only the progress
    # output printed along the way is visible.
    files_list = [
        'adventur.txt',
        'apples.txt',
        'hVDacrN0.html',
    ]
    search('html', files_list)