-
Notifications
You must be signed in to change notification settings - Fork 1
/
app.py
161 lines (139 loc) · 6.4 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from flask import Flask, jsonify, request
import numpy as np
import pandas as pd
import re
import joblib
import time
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os
import gensim
import smart_open
import warnings
warnings.filterwarnings('ignore')
# create app
import flask  # module object kept for flask.render_template calls in the views below
app = Flask(__name__)
################################## Helper functions ######################################################################################
# below mentioned functions are helper functions which is use for text preprocessing
def decontracted(phrase):
    """Expand common English contractions in *phrase* and strip newlines.

    Rules are applied in order: the specific forms (won't, can't) run
    before the generic n't rule so they expand correctly.
    """
    # (pattern, replacement) pairs — order matters.
    rules = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
        (r"\n", ""),
    )
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase
# English stopword list (NLTK-style) plus 'br' — residue of <br> tags in scraped text.
stopwords = set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
                 "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
                 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
                 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
                 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
                 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
                 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
                 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
                 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
                 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
                 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
                 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
                 "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
                 "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
                 'won', "won't", 'wouldn', "wouldn't"])

def remove_stopwords(text):
    """Return *text* with stopwords removed, words joined by single spaces.

    Uses str.join instead of the original quadratic `+=` concatenation;
    this also drops the trailing space the old loop left behind (the
    only caller splits the result, so that artifact was unused).
    """
    return ' '.join(word for word in text.split() if word not in stopwords)
def preprocess_title(text):
    """Normalise a question title for embedding.

    Lower-cases, expands contractions, strips every character that is
    not a-z, '#' or '+' (keeping '#'/'+' so tokens like 'c#' and 'c++'
    survive), then removes stopwords.
    """
    # convert to lower case
    text = text.lower()
    # decontract
    text = decontracted(text)
    # Keep only a-z, '#' and '+'. The original class '[^a-zc#c++]+' is
    # exactly equivalent ('c' is inside a-z and '+' was duplicated) but
    # read as if it matched the literal strings 'c#'/'c++', which a
    # character class cannot do.
    text = re.sub('[^a-z#+]+', ' ', text)
    # remove stop words
    text = remove_stopwords(text)
    return text
# ---- Load training data and pre-trained artefacts at import time ----
# load train data; must contain 'title' and 'url' columns (read by /predict)
X_train = pd.read_csv('X_train_limited.csv')
print('X_train Loaded')
# trained gensim w2v model on train data
# Precomputed tfidf-weighted w2v vectors, one row per training question,
# compared against the query vector via cosine similarity in /predict.
tfidf_w2v_vectors_gensim = joblib.load('gensim_tfidf_w2v_vectors_limited.pkl')
print('tfidf_w2v_vectors_gensim loaded')
# Dictionary of words and idf value
# Presumably a fitted sklearn TfidfVectorizer (exposes get_feature_names and idf_) — verify.
tfidf_gensim = joblib.load('tfidf.pkl')
print('tfidf_gensim Loaded')
# trained w2v model using gensim
w2v_model_gensim = joblib.load('gensim_w2v_model.pkl')
print('w2v_model_gensim loaded')
# w2vec words vocabulary
# NOTE(review): .wv.vocab was removed in gensim 4 (use .wv.key_to_index);
# this code requires gensim < 4 — confirm the pinned version.
w2v_words = list(w2v_model_gensim.wv.vocab)
# tfidf words
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2
# (use get_feature_names_out()) — confirm the pinned version.
tfidf_words = set(tfidf_gensim.get_feature_names())
# dictionary of tfidf and idf values
dictionary = dict(zip(tfidf_gensim.get_feature_names(),tfidf_gensim.idf_))
######################################### route paths ##################################################
#@app.route('/')
#def hello_world():
# return 'Hello World!'
@app.route('/')
def index():
    """Serve the landing page (templates/index.html) with the query form."""
    return flask.render_template('index.html')
@app.route('/predict', methods=['POST'])
def final_func(top_n=10):
    """Find the top_n training questions most similar to the user's query.

    The query (form field 'user_question') is preprocessed, embedded as a
    tfidf-weighted average of gensim word2vec vectors, and compared to the
    precomputed training vectors by cosine similarity. Renders
    response.html with a list of (title, url) pairs.
    """
    from collections import Counter

    start = time.time()
    # Accumulator for the query embedding.
    # NOTE(review): 60 must equal the trained w2v vector size — confirm.
    main_vec = np.zeros(60)
    # Running sum of tfidf weights, for the weighted average.
    weight_sum = 0
    # get text from user
    to_predict_list = request.form.to_dict()
    # preprocess question
    text = preprocess_title(to_predict_list['user_question'])
    tokens = text.split()
    n_tokens = len(tokens)
    # Count each token once instead of calling list.count(word) per
    # occurrence (the original was O(n^2)). Multiplying by `count`
    # reproduces the original per-occurrence accumulation exactly.
    for word, count in Counter(tokens).items():
        # word must be known to both the tfidf vocabulary and the w2v model
        if word in tfidf_words and word in w2v_words:
            vect = w2v_model_gensim[word]
            # tfidf weight: idf * term frequency within the query
            tf_idf = dictionary[word] * (count / n_tokens)
            main_vec += vect * tf_idf * count
            weight_sum += tf_idf * count
    if weight_sum != 0:
        # normalise to the weighted average (zero vector left as-is
        # when no query word is known)
        main_vec /= weight_sum
    # cosine similarity against the precomputed tfidf-w2v training vectors
    similarities = cosine_similarity(main_vec.reshape(1, -1), Y=tfidf_w2v_vectors_gensim, dense_output=True)[0]
    # indices of the top_n most similar training questions, best first
    top_similarity_index = np.argsort(similarities)[::-1][:top_n]
    print('Top cosine similarities are ======>', similarities[top_similarity_index])
    similar_questions_title = X_train['title'][top_similarity_index]
    similar_questions_url = X_train['url'][top_similarity_index]
    total_time = (time.time() - start)
    print('\t')
    print('Total time ===========> ', total_time)
    dictt = {'top_10': list(zip(similar_questions_title, similar_questions_url))}
    return flask.render_template('response.html', res=dictt)
if __name__ == '__main__':
    # Listen on all interfaces; debug=True enables the reloader/debugger
    # and is for development only — do not ship to production.
    app.run(host='0.0.0.0', port=8080, debug=True)