-
Notifications
You must be signed in to change notification settings - Fork 1
/
import_trees.py
159 lines (135 loc) · 5.49 KB
/
import_trees.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import string
import pandas as pd
from tqdm import tqdm
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pickle
import logging
# Configure the root logger at DEBUG level; each record is prefixed with its level name
# and the (left-aligned, 10-wide) name of the emitting thread.
logging.basicConfig(level=logging.DEBUG, format='[%(levelname)s] (%(threadName)-10s) %(message)s')
#
# Attempts to analyze text to get named entity classification
#
# from nltk.corpus import state_union
# from nltk.tokenize import PunktSentenceTokenizer
#
# custom_sent_tokenizer = PunktSentenceTokenizer()
#
# train_text = state_union.raw("2005-GWBush.txt")
# sample_text = state_union.raw("2006-GWBush.txt")
#
# custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
#
# tokenized = custom_sent_tokenizer.tokenize(sample_text)
#
# def process_content():
# try:
# for i in tokenized[5:]:
# words = nltk.word_tokenize(i)
# tagged = nltk.pos_tag(words)
# namedEnt = nltk.ne_chunk(tagged, binary=False)
# namedEnt.draw()
# except Exception as e:
# print(str(e))
#
# process_content()
def generate_industries_tree(csv_path='data/trees_industry.csv', out_path='data/ind_tree.p'):
    """Build a flat lookup of the four-level industry hierarchy and pickle it.

    The CSV is read without a header row as columns L1..L4. A row whose L1
    cell contains a colon starts a new top-level industry: the text after the
    first colon becomes its name and it receives the next category letter
    ('A', 'B', ...). A non-empty L2/L3/L4 cell adds a child node beneath the
    most recently seen ancestors, so ancestor names carry down from earlier
    rows.

    Every node is stored under a colon-joined path key ("l1", "l1:l2",
    "l1:l2:l3", "l1:l2:l3:l4") as a dict with the keys 'parent' (key of the
    parent node, None for level 1), 'value', 'level' and 'industry_cat'.

    Args:
        csv_path: CSV file to read; defaults to the path this script has
            always used.
        out_path: Pickle file the resulting dict is written to.

    Raises:
        TypeError: if an L2/L3/L4 cell is encountered before any L1 row,
            because there is no level-1 name to build the path key from.
    """
    industries_csv = pd.read_csv(csv_path, header=None, names=['L1', 'L2', 'L3', 'L4'])

    # Most recently seen name/key per level. These deliberately persist
    # across rows: child rows leave their ancestor columns blank.
    l1 = None
    l2 = None
    l3 = None
    l2_key = None
    l3_key = None
    ind_tree = {}
    industry_cat = -1

    for _, row in industries_csv.iterrows():
        # Blank cells are read as NaN, whose string form is 'nan'.
        str_l1 = str(row.L1)
        str_l2 = str(row.L2)
        str_l3 = str(row.L3)
        str_l4 = str(row.L4)

        # "<prefix>: <name>" in L1 opens a new top-level industry.
        l1_sep = str_l1.find(':')
        if l1_sep != -1:
            industry_cat += 1
            l1 = str_l1[l1_sep + 1:].strip()
            ind_tree[l1] = {
                'parent': None,
                'value': l1,
                'level': 1,
                'industry_cat': chr(65 + industry_cat),
            }

        # 0 -> 'A', 1 -> 'B', ...; shared by every node added from this row.
        cat_letter = chr(65 + industry_cat)

        if str_l2 != 'nan':
            l2 = str_l2.strip()
            l2_key = ":".join([l1, l2])
            ind_tree[l2_key] = {'parent': l1, 'value': l2, 'level': 2, 'industry_cat': cat_letter}
        elif l2 is None:
            # No L2 seen yet: use an empty path segment so deeper keys can
            # still be joined.
            l2 = ""

        if str_l3 != 'nan':
            l3 = str_l3.strip()
            l3_key = ":".join([l1, l2, l3])
            ind_tree[l3_key] = {'parent': l2_key, 'value': l3, 'level': 3, 'industry_cat': cat_letter}
        elif l3 is None:
            l3 = ""

        if str_l4 != 'nan':
            # Level 4 is a leaf, so nothing needs to be remembered for later rows.
            l4 = str_l4.strip()
            l4_key = ":".join([l1, l2, l3, l4])
            ind_tree[l4_key] = {'parent': l3_key, 'value': l4, 'level': 4, 'industry_cat': cat_letter}

    # Use a context manager so the output file is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open(out_path, "wb") as out_file:
        pickle.dump(ind_tree, out_file)
# Module-level call: running (or importing) this file rebuilds data/ind_tree.p
# from data/trees_industry.csv.
generate_industries_tree()
def generate_sentence_similarity():
#
# Attempt to use WordNet synonym sets to infer relevance; not effective, hence superseded by word2vec.
#
# Scores each course title from data/course_w_geo.csv against the industry names in
# data/industries.csv, then writes data/score_cache.p (pickle) and
# data/course_to_industry_score.csv. Takes no arguments and has no return statement.
#
# NOTE(review): the leading indentation of this block is missing in this copy of the file,
# so whether the normalisation, logging, cache write and score_df row append further down
# sit inside or after the `else:` branch cannot be read from the text — confirm against
# the original before re-indenting.
#
industries = pd.read_csv('data/industries.csv', header=0, index_col=0)
# Replace missing cells with '' so the .strip() call on the L1 value below receives a string.
industries = industries.fillna('')
course_w_geo = pd.read_csv('data/course_w_geo.csv', header=0, index_col=0)
# stop_words = set(stopwords.words('english'))
# Translation table that turns every character in string.punctuation into a space.
punct_trans = {ord(c): ' ' for c in string.punctuation}
score_df = pd.DataFrame(columns=['course_title', 'industry', 'score'])
# Maps cleaned course title -> {cleaned industry name -> score}.
score_cache = {}
# Iterate through courses
for c_idx, c_row in tqdm(course_w_geo.iterrows()):
course_str = c_row.TITLE.strip()
course_str = course_str.translate(punct_trans)
c_word_tokens = word_tokenize(course_str)
# Start an empty per-course dict; an existing entry for the same cleaned title is replaced.
score_cache[course_str] = {}
# Iterate through Industries
for t_idx, t_row in industries.iterrows():
# range(1) yields only l = 0, so only the 'L1' column is read.
for l in range(1):
score = 0
num_syms = 0
industry_str = t_row[f'L{l+1}'].strip()
industry_str = industry_str.translate(punct_trans)
# Reuse the score if this (course, industry) pair was already computed.
if course_str in score_cache and industry_str in score_cache[course_str]:
score = score_cache[course_str][industry_str]
else:
t_word_tokens = word_tokenize(industry_str)
for t_tok in t_word_tokens:
for c_tok in c_word_tokens:
# Find synonym sets for industry and course word tokens
t_ss = wn.synsets(t_tok)
c_ss = wn.synsets(c_tok)
# Sum and average WUP path similarity for all found synonyms
for t_s in t_ss:
for c_s in c_ss:
t_score = t_s.wup_similarity(c_s)
# Falsy results (None or 0) are left out of both the sum and the count.
if t_score:
num_syms = num_syms + 1
score = score + t_score
# Turn the running sum into a mean; with no counted pairs the score stays 0.
if num_syms > 0:
score = score / num_syms
logging.debug(f'{course_str} - {industry_str}: {score} ({num_syms})')
score_cache[course_str][industry_str] = score
# Append a row labelled with the current row count + 1.
score_df.loc[score_df.shape[0]+1] = [course_str, industry_str, score]
pass
# NOTE(review): the file object passed to pickle.dump is never explicitly closed.
pickle.dump(score_cache, open("data/score_cache.p", "wb"))
score_df.to_csv('data/course_to_industry_score.csv', index_label='idx', header=True)
# The call below is commented out, so this function is not run by this file.
# generate_sentence_similarity()
pass