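"""NER and topic-modeling report generator (ner_test_recursive_with_html.py).

Walks a "shipment" directory of digitized items, pulls text from each file with
Apache Tika, runs spaCy named-entity recognition and gensim LDA topic modeling
over the text, and writes an HTML report with per-entity pie charts and links
to full entity lists.

Usage: python ner_test_recursive_with_html.py <path-to-shipment-directory>
"""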
import glob
import os
import shutil
import statistics
import sys
import textwrap
from collections import Counter
from itertools import islice
from sys import argv

import chardet
import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import spacy
from lxml import etree

import gensim.corpora as corpora
from gensim.models import ldamodel

import tika
from tika import parser
# Talk to an already-running Tika server rather than spawning a local one.
tika.TikaClientOnly = True
def take(n, iterable):
    """Return the first n items of the iterable as a list."""
    return list(islice(iterable, n))
def return_pairs(mydict):
    """Return a dict containing the first two key/value pairs of mydict."""
    return {k: mydict[k] for k in list(mydict)[:2]}
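# Illustrative example: with d = {'a': 1, 'b': 2, 'c': 3},
# take(2, d) -> ['a', 'b'] and return_pairs(d) -> {'a': 1, 'b': 2}.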
def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    return [token.text for token in doc if not token.is_stop and not token.is_punct]
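# Illustrative example: remove_stopwords(nlp("The quick brown fox jumps."))
# returns ['quick', 'brown', 'fox', 'jumps'] -- "The" is a stopword and the
# period is punctuation, so both are dropped.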
def lemmatizer(doc, nlp):
    # This takes in a doc of tokens from the NER and lemmatizes them.
    # Pronouns (like "I" and "you") get lemmatized to '-PRON-', so I'm removing those.
    # Reuse the pipeline passed in by the caller instead of reloading the model
    # on every call, which is very slow.
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)
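# Illustrative example (spaCy 2.x lemmas): lemmatizer(nlp("I was running"), nlp)
# returns a Doc for "be run" -- "I" becomes '-PRON-' and is dropped.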
def absolute_value(pct, allvals):
    # Convert a pie-chart percentage back into an absolute hit count for the label.
    absolute = int(pct / 100. * np.sum(allvals))
    return "{:d} hits".format(absolute)
def entity_separator(doc_text, nlp, person, norp, fac, org, gpe, loc, product, event, doc_list):
    # Run NER on the text and sort each recognized entity into the list for its label.
    doc = nlp(doc_text)
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            person.append(ent.text)
        elif ent.label_ == 'NORP':
            norp.append(ent.text)
        elif ent.label_ == 'FAC':
            fac.append(ent.text)
        elif ent.label_ == 'ORG':
            org.append(ent.text)
        elif ent.label_ == 'GPE':
            gpe.append(ent.text)
        elif ent.label_ == 'LOC':
            loc.append(ent.text)
        elif ent.label_ == 'PRODUCT':
            product.append(ent.text)
        elif ent.label_ == 'EVENT':
            event.append(ent.text)
        else:
            continue
    # Remove stopwords and add to our document list for topic modeling.
    lemmatized = lemmatizer(doc, nlp)
    no_stops = remove_stopwords(lemmatized)
    doc_list.append(no_stops)
    return person, norp, fac, org, gpe, loc, product, event, doc_list
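# Illustrative example: passing 'Douglas Engelbart worked at SRI in Menlo Park.'
# with fresh empty lists would typically put 'Douglas Engelbart' in person,
# 'SRI' in org, and 'Menlo Park' in gpe (exact labels depend on the model version).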
def main():
    # Load the English model. The small model provides good enough NER while
    # being faster than en_core_web_md.
    nlp = spacy.load("en_core_web_sm")
    # Set variables.
    try:
        ship_dir = argv[1]
    except IndexError:
        print('\n\nWarning: missing "shipment" argument. Include a python-friendly path to the directory you want to run NER/topic modeling on.')
        sys.exit()
    shipmentID = os.path.basename(ship_dir)
    output_folder = os.path.join('C:/temp', 'ner-testing', shipmentID)
    output_files = os.path.join(output_folder, 'files')
    output = os.path.join(output_folder, '{}_ner-report.html'.format(shipmentID))
    css_file = 'C:/BDPL/resources/NER/mystyle.css'
    if not os.path.exists(output_files):
        os.makedirs(output_files)
    # Get the appraisal spreadsheet and set openpyxl variables.
    spreadsheet = glob.glob(os.path.join(ship_dir, '*.xlsx'))[0]
    wb = openpyxl.load_workbook(spreadsheet)
    ws = wb['Appraisal']
    # Start the HTML doc.
    html = etree.Element('html')
    head = etree.SubElement(html, 'head')
    script = etree.SubElement(head, 'script', src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js")
    link = etree.SubElement(head, 'link')
    link.attrib['rel'] = "stylesheet"
    link.attrib['type'] = "text/css"
    link.attrib['href'] = "./files/mystyle.css"
    body = etree.SubElement(html, 'body')
    h1 = etree.SubElement(body, 'h1')
    h1.text = "Content Analysis"
    div = etree.SubElement(body, 'div')
    div.attrib['class'] = 'scrollable'
    table = etree.SubElement(div, 'table')
    tr = etree.SubElement(table, 'tr')
    th = etree.SubElement(tr, 'th')
    th.text = 'Object'
    th = etree.SubElement(tr, 'th')
    th.text = 'Creator'
    th = etree.SubElement(tr, 'th')
    th.text = 'Label Transcription'
    th = etree.SubElement(tr, 'th')
    th.text = 'Initial Appraisal'
    th = etree.SubElement(tr, 'th')
    th.text = 'Named Entities: People'
    th = etree.SubElement(tr, 'th')
    th.text = 'Named Entities: Nationalities or religious/political groups'
    th = etree.SubElement(tr, 'th')
    th.text = 'Named Entities: Buildings and Facilities'
    th = etree.SubElement(tr, 'th')
    th.text = 'Named Entities: Organizations'
    th = etree.SubElement(tr, 'th')
    th.text = 'Named Entities: Countries, cities, states'
    th = etree.SubElement(tr, 'th')
    th.text = 'Named Entities: Geographic Features/Locations'
    th = etree.SubElement(tr, 'th')
    th.text = 'Named Entities: Products'
    th = etree.SubElement(tr, 'th')
    th.text = 'Named Entities: Events'
    th = etree.SubElement(tr, 'th')
    th.text = 'Topic Modeling'
    for item_barcode in [d for d in os.listdir(ship_dir) if os.path.isdir(os.path.join(ship_dir, d))]:
        print('\nWorking on: ', item_barcode)
        # Look up the barcode in the appraisal spreadsheet.
        found = False
        for cell in ws['A']:
            if cell.value is not None:
                if item_barcode == str(cell.value).strip():
                    found = True
                    creator = ws.cell(row=cell.row, column=5).value
                    transcription = ws.cell(row=cell.row, column=8).value
                    appraisal = ws.cell(row=cell.row, column=9).value
        if not found:
            creator = 'N/A'
            transcription = 'N/A'
            appraisal = 'N/A'
        # Add barcode and spreadsheet info to the HTML table.
        tr = etree.SubElement(table, 'tr')
        td = etree.SubElement(tr, 'td')
        td.text = item_barcode
        td = etree.SubElement(tr, 'td')  # creator
        td.text = creator
        td = etree.SubElement(tr, 'td')  # label transcription
        td.text = transcription
        td = etree.SubElement(tr, 'td')  # initial appraisal notes
        td.text = appraisal
        files_dir = os.path.join(ship_dir, item_barcode, 'files')
        if not os.path.exists(files_dir):
            continue
        # Create lists to store extracted entities and text for gensim.
        doc_list = []
        person = []
        norp = []
        fac = []
        org = []
        gpe = []
        loc = []
        product = []
        event = []
        # Loop through files in our item files_dir.
        for root, dirs, files in os.walk(files_dir):
            for f in files:
                ner_target = os.path.join(root, f)
                print('\tProcessing: {}\n'.format(ner_target))
                # Use python-tika's parser to pull text from the file; skip it if we get a
                # UnicodeEncodeError (will try to figure out how to handle those later).
                try:
                    content = parser.from_file(ner_target)
                except UnicodeEncodeError:
                    continue
                # Isolate the 'content' from the python-tika parser output; skip files
                # with no extractable text.
                text = content.get('content')
                if not text:
                    continue
                # To make our results a little more manageable, split the content to remove
                # newlines, tabs, etc., then rejoin everything with single spaces.
                combined_text = ' '.join(str(text).split())
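                # Illustrative example: 'Dear\tSam,\n\nHow are you?' becomes 'Dear Sam, How are you?'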
                # Send the text to the 'entity_separator' function, created so that we can
                # handle all of our text in the same way. spaCy will run out of memory if
                # the text is too big, so we check the size first; oversized text is
                # processed one chunk at a time and the results are concatenated.
                if len(combined_text) > 1000000:
                    for chunk in textwrap.wrap(combined_text, 900000):
                        person, norp, fac, org, gpe, loc, product, event, doc_list = entity_separator(chunk, nlp, person, norp, fac, org, gpe, loc, product, event, doc_list)
                else:
                    person, norp, fac, org, gpe, loc, product, event, doc_list = entity_separator(combined_text, nlp, person, norp, fac, org, gpe, loc, product, event, doc_list)
        # Now loop through each list of entities we created.
        for ls in [person, norp, fac, org, gpe, loc, product, event]:
            # Tally the number of unique entities in each list. Reconciling near matches
            # or eliminating false positives would require too much human intervention.
            tally = dict(Counter(ls))
            # Sort the resulting dictionary so we can present results in descending
            # order, with the most frequent first.
            sorted_tally = {k: v for k, v in sorted(tally.items(), key=lambda item: item[1], reverse=True)}
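            # Illustrative example: ls = ['IU', 'Bloomington', 'IU'] yields
            # sorted_tally = {'IU': 2, 'Bloomington': 1}.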
            # If there are no results (i.e., an empty list), just note 'N/A' and skip
            # creating a pie chart or full-text file of entities.
            if len(sorted_tally) == 0:
                td = etree.SubElement(tr, 'td')
                ul = etree.SubElement(td, 'ul')
                li = etree.SubElement(ul, 'li')
                li.text = 'N/A'
            # If we do have results, write the full list to a text file and chart the
            # top entities.
            else:
                # Set the graph title.
                if ls == person:
                    graph_title = 'people'
                elif ls == org:
                    graph_title = 'organizations'
                elif ls == gpe:
                    graph_title = 'countries-cities-states'
                elif ls == norp:
                    graph_title = 'groups'
                elif ls == loc:
                    graph_title = 'geographic-features'
                elif ls == fac:
                    graph_title = 'buildings-facilities'
                elif ls == product:
                    graph_title = 'products'
                elif ls == event:
                    graph_title = 'events'
                current_chart = os.path.join(output_files, '{}-{}.png'.format(item_barcode, graph_title))
                current_report = os.path.join(output_files, '{}-{}.txt'.format(item_barcode, graph_title))
                # Write the full list of entities to file, right-justifying each entity
                # in a 30-character column so the text file is better formatted.
                with open(current_report, 'wb') as f:
                    f.write('{} Entities for {}\n\n'.format(graph_title.upper(), item_barcode).encode())
                    for k in sorted_tally.keys():
                        if len(k) <= 30:
                            value = k.rjust(30)
                        else:
                            # Wrap long entities across multiple lines, padding each line.
                            content = [c.rjust(30) for c in textwrap.wrap(k, 30)]
                            value = '\n'.join(content)
                        # If we hit a UnicodeEncodeError, use chardet to try and ID a
                        # usable encoding, dropping anything it can't represent.
                        try:
                            f.write("{} : {}\n\n".format(value, sorted_tally[k]).encode())
                        except UnicodeEncodeError:
                            enc = chardet.detect(value.encode(errors='ignore'))['encoding'] or 'utf-8'
                            f.write("{} : {}\n\n".format(value, sorted_tally[k]).encode(enc, errors='ignore'))
                # Create a link to the full list of entities; add the cell to the HTML.
                td = etree.SubElement(tr, 'td')
                a = etree.SubElement(td, 'a')
                a.attrib['href'] = './{}/{}'.format(os.path.basename(os.path.dirname(current_report)), os.path.basename(current_report))
                a.attrib['target'] = "_blank"
                p = etree.SubElement(a, 'p')
                p.text = 'Full list of entities'
                ul = etree.SubElement(td, 'ul')
                # Determine which results are above the median hit count; the rest are
                # lumped together as 'Other' (too many slices makes the chart unreadable).
                ner_median = statistics.median(sorted(set(sorted_tally.values())))
                # Only results above this percentile of unique hit counts make the HTML
                # file when the list is very long. Change the variable as needed.
                percentile = 90
                ner_percentile = np.percentile(sorted(set(sorted_tally.values())), percentile)
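                # Illustrative example: if the unique hit counts are 1 through 10,
                # np.percentile([...], 90) is 9.1, so only entities with 10 hits make the cut.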
                # If we only have 10 or fewer entities, all of them go in the pie chart
                # and HTML; otherwise report the above-median results and lump everything
                # else into 'Other' for the pie chart and HTML report.
                if len(sorted_tally) <= 10:
                    results = sorted_tally
                    others = {}
                else:
                    results = {k: v for k, v in sorted_tally.items() if v > ner_median}
                    others = {k: v for k, v in sorted_tally.items() if v <= ner_median}
                    # If there are still too many results to view, use 'percentile' as the cut-off.
                    if len(results) > 25:
                        results = {k: v for k, v in sorted_tally.items() if v > ner_percentile}
                        others = {k: v for k, v in sorted_tally.items() if v <= ner_percentile}
                # Write results to the HTML in a list; for each list, include a link to
                # the pie chart.
                labels = []
                hits = []
                a = etree.SubElement(ul, 'a')
                a.attrib['href'] = './{}/{}'.format(os.path.basename(os.path.dirname(current_chart)), os.path.basename(current_chart))
                a.attrib['target'] = "_blank"
                for k, v in results.items():
                    li = etree.SubElement(a, 'li')
                    li.text = '{} : {}'.format(k, v)
                    # Add entities and # of hits to lists for the pie chart.
                    labels.append(k)
                    hits.append(v)
                # If there are 'other' results, add them to the lists as a single group.
                if len(others) > 0:
                    labels.append('Other')
                    hits.append(sum(others.values()))
                # Create the pie chart; clear the previous one first.
                plt.clf()
                plt.title('{} Entities for {}'.format(graph_title.upper(), item_barcode))
                plt.pie(hits, labels=labels, autopct=lambda pct: absolute_value(pct, hits))
                plt.axis('equal')
                plt.savefig(current_chart, bbox_inches='tight')
            # Now do topic modeling.
            td = etree.SubElement(tr, 'td')
            ul = etree.SubElement(td, 'ul')
            # Create a gensim Dictionary, which is a mapping of word IDs to words.
            words = corpora.Dictionary(doc_list)
            # Turn each document into a bag of words.
            corpus = [words.doc2bow(doc) for doc in doc_list]
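            # Illustrative example: words.doc2bow(['dear', 'letter', 'letter']) returns
            # (word id, count) pairs such as [(0, 1), (1, 2)].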
            # Use gensim's LdaModel class to produce topics; 'num_topics' defines the
            # number of topics that will be created.
            try:
                lda_model = ldamodel.LdaModel(corpus=corpus, id2word=words, num_topics=6, random_state=2, update_every=1, passes=10, alpha='auto', per_word_topics=True)
                x = lda_model.show_topics(num_topics=6, num_words=8, formatted=False)
                topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
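                # Illustrative example of the resulting shape:
                # topics_words == [(0, ['letter', 'dear', ...]), (1, [...]), ...]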
                # Print the topic modeling output. Use 'topic_words' as the loop variable
                # so we don't shadow the gensim Dictionary bound to 'words' above.
                if len(topics_words) > 0:
                    for topic, topic_words in topics_words:
                        li = etree.SubElement(ul, 'li')
                        li.text = " ".join(topic_words)
                else:
                    li = etree.SubElement(ul, 'li')
                    li.text = "N/A"
            except ValueError:
                li = etree.SubElement(ul, 'li')
                li.text = "N/A"
    # Write the HTML to file and copy over the stylesheet.
    html_doc = etree.ElementTree(html)
    html_doc.write(output, method="html", pretty_print=True)
    shutil.copy(css_file, output_files)
if __name__ == "__main__":
    main()