-
Notifications
You must be signed in to change notification settings - Fork 0
/
entity_object.py
64 lines (54 loc) · 2.4 KB
/
entity_object.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import sys
from nltk.stem.snowball import SnowballStemmer
from lib_process import *
from list_term_object import List_Term_Object
from config import *
from document_object import Document_Object
from nltk.util import ngrams
class Entity_Object(Document_Object):
    """Document object for a single indexed entity plus its term statistics.

    Attributes populated by this class:
        categories: list of category strings split on '|', [] when no record
            matches the entity's uri, or None if never looked up.
        dict_obj / dict_attr: empty dicts created by the constructor.
        term_freq: term-frequency map for USED_CONTENT_FIELD.
        term_freqs: per-field term-frequency maps, keyed by field name.
        lengths: per-field totals (sum of the term frequencies).
        length: total of term_freq's values.
    """

    categories = None
    dict_obj = None
    term_freq = None
    term_freqs = None
    lengths = None

    def __init__(self):
        # NOTE(review): the parent constructor is not invoked here; confirm
        # that Document_Object does not need its own initialisation.
        self.dict_obj = {}
        self.dict_attr = {}

    def updateFromIndex(self, d_pair, mongoObj, w2vmodel, lucene_obj):
        """Fill this object from an index hit.

        Args:
            d_pair: (document, docid) pair. The document must expose
                iterator() yielding fields with name() and stringValue().
            mongoObj: object exposing conn_acs, used for the category lookup.
            w2vmodel: not used by this method; kept for interface
                compatibility.
            lucene_obj: object exposing get_term_freq(docid, field, flag).
        """
        entity, docid = d_pair[0], d_pair[1]
        for idf in entity.iterator():
            self.setAttr(idf.name(), idf.stringValue())
        # presumably setAttr exposes the 'label' field as self.label -- verify
        # against Document_Object.
        self.setAttr('name', self.label)
        # Kept as an explicit comparison so that non-bool config values behave
        # exactly as before.
        if IS_SAS_USED == True:  # noqa: E712
            self.update_categories(mongoObj)
        self.update_term_freq(docid, USED_CONTENT_FIELD, lucene_obj)
        self.length = sum(self.term_freq.values())
        self.update_term_freqs(docid, lucene_obj)

    def update_term_freq(self, docid, field, lucene_obj):
        """Store the term-frequency map of `field` for `docid` in term_freq."""
        self.term_freq = lucene_obj.get_term_freq(docid, field, False)

    def update_term_freqs(self, docid, lucene_obj):
        """Rebuild term_freqs and lengths for every field in LIST_F.

        A field whose lookup fails is recorded as an empty map with length 0.
        The catch-all field ('stemmed_catchall' when the first entry of LIST_F
        contains 'stemmed', otherwise 'catchall') is always fetched as well,
        and a failure there is propagated to the caller.
        """
        self.term_freqs = {}
        self.lengths = {}
        for f in LIST_F:
            try:
                self.term_freqs[f] = lucene_obj.get_term_freq(docid, f, False)
                self.lengths[f] = sum(self.term_freqs[f].values())
            except Exception:
                # Best effort: a missing or unreadable field counts as empty,
                # but interpreter-exit signals are no longer swallowed.
                self.term_freqs[f] = {}
                self.lengths[f] = 0

        if 'stemmed' in LIST_F[0]:
            catchall = 'stemmed_catchall'
        else:
            catchall = 'catchall'
        self.term_freqs[catchall] = lucene_obj.get_term_freq(
            docid, catchall, False)
        self.lengths[catchall] = sum(self.term_freqs[catchall].values())

    def update_categories(self, mongoObj):
        """Load categories for self.uri from mongoObj.conn_acs.

        Leaves categories untouched when conn_acs is None, sets it to [] when
        no record matches, and otherwise splits the record's 'categories'
        string on '|'.
        """
        if mongoObj.conn_acs is None:
            return
        item = mongoObj.conn_acs.find_one({'uri': self.uri})
        if item is None:
            self.categories = []
            return
        self.categories = item['categories'].strip().split('|')