# reviewer_info.py (forked from Rakshitha03/DBLP_processing)
# import dblp
import os
from time import sleep
import requests
from lxml import etree
from collections import namedtuple
DBLP_BASE_URL = 'http://dblp.uni-trier.de/'
DBLP_AUTHOR_SEARCH_URL = DBLP_BASE_URL + 'search/author'
DBLP_PERSON_URL = DBLP_BASE_URL + 'pers/xk/{urlpt}'
DBLP_PUBLICATION_URL = DBLP_BASE_URL + 'rec/bibtex/{key}.xml'
class LazyAPIData(object):
    """
    Base class for lazily-loaded API records: the first access to any
    attribute listed in lazy_attrs triggers load_data(), which subclasses
    override to populate self.data from the DBLP web service.
    """
    def __init__(self, lazy_attrs):
        self.lazy_attrs = set(lazy_attrs)
        self.data = None
    def __getattr__(self, key):
        if key in self.lazy_attrs:
            if self.data is None:
                self.load_data()
            return self.data[key]
        raise AttributeError, key
    def load_data(self):
        pass
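# Minimal sketch of the lazy pattern above, using a hypothetical subclass
# (Thing and its 'value' field are made up for illustration):
#
#   class Thing(LazyAPIData):
#       def __init__(self):
#           super(Thing, self).__init__(['value'])
#       def load_data(self):
#           self.data = {'value': 42}
#
#   t = Thing()
#   print t.value  # __getattr__ triggers load_data() on first access -> 42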
class Author(LazyAPIData):
"""
Represents a DBLP author. All data but the author's key is lazily loaded.
Fields that aren't provided by the underlying XML are None.
Attributes:
name - the author's primary name record
publications - a list of lazy-loaded Publications results by this author
homepages - a list of author homepage URLs
homonyms - a list of author aliases
"""
def __init__(self, urlpt):
self.urlpt = urlpt
self.xml = None
super(Author, self).__init__(['name','publications','homepages',
'homonyms'])
def load_data(self):
        resp = requests.get(DBLP_PERSON_URL.format(urlpt=self.urlpt))
        resp.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
        xml = resp.content
self.xml = xml
root = etree.fromstring(xml)
data = {
'name':root.attrib['name'],
'publications':[Publication(k) for k in
root.xpath('/dblpperson/dblpkey[not(@type)]/text()')],
'homepages':root.xpath(
'/dblpperson/dblpkey[@type="person record"]/text()'),
'homonyms':root.xpath('/dblpperson/homonym/text()')
}
self.data = data
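# Rough shape of the person record implied by the XPaths above (illustrative
# only; the real response may carry additional fields):
#
#   <dblpperson name="...">
#     <dblpkey type="person record">homepages/...</dblpkey>
#     <dblpkey>conf/.../...</dblpkey>   <!-- one per publication -->
#     <homonym>...</homonym>
#   </dblpperson>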
def first_or_none(seq):
    """Return the first element of seq, or None if seq is empty."""
    try:
        return next(iter(seq))
    except StopIteration:
        pass
Publisher = namedtuple('Publisher', ['name', 'href'])
Series = namedtuple('Series', ['text','href'])
Citation = namedtuple('Citation', ['reference','label'])
class Publication(LazyAPIData):
"""
Represents a DBLP publication- eg, article, inproceedings, etc. All data but
the key is lazily loaded. Fields that aren't provided by the underlying XML
are None.
Attributes:
type - the publication type, eg "article", "inproceedings", "proceedings",
"incollection", "book", "phdthesis", "mastersthessis"
sub_type - further type information, if provided- eg, "encyclopedia entry",
"informal publication", "survey"
title - the title of the work
authors - a list of author names
journal - the journal the work was published in, if applicable
volume - the volume, if applicable
number - the number, if applicable
chapter - the chapter, if this work is part of a book or otherwise
applicable
pages - the page numbers of the work, if applicable
isbn - the ISBN for works that have them
ee - an ee URL
crossref - a crossrel relative URL
publisher - the publisher, returned as a (name, href) named tuple
citations - a list of (text, label) named tuples representing cited works
series - a (text, href) named tuple describing the containing series, if
applicable
"""
def __init__(self, key):
self.key = key
self.xml = None
super(Publication, self).__init__( ['type', 'sub_type', 'mdate',
'authors', 'editors', 'title', 'year', 'month', 'journal',
'volume', 'number', 'chapter', 'pages', 'ee', 'isbn', 'url',
'booktitle', 'crossref', 'publisher', 'school', 'citations',
'series'])
    def load_data(self):
        resp = requests.get(DBLP_PUBLICATION_URL.format(key=self.key))
        resp.raise_for_status()  # fail loudly on HTTP errors
        xml = resp.content
        self.xml = xml
        root = etree.fromstring(xml)
        publication = first_or_none(root.xpath('/dblp/*[1]'))
        if publication is None:
            raise ValueError('no publication record found for key %r' % self.key)
        # year may be missing for some records; guard before calling int()
        year = first_or_none(publication.xpath('year/text()'))
        data = {
            'type':publication.tag,
            'sub_type':publication.attrib.get('publtype', None),
            'mdate':publication.attrib.get('mdate', None),
            'authors':publication.xpath('author/text()'),
            'editors':publication.xpath('editor/text()'),
            'title':first_or_none(publication.xpath('title/text()')),
            'year':int(year) if year is not None else None,
'month':first_or_none(publication.xpath('month/text()')),
'journal':first_or_none(publication.xpath('journal/text()')),
'volume':first_or_none(publication.xpath('volume/text()')),
'number':first_or_none(publication.xpath('number/text()')),
'chapter':first_or_none(publication.xpath('chapter/text()')),
'pages':first_or_none(publication.xpath('pages/text()')),
'ee':first_or_none(publication.xpath('ee/text()')),
'isbn':first_or_none(publication.xpath('isbn/text()')),
'url':first_or_none(publication.xpath('url/text()')),
'booktitle':first_or_none(publication.xpath('booktitle/text()')),
'crossref':first_or_none(publication.xpath('crossref/text()')),
'publisher':first_or_none(publication.xpath('publisher/text()')),
'school':first_or_none(publication.xpath('school/text()')),
'citations':[Citation(c.text, c.attrib.get('label',None))
for c in publication.xpath('cite') if c.text != '...'],
'series':first_or_none(Series(s.text, s.attrib.get('href', None))
for s in publication.xpath('series'))
}
self.data = data
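# A Publication can also be fetched directly by its DBLP key; the key below is
# hypothetical, shown only to illustrate the call:
#
#   pub = Publication('journals/example/Author17')
#   print pub.type, pub.title, pub.year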
def search(author_str):
    resp = requests.get(DBLP_AUTHOR_SEARCH_URL, params={'xauthor':author_str})
    try:
        root = etree.fromstring(resp.content)
    except etree.XMLSyntaxError:
        # DBLP sometimes returns a non-XML error page; show it and re-raise
        print resp.content
        raise
    return [Author(urlpt) for urlpt in root.xpath('/authors/author/@urlpt')]
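# Usage sketch for the API above (assumes network access to dblp.uni-trier.de
# and that these legacy endpoints still respond):
#
#   authors = search('Andrew McCallum')
#   if authors:
#       author = authors[0]
#       print author.name                 # first access triggers load_data()
#       print len(author.publications)    # lazily-built Publication list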
'''
The dblp library code is copied into this file so it can be debugged directly:
the installed copy was a protected, archived file, and extracting, finding and
editing it in place wasn't worth the trouble. Clean this up when we have to.
'''
# reading csv file containing the email id, first name, last name
fp = open("./iclr_accepted_reviewers.csv")
no_name = 0
more_than_one = 0
no_hits = 0
author_disambiguation = {}
no_hits_names = []
no_name_list = []
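# Expected row format of the CSV (no header assumed; names may be blank):
#   reviewer@example.com,Jane,Doe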
for eachLine in fp.readlines():
emailId, firstName, lastName = eachLine.split(',')
name = firstName + " " + lastName.strip()
file_name = firstName + "_" + lastName.strip()
if os.path.isfile("/home/rakshitha03/OpenReview/DBLP_processing/Reviewer_Info/"+file_name):
print file_name + " Exists"
continue
    # name is firstName + " " + lastName, so an all-blank name shows up as ' '
    if not name.strip():
        print emailId
        no_name += 1
        no_name_list.append(emailId)
else:
        try:
            sleep(3)  # be polite to the DBLP server between queries
            authors = search(name)
        except Exception as e:
            # retrying with a trailing space sometimes gets past a DBLP parse error
            modified_name = name + ' '
            authors = search(modified_name)
            print "DBLP parsing error for ", name
        if authors:
            if len(authors) > 1:
                print "More than one author found for ", name
                more_than_one += 1
                author_disambiguation[name] = list(authors)
            # either way, proceed with the first hit
            author = authors[0]
            print name + ' : ' + unicode(author.name).encode('utf-8')
            if len(author.publications) > 0:
                print len(author.publications)
                ''' Something is fishy about these publications: extracting
                their titles sometimes raises a DBLP parsing error, and authors
                that printed info on one run aren't detected on the next.
                Check the reviewer_info.txt file. Very strange. '''
                out_fp = open("./Reviewer_Info/" + file_name, 'a')
                publications_set = set()
                for publication in author.publications:
                    sleep(1)
                    try:
                        publications_set.add(publication.title)
                    except Exception as e:
                        continue
                for title in publications_set:
                    out_fp.write(unicode(title).encode('utf-8') + '\n')
                out_fp.close()
else:
print "No hit in dblp for ", name
no_hits_names.append(name)
no_hits += 1
print "Stats: "
print "No names for: ", no_name
print "No names list: ", no_name_list
print "More than 1 author hits for ", more_than_one
print "No hits for ", no_hits
print "No hit list: ", no_hits_names
# # These people with non ascii names -_-
# try:
# file_pointer_2.write("\nAuthor:\t" + authors[
# 0].name + "\nEmail:\t" + emailID + "Publication List:\n")
# except:
# file_pointer_2.write(
# "\nAuthor:\t" + name + "\nEmail:\t" + emailID + "Publication List:\n")
# reviewersInfo[name]['publications'] = authors[0].publications
# count = 1
# for publication in reviewersInfo[name]['publications']:
# # Please can you try solving ascii encoding error for other
# # authors and publication title and email id! Don't seem to have
# # the patience to solve it now.
# # Skipping it for the time being.
# try:
# file_pointer_2.write(str(
# count) + ") " + publication.title + "\nAuthors:" + ','.join(
# publication.authors) + "\n")
# count += 1
# except Exception:
# pass
# # Need to fix this
# # print(publication.title+"\t"+','.join(publication.authors)+"\n")
# else:
# file_pointer_1.write("\n" + name)
# file_pointer_1.close()
# file_pointer_2.close()
# '''
'''
authorData = dblp.search('Andrew McCallum')
for eachAuthObject in authorData:
print eachAuthObject.__dict__
print eachAuthObject.xml
print eachAuthObject.urlpt
print eachAuthObject.name
print "Len of publications ", len(eachAuthObject.publications)
count = 0
publication_set = set()
for publication in eachAuthObject.publications:
try:
publication_set.add(publication.title)
# print publication.title
except:
print publication.isbn
print publication.data
count += 1
print "*"*10
print "total count",count
print len(publication_set)
# print eachAuthObject.publications[2].isbn
# print "----------------------------------------"
'''
# for eachPub in eachAuthObject.publications:
# print eachPub.__dict__
# break
# '''
# print eachAuthObject.publications[2].data
# print "----------------------------------------"
# print eachAuthObject.publications[2].data.keys()
'''
#print eachAuthObject.name
print "\n"
break
'''
'''
for each in authors:
#print(each.__dict__)
#print(each.homepages)
for eachPub in each.publications:
#print dir(eachPub)
print eachPub.data
'''