-
Notifications
You must be signed in to change notification settings - Fork 3
/
bibdig.py
119 lines (106 loc) · 4.07 KB
/
bibdig.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
Requires: bibtex.py
bibliograph package
"""
# Characters stripped from formatted author names in postprocess(); they
# cause problems for pygraphviz.
bad_chars = ['\\','"']
def _fix_scopus_author_line(line):
    """Rewrite a Scopus-style ``author=`` line into ``A and B and C`` form.

    Every 'a'..'e' that is directly followed by a space is dropped
    (presumably these are the affiliation markers Scopus glues onto names --
    TODO confirm), then the comma separators are turned into ' and '.

    NOTE(review): these are plain substring replacements, so a name that
    legitimately ends in a-e before a space loses that letter as well.
    """
    # The replacements are applied one after another on purpose: a single
    # regex pass would not cascade the same way.
    for marker in 'abcde':
        line = line.replace(marker + ' ', ' ')
    line = line.replace(' , ', ' and ')
    line = line.replace('., ', '. and ')
    return line


def _append_parsed_entry(parser, entry, biblist):
    """Parse the accumulated text of one entry; append it to biblist if valid."""
    if not entry.strip():
        # Nothing accumulated yet (e.g. the file starts directly with '@').
        return
    bibrec = parser.parseEntry(entry)
    if isinstance(bibrec, dict):
        biblist.append(bibrec)
    else:
        print('Not a bibtex entry: ' + entry)


def bibfile2dictlist(fname, do_postprocess=True, scopus=False, printupdates=False):
    """
    Read a *.bib file and return a list with one dictionary per BibTeX entry.

    The file is cut into entries at every line that starts with '@'; each
    chunk is handed to bibliograph's BibtexParser.parseEntry.  Chunks for
    which the parser does not return a dictionary are reported on stdout
    and skipped.

    Args:
        fname: path of the *.bib file.
        do_postprocess: if true, run the parsed entries through
            postprocess() before returning them.
        scopus: if true, repair the author line format used by Scopus
            exports before parsing.
        printupdates: if true, print the number of entries parsed so far
            each time a new entry starts.

    Returns:
        The list of entry dictionaries (filtered and reformatted by
        postprocess() when do_postprocess is true).

    Raises:
        IOError: if the file cannot be opened or read.  Read errors are no
            longer swallowed; before, they silently truncated the result.

    This should really be rewritten as a proper parser.

    Issues:
    - Chokes on blank lines in the middle of bibtex entries
      (noted by the original author)
    """
    # Third-party dependency, imported at call time as in the original.
    from bibliograph.parsing.parsers.bibtex import BibtexParser

    parser = BibtexParser()
    biblist = []
    entry = ''

    # Every line is processed, including the first one: it used to be read
    # and thrown away, which lost the first entry of any file that starts
    # directly with '@'.
    with open(fname) as bibfile:
        for line in bibfile:
            if line.startswith('@'):
                # A new entry begins: parse the one collected so far.
                _append_parsed_entry(parser, entry, biblist)
                entry = line
                if printupdates:
                    print(len(biblist))
                continue
            if scopus and line.strip().startswith('author='):
                # Scopus messes up the author format
                line = _fix_scopus_author_line(line)
            entry += line

    # End of file: the last entry has no following '@' line to trigger it.
    _append_parsed_entry(parser, entry, biblist)

    if do_postprocess:
        biblist = postprocess(biblist)
    return biblist
def _starts_like_english(text):
    """Return True unless text begins with a character outside a-z, 0-9, space.

    Missing or empty text has no first character to object to, so it passes.
    """
    return not text or text[0].lower() in 'abcdefghijklmnopqrstuvwxyz1234567890 '


def _format_author(author):
    """Turn a parsed author dict into 'INITIALS Lastname'."""
    firstname = author['firstname'].replace('.', '').upper()
    authname = firstname + ' ' + author['lastname'].capitalize()
    # Remove strange characters that cause problems for pygraphviz:
    for char in bad_chars:
        authname = authname.replace(char, '')
    return authname


def postprocess(biblist):
    """
    Reformat the output of BibtexParser to be more helpful.

    An entry is dropped (with a message on stdout) when
    - its title contains 'Report',
    - it has none of the fields journal / booktitle / school / institution,
    - that field, or the publisher, starts with a character other than
      a-z, 0-9 or space (used as a cheap "non-English" test), or
    - it has no authors.

    For the remaining entries, 'authors' is rewritten as a list of
    'INITIALS Lastname' strings (when it still holds the parser's author
    dicts) and any 'others' placeholder (from "and others") is removed.

    Args:
        biblist: list of entry dictionaries.  The dictionaries are modified
            in place.

    Returns:
        A new list holding the accepted entries (the same dict objects).
    """
    venue_keys = ('journal', 'booktitle', 'school', 'institution')
    res = []
    for i, pub in enumerate(biblist):
        okay = True
        title = pub.get('title', '')

        # Remove pubs that are just reports
        if 'Report' in title:
            okay = False
            print('Report rejected:  %s' % (title,))

        # Remove non-English pubs:
        wherepub = next((key for key in venue_keys if key in pub), None)
        if wherepub is None:
            print('%s  what is this one?  %s' % (i, pub.get('pid')))
            print(pub.get('reference_type'))
            wherepub = 'title'
            okay = False
        venue = pub.get(wherepub, '')
        # Leading whitespace is ignored for the venue check only.
        if not _starts_like_english(venue.strip()):
            okay = False
            print('Non-english pub rejected:  %s' % (title,))
            print(venue)
        publisher = pub.get('publisher')
        if not _starts_like_english(publisher):
            okay = False
            print('Non-english pub rejected:  %s' % (title,))
            print('Publisher:  %s' % (publisher,))
            print(venue)

        # Reformat author set
        authors = pub.get('authors')
        if not authors:
            print('Paper has no author')
            continue
        # Check if it is raw (parser dicts) or already processed (strings):
        if isinstance(authors[0], dict):
            authors = [_format_author(author) for author in authors]
        # Formatted names are capitalized (' Others'), so the placeholder is
        # matched case-insensitively; comparing with ' others' never matched.
        pub['authors'] = [name for name in authors
                          if name.strip().lower() != 'others']

        if okay:
            res.append(pub)
    return res