-
Notifications
You must be signed in to change notification settings - Fork 1
/
bibparser.py
56 lines (47 loc) · 1.93 KB
/
bibparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import requests
from tqdm import tqdm
from thefuzz import fuzz
from bs4 import BeautifulSoup
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase
# Hard-coded locations of the bibliography to read and of the refreshed copy
# to write.
input_path = r'C:\Users\ignore\Desktop\Reference.bib'
output_path = r'C:\Users\ignore\Desktop\Reference_new.bib'

DBLP_SEARCH_URL = 'https://dblp.org/search'
# Seconds to wait for dblp before giving up on a request; without a timeout a
# stalled connection would hang the whole run.
REQUEST_TIMEOUT = 30
# Minimum fuzz.ratio score (0-100) for a search hit to count as the same title.
TITLE_MATCH_THRESHOLD = 90


def _find_bibtex_url(title):
    """Search dblp for *title* and return the link of the first matching hit.

    A hit matches when the case-insensitive fuzzy ratio between its title and
    *title* reaches TITLE_MATCH_THRESHOLD. The returned value is the ``href``
    of the hit's first ``rel="nofollow"`` anchor; the caller decides whether
    it is a BibTeX link. Returns None when the result page has no publication
    list or no hit matches.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
    """
    response = requests.get(
        DBLP_SEARCH_URL, params={"q": title}, timeout=REQUEST_TIMEOUT
    )
    # Fail loudly on error pages instead of parsing them as search results.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    result_list = soup.find('ul', {'class': 'publ-list'})
    if result_list is None:
        return None

    wanted = title.lower()
    items = result_list.find_all(
        'li', {"itemtype": "http://schema.org/ScholarlyArticle"}
    )
    for item in items:
        title_tag = item.find('span', {"class": "title"})
        link_tag = item.find('a', {"rel": "nofollow"})
        if title_tag is None or link_tag is None:
            # Incomplete hit markup: nothing to compare or to follow.
            continue
        if fuzz.ratio(title_tag.text.lower(), wanted) >= TITLE_MATCH_THRESHOLD:
            return link_tag.get('href')
    return None


def _fetch_dblp_entry(bibtex_url):
    """Download the page at *bibtex_url* and parse its first BibTeX record.

    Returns the first parsed entry dict, or None when the page has no
    ``<pre class="verbatim select-on-click">`` block or the block contains no
    parsable entry.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
    """
    response = requests.get(bibtex_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    pre_tag = soup.find('pre', {'class': 'verbatim select-on-click'})
    if pre_tag is None:
        return None
    parsed_entries = bibtexparser.loads(pre_tag.text).entries
    return parsed_entries[0] if parsed_entries else None


with open(input_path, encoding='utf-8') as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

new_entries = []
for entry in tqdm(bib_database.entries):
    # Entries whose bibsource already mentions dblp are kept untouched.
    if 'dblp' in entry.get('bibsource', ''):
        new_entries.append(entry)
        continue

    raw_title = entry.get('title')
    if not raw_title:
        # Nothing to search for; keep the entry rather than crash the run.
        print(f"`{entry.get('ID', '?')}` has no title, kept unchanged")
        new_entries.append(entry)
        continue
    # Drop BibTeX protection braces so the title works as a plain-text query.
    title = raw_title.replace("{", "").replace("}", "")

    try:
        # search
        bibtex_url = _find_bibtex_url(title)
        if bibtex_url is None or 'bibtex' not in bibtex_url:
            print(f'`{title}` not find!')
            new_entries.append(entry)
            continue
        # get bibtex
        new_entry = _fetch_dblp_entry(bibtex_url)
    except requests.RequestException as err:
        # One failed request must not discard the work done so far.
        print(f'`{title}` request failed ({err}), kept unchanged')
        new_entries.append(entry)
        continue

    if new_entry is None:
        print(f'`{title}` not find!')
        new_entries.append(entry)
        continue

    # Keep the original citation key so existing \cite commands still resolve.
    new_entry['ID'] = entry['ID']
    new_entries.append(new_entry)

db = BibDatabase()
db.entries = new_entries
writer = BibTexWriter()
with open(output_path, 'w', encoding='utf-8') as bibfile:
    bibfile.write(writer.write(db))