-
Notifications
You must be signed in to change notification settings - Fork 4
/
oa_pdf_downloader.py
59 lines (50 loc) · 1.76 KB
/
oa_pdf_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import pandas as pd
import numpy as np
import requests
import os
def get_pdf_url_and_doi(data_path):
    """Load the CSV at *data_path* and return its URL and DOI columns.

    Args:
        data_path: Path (or anything ``pandas.read_csv`` accepts) of a CSV
            file that has ``pdf_url`` and ``dois_id`` columns.

    Returns:
        A ``(pdf_url, dois_id)`` tuple of pandas Series, row-aligned.

    Raises:
        KeyError: If either column is absent from the CSV.
    """
    frame = pd.read_csv(data_path)
    pdf_urls = frame["pdf_url"]
    doi_ids = frame["dois_id"]
    return pdf_urls, doi_ids
def generate_downloaded_list(
    file_tracking='download_trace.txt',
    folder_pdfs='../pdf_info_extractor/data_pdf/',
):
    """Rewrite the tracking file with the entry names found in the PDF folder.

    Args:
        file_tracking: Path of the text file to overwrite. It receives one
            directory entry name per line, with no trailing newline.
        folder_pdfs: Directory whose entries are listed. A directory that
            does not exist is treated as empty, so a first run produces an
            empty tracking file instead of raising ``FileNotFoundError``.
    """
    try:
        # os.listdir returns entries in arbitrary order; sort them so the
        # tracking file is stable from run to run.
        downloaded_list = sorted(os.listdir(folder_pdfs))
    except FileNotFoundError:
        downloaded_list = []
    with open(file_tracking, 'w') as f:
        f.write('\n'.join(downloaded_list))
def check_downloaded_list(name, file_path):
    """Return whether *name* appears as a line of the tracking file.

    Both *name* and every line of the file are compared with surrounding
    whitespace stripped.

    Args:
        name: File name to look for.
        file_path: Path of the tracking file, one name per line.

    Returns:
        True if a line equals *name*, otherwise False. A tracking file
        that does not exist yet counts as containing no names.
    """
    wanted = name.strip()
    try:
        with open(file_path) as f:
            # Stream the file and stop at the first matching line.
            return any(line.strip() == wanted for line in f)
    except FileNotFoundError:
        return False
def download_pdf(url, name_of_pdf, timeout=30):
    """Download one PDF into the data folder and record the outcome.

    The file is saved under ``../pdf_info_extractor/data_pdf/`` as
    *name_of_pdf* with every ``/`` replaced by ``_`` and ``.pdf`` appended.
    A successful download appends that file name to ``download_trace.txt``;
    a failed one appends *url* to ``error_trace.txt``.

    Args:
        url: Address to fetch.
        name_of_pdf: Identifier used to build the file name (the caller
            passes the DOI).
        timeout: Seconds to wait for the server before giving up, so one
            unresponsive host cannot stall the whole run.

    Returns:
        200 when the PDF was fetched and saved; 404 when the file is
        already listed in ``download_trace.txt`` or the download failed.
    """
    # '/' would be read as a directory separator, so replace it with '_'.
    name = name_of_pdf.replace('/', '_') + '.pdf'
    if check_downloaded_list(name, 'download_trace.txt'):
        print('already downloaded')
        return 404
    try:
        r = requests.get(url, timeout=timeout)
        # Without this, an HTTP error page would be saved as a .pdf and
        # recorded as a successful download.
        r.raise_for_status()
        with open('../pdf_info_extractor/data_pdf/' + name, 'wb') as f:
            f.write(r.content)
        # Record the file so later runs skip it.
        with open('download_trace.txt', 'a') as f:
            f.write(f'\n{name}')
        return 200
    except Exception as err:
        # Best effort: one bad URL must not abort a batch, but unlike a
        # bare ``except`` this lets KeyboardInterrupt/SystemExit through.
        print(f'download failed: {err}')
        with open('error_trace.txt', 'a') as f:
            f.write(f'\n{url}')
        return 404
def download_oa(max_downloads=1):
    """Download the PDFs listed in ``dois_not_arxiv_with_pdf_url.csv``.

    Rows whose URL or DOI is missing are skipped; every other row is
    passed to ``download_pdf(url, doi)``.

    Args:
        max_downloads: Maximum number of rows to hand to ``download_pdf``.
            Pass ``None`` to process every row. The default of 1 replaces
            a hard-coded ``break`` after the first download attempt.
            NOTE(review): this assumes the ``break`` sat inside the
            URL check, i.e. it stopped after the first attempted download
            rather than after the first row - confirm.
    """
    from itertools import islice

    oa_url, doi_id = get_pdf_url_and_doi('dois_not_arxiv_with_pdf_url.csv')
    # pandas reads empty cells as NaN. ``pd.isna`` detects any NaN value,
    # whereas ``is not np.nan`` only matches one particular object. A row
    # without a DOI is skipped too: download_pdf calls ``.replace`` on it.
    valid_rows = (
        (url, doi)
        for url, doi in zip(oa_url, doi_id)
        if not pd.isna(url) and not pd.isna(doi)
    )
    for url, doi in islice(valid_rows, max_downloads):
        print('Downloading: ', url)
        download_pdf(url, doi)
if __name__ == "__main__":
    # Script entry point: rebuild the download tracking file with the
    # default paths, then run the downloader.
    generate_downloaded_list()
    download_oa()