# PubMed_Scrapper.py
import time

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
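
# Pipeline: (1) drive the PubMed search page with Selenium and export the
# matching PMIDs to a text file, (2) fetch each record as XML through the
# NCBI E-utilities EFetch endpoint, (3) parse the fields of interest with
# BeautifulSoup and store them in a local MongoDB collection.
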
def get_PMID_file(keyword):
    """Search PubMed for `keyword`, export the result PMIDs to a file via
    the site's "Save" panel, and return the PMIDs as a list of strings."""

    def enable_download_headless(browser, download_dir):
        # Chrome blocks downloads in headless mode unless the download
        # behaviour is explicitly allowed through the DevTools protocol.
        browser.command_executor._commands["send_command"] = (
            "POST", '/session/$sessionId/chromium/send_command')
        params = {'cmd': 'Page.setDownloadBehavior',
                  'params': {'behavior': 'allow', 'downloadPath': download_dir}}
        browser.execute("send_command", params)

    download_dir = 'C:\\Users\\zakaria\\PycharmProjects\\BI_project\\dataset\\PubMed'
    chrome_options = Options()
    chrome_options.add_experimental_option("prefs", {
        # Point Chrome at the same directory the download hook allows.
        "download.default_directory": download_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing_for_trusted_sources_enabled": False,
        "safebrowsing.enabled": False
    })

    keyword = keyword.replace(" ", "+")
    url = "https://pubmed.ncbi.nlm.nih.gov/?term=" + keyword
    # Selenium 4 style: the driver path goes through a Service object rather
    # than the removed executable_path/chrome_options keyword arguments.
    browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                               options=chrome_options)
    enable_download_headless(browser, download_dir)
    browser.get(url)

    # Walk through PubMed's "Save" panel: open it, pick the results selection
    # and the export format by option position, then submit the download.
    browser.find_element(By.ID, 'save-results-panel-trigger').click()
    time.sleep(1)
    browser.find_element(By.ID, 'save-action-selection').click()
    time.sleep(1)
    browser.find_element(
        By.XPATH,
        '/html/body/main/div[1]/div/form/div[1]/div[1]/select/option[2]').click()
    time.sleep(1)
    browser.find_element(By.ID, 'save-action-format').click()
    time.sleep(1)
    browser.find_element(
        By.XPATH,
        '/html/body/main/div[1]/div/form/div[2]/select/option[3]').click()
    time.sleep(1)
    browser.find_element(By.CLASS_NAME, 'action-panel-submit').click()

    # Read the generated file name off Chrome's download page; the download
    # items live inside nested shadow DOM roots, hence the JS query.
    browser.get('chrome://downloads')
    time.sleep(2)
    filename = browser.execute_script(
        "return document.querySelector('downloads-manager').shadowRoot"
        ".querySelector('#downloadsList downloads-item').shadowRoot"
        ".querySelector('div#content #file-link').text")
    time.sleep(2)

    file_path = download_dir + "\\" + filename
    idlist = []
    with open(file_path, 'r') as file:  # one PMID per line
        for line in file:
            idlist.append(line.replace("\n", ""))
    return idlist
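
# The exported file uses PubMed's "PMID" save format, one PMID per line, e.g.
#   33301246
#   32887691
# (the values above are illustrative only), so get_PMID_file returns a list
# of PMID strings.
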
def scrapp_data(keyword):
    """Fetch every PMID found for `keyword` via NCBI EFetch and insert the
    records that are not already stored into MongoDB."""
    client = MongoClient('localhost', 27017)
    db = client['BI_project_db']
    coll = db.PubMed_db
    # keyword could also be read interactively via input().
    idlist = get_PMID_file(keyword)
    print(idlist)
    articles = []
    for pmid in idlist:
        if already_exist(db, pmid):
            print("exist")
            continue
        url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
               "?db=pubmed&retmode=xml&id=" + pmid)
        req = requests.get(url)
        soup = BeautifulSoup(req.content, 'html.parser')
        row = get_info(soup, keyword)
        articles.append(row)
        print(row)
    if len(articles) != 0:
        coll.insert_many(articles)
        print(articles)
    return idlist
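
# Example of the EFetch request built above (the PMID is illustrative):
#   https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=33301246
# EFetch returns a <PubmedArticleSet> XML record that get_info() parses below.
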
def get_info(soup, keyword):
    """Extract the fields of interest from one EFetch XML record.
    Note: html.parser lowercases tag names, so every lookup uses lowercase
    (e.g. 'meshheadinglist' rather than 'MeshHeadingList')."""
    article_info = {}
    journal = soup.find('journal')
    try:
        articleTitle = soup.find('articletitle').text
    except AttributeError:
        articleTitle = 'none'
    try:
        journalTitle = journal.find('title').text
        volume = journal.find('volume').text
    except AttributeError:
        journalTitle = 'none'
        volume = 'none'
    try:
        issue = journal.find('issue').text
        date = journal.find('year').text + " " + journal.find('month').text
        abst = soup.find('abstracttext').text
    except AttributeError:
        issue = 'none'
        date = 'none'
        abst = 'none'

    # Build "lastname,initials" pairs for every listed author.
    authorlist = soup.find('authorlist')
    authors = ""
    try:
        for auth in authorlist.find_all('author'):
            try:
                lastname = auth.find('lastname').text
                initial = auth.find('initials').text
                author = ",".join([lastname, initial])
            except AttributeError:
                author = ""
            authors = author + " " + authors
    except AttributeError:
        authors = 'none'

    try:
        country = soup.find('country').text
        language = soup.find('language').text
    except AttributeError:
        country = 'none'
        language = 'none'

    # Collect the full-text links from the article's PubMed landing page.
    pmid = soup.find('pmid').text
    PMarticleLink = "https://pubmed.ncbi.nlm.nih.gov/" + pmid
    req = requests.get(PMarticleLink)
    soup_article = BeautifulSoup(req.content, 'html.parser')
    link_list = soup_article.select('.full-text-links-list')
    links = []
    for link in link_list:
        links.append(link.find('a')['href'])

    # MeSH descriptors, one per <MeshHeading> entry.
    MH_list = []
    try:
        for mh in soup.find('meshheadinglist').find_all('meshheading'):
            try:
                desc_name = mh.find('descriptorname').text
            except AttributeError:
                desc_name = 'none'
            MH_list.append(desc_name)
    except AttributeError:
        MH_list = 'none'

    info = soup.find('medlinejournalinfo')
    try:
        journal_info = (info.find('country').text + '-' +
                        info.find('medlineta').text + '-' +
                        info.find('nlmuniqueid').text)
    except AttributeError:
        journal_info = 'none'

    article_info['pmid'] = pmid
    article_info['articleTitle'] = articleTitle
    article_info['date'] = date
    article_info['journalTitle'] = journalTitle
    article_info['journal_info'] = journal_info
    article_info['volume'] = volume
    article_info['issue'] = issue
    article_info['authors'] = authors
    article_info['language'] = language
    article_info['country'] = country
    article_info['abstract'] = abst
    article_info['MeshHeading'] = MH_list
    article_info['links'] = links
    article_info['keyword'] = keyword
    return article_info
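
# Sketch of the document shape inserted into MongoDB (values illustrative):
#   {"pmid": "33301246", "articleTitle": "...", "date": "2021 Jan",
#    "journalTitle": "...", "journal_info": "...", "volume": "...",
#    "issue": "...", "authors": "...", "language": "eng", "country": "...",
#    "abstract": "...", "MeshHeading": [...], "links": [...], "keyword": "..."}
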
def already_exist(db, pmid):
    """Return True when a document with this PMID is already stored.
    count_documents() replaces the Cursor.count() removed in PyMongo 4."""
    return db.PubMed_db.count_documents({"pmid": pmid}) > 0
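
# A minimal usage sketch; the search term "covid-19" is only an illustrative
# example, substitute any PubMed query of interest.
if __name__ == "__main__":
    scrapp_data("covid-19")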