-
Notifications
You must be signed in to change notification settings - Fork 3
/
get_books_data.py
36 lines (26 loc) · 1 KB
/
get_books_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import csv
import logging

import requests
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

BASE_URL = "https://www.hindawi.org/books"
PAGE_COUNT = 215  # listing pages 1..215 on hindawi.org/books


def scrape_books(page_count=PAGE_COUNT):
    """Scrape every book from the Hindawi listing pages.

    Args:
        page_count: number of listing pages to walk (1-based, inclusive).

    Returns:
        A list of ``[book_id, name, pdf_url]`` rows, one per book found.
        Pages that fail to download and books with an unexpected markup
        shape are logged and skipped rather than aborting the crawl.
    """
    books = []
    for n in range(1, page_count + 1):
        page = f"{BASE_URL}/{n}/"
        logger.info("Entering %s", page)
        try:
            # Timeout + status check: one dead page must not hang or
            # silently parse an error body as HTML.
            resp = requests.get(page, timeout=30)
            resp.raise_for_status()
        except requests.RequestException:
            logger.exception("Failed to fetch %s; skipping", page)
            continue
        soup = BeautifulSoup(resp.text, "html.parser")
        for book in soup.find_all(class_="book"):
            details = book.find(class_="details")
            # Guard against layout changes / stray .book nodes.
            if details is None or details.h2 is None or details.h2.a is None:
                logger.warning("Skipping malformed book entry on %s", page)
                continue
            # href looks like /books/<id>/...; segment 2 is the id.
            book_id = details.h2.a["href"].split("/")[2]
            name = details.h2.a.text.strip()
            pdf_url = f"{BASE_URL}/{book_id}.pdf"
            books.append([book_id, name, pdf_url])
            logger.info("%s, %s, %s", book_id, name, pdf_url)
    return books


def write_csv(rows, path="all_books.csv"):
    """Write scraped ``[id, name, pdf_url]`` rows to *path* with a header."""
    # newline="" as required by the csv module; utf-8 because book
    # titles are routinely non-ASCII.
    with open(path, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["id", "name", "pdf_url"])
        writer.writerows(rows)


if __name__ == "__main__":
    write_csv(scrape_books())