-
Notifications
You must be signed in to change notification settings - Fork 0
/
web-scraping-workshop.py
170 lines (114 loc) · 6.13 KB
/
web-scraping-workshop.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import tabulate
import pprint
def exportData(title, author, rating, desc, image, reviews):
if isinstance(title, list):
assert len(title) == len(author) == len(rating) == len(desc) == len(image), "they all should match in size"
# create a dataframe
data = {
"title": title,
"author": author,
"rating": rating,
"desc": desc,
"image": image
}
df = pd.DataFrame(data)
# export to csv
df.to_csv("book_data.csv", index=False)
for title_ in title:
# fetch image from url and export to file with name title
with open(title_ + ".jpg", "wb") as f:
f.write(requests.get(image).content)
else:
# export to json
data = {
"title": title,
"author": author,
"rating": rating,
"desc": desc,
"image": image
}
with open(f"{title.replace(':', '-').replace(' ', '-')}.json", "w+") as f:
json.dump(data, f)
# export image to title
with open(title.replace(':', '-').replace(' ', '-') + ".jpg", "wb") as f:
f.write(requests.get(image).content)
reviewsDF = pd.DataFrame(reviews)
reviewsDF.to_csv(f"{title.replace(':', '-').replace('/', '-').replace(' ', '-')}.csv", index=False)
def performNLP(reviews):
# credits Neil Chowdhury for this function
import nltk
from nltk.sentiment import vader
def regressionGraph(reviews):
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
sns.regplot(x = [ x['rating'] for x in reviews ], y = [ x['polarity'] for x in reviews ], line_kws={'color':'red'}).set_title('Polarity vs Rating')
plt.show()
x = np.array([ x['rating'] for x in reviews ]).reshape((-1, 1))
y = np.array([ x['polarity'] for x in reviews ]).reshape((-1, 1))
model = LinearRegression().fit(x, y).score(x, y)
print("Regression Coefficient:", model)
def polarityScore(reviews):
sentimentIntensityAnalyzer = vader.SentimentIntensityAnalyzer()
for review in reviews:
review["polarity"] = (sentimentIntensityAnalyzer.polarity_scores(review["review"])['compound']+1)*(2)+1
return reviews
reviews = polarityScore(reviews)
ratingList = [ review["rating"] for review in reviews ]
polarityList = [ review["polarity"] for review in reviews ]
averageRating = sum(ratingList)/len(ratingList)
averagePolarity = sum(polarityList)/len(polarityList)
print(tabulate.tabulate([[averageRating, averagePolarity]], headers=["Average Rating", "Average Polarity"]))
regressionGraph(reviews)
def bsoupScrape(url):
# important, remember the .text at the end to avoid a parsing error
pageText = requests.get(url).text
# this requires correct version of bs4 to be installed if using apt-get, if using pip it's all good. html.parser is included
soup = BeautifulSoup(pageText, 'html.parser')
# id == id, class == class_, attrs == attrs={"name": "val"}
bookTitle = soup.find('h1', id='bookTitle').get_text().replace(' ', '').replace('\n', '')
print(bookTitle)
bookAuthors = soup.find_all('div', class_='authorName__container')
bookAuthors_info = []
for bookAuthor in bookAuthors:
authorName = bookAuthor.find('span').get_text()
# find_parent finds the argument tag which is the parent.
# you can use .get to get some attribute
authorLink = bookAuthor.find('span').find_parent("a").get("href")
bookAuthors_info.append({"name": authorName, "link": authorLink})
pprint.pprint(bookAuthors_info)
# show nested tag example using reviews
bookRating = float(soup.find('div', id='bookMeta').find('span', attrs={"itemprop": "ratingValue"}).get_text())
print(bookRating)
# show how this one has two, a shorter revealed one, and a longer hidden one. Also show how the id of the span
# might be different across different books, so we have to handle it recursively. Do a pprint or a tabulate to show the two elements
bookDescription = soup.find('div', id='descriptionContainer').find('div', id='description').find_all('span')
# pprint.pprint(bookDescription)
# get the last span tag, get the text from it, and remove the whitespaces and text formatting tags.
bookDescription = bookDescription[-1].get_text().replace('<i>', ' ').replace('</i>', ' ').replace('\n', ' ').replace(' ', ' ')
print(bookDescription)
bookImage = soup.find('img', id='coverImage').get('src')
print(bookImage)
# finds all the review blocks
bookReviewBlocks = soup.find_all('div', class_='friendReviews')
bookReviews = list()
for bookReviewBlock in bookReviewBlocks:
# get the title of the review
reviewAuthor = bookReviewBlock.find('a', class_='imgcol').get('title').replace(' ', '').replace('\n', '')
# review text was obtained by finding the two spans, one which reveals on reveal.
reviewText = bookReviewBlock.find('div', class_='bodycol').find('span', class_='readable').find_all('span')[-1].get_text().replace('<i>', ' ').replace('</i>', ' ').replace('\n', ' ').replace(' ', ' ')
# rating is obtained by counting the number of p10 objects, which contain the full stars
reviewRating = len(bookReviewBlock.find('div', class_='bodycol').find('div', class_='reviewHeader').find_all('span', class_='p10'))
# append the reviews to the list
bookReviews.append({"author": reviewAuthor, "review": reviewText, "rating": reviewRating})
# ooooh what even is NLP?
performNLP(bookReviews)
exportData(bookTitle, bookAuthors_info, bookRating, bookDescription, bookImage, bookReviews)
if __name__ == '__main__':
bsoupScrape('https://www.goodreads.com/book/show/1386986.The_Lost_Ravioli_Recipes_of_Hoboken')