-
Notifications
You must be signed in to change notification settings - Fork 1
/
scratch_1.py
36 lines (26 loc) · 1.16 KB
/
scratch_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from googlesearch import search
import pandas as pd
from requests_html import HTMLSession
from bs4 import BeautifulSoup
# Step 1. Getting top n urls from Google's search results
def get_links_from_google(term, num_results=10, lang='en'):
    """Run a Google search and return the result URLs as a DataFrame.

    Args:
        term: Query string forwarded to ``search``.
        num_results: Number of results requested from ``search``.
        lang: Language code forwarded to ``search``.

    Returns:
        A ``pandas.DataFrame`` with a single ``url`` column containing the
        items yielded by ``search``, in the order they were received.
    """
    results = search(term=term, lang=lang, num_results=num_results)
    return pd.DataFrame(list(results), columns=['url'])
# Testing step 1
df = get_links_from_google('Latest Mobiles in India', num_results=5)
# Step 2. Getting the content of the top n pages
def get_page_content(url, timeout=200):
    """Download a page and return the text BeautifulSoup extracts from it.

    This is a best-effort fetch: any ordinary error while requesting or
    parsing the page is reported on stdout and results in an empty string
    instead of an exception.

    Args:
        url: Address of the page to download.
        timeout: Value passed as ``timeout`` to ``session.get``. Defaults to
            200, the value that was previously hard-coded.

    Returns:
        The ``.text`` of the response body parsed with ``html.parser``, or
        ``''`` if the request or parsing raised an exception.
    """
    # The request identifies itself with a mobile Chrome / Googlebot
    # user-agent string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
    }
    with HTMLSession() as session:
        try:
            res = session.get(url, headers=headers, timeout=timeout)
            return BeautifulSoup(res.content, 'html.parser').text
        except Exception as exc:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate and a long
            # scraping run can be interrupted.
            print(f"Failed to scrape {url}: {exc}")
            return ''
        finally:
            # Runs on success and on failure alike.
            print("Done Scrapping")
# Testing step 2
# Fetch every URL's page text into a new 'text' column, then pickle the frame.
df['text'] = df['url'].apply(get_page_content)
df.to_pickle('data.pkl')