# PasteHunter v.10
# Author : Dibyendu Sikdar
# Credits:
# The Python code that scrapes URLs from the search results is adapted from:
# https://raw.githubusercontent.com/getlinksc/scrape_google/master/search.py
import os

import requests
from bs4 import BeautifulSoup
from colorama import init, Fore

init()
# Pastes are stored in the raw/ directory; create it if it does not exist yet
basedir = "raw/"
os.makedirs(basedir, exist_ok=True)

# Change the intext: value to the information you want to search for
query = "site:pastebin.com intext:smtp.sendgrid.net"
query = query.replace(' ', '+')
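
# A couple of illustrative alternative dorks (hypothetical examples, not part of
# the original script) that could be swapped into `query` above:
#   "site:pastebin.com intext:api_key"
#   "site:pastebin.com intext:BEGIN RSA PRIVATE KEY"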
def getContentRaw(url):
    # Derive the paste ID from the result URL and fetch its raw version
    fname = url[url.rindex("/") + 1:]
    raw_url = "https://pastebin.com/raw/" + fname
    USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
    headers = {"user-agent": USER_AGENT}
    r = requests.get(raw_url, headers=headers)
    # Save the paste contents under raw/<paste id>
    with open(basedir + fname, 'w', encoding='utf-8') as w:
        w.write(r.text)
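
# Illustrative usage (the paste ID below is made up): calling
#   getContentRaw("https://pastebin.com/AbCdEf12")
# downloads https://pastebin.com/raw/AbCdEf12 and writes it to raw/AbCdEf12.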
def beginScraping():
    print(Fore.GREEN + "Digging Google for juicy information about " + query)
    # Spoof a regular browser user-agent
    USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
    headers = {"user-agent": USER_AGENT}
    results = []
    # Google returns 10 results per page; only the first page is queried here
    # to avoid detection/blacklisting. To scrape more pages, raise the second
    # range() parameter in multiples of 10 (10, 20, 30, ...).
    for start in range(0, 10, 10):
        pos = str(start)
        URL = f"https://google.com/search?q={query}&start={pos}"
        resp = requests.get(URL, headers=headers)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.content, "html.parser")
            for g in soup.find_all('div', class_='g'):
                # anchor div
                rc = g.find('div', class_='rc')
                # description div
                s = g.find('div', class_='s')
                if rc:
                    divs = rc.find_all('div', recursive=False)
                    if len(divs) >= 2:
                        anchor = divs[0].find('a')
                        if anchor and anchor.has_attr('href'):
                            results.append(anchor['href'])
    # Fetch the raw contents of every paste found
    for url in results:
        print(Fore.RED + "Fetching contents of " + url)
        getContentRaw(url)

beginScraping()
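
# A rough sketch (not part of the original script) of how the single-page loop
# in beginScraping() could be extended to walk several result pages, with a
# pause between requests to reduce the chance of Google rate-limiting the
# scraper. The 5-page count and 30-second delay below are assumptions:
#
#   import time
#   for start in range(0, 50, 10):
#       URL = f"https://google.com/search?q={query}&start={start}"
#       resp = requests.get(URL, headers=headers)
#       # ...parse the result page exactly as in beginScraping()...
#       time.sleep(30)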