-
Notifications
You must be signed in to change notification settings - Fork 0
/
didscraper.py
148 lines (123 loc) · 4.4 KB
/
didscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/python
# --
#
# File : didscraper.py
# Maintainer : Alakazam (alakazamjoined@gmail.com)
# Date : 13/06/2018
#
# Version : v0.1.0
#
# --
import random
import json
import time
import urllib.request as urlq
import requests as req
from lxml.html import fromstring
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Output files: one entry per line.
outputfile_px = "didscraper-proxies.txt"
outputfile_ua = "didscraper-useragents.txt"
writing_mode = 'w'  # Change to 'a' to keep previously saved entries

timeout = 2.5     # passed as ``timeout=`` to the requests-based checks
sleep = 0         # passed to time.sleep() between scraped entries
proxies_nb = 300  # maximum number of proxy-table rows examined
verbose = True    # print progress while scraping / testing

# Echo services used to see which IP / User-Agent the remote side receives.
ip_tester = "https://httpbin.org/ip"
ua_tester = "https://httpbin.org/user-agent"

# Pages that are scraped.
proxy_list_url = "https://free-proxy-list.net/"
ua_list_urls = [
    'https://developers.whatismybrowser.com/useragents/explore/software_name/chrome/?order_by=-times_seen',
    # NOTE: the comma after the next entry is required; without it Python
    # joins the two adjacent string literals into a single invalid URL.
    'https://developers.whatismybrowser.com/useragents/explore/operating_system_name/windows/?order_by=-times_seen',
    'https://developers.whatismybrowser.com/useragents/explore/operating_system_name/mac-os-x/?order_by=-times_seen',
]
def retrieve_ua():
    """Return the User-Agent that ``ua_tester`` reports for a direct request.

    The request is made with ``urllib`` and no custom headers, so the value
    is whatever User-Agent this interpreter sends by default.

    Returns:
        The ``'user-agent'`` field of the JSON reply, right-stripped.
    """
    # Read inside ``with`` so the HTTP response is closed deterministically
    # instead of being left for the garbage collector.
    with urlq.urlopen(ua_tester) as response:
        payload = json.loads(response.read())
    return payload['user-agent'].rstrip()
def retrieve_ip():
    """Return the IP address that ``ip_tester`` reports for a direct request.

    The request is made with ``urllib`` and without any proxy argument.

    Returns:
        The ``'origin'`` field of the JSON reply, right-stripped.
    """
    # Read inside ``with`` so the HTTP response is closed deterministically
    # instead of being left for the garbage collector.
    with urlq.urlopen(ip_tester) as response:
        payload = json.loads(response.read())
    return payload['origin'].rstrip()
def retrieve_fake_ip(proxy):
    """Query ``ip_tester`` through *proxy* and return the IP it reports.

    Args:
        proxy: proxy address used for both the ``http`` and ``https``
            schemes of the request.

    Returns:
        The ``'origin'`` field of the JSON reply (right-stripped) on
        success, or ``False`` if the request or the parsing of its reply
        fails for any reason.
    """
    try:
        response = req.get(
            ip_tester,
            proxies={'http': proxy, 'https': proxy},
            verify=False,  # certificate verification is switched off here
            timeout=timeout,
        )
        return json.loads(response.text)['origin'].rstrip()
    except Exception:
        # Best-effort check: any failure means "proxy not usable".
        # ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt / SystemExit propagate so the run can be aborted.
        return False
def retrieve_fake_ua(fake_ua):
    """Send *fake_ua* as the User-Agent header to ``ua_tester`` and return the echo.

    Args:
        fake_ua: value placed in the ``User-Agent`` request header.

    Returns:
        The ``'user-agent'`` field of the JSON reply, right-stripped.
    """
    reply = req.get(
        ua_tester,
        headers={'User-Agent': fake_ua},
        timeout=timeout,
    )
    echoed = json.loads(reply.text)['user-agent']
    return echoed.rstrip()
def get_proxies():
    """Scrape ``proxy_list_url`` and return the elite HTTPS proxies it lists.

    Only the first ``proxies_nb`` table rows are examined. A row is kept
    when its 7th cell contains "yes" and its 5th cell contains
    "elite proxy". Rows whose IP or port cell has no text are skipped
    instead of raising ``IndexError``.

    Returns:
        A list of ``"ip:port"`` strings, shuffled into random order.
    """
    xpath_all = '//tbody/tr'
    xpath_https = './/td[7][contains(text(), "yes")]'
    xpath_anon = './/td[5][contains(text(), "elite proxy")]'
    xpath_coun = './/td[4]/text()'
    xpath_ip = './/td[1]/text()'
    xpath_port = './/td[2]/text()'

    page = fromstring(req.get(proxy_list_url).text)
    rows = page.xpath(xpath_all)[:proxies_nb]

    proxies = []
    for row in rows:
        if not (row.xpath(xpath_https) and row.xpath(xpath_anon)):
            continue

        ip_cells = row.xpath(xpath_ip)
        port_cells = row.xpath(xpath_port)
        if not (ip_cells and port_cells):
            # Malformed row: nothing usable to build "ip:port" from.
            continue

        # The country is only used for the progress message.
        country_cells = row.xpath(xpath_coun)
        country = country_cells[0] if country_cells else 'unknown'

        outline = ':'.join([ip_cells[0], port_cells[0]])
        proxies.append(outline)
        if verbose:
            print('{}.Elite Proxy : {} - {}\n-----'.format(len(proxies), outline, country))
        time.sleep(sleep)

    if verbose:
        # Report the number of rows actually examined, which can be lower
        # than proxies_nb when the page lists fewer rows.
        print(len(proxies), " elite proxies found on {} tested.\n".format(len(rows)))

    random.shuffle(proxies)
    return proxies
def get_user_agents(max_per_url=None):
    """Scrape User-Agent strings from every page in ``ua_list_urls``.

    Args:
        max_per_url: maximum number of table rows read from each page.
            When omitted, the module-level ``hm_fakes`` is used if it has
            been defined (the script entry point sets it); otherwise every
            row is read.

    Returns:
        A list of User-Agent strings in page order.
    """
    if max_per_url is None:
        # ``hm_fakes`` only exists when the file runs as a script; looking it
        # up this way avoids a NameError when the function is imported and
        # called directly. A ``None`` slice bound means "no limit".
        max_per_url = globals().get('hm_fakes')

    xpath_all = '//tbody/tr'
    xpath_ua = './/a/text()'

    user_agents = []
    for url in ua_list_urls:
        page = fromstring(req.get(url).text)
        for row in page.xpath(xpath_all)[:max_per_url]:
            link_texts = row.xpath(xpath_ua)
            if not link_texts:
                # Row without a link text: nothing to collect.
                continue
            ua = link_texts[0]
            if verbose:
                # Numbering is zero-based: count of entries collected so far.
                print("{}. User-Agent: {}\n-----\n".format(len(user_agents), ua))
            user_agents.append(ua)
            time.sleep(sleep)

    if verbose:
        print(len(user_agents), ' user agents found.')
    return user_agents
def test_proxies(proxies):
    """Check each proxy against ``ip_tester`` and return the ones that answer.

    The machine's own IP and User-Agent (as reported by ``retrieve_ip`` and
    ``retrieve_ua``) are printed first for reference.

    Args:
        proxies: iterable of proxy addresses accepted by ``retrieve_fake_ip``.

    Returns:
        A list of the proxies for which ``retrieve_fake_ip`` returned an IP,
        in input order.
    """
    print("----Original----")
    print("-IP: ", str(retrieve_ip()))
    print("-User-Agent: ", str(retrieve_ua()))
    print('-' * 12)

    working = []
    for proxy in proxies:
        # retrieve_fake_ip() returns False on failure; test the value itself
        # rather than comparing its string form with 'False'.
        if retrieve_fake_ip(proxy):
            working.append(proxy)
            marker = '[V]'
        else:
            marker = '[X]'
        if verbose:
            print(marker, proxy)

    if verbose:
        print(len(working), "/", len(proxies), ' working proxies')
    return working
def save(proxies, useragents):
    """Write the proxies and the user agents to their output files.

    Proxies go to ``outputfile_px`` and user agents to ``outputfile_ua``,
    one entry per line; both files are opened with ``writing_mode``.

    Args:
        proxies: iterable of entries for the proxy file.
        useragents: iterable of entries for the user-agent file.
    """
    targets = (
        (outputfile_px, proxies),
        (outputfile_ua, useragents),
    )
    for path, entries in targets:
        with open(path, writing_mode) as handle:
            handle.writelines(str(entry) + '\n' for entry in entries)
if __name__ == "__main__":
    # Step 1: scrape the candidate proxies.
    print("[*] Retrieving Proxies...")
    proxies = get_proxies()

    # Step 2: scrape user agents. ``hm_fakes`` is a module-level name read by
    # get_user_agents() as the per-page row limit, so that in total roughly
    # one user agent is fetched per scraped proxy.
    hm_fakes = len(proxies) // len(ua_list_urls)
    print("[*] Retrieving User-Agents...")
    user_agents = get_user_agents()

    # Step 3: keep only the proxies that actually respond.
    print("\n[*] Testing proxies")
    proxies = test_proxies(proxies)

    # Step 4: persist both lists.
    print("[*] Saving data to", outputfile_ua, "and", outputfile_px)
    save(proxies, user_agents)