darknoisy.py
209 lines (174 loc) · 8.24 KB
import argparse
import datetime
import json
import logging
import random
import re
import time
import os
from urllib.parse import urljoin, urlparse
import requests
from fake_useragent import UserAgent
from urllib3.exceptions import LocationParseError
UA = UserAgent(min_percentage=15.1)
REQUEST_COUNTER = -1
SYS_RANDOM = random.SystemRandom()
total_bandwidth = 0
main_logo = '''
\033[92m ░▒▓███████▓▒░ ░▒▓██████▓▒░░▒▓█▓▒░░▒▓███████▓▒░▒▓█▓▒░░▒▓█▓▒░ \033[0m
\033[92m ░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░▒▓█▓▒░ ░▒▓█▓▒░░▒▓█▓▒░ \033[0m
\033[92m ░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░▒▓█▓▒░ ░▒▓█▓▒░░▒▓█▓▒░ \033[0m
\033[92m ░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░░▒▓██████▓▒░ ░▒▓██████▓▒░ \033[0m
\033[92m ░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░ ░▒▓█▓▒░ ░▒▓█▓▒░ \033[0m
\033[92m ░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░ ░▒▓█▓▒░ ░▒▓█▓▒░ \033[0m
\033[92m ░▒▓█▓▒░░▒▓█▓▒░░▒▓██████▓▒░░▒▓█▓▒░▒▓███████▓▒░ ░▒▓█▓▒░ \033[0m
'''
infoabt = '''
\033[95m****************************\033[0m
\033[96m DARKNOISY \033[0m
\033[94m Onion Crawler \033[0m
\033[95m****************************\033[0m
\033[96m Build Date: August 6 2024 \033[0m
\033[34m Visit the github for more information.\033[0m
'''
print(main_logo)
print(infoabt)
class Crawler:
    def __init__(self):
        self._config = {}
        self._links = []
        self._start_time = None

    class CrawlerTimedOut(Exception):
        pass

    @staticmethod
    def _request(url):
        # Fetch a URL with a random User-Agent and a short per-request timeout.
        random_user_agent = UA.random
        headers = {"user-agent": random_user_agent}
        try:
            response = requests.get(url, headers=headers, timeout=4)
            if response:
                content_length = len(response.content)
                print_responsive_link(url)
                # Bandwidth accounting happens inside print_bandwidth_usage(),
                # so each response body is counted exactly once.
                print_bandwidth_usage(content_length)
                return response
        except requests.exceptions.RequestException:
            return None

    @staticmethod
    def _normalize_link(link, root_url):
        try:
            parsed_url = urlparse(link)
        except ValueError:
            return None
        parsed_root_url = urlparse(root_url)
        if link.startswith("//"):
            return "{}://{}{}".format(parsed_root_url.scheme, parsed_url.netloc, parsed_url.path)
        if not parsed_url.scheme:
            return urljoin(root_url, link)
        return link

    @staticmethod
    def _is_valid_url(url):
        regex = re.compile(
            r"^(?:http|ftp)s?://"
            r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"
            r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"
            r"(?::\d+)?"
            r"(?:/?|[/?]\S+)$",
            re.IGNORECASE,
        )
        return re.match(regex, url) is not None

    @staticmethod
    def _contains_unwanted_strings(url):
        unwanted_strings = [
            '.ico', '.webm', '.pdf', '.doc', '.docx', '.svg', '.json', '.i2p',
            '.com', '.net', '.zip', '.gov', '.edu', '.gg', '.me', '.mp3',
            '.wav', '.mkv', '.mp4', '.m4a', '.flac', '.ogg', '.opus', '.avif',
            '.hc', '.tc', '.xyz', '.exe', '.msi', '.tar', '.7z', '.tif',
            '.css', '.csv',
        ]
        return any(unwanted_string in url.lower() for unwanted_string in unwanted_strings)

    def _is_blacklisted(self, url):
        return any(blacklisted_url in url for blacklisted_url in self._config["blacklisted_urls"])

    def _should_accept_url(self, url):
        return (
            url
            and self._is_valid_url(url)
            and not self._is_blacklisted(url)
            and not self._contains_unwanted_strings(url)
        )

    def _extract_urls(self, body, root_url):
        # Pull href targets out of the page body and keep only acceptable URLs.
        pattern = r"href=[\"'](?!#)(.*?)[\"'].*?"
        urls = re.findall(pattern, str(body))
        normalized_urls = [self._normalize_link(url, root_url) for url in urls]
        return list(filter(self._should_accept_url, normalized_urls))

    def _remove_and_blacklist(self, link):
        self._config["blacklisted_urls"].append(link)
        self._links.remove(link)

    def _browse_from_links(self, depth=0):
        # Follow a random link from the current link pool, up to a randomly chosen depth.
        max_depth = SYS_RANDOM.randint(1, self._config["max_depth"])
        is_depth_reached = depth >= max_depth
        if not self._links or is_depth_reached:
            return
        if self._is_timeout_reached():
            raise self.CrawlerTimedOut
        random_link = SYS_RANDOM.choice(self._links)
        try:
            response = self._request(random_link)
            if response:
                sub_page = response.content
                sub_links = self._extract_urls(sub_page, random_link)
                time.sleep(SYS_RANDOM.randrange(self._config["min_sleep"], self._config["max_sleep"]))
                if len(sub_links) > 1:
                    self._links = sub_links
                else:
                    self._remove_and_blacklist(random_link)
                self.save_links(sub_links)  # Save the found links to a file
        except (requests.exceptions.RequestException, UnicodeDecodeError):
            self._remove_and_blacklist(random_link)
        self._browse_from_links(depth + 1)

    def load_config_file(self, file_path):
        with open(file_path, "r") as config_file:
            config = json.load(config_file)
            self.set_config(config)

    def set_config(self, config):
        self._config = config

    def set_option(self, option, value):
        self._config[option] = value

    def _is_timeout_reached(self):
        # "timeout" is either False (run indefinitely) or a number of seconds.
        is_timeout_set = self._config["timeout"] is not False
        end_time = self._start_time + datetime.timedelta(seconds=self._config["timeout"])
        is_timed_out = datetime.datetime.now() >= end_time
        return is_timeout_set and is_timed_out

    def crawl(self):
        self._start_time = datetime.datetime.now()
        while True:
            url = SYS_RANDOM.choice(self._config["root_urls"])
            try:
                response = self._request(url)
                if response:
                    body = response.content
                    self._links = self._extract_urls(body, url)
                    self._browse_from_links()
                else:
                    time.sleep(SYS_RANDOM.randrange(self._config["min_sleep"], self._config["max_sleep"]))
            except (requests.exceptions.RequestException, LocationParseError):
                continue
            except self.CrawlerTimedOut:
                # The configured timeout has elapsed; stop crawling cleanly.
                return

    def save_links(self, links):
        with open('output_links.txt', 'a') as file:
            for link in links:
                file.write(link + '\n')

def print_responsive_link(url):
    print(f"\033[92mResponsive Link: {url}\033[0m")

def print_bandwidth_usage(content_length=0):
    # Accumulate and display the total number of bytes downloaded so far.
    global total_bandwidth
    total_bandwidth += content_length
    print(f"\033[34mTotal Bandwidth Used: {total_bandwidth / (1024 * 1024):.2f} MB\033[0m", end='\r')

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("config", nargs="?", help="Path to a configuration file")
    args = parser.parse_args()
    crawler = Crawler()
    if args.config:
        crawler.load_config_file(args.config)
    else:
        with open("config.json", "r") as default_config_file:
            default_config = json.load(default_config_file)
            crawler.set_config(default_config)
    crawler.crawl()

if __name__ == "__main__":
    main()
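
A minimal usage sketch of the configuration the crawler expects. The key names below ("root_urls", "blacklisted_urls", "max_depth", "min_sleep", "max_sleep", "timeout") are the ones darknoisy.py actually reads; the values are illustrative placeholders, not defaults shipped with the project.

example_config = {
    "root_urls": ["http://example.onion"],  # hypothetical seed address; replace with real .onion URLs
    "blacklisted_urls": [],                 # URL substrings that should never be visited again
    "max_depth": 25,                        # upper bound for the randomly chosen crawl depth
    "min_sleep": 3,                         # minimum pause between requests, in seconds
    "max_sleep": 6,                         # maximum pause between requests, in seconds
    "timeout": False,                       # False to run indefinitely, or a number of seconds
}

crawler = Crawler()
crawler.set_config(example_config)
crawler.crawl()

Note that the script issues plain requests.get calls and does not configure a proxy itself, so reaching .onion addresses additionally requires routing the process through Tor (for example with torsocks).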