'''
Nebula Expired Article Hunter: article scraper for the wayback machine.
Copyright (C) 2021 Eneiro A. Matos B.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
'''
import re

import requests

import config  # local module providing the user_agent string sent with requests


class WaybackmachineUrls:
    def __init__(self, domain: str):
        # Normalize the target domain: lowercase and drop a trailing newline.
        self.domain = domain.casefold().removesuffix('\n')

    def get_url_list(self) -> tuple[str, ...]:
        # Substrings that disqualify a URL: static assets, CMS internals,
        # pagination, query strings, and common non-article sections
        # (English and Spanish).
        exception_rules = ['.jpg', '.txt', '.js', '.css', '.png', '.gif', '.xml', '.cgi', '.ico',
                           '.php', '/wp-', '/comments', '/feed', '/cat', '/page', '/tag',
                           '/author', '/trackback', '/contact', '/aviso-', '/politica-',
                           'comment-page', '/privacy', '=', '/amp', '/js', '//', '?', '/media',
                           '.svg', '/fonts', '/theme', '/template', '/search', '/buscar',
                           '/busqueda', '/nosotros', '/cgi', '/cdn', '/img', '.asp', '.jsp',
                           '/photos/', '/images/', '/sample-page', '/cookies-policy', '/user/',
                           '/users/', '/fotos/', '/products/', '/product/', '/producto/',
                           '/productos/', '/shop/', '/tienda/', '/usuario/', '/usuarios/',
                           '/catalogo/', '/admin/']
        clean_url_list = []
        header = {'user-agent': config.user_agent}
        # Query the Wayback Machine CDX API for every capture under the domain,
        # filtered to 200-status text/html snapshots; fl=original returns only
        # the originally archived URL, one per line.
        wburl = f'https://web.archive.org/cdx/search/cdx?url={self.domain}*' \
                f'&filter=statuscode:200&filter=mimetype:text/html&fl=original&output=txt'
        req = requests.get(wburl, headers=header)
        # Normalize each URL (strip scheme, default port 80, and trailing
        # slash, then lowercase) and deduplicate with a set.
        urllist = {url.replace(':80', '').removeprefix('http://').removeprefix('https://')
                      .removesuffix('/').casefold()
                   for url in req.text.split('\n')}
        for url in urllist:
            # Keep only non-empty URLs made of safe characters that match
            # none of the exception rules.
            is_good_url = url != '' and re.search(r'[^a-zA-Z0-9/:.-]', url) is None
            if is_good_url:
                for rule in exception_rules:
                    if rule in url:
                        is_good_url = False
                        break
            if is_good_url:
                clean_url_list.append(url)
        clean_url_list.sort()
        return tuple(clean_url_list)
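

# Minimal usage sketch, assuming a config.py next to this file that defines a
# user_agent string (e.g. user_agent = 'Mozilla/5.0 ...'); 'example.com' is a
# placeholder domain, not one used by the project.
if __name__ == '__main__':
    wayback = WaybackmachineUrls('example.com')
    for archived_url in wayback.get_url_list():
        print(archived_url)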