-
Notifications
You must be signed in to change notification settings - Fork 8
/
scrapearticle.py
71 lines (58 loc) · 2.64 KB
/
scrapearticle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
'''
Nebula Expired Article Hunter: article scraper for the wayback machine.
Copyright (C) 2021 Eneiro A. Matos B.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
'''
import os
import re
from typing import Optional

from newspaper import Article
from newspaper import Config as CfgNews

import config
class ScrapeArticle:
    """Scrape one article through the Wayback Machine and save it as text.

    The article is requested from a ``web.archive.org`` URL built from
    ``url``. When its word count lies within ``config.min_word_len`` and
    ``config.max_word_len`` it is written to
    ``<config.projects_folder>/<domain dir>/<title> <n> words.txt``.
    """

    def __init__(self, url: str, url_dir: str):
        """Remember the target URL and the name of the output sub-folder.

        Args:
            url: Address of the article to look up in the archive.
            url_dir: Name the per-domain output folder is derived from.
        """
        self.url = url
        self.__title = ''
        self.__domain_dir = url_dir
        # NOTE(review): the 19900101 timestamp presumably makes the Wayback
        # Machine resolve to the earliest available capture -- confirm.
        self.__wburl = 'https://web.archive.org/web/19900101/' + url
        self.__word_count = 0

    @staticmethod
    def _count_words(text: str) -> int:
        """Return the number of whitespace-separated words in ``text``."""
        # split() without an argument splits on any run of whitespace, so
        # newlines, tabs and repeated spaces do not skew the count and an
        # empty string yields 0.
        return len(text.split())

    @staticmethod
    def _sanitize_filename(name: str) -> str:
        """Replace each run of non-alphanumeric characters by one space."""
        # "_" is a word character for the regex engine, hence the explicit
        # mention; strip() removes any space left at either end.
        return re.sub(r'[\W_]+', ' ', name).strip()

    def __scrape_article(self) -> Optional[str]:
        """Download and parse the archived article.

        Stores the word count (and, on success, the title) on the instance.

        Returns:
            ``title + '\\n' + text`` when the word count is within the
            configured limits, otherwise ``None``.

        Raises:
            Any exception raised by ``Article.download()`` or
            ``Article.parse()`` is propagated unchanged.
        """
        # Configuration for the Newspaper.Article instance
        conf_news = CfgNews()
        conf_news.fetch_images = False
        conf_news.browser_user_agent = config.user_agent
        conf_news.request_timeout = config.request_time_out

        article = Article(self.__wburl, config=conf_news)
        article.download()
        article.parse()

        self.__word_count = self._count_words(f'{article.text} {article.title}')
        if not config.min_word_len <= self.__word_count <= config.max_word_len:
            return None

        self.__title = article.title
        return article.title + '\n' + article.text

    def create_article_file(self) -> None:
        """Scrape the article and write it to a UTF-8 text file.

        Nothing is written when the article is rejected by the word-count
        limits. Missing output directories are created on demand.
        """
        article = self.__scrape_article()
        if article is None:
            return None

        domain_dir = self.__domain_dir.replace('/', '').replace(':', '_').replace('\n', '')
        target_dir = os.path.join(config.projects_folder, domain_dir)
        filename = self._sanitize_filename(
            f'{self.__title} {self.__word_count} words'
        ) + '.txt'

        # Creates parent folders too and tolerates directories that already
        # exist, so there is no check-then-create race.
        os.makedirs(target_dir, exist_ok=True)
        with open(os.path.join(target_dir, filename), mode='w', encoding='utf-8') as txt:
            txt.write(article)
        return None