# articles.py
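# Module overview added for readability; it is inferred from the code below
# rather than taken from separate documentation.
"""Extract keywords from the articles linked by a set of users.

gen_keywords() reads the graph nodes from website/outgroup.json, looks up each
user's shared links via links.get_user_links, downloads and parses every new
link with newspaper, and maintains two JSON caches under crawler.PREFIX:
PARSED_LINKS_PATH (url -> canonical url, '' on failure) and KEYWORDS_PATH
(canonical url -> keywords, users, published time, title).
"""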
import os
from collections import defaultdict
from multiprocessing import Pool

from newspaper import Article
import newspaper.nlp as nlp
from timeoutcontext import timeout

from helpers import loadj, writej
from links import get_user_links
from crawler import PREFIX

NUM_WORKERS = 1
KEYWORDS_PATH = PREFIX + 'links_keywords.json'
PARSED_LINKS_PATH = PREFIX + 'parsed_links.json'
USER_IDS = loadj(PREFIX + 'user_ids')
def gen_keywords():
    if os.path.exists(KEYWORDS_PATH):
        keywords = load_keywords()
    else:
        # defaultdict so new canonical links can be populated key by key below.
        keywords = defaultdict(dict)
    # Keep a cache of already-parsed links, storing each link's matched
    # canonical link so that a user can still be added to the user set of an
    # already-parsed link (see the sketch of both cache files after this
    # function).
    if os.path.exists(PARSED_LINKS_PATH):
        parsed_links = loadj(PARSED_LINKS_PATH)
    else:
        parsed_links = {}
    graph_names = [n['id'] for n in loadj('website/outgroup.json')['nodes']]
    user_ids = [USER_IDS[name] for name in graph_names]
    user_links = get_user_links(user_ids)
    all_links = []
    for user_name, links in user_links.items():
        for l in links:
            # Don't process the link if we already did.
            if l not in parsed_links:
                all_links.append((user_name, l))
                parsed_links[l] = ''
            elif parsed_links[l] != '' and parsed_links[l] in keywords:
                keywords[parsed_links[l]]['users'].add(user_name)
    print(f'{len(all_links)} links to parse...')
    with Pool(NUM_WORKERS) as p:
        kw_tuples = p.starmap(get_keywords_pmap, all_links)
    for user_name, c_link, l, kws, p_time, title in kw_tuples:
        if c_link is not None:
            parsed_links[l] = c_link
            keywords[c_link]['kws'] = kws
            keywords[c_link]['time'] = p_time
            if 'users' not in keywords[c_link]:
                keywords[c_link]['users'] = set()
            keywords[c_link]['users'].add(user_name)
            keywords[c_link]['title'] = title
        else:
            parsed_links[l] = ''
    # Make the keywords dict JSON-serializable (sets -> lists).
    for c_link in keywords:
        keywords[c_link]['users'] = list(keywords[c_link]['users'])
        keywords[c_link]['kws'] = list(keywords[c_link]['kws'])
    writej(keywords, KEYWORDS_PATH)
    writej(parsed_links, PARSED_LINKS_PATH)
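# Sketch of the two caches written by gen_keywords(), inferred from the code
# above. The example values are illustrative only; actual contents depend on
# the crawler output.
#
#   parsed_links.json
#       {"https://t.co/abc": "https://example.com/story", "https://t.co/bad": ""}
#       original url -> canonical url, or '' if parsing failed or is pending.
#
#   links_keywords.json
#       {"https://example.com/story": {"kws": ["election", "poll"],
#                                      "users": ["some_user"],
#                                      "time": "2020-01-01T00:00:00Z",
#                                      "title": "Story title"}}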
def get_keywords_pmap(user_name, url):
    # Returns a tuple of Nones as a sentinel when the article times out or
    # yields no keywords; gen_keywords() skips those entries.
    try:
        link_info = get_article_info(url)
    except TimeoutError:
        return (None, None, None, None, None, None)
    if len(link_info['keywords']) > 0:
        return (user_name, link_info['c_link'], url, link_info['keywords'],
                link_info['published_time'], link_info['title'])
    else:
        return (None, None, None, None, None, None)
@timeout(3)
def get_article_info(url):
    try:
        a = Article(url, fetch_images=False)
        a.download()
        a.parse()
        # Skip a.nlp() for efficiency; pull keywords from the text and title
        # directly with newspaper's nlp helpers.
        text_keyws = list(nlp.keywords(a.text).keys())
        title_keyws = list(nlp.keywords(a.title).keys())
        keyws = list(set(title_keyws + text_keyws))
        if 'published_time' in a.meta_data['article']:
            published_time = a.meta_data['article']['published_time']
        else:
            published_time = ''
        return {'keywords': keyws, 'c_link': a.canonical_link,
                'published_time': published_time, 'title': a.title}
    except Exception:
        # `a` may be unbound or only partially populated here, so don't read
        # from it; callers ignore entries with no keywords anyway.
        return {'keywords': [], 'c_link': '', 'published_time': '', 'title': ''}
def load_keywords():
    kws = loadj(KEYWORDS_PATH)
    keywords = defaultdict(dict)
    for l, v in kws.items():
        keywords[l]['users'] = set(v['users'])
        keywords[l]['kws'] = frozenset(v['kws'])
        keywords[l]['title'] = v['title']
    return keywords
def split_words(s):
    words = nlp.split_words(s)
    return [w for w in words if w not in nlp.stopwords]
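# split_words() is not called anywhere in this module; it appears to be a small
# helper kept here for code that imports articles.split_words (an assumption —
# no caller is visible in this file).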
if __name__ == "__main__":
    gen_keywords()
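# Usage note (an assumption based on the imports above, not documented in this
# file): the files referenced via crawler.PREFIX — notably PREFIX + 'user_ids'
# — and website/outgroup.json must already exist, so the script is presumably
# run after the crawler step, e.g.:
#
#   python articles.py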