todays-zaman.recipe
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
www.todayszaman.com
'''
import re
from urlparse import urljoin

from calibre.web.feeds.recipes import BasicNewsRecipe

class TodaysZaman(BasicNewsRecipe):
    title = u'Todays Zaman'
    __author__ = u'spswerling'
    description = 'English-language edition of the Turkish daily "Zaman"'
    max_articles_per_feed = 100
    encoding = 'utf-8'
    category = 'news'
    language = 'en_TR'
    publication_type = 'newspaper'
    cover_img_url = 'http://medya.todayszaman.com/todayszaman/images/logo/todays_yenilogo.bmp'  # yep, bmp
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # On the Kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    oldest_article = 1.5
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (Kindle Touch screen: 600x800)
    useHighResImages = False
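    # Per BasicNewsRecipe, compress_news_images_max_size is a per-image
    # budget in KB and scale_news_images is a (width, height) pixel bound.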
    sections = [
        (u'Columnists', u'columnists'),
        (u'Opinion', u'op-ed'),
        (u'World', u'world'),
        (u'National', u'national'),
        (u'Diplomacy', u'diplomacy'),
        (u'Business', u'business'),
    ]
    # util for creating remove_tags and keep_only_tags style regex matchers
    def tag_matcher(elt, attr, pattern):
        return dict(name=elt, attrs={attr: re.compile(pattern, re.IGNORECASE)})

    keep_only_tags = [
        tag_matcher('div', 'class', '^pageNewsDetailContainer$'),
        tag_matcher('div', 'class', '^pageColumnistDetailContainer$'),
    ]

    remove_tags = [
        tag_matcher('div', 'class', 'DetailKeyword'),
        tag_matcher('div', 'class', 'MainContentSocial'),
        tag_matcher('div', 'class', 'SocialNetwork'),
        tag_matcher('div', 'class', 'DetailLeftOther'),
        tag_matcher('div', 'class', 'RelatedNews'),
        tag_matcher('div', 'class', '^topMenuWrapper$'),
        tag_matcher('div', 'class', '^logo$'),
        tag_matcher('a', 'class', 'cf_email'),
    ]

    articles = {}
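    # parse_index() builds the structure calibre expects for an issue: a list
    # of (section title, list of article dicts) tuples. parse_section() fills
    # self.articles (section title -> article dicts) as a side effect first.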
    def parse_index(self):
        for (sect_title, sect_uri) in self.sections:
            self.parse_section(sect_title, sect_uri)
        ans = []
        for k in self.articles:
            ans.append((k, self.articles[k]))
        return ans
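    # Fetch one section listing page and pass every link found in the two
    # known listing layouts on to process_link() for title extraction.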
    def parse_section(self, sect_title, sect_uri):
        url = 'http://www.todayszaman.com/' + sect_uri
        print 'Start section ' + sect_title + ', ' + url
        try:
            soup = self.index_to_soup(url)
        except:
            return
        # Find each article
        for div in soup.findAll('div'):
            div_class = div.get('class')
            if div_class:
                if div_class in ['pageColumnistsMainContent',
                                 'pageCategoryContainer']:
                    for link in div.findAll('a', href=True):
                        self.process_link(sect_title, div_class, link)
        print 'Finished section: ' + sect_title
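    # Work out whether a listing link is a real article and where its title
    # lives; the markup differs between the columnist and category layouts.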
    def process_link(self, section_title, layout, link):
        def p(s):
            print '[PROCESS LINK] ' + s[0:80]
        href = link['href']
        full_href = urljoin('http://www.todayszaman.com/', href)
        next_sib = link.nextSibling
        prev_sib = link.previousSibling
        child_h3 = link.find('h3')
        child_h2 = link.find('h2')
        link_text = self.tag_to_string(link).strip()
        title_node = None
        if layout in ['pageColumnistsMainContent']:
            if child_h2:
                title_node = child_h2
            else:
                return
        elif layout in ['pageCategoryContainer']:
            top_title = link.find(attrs={'class': 'pageCategoryTopTitle'})
            if top_title:
                title_node = top_title
            elif (not link_text) and (next_sib and next_sib.find('h4')):
                title_node = next_sib.find('h4')
            elif (not link_text) and (next_sib and next_sib.find('h3')):
                title_node = next_sib.find('h3')
            elif link_text:
                title_node = link
        if title_node:
            title = self.tag_to_string(title_node)
            self.queue_article_link(section_title, full_href, title)
    def queue_article_link(self, section, url, title):
        if section not in self.articles:
            self.articles[section] = []
        self.articles[section].append(
            dict(title=title,
                 url=url,
                 date='',
                 description='',
                 author='',
                 content=''))
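    # Called by calibre once per downloaded article; prepend the author and
    # publication date to the title and grab a short text summary if present.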
    def populate_article_metadata(self, article, soup, first):
        def p(s):
            print '[POPULATE METADATA] ' + s[0:80]
        tnode = soup.find('title')
        if tnode:
            tstring = self.tag_to_string(tnode)
            if ' - ' in tstring:
                author = tstring.split('-')[0].strip()
                if author:
                    article.author = author
                    article.title = author + ' - ' + article.title.strip()
                    p('Add author to title: ' + author)
        # known matches: pageNewsDetailDate, pageColumnistDetailLeftDate
        regex = re.compile('(DetailDate|DetailLeftDate)$', re.IGNORECASE)
        date_node = soup.find('div', {'class': regex})
        if date_node:
            date = str(self.tag_to_string(date_node)).split('/')[0]
            date = ','.join(date.split(',')[:2]).strip()
            article.title = date + ' - ' + article.title.strip()
            article.date = date
            p('Add date to title: ' + date)
        strong = soup.find('strong')
        if strong:
            article.text_summary = self.tag_to_string(strong)
            p('Summary: ' + article.text_summary)
    def _dbg_soup_node(self, node):
        s = ' cls: ' + str(node.get('class')).strip() + \
            ' txt: ' + self.tag_to_string(node).strip()
        return s
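# To try the recipe from the command line, something along the lines of
#   ebook-convert todays-zaman.recipe .epub --test -vv
# should work (see the calibre docs on writing recipes); exact flags may vary.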