todays-zaman.recipe
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
www.todayszaman.com
'''
import re
from urlparse import urljoin

from calibre.web.feeds.recipes import BasicNewsRecipe

class TodaysZaman(BasicNewsRecipe):
    title = u'Todays Zaman'
    __author__ = u'spswerling'
    description = 'English-language edition of the Turkish daily "Zaman"'
    max_articles_per_feed = 100
    encoding = 'utf-8'
    category = 'news'
    language = 'en_TR'
    publication_type = 'newspaper'
    cover_img_url = 'http://medya.todayszaman.com/todayszaman/images/logo/todays_yenilogo.bmp'  # yep, bmp
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # On the Kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    oldest_article = 1.5
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (Kindle Touch screen: 600x800)
    useHighResImages = False
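    # Per BasicNewsRecipe, compress_news_images_max_size is a per-image
    # budget in KB and scale_news_images is a (width, height) pixel bound.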
    sections = [
        (u'Columnists', u'columnists'),
        (u'Opinion', u'op-ed'),
        (u'World', u'world'),
        (u'National', u'national'),
        (u'Diplomacy', u'diplomacy'),
        (u'Business', u'business'),
    ]
    # util for creating remove_tags and keep_only_tags style regex matchers
    def tag_matcher(elt, attr, pattern):
        return dict(name=elt, attrs={attr: re.compile(pattern, re.IGNORECASE)})

    keep_only_tags = [
        tag_matcher('div', 'class', '^pageNewsDetailContainer$'),
        tag_matcher('div', 'class', '^pageColumnistDetailContainer$'),
    ]

    remove_tags = [
        tag_matcher('div', 'class', 'DetailKeyword'),
        tag_matcher('div', 'class', 'MainContentSocial'),
        tag_matcher('div', 'class', 'SocialNetwork'),
        tag_matcher('div', 'class', 'DetailLeftOther'),
        tag_matcher('div', 'class', 'RelatedNews'),
        tag_matcher('div', 'class', '^topMenuWrapper$'),
        tag_matcher('div', 'class', '^logo$'),
        tag_matcher('a', 'class', 'cf_email'),
    ]

    articles = {}
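    # parse_index() builds the structure calibre expects for an issue: a list
    # of (section title, list of article dicts) tuples. parse_section() fills
    # self.articles (section title -> article dicts) as a side effect first.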
    def parse_index(self):
        for (sect_title, sect_uri) in self.sections:
            self.parse_section(sect_title, sect_uri)
        ans = []
        for k in self.articles:
            ans.append((k, self.articles[k]))
        return ans
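    # Fetch one section listing page and pass every link found in the two
    # known listing layouts on to process_link() for title extraction.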
    def parse_section(self, sect_title, sect_uri):
        url = 'http://www.todayszaman.com/' + sect_uri
        print 'Start section ' + sect_title + ', ' + url
        try:
            soup = self.index_to_soup(url)
        except:
            return
        # Find each article
        for div in soup.findAll('div'):
            div_class = div.get('class')
            if div_class:
                if div_class in ['pageColumnistsMainContent',
                                 'pageCategoryContainer']:
                    for link in div.findAll('a', href=True):
                        self.process_link(sect_title, div_class, link)
        print 'Finished section: ' + sect_title
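    # Work out whether a listing link is a real article and where its title
    # lives; the markup differs between the columnist and category layouts.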
    def process_link(self, section_title, layout, link):
        def p(s):
            print '[PROCESS LINK] ' + s[0:80]
        href = link['href']
        full_href = urljoin('http://www.todayszaman.com/', href)
        next_sib = link.nextSibling
        prev_sib = link.previousSibling
        child_h3 = link.find('h3')
        child_h2 = link.find('h2')
        link_text = self.tag_to_string(link).strip()
        title_node = None
        if layout in ['pageColumnistsMainContent']:
            if child_h2:
                title_node = child_h2
            else:
                return
        elif layout in ['pageCategoryContainer']:
            top_title = link.find(attrs={'class': 'pageCategoryTopTitle'})
            if top_title:
                title_node = top_title
            elif (not link_text) and (next_sib and next_sib.find('h4')):
                title_node = next_sib.find('h4')
            elif (not link_text) and (next_sib and next_sib.find('h3')):
                title_node = next_sib.find('h3')
            elif link_text:
                title_node = link
        if title_node:
            title = self.tag_to_string(title_node)
            self.queue_article_link(section_title, full_href, title)
    def queue_article_link(self, section, url, title):
        if section not in self.articles:
            self.articles[section] = []
        self.articles[section].append(
            dict(title=title,
                 url=url,
                 date='',
                 description='',
                 author='',
                 content=''))
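    # Called by calibre once per downloaded article; prepend the author and
    # publication date to the title and grab a short text summary if present.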
    def populate_article_metadata(self, article, soup, first):
        def p(s):
            print '[POPULATE METADATA] ' + s[0:80]
        tnode = soup.find('title')
        if tnode:
            tstring = self.tag_to_string(tnode)
            if ' - ' in tstring:
                author = tstring.split('-')[0].strip()
                if author:
                    article.author = author
                    article.title = author + ' - ' + article.title.strip()
                    p('Add author to title: ' + author)
        # known matches: pageNewsDetailDate, pageColumnistDetailLeftDate
        regex = re.compile('(DetailDate|DetailLeftDate)$', re.IGNORECASE)
        date_node = soup.find('div', {'class': regex})
        if date_node:
            date = str(self.tag_to_string(date_node)).split('/')[0]
            date = ','.join(date.split(',')[:2]).strip()
            article.title = date + ' - ' + article.title.strip()
            article.date = date
            p('Add date to title: ' + date)
        strong = soup.find('strong')
        if strong:
            article.text_summary = self.tag_to_string(strong)
            p('Summary: ' + article.text_summary)
    def _dbg_soup_node(self, node):
        s = ' cls: ' + str(node.get('class')).strip() + \
            ' txt: ' + self.tag_to_string(node).strip()
        return s
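# To try the recipe from the command line, something along the lines of
#   ebook-convert todays-zaman.recipe .epub --test -vv
# should work (see the calibre docs on writing recipes); exact flags may vary.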