ted-json2json.py

#!/usr/bin/env python
# -*- coding: utf8 -*-
# ted-json2json.py - Converts the JSON generated by ted-scrape.py into the
# row-oriented JSON file ted-talks-cooked.json (some cells hold HTML snippets).
# Copyright (C) 2012 Gabriel Rodríguez Alberich <chewie@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import datetime
import json
import re
import time


# Patterns applied to subtitled download URLs. The first extracts everything
# between "http://download.ted.com/talks/" and the last "-<suffix>.mp4"; the
# second extracts that "<suffix>" (used as the language entry of the row).
_SUBTITLED_TALK_RE = re.compile(r"http://download.ted.com/talks/(.*)-.+\.mp4.*")
_SUBTITLED_LANG_RE = re.compile(r".*-(.+)\.mp4.*")


def _download_links(talk):
    """Return the "<br>"-joined HTML anchors for every available download.

    The native downloads come first, followed by the audio download (labelled
    "audio"). Entries whose URL is missing or empty are left out, and anything
    from "?apikey" onwards is stripped from each URL.
    """
    # list(...) so the concatenation works with Python 3 dict views as well.
    downloads = list(talk['nativeDownloads'].items())
    downloads.append(("audio", talk.get('audioDownload')))
    return "<br>".join(
        '<a href="%s">%s</a>' % (url.split("?apikey")[0], quality)
        for quality, url in downloads
        if url)


def _subtitled_cell(talk):
    """Return ``[talk_name, [language, ...]]`` for the subtitled downloads.

    Returns an empty string when the talk has no subtitled downloads or when
    the scraper stored an 'error' entry instead. For every language the
    'high' URL is preferred, falling back to 'low'.
    """
    subtitled = talk['subtitledDownloads']
    if not subtitled or 'error' in subtitled:
        return ""

    urls = [entry.get('high') or entry['low'] for entry in subtitled.values()]
    # The talk name is taken from the first entry only.
    talk_name = _SUBTITLED_TALK_RE.sub(r"\g<1>", urls[0])
    languages = [_SUBTITLED_LANG_RE.sub(r"\g<1>", url) for url in urls]
    return [talk_name, languages]


def _format_date(talk, field):
    """Return ``talk[field]`` (a timestamp) as YYYY-MM-DD, or "" if absent.

    The timestamp is converted with time.localtime(), so the resulting date
    depends on the timezone of the machine running the script.
    """
    if field not in talk:
        return ""
    return time.strftime('%Y-%m-%d', time.localtime(talk[field]))


def _build_row(talk):
    """Build the output row (a list of cells) for a single talk.

    Cell order: thumbnail file name, title link, speaker, download links,
    subtitled downloads, summary, duration, filmed date, published date,
    tags, event.
    """
    # NOTE(review): values are interpolated into the markup as-is, without
    # HTML escaping.
    # Text is kept as unicode all the way through: json.dump() takes care of
    # the encoding, and encoded bytes cannot be serialized on Python 3.
    return [
        talk['thumb'].split("/")[-1],
        '<a href="%s">%s</a>' % (talk['canonical'], talk['title']),
        talk['speaker'],
        _download_links(talk),
        _subtitled_cell(talk),
        talk.get('summary', ''),
        str(datetime.timedelta(seconds=talk['duration'])),
        _format_date(talk, 'filmed'),
        _format_date(talk, 'published'),
        talk.get('tags', ''),
        talk.get('event', ''),
    ]


def main():
    """Convert ted-scrape.json into ted-talks-cooked.json.

    Reads the list of talks from "ted-scrape.json" in the current directory
    and writes ``{"data": rows}`` to "ted-talks-cooked.json", one row per
    talk. Talks without native downloads are skipped.
    """
    with open("ted-scrape.json") as f:
        talks = json.load(f)

    rows = [_build_row(talk) for talk in talks if talk['nativeDownloads']]

    with open('ted-talks-cooked.json', 'w') as f:
        json.dump({'data': rows}, f, indent=2)

# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()