epub.py

#!/usr/bin/env python
'''
python/curses epub reader. Requires BeautifulSoup

Keyboard commands:
    Esc/q          - quit
    Tab/Left/Right - toggle between TOC and chapter views
    TOC view:
        Up         - up a line
        Down       - down a line
        PgUp       - up a page
        PgDown     - down a page
    Chapter view:
        Up         - up a page
        Down       - down a page
        PgUp       - up a line
        PgDown     - down a line
        i          - open images on page in web browser
'''

import curses.wrapper
import curses.ascii
import formatter
import htmllib
import locale
import os
import StringIO
import re
import tempfile
import zipfile

from bs4 import BeautifulSoup

try:
    from fabulous import image
    import Pillow
except ImportError:
    images = False
else:
    images = True

locale.setlocale(locale.LC_ALL, 'en_US.utf-8')

basedir = ''

ESCAPE_KEYS = [ord('q'), curses.ascii.ESC]
TOC_DOWN_LINE_KEYS = [curses.KEY_DOWN]
TOC_UP_LINE_KEYS = [curses.KEY_UP]
TOC_DOWN_PAGE_KEYS = [curses.KEY_NPAGE]
TOC_UP_PAGE_KEYS = [curses.KEY_PPAGE]
CHAPTER_DOWN_PAGE_KEYS = [curses.KEY_DOWN]
CHAPTER_UP_PAGE_KEYS = [curses.KEY_UP]
CHAPTER_DOWN_LINE_KEYS = [curses.KEY_NPAGE]
CHAPTER_UP_LINE_KEYS = [curses.KEY_PPAGE]
CHAPTERTOCSWITCH_KEYS = [curses.ascii.HT, curses.KEY_RIGHT, curses.KEY_LEFT]


def run(screen, program, *args):
    curses.nocbreak()
    screen.keypad(0)
    curses.echo()
    pid = os.fork()
    if not pid:
        os.execvp(program, (program,) + args)
    os.wait()[0]
    curses.noecho()
    screen.keypad(1)
    curses.cbreak()


def open_image(screen, name, s):
    ''' show images with PIL and fabulous '''
    if not images:
        screen.addstr(0, 0, "missing PIL or fabulous", curses.A_REVERSE)
        return

    ext = os.path.splitext(name)[1]

    screen.erase()
    screen.refresh()
    curses.setsyx(0, 0)
    image_file = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
    image_file.write(s)
    image_file.close()
    try:
        print image.Image(image_file.name)
    except:
        print image_file.name
    finally:
        os.unlink(image_file.name)


def textify(html_snippet, img_size=(80, 45), maxcol=72):
    ''' text dump of html '''
    class Formatter(formatter.AbstractFormatter):
        pass

    class Parser(htmllib.HTMLParser):

        def anchor_end(self):
            self.anchor = None

        def handle_image(self, source, alt, ismap, alight, width, height):
            global basedir
            self.handle_data(
                '[img="{0}{1}" "{2}"]'.format(basedir, source, alt)
            )

    class Writer(formatter.DumbWriter):

        def __init__(self, fl, maxcol=72):
            formatter.DumbWriter.__init__(self, fl)
            self.maxcol = maxcol

        def send_label_data(self, data):
            self.send_flowing_data(data)
            self.send_flowing_data(' ')

    o = StringIO.StringIO()
    p = Parser(Formatter(Writer(o, maxcol)))
    p.feed(html_snippet)
    p.close()

    return o.getvalue()


def table_of_contents(fl):
    global basedir

    # find opf file
    soup = BeautifulSoup(fl.read('META-INF/container.xml'))
    opf = dict(soup.find('rootfile').attrs)['full-path']

    basedir = os.path.dirname(opf)
    if basedir:
        basedir = '{0}/'.format(basedir)

    soup = BeautifulSoup(fl.read(opf))

    # title
    yield (soup.find('dc:title').text, None)

    # all files, not in order
    x, ncx = {}, None
    for item in soup.find('manifest').findAll('item'):
        d = dict(item.attrs)
        x[d['id']] = '{0}{1}'.format(basedir, d['href'])
        if d['media-type'] == 'application/x-dtbncx+xml':
            ncx = '{0}{1}'.format(basedir, d['href'])

    # reading order, not all files
    y = []
    for item in soup.find('spine').findAll('itemref'):
        y.append(x[dict(item.attrs)['idref']])

    z = {}
    if ncx:
        # get titles from the toc
        soup = BeautifulSoup(fl.read(ncx))

        for navpoint in soup('navpoint'):
            k = navpoint.content.get('src', None)
            # strip off any anchor text
            k = k.split('#')[0]
            if k:
                z[k] = navpoint.navlabel.text

    # output
    for section in y:
        if section in z:
            yield (z[section].encode('utf-8'), section.encode('utf-8'))
        else:
            yield (u'', section.encode('utf-8').strip())


def list_chaps(screen, chaps, start, length):
    for i, (title, src) in enumerate(chaps[start:start + length]):
        try:
            if start == 0:
                screen.addstr(i, 0, '      {0}'.format(title), curses.A_BOLD)
            else:
                screen.clrtoeol()
                screen.addstr(i, 0, '{0:-5} {1}'.format(start, title.strip()))
        except:
            pass
        start += 1
    screen.refresh()
    return i


def check_epub(fl):
    if os.path.isfile(fl) and os.path.splitext(fl)[1].lower() == '.epub':
        return True


def dump_epub(fl, maxcol=float("+inf")):
    if not check_epub(fl):
        return
    fl = zipfile.ZipFile(fl, 'r')
    chaps = [i for i in table_of_contents(fl)]
    for title, src in chaps:
        print title
        print '-' * len(title)
        if src:
            soup = BeautifulSoup(fl.read(src))
            print textify(
                unicode(soup.find('body')).encode('utf-8'),
                maxcol=maxcol,
            )
        print '\n'


def curses_epub(screen, fl):
    if not check_epub(fl):
        return

    #curses.mousemask(curses.BUTTON1_CLICKED)

    fl = zipfile.ZipFile(fl, 'r')
    chaps = [i for i in table_of_contents(fl)]
    chaps_pos = [0 for i in chaps]
    start = 0
    cursor_row = 0

    # toc
    while True:
        curses.curs_set(1)
        maxy, maxx = screen.getmaxyx()

        if cursor_row >= maxy:
            cursor_row = maxy - 1

        len_chaps = list_chaps(screen, chaps, start, maxy)
        screen.move(cursor_row, 0)
        ch = screen.getch()

        if ch in ESCAPE_KEYS:
            return

        # up/down line
        if ch in TOC_DOWN_LINE_KEYS:
            if start < len(chaps) - maxy:
                start += 1
                screen.clear()
            elif cursor_row < maxy - 1 and cursor_row < len_chaps:
                cursor_row += 1
        elif ch in TOC_UP_LINE_KEYS:
            if start > 0:
                start -= 1
                screen.clear()
            elif cursor_row > 0:
                cursor_row -= 1

        # up/down page
        elif ch in TOC_DOWN_PAGE_KEYS:
            if start + maxy - 1 < len(chaps):
                start += maxy - 1
                if len_chaps < maxy:
                    start = len(chaps) - maxy
                screen.clear()
        elif ch in TOC_UP_PAGE_KEYS:
            if start > 0:
                start -= maxy - 1
                if start < 0:
                    start = 0
                screen.clear()

        # to chapter
        elif ch in CHAPTERTOCSWITCH_KEYS:
            if chaps[start + cursor_row][1]:
                html = fl.read(chaps[start + cursor_row][1])
                soup = BeautifulSoup(html)
                chap = textify(
                    unicode(soup.find('body')).encode('utf-8'),
                    img_size=screen.getmaxyx(),
                    maxcol=screen.getmaxyx()[1]
                ).split('\n')
            else:
                chap = ''
            screen.clear()
            curses.curs_set(0)

            # chapter
            while True:
                maxy, maxx = screen.getmaxyx()
                images = []
                for i, line in enumerate(chap[
                    chaps_pos[start + cursor_row]:
                    chaps_pos[start + cursor_row] + maxy
                ]):
                    try:
                        screen.addstr(i, 0, line)
                        mch = re.search('\[img="([^"]+)" "([^"]*)"\]', line)
                        if mch:
                            images.append(mch.group(1))
                    except:
                        pass
                screen.refresh()
                ch = screen.getch()

                # quit
                if ch in ESCAPE_KEYS:
                    return

                # to TOC
                if ch in CHAPTERTOCSWITCH_KEYS:
                    screen.clear()
                    break

                # up/down page
                elif ch in CHAPTER_DOWN_PAGE_KEYS:
                    if chaps_pos[start + cursor_row] + maxy - 1 < len(chap):
                        chaps_pos[start + cursor_row] += maxy - 1
                        screen.clear()
                elif ch in CHAPTER_UP_PAGE_KEYS:
                    if chaps_pos[start + cursor_row] > 0:
                        chaps_pos[start + cursor_row] -= maxy - 1
                        if chaps_pos[start + cursor_row] < 0:
                            chaps_pos[start + cursor_row] = 0
                        screen.clear()

                # up/down line
                elif ch in CHAPTER_DOWN_LINE_KEYS:
                    if chaps_pos[start + cursor_row] + maxy - 1 < len(chap):
                        chaps_pos[start + cursor_row] += 1
                        screen.clear()
                elif ch in CHAPTER_UP_LINE_KEYS:
                    if chaps_pos[start + cursor_row] > 0:
                        chaps_pos[start + cursor_row] -= 1
                        screen.clear()

                #elif ch in [curses.KEY_MOUSE]:
                #    id, x, y, z, bstate = curses.getmouse()
                #    line = screen.instr(y, 0)
                #    mch = re.search('\[img="([^"]+)" "([^"]*)"\]', line)
                #    if mch:
                #            img_fl = mch.group(1)

                else:
                    try:
                        if chr(ch) == 'i':
                            for img in images:
                                try:
                                    err = open_image(screen, img, fl.read(img))
                                except KeyError:
                                    err = 'image not found'
                                if err:
                                    screen.addstr(0, 0, err, curses.A_REVERSE)

                        # edit html
                        elif chr(ch) == 'e':

                            tmpfl = tempfile.NamedTemporaryFile(delete=False)
                            tmpfl.write(html)
                            tmpfl.close()
                            run(screen, 'vim', tmpfl.name)
                            with open(tmpfl.name) as changed:
                                new_html = changed.read()
                                os.unlink(tmpfl.name)
                                if new_html != html:
                                    pass
                                    # write to zipfile?

                            # go back to TOC
                            screen.clear()
                            break

                    except (ValueError, IndexError):
                        pass

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    parser.add_argument(
        '-d', '--dump',
        action='store_true',
        help='dump EPUB to text'
    )
    parser.add_argument(
        '-c', '--cols',
        action='store',
        type=int,
        default=float("+inf"),
        help='Number of columns to wrap; default is no wrapping.'
    )
    parser.add_argument('EPUB', help='view EPUB')
    args = parser.parse_args()

    if args.EPUB:
        if args.dump:
            dump_epub(args.EPUB, args.cols)
        else:
            try:
                curses.wrapper(curses_epub, args.EPUB)
            except KeyboardInterrupt:
                pass