Skip to content

Commit

Permalink
Parse <img> tags
Browse files Browse the repository at this point in the history
  • Loading branch information
mgeier committed Apr 21, 2020
1 parent 5dc172f commit afa0ab2
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 10 deletions.
36 changes: 33 additions & 3 deletions doc/markdown-cells.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,39 @@
"\n",
"Remote image: ![Python logo (remote)](https://www.python.org/static/img/python-logo-large.png)\n",
"\n",
" ![Python logo (remote)](https://www.python.org/static/img/python-logo-large.png)\n",
" ![Python logo (remote)](https://www.python.org/static/img/python-logo-large.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using the HTML `<img>` tag\n",
"\n",
"The aforementioned Markdown syntax for including images\n",
"doesn't allow specifying the image size.\n",
"\n",
"If you want to control the size of the included image,\n",
"you can use the HTML\n",
"[\\<img\\>](https://www.w3.org/TR/html52/semantics-embedded-content.html#the-img-element)\n",
"element with the `width` attribute like this:\n",
"\n",
"```html\n",
"<img src=\"images/notebook_icon.png\" alt=\"Jupyter notebook icon\" width=\"300\">\n",
"```\n",
"\n",
"<img src=\"images/notebook_icon.png\" alt=\"Jupyter notebook icon\" width=\"300\">\n",
"\n",
"In addition to the `src`, `alt`, `width` and `height` attributes,\n",
"you can also use the `class` attribute,\n",
"which is simply forwarded to the HTML output (and ignored in LaTeX output).\n",
"All other attributes are ignored."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### SVG support for LaTeX\n",
"\n",
"LaTeX doesn't support SVG images, but there are Sphinx extensions that can be used for automatically converting SVG images for inclusion in LaTeX output.\n",
Expand Down Expand Up @@ -443,8 +474,7 @@
"\n",
"```\n",
"[beginning of this section](#Links-to-Other-Notebooks)\n",
"```",
"\n",
"```\n",
"It's also possible to create a\n",
"[link to the beginning of the current page](#),\n",
"by simply using a `#` character:\n",
Expand Down
70 changes: 63 additions & 7 deletions src/nbsphinx.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import re
import subprocess
from urllib.parse import unquote
import uuid

import docutils
from docutils.parsers import rst
Expand Down Expand Up @@ -1245,6 +1246,45 @@ def reset(self):
self.cite = ''


class ImgParser(html.parser.HTMLParser):
    """Turn HTML <img> tags into raw RST blocks.

    After feeding an HTML fragment, ``obj`` is a pandoc ``RawInline``
    element containing an RST substitution reference, and ``definition``
    is the matching substitution definition (an ``image`` directive).
    If no usable ``<img>`` tag was seen, ``obj`` is an empty dict and
    ``definition`` is an empty string.

    """

    # HTML attributes forwarded as identically named directive options,
    # in the order in which they are emitted.
    _FORWARDED_ATTRS = 'class', 'alt', 'width', 'height'

    def handle_starttag(self, tag, attrs):
        self._check_img(tag, attrs)

    def handle_startendtag(self, tag, attrs):
        self._check_img(tag, attrs)

    def _check_img(self, tag, attrs):
        """Store substitution reference and definition for an <img> tag.

        Tags other than ``img`` and images without a usable ``src``
        are ignored.

        """
        if tag != 'img':
            return
        # NB: attrs is a list of pairs
        attrs = dict(attrs)
        # HTMLParser reports an attribute without a value (e.g. "<img src>")
        # as None; an empty string is no usable image path either.
        src = attrs.get('src')
        if not src:
            return
        img_path = nbconvert.filters.posix_path(src)
        lines = ['image:: ' + img_path]
        indent = ' ' * 4
        for name in self._FORWARDED_ATTRS:
            value = attrs.get(name)
            # Skip absent attributes and value-less ones (None), which
            # cannot be concatenated to the option string.
            if value is not None:
                lines.append(indent + ':' + name + ': ' + value)

        # A random name avoids clashes between substitution definitions.
        hex_id = uuid.uuid4().hex
        self.obj = {'t': 'RawInline', 'c': ['rst', '|' + hex_id + '|']}
        self.definition = '.. |' + hex_id + '| ' + '\n'.join(lines)

    def reset(self):
        super().reset()
        self.obj = {}
        # Always defined, so it can be read even if no <img> was found.
        self.definition = ''


def markdown2rst(text):
"""Convert a Markdown string to reST via pandoc.
Expand All @@ -1259,16 +1299,22 @@ def markdown2rst(text):
"""

def parse_html(obj):
def parse_citation(obj):
p = CitationParser()
p.feed(obj['c'][1])
p.close()
return p

def parse_img(obj):
p = ImgParser()
p.feed(obj['c'][1])
p.close()
return p

def object_hook(obj):
if object_hook.open_cite_tag:
if obj.get('t') == 'RawInline' and obj['c'][0] == 'html':
p = parse_html(obj)
p = parse_citation(obj)
if p.endtag == object_hook.open_cite_tag:
object_hook.open_cite_tag = ''
return {'t': 'Str', 'c': ''} # Object is replaced by empty string
Expand All @@ -1287,14 +1333,20 @@ def object_hook(obj):
obj = {'t': 'RawInline',
'c': ['rst', ':nbsphinx-math:`{}`'.format(obj['c'][1])]}
elif obj.get('t') == 'RawInline' and obj['c'][0] == 'html':
p = parse_html(obj)
p = parse_citation(obj)
if p.starttag:
object_hook.open_cite_tag = p.starttag
if p.cite:
obj = {'t': 'RawInline', 'c': ['rst', p.cite]}
if not p.starttag and not p.cite:
p = parse_img(obj)
if p.obj:
obj = p.obj
object_hook.image_definitions.append(p.definition)
return obj

object_hook.open_cite_tag = ''
object_hook.image_definitions = []

def filter_func(text):
json_data = json.loads(text, object_hook=object_hook)
Expand All @@ -1307,10 +1359,14 @@ def filter_func(text):
input_format += '-native_divs+raw_html'

rststring = pandoc(text, input_format, 'rst', filter_func=filter_func)
return re.sub(r'^\n( *)\x0e:nowrap:\x0f$',
r'\1:nowrap:',
rststring,
flags=re.MULTILINE)
rststring = re.sub(
r'^\n( *)\x0e:nowrap:\x0f$',
r'\1:nowrap:',
rststring,
flags=re.MULTILINE)
rststring += '\n\n'
rststring += '\n'.join(object_hook.image_definitions)
return rststring


def pandoc(source, fmt, to, filter_func=None):
Expand Down

0 comments on commit afa0ab2

Please sign in to comment.