summaryrefslogtreecommitdiff
path: root/isso/utils/html.py
blob: c0a20e40f4c5855bd6d7b96445271e4494dd4181 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# -*- encoding: utf-8 -*-

from __future__ import unicode_literals

import operator

from distutils.version import LooseVersion as Version
from xml.sax.saxutils import escape

import pkg_resources

import bleach

import misaka

from isso.compat import reduce


# Elements found in Sundown's HTML serializer [1] except for the <img> tag,
# because images are not generated anyways.
#
# [1] https://github.com/vmg/sundown/blob/master/html/html.c
#
# NOTE(review): "tr" is not listed although the other table elements are, and
# "sup" is not listed although Markdown() enables "superscript" by default.
# Sanitizer.sanitize passes strip=True to bleach, so such tags are presumably
# removed from the output -- confirm whether that is intended.
ALLOWED_ELEMENTS = ["a", "p", "hr", "br", "ol", "ul", "li",
                    "pre", "code", "blockquote",
                    "del", "ins", "strong", "em",
                    "h1", "h2", "h3", "h4", "h5", "h6",
                    "table", "thead", "tbody", "th", "td"]

# href for <a> and align for <table>
ALLOWED_ATTRIBUTES = ["align", "href"]


class Sanitizer(object):
    """Whitelist-based HTML cleaner backed by ``bleach.clean``.

    The built-in ALLOWED_ELEMENTS and ALLOWED_ATTRIBUTES lists are extended
    by the `elements` and `attributes` lists passed to the constructor.
    """

    def __init__(self, elements, attributes):
        # list concatenation builds new lists, the module-level defaults
        # are left untouched
        self.attributes = ALLOWED_ATTRIBUTES + attributes
        self.elements = ALLOWED_ELEMENTS + elements

    def sanitize(self, text):
        """Return `text` cleaned by bleach using the configured whitelists.

        ``strip=True`` is passed to bleach, together with the allowed tags
        and attributes of this instance.
        """
        whitelist = dict(tags=self.elements, attributes=self.attributes)
        return bleach.clean(text, strip=True, **whitelist)


def Markdown(extensions=("strikethrough", "superscript", "autolink")):
    """Build a function that renders Markdown text to HTML using misaka.

    Each name in `extensions` is upper-cased and looked up on the misaka
    module as ``EXT_<NAME>``; the resulting flags are combined into one
    bitmask. A name without a matching attribute raises AttributeError from
    that lookup.

    The returned function takes the Markdown source and returns the rendered
    HTML with trailing newlines removed. Output that neither starts with
    ``<p>`` nor ends with ``</p>`` is wrapped in a ``<p>`` element.
    """

    # Flags must be merged with bitwise OR. XOR toggles bits, so an extension
    # that is listed twice would silently switch itself off again.
    flags = reduce(operator.or_, (
        getattr(misaka, 'EXT_' + ext.upper()) for ext in extensions), 0)
    md = misaka.Markdown(Unofficial(), extensions=flags)

    def inner(text):
        rv = md.render(text).rstrip("\n")
        if rv.startswith("<p>") or rv.endswith("</p>"):
            return rv
        return "<p>" + rv + "</p>"

    return inner


class Unofficial(misaka.HtmlRenderer):
    """A few modifications to process "common" Markdown.

    For instance, fenced code blocks (~~~ or ```) are just wrapped in <code>
    which does not preserve line breaks. If a language is given, it is added
    to <code class="$lang">, compatible with Highlight.js.
    """

    def block_code(self, text, lang):
        """Return `text` wrapped in ``<pre><code>`` followed by a newline.

        If `lang` is truthy it is emitted as the ``class`` attribute of the
        ``<code>`` element. Both values are HTML-escaped before they are
        interpolated, so code containing ``<``, ``>`` or ``&`` is shown
        literally instead of being interpreted as markup, and a quote in
        `lang` cannot terminate the attribute value.
        """
        # NOTE(review): assumes misaka hands the unescaped block content to
        # renderer overrides -- confirm against the misaka version in use.
        attr = ''
        if lang:
            # escape() only handles &, < and > by default; the double quote
            # is added because the value sits inside a quoted attribute
            attr = ' class="{0}"'.format(escape(lang, {'"': '&quot;'}))
        return "<pre><code{1}>{0}</code></pre>\n".format(escape(text), attr)


class Markup(object):
    """Render comment text to sanitized HTML.

    The pipeline is assembled once from `conf`: a Markdown renderer built
    from the "options" list and a Sanitizer built from the "allowed-elements"
    and "allowed-attributes" lists.
    """

    def __init__(self, conf):

        to_html = Markdown(conf.getlist("options"))
        cleaner = Sanitizer(
            conf.getlist("allowed-elements"),
            conf.getlist("allowed-attributes"))

        def pipeline(text):
            # render Markdown first, then sanitize the generated HTML
            return cleaner.sanitize(to_html(text))

        self._render = pipeline

    def render(self, text):
        """Return `text` passed through the render pipeline of this instance."""
        return self._render(text)