diff options
Diffstat (limited to 'isso/utils/html.py')
-rw-r--r-- | isso/utils/html.py | 60 |
1 files changed, 20 insertions, 40 deletions
diff --git a/isso/utils/html.py b/isso/utils/html.py index 294b8d4..c0a20e4 100644 --- a/isso/utils/html.py +++ b/isso/utils/html.py @@ -7,56 +7,36 @@ import pkg_resources from distutils.version import LooseVersion as Version -HTML5LIB_VERSION = Version(pkg_resources.get_distribution("html5lib").version) -HTML5LIB_SIMPLETREE = Version("0.95") - from isso.compat import reduce -import html5lib -from html5lib.sanitizer import HTMLSanitizer -from html5lib.serializer import HTMLSerializer +import bleach import misaka -def Sanitizer(elements, attributes): - - class Inner(HTMLSanitizer): - - # attributes found in Sundown's HTML serializer [1] except for <img> tag, - # because images are not generated anyways. - # - # [1] https://github.com/vmg/sundown/blob/master/html/html.c - allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li", - "pre", "code", "blockquote", - "del", "ins", "strong", "em", - "h1", "h2", "h3", "h4", "h5", "h6", - "table", "thead", "tbody", "th", "td"] + elements - - # href for <a> and align for <table> - allowed_attributes = ["align", "href"] + attributes - - # remove disallowed tokens from the output - def disallowed_token(self, token, token_type): - return None - - return Inner - +# attributes found in Sundown's HTML serializer [1] except for <img> tag, +# because images are not generated anyways. +# +# [1] https://github.com/vmg/sundown/blob/master/html/html.c +ALLOWED_ELEMENTS = ["a", "p", "hr", "br", "ol", "ul", "li", + "pre", "code", "blockquote", + "del", "ins", "strong", "em", + "h1", "h2", "h3", "h4", "h5", "h6", + "table", "thead", "tbody", "th", "td"] -def sanitize(tokenizer, document): +# href for <a> and align for <table> +ALLOWED_ATTRIBUTES = ["align", "href"] - parser = html5lib.HTMLParser(tokenizer=tokenizer) - domtree = parser.parseFragment(document) - if HTML5LIB_VERSION > HTML5LIB_SIMPLETREE: - builder = "etree" - else: - builder = "simpletree" +class Sanitizer(object): - stream = html5lib.treewalkers.getTreeWalker(builder)(domtree) - serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False) + def __init__(self, elements, attributes): + self.elements = ALLOWED_ELEMENTS + elements + self.attributes = ALLOWED_ATTRIBUTES + attributes - return serializer.render(stream) + def sanitize(self, text): + return bleach.clean(text, tags=self.elements, + attributes=self.attributes, strip=True) def Markdown(extensions=("strikethrough", "superscript", "autolink")): @@ -96,7 +76,7 @@ class Markup(object): conf.getlist("allowed-elements"), conf.getlist("allowed-attributes")) - self._render = lambda text: sanitize(sanitizer, parser(text)) + self._render = lambda text: sanitizer.sanitize(parser(text)) def render(self, text): return self._render(text) |