summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2016-04-10 07:39:36 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2016-04-10 07:39:36 -0700
commit773bbb8fc73a3b6598188dbae64a841eb6680b38 (patch)
treefb7ff99dd8595bc779aa1aabbb1a122abf2204e1 /src/Text/Pandoc
parentcb8b1c2655509544b04e31a29142c98fd391d9f9 (diff)
Markdown + HTML readers: be more forgiving about unescaped &.
We are now more forgiving about parsing invalid HTML with unescaped `&` as raw HTML. (Previously any unescaped `&` would cause pandoc not to recognize the string as raw HTML.) Closes #2410.
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs25
1 files changed, 15 insertions, 10 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index fb936cff7..8ee5da543 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -971,11 +971,20 @@ htmlTag :: Monad m
htmlTag f = try $ do
lookAhead (char '<')
inp <- getInput
- let (next : rest) = canonicalizeTags $ parseTagsOptions
- parseOptions{ optTagWarning = True } inp
+ let (next : _) = canonicalizeTags $ parseTagsOptions
+ parseOptions{ optTagWarning = False } inp
guard $ f next
+ let handleTag tagname = do
+ -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66>
+ -- should NOT be parsed as an HTML tag, see #2277
+ guard $ not ('.' `elem` tagname)
+ -- <https://example.org> should NOT be a tag either.
+ -- tagsoup will parse it as TagOpen "https:" [("example.org","")]
+ guard $ not (null tagname)
+ guard $ last tagname /= ':'
+ rendered <- manyTill anyChar (char '>')
+ return (next, rendered ++ ">")
case next of
- TagWarning _ -> fail "encountered TagWarning"
TagComment s
| "<!--" `isPrefixOf` inp -> do
count (length s + 4) anyChar
@@ -983,13 +992,9 @@ htmlTag f = try $ do
char '>'
return (next, "<!--" ++ s ++ "-->")
| otherwise -> fail "bogus comment mode, HTML5 parse error"
- _ -> do
- -- we get a TagWarning on things like
- -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66>
- -- which should NOT be parsed as an HTML tag, see #2277
- guard $ not $ hasTagWarning rest
- rendered <- manyTill anyChar (char '>')
- return (next, rendered ++ ">")
+ TagOpen tagname _attr -> handleTag tagname
+ TagClose tagname -> handleTag tagname
+ _ -> mzero
mkAttr :: [(String, String)] -> Attr
mkAttr attr = (attribsId, attribsClasses, attribsKV)