summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2017-10-23 17:29:32 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2017-10-23 17:29:32 -0700
commit896803b0d5d1f5d680d125eb75913025fa734190 (patch)
tree18c316bc936de98eaf75323609fadf1b4165c0cf
parent1a82ecbb6866a00689e3220d304a0fafd81358bb (diff)
HTML reader: `htmlTag` improvements.
We previously failed on cases where an attribute contained a `>` character. This patch fixes the bug. Closes #3989.
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs27
-rw-r--r--test/command/3989.md9
2 files changed, 28 insertions, 8 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 4cbc03089..a545f3f3d 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -1125,9 +1125,15 @@ htmlTag :: (HasReaderOptions st, Monad m)
htmlTag f = try $ do
lookAhead (char '<')
inp <- getInput
- let (next : _) = canonicalizeTags $ parseTagsOptions
- parseOptions{ optTagWarning = False } inp
- guard $ f next
+ let ts = canonicalizeTags $ parseTagsOptions
+ parseOptions{ optTagWarning = False
+ , optTagPosition = True }
+ (inp ++ " ") -- add space to ensure that
+ -- we get a TagPosition after the tag
+ (next, ln, col) <- case ts of
+ (TagPosition{} : next : TagPosition ln col : _)
+ | f next -> return (next, ln, col)
+ _ -> mzero
-- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66>
-- should NOT be parsed as an HTML tag, see #2277,
@@ -1138,6 +1144,11 @@ htmlTag f = try $ do
[] -> False
(c:cs) -> isLetter c && all isNameChar cs
+ let endAngle = try $ do char '>'
+ pos <- getPosition
+ guard $ (sourceLine pos == ln &&
+ sourceColumn pos >= col) ||
+ sourceLine pos > ln
let handleTag tagname = do
-- basic sanity check, since the parser is very forgiving
-- and finds tags in stuff like x<y)
@@ -1146,14 +1157,14 @@ htmlTag f = try $ do
-- <https://example.org> should NOT be a tag either.
-- tagsoup will parse it as TagOpen "https:" [("example.org","")]
guard $ last tagname /= ':'
- rendered <- manyTill anyChar (char '>')
- return (next, rendered <> ">")
+ char '<'
+ rendered <- manyTill anyChar endAngle
+ return (next, "<" ++ rendered ++ ">")
case next of
TagComment s
| "<!--" `isPrefixOf` inp -> do
- count (length s + 4) anyChar
- skipMany (satisfy (/='>'))
- char '>'
+ char '<'
+ manyTill anyChar endAngle
stripComments <- getOption readerStripComments
if stripComments
then return (next, "")
diff --git a/test/command/3989.md b/test/command/3989.md
new file mode 100644
index 000000000..643de9f5d
--- /dev/null
+++ b/test/command/3989.md
@@ -0,0 +1,9 @@
+```
+pandoc -f markdown -t native
+<span data-toggle="tooltip" data-placement="right" data-html="true"
+title="1st line of text <br> 2nd line of text">
+Hover over me
+</span>
+^D
+[Para [Span ("",[],[("data-toggle","tooltip"),("data-placement","right"),("data-html","true"),("title","1st line of text <br> 2nd line of text")]) [SoftBreak,Str "Hover",Space,Str "over",Space,Str "me",SoftBreak]]]
+```