summaryrefslogtreecommitdiff
path: root/src/Text
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2017-03-18 22:43:57 +0100
committerJohn MacFarlane <jgm@berkeley.edu>2017-03-18 22:43:57 +0100
commit87f99f3fdf0c372c2e5ded0112ff432aa0d7571f (patch)
tree4f1641d4273c17300b8cb6b8896c995db7c86ddb /src/Text
parent435221a9f36ac7d7ea2b1a618f992a45083bb445 (diff)
HTML reader: Better sanity checks on raw HTML.
This also affects the Markdown reader. Closes #3257.
Diffstat (limited to 'src/Text')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs23
1 files changed, 17 insertions, 6 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 0af369469..5251962f2 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -54,7 +54,7 @@ import Text.Pandoc.Walk
import qualified Data.Map as M
import Data.Maybe ( fromMaybe, isJust)
import Data.List ( intercalate, isInfixOf, isPrefixOf )
-import Data.Char ( isDigit )
+import Data.Char ( isDigit, isLetter, isAlphaNum )
import Control.Monad ( guard, mzero, void, unless )
import Control.Arrow ((***))
import Control.Applicative ( (<|>) )
@@ -1032,13 +1032,22 @@ htmlTag f = try $ do
let (next : _) = canonicalizeTags $ parseTagsOptions
parseOptions{ optTagWarning = False } inp
guard $ f next
+
+ -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66>
+ -- should NOT be parsed as an HTML tag, see #2277,
+ -- so we exclude . even though it's a valid character
+ -- in XML elemnet names
+ let isNameChar c = isAlphaNum c || c == ':' || c == '-' || c == '_'
+ let isName s = case s of
+ [] -> False
+ (c:cs) -> isLetter c && all isNameChar cs
+
let handleTag tagname = do
- -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66>
- -- should NOT be parsed as an HTML tag, see #2277
- guard $ not ('.' `elem` tagname)
+ -- basic sanity check, since the parser is very forgiving
+ -- and finds tags in stuff like x<y)
+ guard $ isName tagname
-- <https://example.org> should NOT be a tag either.
-- tagsoup will parse it as TagOpen "https:" [("example.org","")]
- guard $ not (null tagname)
guard $ last tagname /= ':'
rendered <- manyTill anyChar (char '>')
return (next, rendered ++ ">")
@@ -1050,7 +1059,9 @@ htmlTag f = try $ do
char '>'
return (next, "<!--" ++ s ++ "-->")
| otherwise -> fail "bogus comment mode, HTML5 parse error"
- TagOpen tagname _attr -> handleTag tagname
+ TagOpen tagname attr -> do
+ guard $ all (isName . fst) attr
+ handleTag tagname
TagClose tagname -> handleTag tagname
_ -> mzero