From 8624ed9bd3c38c1907070a3b7de244fd487976c4 Mon Sep 17 00:00:00 2001 From: fiddlosopher Date: Sat, 22 Mar 2008 20:41:56 +0000 Subject: The '--sanitize-html' option now examines URIs in markdown links and images, and in HTML href and src attributes. If the URI scheme is not on a whitelist of safe schemes, it is rejected. The main point is to prevent cross-site scripting attacks using 'javascript:' URIs. See http://www.mail-archive.com/markdown-discuss@six.pairlist.net/msg01186.html and http://ha.ckers.org/xss.html. Resolves Issue #62. git-svn-id: https://pandoc.googlecode.com/svn/trunk@1262 788f1e2b-df1e-0410-8736-df70ead52e1b --- README | 3 ++- Text/Pandoc/Readers/HTML.hs | 32 ++++++++++++++++++++++++-------- Text/Pandoc/Readers/Markdown.hs | 16 +++++++++++----- man/man1/html2markdown.1.md | 4 ---- man/man1/pandoc.1.md | 3 ++- 5 files changed, 39 insertions(+), 19 deletions(-) diff --git a/README b/README index e54821ba3..75d482333 100644 --- a/README +++ b/README @@ -356,7 +356,8 @@ For further documentation, see the `pandoc(1)` man page. `--sanitize-html` : sanitizes HTML (in markdown or HTML input) using a whitelist. Unsafe tags are replaced by HTML comments; unsafe attributes - are omitted. + are omitted. URIs in links and images are also checked against a + whitelist of URI schemes. `--dump-args` : is intended to make it easier to create wrapper scripts that use diff --git a/Text/Pandoc/Readers/HTML.hs b/Text/Pandoc/Readers/HTML.hs index 359ff3021..7bd76d983 100644 --- a/Text/Pandoc/Readers/HTML.hs +++ b/Text/Pandoc/Readers/HTML.hs @@ -37,7 +37,8 @@ module Text.Pandoc.Readers.HTML ( anyHtmlEndTag, htmlEndTag, extractTagType, - htmlBlockElement + htmlBlockElement, + unsanitaryURI ) where import Text.ParserCombinators.Parsec @@ -47,6 +48,7 @@ import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) import Data.Maybe ( fromMaybe ) import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf ) import Data.Char ( toLower, isAlphaNum ) +import Network.URI ( parseURIReference, URI (..) ) -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ParserState -- ^ Parser state @@ -110,17 +112,31 @@ sanitaryAttributes = ["abbr", "accept", "accept-charset", -- not on the sanitized tag list. unsanitaryTag tag = do st <- getState - if stateSanitizeHTML st && not (tag `elem` sanitaryTags) - then return True - else return False + return $ stateSanitizeHTML st && tag `notElem` sanitaryTags -- | returns @True@ if sanitization is specified and the specified attribute -- is not on the sanitized attribute list. -unsanitaryAttribute (attr, _, _) = do +unsanitaryAttribute (attr, val, _) = do st <- getState - if stateSanitizeHTML st && not (attr `elem` sanitaryAttributes) - then return True - else return False + return $ stateSanitizeHTML st && + (attr `notElem` sanitaryAttributes || + (attr `elem` ["href","src"] && unsanitaryURI val)) + +-- | Returns @True@ if the specified URI is potentially a security risk. +unsanitaryURI uri = + let safeURISchemes = [ "", "http", "https", "ftp", "mailto", "file", + "telnet", "gopher", "aaa", "aaas", "acap", "cap", "cid", + "crid", "dav", "dict", "dns", "fax", "go", "h323", "im", + "imap", "ldap", "mid", "news", "nfs", "nntp", "pop", + "pres", "sip", "sips", "snmp", "tel", "urn", "wais", + "xmpp", "z39.50r", "z39.50s", "aim", "callto", "cvs", + "ed2k", "feed", "fish", "gg", "irc", "ircs", "lastfm", + "ldaps", "magnet", "mms", "msnim", "notes", "rsync", + "secondlife", "skype", "ssh", "sftp", "smb", "sms", + "snews", "webcal", "ymsgr"] + in case parseURIReference uri of + Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes + Nothing -> True -- | Read blocks until end tag. blocksTilEnd tag = do diff --git a/Text/Pandoc/Readers/Markdown.hs b/Text/Pandoc/Readers/Markdown.hs index e6f09f97a..2dbf9e189 100644 --- a/Text/Pandoc/Readers/Markdown.hs +++ b/Text/Pandoc/Readers/Markdown.hs @@ -41,7 +41,7 @@ import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment ) import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag, anyHtmlInlineTag, anyHtmlTag, anyHtmlEndTag, htmlEndTag, extractTagType, - htmlBlockElement ) + htmlBlockElement, unsanitaryURI ) import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) import Text.ParserCombinators.Parsec @@ -921,7 +921,10 @@ linkTitle = try $ do link = try $ do label <- reference src <- source <|> referenceLink label - return $ Link label src + sanitize <- getState >>= return . stateSanitizeHTML + if sanitize && unsanitaryURI (fst src) + then fail "Unsanitary URI" + else return $ Link label src -- a link like [this][ref] or [this][] or [this] referenceLink label = do @@ -941,9 +944,12 @@ autoLink = try $ do then drop 7 src else src st <- getState - return $ if stateStrict st - then Link [Str src'] (src, "") - else Link [Code src'] (src, "") + let sanitize = stateSanitizeHTML st + if sanitize && unsanitaryURI src + then fail "Unsanitary URI" + else return $ if stateStrict st + then Link [Str src'] (src, "") + else Link [Code src'] (src, "") image = try $ do char '!' diff --git a/man/man1/html2markdown.1.md b/man/man1/html2markdown.1.md index 1db37cf47..905bdd0d0 100644 --- a/man/man1/html2markdown.1.md +++ b/man/man1/html2markdown.1.md @@ -51,10 +51,6 @@ a complete list. The following options are most relevant: \--no-wrap : Disable text wrapping in output. (Default is to wrap text.) -\--sanitize-html -: Sanitizes HTML using a whitelist. Unsafe tags are replaced by HTML - comments; unsafe attributes are omitted. - -H *FILE*, \--include-in-header=*FILE* : Include contents of *FILE* at the end of the header. Implies `-s`. diff --git a/man/man1/pandoc.1.md b/man/man1/pandoc.1.md index 5bf734d5a..e3ca8e591 100644 --- a/man/man1/pandoc.1.md +++ b/man/man1/pandoc.1.md @@ -128,7 +128,8 @@ to Pandoc. Or use `html2markdown`(1), a wrapper around `pandoc`. \--sanitize-html : Sanitizes HTML (in markdown or HTML input) using a whitelist. Unsafe tags are replaced by HTML comments; unsafe attributes - are omitted. + are omitted. URIs in links and images are also checked against a + whitelist of URI schemes. \--toc, \--table-of-contents : Include an automatically generated table of contents (HTML, markdown, -- cgit v1.2.3