diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 944 |
1 files changed, 573 insertions, 371 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index abe5f66ce..0e79f9ec3 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -1,7 +1,10 @@ -{-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses, -ViewPatterns#-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ViewPatterns #-} {- -Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2018 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,7 +23,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.HTML - Copyright : Copyright (C) 2006-2015 John MacFarlane + Copyright : Copyright (C) 2006-2018 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -34,66 +37,79 @@ module Text.Pandoc.Readers.HTML ( readHtml , htmlInBalanced , isInlineTag , isBlockTag + , NamedTag(..) , isTextTag , isCommentTag ) where +import Control.Applicative ((<|>)) +import Control.Arrow (first) +import Control.Monad (guard, mplus, msum, mzero, unless, void) +import Control.Monad.Except (throwError) +import Control.Monad.Reader (ReaderT, ask, asks, lift, local, runReaderT) +import Data.Char (isAlphaNum, isDigit, isLetter) +import Data.Default (Default (..), def) +import Data.Foldable (for_) +import Data.List (isPrefixOf) +import Data.List.Split (wordsBy, splitWhen) +import qualified Data.Map as M +import Data.Maybe (fromMaybe, isJust, isNothing) +import Data.Monoid (First (..), (<>)) +import qualified Data.Set as Set +import Data.Text (Text) +import qualified Data.Text as T +import Network.URI (URI, nonStrictRelativeTo, parseURIReference) import Text.HTML.TagSoup import Text.HTML.TagSoup.Match -import Text.Pandoc.Definition +import Text.Pandoc.Builder (Blocks, HasMeta (..), Inlines, trimInlines) import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..)) -import Text.Pandoc.Shared ( extractSpaces, renderTags', addMetaField - , escapeURI, safeRead, mapLeft ) -import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace) - , Extension (Ext_epub_html_exts, - Ext_native_divs, Ext_native_spans)) +import Text.Pandoc.Class (PandocMonad (..)) +import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps) +import Text.Pandoc.Definition +import Text.Pandoc.Error +import Text.Pandoc.Logging +import Text.Pandoc.Options ( + Extension (Ext_epub_html_exts, Ext_empty_paragraphs, Ext_native_divs, + Ext_native_spans, Ext_raw_html, Ext_line_blocks), + ReaderOptions (readerExtensions, readerStripComments), + extensionEnabled) import Text.Pandoc.Parsing hiding ((<|>)) +import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI, + extractSpaces, safeRead, underlineSpan) import Text.Pandoc.Walk -import qualified Data.Map as M -import Data.Maybe ( fromMaybe, isJust) -import Data.List ( intercalate, isInfixOf, isPrefixOf ) -import Data.Char ( isDigit ) -import Control.Monad ( guard, when, mzero, void, unless ) -import Control.Arrow ((***)) -import Control.Applicative ( (<|>) ) -import Data.Monoid (First (..)) -import Text.Printf (printf) -import Debug.Trace (trace) -import Text.TeXMath (readMathML, writeTeX) -import Data.Default (Default (..), def) -import Control.Monad.Reader (Reader,ask, asks, local, runReader) -import Network.URI (URI, parseURIReference, nonStrictRelativeTo) -import Text.Pandoc.Error -import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps) -import Data.Monoid ((<>)) import Text.Parsec.Error -import qualified Data.Set as Set +import Text.TeXMath (readMathML, writeTeX) -- | Convert HTML-formatted string to 'Pandoc' document. -readHtml :: ReaderOptions -- ^ Reader options - -> String -- ^ String to parse (assumes @'\n'@ line endings) - -> Either PandocError Pandoc -readHtml opts inp = - mapLeft (ParseFailure . getError) . flip runReader def $ - runParserT parseDoc - (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty) - "source" tags - where tags = stripPrefixes . canonicalizeTags $ - parseTagsOptions parseOptions{ optTagPosition = True } inp - parseDoc = do - blocks <- (fixPlains False) . mconcat <$> manyTill block eof - meta <- stateMeta . parserState <$> getState - bs' <- replaceNotes (B.toList blocks) - return $ Pandoc meta bs' - getError (errorMessages -> ms) = case ms of - [] -> "" - (m:_) -> messageString m - -replaceNotes :: [Block] -> TagParser [Block] +readHtml :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> Text -- ^ String to parse (assumes @'\n'@ line endings) + -> m Pandoc +readHtml opts inp = do + let tags = stripPrefixes . canonicalizeTags $ + parseTagsOptions parseOptions{ optTagPosition = True } + (crFilter inp) + parseDoc = do + blocks <- fixPlains False . mconcat <$> manyTill block eof + meta <- stateMeta . parserState <$> getState + bs' <- replaceNotes (B.toList blocks) + reportLogMessages + return $ Pandoc meta bs' + getError (errorMessages -> ms) = case ms of + [] -> "" + (m:_) -> messageString m + result <- flip runReaderT def $ + runParserT parseDoc + (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty []) + "source" tags + case result of + Right doc -> return doc + Left err -> throwError $ PandocParseError $ getError err + +replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] replaceNotes = walkM replaceNotes' -replaceNotes' :: Inline -> TagParser Inline +replaceNotes' :: PandocMonad m => Inline -> TagParser m Inline replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes where getNotes = noteTable <$> getState @@ -105,38 +121,46 @@ data HTMLState = noteTable :: [(String, Blocks)], baseHref :: Maybe URI, identifiers :: Set.Set String, - headerMap :: M.Map Inlines String + headerMap :: M.Map Inlines String, + logMessages :: [LogMessage] } data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext - , inChapter :: Bool -- ^ Set if in chapter section - , inPlain :: Bool -- ^ Set if in pPlain + , inChapter :: Bool -- ^ Set if in chapter section + , inPlain :: Bool -- ^ Set if in pPlain } -setInChapter :: HTMLParser s a -> HTMLParser s a +setInChapter :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a setInChapter = local (\s -> s {inChapter = True}) -setInPlain :: HTMLParser s a -> HTMLParser s a +setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a setInPlain = local (\s -> s {inPlain = True}) -type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal) +type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m) -type TagParser = HTMLParser [Tag String] +type TagParser m = HTMLParser m [Tag Text] -pBody :: TagParser Blocks +pHtml :: PandocMonad m => TagParser m Blocks +pHtml = try $ do + (TagOpen "html" attr) <- lookAhead pAnyTag + for_ (lookup "lang" attr) $ + updateState . B.setMeta "lang" . B.text . T.unpack + pInTags "html" block + +pBody :: PandocMonad m => TagParser m Blocks pBody = pInTags "body" block -pHead :: TagParser Blocks +pHead :: PandocMonad m => TagParser m Blocks pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag) where pTitle = pInTags "title" inline >>= setTitle . trimInlines - setTitle t = mempty <$ (updateState $ B.setMeta "title" t) + setTitle t = mempty <$ updateState (B.setMeta "title" t) pMetaTag = do - mt <- pSatisfy (~== TagOpen "meta" []) - let name = fromAttrib "name" mt + mt <- pSatisfy (matchTagOpen "meta" []) + let name = T.unpack $ fromAttrib "name" mt if null name then return mempty else do - let content = fromAttrib "content" mt + let content = T.unpack $ fromAttrib "content" mt updateState $ \s -> let ps = parserState s in s{ parserState = ps{ @@ -144,15 +168,13 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag (stateMeta ps) } } return mempty pBaseTag = do - bt <- pSatisfy (~== TagOpen "base" []) + bt <- pSatisfy (matchTagOpen "base" []) updateState $ \st -> st{ baseHref = - parseURIReference $ fromAttrib "href" bt } + parseURIReference $ T.unpack $ fromAttrib "href" bt } return mempty -block :: TagParser Blocks +block :: PandocMonad m => TagParser m Blocks block = do - tr <- getOption readerTrace - pos <- getPosition res <- choice [ eSection , eSwitch B.para block @@ -166,94 +188,107 @@ block = do , pList , pHrule , pTable + , pHtml , pHead , pBody + , pLineBlock , pDiv , pPlain + , pFigure , pRawHtmlBlock ] - when tr $ trace (printf "line %d: %s" (sourceLine pos) - (take 60 $ show $ B.toList res)) (return ()) + trace (take 60 $ show $ B.toList res) return res -namespaces :: [(String, TagParser Inlines)] +namespaces :: PandocMonad m => [(String, TagParser m Inlines)] namespaces = [(mathMLNamespace, pMath True)] mathMLNamespace :: String mathMLNamespace = "http://www.w3.org/1998/Math/MathML" -eSwitch :: Monoid a => (Inlines -> a) -> TagParser a -> TagParser a +eSwitch :: (PandocMonad m, Monoid a) + => (Inlines -> a) + -> TagParser m a + -> TagParser m a eSwitch constructor parser = try $ do guardEnabled Ext_epub_html_exts - pSatisfy (~== TagOpen "switch" []) + pSatisfy (matchTagOpen "switch" []) cases <- getFirst . mconcat <$> manyTill (First <$> (eCase <* skipMany pBlank) ) - (lookAhead $ try $ pSatisfy (~== TagOpen "default" [])) + (lookAhead $ try $ pSatisfy (matchTagOpen "default" [])) skipMany pBlank fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank) skipMany pBlank - pSatisfy (~== TagClose "switch") + pSatisfy (matchTagClose "switch") return $ maybe fallback constructor cases -eCase :: TagParser (Maybe Inlines) +eCase :: PandocMonad m => TagParser m (Maybe Inlines) eCase = do skipMany pBlank - TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" []) - case (flip lookup namespaces) =<< lookup "required-namespace" attr of - Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank)) - Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case")) + TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "case" []) + let attr = toStringAttr attr' + case flip lookup namespaces =<< lookup "required-namespace" attr of + Just p -> Just <$> pInTags "case" (skipMany pBlank *> p <* skipMany pBlank) + Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (matchTagClose "case")) -eFootnote :: TagParser () +eFootnote :: PandocMonad m => TagParser m () eFootnote = try $ do let notes = ["footnote", "rearnote"] guardEnabled Ext_epub_html_exts - (TagOpen tag attr) <- lookAhead $ pAnyTag - guard (maybe False (flip elem notes) (lookup "type" attr)) + (TagOpen tag attr') <- lookAhead pAnyTag + let attr = toStringAttr attr' + guard $ maybe False (`elem` notes) (lookup "type" attr) let ident = fromMaybe "" (lookup "id" attr) content <- pInTags tag block addNote ident content -addNote :: String -> Blocks -> TagParser () -addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)}) +addNote :: PandocMonad m => String -> Blocks -> TagParser m () +addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : noteTable s}) -eNoteref :: TagParser Inlines +eNoteref :: PandocMonad m => TagParser m Inlines eNoteref = try $ do guardEnabled Ext_epub_html_exts - TagOpen tag attr <- lookAhead $ pAnyTag - guard (maybe False (== "noteref") (lookup "type" attr)) + TagOpen tag attr' <- lookAhead pAnyTag + let attr = toStringAttr attr' + guard $ lookup "type" attr == Just "noteref" let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr) guard (not (null ident)) pInTags tag block return $ B.rawInline "noteref" ident -- Strip TOC if there is one, better to generate again -eTOC :: TagParser () +eTOC :: PandocMonad m => TagParser m () eTOC = try $ do guardEnabled Ext_epub_html_exts - (TagOpen tag attr) <- lookAhead $ pAnyTag - guard (maybe False (== "toc") (lookup "type" attr)) + (TagOpen tag attr) <- lookAhead pAnyTag + guard $ lookup "type" attr == Just "toc" void (pInTags tag block) -pList :: TagParser Blocks +pList :: PandocMonad m => TagParser m Blocks pList = pBulletList <|> pOrderedList <|> pDefinitionList -pBulletList :: TagParser Blocks +pBulletList :: PandocMonad m => TagParser m Blocks pBulletList = try $ do - pSatisfy (~== TagOpen "ul" []) + pSatisfy (matchTagOpen "ul" []) let nonItem = pSatisfy (\t -> not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && - not (t ~== TagClose "ul")) + not (matchTagClose "ul" t)) -- note: if they have an <ol> or <ul> not in scope of a <li>, -- treat it as a list item, though it's not valid xhtml... skipMany nonItem items <- manyTill (pListItem nonItem) (pCloses "ul") return $ B.bulletList $ map (fixPlains True) items -pListItem :: TagParser a -> TagParser Blocks +pListItem :: PandocMonad m => TagParser m a -> TagParser m Blocks pListItem nonItem = do - TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" []) - let liDiv = maybe mempty (\x -> B.divWith (x, [], []) mempty) (lookup "id" attr) - (liDiv <>) <$> pInTags "li" block <* skipMany nonItem + TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "li" []) + let attr = toStringAttr attr' + let addId ident bs = case B.toList bs of + (Plain ils:xs) -> B.fromList (Plain + [Span (ident, [], []) ils] : xs) + _ -> B.divWith (ident, [], []) bs + maybe id addId (lookup "id" attr) <$> + pInTags "li" block <* skipMany nonItem parseListStyleType :: String -> ListNumberStyle parseListStyleType "lower-roman" = LowerRoman @@ -271,9 +306,10 @@ parseTypeAttr "A" = UpperAlpha parseTypeAttr "1" = Decimal parseTypeAttr _ = DefaultStyle -pOrderedList :: TagParser Blocks +pOrderedList :: PandocMonad m => TagParser m Blocks pOrderedList = try $ do - TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" []) + TagOpen _ attribs' <- pSatisfy (matchTagOpen "ol" []) + let attribs = toStringAttr attribs' let (start, style) = (sta', sty') where sta = fromMaybe "1" $ lookup "start" attribs @@ -295,23 +331,23 @@ pOrderedList = try $ do ] let nonItem = pSatisfy (\t -> not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && - not (t ~== TagClose "ol")) + not (matchTagClose "ol" t)) -- note: if they have an <ol> or <ul> not in scope of a <li>, -- treat it as a list item, though it's not valid xhtml... skipMany nonItem items <- manyTill (pListItem nonItem) (pCloses "ol") return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items -pDefinitionList :: TagParser Blocks +pDefinitionList :: PandocMonad m => TagParser m Blocks pDefinitionList = try $ do - pSatisfy (~== TagOpen "dl" []) + pSatisfy (matchTagOpen "dl" []) items <- manyTill pDefListItem (pCloses "dl") return $ B.definitionList items -pDefListItem :: TagParser (Inlines, [Blocks]) +pDefListItem :: PandocMonad m => TagParser m (Inlines, [Blocks]) pDefListItem = try $ do - let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) && - not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl")) + let nonItem = pSatisfy (\t -> not (matchTagOpen "dt" [] t) && + not (matchTagOpen "dd" [] t) && not (matchTagClose "dl" t)) terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline) defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block) skipMany nonItem @@ -322,133 +358,165 @@ fixPlains :: Bool -> Blocks -> Blocks fixPlains inList bs = if any isParaish bs' then B.fromList $ map plainToPara bs' else bs - where isParaish (Para _) = True - isParaish (CodeBlock _ _) = True - isParaish (Header _ _ _) = True - isParaish (BlockQuote _) = True - isParaish (BulletList _) = not inList - isParaish (OrderedList _ _) = not inList - isParaish (DefinitionList _) = not inList - isParaish _ = False + where isParaish Para{} = True + isParaish CodeBlock{} = True + isParaish Header{} = True + isParaish BlockQuote{} = True + isParaish BulletList{} = not inList + isParaish OrderedList{} = not inList + isParaish DefinitionList{} = not inList + isParaish _ = False plainToPara (Plain xs) = Para xs - plainToPara x = x + plainToPara x = x bs' = B.toList bs -pRawTag :: TagParser String +pRawTag :: PandocMonad m => TagParser m Text pRawTag = do tag <- pAnyTag let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"] if tagOpen ignorable (const True) tag || tagClose ignorable tag - then return [] + then return mempty else return $ renderTags' [tag] -pDiv :: TagParser Blocks +pLineBlock :: PandocMonad m => TagParser m Blocks +pLineBlock = try $ do + guardEnabled Ext_line_blocks + _ <- pSatisfy $ tagOpen (=="div") (== [("class","line-block")]) + ils <- trimInlines . mconcat <$> manyTill inline (pSatisfy (tagClose (=="div"))) + let lns = map B.fromList $ + splitWhen (== LineBreak) $ filter (/= SoftBreak) $ + B.toList ils + return $ B.lineBlock lns + +pDiv :: PandocMonad m => TagParser m Blocks pDiv = try $ do guardEnabled Ext_native_divs - let isDivLike "div" = True + let isDivLike "div" = True isDivLike "section" = True - isDivLike _ = False - TagOpen tag attr <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True) + isDivLike "main" = True + isDivLike _ = False + TagOpen tag attr' <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True) + let attr = toStringAttr attr' contents <- pInTags tag block let (ident, classes, kvs) = mkAttr attr let classes' = if tag == "section" then "section":classes else classes - return $ B.divWith (ident, classes', kvs) contents + kvs' = if tag == "main" && isNothing (lookup "role" kvs) + then ("role", "main"):kvs + else kvs + return $ B.divWith (ident, classes', kvs') contents -pRawHtmlBlock :: TagParser Blocks +pRawHtmlBlock :: PandocMonad m => TagParser m Blocks pRawHtmlBlock = do - raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag - parseRaw <- getOption readerParseRaw - if parseRaw && not (null raw) + raw <- T.unpack <$> (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag) + exts <- getOption readerExtensions + if extensionEnabled Ext_raw_html exts && not (null raw) then return $ B.rawBlock "html" raw - else return mempty + else ignore raw + +ignore :: (Monoid a, PandocMonad m) => String -> TagParser m a +ignore raw = do + pos <- getPosition + -- raw can be null for tags like <!DOCTYPE>; see paRawTag + -- in this case we don't want a warning: + unless (null raw) $ + logMessage $ SkippedContent raw pos + return mempty -pHtmlBlock :: String -> TagParser String +pHtmlBlock :: PandocMonad m => Text -> TagParser m Text pHtmlBlock t = try $ do - open <- pSatisfy (~== TagOpen t []) - contents <- manyTill pAnyTag (pSatisfy (~== TagClose t)) - return $ renderTags' $ [open] ++ contents ++ [TagClose t] + open <- pSatisfy (matchTagOpen t []) + contents <- manyTill pAnyTag (pSatisfy (matchTagClose t)) + return $ renderTags' $ [open] <> contents <> [TagClose t] -- Sets chapter context -eSection :: TagParser Blocks +eSection :: PandocMonad m => TagParser m Blocks eSection = try $ do - let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as) + let matchChapter as = maybe False (T.isInfixOf "chapter") (lookup "type" as) let sectTag = tagOpen (`elem` sectioningContent) matchChapter TagOpen tag _ <- lookAhead $ pSatisfy sectTag setInChapter (pInTags tag block) -headerLevel :: String -> TagParser Int -headerLevel tagtype = do - let level = read (drop 1 tagtype) - (try $ do - guardEnabled Ext_epub_html_exts - asks inChapter >>= guard - return (level - 1)) - <|> - return level - -eTitlePage :: TagParser () +headerLevel :: PandocMonad m => Text -> TagParser m Int +headerLevel tagtype = + case safeRead (T.unpack (T.drop 1 tagtype)) of + Just level -> +-- try (do +-- guardEnabled Ext_epub_html_exts +-- asks inChapter >>= guard +-- return (level - 1)) +-- <|> + return level + Nothing -> fail "Could not retrieve header level" + +eTitlePage :: PandocMonad m => TagParser m () eTitlePage = try $ do - let isTitlePage as = maybe False (isInfixOf "titlepage") (lookup "type" as) + let isTitlePage as = maybe False (T.isInfixOf "titlepage") (lookup "type" as) let groupTag = tagOpen (\x -> x `elem` groupingContent || x == "section") isTitlePage TagOpen tag _ <- lookAhead $ pSatisfy groupTag () <$ pInTags tag block -pHeader :: TagParser Blocks +pHeader :: PandocMonad m => TagParser m Blocks pHeader = try $ do - TagOpen tagtype attr <- pSatisfy $ + TagOpen tagtype attr' <- pSatisfy $ tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) (const True) - let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")] + let attr = toStringAttr attr' + let bodyTitle = TagOpen tagtype attr' ~== TagOpen ("h1" :: Text) + [("class","title")] level <- headerLevel tagtype contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) let ident = fromMaybe "" $ lookup "id" attr let classes = maybe [] words $ lookup "class" attr let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"] - attr' <- registerHeader (ident, classes, keyvals) contents + attr'' <- registerHeader (ident, classes, keyvals) contents return $ if bodyTitle then mempty -- skip a representation of the title in the body - else B.headerWith attr' level contents + else B.headerWith attr'' level contents -pHrule :: TagParser Blocks +pHrule :: PandocMonad m => TagParser m Blocks pHrule = do pSelfClosing (=="hr") (const True) return B.horizontalRule -pTable :: TagParser Blocks +pTable :: PandocMonad m => TagParser m Blocks pTable = try $ do - TagOpen _ _ <- pSatisfy (~== TagOpen "table" []) + TagOpen _ _ <- pSatisfy (matchTagOpen "table" []) skipMany pBlank caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank widths' <- (mconcat <$> many1 pColgroup) <|> many pCol let pTh = option [] $ pInTags "tr" (pCell "th") - pTr = try $ skipMany pBlank >> pInTags "tr" (pCell "td" <|> pCell "th") - pTBody = do pOptInTag "tbody" $ many1 pTr + pTr = try $ skipMany pBlank >> + pInTags "tr" (pCell "td" <|> pCell "th") + pTBody = pOptInTag "tbody" $ many1 pTr head'' <- pOptInTag "thead" pTh - head' <- pOptInTag "tbody" $ do - if null head'' - then pTh - else return head'' + head' <- map snd <$> + pOptInTag "tbody" + (if null head'' then pTh else return head'') rowsLs <- many pTBody rows' <- pOptInTag "tfoot" $ many pTr - TagClose _ <- pSatisfy (~== TagClose "table") - let rows'' = (concat rowsLs) ++ rows' + TagClose _ <- pSatisfy (matchTagClose "table") + let rows'' = concat rowsLs <> rows' + let rows''' = map (map snd) rows'' + -- let rows''' = map (map snd) rows'' -- fail on empty table - guard $ not $ null head' && null rows'' + guard $ not $ null head' && null rows''' let isSinglePlain x = case B.toList x of [] -> True [Plain _] -> True _ -> False - let isSimple = all isSinglePlain $ concat (head':rows'') - let cols = length $ if null head' then head rows'' else head' + let isSimple = all isSinglePlain $ concat (head':rows''') + let cols = length $ if null head' then head rows''' else head' -- add empty cells to short rows let addEmpties r = case cols - length r of - n | n > 0 -> r ++ replicate n mempty + n | n > 0 -> r <> replicate n mempty | otherwise -> r - let rows = map addEmpties rows'' - let aligns = replicate cols AlignDefault + let rows = map addEmpties rows''' + let aligns = case rows'' of + (cs:_) -> map fst cs + _ -> replicate cols AlignDefault let widths = if null widths' then if isSimple then replicate cols 0 @@ -456,80 +524,120 @@ pTable = try $ do else widths' return $ B.table caption (zip aligns widths) head' rows -pCol :: TagParser Double +pCol :: PandocMonad m => TagParser m Double pCol = try $ do - TagOpen _ attribs <- pSatisfy (~== TagOpen "col" []) + TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" []) + let attribs = toStringAttr attribs' skipMany pBlank - optional $ pSatisfy (~== TagClose "col") + optional $ pSatisfy (matchTagClose "col") skipMany pBlank - return $ case lookup "width" attribs of + let width = case lookup "width" attribs of Nothing -> case lookup "style" attribs of Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs -> - fromMaybe 0.0 $ safeRead ('0':'.':filter - (`notElem` " \t\r\n%'\";") xs) + fromMaybe 0.0 $ safeRead (filter + (`notElem` (" \t\r\n%'\";" :: [Char])) xs) _ -> 0.0 Just x | not (null x) && last x == '%' -> - fromMaybe 0.0 $ safeRead ('0':'.':init x) + fromMaybe 0.0 $ safeRead (init x) _ -> 0.0 + if width > 0.0 + then return $ width / 100.0 + else return 0.0 -pColgroup :: TagParser [Double] +pColgroup :: PandocMonad m => TagParser m [Double] pColgroup = try $ do - pSatisfy (~== TagOpen "colgroup" []) + pSatisfy (matchTagOpen "colgroup" []) skipMany pBlank manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank -noColOrRowSpans :: Tag String -> Bool +noColOrRowSpans :: Tag Text -> Bool noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan" where isNullOrOne x = case fromAttrib x t of "" -> True "1" -> True _ -> False -pCell :: String -> TagParser [Blocks] +pCell :: PandocMonad m => Text -> TagParser m [(Alignment, Blocks)] pCell celltype = try $ do skipMany pBlank + tag <- lookAhead $ + pSatisfy (\t -> t ~== TagOpen celltype [] && noColOrRowSpans t) + let extractAlign' [] = "" + extractAlign' ("text-align":x:_) = x + extractAlign' (_:xs) = extractAlign' xs + let extractAlign = extractAlign' . wordsBy (`elem` [' ','\t',';',':']) + let align = case maybeFromAttrib "align" tag `mplus` + (extractAlign <$> maybeFromAttrib "style" tag) of + Just "left" -> AlignLeft + Just "right" -> AlignRight + Just "center" -> AlignCenter + _ -> AlignDefault res <- pInTags' celltype noColOrRowSpans block skipMany pBlank - return [res] + return [(align, res)] -pBlockQuote :: TagParser Blocks +pBlockQuote :: PandocMonad m => TagParser m Blocks pBlockQuote = do contents <- pInTags "blockquote" block return $ B.blockQuote $ fixPlains False contents -pPlain :: TagParser Blocks +pPlain :: PandocMonad m => TagParser m Blocks pPlain = do contents <- setInPlain $ trimInlines . mconcat <$> many1 inline if B.isNull contents then return mempty else return $ B.plain contents -pPara :: TagParser Blocks +pPara :: PandocMonad m => TagParser m Blocks pPara = do contents <- trimInlines <$> pInTags "p" inline - return $ B.para contents + (do guardDisabled Ext_empty_paragraphs + guard (B.isNull contents) + return mempty) + <|> return (B.para contents) + +pFigure :: PandocMonad m => TagParser m Blocks +pFigure = try $ do + TagOpen _ _ <- pSatisfy (matchTagOpen "figure" []) + skipMany pBlank + let pImg = (\x -> (Just x, Nothing)) <$> + (pOptInTag "p" pImage <* skipMany pBlank) + pCapt = (\x -> (Nothing, Just x)) <$> do + bs <- pInTags "figcaption" block + return $ blocksToInlines' $ B.toList bs + pSkip = (Nothing, Nothing) <$ pSatisfy (not . matchTagClose "figure") + res <- many (pImg <|> pCapt <|> pSkip) + let mbimg = msum $ map fst res + let mbcap = msum $ map snd res + TagClose _ <- pSatisfy (matchTagClose "figure") + let caption = fromMaybe mempty mbcap + case B.toList <$> mbimg of + Just [Image attr _ (url, tit)] -> + return $ B.para $ B.imageWith attr url ("fig:" ++ tit) caption + _ -> mzero -pCodeBlock :: TagParser Blocks +pCodeBlock :: PandocMonad m => TagParser m Blocks pCodeBlock = try $ do - TagOpen _ attr <- pSatisfy (~== TagOpen "pre" []) + TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" []) + let attr = toStringAttr attr' contents <- manyTill pAnyTag (pCloses "pre" <|> eof) let rawText = concatMap tagToString contents -- drop leading newline if any let result' = case rawText of - '\n':xs -> xs - _ -> rawText + '\n':xs -> xs + _ -> rawText -- drop trailing newline if any let result = case reverse result' of - '\n':_ -> init result' - _ -> result' + '\n':_ -> init result' + _ -> result' return $ B.codeBlockWith (mkAttr attr) result -tagToString :: Tag String -> String -tagToString (TagText s) = s +tagToString :: Tag Text -> String +tagToString (TagText s) = T.unpack s tagToString (TagOpen "br" _) = "\n" -tagToString _ = "" +tagToString _ = "" -inline :: TagParser Inlines +inline :: PandocMonad m => TagParser m Inlines inline = choice [ eNoteref , eSwitch id inline @@ -540,6 +648,7 @@ inline = choice , pSuperscript , pSubscript , pStrikeout + , pUnderline , pLineBreak , pLink , pImage @@ -549,30 +658,31 @@ inline = choice , pRawHtmlInline ] -pLocation :: TagParser () +pLocation :: PandocMonad m => TagParser m () pLocation = do (TagPosition r c) <- pSat isTagPosition setPosition $ newPos "input" r c -pSat :: (Tag String -> Bool) -> TagParser (Tag String) +pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) pSat f = do pos <- getPosition token show (const pos) (\x -> if f x then Just x else Nothing) -pSatisfy :: (Tag String -> Bool) -> TagParser (Tag String) +pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) pSatisfy f = try $ optional pLocation >> pSat f -pAnyTag :: TagParser (Tag String) +pAnyTag :: PandocMonad m => TagParser m (Tag Text) pAnyTag = pSatisfy (const True) -pSelfClosing :: (String -> Bool) -> ([Attribute String] -> Bool) - -> TagParser (Tag String) +pSelfClosing :: PandocMonad m + => (Text -> Bool) -> ([Attribute Text] -> Bool) + -> TagParser m (Tag Text) pSelfClosing f g = do open <- pSatisfy (tagOpen f g) optional $ pSatisfy (tagClose f) return open -pQ :: TagParser Inlines +pQ :: PandocMonad m => TagParser m Inlines pQ = do context <- asks quoteContext let quoteType = case context of @@ -587,45 +697,50 @@ pQ = do withQuoteContext innerQuoteContext $ pInlinesInTags "q" constructor -pEmph :: TagParser Inlines +pEmph :: PandocMonad m => TagParser m Inlines pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph -pStrong :: TagParser Inlines +pStrong :: PandocMonad m => TagParser m Inlines pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong -pSuperscript :: TagParser Inlines +pSuperscript :: PandocMonad m => TagParser m Inlines pSuperscript = pInlinesInTags "sup" B.superscript -pSubscript :: TagParser Inlines +pSubscript :: PandocMonad m => TagParser m Inlines pSubscript = pInlinesInTags "sub" B.subscript -pStrikeout :: TagParser Inlines -pStrikeout = do +pStrikeout :: PandocMonad m => TagParser m Inlines +pStrikeout = pInlinesInTags "s" B.strikeout <|> pInlinesInTags "strike" B.strikeout <|> pInlinesInTags "del" B.strikeout <|> - try (do pSatisfy (~== TagOpen "span" [("class","strikeout")]) + try (do pSatisfy (matchTagOpen "span" [("class","strikeout")]) contents <- mconcat <$> manyTill inline (pCloses "span") return $ B.strikeout contents) -pLineBreak :: TagParser Inlines +pUnderline :: PandocMonad m => TagParser m Inlines +pUnderline = pInlinesInTags "u" underlineSpan <|> pInlinesInTags "ins" underlineSpan + +pLineBreak :: PandocMonad m => TagParser m Inlines pLineBreak = do pSelfClosing (=="br") (const True) return B.linebreak -- Unlike fromAttrib from tagsoup, this distinguishes -- between a missing attribute and an attribute with empty content. -maybeFromAttrib :: String -> Tag String -> Maybe String -maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs +maybeFromAttrib :: String -> Tag Text -> Maybe String +maybeFromAttrib name (TagOpen _ attrs) = + T.unpack <$> lookup (T.pack name) attrs maybeFromAttrib _ _ = Nothing -pLink :: TagParser Inlines +pLink :: PandocMonad m => TagParser m Inlines pLink = try $ do tag <- pSatisfy $ tagOpenLit "a" (const True) - let title = fromAttrib "title" tag + let title = T.unpack $ fromAttrib "title" tag -- take id from id attribute if present, otherwise name - let uid = maybe (fromAttrib "name" tag) id $ maybeFromAttrib "id" tag - let cls = words $ fromAttrib "class" tag + let uid = fromMaybe (T.unpack $ fromAttrib "name" tag) $ + maybeFromAttrib "id" tag + let cls = words $ T.unpack $ fromAttrib "class" tag lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a") -- check for href; if href, then a link, otherwise a span case maybeFromAttrib "href" tag of @@ -639,128 +754,149 @@ pLink = try $ do _ -> url' return $ B.linkWith (uid, cls, []) (escapeURI url) title lab -pImage :: TagParser Inlines +pImage :: PandocMonad m => TagParser m Inlines pImage = do tag <- pSelfClosing (=="img") (isJust . lookup "src") mbBaseHref <- baseHref <$> getState - let url' = fromAttrib "src" tag + let url' = T.unpack $ fromAttrib "src" tag let url = case (parseURIReference url', mbBaseHref) of (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs) _ -> url' - let title = fromAttrib "title" tag - let alt = fromAttrib "alt" tag - let uid = fromAttrib "id" tag - let cls = words $ fromAttrib "class" tag + let title = T.unpack $ fromAttrib "title" tag + let alt = T.unpack $ fromAttrib "alt" tag + let uid = T.unpack $ fromAttrib "id" tag + let cls = words $ T.unpack $ fromAttrib "class" tag let getAtt k = case fromAttrib k tag of "" -> [] - v -> [(k, v)] - let kvs = concat $ map getAtt ["width", "height", "sizes", "srcset"] + v -> [(T.unpack k, T.unpack v)] + let kvs = concatMap getAtt ["width", "height", "sizes", "srcset"] return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt) -pCode :: TagParser Inlines +pCode :: PandocMonad m => TagParser m Inlines pCode = try $ do - (TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) + (TagOpen open attr') <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) + let attr = toStringAttr attr' result <- manyTill pAnyTag (pCloses open) - return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result + return $ B.codeWith (mkAttr attr) $ unwords $ lines $ T.unpack $ + innerText result -pSpan :: TagParser Inlines +pSpan :: PandocMonad m => TagParser m Inlines pSpan = try $ do guardEnabled Ext_native_spans - TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) + TagOpen _ attr' <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) + let attr = toStringAttr attr' contents <- pInTags "span" inline - let isSmallCaps = fontVariant == "small-caps" + let isSmallCaps = fontVariant == "small-caps" || "smallcaps" `elem` classes where styleAttr = fromMaybe "" $ lookup "style" attr fontVariant = fromMaybe "" $ pickStyleAttrProps ["font-variant"] styleAttr + classes = fromMaybe [] $ + words <$> lookup "class" attr let tag = if isSmallCaps then B.smallcaps else B.spanWith (mkAttr attr) return $ tag contents -pRawHtmlInline :: TagParser Inlines +pRawHtmlInline :: PandocMonad m => TagParser m Inlines pRawHtmlInline = do inplain <- asks inPlain result <- pSatisfy (tagComment (const True)) <|> if inplain then pSatisfy (not . isBlockTag) else pSatisfy isInlineTag - parseRaw <- getOption readerParseRaw - if parseRaw - then return $ B.rawInline "html" $ renderTags' [result] - else return mempty + exts <- getOption readerExtensions + let raw = T.unpack $ renderTags' [result] + if extensionEnabled Ext_raw_html exts + then return $ B.rawInline "html" raw + else ignore raw mathMLToTeXMath :: String -> Either String String mathMLToTeXMath s = writeTeX <$> readMathML s -pMath :: Bool -> TagParser Inlines +toStringAttr :: [(Text, Text)] -> [(String, String)] +toStringAttr = map go + where go (x,y) = (T.unpack x, T.unpack y) + +pMath :: PandocMonad m => Bool -> TagParser m Inlines pMath inCase = try $ do - open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True) + open@(TagOpen _ attr') <- pSatisfy $ tagOpen (=="math") (const True) -- we'll assume math tags are MathML unless specially marked -- otherwise... + let attr = toStringAttr attr' unless inCase $ guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr)) - contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math")) - case mathMLToTeXMath (renderTags $ [open] ++ contents ++ [TagClose "math"]) of + contents <- manyTill pAnyTag (pSatisfy (matchTagClose "math")) + case mathMLToTeXMath (T.unpack $ renderTags $ + [open] <> contents <> [TagClose "math"]) of Left _ -> return $ B.spanWith ("",["math"],attr) $ B.text $ - innerText contents + T.unpack $ innerText contents Right [] -> return mempty Right x -> return $ case lookup "display" attr of Just "block" -> B.displayMath x _ -> B.math x -pInlinesInTags :: String -> (Inlines -> Inlines) - -> TagParser Inlines +pInlinesInTags :: PandocMonad m => Text -> (Inlines -> Inlines) + -> TagParser m Inlines pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline -pInTags :: (Monoid a) => String -> TagParser a -> TagParser a +pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a pInTags tagtype parser = pInTags' tagtype (const True) parser -pInTags' :: (Monoid a) => String -> (Tag String -> Bool) -> TagParser a - -> TagParser a +pInTags' :: (PandocMonad m, Monoid a) + => Text + -> (Tag Text -> Bool) + -> TagParser m a + -> TagParser m a pInTags' tagtype tagtest parser = try $ do pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t) mconcat <$> manyTill parser (pCloses tagtype <|> eof) -- parses p, preceeded by an optional opening tag -- and followed by an optional closing tags -pOptInTag :: String -> TagParser a -> TagParser a +pOptInTag :: PandocMonad m => Text -> TagParser m a -> TagParser m a pOptInTag tagtype p = try $ do skipMany pBlank - optional $ pSatisfy (~== TagOpen tagtype []) + optional $ pSatisfy (matchTagOpen tagtype []) skipMany pBlank x <- p skipMany pBlank - optional $ pSatisfy (~== TagClose tagtype) + optional $ pSatisfy (matchTagClose tagtype) skipMany pBlank return x -pCloses :: String -> TagParser () +pCloses :: PandocMonad m => Text -> TagParser m () pCloses tagtype = try $ do t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag case t of - (TagClose t') | t' == tagtype -> pAnyTag >> return () + (TagClose t') | t' == tagtype -> void pAnyTag (TagOpen t' _) | t' `closes` tagtype -> return () (TagClose "ul") | tagtype == "li" -> return () (TagClose "ol") | tagtype == "li" -> return () (TagClose "dl") | tagtype == "dd" -> return () (TagClose "table") | tagtype == "td" -> return () (TagClose "table") | tagtype == "tr" -> return () + (TagClose "td") | tagtype `Set.member` blockHtmlTags -> return () + (TagClose "th") | tagtype `Set.member` blockHtmlTags -> return () + (TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags + -> return () -- see #3794 _ -> mzero -pTagText :: TagParser Inlines +pTagText :: PandocMonad m => TagParser m Inlines pTagText = try $ do (TagText str) <- pSatisfy isTagText st <- getState qu <- ask - case flip runReader qu $ runParserT (many pTagContents) st "text" str of - Left _ -> fail $ "Could not parse `" ++ str ++ "'" + parsed <- lift $ lift $ + flip runReaderT qu $ runParserT (many pTagContents) st "text" str + case parsed of + Left _ -> throwError $ PandocParseError $ "Could not parse `" <> T.unpack str <> "'" Right result -> return $ mconcat result -pBlank :: TagParser () +pBlank :: PandocMonad m => TagParser m () pBlank = try $ do (TagText str) <- pSatisfy isTagText - guard $ all isSpace str + guard $ T.all isSpace str -type InlinesParser = HTMLParser String +type InlinesParser m = HTMLParser m Text -pTagContents :: InlinesParser Inlines +pTagContents :: PandocMonad m => InlinesParser m Inlines pTagContents = B.displayMath <$> mathDisplay <|> B.math <$> mathInline @@ -770,7 +906,7 @@ pTagContents = <|> pSymbol <|> pBad -pStr :: InlinesParser Inlines +pStr :: PandocMonad m => InlinesParser m Inlines pStr = do result <- many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) && not (isBad c) @@ -778,24 +914,24 @@ pStr = do return $ B.str result isSpecial :: Char -> Bool -isSpecial '"' = True -isSpecial '\'' = True -isSpecial '.' = True -isSpecial '-' = True -isSpecial '$' = True +isSpecial '"' = True +isSpecial '\'' = True +isSpecial '.' = True +isSpecial '-' = True +isSpecial '$' = True isSpecial '\8216' = True isSpecial '\8217' = True isSpecial '\8220' = True isSpecial '\8221' = True -isSpecial _ = False +isSpecial _ = False -pSymbol :: InlinesParser Inlines +pSymbol :: PandocMonad m => InlinesParser m Inlines pSymbol = satisfy isSpecial >>= return . B.str . (:[]) isBad :: Char -> Bool isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML -pBad :: InlinesParser Inlines +pBad :: PandocMonad m => InlinesParser m Inlines pBad = do c <- satisfy isBad let c' = case c of @@ -829,7 +965,7 @@ pBad = do _ -> '?' return $ B.str [c'] -pSpace :: InlinesParser Inlines +pSpace :: PandocMonad m => InlinesParser m Inlines pSpace = many1 (satisfy isSpace) >>= \xs -> if '\n' `elem` xs then return B.softbreak @@ -839,86 +975,96 @@ pSpace = many1 (satisfy isSpace) >>= \xs -> -- Constants -- -eitherBlockOrInline :: [String] -eitherBlockOrInline = ["audio", "applet", "button", "iframe", "embed", - "del", "ins", - "progress", "map", "area", "noscript", "script", - "object", "svg", "video", "source"] - -{- -inlineHtmlTags :: [[Char]] -inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", - "br", "cite", "code", "dfn", "em", "font", "i", "img", - "input", "kbd", "label", "q", "s", "samp", "select", - "small", "span", "strike", "strong", "sub", "sup", - "textarea", "tt", "u", "var"] --} - -blockHtmlTags :: [String] -blockHtmlTags = ["?xml", "!DOCTYPE", "address", "article", "aside", - "blockquote", "body", "button", "canvas", - "caption", "center", "col", "colgroup", "dd", "dir", "div", - "dl", "dt", "fieldset", "figcaption", "figure", - "footer", "form", "h1", "h2", "h3", "h4", - "h5", "h6", "head", "header", "hgroup", "hr", "html", - "isindex", "menu", "noframes", "ol", "output", "p", "pre", - "section", "table", "tbody", "textarea", - "thead", "tfoot", "ul", "dd", - "dt", "frameset", "li", "tbody", "td", "tfoot", - "th", "thead", "tr", "script", "style"] +eitherBlockOrInline :: Set.Set Text +eitherBlockOrInline = Set.fromList + ["audio", "applet", "button", "iframe", "embed", + "del", "ins", "progress", "map", "area", "noscript", "script", + "object", "svg", "video", "source"] + +blockHtmlTags :: Set.Set Text +blockHtmlTags = Set.fromList + ["?xml", "!DOCTYPE", "address", "article", "aside", + "blockquote", "body", "canvas", + "caption", "center", "col", "colgroup", "dd", "details", + "dir", "div", "dl", "dt", "fieldset", "figcaption", "figure", + "footer", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "head", "header", "hgroup", "hr", "html", + "isindex", "main", "menu", "meta", "noframes", "ol", "output", "p", "pre", + "section", "table", "tbody", "textarea", + "thead", "tfoot", "ul", "dd", + "dt", "frameset", "li", "tbody", "td", "tfoot", + "th", "thead", "tr", "script", "style"] -- We want to allow raw docbook in markdown documents, so we -- include docbook block tags here too. -blockDocBookTags :: [String] -blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist", - "orderedlist", "segmentedlist", "simplelist", - "variablelist", "caution", "important", "note", "tip", - "warning", "address", "literallayout", "programlisting", - "programlistingco", "screen", "screenco", "screenshot", - "synopsis", "example", "informalexample", "figure", - "informalfigure", "table", "informaltable", "para", - "simpara", "formalpara", "equation", "informalequation", - "figure", "screenshot", "mediaobject", "qandaset", - "procedure", "task", "cmdsynopsis", "funcsynopsis", - "classsynopsis", "blockquote", "epigraph", "msgset", - "sidebar", "title"] - -epubTags :: [String] -epubTags = ["case", "switch", "default"] - -blockTags :: [String] -blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags - -isInlineTag :: Tag String -> Bool -isInlineTag t = tagOpen isInlineTagName (const True) t || - tagClose isInlineTagName t || - tagComment (const True) t - where isInlineTagName x = x `notElem` blockTags - -isBlockTag :: Tag String -> Bool -isBlockTag t = tagOpen isBlockTagName (const True) t || - tagClose isBlockTagName t || - tagComment (const True) t - where isBlockTagName ('?':_) = True - isBlockTagName ('!':_) = True - isBlockTagName x = x `elem` blockTags - || x `elem` eitherBlockOrInline - -isTextTag :: Tag String -> Bool +blockDocBookTags :: Set.Set Text +blockDocBookTags = Set.fromList + ["calloutlist", "bibliolist", "glosslist", "itemizedlist", + "orderedlist", "segmentedlist", "simplelist", + "variablelist", "caution", "important", "note", "tip", + "warning", "address", "literallayout", "programlisting", + "programlistingco", "screen", "screenco", "screenshot", + "synopsis", "example", "informalexample", "figure", + "informalfigure", "table", "informaltable", "para", + "simpara", "formalpara", "equation", "informalequation", + "figure", "screenshot", "mediaobject", "qandaset", + "procedure", "task", "cmdsynopsis", "funcsynopsis", + "classsynopsis", "blockquote", "epigraph", "msgset", + "sidebar", "title"] + +epubTags :: Set.Set Text +epubTags = Set.fromList ["case", "switch", "default"] + +blockTags :: Set.Set Text +blockTags = Set.unions [blockHtmlTags, blockDocBookTags, epubTags] + +class NamedTag a where + getTagName :: a -> Maybe Text + +instance NamedTag (Tag Text) where + getTagName (TagOpen t _) = Just t + getTagName (TagClose t) = Just t + getTagName _ = Nothing + +instance NamedTag (Tag String) where + getTagName (TagOpen t _) = Just (T.pack t) + getTagName (TagClose t) = Just (T.pack t) + getTagName _ = Nothing + +isInlineTag :: NamedTag (Tag a) => Tag a -> Bool +isInlineTag t = isInlineTagName || isCommentTag t + where isInlineTagName = case getTagName t of + Just x -> x + `Set.notMember` blockTags + Nothing -> False + +isBlockTag :: NamedTag (Tag a) => Tag a -> Bool +isBlockTag t = isBlockTagName || isTagComment t + where isBlockTagName = + case getTagName t of + Just x + | "?" `T.isPrefixOf` x -> True + | "!" `T.isPrefixOf` x -> True + | otherwise -> x `Set.member` blockTags + || x `Set.member` eitherBlockOrInline + Nothing -> False + +isTextTag :: Tag a -> Bool isTextTag = tagText (const True) -isCommentTag :: Tag String -> Bool +isCommentTag :: Tag a -> Bool isCommentTag = tagComment (const True) -- taken from HXT and extended -- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags -closes :: String -> String -> Bool +closes :: Text -> Text -> Bool _ `closes` "body" = False _ `closes` "html" = False "body" `closes` "head" = True "a" `closes` "a" = True "li" `closes` "li" = True "th" `closes` t | t `elem` ["th","td"] = True +"td" `closes` t | t `elem` ["th","td"] = True "tr" `closes` t | t `elem` ["th","td","tr"] = True "dd" `closes` t | t `elem` ["dt", "dd"] = True "dt" `closes` t | t `elem` ["dt","dd"] = True @@ -926,12 +1072,12 @@ _ `closes` "html" = False "optgroup" `closes` "optgroup" = True "optgroup" `closes` "option" = True "option" `closes` "option" = True --- http://www.w3.org/TR/html-markup/p.html +-- https://html.spec.whatwg.org/multipage/syntax.html#optional-tags x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote", "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4", - "h5", "h6", "header", "hr", "menu", "nav", "ol", "p", "pre", "section", + "h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section", "table", "ul"] = True -"meta" `closes` "meta" = True +_ `closes` "meta" = True "form" `closes` "form" = True "label" `closes` "label" = True "map" `closes` "map" = True @@ -942,17 +1088,18 @@ t `closes` "select" | t /= "option" = True "tfoot" `closes` t | t `elem` ["thead","colgroup"] = True "tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True t `closes` t2 | - t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","p"] && - t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" + t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] && + t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" or "main" t1 `closes` t2 | - t1 `elem` blockTags && - t2 `notElem` (blockTags ++ eitherBlockOrInline) = True + t1 `Set.member` blockTags && + t2 `Set.notMember` blockTags && + t2 `Set.notMember` eitherBlockOrInline = True _ `closes` _ = False --- parsers for use in markdown, textile readers -- | Matches a stretch of HTML in balanced tags. -htmlInBalanced :: (Monad m) +htmlInBalanced :: (HasReaderOptions st, Monad m) => (Tag String -> Bool) -> ParserT String st m String htmlInBalanced f = try $ do @@ -973,8 +1120,11 @@ htmlInBalanced f = try $ do let cs = ec - sc lscontents <- unlines <$> count ls anyLine cscontents <- count cs anyChar - (_,closetag) <- htmlTag (~== TagClose tn) - return (lscontents ++ cscontents ++ closetag) + closetag <- do + x <- many (satisfy (/='>')) + char '>' + return (x <> ">") + return (lscontents <> cscontents <> closetag) _ -> mzero _ -> mzero @@ -992,64 +1142,99 @@ htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts go n (t:ts') = (t :) <$> go n ts' go _ [] = mzero -hasTagWarning :: [Tag String] -> Bool +hasTagWarning :: [Tag a] -> Bool hasTagWarning (TagWarning _:_) = True -hasTagWarning _ = False +hasTagWarning _ = False -- | Matches a tag meeting a certain condition. -htmlTag :: Monad m +htmlTag :: (HasReaderOptions st, Monad m) => (Tag String -> Bool) -> ParserT [Char] st m (Tag String, String) htmlTag f = try $ do lookAhead (char '<') + startpos <- getPosition inp <- getInput - let (next : _) = canonicalizeTags $ parseTagsOptions - parseOptions{ optTagWarning = False } inp - guard $ f next + let ts = canonicalizeTags $ parseTagsOptions + parseOptions{ optTagWarning = False + , optTagPosition = True } + (inp ++ " ") -- add space to ensure that + -- we get a TagPosition after the tag + (next, ln, col) <- case ts of + (TagPosition{} : next : TagPosition ln col : _) + | f next -> return (next, ln, col) + _ -> mzero + + -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66> + -- should NOT be parsed as an HTML tag, see #2277, + -- so we exclude . even though it's a valid character + -- in XML element names + let isNameChar c = isAlphaNum c || c == ':' || c == '-' || c == '_' + let isName s = case s of + [] -> False + ('?':_) -> True -- processing instruction + (c:cs) -> isLetter c && all isNameChar cs + + let endpos = if ln == 1 + then setSourceColumn startpos + (sourceColumn startpos + (col - 1)) + else setSourceColumn (setSourceLine startpos + (sourceLine startpos + (ln - 1))) + col + let endAngle = try $ + do char '>' + pos <- getPosition + guard $ pos >= endpos + let handleTag tagname = do - -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66> - -- should NOT be parsed as an HTML tag, see #2277 - guard $ not ('.' `elem` tagname) + -- basic sanity check, since the parser is very forgiving + -- and finds tags in stuff like x<y) + guard $ isName tagname + guard $ not $ null tagname -- <https://example.org> should NOT be a tag either. -- tagsoup will parse it as TagOpen "https:" [("example.org","")] - guard $ not (null tagname) guard $ last tagname /= ':' - rendered <- manyTill anyChar (char '>') - return (next, rendered ++ ">") + char '<' + rendered <- manyTill anyChar endAngle + return (next, "<" ++ rendered ++ ">") case next of TagComment s | "<!--" `isPrefixOf` inp -> do - count (length s + 4) anyChar - skipMany (satisfy (/='>')) - char '>' - return (next, "<!--" ++ s ++ "-->") + string "<!--" + count (length s) anyChar + string "-->" + stripComments <- getOption readerStripComments + if stripComments + then return (next, "") + else return (next, "<!--" <> s <> "-->") | otherwise -> fail "bogus comment mode, HTML5 parse error" - TagOpen tagname _attr -> handleTag tagname - TagClose tagname -> handleTag tagname + TagOpen tagname attr -> do + guard $ all (isName . fst) attr + handleTag tagname + TagClose tagname -> + handleTag tagname _ -> mzero mkAttr :: [(String, String)] -> Attr mkAttr attr = (attribsId, attribsClasses, attribsKV) where attribsId = fromMaybe "" $ lookup "id" attr - attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes + attribsClasses = words (fromMaybe "" $ lookup "class" attr) <> epubTypes attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr -- Strip namespace prefixes -stripPrefixes :: [Tag String] -> [Tag String] +stripPrefixes :: [Tag Text] -> [Tag Text] stripPrefixes = map stripPrefix -stripPrefix :: Tag String -> Tag String +stripPrefix :: Tag Text -> Tag Text stripPrefix (TagOpen s as) = - TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as) + TagOpen (stripPrefix' s) (map (first stripPrefix') as) stripPrefix (TagClose s) = TagClose (stripPrefix' s) stripPrefix x = x -stripPrefix' :: String -> String +stripPrefix' :: Text -> Text stripPrefix' s = - case span (/= ':') s of - (_, "") -> s - (_, (_:ts)) -> ts + if T.null t then s else T.drop 1 t + where (_, t) = T.span (/= ':') s isSpace :: Char -> Bool isSpace ' ' = True @@ -1068,9 +1253,13 @@ instance HasHeaderMap HTMLState where extractHeaderMap = headerMap updateHeaderMap f s = s{ headerMap = f (headerMap s) } +instance HasLogMessages HTMLState where + addLogMessage m s = s{ logMessages = m : logMessages s } + getLogMessages = reverse . logMessages + -- This signature should be more general -- MonadReader HTMLLocal m => HasQuoteContext st m -instance HasQuoteContext st (Reader HTMLLocal) where +instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where getQuoteContext = asks quoteContext withQuoteContext q = local (\s -> s{quoteContext = q}) @@ -1088,19 +1277,32 @@ instance HasLastStrPosition HTMLState where setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} getLastStrPos = getLastStrPos . parserState +-- For now we need a special verison here; the one in Shared has String type +renderTags' :: [Tag Text] -> Text +renderTags' = renderTagsOptions + renderOptions{ optMinimize = matchTags ["hr", "br", "img", + "meta", "link"] + , optRawTag = matchTags ["script", "style"] } + where matchTags tags = flip elem tags . T.toLower + -- EPUB Specific -- -- -sectioningContent :: [String] +sectioningContent :: [Text] sectioningContent = ["article", "aside", "nav", "section"] -groupingContent :: [String] +groupingContent :: [Text] groupingContent = ["p", "hr", "pre", "blockquote", "ol" , "ul", "li", "dl", "dt", "dt", "dd" , "figure", "figcaption", "div", "main"] +matchTagClose :: Text -> (Tag Text -> Bool) +matchTagClose t = (~== TagClose t) + +matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool) +matchTagOpen t as = (~== TagOpen t as) {- @@ -1108,7 +1310,7 @@ types :: [(String, ([String], Int))] types = -- Document divisions map (\s -> (s, (["section", "body"], 0))) ["volume", "part", "chapter", "division"] - ++ -- Document section and components + <> -- Document section and components [ ("abstract", ([], 0))] -} |