1 files changed, 573 insertions, 371 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index abe5f66ce..0e79f9ec3 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -1,7 +1,10 @@
-{-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses,
-ViewPatterns#-}
+{-# LANGUAGE FlexibleContexts      #-}
+{-# LANGUAGE FlexibleInstances     #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+{-# LANGUAGE OverloadedStrings     #-}
+{-# LANGUAGE ViewPatterns          #-}
 {-
-Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu>
+Copyright (C) 2006-2018 John MacFarlane <jgm@berkeley.edu>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -20,7 +23,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
 {- |
    Module      : Text.Pandoc.Readers.HTML
-   Copyright   : Copyright (C) 2006-2015 John MacFarlane
+   Copyright   : Copyright (C) 2006-2018 John MacFarlane
    License     : GNU GPL, version 2 or above
 
    Maintainer  : John MacFarlane <jgm@berkeley.edu>
@@ -34,66 +37,79 @@ module Text.Pandoc.Readers.HTML ( readHtml
                                 , htmlInBalanced
                                 , isInlineTag
                                 , isBlockTag
+                                , NamedTag(..)
                                 , isTextTag
                                 , isCommentTag
                                 ) where
 
+import Control.Applicative ((<|>))
+import Control.Arrow (first)
+import Control.Monad (guard, mplus, msum, mzero, unless, void)
+import Control.Monad.Except (throwError)
+import Control.Monad.Reader (ReaderT, ask, asks, lift, local, runReaderT)
+import Data.Char (isAlphaNum, isDigit, isLetter)
+import Data.Default (Default (..), def)
+import Data.Foldable (for_)
+import Data.List (isPrefixOf)
+import Data.List.Split (wordsBy, splitWhen)
+import qualified Data.Map as M
+import Data.Maybe (fromMaybe, isJust, isNothing)
+import Data.Monoid (First (..), (<>))
+import qualified Data.Set as Set
+import Data.Text (Text)
+import qualified Data.Text as T
+import Network.URI (URI, nonStrictRelativeTo, parseURIReference)
 import Text.HTML.TagSoup
 import Text.HTML.TagSoup.Match
-import Text.Pandoc.Definition
+import Text.Pandoc.Builder (Blocks, HasMeta (..), Inlines, trimInlines)
 import qualified Text.Pandoc.Builder as B
-import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..))
-import Text.Pandoc.Shared ( extractSpaces, renderTags', addMetaField
-                          , escapeURI, safeRead, mapLeft )
-import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace)
-                           , Extension (Ext_epub_html_exts,
-                               Ext_native_divs, Ext_native_spans))
+import Text.Pandoc.Class (PandocMonad (..))
+import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
+import Text.Pandoc.Definition
+import Text.Pandoc.Error
+import Text.Pandoc.Logging
+import Text.Pandoc.Options (
+    Extension (Ext_epub_html_exts, Ext_empty_paragraphs, Ext_native_divs,
+               Ext_native_spans, Ext_raw_html, Ext_line_blocks),
+    ReaderOptions (readerExtensions, readerStripComments),
+    extensionEnabled)
 import Text.Pandoc.Parsing hiding ((<|>))
+import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI,
+                           extractSpaces, safeRead, underlineSpan)
 import Text.Pandoc.Walk
-import qualified Data.Map as M
-import Data.Maybe ( fromMaybe, isJust)
-import Data.List ( intercalate, isInfixOf, isPrefixOf )
-import Data.Char ( isDigit )
-import Control.Monad ( guard, when, mzero, void, unless )
-import Control.Arrow ((***))
-import Control.Applicative ( (<|>) )
-import Data.Monoid (First (..))
-import Text.Printf (printf)
-import Debug.Trace (trace)
-import Text.TeXMath (readMathML, writeTeX)
-import Data.Default (Default (..), def)
-import Control.Monad.Reader (Reader,ask, asks, local, runReader)
-import Network.URI (URI, parseURIReference, nonStrictRelativeTo)
-import Text.Pandoc.Error
-import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
-import Data.Monoid ((<>))
 import Text.Parsec.Error
-import qualified Data.Set as Set
+import Text.TeXMath (readMathML, writeTeX)
 
 -- | Convert HTML-formatted string to 'Pandoc' document.
-readHtml :: ReaderOptions -- ^ Reader options
-         -> String        -- ^ String to parse (assumes @'\n'@ line endings)
-         -> Either PandocError Pandoc
-readHtml opts inp =
-    mapLeft (ParseFailure . getError) . flip runReader def $
-      runParserT parseDoc
-        (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty)
-        "source" tags
-    where tags = stripPrefixes . canonicalizeTags $
-                   parseTagsOptions parseOptions{ optTagPosition = True } inp
-          parseDoc = do
-             blocks <- (fixPlains False) . mconcat <$> manyTill block eof
-             meta <- stateMeta . parserState <$> getState
-             bs' <- replaceNotes (B.toList blocks)
-             return $ Pandoc meta bs'
-          getError (errorMessages -> ms) = case ms of
-                                                []    -> ""
-                                                (m:_) -> messageString m
-
-replaceNotes :: [Block] -> TagParser [Block]
+readHtml :: PandocMonad m
+         => ReaderOptions -- ^ Reader options
+         -> Text        -- ^ String to parse (assumes @'\n'@ line endings)
+         -> m Pandoc
+readHtml opts inp = do
+  let tags = stripPrefixes . canonicalizeTags $
+             parseTagsOptions parseOptions{ optTagPosition = True }
+             (crFilter inp)
+      parseDoc = do
+        blocks <- fixPlains False . mconcat <$> manyTill block eof
+        meta <- stateMeta . parserState <$> getState
+        bs' <- replaceNotes (B.toList blocks)
+        reportLogMessages
+        return $ Pandoc meta bs'
+      getError (errorMessages -> ms) = case ms of
+                                         []    -> ""
+                                         (m:_) -> messageString m
+  result <- flip runReaderT def $
+       runParserT parseDoc
+       (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty [])
+       "source" tags
+  case result of
+    Right doc -> return doc
+    Left  err -> throwError $ PandocParseError $ getError err
+
+replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block]
 replaceNotes = walkM replaceNotes'
 
-replaceNotes' :: Inline -> TagParser Inline
+replaceNotes' :: PandocMonad m => Inline -> TagParser m Inline
 replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes
   where
     getNotes = noteTable <$> getState
@@ -105,38 +121,46 @@ data HTMLState =
      noteTable   :: [(String, Blocks)],
      baseHref    :: Maybe URI,
      identifiers :: Set.Set String,
-     headerMap   :: M.Map Inlines String
+     headerMap   :: M.Map Inlines String,
+     logMessages :: [LogMessage]
   }
 
 data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
-                           , inChapter :: Bool -- ^ Set if in chapter section
-                           , inPlain :: Bool -- ^ Set if in pPlain
+                           , inChapter    :: Bool -- ^ Set if in chapter section
+                           , inPlain      :: Bool -- ^ Set if in pPlain
                            }
 
-setInChapter :: HTMLParser s a -> HTMLParser s a
+setInChapter :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
 setInChapter = local (\s -> s {inChapter = True})
 
-setInPlain :: HTMLParser s a -> HTMLParser s a
+setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
 setInPlain = local (\s -> s {inPlain = True})
 
-type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal)
+type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m)
 
-type TagParser = HTMLParser [Tag String]
+type TagParser m = HTMLParser m [Tag Text]
 
-pBody :: TagParser Blocks
+pHtml :: PandocMonad m => TagParser m Blocks
+pHtml = try $ do
+  (TagOpen "html" attr) <- lookAhead pAnyTag
+  for_ (lookup "lang" attr) $
+    updateState . B.setMeta "lang" . B.text . T.unpack
+  pInTags "html" block
+
+pBody :: PandocMonad m => TagParser m Blocks
 pBody = pInTags "body" block
 
-pHead :: TagParser Blocks
+pHead :: PandocMonad m => TagParser m Blocks
 pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag)
   where pTitle = pInTags "title" inline >>= setTitle . trimInlines
-        setTitle t = mempty <$ (updateState $ B.setMeta "title" t)
+        setTitle t = mempty <$ updateState (B.setMeta "title" t)
         pMetaTag = do
-          mt <- pSatisfy (~== TagOpen "meta" [])
-          let name = fromAttrib "name" mt
+          mt <- pSatisfy (matchTagOpen "meta" [])
+          let name = T.unpack $ fromAttrib "name" mt
           if null name
              then return mempty
              else do
-               let content = fromAttrib "content" mt
+               let content = T.unpack $ fromAttrib "content" mt
                updateState $ \s ->
                  let ps = parserState s in
                  s{ parserState = ps{
@@ -144,15 +168,13 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag
                                    (stateMeta ps) } }
                return mempty
         pBaseTag = do
-          bt <- pSatisfy (~== TagOpen "base" [])
+          bt <- pSatisfy (matchTagOpen "base" [])
           updateState $ \st -> st{ baseHref =
-               parseURIReference $ fromAttrib "href" bt }
+               parseURIReference $ T.unpack $ fromAttrib "href" bt }
           return mempty
 
-block :: TagParser Blocks
+block :: PandocMonad m => TagParser m Blocks
 block = do
-  tr <- getOption readerTrace
-  pos <- getPosition
   res <- choice
             [ eSection
             , eSwitch B.para block
@@ -166,94 +188,107 @@ block = do
             , pList
             , pHrule
             , pTable
+            , pHtml
             , pHead
             , pBody
+            , pLineBlock
             , pDiv
             , pPlain
+            , pFigure
             , pRawHtmlBlock
             ]
-  when tr $ trace (printf "line %d: %s" (sourceLine pos)
-             (take 60 $ show $ B.toList res)) (return ())
+  trace (take 60 $ show $ B.toList res)
   return res
 
-namespaces :: [(String, TagParser Inlines)]
+namespaces :: PandocMonad m => [(String, TagParser m Inlines)]
 namespaces = [(mathMLNamespace, pMath True)]
 
 mathMLNamespace :: String
 mathMLNamespace = "http://www.w3.org/1998/Math/MathML"
 
-eSwitch :: Monoid a => (Inlines -> a) -> TagParser a -> TagParser a
+eSwitch :: (PandocMonad m, Monoid a)
+        => (Inlines -> a)
+        -> TagParser m a
+        -> TagParser m a
 eSwitch constructor parser = try $ do
   guardEnabled Ext_epub_html_exts
-  pSatisfy (~== TagOpen "switch" [])
+  pSatisfy (matchTagOpen "switch" [])
   cases <- getFirst . mconcat <$>
             manyTill (First <$> (eCase <* skipMany pBlank) )
-              (lookAhead $ try $ pSatisfy (~== TagOpen "default" []))
+              (lookAhead $ try $ pSatisfy (matchTagOpen "default" []))
   skipMany pBlank
   fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank)
   skipMany pBlank
-  pSatisfy (~== TagClose "switch")
+  pSatisfy (matchTagClose "switch")
   return $ maybe fallback constructor cases
 
-eCase :: TagParser (Maybe Inlines)
+eCase :: PandocMonad m => TagParser m (Maybe Inlines)
 eCase = do
   skipMany pBlank
-  TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" [])
-  case (flip lookup namespaces) =<< lookup "required-namespace" attr of
-    Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank))
-    Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case"))
+  TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "case" [])
+  let attr = toStringAttr attr'
+  case flip lookup namespaces =<< lookup "required-namespace" attr of
+    Just p -> Just <$> pInTags "case" (skipMany pBlank *> p <* skipMany pBlank)
+    Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (matchTagClose "case"))
 
-eFootnote :: TagParser ()
+eFootnote :: PandocMonad m => TagParser m ()
 eFootnote = try $ do
   let notes = ["footnote", "rearnote"]
   guardEnabled Ext_epub_html_exts
-  (TagOpen tag attr) <- lookAhead $ pAnyTag
-  guard (maybe False (flip elem notes) (lookup "type" attr))
+  (TagOpen tag attr') <- lookAhead pAnyTag
+  let attr = toStringAttr attr'
+  guard $ maybe False (`elem` notes) (lookup "type" attr)
   let ident = fromMaybe "" (lookup "id" attr)
   content <- pInTags tag block
   addNote ident content
 
-addNote :: String -> Blocks -> TagParser ()
-addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)})
+addNote :: PandocMonad m => String -> Blocks -> TagParser m ()
+addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : noteTable s})
 
-eNoteref :: TagParser Inlines
+eNoteref :: PandocMonad m => TagParser m Inlines
 eNoteref = try $ do
   guardEnabled Ext_epub_html_exts
-  TagOpen tag attr <- lookAhead $ pAnyTag
-  guard (maybe False (== "noteref") (lookup "type" attr))
+  TagOpen tag attr' <- lookAhead pAnyTag
+  let attr = toStringAttr attr'
+  guard $ lookup "type" attr == Just "noteref"
   let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr)
   guard (not (null ident))
   pInTags tag block
   return $ B.rawInline "noteref" ident
 
 -- Strip TOC if there is one, better to generate again
-eTOC :: TagParser ()
+eTOC :: PandocMonad m => TagParser m ()
 eTOC = try $ do
   guardEnabled Ext_epub_html_exts
-  (TagOpen tag attr) <- lookAhead $ pAnyTag
-  guard (maybe False (== "toc") (lookup "type" attr))
+  (TagOpen tag attr) <- lookAhead pAnyTag
+  guard $ lookup "type" attr == Just "toc"
   void (pInTags tag block)
 
-pList :: TagParser Blocks
+pList :: PandocMonad m => TagParser m Blocks
 pList = pBulletList <|> pOrderedList <|> pDefinitionList
 
-pBulletList :: TagParser Blocks
+pBulletList :: PandocMonad m => TagParser m Blocks
 pBulletList = try $ do
-  pSatisfy (~== TagOpen "ul" [])
+  pSatisfy (matchTagOpen "ul" [])
   let nonItem = pSatisfy (\t ->
                   not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
-                  not (t ~== TagClose "ul"))
+                  not (matchTagClose "ul" t))
   -- note: if they have an <ol> or <ul> not in scope of a <li>,
   -- treat it as a list item, though it's not valid xhtml...
   skipMany nonItem
   items <- manyTill (pListItem nonItem) (pCloses "ul")
   return $ B.bulletList $ map (fixPlains True) items
 
-pListItem :: TagParser a -> TagParser Blocks
+pListItem :: PandocMonad m => TagParser m a -> TagParser m Blocks
 pListItem nonItem = do
-  TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" [])
-  let liDiv = maybe mempty (\x -> B.divWith (x, [], []) mempty) (lookup "id" attr)
-  (liDiv <>) <$> pInTags "li" block <* skipMany nonItem
+  TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "li" [])
+  let attr = toStringAttr attr'
+  let addId ident bs = case B.toList bs of
+                           (Plain ils:xs) -> B.fromList (Plain
+                                [Span (ident, [], []) ils] : xs)
+                           _ -> B.divWith (ident, [], []) bs
+  maybe id addId (lookup "id" attr) <$>
+    pInTags "li" block <* skipMany nonItem
 
 parseListStyleType :: String -> ListNumberStyle
 parseListStyleType "lower-roman" = LowerRoman
@@ -271,9 +306,10 @@ parseTypeAttr "A" = UpperAlpha
 parseTypeAttr "1" = Decimal
 parseTypeAttr _   = DefaultStyle
 
-pOrderedList :: TagParser Blocks
+pOrderedList :: PandocMonad m => TagParser m Blocks
 pOrderedList = try $ do
-  TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" [])
+  TagOpen _ attribs' <- pSatisfy (matchTagOpen "ol" [])
+  let attribs = toStringAttr attribs'
   let (start, style) = (sta', sty')
                        where sta = fromMaybe "1" $
                                    lookup "start" attribs
@@ -295,23 +331,23 @@ pOrderedList = try $ do
                                       ]
   let nonItem = pSatisfy (\t ->
                   not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
-                  not (t ~== TagClose "ol"))
+                  not (matchTagClose "ol" t))
   -- note: if they have an <ol> or <ul> not in scope of a <li>,
   -- treat it as a list item, though it's not valid xhtml...
   skipMany nonItem
   items <- manyTill (pListItem nonItem) (pCloses "ol")
   return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items
 
-pDefinitionList :: TagParser Blocks
+pDefinitionList :: PandocMonad m => TagParser m Blocks
 pDefinitionList = try $ do
-  pSatisfy (~== TagOpen "dl" [])
+  pSatisfy (matchTagOpen "dl" [])
   items <- manyTill pDefListItem (pCloses "dl")
   return $ B.definitionList items
 
-pDefListItem :: TagParser (Inlines, [Blocks])
+pDefListItem :: PandocMonad m => TagParser m (Inlines, [Blocks])
 pDefListItem = try $ do
-  let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) &&
-                  not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl"))
+  let nonItem = pSatisfy (\t -> not (matchTagOpen "dt" [] t) &&
+                  not (matchTagOpen "dd" [] t) && not (matchTagClose "dl" t))
   terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline)
   defs  <- many1 (try $ skipMany nonItem >> pInTags "dd" block)
   skipMany nonItem
@@ -322,133 +358,165 @@ fixPlains :: Bool -> Blocks -> Blocks
 fixPlains inList bs = if any isParaish bs'
                          then B.fromList $ map plainToPara bs'
                          else bs
-  where isParaish (Para _) = True
-        isParaish (CodeBlock _ _) = True
-        isParaish (Header _ _ _) = True
-        isParaish (BlockQuote _) = True
-        isParaish (BulletList _) = not inList
-        isParaish (OrderedList _ _) = not inList
-        isParaish (DefinitionList _) = not inList
-        isParaish _        = False
+  where isParaish Para{}           = True
+        isParaish CodeBlock{}      = True
+        isParaish Header{}         = True
+        isParaish BlockQuote{}     = True
+        isParaish BulletList{}     = not inList
+        isParaish OrderedList{}    = not inList
+        isParaish DefinitionList{} = not inList
+        isParaish _                = False
         plainToPara (Plain xs) = Para xs
-        plainToPara x = x
+        plainToPara x          = x
         bs' = B.toList bs
 
-pRawTag :: TagParser String
+pRawTag :: PandocMonad m => TagParser m Text
 pRawTag = do
   tag <- pAnyTag
   let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"]
   if tagOpen ignorable (const True) tag || tagClose ignorable tag
-     then return []
+     then return mempty
      else return $ renderTags' [tag]
 
-pDiv :: TagParser Blocks
+pLineBlock :: PandocMonad m => TagParser m Blocks
+pLineBlock = try $ do
+  guardEnabled Ext_line_blocks
+  _ <- pSatisfy $ tagOpen (=="div") (== [("class","line-block")])
+  ils <- trimInlines . mconcat <$> manyTill inline (pSatisfy (tagClose (=="div")))
+  let lns = map B.fromList $
+            splitWhen (== LineBreak) $ filter (/= SoftBreak) $
+            B.toList ils
+  return $ B.lineBlock lns
+
+pDiv :: PandocMonad m => TagParser m Blocks
 pDiv = try $ do
   guardEnabled Ext_native_divs
-  let isDivLike "div" = True
+  let isDivLike "div"     = True
       isDivLike "section" = True
-      isDivLike _ = False
-  TagOpen tag attr <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True)
+      isDivLike "main"    = True
+      isDivLike _         = False
+  TagOpen tag attr' <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True)
+  let attr = toStringAttr attr'
   contents <- pInTags tag block
   let (ident, classes, kvs) = mkAttr attr
   let classes' = if tag == "section"
                     then "section":classes
                     else classes
-  return $ B.divWith (ident, classes', kvs) contents
+      kvs' = if tag == "main" && isNothing (lookup "role" kvs)
+               then ("role", "main"):kvs
+               else kvs
+  return $ B.divWith (ident, classes', kvs') contents
 
-pRawHtmlBlock :: TagParser Blocks
+pRawHtmlBlock :: PandocMonad m => TagParser m Blocks
 pRawHtmlBlock = do
-  raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag
-  parseRaw <- getOption readerParseRaw
-  if parseRaw && not (null raw)
+  raw <- T.unpack <$> (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag)
+  exts <- getOption readerExtensions
+  if extensionEnabled Ext_raw_html exts && not (null raw)
      then return $ B.rawBlock "html" raw
-     else return mempty
+     else ignore raw
+
+ignore :: (Monoid a, PandocMonad m) => String -> TagParser m a
+ignore raw = do
+  pos <- getPosition
+  -- raw can be null for tags like <!DOCTYPE>; see paRawTag
+  -- in this case we don't want a warning:
+  unless (null raw) $
+    logMessage $ SkippedContent raw pos
+  return mempty
 
-pHtmlBlock :: String -> TagParser String
+pHtmlBlock :: PandocMonad m => Text -> TagParser m Text
 pHtmlBlock t = try $ do
-  open <- pSatisfy (~== TagOpen t [])
-  contents <- manyTill pAnyTag (pSatisfy (~== TagClose t))
-  return $ renderTags' $ [open] ++ contents ++ [TagClose t]
+  open <- pSatisfy (matchTagOpen t [])
+  contents <- manyTill pAnyTag (pSatisfy (matchTagClose t))
+  return $ renderTags' $ [open] <> contents <> [TagClose t]
 
 -- Sets chapter context
-eSection :: TagParser Blocks
+eSection :: PandocMonad m => TagParser m Blocks
 eSection = try $ do
-  let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as)
+  let matchChapter as = maybe False (T.isInfixOf "chapter") (lookup "type" as)
   let sectTag = tagOpen (`elem` sectioningContent) matchChapter
   TagOpen tag _ <- lookAhead $ pSatisfy sectTag
   setInChapter (pInTags tag block)
 
-headerLevel :: String -> TagParser Int
-headerLevel tagtype = do
-  let level = read (drop 1 tagtype)
-  (try $ do
-    guardEnabled Ext_epub_html_exts
-    asks inChapter >>= guard
-    return (level - 1))
-    <|>
-      return level
-
-eTitlePage :: TagParser ()
+headerLevel :: PandocMonad m => Text -> TagParser m Int
+headerLevel tagtype =
+  case safeRead (T.unpack (T.drop 1 tagtype)) of
+        Just level ->
+--          try (do
+--            guardEnabled Ext_epub_html_exts
+--            asks inChapter >>= guard
+--            return (level - 1))
+--            <|>
+              return level
+        Nothing -> fail "Could not retrieve header level"
+
+eTitlePage :: PandocMonad m => TagParser m ()
 eTitlePage = try $ do
-  let isTitlePage as = maybe False (isInfixOf "titlepage") (lookup "type" as)
+  let isTitlePage as = maybe False (T.isInfixOf "titlepage") (lookup "type" as)
   let groupTag = tagOpen (\x -> x `elem` groupingContent || x == "section")
                           isTitlePage
   TagOpen tag _ <- lookAhead $ pSatisfy groupTag
   () <$ pInTags tag block
 
-pHeader :: TagParser Blocks
+pHeader :: PandocMonad m => TagParser m Blocks
 pHeader = try $ do
-  TagOpen tagtype attr <- pSatisfy $
+  TagOpen tagtype attr' <- pSatisfy $
                            tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
                            (const True)
-  let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")]
+  let attr = toStringAttr attr'
+  let bodyTitle = TagOpen tagtype attr' ~== TagOpen ("h1" :: Text)
+                                               [("class","title")]
   level <- headerLevel tagtype
   contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)
   let ident = fromMaybe "" $ lookup "id" attr
   let classes = maybe [] words $ lookup "class" attr
   let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"]
-  attr' <- registerHeader (ident, classes, keyvals) contents
+  attr'' <- registerHeader (ident, classes, keyvals) contents
   return $ if bodyTitle
               then mempty  -- skip a representation of the title in the body
-              else B.headerWith attr' level contents
+              else B.headerWith attr'' level contents
 
-pHrule :: TagParser Blocks
+pHrule :: PandocMonad m => TagParser m Blocks
 pHrule = do
   pSelfClosing (=="hr") (const True)
   return B.horizontalRule
 
-pTable :: TagParser Blocks
+pTable :: PandocMonad m => TagParser m Blocks
 pTable = try $ do
-  TagOpen _ _ <- pSatisfy (~== TagOpen "table" [])
+  TagOpen _ _ <- pSatisfy (matchTagOpen "table" [])
   skipMany pBlank
   caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
   widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
   let pTh = option [] $ pInTags "tr" (pCell "th")
-      pTr = try $ skipMany pBlank >> pInTags "tr" (pCell "td" <|> pCell "th")
-      pTBody = do pOptInTag "tbody" $ many1 pTr
+      pTr = try $ skipMany pBlank >>
+                  pInTags "tr" (pCell "td" <|> pCell "th")
+      pTBody = pOptInTag "tbody" $ many1 pTr
   head'' <- pOptInTag "thead" pTh
-  head'  <- pOptInTag "tbody" $ do
-              if null head''
-                 then pTh
-                 else return head''
+  head'  <- map snd <$>
+             pOptInTag "tbody"
+               (if null head'' then pTh else return head'')
   rowsLs <- many pTBody
   rows'  <- pOptInTag "tfoot" $ many pTr
-  TagClose _ <- pSatisfy (~== TagClose "table")
-  let rows'' = (concat rowsLs) ++ rows'
+  TagClose _ <- pSatisfy (matchTagClose "table")
+  let rows'' = concat rowsLs <> rows'
+  let rows''' = map (map snd) rows''
+  -- let rows''' = map (map snd) rows''
   -- fail on empty table
-  guard $ not $ null head' && null rows''
+  guard $ not $ null head' && null rows'''
   let isSinglePlain x = case B.toList x of
                              []        -> True
                              [Plain _] -> True
                              _         -> False
-  let isSimple = all isSinglePlain $ concat (head':rows'')
-  let cols = length $ if null head' then head rows'' else head'
+  let isSimple = all isSinglePlain $ concat (head':rows''')
+  let cols = length $ if null head' then head rows''' else head'
   -- add empty cells to short rows
   let addEmpties r = case cols - length r of
-                           n | n > 0 -> r ++ replicate n mempty
+                           n | n > 0 -> r <> replicate n mempty
                              | otherwise -> r
-  let rows = map addEmpties rows''
-  let aligns = replicate cols AlignDefault
+  let rows = map addEmpties rows'''
+  let aligns = case rows'' of
+                    (cs:_) -> map fst cs
+                    _      -> replicate cols AlignDefault
   let widths = if null widths'
                   then if isSimple
                        then replicate cols 0
@@ -456,80 +524,120 @@ pTable = try $ do
                   else widths'
   return $ B.table caption (zip aligns widths) head' rows
 
-pCol :: TagParser Double
+pCol :: PandocMonad m => TagParser m Double
 pCol = try $ do
-  TagOpen _ attribs <- pSatisfy (~== TagOpen "col" [])
+  TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
+  let attribs = toStringAttr attribs'
   skipMany pBlank
-  optional $ pSatisfy (~== TagClose "col")
+  optional $ pSatisfy (matchTagClose "col")
   skipMany pBlank
-  return $ case lookup "width" attribs of
+  let width = case lookup "width" attribs of
            Nothing -> case lookup "style" attribs of
                Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs ->
-                 fromMaybe 0.0 $ safeRead ('0':'.':filter
-                   (`notElem` " \t\r\n%'\";") xs)
+                 fromMaybe 0.0 $ safeRead (filter
+                   (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
                _ -> 0.0
            Just x | not (null x) && last x == '%' ->
-             fromMaybe 0.0 $ safeRead ('0':'.':init x)
+             fromMaybe 0.0 $ safeRead (init x)
            _ -> 0.0
+  if width > 0.0
+    then return $ width / 100.0
+    else return 0.0
 
-pColgroup :: TagParser [Double]
+pColgroup :: PandocMonad m => TagParser m [Double]
 pColgroup = try $ do
-  pSatisfy (~== TagOpen "colgroup" [])
+  pSatisfy (matchTagOpen "colgroup" [])
   skipMany pBlank
   manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
 
-noColOrRowSpans :: Tag String -> Bool
+noColOrRowSpans :: Tag Text -> Bool
 noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan"
   where isNullOrOne x = case fromAttrib x t of
                               ""  -> True
                               "1" -> True
                               _   -> False
 
-pCell :: String -> TagParser [Blocks]
+pCell :: PandocMonad m => Text -> TagParser m [(Alignment, Blocks)]
 pCell celltype = try $ do
   skipMany pBlank
+  tag <- lookAhead $
+           pSatisfy (\t -> t ~== TagOpen celltype [] && noColOrRowSpans t)
+  let extractAlign' []                 = ""
+      extractAlign' ("text-align":x:_) = x
+      extractAlign' (_:xs)             = extractAlign' xs
+  let extractAlign = extractAlign' . wordsBy (`elem` [' ','\t',';',':'])
+  let align = case maybeFromAttrib "align" tag `mplus`
+                   (extractAlign <$> maybeFromAttrib "style" tag) of
+                   Just "left"   -> AlignLeft
+                   Just "right"  -> AlignRight
+                   Just "center" -> AlignCenter
+                   _             -> AlignDefault
   res <- pInTags' celltype noColOrRowSpans block
   skipMany pBlank
-  return [res]
+  return [(align, res)]
 
-pBlockQuote :: TagParser Blocks
+pBlockQuote :: PandocMonad m => TagParser m Blocks
 pBlockQuote = do
   contents <- pInTags "blockquote" block
   return $ B.blockQuote $ fixPlains False contents
 
-pPlain :: TagParser Blocks
+pPlain :: PandocMonad m => TagParser m Blocks
 pPlain = do
   contents <- setInPlain $ trimInlines . mconcat <$> many1 inline
   if B.isNull contents
      then return mempty
      else return $ B.plain contents
 
-pPara :: TagParser Blocks
+pPara :: PandocMonad m => TagParser m Blocks
 pPara = do
   contents <- trimInlines <$> pInTags "p" inline
-  return $ B.para contents
+  (do guardDisabled Ext_empty_paragraphs
+      guard (B.isNull contents)
+      return mempty)
+    <|> return (B.para contents)
+
+pFigure :: PandocMonad m => TagParser m Blocks
+pFigure = try $ do
+  TagOpen _ _ <- pSatisfy (matchTagOpen "figure" [])
+  skipMany pBlank
+  let pImg  = (\x -> (Just x, Nothing)) <$>
+               (pOptInTag "p" pImage <* skipMany pBlank)
+      pCapt = (\x -> (Nothing, Just x)) <$> do
+                bs <- pInTags "figcaption" block
+                return $ blocksToInlines' $ B.toList bs
+      pSkip = (Nothing, Nothing) <$ pSatisfy (not . matchTagClose "figure")
+  res <- many (pImg <|> pCapt <|> pSkip)
+  let mbimg = msum $ map fst res
+  let mbcap = msum $ map snd res
+  TagClose _ <- pSatisfy (matchTagClose "figure")
+  let caption = fromMaybe mempty mbcap
+  case B.toList <$> mbimg of
+       Just [Image attr _ (url, tit)] ->
+         return $ B.para $ B.imageWith attr url ("fig:" ++ tit) caption
+       _ -> mzero
 
-pCodeBlock :: TagParser Blocks
+pCodeBlock :: PandocMonad m => TagParser m Blocks
 pCodeBlock = try $ do
-  TagOpen _ attr <- pSatisfy (~== TagOpen "pre" [])
+  TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" [])
+  let attr = toStringAttr attr'
   contents <- manyTill pAnyTag (pCloses "pre" <|> eof)
   let rawText = concatMap tagToString contents
   -- drop leading newline if any
   let result' = case rawText of
-                     '\n':xs  -> xs
-                     _        -> rawText
+                     '\n':xs -> xs
+                     _       -> rawText
   -- drop trailing newline if any
   let result = case reverse result' of
-                    '\n':_   -> init result'
-                    _        -> result'
+                    '\n':_ -> init result'
+                    _      -> result'
   return $ B.codeBlockWith (mkAttr attr) result
 
-tagToString :: Tag String -> String
-tagToString (TagText s) = s
+tagToString :: Tag Text -> String
+tagToString (TagText s)      = T.unpack s
 tagToString (TagOpen "br" _) = "\n"
-tagToString _ = ""
+tagToString _                = ""
 
-inline :: TagParser Inlines
+inline :: PandocMonad m => TagParser m Inlines
 inline = choice
            [ eNoteref
            , eSwitch id inline
@@ -540,6 +648,7 @@ inline = choice
            , pSuperscript
            , pSubscript
            , pStrikeout
+           , pUnderline
            , pLineBreak
            , pLink
            , pImage
@@ -549,30 +658,31 @@ inline = choice
            , pRawHtmlInline
            ]
 
-pLocation :: TagParser ()
+pLocation :: PandocMonad m => TagParser m ()
 pLocation = do
   (TagPosition r c) <- pSat isTagPosition
   setPosition $ newPos "input" r c
 
-pSat :: (Tag String -> Bool) -> TagParser (Tag String)
+pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
 pSat f = do
   pos <- getPosition
   token show (const pos) (\x -> if f x then Just x else Nothing)
 
-pSatisfy :: (Tag String -> Bool) -> TagParser (Tag String)
+pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
 pSatisfy f = try $ optional pLocation >> pSat f
 
-pAnyTag :: TagParser (Tag String)
+pAnyTag :: PandocMonad m => TagParser m (Tag Text)
 pAnyTag = pSatisfy (const True)
 
-pSelfClosing :: (String -> Bool) -> ([Attribute String] -> Bool)
-             -> TagParser (Tag String)
+pSelfClosing :: PandocMonad m
+             => (Text -> Bool) -> ([Attribute Text] -> Bool)
+             -> TagParser m (Tag Text)
 pSelfClosing f g = do
   open <- pSatisfy (tagOpen f g)
   optional $ pSatisfy (tagClose f)
   return open
 
-pQ :: TagParser Inlines
+pQ :: PandocMonad m => TagParser m Inlines
 pQ = do
   context <- asks quoteContext
   let quoteType = case context of
@@ -587,45 +697,50 @@ pQ = do
   withQuoteContext innerQuoteContext $
     pInlinesInTags "q" constructor
 
-pEmph :: TagParser Inlines
+pEmph :: PandocMonad m => TagParser m Inlines
 pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph
 
-pStrong :: TagParser Inlines
+pStrong :: PandocMonad m => TagParser m Inlines
 pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong
 
-pSuperscript :: TagParser Inlines
+pSuperscript :: PandocMonad m => TagParser m Inlines
 pSuperscript = pInlinesInTags "sup" B.superscript
 
-pSubscript :: TagParser Inlines
+pSubscript :: PandocMonad m => TagParser m Inlines
 pSubscript = pInlinesInTags "sub" B.subscript
 
-pStrikeout :: TagParser Inlines
-pStrikeout = do
+pStrikeout :: PandocMonad m => TagParser m Inlines
+pStrikeout =
   pInlinesInTags "s" B.strikeout <|>
     pInlinesInTags "strike" B.strikeout <|>
     pInlinesInTags "del" B.strikeout <|>
-    try (do pSatisfy (~== TagOpen "span" [("class","strikeout")])
+    try (do pSatisfy (matchTagOpen "span" [("class","strikeout")])
             contents <- mconcat <$> manyTill inline (pCloses "span")
             return $ B.strikeout contents)
 
-pLineBreak :: TagParser Inlines
+pUnderline :: PandocMonad m => TagParser m Inlines
+pUnderline = pInlinesInTags "u" underlineSpan <|> pInlinesInTags "ins" underlineSpan
+
+pLineBreak :: PandocMonad m => TagParser m Inlines
 pLineBreak = do
   pSelfClosing (=="br") (const True)
   return B.linebreak
 
 -- Unlike fromAttrib from tagsoup, this distinguishes
 -- between a missing attribute and an attribute with empty content.
-maybeFromAttrib :: String -> Tag String -> Maybe String
-maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs
+maybeFromAttrib :: String -> Tag Text -> Maybe String
+maybeFromAttrib name (TagOpen _ attrs) =
+  T.unpack <$> lookup (T.pack name) attrs
 maybeFromAttrib _ _ = Nothing
 
-pLink :: TagParser Inlines
+pLink :: PandocMonad m => TagParser m Inlines
 pLink = try $ do
   tag <- pSatisfy $ tagOpenLit "a" (const True)
-  let title = fromAttrib "title" tag
+  let title = T.unpack $ fromAttrib "title" tag
   -- take id from id attribute if present, otherwise name
-  let uid = maybe (fromAttrib "name" tag) id $ maybeFromAttrib "id" tag
-  let cls = words $ fromAttrib "class" tag
+  let uid = fromMaybe (T.unpack $ fromAttrib "name" tag) $
+               maybeFromAttrib "id" tag
+  let cls = words $ T.unpack $ fromAttrib "class" tag
   lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a")
   -- check for href; if href, then a link, otherwise a span
   case maybeFromAttrib "href" tag of
@@ -639,128 +754,149 @@ pLink = try $ do
                         _                   -> url'
          return $ B.linkWith (uid, cls, []) (escapeURI url) title lab
 
-pImage :: TagParser Inlines
+pImage :: PandocMonad m => TagParser m Inlines
 pImage = do
   tag <- pSelfClosing (=="img") (isJust . lookup "src")
   mbBaseHref <- baseHref <$> getState
-  let url' = fromAttrib "src" tag
+  let url' = T.unpack $ fromAttrib "src" tag
   let url = case (parseURIReference url', mbBaseHref) of
                  (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs)
                  _                   -> url'
-  let title = fromAttrib "title" tag
-  let alt = fromAttrib "alt" tag
-  let uid = fromAttrib "id" tag
-  let cls = words $ fromAttrib "class" tag
+  let title = T.unpack $ fromAttrib "title" tag
+  let alt = T.unpack $ fromAttrib "alt" tag
+  let uid = T.unpack $ fromAttrib "id" tag
+  let cls = words $ T.unpack $ fromAttrib "class" tag
   let getAtt k = case fromAttrib k tag of
                    "" -> []
-                   v  -> [(k, v)]
-  let kvs = concat $ map getAtt ["width", "height", "sizes", "srcset"]
+                   v  -> [(T.unpack k, T.unpack v)]
+  let kvs = concatMap getAtt ["width", "height", "sizes", "srcset"]
   return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt)
 
-pCode :: TagParser Inlines
+pCode :: PandocMonad m => TagParser m Inlines
 pCode = try $ do
-  (TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
+  (TagOpen open attr') <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
+  let attr = toStringAttr attr'
   result <- manyTill pAnyTag (pCloses open)
-  return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result
+  return $ B.codeWith (mkAttr attr) $ unwords $ lines $ T.unpack $
+           innerText result
 
-pSpan :: TagParser Inlines
+pSpan :: PandocMonad m => TagParser m Inlines
 pSpan = try $ do
   guardEnabled Ext_native_spans
-  TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True)
+  TagOpen _ attr' <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True)
+  let attr = toStringAttr attr'
   contents <- pInTags "span" inline
-  let isSmallCaps = fontVariant == "small-caps"
+  let isSmallCaps = fontVariant == "small-caps" || "smallcaps" `elem` classes
                     where styleAttr   = fromMaybe "" $ lookup "style" attr
                           fontVariant = fromMaybe "" $ pickStyleAttrProps ["font-variant"] styleAttr
+                          classes     = fromMaybe [] $
+                                          words <$> lookup "class" attr
   let tag = if isSmallCaps then B.smallcaps else B.spanWith (mkAttr attr)
   return $ tag contents
 
-pRawHtmlInline :: TagParser Inlines
+pRawHtmlInline :: PandocMonad m => TagParser m Inlines
 pRawHtmlInline = do
   inplain <- asks inPlain
   result <- pSatisfy (tagComment (const True))
             <|> if inplain
                    then pSatisfy (not . isBlockTag)
                    else pSatisfy isInlineTag
-  parseRaw <- getOption readerParseRaw
-  if parseRaw
-     then return $ B.rawInline "html" $ renderTags' [result]
-     else return mempty
+  exts <- getOption readerExtensions
+  let raw = T.unpack $ renderTags' [result]
+  if extensionEnabled Ext_raw_html exts
+     then return $ B.rawInline "html" raw
+     else ignore raw
 
 mathMLToTeXMath :: String -> Either String String
 mathMLToTeXMath s = writeTeX <$> readMathML s
 
-pMath :: Bool -> TagParser Inlines
+toStringAttr :: [(Text, Text)] -> [(String, String)]
+toStringAttr = map go
+  where go (x,y) = (T.unpack x, T.unpack y)
+
+pMath :: PandocMonad m => Bool -> TagParser m Inlines
 pMath inCase = try $ do
-  open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True)
+  open@(TagOpen _ attr') <- pSatisfy $ tagOpen (=="math") (const True)
   -- we'll assume math tags are MathML unless specially marked
   -- otherwise...
+  let attr = toStringAttr attr'
   unless inCase $
     guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr))
-  contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math"))
-  case mathMLToTeXMath (renderTags $ [open] ++ contents ++ [TagClose "math"]) of
+  contents <- manyTill pAnyTag (pSatisfy (matchTagClose "math"))
+  case mathMLToTeXMath (T.unpack $ renderTags $
+          [open] <> contents <> [TagClose "math"]) of
        Left _   -> return $ B.spanWith ("",["math"],attr) $ B.text $
-                             innerText contents
+                             T.unpack $ innerText contents
        Right [] -> return mempty
        Right x  -> return $ case lookup "display" attr of
                                  Just "block" -> B.displayMath x
                                  _            -> B.math x
 
-pInlinesInTags :: String -> (Inlines -> Inlines)
-               -> TagParser Inlines
+pInlinesInTags :: PandocMonad m => Text -> (Inlines -> Inlines)
+               -> TagParser m Inlines
 pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline
 
-pInTags :: (Monoid a) => String -> TagParser a -> TagParser a
+pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a
 pInTags tagtype parser = pInTags' tagtype (const True) parser
 
-pInTags' :: (Monoid a) => String -> (Tag String -> Bool) -> TagParser a
-                       -> TagParser a
+pInTags' :: (PandocMonad m, Monoid a)
+         => Text
+         -> (Tag Text -> Bool)
+         -> TagParser m a
+         -> TagParser m a
 pInTags' tagtype tagtest parser = try $ do
   pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t)
   mconcat <$> manyTill parser (pCloses tagtype <|> eof)
 
 -- parses p, preceeded by an optional opening tag
 -- and followed by an optional closing tags
-pOptInTag :: String -> TagParser a -> TagParser a
+pOptInTag :: PandocMonad m => Text -> TagParser m a -> TagParser m a
 pOptInTag tagtype p = try $ do
   skipMany pBlank
-  optional $ pSatisfy (~== TagOpen tagtype [])
+  optional $ pSatisfy (matchTagOpen tagtype [])
   skipMany pBlank
   x <- p
   skipMany pBlank
-  optional $ pSatisfy (~== TagClose tagtype)
+  optional $ pSatisfy (matchTagClose tagtype)
   skipMany pBlank
   return x
 
-pCloses :: String -> TagParser ()
+pCloses :: PandocMonad m => Text -> TagParser m ()
 pCloses tagtype = try $ do
   t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
   case t of
-       (TagClose t') | t' == tagtype -> pAnyTag >> return ()
+       (TagClose t') | t' == tagtype -> void pAnyTag
        (TagOpen t' _) | t' `closes` tagtype -> return ()
        (TagClose "ul") | tagtype == "li" -> return ()
        (TagClose "ol") | tagtype == "li" -> return ()
        (TagClose "dl") | tagtype == "dd" -> return ()
        (TagClose "table") | tagtype == "td" -> return ()
        (TagClose "table") | tagtype == "tr" -> return ()
+       (TagClose "td") | tagtype `Set.member` blockHtmlTags -> return ()
+       (TagClose "th") | tagtype `Set.member` blockHtmlTags -> return ()
+       (TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags
+                                            -> return () -- see #3794
        _ -> mzero
 
-pTagText :: TagParser Inlines
+pTagText :: PandocMonad m => TagParser m Inlines
 pTagText = try $ do
   (TagText str) <- pSatisfy isTagText
   st <- getState
   qu <- ask
-  case flip runReader qu $ runParserT (many pTagContents) st "text" str of
-       Left _        -> fail $ "Could not parse `" ++ str ++ "'"
+  parsed <- lift $ lift $
+            flip runReaderT qu $ runParserT (many pTagContents) st "text" str
+  case parsed of
+       Left _        -> throwError $ PandocParseError $ "Could not parse `" <> T.unpack str <> "'"
        Right result  -> return $ mconcat result
 
-pBlank :: TagParser ()
+pBlank :: PandocMonad m => TagParser m ()
 pBlank = try $ do
   (TagText str) <- pSatisfy isTagText
-  guard $ all isSpace str
+  guard $ T.all isSpace str
 
-type InlinesParser = HTMLParser String
+type InlinesParser m = HTMLParser m Text
 
-pTagContents :: InlinesParser Inlines
+pTagContents :: PandocMonad m => InlinesParser m Inlines
 pTagContents =
       B.displayMath <$> mathDisplay
   <|> B.math        <$> mathInline
@@ -770,7 +906,7 @@ pTagContents =
   <|> pSymbol
   <|> pBad
 
-pStr :: InlinesParser Inlines
+pStr :: PandocMonad m => InlinesParser m Inlines
 pStr = do
   result <- many1 $ satisfy $ \c ->
                      not (isSpace c) && not (isSpecial c) && not (isBad c)
@@ -778,24 +914,24 @@ pStr = do
   return $ B.str result
 
 isSpecial :: Char -> Bool
-isSpecial '"' = True
-isSpecial '\'' = True
-isSpecial '.' = True
-isSpecial '-' = True
-isSpecial '$' = True
+isSpecial '"'     = True
+isSpecial '\''    = True
+isSpecial '.'     = True
+isSpecial '-'     = True
+isSpecial '$'     = True
 isSpecial '\8216' = True
 isSpecial '\8217' = True
 isSpecial '\8220' = True
 isSpecial '\8221' = True
-isSpecial _ = False
+isSpecial _       = False
 
-pSymbol :: InlinesParser Inlines
+pSymbol :: PandocMonad m => InlinesParser m Inlines
 pSymbol = satisfy isSpecial >>= return . B.str . (:[])
 
 isBad :: Char -> Bool
 isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
 
-pBad :: InlinesParser Inlines
+pBad :: PandocMonad m => InlinesParser m Inlines
 pBad = do
   c <- satisfy isBad
   let c' = case c of
@@ -829,7 +965,7 @@ pBad = do
                 _      -> '?'
   return $ B.str [c']
 
-pSpace :: InlinesParser Inlines
+pSpace :: PandocMonad m => InlinesParser m Inlines
 pSpace = many1 (satisfy isSpace) >>= \xs ->
             if '\n' `elem` xs
                then return B.softbreak
@@ -839,86 +975,96 @@ pSpace = many1 (satisfy isSpace) >>= \xs ->
 -- Constants
 --
 
-eitherBlockOrInline :: [String]
-eitherBlockOrInline = ["audio", "applet", "button", "iframe", "embed",
-                       "del", "ins",
-                       "progress", "map", "area", "noscript", "script",
-                       "object", "svg", "video", "source"]
-
-{-
-inlineHtmlTags :: [[Char]]
-inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
-                  "br", "cite", "code", "dfn", "em", "font", "i", "img",
-                  "input", "kbd", "label", "q", "s", "samp", "select",
-                  "small", "span", "strike", "strong", "sub", "sup",
-                  "textarea", "tt", "u", "var"]
--}
-
-blockHtmlTags :: [String]
-blockHtmlTags = ["?xml", "!DOCTYPE", "address", "article", "aside",
-                 "blockquote", "body", "button", "canvas",
-                 "caption", "center", "col", "colgroup", "dd", "dir", "div",
-                 "dl", "dt", "fieldset", "figcaption", "figure",
-                 "footer", "form", "h1", "h2", "h3", "h4",
-                 "h5", "h6", "head", "header", "hgroup", "hr", "html",
-                 "isindex", "menu", "noframes", "ol", "output", "p", "pre",
-                 "section", "table", "tbody", "textarea",
-                 "thead", "tfoot", "ul", "dd",
-                 "dt", "frameset", "li", "tbody", "td", "tfoot",
-                 "th", "thead", "tr", "script", "style"]
+eitherBlockOrInline :: Set.Set Text
+eitherBlockOrInline = Set.fromList
+  ["audio", "applet", "button", "iframe", "embed",
+   "del", "ins", "progress", "map", "area", "noscript", "script",
+   "object", "svg", "video", "source"]
+
+blockHtmlTags :: Set.Set Text
+blockHtmlTags = Set.fromList
+   ["?xml", "!DOCTYPE", "address", "article", "aside",
+    "blockquote", "body", "canvas",
+    "caption", "center", "col", "colgroup", "dd", "details",
+    "dir", "div", "dl", "dt", "fieldset", "figcaption", "figure",
+    "footer", "form", "h1", "h2", "h3", "h4",
+    "h5", "h6", "head", "header", "hgroup", "hr", "html",
+    "isindex", "main", "menu", "meta", "noframes", "ol", "output", "p", "pre",
+    "section", "table", "tbody", "textarea",
+    "thead", "tfoot", "ul", "dd",
+    "dt", "frameset", "li", "tbody", "td", "tfoot",
+    "th", "thead", "tr", "script", "style"]
 
 -- We want to allow raw docbook in markdown documents, so we
 -- include docbook block tags here too.
-blockDocBookTags :: [String]
-blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist",
-                    "orderedlist", "segmentedlist", "simplelist",
-                    "variablelist", "caution", "important", "note", "tip",
-                    "warning", "address", "literallayout", "programlisting",
-                    "programlistingco", "screen", "screenco", "screenshot",
-                    "synopsis", "example", "informalexample", "figure",
-                    "informalfigure", "table", "informaltable", "para",
-                    "simpara", "formalpara", "equation", "informalequation",
-                    "figure", "screenshot", "mediaobject", "qandaset",
-                    "procedure", "task", "cmdsynopsis", "funcsynopsis",
-                    "classsynopsis", "blockquote", "epigraph", "msgset",
-                    "sidebar", "title"]
-
-epubTags :: [String]
-epubTags = ["case", "switch", "default"]
-
-blockTags :: [String]
-blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags
-
-isInlineTag :: Tag String -> Bool
-isInlineTag t = tagOpen isInlineTagName (const True) t ||
-                tagClose isInlineTagName t ||
-                tagComment (const True) t
-                 where isInlineTagName x = x `notElem` blockTags
-
-isBlockTag :: Tag String -> Bool
-isBlockTag t = tagOpen isBlockTagName (const True) t ||
-               tagClose isBlockTagName t ||
-               tagComment (const True) t
-                 where isBlockTagName ('?':_) = True
-                       isBlockTagName ('!':_) = True
-                       isBlockTagName x       = x `elem` blockTags
-                                             || x `elem` eitherBlockOrInline
-
-isTextTag :: Tag String -> Bool
+blockDocBookTags :: Set.Set Text
+blockDocBookTags = Set.fromList
+   ["calloutlist", "bibliolist", "glosslist", "itemizedlist",
+    "orderedlist", "segmentedlist", "simplelist",
+    "variablelist", "caution", "important", "note", "tip",
+    "warning", "address", "literallayout", "programlisting",
+    "programlistingco", "screen", "screenco", "screenshot",
+    "synopsis", "example", "informalexample", "figure",
+    "informalfigure", "table", "informaltable", "para",
+    "simpara", "formalpara", "equation", "informalequation",
+    "figure", "screenshot", "mediaobject", "qandaset",
+    "procedure", "task", "cmdsynopsis", "funcsynopsis",
+    "classsynopsis", "blockquote", "epigraph", "msgset",
+    "sidebar", "title"]
+
+epubTags :: Set.Set Text
+epubTags = Set.fromList ["case", "switch", "default"]
+
+blockTags :: Set.Set Text
+blockTags = Set.unions [blockHtmlTags, blockDocBookTags, epubTags]
+
+class NamedTag a where
+  getTagName :: a -> Maybe Text
+
+instance NamedTag (Tag Text) where
+  getTagName (TagOpen t _) = Just t
+  getTagName (TagClose t)  = Just t
+  getTagName _             = Nothing
+
+instance NamedTag (Tag String) where
+  getTagName (TagOpen t _) = Just (T.pack t)
+  getTagName (TagClose t)  = Just (T.pack t)
+  getTagName _             = Nothing
+
+isInlineTag :: NamedTag (Tag a) => Tag a -> Bool
+isInlineTag t = isInlineTagName || isCommentTag t
+                 where isInlineTagName = case getTagName t of
+                                              Just x -> x
+                                                  `Set.notMember` blockTags
+                                              Nothing -> False
+
+isBlockTag :: NamedTag (Tag a) => Tag a -> Bool
+isBlockTag t = isBlockTagName || isTagComment t
+                 where isBlockTagName =
+                         case getTagName t of
+                              Just x
+                                | "?" `T.isPrefixOf` x -> True
+                                | "!" `T.isPrefixOf` x -> True
+                                | otherwise -> x `Set.member` blockTags
+                                    || x `Set.member` eitherBlockOrInline
+                              Nothing -> False
+
+isTextTag :: Tag a -> Bool
 isTextTag = tagText (const True)
 
-isCommentTag :: Tag String -> Bool
+isCommentTag :: Tag a -> Bool
 isCommentTag = tagComment (const True)
 
 -- taken from HXT and extended
 -- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags
-closes :: String -> String -> Bool
+closes :: Text -> Text -> Bool
 _ `closes` "body" = False
 _ `closes` "html" = False
 "body" `closes` "head" = True
 "a" `closes` "a" = True
 "li" `closes` "li" = True
 "th" `closes` t | t `elem` ["th","td"] = True
+"td" `closes` t | t `elem` ["th","td"] = True
 "tr" `closes` t | t `elem` ["th","td","tr"] = True
 "dd" `closes` t | t `elem` ["dt", "dd"] = True
 "dt" `closes` t | t `elem` ["dt","dd"] = True
@@ -926,12 +1072,12 @@ _ `closes` "html" = False
 "optgroup" `closes` "optgroup" = True
 "optgroup" `closes` "option" = True
 "option" `closes` "option" = True
--- http://www.w3.org/TR/html-markup/p.html
+-- https://html.spec.whatwg.org/multipage/syntax.html#optional-tags
 x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote",
    "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4",
-   "h5", "h6", "header", "hr", "menu", "nav", "ol", "p", "pre", "section",
+   "h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section",
    "table", "ul"] = True
-"meta" `closes` "meta" = True
+_ `closes` "meta" = True
 "form" `closes` "form" = True
 "label" `closes` "label" = True
 "map" `closes` "map" = True
@@ -942,17 +1088,18 @@ t `closes` "select" | t /= "option" = True
 "tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
 "tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
 t `closes` t2 |
-   t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","p"] &&
-   t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div"
+   t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] &&
+   t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" or "main"
 t1 `closes` t2 |
-   t1 `elem` blockTags &&
-   t2 `notElem` (blockTags ++ eitherBlockOrInline) = True
+   t1 `Set.member` blockTags &&
+   t2 `Set.notMember` blockTags &&
+   t2 `Set.notMember` eitherBlockOrInline = True
 _ `closes` _ = False
 
 --- parsers for use in markdown, textile readers
 
 -- | Matches a stretch of HTML in balanced tags.
-htmlInBalanced :: (Monad m)
+htmlInBalanced :: (HasReaderOptions st, Monad m)
                => (Tag String -> Bool)
                -> ParserT String st m String
 htmlInBalanced f = try $ do
@@ -973,8 +1120,11 @@ htmlInBalanced f = try $ do
                           let cs = ec - sc
                           lscontents <- unlines <$> count ls anyLine
                           cscontents <- count cs anyChar
-                          (_,closetag) <- htmlTag (~== TagClose tn)
-                          return (lscontents ++ cscontents ++ closetag)
+                          closetag <- do
+                            x <- many (satisfy (/='>'))
+                            char '>'
+                            return (x <> ">")
+                          return (lscontents <> cscontents <> closetag)
                         _ -> mzero
     _ -> mzero
 
@@ -992,64 +1142,99 @@ htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts
         go n (t:ts') = (t :) <$> go n ts'
         go _ [] = mzero
 
-hasTagWarning :: [Tag String] -> Bool
+hasTagWarning :: [Tag a] -> Bool
 hasTagWarning (TagWarning _:_) = True
-hasTagWarning _ = False
+hasTagWarning _                = False
 
 -- | Matches a tag meeting a certain condition.
-htmlTag :: Monad m
+htmlTag :: (HasReaderOptions st, Monad m)
         => (Tag String -> Bool)
         -> ParserT [Char] st m (Tag String, String)
 htmlTag f = try $ do
   lookAhead (char '<')
+  startpos <- getPosition
   inp <- getInput
-  let (next : _) = canonicalizeTags $ parseTagsOptions
-                       parseOptions{ optTagWarning = False } inp
-  guard $ f next
+  let ts = canonicalizeTags $ parseTagsOptions
+                               parseOptions{ optTagWarning = False
+                                           , optTagPosition = True }
+                               (inp ++ " ") -- add space to ensure that
+                               -- we get a TagPosition after the tag
+  (next, ln, col) <- case ts of
+                      (TagPosition{} : next : TagPosition ln col : _)
+                        | f next -> return (next, ln, col)
+                      _ -> mzero
+
+  -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66>
+  -- should NOT be parsed as an HTML tag, see #2277,
+  -- so we exclude . even though it's a valid character
+  -- in XML element names
+  let isNameChar c = isAlphaNum c || c == ':' || c == '-' || c == '_'
+  let isName s = case s of
+                      []      -> False
+                      ('?':_) -> True -- processing instruction
+                      (c:cs)  -> isLetter c && all isNameChar cs
+
+  let endpos = if ln == 1
+                  then setSourceColumn startpos
+                         (sourceColumn startpos + (col - 1))
+                  else setSourceColumn (setSourceLine startpos
+                                        (sourceLine startpos + (ln - 1)))
+                         col
+  let endAngle = try $
+        do char '>'
+           pos <- getPosition
+           guard $ pos >= endpos
+
   let handleTag tagname = do
-       -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66>
-       -- should NOT be parsed as an HTML tag, see #2277
-       guard $ not ('.' `elem` tagname)
+       -- basic sanity check, since the parser is very forgiving
+       -- and finds tags in stuff like x<y)
+       guard $ isName tagname
+       guard $ not $ null tagname
        -- <https://example.org> should NOT be a tag either.
        -- tagsoup will parse it as TagOpen "https:" [("example.org","")]
-       guard $ not (null tagname)
        guard $ last tagname /= ':'
-       rendered <- manyTill anyChar (char '>')
-       return (next, rendered ++ ">")
+       char '<'
+       rendered <- manyTill anyChar endAngle
+       return (next, "<" ++ rendered ++ ">")
   case next of
        TagComment s
          | "<!--" `isPrefixOf` inp -> do
-          count (length s + 4) anyChar
-          skipMany (satisfy (/='>'))
-          char '>'
-          return (next, "<!--" ++ s ++ "-->")
+          string "<!--"
+          count (length s) anyChar
+          string "-->"
+          stripComments <- getOption readerStripComments
+          if stripComments
+             then return (next, "")
+             else return (next, "<!--" <> s <> "-->")
          | otherwise -> fail "bogus comment mode, HTML5 parse error"
-       TagOpen tagname _attr -> handleTag tagname
-       TagClose tagname -> handleTag tagname
+       TagOpen tagname attr -> do
+         guard $ all (isName . fst) attr
+         handleTag tagname
+       TagClose tagname ->
+         handleTag tagname
        _ -> mzero
 
 mkAttr :: [(String, String)] -> Attr
 mkAttr attr = (attribsId, attribsClasses, attribsKV)
   where attribsId = fromMaybe "" $ lookup "id" attr
-        attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes
+        attribsClasses = words (fromMaybe "" $ lookup "class" attr) <> epubTypes
         attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
         epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr
 
 -- Strip namespace prefixes
-stripPrefixes :: [Tag String] -> [Tag String]
+stripPrefixes :: [Tag Text] -> [Tag Text]
 stripPrefixes = map stripPrefix
 
-stripPrefix :: Tag String -> Tag String
+stripPrefix :: Tag Text -> Tag Text
 stripPrefix (TagOpen s as) =
-    TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as)
+    TagOpen (stripPrefix' s) (map (first stripPrefix') as)
 stripPrefix (TagClose s) = TagClose (stripPrefix' s)
 stripPrefix x = x
 
-stripPrefix' :: String -> String
+stripPrefix' :: Text -> Text
 stripPrefix' s =
-  case span (/= ':') s of
-    (_, "") -> s
-    (_, (_:ts)) -> ts
+  if T.null t then s else T.drop 1 t
+  where (_, t) = T.span (/= ':') s
 
 isSpace :: Char -> Bool
 isSpace ' '  = True
@@ -1068,9 +1253,13 @@ instance HasHeaderMap HTMLState where
   extractHeaderMap = headerMap
   updateHeaderMap  f s = s{ headerMap = f (headerMap s) }
 
+instance HasLogMessages HTMLState where
+  addLogMessage m s = s{ logMessages = m : logMessages s }
+  getLogMessages = reverse . logMessages
+
 -- This signature should be more general
 -- MonadReader HTMLLocal m => HasQuoteContext st m
-instance HasQuoteContext st (Reader HTMLLocal) where
+instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where
   getQuoteContext = asks quoteContext
   withQuoteContext q = local (\s -> s{quoteContext = q})
 
@@ -1088,19 +1277,32 @@ instance HasLastStrPosition HTMLState where
   setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
   getLastStrPos = getLastStrPos . parserState
 
+-- For now we need a special verison here; the one in Shared has String type
+renderTags' :: [Tag Text] -> Text
+renderTags' = renderTagsOptions
+               renderOptions{ optMinimize = matchTags ["hr", "br", "img",
+                                                       "meta", "link"]
+                            , optRawTag   = matchTags ["script", "style"] }
+              where matchTags tags = flip elem tags . T.toLower
+
 
 -- EPUB Specific
 --
 --
-sectioningContent :: [String]
+sectioningContent :: [Text]
 sectioningContent = ["article", "aside", "nav", "section"]
 
 
-groupingContent :: [String]
+groupingContent :: [Text]
 groupingContent = ["p", "hr", "pre", "blockquote", "ol"
                   , "ul", "li", "dl", "dt", "dt", "dd"
                   , "figure", "figcaption", "div", "main"]
 
+matchTagClose :: Text -> (Tag Text -> Bool)
+matchTagClose t = (~== TagClose t)
+
+matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool)
+matchTagOpen t as = (~== TagOpen t as)
 
 {-
 
@@ -1108,7 +1310,7 @@ types :: [(String, ([String], Int))]
 types =  -- Document divisions
    map (\s -> (s, (["section", "body"], 0)))
     ["volume", "part", "chapter", "division"]
-  ++ -- Document section and components
+  <> -- Document section and components
   [
     ("abstract",  ([], 0))]
 -}