diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/CommonMark.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/CommonMark.hs | 228 |
1 files changed, 168 insertions, 60 deletions
diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs index d20d386e7..6fbc09c17 100644 --- a/src/Text/Pandoc/Readers/CommonMark.hs +++ b/src/Text/Pandoc/Readers/CommonMark.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2015 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2015-2018 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.CommonMark - Copyright : Copyright (C) 2015 John MacFarlane + Copyright : Copyright (C) 2015-2018 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -32,48 +32,94 @@ CommonMark is a strongly specified variant of Markdown: http://commonmark.org. module Text.Pandoc.Readers.CommonMark (readCommonMark) where -import CMark -import Data.Text (unpack, pack) +import CMarkGFM +import Control.Monad.State +import Data.Char (isAlphaNum, isLetter, isSpace, toLower) import Data.List (groupBy) +import qualified Data.Map as Map +import Data.Text (Text, unpack) +import Text.Pandoc.Class (PandocMonad) import Text.Pandoc.Definition +import Text.Pandoc.Emoji (emojis) import Text.Pandoc.Options -import Text.Pandoc.Error +import Text.Pandoc.Shared (stringify) +import Text.Pandoc.Walk (walkM) -- | Parse a CommonMark formatted string into a 'Pandoc' structure. -readCommonMark :: ReaderOptions -> String -> Either PandocError Pandoc -readCommonMark opts = Right . nodeToPandoc . commonmarkToNode opts' . pack - where opts' = if readerSmart opts - then [optNormalize, optSmart] - else [optNormalize] - -nodeToPandoc :: Node -> Pandoc -nodeToPandoc (Node _ DOCUMENT nodes) = - Pandoc nullMeta $ foldr addBlock [] nodes -nodeToPandoc n = -- shouldn't happen - Pandoc nullMeta $ foldr addBlock [] [n] - -addBlocks :: [Node] -> [Block] -addBlocks = foldr addBlock [] - -addBlock :: Node -> [Block] -> [Block] -addBlock (Node _ PARAGRAPH nodes) = - (Para (addInlines nodes) :) -addBlock (Node _ THEMATIC_BREAK _) = +readCommonMark :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readCommonMark opts s = return $ + (if isEnabled Ext_gfm_auto_identifiers opts + then addHeaderIdentifiers + else id) $ + nodeToPandoc opts $ commonmarkToNode opts' exts s + where opts' = [ optSmart | isEnabled Ext_smart opts ] + exts = [ extStrikethrough | isEnabled Ext_strikeout opts ] ++ + [ extTable | isEnabled Ext_pipe_tables opts ] ++ + [ extAutolink | isEnabled Ext_autolink_bare_uris opts ] + +convertEmojis :: String -> String +convertEmojis (':':xs) = + case break (==':') xs of + (ys,':':zs) -> + case Map.lookup ys emojis of + Just s -> s ++ convertEmojis zs + Nothing -> ':' : ys ++ convertEmojis (':':zs) + _ -> ':':xs +convertEmojis (x:xs) = x : convertEmojis xs +convertEmojis [] = [] + +addHeaderIdentifiers :: Pandoc -> Pandoc +addHeaderIdentifiers doc = evalState (walkM addHeaderId doc) mempty + +addHeaderId :: Block -> State (Map.Map String Int) Block +addHeaderId (Header lev (_,classes,kvs) ils) = do + idmap <- get + let ident = toIdent ils + ident' <- case Map.lookup ident idmap of + Nothing -> do + put (Map.insert ident 1 idmap) + return ident + Just i -> do + put (Map.adjust (+ 1) ident idmap) + return (ident ++ "-" ++ show i) + return $ Header lev (ident',classes,kvs) ils +addHeaderId x = return x + +toIdent :: [Inline] -> String +toIdent = map (\c -> if isSpace c then '-' else c) + . filter (\c -> isLetter c || isAlphaNum c || isSpace c || + c == '_' || c == '-') + . map toLower . stringify + +nodeToPandoc :: ReaderOptions -> Node -> Pandoc +nodeToPandoc opts (Node _ DOCUMENT nodes) = + Pandoc nullMeta $ foldr (addBlock opts) [] nodes +nodeToPandoc opts n = -- shouldn't happen + Pandoc nullMeta $ foldr (addBlock opts) [] [n] + +addBlocks :: ReaderOptions -> [Node] -> [Block] +addBlocks opts = foldr (addBlock opts) [] + +addBlock :: ReaderOptions -> Node -> [Block] -> [Block] +addBlock opts (Node _ PARAGRAPH nodes) = + (Para (addInlines opts nodes) :) +addBlock _ (Node _ THEMATIC_BREAK _) = (HorizontalRule :) -addBlock (Node _ BLOCK_QUOTE nodes) = - (BlockQuote (addBlocks nodes) :) -addBlock (Node _ (HTML_BLOCK t) _) = - (RawBlock (Format "html") (unpack t) :) +addBlock opts (Node _ BLOCK_QUOTE nodes) = + (BlockQuote (addBlocks opts nodes) :) +addBlock opts (Node _ (HTML_BLOCK t) _) + | isEnabled Ext_raw_html opts = (RawBlock (Format "html") (unpack t) :) + | otherwise = id -- Note: the cmark parser will never generate CUSTOM_BLOCK, -- so we don't need to handle it: -addBlock (Node _ (CUSTOM_BLOCK _onEnter _onExit) _nodes) = +addBlock _ (Node _ (CUSTOM_BLOCK _onEnter _onExit) _nodes) = id -addBlock (Node _ (CODE_BLOCK info t) _) = +addBlock _ (Node _ (CODE_BLOCK info t) _) = (CodeBlock ("", take 1 (words (unpack info)), []) (unpack t) :) -addBlock (Node _ (HEADING lev) nodes) = - (Header lev ("",[],[]) (addInlines nodes) :) -addBlock (Node _ (LIST listAttrs) nodes) = - (constructor (map (setTightness . addBlocks . children) nodes) :) +addBlock opts (Node _ (HEADING lev) nodes) = + (Header lev ("",[],[]) (addInlines opts nodes) :) +addBlock opts (Node _ (LIST listAttrs) nodes) = + (constructor (map (setTightness . addBlocks opts . children) nodes) :) where constructor = case listType listAttrs of BULLET_LIST -> BulletList ORDERED_LIST -> OrderedList @@ -82,46 +128,108 @@ addBlock (Node _ (LIST listAttrs) nodes) = setTightness = if listTight listAttrs then map paraToPlain else id - paraToPlain (Para xs) = Plain (xs) + paraToPlain (Para xs) = Plain xs paraToPlain x = x delim = case listDelim listAttrs of - PERIOD_DELIM -> Period - PAREN_DELIM -> OneParen -addBlock (Node _ ITEM _) = id -- handled in LIST -addBlock _ = id + PERIOD_DELIM -> Period + PAREN_DELIM -> OneParen +addBlock opts (Node _ (TABLE alignments) nodes) = + (Table [] aligns widths headers rows :) + where aligns = map fromTableCellAlignment alignments + fromTableCellAlignment NoAlignment = AlignDefault + fromTableCellAlignment LeftAligned = AlignLeft + fromTableCellAlignment RightAligned = AlignRight + fromTableCellAlignment CenterAligned = AlignCenter + widths = replicate numcols 0.0 + numcols = if null rows' + then 0 + else maximum $ map length rows' + rows' = map toRow $ filter isRow nodes + (headers, rows) = case rows' of + (h:rs) -> (h, rs) + [] -> ([], []) + isRow (Node _ TABLE_ROW _) = True + isRow _ = False + isCell (Node _ TABLE_CELL _) = True + isCell _ = False + toRow (Node _ TABLE_ROW ns) = map toCell $ filter isCell ns + toRow (Node _ t _) = error $ "toRow encountered non-row " ++ show t + toCell (Node _ TABLE_CELL []) = [] + toCell (Node _ TABLE_CELL (n:ns)) + | isBlockNode n = addBlocks opts (n:ns) + | otherwise = [Plain (addInlines opts (n:ns))] + toCell (Node _ t _) = error $ "toCell encountered non-cell " ++ show t +addBlock _ (Node _ TABLE_ROW _) = id -- handled in TABLE +addBlock _ (Node _ TABLE_CELL _) = id -- handled in TABLE +addBlock _ _ = id + +isBlockNode :: Node -> Bool +isBlockNode (Node _ nodetype _) = + case nodetype of + DOCUMENT -> True + THEMATIC_BREAK -> True + PARAGRAPH -> True + BLOCK_QUOTE -> True + HTML_BLOCK _ -> True + CUSTOM_BLOCK _ _ -> True + CODE_BLOCK _ _ -> True + HEADING _ -> True + LIST _ -> True + ITEM -> True + TEXT _ -> False + SOFTBREAK -> False + LINEBREAK -> False + HTML_INLINE _ -> False + CUSTOM_INLINE _ _ -> False + CODE _ -> False + EMPH -> False + STRONG -> False + LINK _ _ -> False + IMAGE _ _ -> False + STRIKETHROUGH -> False + TABLE _ -> False + TABLE_ROW -> False + TABLE_CELL -> False children :: Node -> [Node] children (Node _ _ ns) = ns -addInlines :: [Node] -> [Inline] -addInlines = foldr addInline [] +addInlines :: ReaderOptions -> [Node] -> [Inline] +addInlines opts = foldr (addInline opts) [] -addInline :: Node -> [Inline] -> [Inline] -addInline (Node _ (TEXT t) _) = (map toinl clumps ++) +addInline :: ReaderOptions -> Node -> [Inline] -> [Inline] +addInline opts (Node _ (TEXT t) _) = (map toinl clumps ++) where raw = unpack t clumps = groupBy samekind raw samekind ' ' ' ' = True samekind ' ' _ = False samekind _ ' ' = False samekind _ _ = True - toinl (' ':_) = Space - toinl xs = Str xs -addInline (Node _ LINEBREAK _) = (LineBreak :) -addInline (Node _ SOFTBREAK _) = (SoftBreak :) -addInline (Node _ (HTML_INLINE t) _) = - (RawInline (Format "html") (unpack t) :) + toinl (' ':_) = Space + toinl xs = Str $ if isEnabled Ext_emoji opts + then convertEmojis xs + else xs +addInline _ (Node _ LINEBREAK _) = (LineBreak :) +addInline opts (Node _ SOFTBREAK _) + | isEnabled Ext_hard_line_breaks opts = (LineBreak :) + | otherwise = (SoftBreak :) +addInline opts (Node _ (HTML_INLINE t) _) + | isEnabled Ext_raw_html opts = (RawInline (Format "html") (unpack t) :) + | otherwise = id -- Note: the cmark parser will never generate CUSTOM_BLOCK, -- so we don't need to handle it: -addInline (Node _ (CUSTOM_INLINE _onEnter _onExit) _nodes) = +addInline _ (Node _ (CUSTOM_INLINE _onEnter _onExit) _nodes) = id -addInline (Node _ (CODE t) _) = +addInline _ (Node _ (CODE t) _) = (Code ("",[],[]) (unpack t) :) -addInline (Node _ EMPH nodes) = - (Emph (addInlines nodes) :) -addInline (Node _ STRONG nodes) = - (Strong (addInlines nodes) :) -addInline (Node _ (LINK url title) nodes) = - (Link nullAttr (addInlines nodes) (unpack url, unpack title) :) -addInline (Node _ (IMAGE url title) nodes) = - (Image nullAttr (addInlines nodes) (unpack url, unpack title) :) -addInline _ = id +addInline opts (Node _ EMPH nodes) = + (Emph (addInlines opts nodes) :) +addInline opts (Node _ STRONG nodes) = + (Strong (addInlines opts nodes) :) +addInline opts (Node _ STRIKETHROUGH nodes) = + (Strikeout (addInlines opts nodes) :) +addInline opts (Node _ (LINK url title) nodes) = + (Link nullAttr (addInlines opts nodes) (unpack url, unpack title) :) +addInline opts (Node _ (IMAGE url title) nodes) = + (Image nullAttr (addInlines opts nodes) (unpack url, unpack title) :) +addInline _ _ = id |