diff options
authorJohn MacFarlane <>2015-03-16 22:20:42 -0700
committerJohn MacFarlane <>2015-03-17 16:15:57 -0700
commite0d234e54d18a82a7c90aa3946f890140e200051 (patch)
parent5721a5d34bc3c0ab4d7fb9424d9f0b6f7055da30 (diff)
Added CommonMark reader using cmark (libcmark bindings).
- Added commonmark as an input format. - Added `Text.Pandoc.Readers.CommonMark.readCommonMark`. - For now, we use the markdown writer to generate benchmark text for the CommonMark reader. We can change this when we get a writer.
5 files changed, 150 insertions, 18 deletions
diff --git a/README b/README
index 41bf0e4db..81e3f877b 100644
--- a/README
+++ b/README
@@ -12,17 +12,18 @@ Description
Pandoc is a [Haskell] library for converting from one markup format to
another, and a command-line tool that uses this library. It can read
-[markdown] and (subsets of) [Textile], [reStructuredText], [HTML],
-[LaTeX], [MediaWiki markup], [TWiki markup], [Haddock markup], [OPML],
-[Emacs Org-mode], [DocBook], [txt2tags], [EPUB] and [Word docx]; and
-it can write plain text, [markdown], [reStructuredText], [XHTML],
-[HTML 5], [LaTeX] (including [beamer] slide shows), [ConTeXt], [RTF],
-[OPML], [DocBook], [OpenDocument], [ODT], [Word docx], [GNU Texinfo],
-[MediaWiki markup], [DokuWiki markup], [Haddock markup], [EPUB] (v2 or v3),
-[FictionBook2], [Textile], [groff man] pages, [Emacs Org-Mode], [AsciiDoc],
-[InDesign ICML], and [Slidy], [Slideous], [DZSlides], [reveal.js] or
-[S5] HTML slide shows. It can also produce [PDF] output on systems where
-LaTeX is installed.
+[Markdown], [CommonMark], and (subsets of) [Textile],
+[reStructuredText], [HTML], [LaTeX], [MediaWiki markup], [TWiki
+markup], [Haddock markup], [OPML], [Emacs Org-mode], [DocBook],
+[txt2tags], [EPUB] and [Word docx]; and it can write plain text,
+[Markdown], [reStructuredText], [XHTML], [HTML 5], [LaTeX] (including
+[beamer] slide shows), [ConTeXt], [RTF], [OPML], [DocBook],
+[OpenDocument], [ODT], [Word docx], [GNU Texinfo], [MediaWiki markup],
+[DokuWiki markup], [Haddock markup], [EPUB] (v2 or v3),
+[FictionBook2], [Textile], [groff man] pages, [Emacs Org-Mode],
+[AsciiDoc], [InDesign ICML], and [Slidy], [Slideous], [DZSlides],
+[reveal.js] or [S5] HTML slide shows. It can also produce [PDF] output
+on systems where LaTeX is installed.
Pandoc's enhanced version of markdown includes syntax for footnotes,
tables, flexible ordered lists, definition lists, fenced code blocks,
@@ -159,6 +160,7 @@ General options
extended markdown), `markdown_strict` (original unextended markdown),
`markdown_phpextra` (PHP Markdown Extra extended markdown),
`markdown_github` (github extended markdown),
+ `commonmark` (CommonMark markdown),
`textile` (Textile), `rst` (reStructuredText), `html` (HTML),
`docbook` (DocBook), `t2t` (txt2tags), `docx` (docx), `epub` (EPUB),
`opml` (OPML), `org` (Emacs Org-mode), `mediawiki` (MediaWiki markup),
@@ -3238,3 +3240,4 @@ Rosenthal.
diff --git a/benchmark/benchmark-pandoc.hs b/benchmark/benchmark-pandoc.hs
index bf67eaa4d..2a34696b9 100644
--- a/benchmark/benchmark-pandoc.hs
+++ b/benchmark/benchmark-pandoc.hs
@@ -26,12 +26,18 @@ import Debug.Trace (trace)
readerBench :: Pandoc
-> (String, ReaderOptions -> String -> IO Pandoc)
-> Maybe Benchmark
-readerBench doc (name, reader) = case lookup name writers of
- Just (PureStringWriter writer) ->
- let inp = writer def{ writerWrapText = True} doc
- in return $ bench (name ++ " reader") $ nfIO $
- (reader def{ readerSmart = True }) inp
- _ -> trace ("\nCould not find writer for " ++ name ++ "\n") Nothing
+-- Build a benchmark for one reader. The input text is produced by the
+-- writer registered under the same format name; "commonmark" has no
+-- writer of its own, so the markdown writer supplies its input.
+-- Formats with no usable writer are reported via 'trace' and skipped.
+readerBench doc (name, reader) =
+  case lookup name writers of
+    Just (PureStringWriter writer) -> benchWith writer
+    _ | name == "commonmark"       -> benchWith writeMarkdown
+      | otherwise                  ->
+          trace ("\nCould not find writer for " ++ name ++ "\n") Nothing
+  where
+    -- Render the document once, then benchmark parsing that text.
+    benchWith render =
+      let inp = render def{ writerWrapText = True} doc
+      in  return $ bench (name ++ " reader") $ nfIO $
+            (reader def{ readerSmart = True }) inp
writerBench :: Pandoc
-> (String, WriterOptions -> Pandoc -> String)
diff --git a/pandoc.cabal b/pandoc.cabal
index f4a8b4e69..823e92812 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -254,7 +254,8 @@ Library
deepseq-generics >= 0.1 && < 0.2,
JuicyPixels >= 3.1.6.1 && < 3.3,
- filemanip >= 0.3 && < 0.4
+ filemanip >= 0.3 && < 0.4,
+ cmark >= 0.3 && < 0.4
if flag(old-locale)
Build-Depends: old-locale >= 1 && < 1.1,
time >= 1.2 && < 1.5
@@ -292,6 +293,7 @@ Library
+ Text.Pandoc.Readers.CommonMark,
diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs
index d2bb85699..435e60eb1 100644
--- a/src/Text/Pandoc.hs
+++ b/src/Text/Pandoc.hs
@@ -66,6 +66,7 @@ module Text.Pandoc
, mkStringReader
, readDocx
, readMarkdown
+ , readCommonMark
, readMediaWiki
, readRST
, readOrg
@@ -124,6 +125,7 @@ import Text.Pandoc.Definition
import Text.Pandoc.Generic
import Text.Pandoc.JSON
import Text.Pandoc.Readers.Markdown
+import Text.Pandoc.Readers.CommonMark
import Text.Pandoc.Readers.MediaWiki
import Text.Pandoc.Readers.RST
import Text.Pandoc.Readers.Org
@@ -225,6 +227,7 @@ readers = [ ("native" , StringReader $ \_ s -> return $ readNative s)
,("markdown_phpextra" , mkStringReaderWithWarnings readMarkdownWithWarnings)
,("markdown_github" , mkStringReaderWithWarnings readMarkdownWithWarnings)
,("markdown_mmd", mkStringReaderWithWarnings readMarkdownWithWarnings)
+ ,("commonmark" , mkStringReader readCommonMark)
,("rst" , mkStringReaderWithWarnings readRSTWithWarnings )
,("mediawiki" , mkStringReader readMediaWiki)
,("docbook" , mkStringReader readDocBook)
diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs
new file mode 100644
index 000000000..dfad7adc2
--- /dev/null
+++ b/src/Text/Pandoc/Readers/CommonMark.hs
@@ -0,0 +1,118 @@
+{-
+Copyright (C) 2015 John MacFarlane <>
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+{- |
+ Module : Text.Pandoc.Readers.CommonMark
+ Copyright : Copyright (C) 2015 John MacFarlane
+ License : GNU GPL, version 2 or above
+ Maintainer : John MacFarlane <>
+ Stability : alpha
+ Portability : portable
+Conversion of CommonMark-formatted plain text to 'Pandoc' document.
+CommonMark is a strongly specified variant of Markdown: http://commonmark.org.
+-}
+module Text.Pandoc.Readers.CommonMark (readCommonMark)
+where
+import CMark
+import Data.Text (unpack, pack)
+import Data.List (groupBy)
+import Text.Pandoc.Definition
+import Text.Pandoc.Options
+-- | Convert CommonMark source text into a 'Pandoc' document.
+--
+-- The text is packed and handed to 'commonmarkToNode'. 'optNormalize' is
+-- always passed; 'optSmart' is added when 'readerSmart' is set in the
+-- reader options.
+readCommonMark :: ReaderOptions -> String -> Pandoc
+readCommonMark opts src = nodeToPandoc (commonmarkToNode cmarkOpts (pack src))
+  where
+    cmarkOpts
+      | readerSmart opts = [optNormalize, optSmart]
+      | otherwise        = [optNormalize]
+-- | Wrap the blocks of a parsed cmark tree in a 'Pandoc' document with
+-- empty metadata.
+nodeToPandoc :: Node -> Pandoc
+nodeToPandoc root = Pandoc nullMeta (addBlocks topLevel)
+  where
+    topLevel = case root of
+      Node _ DOCUMENT kids -> kids
+      other                -> [other]  -- shouldn't happen: root is not a DOCUMENT
+-- | Convert a list of cmark block nodes to pandoc blocks, in order.
+addBlocks :: [Node] -> [Block]
+addBlocks nodes = foldr addBlock [] nodes
+-- | Prepend the pandoc 'Block' for one cmark block node onto a list of
+-- blocks.
+--
+-- Node types with no translation here leave the list unchanged. This
+-- includes a bare 'ITEM', because list items are consumed by the 'LIST'
+-- case through their parent.
+addBlock :: Node -> [Block] -> [Block]
+addBlock (Node _ nodeType kids) rest =
+  case nodeType of
+    PARAGRAPH         -> Para (addInlines kids) : rest
+    HRULE             -> HorizontalRule : rest
+    BLOCK_QUOTE       -> BlockQuote (addBlocks kids) : rest
+    HTML t            -> RawBlock (Format "html") (unpack t) : rest
+    CODE_BLOCK info t ->
+      -- Only the first word of the info string is kept, as a class.
+      CodeBlock ("", take 1 (words (unpack info)), []) (unpack t) : rest
+    HEADER lev        -> Header lev ("",[],[]) (addInlines kids) : rest
+    LIST attrs        -> listBlock attrs (map (itemBlocks attrs) kids) : rest
+    ITEM              -> rest  -- handled in LIST
+    _                 -> rest
+  where
+    -- Choose the pandoc list constructor matching the cmark list type.
+    listBlock attrs = case listType attrs of
+      BULLET_LIST  -> BulletList
+      ORDERED_LIST -> OrderedList (listStart attrs, DefaultStyle, delimOf attrs)
+    delimOf attrs = case listDelim attrs of
+      PERIOD_DELIM -> Period
+      PAREN_DELIM  -> OneParen
+    -- Blocks of one list item; in a tight list, Para becomes Plain.
+    itemBlocks attrs item
+      | listTight attrs = map paraToPlain blocks
+      | otherwise       = blocks
+      where blocks = addBlocks (children item)
+    paraToPlain (Para xs) = Plain xs
+    paraToPlain other     = other
+-- | The child nodes of a cmark node.
+children :: Node -> [Node]
+children node = case node of
+  Node _ _ kids -> kids
+-- | Convert a list of cmark inline nodes to pandoc inlines, in order.
+addInlines :: [Node] -> [Inline]
+addInlines nodes = foldr addInline [] nodes
+-- | Prepend the pandoc 'Inline's for one cmark inline node onto a list of
+-- inlines.
+--
+-- A 'TEXT' node is split into alternating runs of spaces and non-spaces:
+-- each run of spaces becomes a single 'Space' and each other run a 'Str'.
+-- A 'SOFTBREAK' also becomes a 'Space'. Node types with no translation
+-- here leave the list unchanged.
+addInline :: Node -> [Inline] -> [Inline]
+addInline (Node _ nodeType kids) rest =
+  case nodeType of
+    TEXT t          -> map toInline (groupBy sameKind (unpack t)) ++ rest
+    LINEBREAK       -> LineBreak : rest
+    SOFTBREAK       -> Space : rest
+    INLINE_HTML t   -> RawInline (Format "html") (unpack t) : rest
+    CODE t          -> Code ("",[],[]) (unpack t) : rest
+    EMPH            -> Emph (addInlines kids) : rest
+    STRONG          -> Strong (addInlines kids) : rest
+    LINK url title  -> Link (addInlines kids) (unpack url, unpack title) : rest
+    IMAGE url title -> Image (addInlines kids) (unpack url, unpack title) : rest
+    _               -> rest
+  where
+    -- Two characters belong to the same run when both or neither are spaces.
+    sameKind a b = (a == ' ') == (b == ' ')
+    toInline (' ':_) = Space
+    toInline chunk   = Str chunk