Added basic mediawiki reader.

Text.Pandoc.Readers.MediaWiki module, tests/mediawiki-reader.{txt,native}.
author: John MacFarlane <fiddlosopher@gmail.com> 2012-09-10 10:02:12 -0700
committer: John MacFarlane <fiddlosopher@gmail.com> 2012-09-12 09:05:10 -0700
commit: 3d361b2101c097ecde343625b15da8c197d733eb (patch)
tree: 6c68d7003a64ed263223999e22d112644472aa11
parent: 167012daf75436208bcf275164792f3ec06ee56c (diff)
8 files changed, 435 insertions, 7 deletions
diff --git a/pandoc.cabal b/pandoc.cabal
index e999f1b80..0b234c52f 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -16,10 +16,11 @@ Synopsis:        Conversion between markup formats
 Description:     Pandoc is a Haskell library for converting from one markup
                  format to another, and a command-line tool that uses
                  this library. It can read markdown and (subsets of) HTML,
-                 reStructuredText, LaTeX, DocBook, and Textile, and it can write
-                 markdown, reStructuredText, HTML, LaTeX, ConTeXt, Docbook,
-                 OpenDocument, ODT, Word docx, RTF, MediaWiki, Textile,
-                 groff man pages, plain text, Emacs Org-Mode, AsciiDoc, EPUB,
+                 reStructuredText, LaTeX, DocBook, MediaWiki markup,
+                 and Textile, and it can write markdown, reStructuredText,
+                 HTML, LaTeX, ConTeXt, Docbook, OpenDocument, ODT,
+                 Word docx, RTF, MediaWiki, Textile, groff man pages,
+                 plain text, Emacs Org-Mode, AsciiDoc, EPUB,
                  FictionBook2, and S5, Slidy and Slideous HTML slide shows.
                  .
                  Pandoc extends standard markdown syntax with footnotes,
@@ -120,6 +121,8 @@ Extra-Source-Files:
                  tests/markdown-citations.mhra.txt,
                  tests/markdown-citations.ieee.txt,
                  tests/textile-reader.textile,
+                 tests/mediawiki-reader.wiki,
+                 tests/mediawiki-reader.native,
                  tests/rst-reader.native,
                  tests/rst-reader.rst,
                  tests/s5.basic.html,
@@ -262,6 +265,7 @@ Library
                    Text.Pandoc.Readers.HTML,
                    Text.Pandoc.Readers.LaTeX,
                    Text.Pandoc.Readers.Markdown,
+                   Text.Pandoc.Readers.MediaWiki,
                    Text.Pandoc.Readers.RST,
                    Text.Pandoc.Readers.DocBook,
                    Text.Pandoc.Readers.TeXMath,
diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs
index 33706816e..1e6b1d010 100644
--- a/src/Text/Pandoc.hs
+++ b/src/Text/Pandoc.hs
@@ -66,6 +66,7 @@ module Text.Pandoc
                , writers
                -- * Readers: converting /to/ Pandoc format
                , readMarkdown
+               , readMediaWiki
                , readRST
                , readLaTeX
                , readHtml
@@ -110,6 +111,7 @@ module Text.Pandoc
 import Text.Pandoc.Definition
 import Text.Pandoc.Generic
 import Text.Pandoc.Readers.Markdown
+import Text.Pandoc.Readers.MediaWiki
 import Text.Pandoc.Readers.RST
 import Text.Pandoc.Readers.DocBook
 import Text.Pandoc.Readers.LaTeX
@@ -179,6 +181,7 @@ readers = [("native"       , \_ -> readNative)
           ,("markdown_strict" , readMarkdown)
           ,("markdown"     , readMarkdown)
           ,("rst"          , readRST)
+          ,("mediawiki"    , readMediaWiki)
           ,("docbook"      , readDocBook)
           ,("textile"      , readTextile) -- TODO : textile+lhs
           ,("html"         , readHtml)
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index d3d4e72ff..1c2cc12f1 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -1,6 +1,4 @@
 {-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7
-{-# LANGUAGE FlexibleInstances, TypeSynonymInstances,
-    GeneralizedNewtypeDeriving #-}
 {-
 Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>
 
diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs
new file mode 100644
index 000000000..f3adbe72e
--- /dev/null
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@@ -0,0 +1,311 @@
+{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7
+{-
+Copyright (C) 2012 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+-}
+
+{- |
+   Module      : Text.Pandoc.Readers.MediaWiki
+   Copyright   : Copyright (C) 2012 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane <jgm@berkeley.edu>
+   Stability   : alpha
+   Portability : portable
+
+Conversion of mediawiki text to 'Pandoc' document.
+-}
+{-
+TODO:
+_ fix pre parser -- it should use html tagsoup parsers,
+  then just strip out the text from text tags.
+_ correctly handle skipped level in list, e.g. # to ###
+_ tests for lists
+_ support HTML lists
+_ support list style attributes and start values in ol lists, also
+  value attribute on li
+_ support <p> tags in lists (and out?)
+_ support :, ::, etc. for indent (treat as list continuation paras?)
+_ support preformatted text (lines starting with space)
+_ support preformatted text blocks
+_ code highlighting: http://www.mediawiki.org/wiki/Extension:SyntaxHighlight_GeSHi <syntaxhighlight lang="php"> (alternativel, <source...>)
+  if 'line' attribute present, number lines
+  if 'start' present, set starting line number
+_ support internal links http://www.mediawiki.org/wiki/Help:Links
+_ support external links
+_ support automatic linkification of URLs
+_ support images http://www.mediawiki.org/wiki/Help:Images
+_ ignore gallery tag?
+_ support tables http://www.mediawiki.org/wiki/Help:Tables
+_ support <math> tag for latex math
+_ templates or anything in {{}} -> handle as raw wikimedia, can be dealt with in
+  postprocessing
+_ category links
+_ tests for raw html inline
+_ tests for sup, sub, del
+_ tests for pre, haskell
+_ tests for code, tt, hask
+_ test for blockquote
+-}
+module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where
+
+import Text.Pandoc.Definition
+import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>))
+import Text.Pandoc.Options
+import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced,
+           isInlineTag, isBlockTag, isTextTag, isCommentTag )
+import Text.Pandoc.XML ( fromEntities )
+import Text.Pandoc.Parsing
+import Text.Pandoc.Shared ( stripTrailingNewlines )
+import Data.Monoid (mconcat, mempty)
+import Control.Applicative ((<$>), (<*), (*>), (<$))
+import Control.Monad
+import Data.List (intersperse)
+import Text.HTML.TagSoup
+
+-- | Read mediawiki from an input string and return a Pandoc document.
+readMediaWiki :: ReaderOptions -- ^ Reader options
+               -> String        -- ^ String to parse (assuming @'\n'@ line endings)
+               -> Pandoc
+readMediaWiki opts s =
+  (readWith parseMediaWiki) def{ stateOptions = opts } (s ++ "\n\n")
+
+type MWParser = Parser [Char] ParserState
+
+--
+-- auxiliary functions
+--
+
+specialChars :: [Char]
+specialChars = "'[]<=&*"
+
+spaceChars :: [Char]
+spaceChars = " \n\t"
+
+sym :: String -> MWParser ()
+sym s = () <$ try (string s)
+
+htmlComment :: MWParser ()
+htmlComment = () <$ htmlTag isCommentTag
+
+inlinesInTags :: String -> MWParser Inlines
+inlinesInTags tag = trimInlines . mconcat <$> try
+  (htmlTag (~== TagOpen tag []) *>
+   manyTill inline (htmlTag (~== TagClose tag)))
+
+blocksInTags :: String -> MWParser Blocks
+blocksInTags tag = mconcat <$> try
+  (htmlTag (~== TagOpen tag []) *>
+   manyTill block (htmlTag (~== TagClose tag)))
+
+charsInTags :: String -> MWParser [Char]
+charsInTags tag = fromEntities <$> try
+  (htmlTag (~== TagOpen tag []) *>
+   manyTill anyChar (htmlTag (~== TagClose tag)))
+
+--
+-- main parser
+--
+
+parseMediaWiki :: MWParser Pandoc
+parseMediaWiki = do
+  bs <- mconcat <$> many block
+  spaces
+  eof
+  return $ B.doc bs
+
+--
+-- block parsers
+--
+
+block :: MWParser Blocks
+block = header
+     <|> hrule
+     <|> bulletList
+     <|> orderedList
+     <|> definitionList
+     <|> blockquote
+     <|> codeblock
+     <|> haskell
+     <|> mempty <$ skipMany1 blankline
+     <|> mempty <$ try (spaces *> htmlComment)
+     <|> para
+
+para :: MWParser Blocks
+para = B.para . trimInlines . mconcat <$> many1 inline
+
+hrule :: MWParser Blocks
+hrule = B.horizontalRule <$ try (string "----" *> many (char '-') *> newline)
+
+blockquote :: MWParser Blocks
+blockquote = B.blockQuote <$> blocksInTags "blockquote"
+
+codeblock :: MWParser Blocks
+codeblock = B.codeBlock . trimCode <$> charsInTags "pre"
+
+trimCode :: String -> String
+trimCode ('\n':xs) = stripTrailingNewlines xs
+trimCode xs        = stripTrailingNewlines xs
+
+haskell :: MWParser Blocks
+haskell = B.codeBlockWith ("",["haskell"],[]) . trimCode <$>
+             charsInTags "haskell"
+
+header :: MWParser Blocks
+header = try $ do
+  col <- sourceColumn <$> getPosition
+  guard $ col == 1  -- header must be at beginning of line
+  eqs <- many1 (char '=')
+  let lev = length eqs
+  guard $ lev <= 6
+  contents <- trimInlines . mconcat <$> manyTill inline (count lev $ char '=')
+  return $ B.header lev contents
+
+bulletList :: MWParser Blocks
+bulletList = B.bulletList <$> many1 (listItem '*')
+
+orderedList :: MWParser Blocks
+orderedList = B.orderedList <$> many1 (listItem '#')
+
+definitionList :: MWParser Blocks
+definitionList = B.definitionList <$> many1 defListItem
+
+defListItem :: MWParser (Inlines, [Blocks])
+defListItem = try $ do
+  terms <- mconcat . intersperse B.linebreak <$> many1 defListTerm
+  defs  <- many1 $ listItem ':'
+  return (terms, defs)
+
+defListTerm  :: MWParser Inlines
+defListTerm = char ';' >> skipMany spaceChar >> manyTill anyChar newline >>=
+  parseFromString (trimInlines . mconcat <$> many inline)
+
+listStart :: Char -> MWParser ()
+listStart c = char c *> notFollowedBy listStartChar
+
+listStartChar :: MWParser Char
+listStartChar = oneOf "*#;:"
+
+anyListStart :: MWParser ()
+anyListStart = listStart '*' <|> listStart '#' <|> listStart ';'
+
+listItem :: Char -> MWParser Blocks
+listItem c = try $ do
+  listStart c
+  first <- manyTill anyChar newline
+  rest <- many (try $ char c *> lookAhead listStartChar *>
+                   manyTill anyChar newline)
+  parseFromString (mconcat <$> many1 block) $ unlines $ first : rest
+
+--
+-- inline parsers
+--
+
+inline :: MWParser Inlines
+inline =  whitespace
+      <|> url
+      <|> str
+      <|> strong
+      <|> emph
+      <|> nowiki
+      <|> linebreak
+      <|> externalLink
+      <|> strikeout
+      <|> subscript
+      <|> superscript
+      <|> code
+      <|> hask
+      <|> B.singleton <$> charRef
+      <|> inlineHtml
+      <|> special
+
+str :: MWParser Inlines
+str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars)
+
+special :: MWParser Inlines
+special = B.str <$> count 1 (notFollowedBy' (htmlTag isBlockTag) *>
+                             oneOf specialChars)
+
+inlineHtml :: MWParser Inlines
+inlineHtml = B.rawInline "html" . snd <$> htmlTag isInlineTag
+
+whitespace :: MWParser Inlines
+whitespace = B.space <$ (skipMany1 spaceChar <|> endline <|> htmlComment)
+
+endline :: MWParser ()
+endline = () <$ try (newline <*
+                     notFollowedBy blankline <*
+                     notFollowedBy' hrule <*
+                     notFollowedBy anyListStart)
+
+linebreak :: MWParser Inlines
+linebreak = B.linebreak <$
+  (htmlTag (~== TagOpen "br" []) *>
+   optional (htmlTag (~== TagClose "br")) *>
+   optional blankline)
+
+externalLink :: MWParser Inlines
+externalLink = try $ do
+  char '['
+  (orig, src) <- uri
+  skipMany1 spaceChar
+  lab <- manyTill inline (char ']')
+  let lab' = if null lab
+                then [B.str "1"] -- TODO generate sequentially from state
+                else lab
+  return $ B.link src "" $ trimInlines $ mconcat lab'
+
+url :: MWParser Inlines
+url = do
+  (_, src) <- uri
+  return $ B.link src "" (B.str orig)
+
+nowiki :: MWParser Inlines
+nowiki = B.text <$> charsInTags "nowiki"
+
+strikeout :: MWParser Inlines
+strikeout = B.strikeout <$> (inlinesInTags "strike" <|> inlinesInTags "del")
+
+superscript :: MWParser Inlines
+superscript = B.superscript <$> inlinesInTags "sup"
+
+subscript :: MWParser Inlines
+subscript = B.subscript <$> inlinesInTags "sub"
+
+code :: MWParser Inlines
+code = B.code <$> (charsInTags "code" <|> charsInTags "tt")
+
+hask :: MWParser Inlines
+hask = B.codeWith ("",["haskell"],[]) <$> charsInTags "hask"
+
+-- | Parses a list of inlines between start and end delimiters.
+inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines
+inlinesBetween start end =
+  (trimInlines . mconcat) <$> try (start >> many1Till inner end)
+    where inner      = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline)
+          innerSpace = try $ whitespace >>~ notFollowedBy' end
+
+emph :: MWParser Inlines
+emph = B.emph <$> nested (inlinesBetween start end)
+    where start = sym "''" >> lookAhead nonspaceChar
+          end   = try $ notFollowedBy' (() <$ strong) >> sym "''"
+
+strong :: MWParser Inlines
+strong = B.strong <$> nested (inlinesBetween start end)
+    where start = sym "'''" >> lookAhead nonspaceChar
+          end   = try $ sym "'''"
+
diff --git a/src/pandoc.hs b/src/pandoc.hs
index af7004352..cb561e817 100644
--- a/src/pandoc.hs
+++ b/src/pandoc.hs
@@ -721,6 +721,7 @@ defaultReaderName fallback (x:xs) =
     ".rst"      -> "rst"
     ".lhs"      -> "markdown+lhs"
     ".db"       -> "docbook"
+    ".wiki"     -> "mediawiki"
     ".textile"  -> "textile"
     ".native"   -> "native"
     ".json"     -> "json"
diff --git a/tests/Tests/Old.hs b/tests/Tests/Old.hs
index 8899fef6f..5360126c2 100644
--- a/tests/Tests/Old.hs
+++ b/tests/Tests/Old.hs
@@ -119,9 +119,14 @@ tests = [ testGroup "markdown"
           , fb2WriterTest "math" [] "fb2.math.markdown" "fb2.math.fb2"
           , fb2WriterTest "testsuite" [] "testsuite.native" "writer.fb2"
           ]
+        , testGroup "mediawiki"
+          [ testGroup "writer" $ writerTests "mediawiki"
+          , test "reader" ["-r", "mediawiki", "-w", "native", "-s"]
+            "mediawiki-reader.wiki" "mediawiki-reader.native"
+          ]
         , testGroup "other writers" $ map (\f -> testGroup f $ writerTests f)
           [ "opendocument" , "context" , "texinfo"
-          , "man" , "plain" , "mediawiki", "rtf", "org", "asciidoc"
+          , "man" , "plain" , "rtf", "org", "asciidoc"
           ]
         ]
 
diff --git a/tests/mediawiki-reader.native b/tests/mediawiki-reader.native
new file mode 100644
index 000000000..8b3eedf42
--- /dev/null
+++ b/tests/mediawiki-reader.native
@@ -0,0 +1,35 @@
+Pandoc (Meta {docTitle = [], docAuthors = [], docDate = []})
+[Header 1 [Str "header"]
+,Header 2 [Str "header",Space,Str "level",Space,Str "two"]
+,Header 3 [Str "header",Space,Str "level",Space,Str "3"]
+,Header 4 [Str "header",Space,Emph [Str "level"],Space,Str "four"]
+,Header 5 [Str "header",Space,Str "level",Space,Str "5"]
+,Header 6 [Str "header",Space,Str "level",Space,Str "6"]
+,Para [Str "=======",Space,Str "not",Space,Str "a",Space,Str "header",Space,Str "========"]
+,Para [Str "==",Space,Str "not",Space,Str "a",Space,Str "header",Space,Str "=="]
+,Header 2 [Str "emph",Space,Str "and",Space,Str "strong"]
+,Para [Emph [Str "emph"],Space,Strong [Str "strong"]]
+,Para [Strong [Emph [Str "strong",Space,Str "and",Space,Str "emph"]]]
+,Para [Strong [Emph [Str "emph",Space,Str "inside"],Space,Str "strong"]]
+,Para [Strong [Str "strong",Space,Str "with",Space,Emph [Str "emph"]]]
+,Para [Emph [Strong [Str "strong",Space,Str "inside"],Space,Str "emph"]]
+,Header 2 [Str "horizontal",Space,Str "rule"]
+,Para [Str "top"]
+,HorizontalRule
+,Para [Str "bottom"]
+,HorizontalRule
+,Header 2 [Str "nowiki"]
+,Para [Str "''not",Space,Str "emph''"]
+,Header 2 [Str "strikeout"]
+,Para [Strikeout [Str "This",Space,Str "is",Space,Emph [Str "struck",Space,Str "out"]]]
+,Header 2 [Str "entities"]
+,Para [Str "hi",Space,Str "&",Space,Str "low"]
+,Para [Str "hi",Space,Str "&",Space,Str "low"]
+,Para [Str "G\246del"]
+,Para [Str "\777\2730"]
+,Header 2 [Str "comments"]
+,Para [Str "inline",Space,Str "comment"]
+,Para [Str "between",Space,Str "blocks"]
+,Header 2 [Str "linebreaks"]
+,Para [Str "hi",LineBreak,Str "there"]
+,Para [Str "hi",LineBreak,Str "there"]]
diff --git a/tests/mediawiki-reader.wiki b/tests/mediawiki-reader.wiki
new file mode 100644
index 000000000..6e51f1544
--- /dev/null
+++ b/tests/mediawiki-reader.wiki
@@ -0,0 +1,71 @@
+= header =
+
+== header level two ==
+
+===header level 3===
+
+====header ''level'' four====
+
+===== header level 5 =====
+
+====== header level 6 ======
+
+======= not a header ========
+
+ == not a header ==
+
+== emph and strong ==
+
+''emph'' '''strong'''
+
+'''''strong and emph'''''
+
+'''''emph inside'' strong'''
+
+'''strong with ''emph'''''
+
+'''''strong inside''' emph''
+
+== horizontal rule ==
+
+top
+----
+bottom
+
+----
+
+== nowiki ==
+
+<nowiki>''not emph''</nowiki>
+
+== strikeout ==
+
+<strike> This is ''struck out''</strike>
+
+== entities ==
+
+hi & low
+
+hi &amp; low
+
+G&ouml;del
+
+&#777;&#xAAA;
+
+== comments ==
+
+inline<!-- secret --> comment
+
+<!-- secret -->
+
+between blocks
+
+ <!-- secret -->
+
+== linebreaks ==
+
+hi<br/>there
+
+hi<br>
+there
+
author	John MacFarlane <fiddlosopher@gmail.com>	2012-09-10 10:02:12 -0700
committer	John MacFarlane <fiddlosopher@gmail.com>	2012-09-12 09:05:10 -0700
commit	3d361b2101c097ecde343625b15da8c197d733eb (patch)
tree	6c68d7003a64ed263223999e22d112644472aa11
parent	167012daf75436208bcf275164792f3ec06ee56c (diff)