summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <fiddlosopher@gmail.com>2012-09-10 10:02:12 -0700
committerJohn MacFarlane <fiddlosopher@gmail.com>2012-09-12 09:05:10 -0700
commit3d361b2101c097ecde343625b15da8c197d733eb (patch)
tree6c68d7003a64ed263223999e22d112644472aa11
parent167012daf75436208bcf275164792f3ec06ee56c (diff)
Added basic mediawiki reader.
Text.Pandoc.Readers.MediaWiki module, tests/mediawiki-reader.{txt,native}.
-rw-r--r--pandoc.cabal12
-rw-r--r--src/Text/Pandoc.hs3
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs2
-rw-r--r--src/Text/Pandoc/Readers/MediaWiki.hs311
-rw-r--r--src/pandoc.hs1
-rw-r--r--tests/Tests/Old.hs7
-rw-r--r--tests/mediawiki-reader.native35
-rw-r--r--tests/mediawiki-reader.wiki71
8 files changed, 435 insertions, 7 deletions
diff --git a/pandoc.cabal b/pandoc.cabal
index e999f1b80..0b234c52f 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -16,10 +16,11 @@ Synopsis: Conversion between markup formats
Description: Pandoc is a Haskell library for converting from one markup
format to another, and a command-line tool that uses
this library. It can read markdown and (subsets of) HTML,
- reStructuredText, LaTeX, DocBook, and Textile, and it can write
- markdown, reStructuredText, HTML, LaTeX, ConTeXt, Docbook,
- OpenDocument, ODT, Word docx, RTF, MediaWiki, Textile,
- groff man pages, plain text, Emacs Org-Mode, AsciiDoc, EPUB,
+ reStructuredText, LaTeX, DocBook, MediaWiki markup,
+ and Textile, and it can write markdown, reStructuredText,
+ HTML, LaTeX, ConTeXt, Docbook, OpenDocument, ODT,
+ Word docx, RTF, MediaWiki, Textile, groff man pages,
+ plain text, Emacs Org-Mode, AsciiDoc, EPUB,
FictionBook2, and S5, Slidy and Slideous HTML slide shows.
.
Pandoc extends standard markdown syntax with footnotes,
@@ -120,6 +121,8 @@ Extra-Source-Files:
tests/markdown-citations.mhra.txt,
tests/markdown-citations.ieee.txt,
tests/textile-reader.textile,
+ tests/mediawiki-reader.wiki,
+ tests/mediawiki-reader.native,
tests/rst-reader.native,
tests/rst-reader.rst,
tests/s5.basic.html,
@@ -262,6 +265,7 @@ Library
Text.Pandoc.Readers.HTML,
Text.Pandoc.Readers.LaTeX,
Text.Pandoc.Readers.Markdown,
+ Text.Pandoc.Readers.MediaWiki,
Text.Pandoc.Readers.RST,
Text.Pandoc.Readers.DocBook,
Text.Pandoc.Readers.TeXMath,
diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs
index 33706816e..1e6b1d010 100644
--- a/src/Text/Pandoc.hs
+++ b/src/Text/Pandoc.hs
@@ -66,6 +66,7 @@ module Text.Pandoc
, writers
-- * Readers: converting /to/ Pandoc format
, readMarkdown
+ , readMediaWiki
, readRST
, readLaTeX
, readHtml
@@ -110,6 +111,7 @@ module Text.Pandoc
import Text.Pandoc.Definition
import Text.Pandoc.Generic
import Text.Pandoc.Readers.Markdown
+import Text.Pandoc.Readers.MediaWiki
import Text.Pandoc.Readers.RST
import Text.Pandoc.Readers.DocBook
import Text.Pandoc.Readers.LaTeX
@@ -179,6 +181,7 @@ readers = [("native" , \_ -> readNative)
,("markdown_strict" , readMarkdown)
,("markdown" , readMarkdown)
,("rst" , readRST)
+ ,("mediawiki" , readMediaWiki)
,("docbook" , readDocBook)
,("textile" , readTextile) -- TODO : textile+lhs
,("html" , readHtml)
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index d3d4e72ff..1c2cc12f1 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -1,6 +1,4 @@
{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7
-{-# LANGUAGE FlexibleInstances, TypeSynonymInstances,
- GeneralizedNewtypeDeriving #-}
{-
Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>
diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs
new file mode 100644
index 000000000..f3adbe72e
--- /dev/null
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@@ -0,0 +1,311 @@
+{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7
+{-
+Copyright (C) 2012 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.MediaWiki
+ Copyright : Copyright (C) 2012 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion of mediawiki text to 'Pandoc' document.
+-}
+{-
+TODO:
+_ fix pre parser -- it should use html tagsoup parsers,
+ then just strip out the text from text tags.
+_ correctly handle skipped level in list, e.g. # to ###
+_ tests for lists
+_ support HTML lists
+_ support list style attributes and start values in ol lists, also
+ value attribute on li
+_ support <p> tags in lists (and out?)
+_ support :, ::, etc. for indent (treat as list continuation paras?)
+_ support preformatted text (lines starting with space)
+_ support preformatted text blocks
+_ code highlighting: http://www.mediawiki.org/wiki/Extension:SyntaxHighlight_GeSHi <syntaxhighlight lang="php"> (alternativel, <source...>)
+ if 'line' attribute present, number lines
+ if 'start' present, set starting line number
+_ support internal links http://www.mediawiki.org/wiki/Help:Links
+_ support external links
+_ support automatic linkification of URLs
+_ support images http://www.mediawiki.org/wiki/Help:Images
+_ ignore gallery tag?
+_ support tables http://www.mediawiki.org/wiki/Help:Tables
+_ support <math> tag for latex math
+_ templates or anything in {{}} -> handle as raw wikimedia, can be dealt with in
+ postprocessing
+_ category links
+_ tests for raw html inline
+_ tests for sup, sub, del
+_ tests for pre, haskell
+_ tests for code, tt, hask
+_ test for blockquote
+-}
+module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where
+
+import Text.Pandoc.Definition
+import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>))
+import Text.Pandoc.Options
+import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced,
+ isInlineTag, isBlockTag, isTextTag, isCommentTag )
+import Text.Pandoc.XML ( fromEntities )
+import Text.Pandoc.Parsing
+import Text.Pandoc.Shared ( stripTrailingNewlines )
+import Data.Monoid (mconcat, mempty)
+import Control.Applicative ((<$>), (<*), (*>), (<$))
+import Control.Monad
+import Data.List (intersperse)
+import Text.HTML.TagSoup
+
+-- | Read mediawiki from an input string and return a Pandoc document.
+readMediaWiki :: ReaderOptions -- ^ Reader options
+ -> String -- ^ String to parse (assuming @'\n'@ line endings)
+ -> Pandoc
+readMediaWiki opts s =
+ (readWith parseMediaWiki) def{ stateOptions = opts } (s ++ "\n\n")
+
+type MWParser = Parser [Char] ParserState
+
+--
+-- auxiliary functions
+--
+
+specialChars :: [Char]
+specialChars = "'[]<=&*"
+
+spaceChars :: [Char]
+spaceChars = " \n\t"
+
+sym :: String -> MWParser ()
+sym s = () <$ try (string s)
+
+htmlComment :: MWParser ()
+htmlComment = () <$ htmlTag isCommentTag
+
+inlinesInTags :: String -> MWParser Inlines
+inlinesInTags tag = trimInlines . mconcat <$> try
+ (htmlTag (~== TagOpen tag []) *>
+ manyTill inline (htmlTag (~== TagClose tag)))
+
+blocksInTags :: String -> MWParser Blocks
+blocksInTags tag = mconcat <$> try
+ (htmlTag (~== TagOpen tag []) *>
+ manyTill block (htmlTag (~== TagClose tag)))
+
+charsInTags :: String -> MWParser [Char]
+charsInTags tag = fromEntities <$> try
+ (htmlTag (~== TagOpen tag []) *>
+ manyTill anyChar (htmlTag (~== TagClose tag)))
+
+--
+-- main parser
+--
+
+parseMediaWiki :: MWParser Pandoc
+parseMediaWiki = do
+ bs <- mconcat <$> many block
+ spaces
+ eof
+ return $ B.doc bs
+
+--
+-- block parsers
+--
+
+block :: MWParser Blocks
+block = header
+ <|> hrule
+ <|> bulletList
+ <|> orderedList
+ <|> definitionList
+ <|> blockquote
+ <|> codeblock
+ <|> haskell
+ <|> mempty <$ skipMany1 blankline
+ <|> mempty <$ try (spaces *> htmlComment)
+ <|> para
+
+para :: MWParser Blocks
+para = B.para . trimInlines . mconcat <$> many1 inline
+
+hrule :: MWParser Blocks
+hrule = B.horizontalRule <$ try (string "----" *> many (char '-') *> newline)
+
+blockquote :: MWParser Blocks
+blockquote = B.blockQuote <$> blocksInTags "blockquote"
+
+codeblock :: MWParser Blocks
+codeblock = B.codeBlock . trimCode <$> charsInTags "pre"
+
+trimCode :: String -> String
+trimCode ('\n':xs) = stripTrailingNewlines xs
+trimCode xs = stripTrailingNewlines xs
+
+haskell :: MWParser Blocks
+haskell = B.codeBlockWith ("",["haskell"],[]) . trimCode <$>
+ charsInTags "haskell"
+
+header :: MWParser Blocks
+header = try $ do
+ col <- sourceColumn <$> getPosition
+ guard $ col == 1 -- header must be at beginning of line
+ eqs <- many1 (char '=')
+ let lev = length eqs
+ guard $ lev <= 6
+ contents <- trimInlines . mconcat <$> manyTill inline (count lev $ char '=')
+ return $ B.header lev contents
+
+bulletList :: MWParser Blocks
+bulletList = B.bulletList <$> many1 (listItem '*')
+
+orderedList :: MWParser Blocks
+orderedList = B.orderedList <$> many1 (listItem '#')
+
+definitionList :: MWParser Blocks
+definitionList = B.definitionList <$> many1 defListItem
+
+defListItem :: MWParser (Inlines, [Blocks])
+defListItem = try $ do
+ terms <- mconcat . intersperse B.linebreak <$> many1 defListTerm
+ defs <- many1 $ listItem ':'
+ return (terms, defs)
+
+defListTerm :: MWParser Inlines
+defListTerm = char ';' >> skipMany spaceChar >> manyTill anyChar newline >>=
+ parseFromString (trimInlines . mconcat <$> many inline)
+
+listStart :: Char -> MWParser ()
+listStart c = char c *> notFollowedBy listStartChar
+
+listStartChar :: MWParser Char
+listStartChar = oneOf "*#;:"
+
+anyListStart :: MWParser ()
+anyListStart = listStart '*' <|> listStart '#' <|> listStart ';'
+
+listItem :: Char -> MWParser Blocks
+listItem c = try $ do
+ listStart c
+ first <- manyTill anyChar newline
+ rest <- many (try $ char c *> lookAhead listStartChar *>
+ manyTill anyChar newline)
+ parseFromString (mconcat <$> many1 block) $ unlines $ first : rest
+
+--
+-- inline parsers
+--
+
+inline :: MWParser Inlines
+inline = whitespace
+ <|> url
+ <|> str
+ <|> strong
+ <|> emph
+ <|> nowiki
+ <|> linebreak
+ <|> externalLink
+ <|> strikeout
+ <|> subscript
+ <|> superscript
+ <|> code
+ <|> hask
+ <|> B.singleton <$> charRef
+ <|> inlineHtml
+ <|> special
+
+str :: MWParser Inlines
+str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars)
+
+special :: MWParser Inlines
+special = B.str <$> count 1 (notFollowedBy' (htmlTag isBlockTag) *>
+ oneOf specialChars)
+
+inlineHtml :: MWParser Inlines
+inlineHtml = B.rawInline "html" . snd <$> htmlTag isInlineTag
+
+whitespace :: MWParser Inlines
+whitespace = B.space <$ (skipMany1 spaceChar <|> endline <|> htmlComment)
+
+endline :: MWParser ()
+endline = () <$ try (newline <*
+ notFollowedBy blankline <*
+ notFollowedBy' hrule <*
+ notFollowedBy anyListStart)
+
+linebreak :: MWParser Inlines
+linebreak = B.linebreak <$
+ (htmlTag (~== TagOpen "br" []) *>
+ optional (htmlTag (~== TagClose "br")) *>
+ optional blankline)
+
+externalLink :: MWParser Inlines
+externalLink = try $ do
+ char '['
+ (orig, src) <- uri
+ skipMany1 spaceChar
+ lab <- manyTill inline (char ']')
+ let lab' = if null lab
+ then [B.str "1"] -- TODO generate sequentially from state
+ else lab
+ return $ B.link src "" $ trimInlines $ mconcat lab'
+
+url :: MWParser Inlines
+url = do
+ (_, src) <- uri
+ return $ B.link src "" (B.str orig)
+
+nowiki :: MWParser Inlines
+nowiki = B.text <$> charsInTags "nowiki"
+
+strikeout :: MWParser Inlines
+strikeout = B.strikeout <$> (inlinesInTags "strike" <|> inlinesInTags "del")
+
+superscript :: MWParser Inlines
+superscript = B.superscript <$> inlinesInTags "sup"
+
+subscript :: MWParser Inlines
+subscript = B.subscript <$> inlinesInTags "sub"
+
+code :: MWParser Inlines
+code = B.code <$> (charsInTags "code" <|> charsInTags "tt")
+
+hask :: MWParser Inlines
+hask = B.codeWith ("",["haskell"],[]) <$> charsInTags "hask"
+
+-- | Parses a list of inlines between start and end delimiters.
+inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines
+inlinesBetween start end =
+ (trimInlines . mconcat) <$> try (start >> many1Till inner end)
+ where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline)
+ innerSpace = try $ whitespace >>~ notFollowedBy' end
+
+emph :: MWParser Inlines
+emph = B.emph <$> nested (inlinesBetween start end)
+ where start = sym "''" >> lookAhead nonspaceChar
+ end = try $ notFollowedBy' (() <$ strong) >> sym "''"
+
+strong :: MWParser Inlines
+strong = B.strong <$> nested (inlinesBetween start end)
+ where start = sym "'''" >> lookAhead nonspaceChar
+ end = try $ sym "'''"
+
diff --git a/src/pandoc.hs b/src/pandoc.hs
index af7004352..cb561e817 100644
--- a/src/pandoc.hs
+++ b/src/pandoc.hs
@@ -721,6 +721,7 @@ defaultReaderName fallback (x:xs) =
".rst" -> "rst"
".lhs" -> "markdown+lhs"
".db" -> "docbook"
+ ".wiki" -> "mediawiki"
".textile" -> "textile"
".native" -> "native"
".json" -> "json"
diff --git a/tests/Tests/Old.hs b/tests/Tests/Old.hs
index 8899fef6f..5360126c2 100644
--- a/tests/Tests/Old.hs
+++ b/tests/Tests/Old.hs
@@ -119,9 +119,14 @@ tests = [ testGroup "markdown"
, fb2WriterTest "math" [] "fb2.math.markdown" "fb2.math.fb2"
, fb2WriterTest "testsuite" [] "testsuite.native" "writer.fb2"
]
+ , testGroup "mediawiki"
+ [ testGroup "writer" $ writerTests "mediawiki"
+ , test "reader" ["-r", "mediawiki", "-w", "native", "-s"]
+ "mediawiki-reader.wiki" "mediawiki-reader.native"
+ ]
, testGroup "other writers" $ map (\f -> testGroup f $ writerTests f)
[ "opendocument" , "context" , "texinfo"
- , "man" , "plain" , "mediawiki", "rtf", "org", "asciidoc"
+ , "man" , "plain" , "rtf", "org", "asciidoc"
]
]
diff --git a/tests/mediawiki-reader.native b/tests/mediawiki-reader.native
new file mode 100644
index 000000000..8b3eedf42
--- /dev/null
+++ b/tests/mediawiki-reader.native
@@ -0,0 +1,35 @@
+Pandoc (Meta {docTitle = [], docAuthors = [], docDate = []})
+[Header 1 [Str "header"]
+,Header 2 [Str "header",Space,Str "level",Space,Str "two"]
+,Header 3 [Str "header",Space,Str "level",Space,Str "3"]
+,Header 4 [Str "header",Space,Emph [Str "level"],Space,Str "four"]
+,Header 5 [Str "header",Space,Str "level",Space,Str "5"]
+,Header 6 [Str "header",Space,Str "level",Space,Str "6"]
+,Para [Str "=======",Space,Str "not",Space,Str "a",Space,Str "header",Space,Str "========"]
+,Para [Str "==",Space,Str "not",Space,Str "a",Space,Str "header",Space,Str "=="]
+,Header 2 [Str "emph",Space,Str "and",Space,Str "strong"]
+,Para [Emph [Str "emph"],Space,Strong [Str "strong"]]
+,Para [Strong [Emph [Str "strong",Space,Str "and",Space,Str "emph"]]]
+,Para [Strong [Emph [Str "emph",Space,Str "inside"],Space,Str "strong"]]
+,Para [Strong [Str "strong",Space,Str "with",Space,Emph [Str "emph"]]]
+,Para [Emph [Strong [Str "strong",Space,Str "inside"],Space,Str "emph"]]
+,Header 2 [Str "horizontal",Space,Str "rule"]
+,Para [Str "top"]
+,HorizontalRule
+,Para [Str "bottom"]
+,HorizontalRule
+,Header 2 [Str "nowiki"]
+,Para [Str "''not",Space,Str "emph''"]
+,Header 2 [Str "strikeout"]
+,Para [Strikeout [Str "This",Space,Str "is",Space,Emph [Str "struck",Space,Str "out"]]]
+,Header 2 [Str "entities"]
+,Para [Str "hi",Space,Str "&",Space,Str "low"]
+,Para [Str "hi",Space,Str "&",Space,Str "low"]
+,Para [Str "G\246del"]
+,Para [Str "\777\2730"]
+,Header 2 [Str "comments"]
+,Para [Str "inline",Space,Str "comment"]
+,Para [Str "between",Space,Str "blocks"]
+,Header 2 [Str "linebreaks"]
+,Para [Str "hi",LineBreak,Str "there"]
+,Para [Str "hi",LineBreak,Str "there"]]
diff --git a/tests/mediawiki-reader.wiki b/tests/mediawiki-reader.wiki
new file mode 100644
index 000000000..6e51f1544
--- /dev/null
+++ b/tests/mediawiki-reader.wiki
@@ -0,0 +1,71 @@
+= header =
+
+== header level two ==
+
+===header level 3===
+
+====header ''level'' four====
+
+===== header level 5 =====
+
+====== header level 6 ======
+
+======= not a header ========
+
+ == not a header ==
+
+== emph and strong ==
+
+''emph'' '''strong'''
+
+'''''strong and emph'''''
+
+'''''emph inside'' strong'''
+
+'''strong with ''emph'''''
+
+'''''strong inside''' emph''
+
+== horizontal rule ==
+
+top
+----
+bottom
+
+----
+
+== nowiki ==
+
+<nowiki>''not emph''</nowiki>
+
+== strikeout ==
+
+<strike> This is ''struck out''</strike>
+
+== entities ==
+
+hi & low
+
+hi &amp; low
+
+G&ouml;del
+
+&#777;&#xAAA;
+
+== comments ==
+
+inline<!-- secret --> comment
+
+<!-- secret -->
+
+between blocks
+
+ <!-- secret -->
+
+== linebreaks ==
+
+hi<br/>there
+
+hi<br>
+there
+