summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/Textile.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/Textile.hs')
-rw-r--r--src/Text/Pandoc/Readers/Textile.hs523
1 files changed, 523 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs
new file mode 100644
index 000000000..19357b343
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Textile.hs
@@ -0,0 +1,523 @@
+{-
+Copyright (C) 2010 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@'
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Textile
+ Copyright : Copyright (C) 2010-2011 Paul Rivier and John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Paul Rivier <paul*rivier#demotera*com>
+ Stability : alpha
+ Portability : portable
+
+Conversion from Textile to 'Pandoc' document, based on the spec
+available at http://redcloth.org/textile.
+
+Implemented and parsed:
+ - Paragraphs
+ - Code blocks
+ - Lists
+ - blockquote
+ - Inlines : strong, emph, cite, code, deleted, superscript,
+ subscript, links
+ - footnotes
+
+Implemented but discarded:
+ - HTML-specific and CSS-specific attributes
+
+Left to be implemented:
+ - dimension sign
+ - all caps
+ - continued blocks (ex bq..)
+
+TODO : refactor common patterns across readers :
+ - autolink
+ - smartPunctuation
+ - more ...
+
+-}
+
+
+module Text.Pandoc.Readers.Textile ( readTextile) where
+
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared
+import Text.Pandoc.Parsing
+import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag )
+import Text.ParserCombinators.Parsec
+import Text.HTML.TagSoup.Match
+import Data.Char ( digitToInt, isLetter )
+import Control.Monad ( guard, liftM )
+
+-- | Parse a Textile text and return a Pandoc document.
+readTextile :: ParserState -- ^ Parser state, including options for parser
+ -> String -- ^ String to parse (assuming @'\n'@ line endings)
+ -> Pandoc
+readTextile state s = (readWith parseTextile) state (s ++ "\n\n")
+
+
+--
+-- Constants and data structure definitions
+--
+
+-- | Special chars border strings parsing
+specialChars :: [Char]
+specialChars = "\\[]<>*#_@~-+^&,.;:!?|\"'%()"
+
+-- | Generate a Pandoc ADT from a textile document
+parseTextile :: GenParser Char ParserState Pandoc
+parseTextile = do
+ -- textile allows raw HTML and does smart punctuation by default
+ updateState (\state -> state { stateParseRaw = True, stateSmart = True })
+ many blankline
+ startPos <- getPosition
+ -- go through once just to get list of reference keys and notes
+ -- docMinusKeys is the raw document with blanks where the keys/notes were...
+ let firstPassParser = noteBlock <|> lineClump
+ manyTill firstPassParser eof >>= setInput . concat
+ setPosition startPos
+ st' <- getState
+ let reversedNotes = stateNotes st'
+ updateState $ \s -> s { stateNotes = reverse reversedNotes }
+ -- now parse it for real...
+ blocks <- parseBlocks
+ return $ Pandoc (Meta [] [] []) blocks -- FIXME
+
+noteMarker :: GenParser Char ParserState [Char]
+noteMarker = skipMany spaceChar >> string "fn" >> manyTill digit (char '.')
+
+noteBlock :: GenParser Char ParserState [Char]
+noteBlock = try $ do
+ startPos <- getPosition
+ ref <- noteMarker
+ optional blankline
+ contents <- liftM unlines $ many1Till anyLine (blanklines <|> noteBlock)
+ endPos <- getPosition
+ let newnote = (ref, contents ++ "\n")
+ st <- getState
+ let oldnotes = stateNotes st
+ updateState $ \s -> s { stateNotes = newnote : oldnotes }
+ -- return blanks so line count isn't affected
+ return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
+
+-- | Parse document blocks
+parseBlocks :: GenParser Char ParserState [Block]
+parseBlocks = manyTill block eof
+
+-- | Block parsers list tried in definition order
+blockParsers :: [GenParser Char ParserState Block]
+blockParsers = [ codeBlock
+ , header
+ , blockQuote
+ , hrule
+ , anyList
+ , rawHtmlBlock
+ , maybeExplicitBlock "table" table
+ , maybeExplicitBlock "p" para
+ , nullBlock ]
+
+-- | Any block in the order of definition of blockParsers
+block :: GenParser Char ParserState Block
+block = choice blockParsers <?> "block"
+
+codeBlock :: GenParser Char ParserState Block
+codeBlock = codeBlockBc <|> codeBlockPre
+
+codeBlockBc :: GenParser Char ParserState Block
+codeBlockBc = try $ do
+ string "bc. "
+ contents <- manyTill anyLine blanklines
+ return $ CodeBlock ("",[],[]) $ unlines contents
+
+-- | Code Blocks in Textile are between <pre> and </pre>
+codeBlockPre :: GenParser Char ParserState Block
+codeBlockPre = try $ do
+ htmlTag (tagOpen (=="pre") null)
+ result' <- manyTill anyChar (try $ htmlTag (tagClose (=="pre")) >> blockBreak)
+ -- drop leading newline if any
+ let result'' = case result' of
+ '\n':xs -> xs
+ _ -> result'
+ -- drop trailing newline if any
+ let result''' = case reverse result'' of
+ '\n':_ -> init result''
+ _ -> result''
+ return $ CodeBlock ("",[],[]) result'''
+
+-- | Header of the form "hN. content" with N in 1..6
+header :: GenParser Char ParserState Block
+header = try $ do
+ char 'h'
+ level <- oneOf "123456" >>= return . digitToInt
+ optional attributes
+ char '.'
+ whitespace
+ name <- manyTill inline blockBreak
+ return $ Header level (normalizeSpaces name)
+
+-- | Blockquote of the form "bq. content"
+blockQuote :: GenParser Char ParserState Block
+blockQuote = try $ do
+ string "bq"
+ optional attributes
+ char '.'
+ whitespace
+ para >>= return . BlockQuote . (:[])
+
+-- Horizontal rule
+
+hrule :: GenParser Char st Block
+hrule = try $ do
+ skipSpaces
+ start <- oneOf "-*"
+ count 2 (skipSpaces >> char start)
+ skipMany (spaceChar <|> char start)
+ newline
+ optional blanklines
+ return HorizontalRule
+
+-- Lists handling
+
+-- | Can be a bullet list or an ordered list. This implementation is
+-- strict in the nesting, sublist must start at exactly "parent depth
+-- plus one"
+anyList :: GenParser Char ParserState Block
+anyList = try $ do
+ l <- anyListAtDepth 1
+ blanklines
+ return l
+
+-- | This allow one type of list to be nested into an other type,
+-- provided correct nesting
+anyListAtDepth :: Int -> GenParser Char ParserState Block
+anyListAtDepth depth = choice [ bulletListAtDepth depth,
+ orderedListAtDepth depth,
+ definitionList ]
+
+-- | Bullet List of given depth, depth being the number of leading '*'
+bulletListAtDepth :: Int -> GenParser Char ParserState Block
+bulletListAtDepth depth = try $ do
+ items <- many1 (bulletListItemAtDepth depth)
+ return (BulletList items)
+
+-- | Bullet List Item of given depth, depth being the number of
+-- leading '*'
+bulletListItemAtDepth :: Int -> GenParser Char ParserState [Block]
+bulletListItemAtDepth depth = try $ do
+ count depth (char '*')
+ optional attributes
+ whitespace
+ p <- inlines >>= return . Plain
+ sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[]))
+ return (p:sublist)
+
+-- | Ordered List of given depth, depth being the number of
+-- leadingĀ '#'
+orderedListAtDepth :: Int -> GenParser Char ParserState Block
+orderedListAtDepth depth = try $ do
+ items <- many1 (orderedListItemAtDepth depth)
+ return (OrderedList (1, DefaultStyle, DefaultDelim) items)
+
+-- | Ordered List Item of given depth, depth being the number of
+-- leadingĀ '#'
+orderedListItemAtDepth :: Int -> GenParser Char ParserState [Block]
+orderedListItemAtDepth depth = try $ do
+ count depth (char '#')
+ optional attributes
+ whitespace
+ p <- inlines >>= return . Plain
+ sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[]))
+ return (p:sublist)
+
+-- | A definition list is a set of consecutive definition items
+definitionList :: GenParser Char ParserState Block
+definitionList = try $ do
+ items <- many1 definitionListItem
+ return $ DefinitionList items
+
+-- | A definition list item in textile begins with '- ', followed by
+-- the term defined, then spaces and ":=". The definition follows, on
+-- the same single line, or spaned on multiple line, after a line
+-- break.
+definitionListItem :: GenParser Char ParserState ([Inline], [[Block]])
+definitionListItem = try $ do
+ string "- "
+ term <- many1Till inline (try (whitespace >> string ":="))
+ def <- inlineDef <|> multilineDef
+ return (term, def)
+ where inlineDef :: GenParser Char ParserState [[Block]]
+ inlineDef = liftM (\d -> [[Plain d]]) $ try (whitespace >> inlines)
+ multilineDef :: GenParser Char ParserState [[Block]]
+ multilineDef = try $ do
+ optional whitespace >> newline
+ s <- many1Till anyChar (try (string "=:" >> newline))
+ -- this ++ "\n\n" does not look very good
+ ds <- parseFromString parseBlocks (s ++ "\n\n")
+ return [ds]
+
+-- | This terminates a block such as a paragraph. Because of raw html
+-- blocks support, we have to lookAhead for a rawHtmlBlock.
+blockBreak :: GenParser Char ParserState ()
+blockBreak = try (newline >> blanklines >> return ()) <|>
+ (lookAhead rawHtmlBlock >> return ())
+
+-- | A raw Html Block, optionally followed by blanklines
+rawHtmlBlock :: GenParser Char ParserState Block
+rawHtmlBlock = try $ do
+ (_,b) <- htmlTag isBlockTag
+ optional blanklines
+ return $ RawBlock "html" b
+
+-- | In textile, paragraphs are separated by blank lines.
+para :: GenParser Char ParserState Block
+para = try $ do
+ content <- manyTill inline blockBreak
+ return $ Para $ normalizeSpaces content
+
+
+-- Tables
+
+-- | A table cell spans until a pipe |
+tableCell :: GenParser Char ParserState TableCell
+tableCell = do
+ c <- many1 (noneOf "|\n")
+ content <- parseFromString (many1 inline) c
+ return $ [ Plain $ normalizeSpaces content ]
+
+-- | A table row is made of many table cells
+tableRow :: GenParser Char ParserState [TableCell]
+tableRow = try $ do
+ char '|'
+ cells <- endBy1 tableCell (char '|')
+ newline
+ return cells
+
+-- | Many table rows
+tableRows :: GenParser Char ParserState [[TableCell]]
+tableRows = many1 tableRow
+
+-- | Table headers are made of cells separated by a tag "|_."
+tableHeaders :: GenParser Char ParserState [TableCell]
+tableHeaders = try $ do
+ let separator = (try $ string "|_.")
+ separator
+ headers <- sepBy1 tableCell separator
+ char '|'
+ newline
+ return headers
+
+-- | A table with an optional header. Current implementation can
+-- handle tables with and without header, but will parse cells
+-- alignment attributes as content.
+table :: GenParser Char ParserState Block
+table = try $ do
+ headers <- option [] tableHeaders
+ rows <- tableRows
+ blanklines
+ let nbOfCols = max (length headers) (length $ head rows)
+ return $ Table []
+ (replicate nbOfCols AlignDefault)
+ (replicate nbOfCols 0.0)
+ headers
+ rows
+
+
+-- | Blocks like 'p' and 'table' do not need explicit block tag.
+-- However, they can be used to set HTML/CSS attributes when needed.
+maybeExplicitBlock :: String -- ^ block tag name
+ -> GenParser Char ParserState Block -- ^ implicit block
+ -> GenParser Char ParserState Block
+maybeExplicitBlock name blk = try $ do
+ optional $ try $ string name >> optional attributes >> char '.' >>
+ ((try whitespace) <|> endline)
+ blk
+
+
+
+----------
+-- Inlines
+----------
+
+
+-- | Any inline element
+inline :: GenParser Char ParserState Inline
+inline = choice inlineParsers <?> "inline"
+
+-- | List of consecutive inlines before a newline
+inlines :: GenParser Char ParserState [Inline]
+inlines = manyTill inline newline
+
+-- | Inline parsers tried in order
+inlineParsers :: [GenParser Char ParserState Inline]
+inlineParsers = [ autoLink
+ , str
+ , whitespace
+ , endline
+ , code
+ , htmlSpan
+ , rawHtmlInline
+ , note
+ , simpleInline (string "??") (Cite [])
+ , simpleInline (string "**") Strong
+ , simpleInline (string "__") Emph
+ , simpleInline (char '*') Strong
+ , simpleInline (char '_') Emph
+ , simpleInline (char '-') Strikeout
+ , simpleInline (char '^') Superscript
+ , simpleInline (char '~') Subscript
+ , link
+ , image
+ , mark
+ , smartPunctuation inline
+ , symbol
+ ]
+
+-- | Trademark, registered, copyright
+mark :: GenParser Char st Inline
+mark = try $ char '(' >> (try tm <|> try reg <|> copy)
+
+reg :: GenParser Char st Inline
+reg = do
+ oneOf "Rr"
+ char ')'
+ return $ Str "\174"
+
+tm :: GenParser Char st Inline
+tm = do
+ oneOf "Tt"
+ oneOf "Mm"
+ char ')'
+ return $ Str "\8482"
+
+copy :: GenParser Char st Inline
+copy = do
+ oneOf "Cc"
+ char ')'
+ return $ Str "\169"
+
+note :: GenParser Char ParserState Inline
+note = try $ do
+ char '['
+ ref <- many1 digit
+ char ']'
+ state <- getState
+ let notes = stateNotes state
+ case lookup ref notes of
+ Nothing -> fail "note not found"
+ Just raw -> liftM Note $ parseFromString parseBlocks raw
+
+-- | Any string
+str :: GenParser Char ParserState Inline
+str = do
+ xs <- many1 (noneOf (specialChars ++ "\t\n "))
+ optional $ try $ do
+ lookAhead (char '(')
+ notFollowedBy' mark
+ getInput >>= setInput . (' ':) -- add space before acronym explanation
+ -- parse a following hyphen if followed by a letter
+ -- (this prevents unwanted interpretation as starting a strikeout section)
+ result <- option xs $ try $ do
+ char '-'
+ next <- lookAhead letter
+ guard $ isLetter (last xs) || isLetter next
+ return $ xs ++ "-"
+ return $ Str result
+
+-- | Textile allows HTML span infos, we discard them
+htmlSpan :: GenParser Char ParserState Inline
+htmlSpan = try $ do
+ char '%'
+ _ <- attributes
+ content <- manyTill anyChar (char '%')
+ return $ Str content
+
+-- | Some number of space chars
+whitespace :: GenParser Char ParserState Inline
+whitespace = many1 spaceChar >> return Space <?> "whitespace"
+
+-- | In Textile, an isolated endline character is a line break
+endline :: GenParser Char ParserState Inline
+endline = try $ do
+ newline >> notFollowedBy blankline
+ return LineBreak
+
+rawHtmlInline :: GenParser Char ParserState Inline
+rawHtmlInline = liftM (RawInline "html" . snd)
+ $ htmlTag isInlineTag
+
+-- | Textile standard link syntax is "label":target
+link :: GenParser Char ParserState Inline
+link = try $ do
+ name <- surrounded (char '"') inline
+ char ':'
+ url <- manyTill (anyChar) (lookAhead $ (space <|> try (oneOf ".;," >> (space <|> newline))))
+ return $ Link name (url, "")
+
+-- | Detect plain links to http or email.
+autoLink :: GenParser Char ParserState Inline
+autoLink = do
+ (orig, src) <- (try uri <|> try emailAddress)
+ return $ Link [Str orig] (src, "")
+
+-- | image embedding
+image :: GenParser Char ParserState Inline
+image = try $ do
+ char '!' >> notFollowedBy space
+ src <- manyTill anyChar (lookAhead $ oneOf "!(")
+ alt <- option "" (try $ (char '(' >> manyTill anyChar (char ')')))
+ char '!'
+ return $ Image [Str alt] (src, alt)
+
+-- | Any special symbol defined in specialChars
+symbol :: GenParser Char ParserState Inline
+symbol = do
+ result <- oneOf specialChars
+ return $ Str [result]
+
+-- | Inline code
+code :: GenParser Char ParserState Inline
+code = code1 <|> code2
+
+code1 :: GenParser Char ParserState Inline
+code1 = surrounded (char '@') anyChar >>= return . Code nullAttr
+
+code2 :: GenParser Char ParserState Inline
+code2 = do
+ htmlTag (tagOpen (=="tt") null)
+ result' <- manyTill anyChar (try $ htmlTag $ tagClose (=="tt"))
+ return $ Code nullAttr result'
+
+-- | Html / CSS attributes
+attributes :: GenParser Char ParserState String
+attributes = choice [ enclosed (char '(') (char ')') anyChar,
+ enclosed (char '{') (char '}') anyChar,
+ enclosed (char '[') (char ']') anyChar]
+
+-- | Parses material surrounded by a parser.
+surrounded :: GenParser Char st t -- ^ surrounding parser
+ -> GenParser Char st a -- ^ content parser (to be used repeatedly)
+ -> GenParser Char st [a]
+surrounded border = enclosed border border
+
+-- | Inlines are most of the time of the same form
+simpleInline :: GenParser Char ParserState t -- ^ surrounding parser
+ -> ([Inline] -> Inline) -- ^ Inline constructor
+ -> GenParser Char ParserState Inline -- ^ content parser (to be used repeatedly)
+simpleInline border construct = surrounded border (inlineWithAttribute) >>=
+ return . construct . normalizeSpaces
+ where inlineWithAttribute = (try $ optional attributes) >> inline