diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/Textile.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/Textile.hs | 523 |
1 files changed, 523 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs new file mode 100644 index 000000000..19357b343 --- /dev/null +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -0,0 +1,523 @@ +{- +Copyright (C) 2010 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@' + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Textile + Copyright : Copyright (C) 2010-2011 Paul Rivier and John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : Paul Rivier <paul*rivier#demotera*com> + Stability : alpha + Portability : portable + +Conversion from Textile to 'Pandoc' document, based on the spec +available at http://redcloth.org/textile. + +Implemented and parsed: + - Paragraphs + - Code blocks + - Lists + - blockquote + - Inlines : strong, emph, cite, code, deleted, superscript, + subscript, links + - footnotes + +Implemented but discarded: + - HTML-specific and CSS-specific attributes + +Left to be implemented: + - dimension sign + - all caps + - continued blocks (ex bq..) + +TODO : refactor common patterns across readers : + - autolink + - smartPunctuation + - more ... + +-} + + +module Text.Pandoc.Readers.Textile ( readTextile) where + +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.Pandoc.Parsing +import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag ) +import Text.ParserCombinators.Parsec +import Text.HTML.TagSoup.Match +import Data.Char ( digitToInt, isLetter ) +import Control.Monad ( guard, liftM ) + +-- | Parse a Textile text and return a Pandoc document. +readTextile :: ParserState -- ^ Parser state, including options for parser + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> Pandoc +readTextile state s = (readWith parseTextile) state (s ++ "\n\n") + + +-- +-- Constants and data structure definitions +-- + +-- | Special chars border strings parsing +specialChars :: [Char] +specialChars = "\\[]<>*#_@~-+^&,.;:!?|\"'%()" + +-- | Generate a Pandoc ADT from a textile document +parseTextile :: GenParser Char ParserState Pandoc +parseTextile = do + -- textile allows raw HTML and does smart punctuation by default + updateState (\state -> state { stateParseRaw = True, stateSmart = True }) + many blankline + startPos <- getPosition + -- go through once just to get list of reference keys and notes + -- docMinusKeys is the raw document with blanks where the keys/notes were... + let firstPassParser = noteBlock <|> lineClump + manyTill firstPassParser eof >>= setInput . concat + setPosition startPos + st' <- getState + let reversedNotes = stateNotes st' + updateState $ \s -> s { stateNotes = reverse reversedNotes } + -- now parse it for real... + blocks <- parseBlocks + return $ Pandoc (Meta [] [] []) blocks -- FIXME + +noteMarker :: GenParser Char ParserState [Char] +noteMarker = skipMany spaceChar >> string "fn" >> manyTill digit (char '.') + +noteBlock :: GenParser Char ParserState [Char] +noteBlock = try $ do + startPos <- getPosition + ref <- noteMarker + optional blankline + contents <- liftM unlines $ many1Till anyLine (blanklines <|> noteBlock) + endPos <- getPosition + let newnote = (ref, contents ++ "\n") + st <- getState + let oldnotes = stateNotes st + updateState $ \s -> s { stateNotes = newnote : oldnotes } + -- return blanks so line count isn't affected + return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +-- | Parse document blocks +parseBlocks :: GenParser Char ParserState [Block] +parseBlocks = manyTill block eof + +-- | Block parsers list tried in definition order +blockParsers :: [GenParser Char ParserState Block] +blockParsers = [ codeBlock + , header + , blockQuote + , hrule + , anyList + , rawHtmlBlock + , maybeExplicitBlock "table" table + , maybeExplicitBlock "p" para + , nullBlock ] + +-- | Any block in the order of definition of blockParsers +block :: GenParser Char ParserState Block +block = choice blockParsers <?> "block" + +codeBlock :: GenParser Char ParserState Block +codeBlock = codeBlockBc <|> codeBlockPre + +codeBlockBc :: GenParser Char ParserState Block +codeBlockBc = try $ do + string "bc. " + contents <- manyTill anyLine blanklines + return $ CodeBlock ("",[],[]) $ unlines contents + +-- | Code Blocks in Textile are between <pre> and </pre> +codeBlockPre :: GenParser Char ParserState Block +codeBlockPre = try $ do + htmlTag (tagOpen (=="pre") null) + result' <- manyTill anyChar (try $ htmlTag (tagClose (=="pre")) >> blockBreak) + -- drop leading newline if any + let result'' = case result' of + '\n':xs -> xs + _ -> result' + -- drop trailing newline if any + let result''' = case reverse result'' of + '\n':_ -> init result'' + _ -> result'' + return $ CodeBlock ("",[],[]) result''' + +-- | Header of the form "hN. content" with N in 1..6 +header :: GenParser Char ParserState Block +header = try $ do + char 'h' + level <- oneOf "123456" >>= return . digitToInt + optional attributes + char '.' + whitespace + name <- manyTill inline blockBreak + return $ Header level (normalizeSpaces name) + +-- | Blockquote of the form "bq. content" +blockQuote :: GenParser Char ParserState Block +blockQuote = try $ do + string "bq" + optional attributes + char '.' + whitespace + para >>= return . BlockQuote . (:[]) + +-- Horizontal rule + +hrule :: GenParser Char st Block +hrule = try $ do + skipSpaces + start <- oneOf "-*" + count 2 (skipSpaces >> char start) + skipMany (spaceChar <|> char start) + newline + optional blanklines + return HorizontalRule + +-- Lists handling + +-- | Can be a bullet list or an ordered list. This implementation is +-- strict in the nesting, sublist must start at exactly "parent depth +-- plus one" +anyList :: GenParser Char ParserState Block +anyList = try $ do + l <- anyListAtDepth 1 + blanklines + return l + +-- | This allow one type of list to be nested into an other type, +-- provided correct nesting +anyListAtDepth :: Int -> GenParser Char ParserState Block +anyListAtDepth depth = choice [ bulletListAtDepth depth, + orderedListAtDepth depth, + definitionList ] + +-- | Bullet List of given depth, depth being the number of leading '*' +bulletListAtDepth :: Int -> GenParser Char ParserState Block +bulletListAtDepth depth = try $ do + items <- many1 (bulletListItemAtDepth depth) + return (BulletList items) + +-- | Bullet List Item of given depth, depth being the number of +-- leading '*' +bulletListItemAtDepth :: Int -> GenParser Char ParserState [Block] +bulletListItemAtDepth depth = try $ do + count depth (char '*') + optional attributes + whitespace + p <- inlines >>= return . Plain + sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[])) + return (p:sublist) + +-- | Ordered List of given depth, depth being the number of +-- leadingĀ '#' +orderedListAtDepth :: Int -> GenParser Char ParserState Block +orderedListAtDepth depth = try $ do + items <- many1 (orderedListItemAtDepth depth) + return (OrderedList (1, DefaultStyle, DefaultDelim) items) + +-- | Ordered List Item of given depth, depth being the number of +-- leadingĀ '#' +orderedListItemAtDepth :: Int -> GenParser Char ParserState [Block] +orderedListItemAtDepth depth = try $ do + count depth (char '#') + optional attributes + whitespace + p <- inlines >>= return . Plain + sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[])) + return (p:sublist) + +-- | A definition list is a set of consecutive definition items +definitionList :: GenParser Char ParserState Block +definitionList = try $ do + items <- many1 definitionListItem + return $ DefinitionList items + +-- | A definition list item in textile begins with '- ', followed by +-- the term defined, then spaces and ":=". The definition follows, on +-- the same single line, or spaned on multiple line, after a line +-- break. +definitionListItem :: GenParser Char ParserState ([Inline], [[Block]]) +definitionListItem = try $ do + string "- " + term <- many1Till inline (try (whitespace >> string ":=")) + def <- inlineDef <|> multilineDef + return (term, def) + where inlineDef :: GenParser Char ParserState [[Block]] + inlineDef = liftM (\d -> [[Plain d]]) $ try (whitespace >> inlines) + multilineDef :: GenParser Char ParserState [[Block]] + multilineDef = try $ do + optional whitespace >> newline + s <- many1Till anyChar (try (string "=:" >> newline)) + -- this ++ "\n\n" does not look very good + ds <- parseFromString parseBlocks (s ++ "\n\n") + return [ds] + +-- | This terminates a block such as a paragraph. Because of raw html +-- blocks support, we have to lookAhead for a rawHtmlBlock. +blockBreak :: GenParser Char ParserState () +blockBreak = try (newline >> blanklines >> return ()) <|> + (lookAhead rawHtmlBlock >> return ()) + +-- | A raw Html Block, optionally followed by blanklines +rawHtmlBlock :: GenParser Char ParserState Block +rawHtmlBlock = try $ do + (_,b) <- htmlTag isBlockTag + optional blanklines + return $ RawBlock "html" b + +-- | In textile, paragraphs are separated by blank lines. +para :: GenParser Char ParserState Block +para = try $ do + content <- manyTill inline blockBreak + return $ Para $ normalizeSpaces content + + +-- Tables + +-- | A table cell spans until a pipe | +tableCell :: GenParser Char ParserState TableCell +tableCell = do + c <- many1 (noneOf "|\n") + content <- parseFromString (many1 inline) c + return $ [ Plain $ normalizeSpaces content ] + +-- | A table row is made of many table cells +tableRow :: GenParser Char ParserState [TableCell] +tableRow = try $ do + char '|' + cells <- endBy1 tableCell (char '|') + newline + return cells + +-- | Many table rows +tableRows :: GenParser Char ParserState [[TableCell]] +tableRows = many1 tableRow + +-- | Table headers are made of cells separated by a tag "|_." +tableHeaders :: GenParser Char ParserState [TableCell] +tableHeaders = try $ do + let separator = (try $ string "|_.") + separator + headers <- sepBy1 tableCell separator + char '|' + newline + return headers + +-- | A table with an optional header. Current implementation can +-- handle tables with and without header, but will parse cells +-- alignment attributes as content. +table :: GenParser Char ParserState Block +table = try $ do + headers <- option [] tableHeaders + rows <- tableRows + blanklines + let nbOfCols = max (length headers) (length $ head rows) + return $ Table [] + (replicate nbOfCols AlignDefault) + (replicate nbOfCols 0.0) + headers + rows + + +-- | Blocks like 'p' and 'table' do not need explicit block tag. +-- However, they can be used to set HTML/CSS attributes when needed. +maybeExplicitBlock :: String -- ^ block tag name + -> GenParser Char ParserState Block -- ^ implicit block + -> GenParser Char ParserState Block +maybeExplicitBlock name blk = try $ do + optional $ try $ string name >> optional attributes >> char '.' >> + ((try whitespace) <|> endline) + blk + + + +---------- +-- Inlines +---------- + + +-- | Any inline element +inline :: GenParser Char ParserState Inline +inline = choice inlineParsers <?> "inline" + +-- | List of consecutive inlines before a newline +inlines :: GenParser Char ParserState [Inline] +inlines = manyTill inline newline + +-- | Inline parsers tried in order +inlineParsers :: [GenParser Char ParserState Inline] +inlineParsers = [ autoLink + , str + , whitespace + , endline + , code + , htmlSpan + , rawHtmlInline + , note + , simpleInline (string "??") (Cite []) + , simpleInline (string "**") Strong + , simpleInline (string "__") Emph + , simpleInline (char '*') Strong + , simpleInline (char '_') Emph + , simpleInline (char '-') Strikeout + , simpleInline (char '^') Superscript + , simpleInline (char '~') Subscript + , link + , image + , mark + , smartPunctuation inline + , symbol + ] + +-- | Trademark, registered, copyright +mark :: GenParser Char st Inline +mark = try $ char '(' >> (try tm <|> try reg <|> copy) + +reg :: GenParser Char st Inline +reg = do + oneOf "Rr" + char ')' + return $ Str "\174" + +tm :: GenParser Char st Inline +tm = do + oneOf "Tt" + oneOf "Mm" + char ')' + return $ Str "\8482" + +copy :: GenParser Char st Inline +copy = do + oneOf "Cc" + char ')' + return $ Str "\169" + +note :: GenParser Char ParserState Inline +note = try $ do + char '[' + ref <- many1 digit + char ']' + state <- getState + let notes = stateNotes state + case lookup ref notes of + Nothing -> fail "note not found" + Just raw -> liftM Note $ parseFromString parseBlocks raw + +-- | Any string +str :: GenParser Char ParserState Inline +str = do + xs <- many1 (noneOf (specialChars ++ "\t\n ")) + optional $ try $ do + lookAhead (char '(') + notFollowedBy' mark + getInput >>= setInput . (' ':) -- add space before acronym explanation + -- parse a following hyphen if followed by a letter + -- (this prevents unwanted interpretation as starting a strikeout section) + result <- option xs $ try $ do + char '-' + next <- lookAhead letter + guard $ isLetter (last xs) || isLetter next + return $ xs ++ "-" + return $ Str result + +-- | Textile allows HTML span infos, we discard them +htmlSpan :: GenParser Char ParserState Inline +htmlSpan = try $ do + char '%' + _ <- attributes + content <- manyTill anyChar (char '%') + return $ Str content + +-- | Some number of space chars +whitespace :: GenParser Char ParserState Inline +whitespace = many1 spaceChar >> return Space <?> "whitespace" + +-- | In Textile, an isolated endline character is a line break +endline :: GenParser Char ParserState Inline +endline = try $ do + newline >> notFollowedBy blankline + return LineBreak + +rawHtmlInline :: GenParser Char ParserState Inline +rawHtmlInline = liftM (RawInline "html" . snd) + $ htmlTag isInlineTag + +-- | Textile standard link syntax is "label":target +link :: GenParser Char ParserState Inline +link = try $ do + name <- surrounded (char '"') inline + char ':' + url <- manyTill (anyChar) (lookAhead $ (space <|> try (oneOf ".;," >> (space <|> newline)))) + return $ Link name (url, "") + +-- | Detect plain links to http or email. +autoLink :: GenParser Char ParserState Inline +autoLink = do + (orig, src) <- (try uri <|> try emailAddress) + return $ Link [Str orig] (src, "") + +-- | image embedding +image :: GenParser Char ParserState Inline +image = try $ do + char '!' >> notFollowedBy space + src <- manyTill anyChar (lookAhead $ oneOf "!(") + alt <- option "" (try $ (char '(' >> manyTill anyChar (char ')'))) + char '!' + return $ Image [Str alt] (src, alt) + +-- | Any special symbol defined in specialChars +symbol :: GenParser Char ParserState Inline +symbol = do + result <- oneOf specialChars + return $ Str [result] + +-- | Inline code +code :: GenParser Char ParserState Inline +code = code1 <|> code2 + +code1 :: GenParser Char ParserState Inline +code1 = surrounded (char '@') anyChar >>= return . Code nullAttr + +code2 :: GenParser Char ParserState Inline +code2 = do + htmlTag (tagOpen (=="tt") null) + result' <- manyTill anyChar (try $ htmlTag $ tagClose (=="tt")) + return $ Code nullAttr result' + +-- | Html / CSS attributes +attributes :: GenParser Char ParserState String +attributes = choice [ enclosed (char '(') (char ')') anyChar, + enclosed (char '{') (char '}') anyChar, + enclosed (char '[') (char ']') anyChar] + +-- | Parses material surrounded by a parser. +surrounded :: GenParser Char st t -- ^ surrounding parser + -> GenParser Char st a -- ^ content parser (to be used repeatedly) + -> GenParser Char st [a] +surrounded border = enclosed border border + +-- | Inlines are most of the time of the same form +simpleInline :: GenParser Char ParserState t -- ^ surrounding parser + -> ([Inline] -> Inline) -- ^ Inline constructor + -> GenParser Char ParserState Inline -- ^ content parser (to be used repeatedly) +simpleInline border construct = surrounded border (inlineWithAttribute) >>= + return . construct . normalizeSpaces + where inlineWithAttribute = (try $ optional attributes) >> inline |