summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/Markdown.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/Markdown.hs')
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs1243
1 files changed, 1243 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
new file mode 100644
index 000000000..896f5832e
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -0,0 +1,1243 @@
+{-# LANGUAGE CPP #-}
+{-
+Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Markdown
+ Copyright : Copyright (C) 2006-8 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion of markdown-formatted plain text to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Markdown (
+ readMarkdown
+ ) where
+
+import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy, findIndex, intercalate )
+import Data.Ord ( comparing )
+import Data.Char ( isAlphaNum, isAlpha, isLower, isDigit, isUpper )
+import Data.Maybe
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared
+import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' )
+import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,
+ anyHtmlInlineTag, anyHtmlTag,
+ anyHtmlEndTag, htmlEndTag, extractTagType,
+ htmlBlockElement, unsanitaryURI )
+import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
+import Text.ParserCombinators.Parsec
+import Control.Monad (when)
+
+-- | Read markdown from an input string and return a Pandoc document.
+readMarkdown :: ParserState -> String -> Pandoc
+readMarkdown state s = (readWith parseMarkdown) state (s ++ "\n\n")
+
+--
+-- Constants and data structure definitions
+--
+
+spaceChars :: [Char]
+spaceChars = " \t"
+
+bulletListMarkers :: [Char]
+bulletListMarkers = "*+-"
+
+hruleChars :: [Char]
+hruleChars = "*-_"
+
+setextHChars :: [Char]
+setextHChars = "=-"
+
+-- treat these as potentially non-text when parsing inline:
+specialChars :: [Char]
+specialChars = "\\[]*_~`<>$!^-.&'\"\8216\8217\8220\8221"
+
+--
+-- auxiliary functions
+--
+
+indentSpaces :: GenParser Char ParserState [Char]
+indentSpaces = try $ do
+ state <- getState
+ let tabStop = stateTabStop state
+ try (count tabStop (char ' ')) <|>
+ (many (char ' ') >> string "\t") <?> "indentation"
+
+nonindentSpaces :: GenParser Char ParserState [Char]
+nonindentSpaces = do
+ state <- getState
+ let tabStop = stateTabStop state
+ sps <- many (char ' ')
+ if length sps < tabStop
+ then return sps
+ else unexpected "indented line"
+
+-- | Fail unless we're at beginning of a line.
+failUnlessBeginningOfLine :: GenParser tok st ()
+failUnlessBeginningOfLine = do
+ pos <- getPosition
+ if sourceColumn pos == 1 then return () else fail "not beginning of line"
+
+-- | Fail unless we're in "smart typography" mode.
+failUnlessSmart :: GenParser tok ParserState ()
+failUnlessSmart = do
+ state <- getState
+ if stateSmart state then return () else fail "Smart typography feature"
+
+-- | Parse a sequence of inline elements between square brackets,
+-- including inlines between balanced pairs of square brackets.
+inlinesInBalancedBrackets :: GenParser Char ParserState Inline
+ -> GenParser Char ParserState [Inline]
+inlinesInBalancedBrackets parser = try $ do
+ char '['
+ result <- manyTill ( (do lookAhead $ try $ do (Str res) <- parser
+ if res == "["
+ then return ()
+ else pzero
+ bal <- inlinesInBalancedBrackets parser
+ return $ [Str "["] ++ bal ++ [Str "]"])
+ <|> (count 1 parser))
+ (char ']')
+ return $ concat result
+
+--
+-- document structure
+--
+
+titleLine :: GenParser Char ParserState [Inline]
+titleLine = try $ char '%' >> skipSpaces >> manyTill inline newline
+
+authorsLine :: GenParser Char st [String]
+authorsLine = try $ do
+ char '%'
+ skipSpaces
+ authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;")
+ newline
+ return $ map (decodeCharacterReferences . removeLeadingTrailingSpace) authors
+
+dateLine :: GenParser Char st String
+dateLine = try $ do
+ char '%'
+ skipSpaces
+ date <- many (noneOf "\n")
+ newline
+ return $ decodeCharacterReferences $ removeTrailingSpace date
+
+titleBlock :: GenParser Char ParserState ([Inline], [String], [Char])
+titleBlock = try $ do
+ failIfStrict
+ title <- option [] titleLine
+ author <- option [] authorsLine
+ date <- option "" dateLine
+ optional blanklines
+ return (title, author, date)
+
+parseMarkdown :: GenParser Char ParserState Pandoc
+parseMarkdown = do
+ -- markdown allows raw HTML
+ updateState (\state -> state { stateParseRaw = True })
+ startPos <- getPosition
+ -- go through once just to get list of reference keys
+ -- docMinusKeys is the raw document with blanks where the keys were...
+ docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>=
+ return . concat
+ setInput docMinusKeys
+ setPosition startPos
+ st <- getState
+ -- go through again for notes unless strict...
+ if stateStrict st
+ then return ()
+ else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>=
+ return . concat
+ st' <- getState
+ let reversedNotes = stateNotes st'
+ updateState $ \s -> s { stateNotes = reverse reversedNotes }
+ setInput docMinusNotes
+ setPosition startPos
+ -- now parse it for real...
+ (title, author, date) <- option ([],[],"") titleBlock
+ blocks <- parseBlocks
+ return $ Pandoc (Meta title author date) $ filter (/= Null) blocks
+
+--
+-- initial pass for references and notes
+--
+
+referenceKey :: GenParser Char ParserState [Char]
+referenceKey = try $ do
+ startPos <- getPosition
+ nonindentSpaces
+ lab <- reference
+ char ':'
+ skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[')
+ let sourceURL excludes = many $
+ optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' referenceTitle >> char ' '))
+ src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n"
+ tit <- option "" referenceTitle
+ blanklines
+ endPos <- getPosition
+ let newkey = (lab, (intercalate "%20" $ words $ removeTrailingSpace src, tit))
+ st <- getState
+ let oldkeys = stateKeys st
+ updateState $ \s -> s { stateKeys = newkey : oldkeys }
+ -- return blanks so line count isn't affected
+ return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
+
+referenceTitle :: GenParser Char st String
+referenceTitle = try $ do
+ skipSpaces >> optional newline >> skipSpaces
+ tit <- (charsInBalanced '(' ')' >>= return . unwords . words)
+ <|> do delim <- char '\'' <|> char '"'
+ manyTill anyChar (try (char delim >> skipSpaces >>
+ notFollowedBy (noneOf ")\n")))
+ return $ decodeCharacterReferences tit
+
+noteMarker :: GenParser Char st [Char]
+noteMarker = string "[^" >> manyTill (noneOf " \t\n") (char ']')
+
+rawLine :: GenParser Char ParserState [Char]
+rawLine = do
+ notFollowedBy blankline
+ notFollowedBy' noteMarker
+ contents <- many1 nonEndline
+ end <- option "" (newline >> optional indentSpaces >> return "\n")
+ return $ contents ++ end
+
+rawLines :: GenParser Char ParserState [Char]
+rawLines = many1 rawLine >>= return . concat
+
+noteBlock :: GenParser Char ParserState [Char]
+noteBlock = try $ do
+ startPos <- getPosition
+ ref <- noteMarker
+ char ':'
+ optional blankline
+ optional indentSpaces
+ raw <- sepBy rawLines (try (blankline >> indentSpaces))
+ optional blanklines
+ endPos <- getPosition
+ -- parse the extracted text, which may contain various block elements:
+ contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n"
+ let newnote = (ref, contents)
+ st <- getState
+ let oldnotes = stateNotes st
+ updateState $ \s -> s { stateNotes = newnote : oldnotes }
+ -- return blanks so line count isn't affected
+ return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
+
+--
+-- parsing blocks
+--
+
+parseBlocks :: GenParser Char ParserState [Block]
+parseBlocks = manyTill block eof
+
+block :: GenParser Char ParserState Block
+block = do
+ st <- getState
+ choice (if stateStrict st
+ then [ header
+ , codeBlockIndented
+ , blockQuote
+ , hrule
+ , bulletList
+ , orderedList
+ , htmlBlock
+ , para
+ , plain
+ , nullBlock ]
+ else [ codeBlockDelimited
+ , header
+ , table
+ , codeBlockIndented
+ , lhsCodeBlock
+ , blockQuote
+ , hrule
+ , bulletList
+ , orderedList
+ , definitionList
+ , para
+ , rawHtmlBlocks
+ , plain
+ , nullBlock ]) <?> "block"
+
+--
+-- header blocks
+--
+
+header :: GenParser Char ParserState Block
+header = setextHeader <|> atxHeader <?> "header"
+
+atxHeader :: GenParser Char ParserState Block
+atxHeader = try $ do
+ level <- many1 (char '#') >>= return . length
+ notFollowedBy (char '.' <|> char ')') -- this would be a list
+ skipSpaces
+ text <- manyTill inline atxClosing >>= return . normalizeSpaces
+ return $ Header level text
+
+atxClosing :: GenParser Char st [Char]
+atxClosing = try $ skipMany (char '#') >> blanklines
+
+setextHeader :: GenParser Char ParserState Block
+setextHeader = try $ do
+ text <- many1Till inline newline
+ underlineChar <- oneOf setextHChars
+ many (char underlineChar)
+ blanklines
+ let level = (fromMaybe 0 $ findIndex (== underlineChar) setextHChars) + 1
+ return $ Header level (normalizeSpaces text)
+
+--
+-- hrule block
+--
+
+hrule :: GenParser Char st Block
+hrule = try $ do
+ skipSpaces
+ start <- oneOf hruleChars
+ count 2 (skipSpaces >> char start)
+ skipMany (oneOf spaceChars <|> char start)
+ newline
+ optional blanklines
+ return HorizontalRule
+
+--
+-- code blocks
+--
+
+indentedLine :: GenParser Char ParserState [Char]
+indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n")
+
+codeBlockDelimiter :: Maybe Int
+ -> GenParser Char st (Int, ([Char], [[Char]], [([Char], [Char])]))
+codeBlockDelimiter len = try $ do
+ size <- case len of
+ Just l -> count l (char '~') >> many (char '~') >> return l
+ Nothing -> count 3 (char '~') >> many (char '~') >>=
+ return . (+ 3) . length
+ many spaceChar
+ attr <- option ([],[],[]) attributes
+ blankline
+ return (size, attr)
+
+attributes :: GenParser Char st ([Char], [[Char]], [([Char], [Char])])
+attributes = try $ do
+ char '{'
+ many spaceChar
+ attrs <- many (attribute >>~ many spaceChar)
+ char '}'
+ let (ids, classes, keyvals) = unzip3 attrs
+ let id' = if null ids then "" else head ids
+ return (id', concat classes, concat keyvals)
+
+attribute :: GenParser Char st ([Char], [[Char]], [([Char], [Char])])
+attribute = identifierAttr <|> classAttr <|> keyValAttr
+
+identifier :: GenParser Char st [Char]
+identifier = do
+ first <- letter
+ rest <- many alphaNum
+ return (first:rest)
+
+identifierAttr :: GenParser Char st ([Char], [a], [a1])
+identifierAttr = try $ do
+ char '#'
+ result <- identifier
+ return (result,[],[])
+
+classAttr :: GenParser Char st ([Char], [[Char]], [a])
+classAttr = try $ do
+ char '.'
+ result <- identifier
+ return ("",[result],[])
+
+keyValAttr :: GenParser Char st ([Char], [a], [([Char], [Char])])
+keyValAttr = try $ do
+ key <- identifier
+ char '='
+ char '"'
+ val <- manyTill (noneOf "\n") (char '"')
+ return ("",[],[(key,val)])
+
+codeBlockDelimited :: GenParser Char st Block
+codeBlockDelimited = try $ do
+ (size, attr) <- codeBlockDelimiter Nothing
+ contents <- manyTill anyLine (codeBlockDelimiter (Just size))
+ blanklines
+ return $ CodeBlock attr $ intercalate "\n" contents
+
+codeBlockIndented :: GenParser Char ParserState Block
+codeBlockIndented = do
+ contents <- many1 (indentedLine <|>
+ try (do b <- blanklines
+ l <- indentedLine
+ return $ b ++ l))
+ optional blanklines
+ return $ CodeBlock ("",[],[]) $ stripTrailingNewlines $ concat contents
+
+lhsCodeBlock :: GenParser Char ParserState Block
+lhsCodeBlock = do
+ failUnlessLHS
+ contents <- lhsCodeBlockBird <|> lhsCodeBlockLaTeX
+ return $ CodeBlock ("",["sourceCode","haskell"],[]) contents
+
+lhsCodeBlockLaTeX :: GenParser Char ParserState String
+lhsCodeBlockLaTeX = try $ do
+ string "\\begin{code}"
+ manyTill spaceChar newline
+ contents <- many1Till anyChar (try $ string "\\end{code}")
+ blanklines
+ return $ stripTrailingNewlines contents
+
+lhsCodeBlockBird :: GenParser Char ParserState String
+lhsCodeBlockBird = try $ do
+ pos <- getPosition
+ when (sourceColumn pos /= 1) $ fail "Not in first column"
+ lns <- many1 birdTrackLine
+ -- if (as is normal) there is always a space after >, drop it
+ let lns' = if all (\ln -> null ln || take 1 ln == " ") lns
+ then map (drop 1) lns
+ else lns
+ blanklines
+ return $ intercalate "\n" lns'
+
+birdTrackLine :: GenParser Char st [Char]
+birdTrackLine = do
+ char '>'
+ manyTill anyChar newline
+
+
+--
+-- block quotes
+--
+
+emailBlockQuoteStart :: GenParser Char ParserState Char
+emailBlockQuoteStart = try $ nonindentSpaces >> char '>' >>~ optional (char ' ')
+
+emailBlockQuote :: GenParser Char ParserState [[Char]]
+emailBlockQuote = try $ do
+ emailBlockQuoteStart
+ raw <- sepBy (many (nonEndline <|>
+ (try (endline >> notFollowedBy emailBlockQuoteStart >>
+ return '\n'))))
+ (try (newline >> emailBlockQuoteStart))
+ newline <|> (eof >> return '\n')
+ optional blanklines
+ return raw
+
+blockQuote :: GenParser Char ParserState Block
+blockQuote = do
+ raw <- emailBlockQuote
+ -- parse the extracted block, which may contain various block elements:
+ contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n"
+ return $ BlockQuote contents
+
+--
+-- list blocks
+--
+
+bulletListStart :: GenParser Char ParserState ()
+bulletListStart = try $ do
+ optional newline -- if preceded by a Plain block in a list context
+ nonindentSpaces
+ notFollowedBy' hrule -- because hrules start out just like lists
+ oneOf bulletListMarkers
+ spaceChar
+ skipSpaces
+
+anyOrderedListStart :: GenParser Char ParserState (Int, ListNumberStyle, ListNumberDelim)
+anyOrderedListStart = try $ do
+ optional newline -- if preceded by a Plain block in a list context
+ nonindentSpaces
+ notFollowedBy $ string "p." >> spaceChar >> digit -- page number
+ state <- getState
+ if stateStrict state
+ then do many1 digit
+ char '.'
+ spaceChar
+ return (1, DefaultStyle, DefaultDelim)
+ else do (num, style, delim) <- anyOrderedListMarker
+ -- if it could be an abbreviated first name, insist on more than one space
+ if delim == Period && (style == UpperAlpha || (style == UpperRoman &&
+ num `elem` [1, 5, 10, 50, 100, 500, 1000]))
+ then char '\t' <|> (char ' ' >>~ notFollowedBy (satisfy isUpper))
+ else spaceChar
+ skipSpaces
+ return (num, style, delim)
+
+listStart :: GenParser Char ParserState ()
+listStart = bulletListStart <|> (anyOrderedListStart >> return ())
+
+-- parse a line of a list item (start = parser for beginning of list item)
+listLine :: GenParser Char ParserState [Char]
+listLine = try $ do
+ notFollowedBy' listStart
+ notFollowedBy blankline
+ notFollowedBy' (do indentSpaces
+ many (spaceChar)
+ listStart)
+ line <- manyTill anyChar newline
+ return $ line ++ "\n"
+
+-- parse raw text for one list item, excluding start marker and continuations
+rawListItem :: GenParser Char ParserState [Char]
+rawListItem = try $ do
+ listStart
+ result <- many1 listLine
+ blanks <- many blankline
+ return $ concat result ++ blanks
+
+-- continuation of a list item - indented and separated by blankline
+-- or (in compact lists) endline.
+-- note: nested lists are parsed as continuations
+listContinuation :: GenParser Char ParserState [Char]
+listContinuation = try $ do
+ lookAhead indentSpaces
+ result <- many1 listContinuationLine
+ blanks <- many blankline
+ return $ concat result ++ blanks
+
+listContinuationLine :: GenParser Char ParserState [Char]
+listContinuationLine = try $ do
+ notFollowedBy blankline
+ notFollowedBy' listStart
+ optional indentSpaces
+ result <- manyTill anyChar newline
+ return $ result ++ "\n"
+
+listItem :: GenParser Char ParserState [Block]
+listItem = try $ do
+ first <- rawListItem
+ continuations <- many listContinuation
+ -- parsing with ListItemState forces markers at beginning of lines to
+ -- count as list item markers, even if not separated by blank space.
+ -- see definition of "endline"
+ state <- getState
+ let oldContext = stateParserContext state
+ setState $ state {stateParserContext = ListItemState}
+ -- parse the extracted block, which may contain various block elements:
+ let raw = concat (first:continuations)
+ contents <- parseFromString parseBlocks raw
+ updateState (\st -> st {stateParserContext = oldContext})
+ return contents
+
+orderedList :: GenParser Char ParserState Block
+orderedList = try $ do
+ (start, style, delim) <- lookAhead anyOrderedListStart
+ items <- many1 listItem
+ return $ OrderedList (start, style, delim) $ compactify items
+
+bulletList :: GenParser Char ParserState Block
+bulletList = try $ do
+ lookAhead bulletListStart
+ many1 listItem >>= return . BulletList . compactify
+
+-- definition lists
+
+definitionListItem :: GenParser Char ParserState ([Inline], [Block])
+definitionListItem = try $ do
+ notFollowedBy blankline
+ notFollowedBy' indentSpaces
+ -- first, see if this has any chance of being a definition list:
+ lookAhead (anyLine >> char ':')
+ term <- manyTill inline newline
+ raw <- many1 defRawBlock
+ state <- getState
+ let oldContext = stateParserContext state
+ -- parse the extracted block, which may contain various block elements:
+ contents <- parseFromString parseBlocks $ concat raw
+ updateState (\st -> st {stateParserContext = oldContext})
+ return ((normalizeSpaces term), contents)
+
+defRawBlock :: GenParser Char ParserState [Char]
+defRawBlock = try $ do
+ char ':'
+ state <- getState
+ let tabStop = stateTabStop state
+ try (count (tabStop - 1) (char ' ')) <|> (many (char ' ') >> string "\t")
+ firstline <- anyLine
+ rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine)
+ trailing <- option "" blanklines
+ return $ firstline ++ "\n" ++ unlines rawlines ++ trailing
+
+definitionList :: GenParser Char ParserState Block
+definitionList = do
+ items <- many1 definitionListItem
+ let (terms, defs) = unzip items
+ let defs' = compactify defs
+ let items' = zip terms defs'
+ return $ DefinitionList items'
+
+--
+-- paragraph block
+--
+
+isHtmlOrBlank :: Inline -> Bool
+isHtmlOrBlank (HtmlInline _) = True
+isHtmlOrBlank (Space) = True
+isHtmlOrBlank (LineBreak) = True
+isHtmlOrBlank _ = False
+
+para :: GenParser Char ParserState Block
+para = try $ do
+ result <- many1 inline
+ if all isHtmlOrBlank result
+ then fail "treat as raw HTML"
+ else return ()
+ newline
+ blanklines <|> do st <- getState
+ if stateStrict st
+ then lookAhead (blockQuote <|> header) >> return ""
+ else pzero
+ return $ Para $ normalizeSpaces result
+
+plain :: GenParser Char ParserState Block
+plain = many1 inline >>= return . Plain . normalizeSpaces
+
+--
+-- raw html
+--
+
+htmlElement :: GenParser Char ParserState [Char]
+htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element"
+
+htmlBlock :: GenParser Char ParserState Block
+htmlBlock = try $ do
+ failUnlessBeginningOfLine
+ first <- htmlElement
+ finalSpace <- many (oneOf spaceChars)
+ finalNewlines <- many newline
+ return $ RawHtml $ first ++ finalSpace ++ finalNewlines
+
+-- True if tag is self-closing
+isSelfClosing :: [Char] -> Bool
+isSelfClosing tag =
+ isSuffixOf "/>" $ filter (not . (`elem` " \n\t")) tag
+
+strictHtmlBlock :: GenParser Char ParserState [Char]
+strictHtmlBlock = try $ do
+ tag <- anyHtmlBlockTag
+ let tag' = extractTagType tag
+ if isSelfClosing tag || tag' == "hr"
+ then return tag
+ else do contents <- many (notFollowedBy' (htmlEndTag tag') >>
+ (htmlElement <|> (count 1 anyChar)))
+ end <- htmlEndTag tag'
+ return $ tag ++ concat contents ++ end
+
+rawHtmlBlocks :: GenParser Char ParserState Block
+rawHtmlBlocks = do
+ htmlBlocks <- many1 $ do (RawHtml blk) <- rawHtmlBlock
+ sps <- do sp1 <- many spaceChar
+ sp2 <- option "" (blankline >> return "\n")
+ sp3 <- many spaceChar
+ sp4 <- option "" blanklines
+ return $ sp1 ++ sp2 ++ sp3 ++ sp4
+ -- note: we want raw html to be able to
+ -- precede a code block, when separated
+ -- by a blank line
+ return $ blk ++ sps
+ let combined = concat htmlBlocks
+ let combined' = if last combined == '\n' then init combined else combined
+ return $ RawHtml combined'
+
+--
+-- Tables
+--
+
+-- Parse a dashed line with optional trailing spaces; return its length
+-- and the length including trailing space.
+dashedLine :: Char
+ -> GenParser Char st (Int, Int)
+dashedLine ch = do
+ dashes <- many1 (char ch)
+ sp <- many spaceChar
+ return $ (length dashes, length $ dashes ++ sp)
+
+-- Parse a table header with dashed lines of '-' preceded by
+-- one line of text.
+simpleTableHeader :: GenParser Char ParserState ([[Char]], [Alignment], [Int])
+simpleTableHeader = try $ do
+ rawContent <- anyLine
+ initSp <- nonindentSpaces
+ dashes <- many1 (dashedLine '-')
+ newline
+ let (lengths, lines') = unzip dashes
+ let indices = scanl (+) (length initSp) lines'
+ let rawHeads = tail $ splitByIndices (init indices) rawContent
+ let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths
+ return (rawHeads, aligns, indices)
+
+-- Parse a table footer - dashed lines followed by blank line.
+tableFooter :: GenParser Char ParserState [Char]
+tableFooter = try $ nonindentSpaces >> many1 (dashedLine '-') >> blanklines
+
+-- Parse a table separator - dashed line.
+tableSep :: GenParser Char ParserState String
+tableSep = try $ nonindentSpaces >> many1 (dashedLine '-') >> string "\n"
+
+-- Parse a raw line and split it into chunks by indices.
+rawTableLine :: [Int]
+ -> GenParser Char ParserState [String]
+rawTableLine indices = do
+ notFollowedBy' (blanklines <|> tableFooter)
+ line <- many1Till anyChar newline
+ return $ map removeLeadingTrailingSpace $ tail $
+ splitByIndices (init indices) line
+
+-- Parse a table line and return a list of lists of blocks (columns).
+tableLine :: [Int]
+ -> GenParser Char ParserState [[Block]]
+tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain))
+
+-- Parse a multiline table row and return a list of blocks (columns).
+multilineRow :: [Int]
+ -> GenParser Char ParserState [[Block]]
+multilineRow indices = do
+ colLines <- many1 (rawTableLine indices)
+ optional blanklines
+ let cols = map unlines $ transpose colLines
+ mapM (parseFromString (many plain)) cols
+
+-- Calculate relative widths of table columns, based on indices
+widthsFromIndices :: Int -- Number of columns on terminal
+ -> [Int] -- Indices
+ -> [Double] -- Fractional relative sizes of columns
+widthsFromIndices _ [] = []
+widthsFromIndices numColumns indices =
+ let lengths = zipWith (-) indices (0:indices)
+ totLength = sum lengths
+ quotient = if totLength > numColumns
+ then fromIntegral totLength
+ else fromIntegral numColumns
+ fracs = map (\l -> (fromIntegral l) / quotient) lengths in
+ tail fracs
+
+-- Parses a table caption: inlines beginning with 'Table:'
+-- and followed by blank lines.
+tableCaption :: GenParser Char ParserState [Inline]
+tableCaption = try $ do
+ nonindentSpaces
+ string "Table:"
+ result <- many1 inline
+ blanklines
+ return $ normalizeSpaces result
+
+-- Parse a table using 'headerParser', 'lineParser', and 'footerParser'.
+tableWith :: GenParser Char ParserState ([[Char]], [Alignment], [Int])
+ -> ([Int] -> GenParser Char ParserState [[Block]])
+ -> GenParser Char ParserState end
+ -> GenParser Char ParserState Block
+tableWith headerParser lineParser footerParser = try $ do
+ (rawHeads, aligns, indices) <- headerParser
+ lines' <- many1Till (lineParser indices) footerParser
+ caption <- option [] tableCaption
+ heads <- mapM (parseFromString (many plain)) rawHeads
+ state <- getState
+ let numColumns = stateColumns state
+ let widths = widthsFromIndices numColumns indices
+ return $ Table caption aligns widths heads lines'
+
+-- Parse a simple table with '---' header and one line per row.
+simpleTable :: GenParser Char ParserState Block
+simpleTable = tableWith simpleTableHeader tableLine blanklines
+
+-- Parse a multiline table: starts with row of '-' on top, then header
+-- (which may be multiline), then the rows,
+-- which may be multiline, separated by blank lines, and
+-- ending with a footer (dashed line followed by blank line).
+multilineTable :: GenParser Char ParserState Block
+multilineTable = tableWith multilineTableHeader multilineRow tableFooter
+
+multilineTableHeader :: GenParser Char ParserState ([String], [Alignment], [Int])
+multilineTableHeader = try $ do
+ tableSep
+ rawContent <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline)
+ initSp <- nonindentSpaces
+ dashes <- many1 (dashedLine '-')
+ newline
+ let (lengths, lines') = unzip dashes
+ let indices = scanl (+) (length initSp) lines'
+ let rawHeadsList = transpose $ map
+ (\ln -> tail $ splitByIndices (init indices) ln)
+ rawContent
+ let rawHeads = map (intercalate " ") rawHeadsList
+ let aligns = zipWith alignType rawHeadsList lengths
+ return ((map removeLeadingTrailingSpace rawHeads), aligns, indices)
+
+-- Returns an alignment type for a table, based on a list of strings
+-- (the rows of the column header) and a number (the length of the
+-- dashed line under the rows.
+alignType :: [String]
+ -> Int
+ -> Alignment
+alignType [] _ = AlignDefault
+alignType strLst len =
+ let s = head $ sortBy (comparing length) $
+ map removeTrailingSpace strLst
+ leftSpace = if null s then False else (s !! 0) `elem` " \t"
+ rightSpace = length s < len || (s !! (len - 1)) `elem` " \t"
+ in case (leftSpace, rightSpace) of
+ (True, False) -> AlignRight
+ (False, True) -> AlignLeft
+ (True, True) -> AlignCenter
+ (False, False) -> AlignDefault
+
+table :: GenParser Char ParserState Block
+table = simpleTable <|> multilineTable <?> "table"
+
+--
+-- inline
+--
+
+inline :: GenParser Char ParserState Inline
+inline = choice inlineParsers <?> "inline"
+
+inlineParsers :: [GenParser Char ParserState Inline]
+inlineParsers = [ abbrev
+ , str
+ , smartPunctuation
+ , whitespace
+ , endline
+ , code
+ , charRef
+ , strong
+ , emph
+ , note
+ , inlineNote
+ , link
+#ifdef _CITEPROC
+ , inlineCitation
+#endif
+ , image
+ , math
+ , strikeout
+ , superscript
+ , subscript
+ , autoLink
+ , rawHtmlInline'
+ , rawLaTeXInline'
+ , escapedChar
+ , symbol
+ , ltSign ]
+
+inlineNonLink :: GenParser Char ParserState Inline
+inlineNonLink = (choice $
+ map (\parser -> try (parser >>= failIfLink)) inlineParsers)
+ <?> "inline (non-link)"
+
+failIfLink :: Inline -> GenParser tok st Inline
+failIfLink (Link _ _) = pzero
+failIfLink elt = return elt
+
+escapedChar :: GenParser Char ParserState Inline
+escapedChar = do
+ char '\\'
+ state <- getState
+ result <- option '\\' $ if stateStrict state
+ then oneOf "\\`*_{}[]()>#+-.!~"
+ else satisfy (not . isAlphaNum)
+ let result' = if result == ' '
+ then '\160' -- '\ ' is a nonbreaking space
+ else result
+ return $ Str [result']
+
+ltSign :: GenParser Char ParserState Inline
+ltSign = do
+ st <- getState
+ if stateStrict st
+ then char '<'
+ else notFollowedBy' rawHtmlBlocks >> char '<' -- unless it starts html
+ return $ Str ['<']
+
+specialCharsMinusLt :: [Char]
+specialCharsMinusLt = filter (/= '<') specialChars
+
+symbol :: GenParser Char ParserState Inline
+symbol = do
+ result <- oneOf specialCharsMinusLt
+ return $ Str [result]
+
+-- parses inline code, between n `s and n `s
+code :: GenParser Char ParserState Inline
+code = try $ do
+ starts <- many1 (char '`')
+ skipSpaces
+ result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|>
+ (char '\n' >> return " "))
+ (try (skipSpaces >> count (length starts) (char '`') >>
+ notFollowedBy (char '`')))
+ return $ Code $ removeLeadingTrailingSpace $ concat result
+
+mathWord :: GenParser Char st [Char]
+mathWord = many1 ((noneOf " \t\n\\$") <|>
+ (try (char '\\') >>~ notFollowedBy (char '$')))
+
+math :: GenParser Char ParserState Inline
+math = (mathDisplay >>= return . Math DisplayMath)
+ <|> (mathInline >>= return . Math InlineMath)
+
+mathDisplay :: GenParser Char ParserState String
+mathDisplay = try $ do
+ failIfStrict
+ string "$$"
+ many1Till (noneOf "\n" <|> (newline >>~ notFollowedBy' blankline)) (try $ string "$$")
+
+mathInline :: GenParser Char ParserState String
+mathInline = try $ do
+ failIfStrict
+ char '$'
+ notFollowedBy space
+ words' <- sepBy1 mathWord (many1 (spaceChar <|> (newline >>~ notFollowedBy' blankline)))
+ char '$'
+ notFollowedBy digit
+ return $ intercalate " " words'
+
+emph :: GenParser Char ParserState Inline
+emph = ((enclosed (char '*') (notFollowedBy' strong >> char '*') inline) <|>
+ (enclosed (char '_') (notFollowedBy' strong >> char '_' >>
+ notFollowedBy alphaNum) inline)) >>=
+ return . Emph . normalizeSpaces
+
+strong :: GenParser Char ParserState Inline
+strong = ((enclosed (string "**") (try $ string "**") inline) <|>
+ (enclosed (string "__") (try $ string "__") inline)) >>=
+ return . Strong . normalizeSpaces
+
+strikeout :: GenParser Char ParserState Inline
+strikeout = failIfStrict >> enclosed (string "~~") (try $ string "~~") inline >>=
+ return . Strikeout . normalizeSpaces
+
+superscript :: GenParser Char ParserState Inline
+superscript = failIfStrict >> enclosed (char '^') (char '^')
+ (notFollowedBy' whitespace >> inline) >>= -- may not contain Space
+ return . Superscript
+
+subscript :: GenParser Char ParserState Inline
+subscript = failIfStrict >> enclosed (char '~') (char '~')
+ (notFollowedBy' whitespace >> inline) >>= -- may not contain Space
+ return . Subscript
+
+abbrev :: GenParser Char ParserState Inline
+abbrev = failUnlessSmart >>
+ (assumedAbbrev <|> knownAbbrev) >>= return . Str . (++ ".\160")
+
+-- an string of letters followed by a period that does not end a sentence
+-- is assumed to be an abbreviation. It is assumed that sentences don't
+-- start with lowercase letters or numerals.
+assumedAbbrev :: GenParser Char ParserState [Char]
+assumedAbbrev = try $ do
+ result <- many1 $ satisfy isAlpha
+ string ". "
+ lookAhead $ satisfy (\x -> isLower x || isDigit x)
+ return result
+
+-- these strings are treated as abbreviations even if they are followed
+-- by a capital letter (such as a name).
+knownAbbrev :: GenParser Char ParserState [Char]
+knownAbbrev = try $ do
+ result <- oneOfStrings [ "Mr", "Mrs", "Ms", "Capt", "Dr", "Prof", "Gen",
+ "Gov", "e.g", "i.e", "Sgt", "St", "vol", "vs",
+ "Sen", "Rep", "Pres", "Hon", "Rev" ]
+ string ". "
+ return result
+
+smartPunctuation :: GenParser Char ParserState Inline
+smartPunctuation = failUnlessSmart >>
+ choice [ quoted, apostrophe, dash, ellipses ]
+
+apostrophe :: GenParser Char ParserState Inline
+apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe
+
+quoted :: GenParser Char ParserState Inline
+quoted = doubleQuoted <|> singleQuoted
+
+withQuoteContext :: QuoteContext
+ -> (GenParser Char ParserState Inline)
+ -> GenParser Char ParserState Inline
+withQuoteContext context parser = do
+ oldState <- getState
+ let oldQuoteContext = stateQuoteContext oldState
+ setState oldState { stateQuoteContext = context }
+ result <- parser
+ newState <- getState
+ setState newState { stateQuoteContext = oldQuoteContext }
+ return result
+
+singleQuoted :: GenParser Char ParserState Inline
+singleQuoted = try $ do
+ singleQuoteStart
+ withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>=
+ return . Quoted SingleQuote . normalizeSpaces
+
+doubleQuoted :: GenParser Char ParserState Inline
+doubleQuoted = try $ do
+ doubleQuoteStart
+ withQuoteContext InDoubleQuote $ many1Till inline doubleQuoteEnd >>=
+ return . Quoted DoubleQuote . normalizeSpaces
+
+failIfInQuoteContext :: QuoteContext -> GenParser tok ParserState ()
+failIfInQuoteContext context = do
+ st <- getState
+ if stateQuoteContext st == context
+ then fail "already inside quotes"
+ else return ()
+
+singleQuoteStart :: GenParser Char ParserState Char
+singleQuoteStart = do
+ failIfInQuoteContext InSingleQuote
+ char '\8216' <|>
+ (try $ do char '\''
+ notFollowedBy (oneOf ")!],.;:-? \t\n")
+ notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >>
+ satisfy (not . isAlphaNum)))
+ -- possess/contraction
+ return '\'')
+
+singleQuoteEnd :: GenParser Char st Char
+singleQuoteEnd = try $ do
+ char '\8217' <|> char '\''
+ notFollowedBy alphaNum
+ return '\''
+
+doubleQuoteStart :: GenParser Char ParserState Char
+doubleQuoteStart = do
+ failIfInQuoteContext InDoubleQuote
+ char '\8220' <|>
+ (try $ do char '"'
+ notFollowedBy (oneOf " \t\n")
+ return '"')
+
+doubleQuoteEnd :: GenParser Char st Char
+doubleQuoteEnd = char '\8221' <|> char '"'
+
+ellipses :: GenParser Char st Inline
+ellipses = oneOfStrings ["...", " . . . ", ". . .", " . . ."] >> return Ellipses
+
+dash :: GenParser Char st Inline
+dash = enDash <|> emDash
+
+enDash :: GenParser Char st Inline
+enDash = try $ char '-' >> notFollowedBy (noneOf "0123456789") >> return EnDash
+
+emDash :: GenParser Char st Inline
+emDash = oneOfStrings ["---", "--"] >> return EmDash
+
+whitespace :: GenParser Char ParserState Inline
+whitespace = do
+ sps <- many1 (oneOf spaceChars)
+ if length sps >= 2
+ then option Space (endline >> return LineBreak)
+ else return Space <?> "whitespace"
+
+nonEndline :: GenParser Char st Char
+nonEndline = satisfy (/='\n')
+
+strChar :: GenParser Char st Char
+strChar = noneOf (specialChars ++ spaceChars ++ "\n")
+
+str :: GenParser Char st Inline
+str = many1 strChar >>= return . Str
+
+-- an endline character that can be treated as a space, not a structural break
+endline :: GenParser Char ParserState Inline
+endline = try $ do
+ newline
+ notFollowedBy blankline
+ st <- getState
+ if stateStrict st
+ then do notFollowedBy emailBlockQuoteStart
+ notFollowedBy (char '#') -- atx header
+ else return ()
+ -- parse potential list-starts differently if in a list:
+ if stateParserContext st == ListItemState
+ then notFollowedBy' (bulletListStart <|>
+ (anyOrderedListStart >> return ()))
+ else return ()
+ return Space
+
+--
+-- links
+--
+
+-- a reference label for a link
+reference :: GenParser Char ParserState [Inline]
+reference = do notFollowedBy' (string "[^") -- footnote reference
+ result <- inlinesInBalancedBrackets inlineNonLink
+ return $ normalizeSpaces result
+
+-- source for a link, with optional title
+source :: GenParser Char st (String, [Char])
+source =
+ (try $ charsInBalanced '(' ')' >>= parseFromString source') <|>
+ -- the following is needed for cases like: [ref](/url(a).
+ (enclosed (char '(') (char ')') anyChar >>=
+ parseFromString source')
+
+-- auxiliary function for source
+source' :: GenParser Char st (String, [Char])
+source' = do
+ skipSpaces
+ let sourceURL excludes = many $
+ optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' linkTitle >> char ' '))
+ src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n"
+ tit <- option "" linkTitle
+ skipSpaces
+ eof
+ return (intercalate "%20" $ words $ removeTrailingSpace src, tit)
+
+linkTitle :: GenParser Char st String
+linkTitle = try $ do
+ (many1 spaceChar >> option '\n' newline) <|> newline
+ skipSpaces
+ delim <- oneOf "'\""
+ tit <- manyTill (optional (char '\\') >> anyChar)
+ (try (char delim >> skipSpaces >> eof))
+ return $ decodeCharacterReferences tit
+
+link :: GenParser Char ParserState Inline
+link = try $ do
+ lab <- reference
+ src <- source <|> referenceLink lab
+ sanitize <- getState >>= return . stateSanitizeHTML
+ if sanitize && unsanitaryURI (fst src)
+ then fail "Unsanitary URI"
+ else return $ Link lab src
+
+-- a link like [this][ref] or [this][] or [this]
+referenceLink :: [Inline]
+ -> GenParser Char ParserState (String, [Char])
+referenceLink lab = do
+ ref <- option [] (try (optional (char ' ') >>
+ optional (newline >> skipSpaces) >> reference))
+ let ref' = if null ref then lab else ref
+ state <- getState
+ case lookupKeySrc (stateKeys state) ref' of
+ Nothing -> fail "no corresponding key"
+ Just target -> return target
+
+autoLink :: GenParser Char ParserState Inline
+autoLink = try $ do
+ char '<'
+ src <- uri <|> (emailAddress >>= (return . ("mailto:" ++)))
+ char '>'
+ let src' = if "mailto:" `isPrefixOf` src
+ then drop 7 src
+ else src
+ st <- getState
+ let sanitize = stateSanitizeHTML st
+ if sanitize && unsanitaryURI src
+ then fail "Unsanitary URI"
+ else return $ if stateStrict st
+ then Link [Str src'] (src, "")
+ else Link [Code src'] (src, "")
+
+image :: GenParser Char ParserState Inline
+image = try $ do
+ char '!'
+ (Link lab src) <- link
+ return $ Image lab src
+
+note :: GenParser Char ParserState Inline
+note = try $ do
+ failIfStrict
+ ref <- noteMarker
+ state <- getState
+ let notes = stateNotes state
+ case lookup ref notes of
+ Nothing -> fail "note not found"
+ Just contents -> return $ Note contents
+
+inlineNote :: GenParser Char ParserState Inline
+inlineNote = try $ do
+ failIfStrict
+ char '^'
+ contents <- inlinesInBalancedBrackets inline
+ return $ Note [Para contents]
+
+rawLaTeXInline' :: GenParser Char ParserState Inline
+rawLaTeXInline' = do
+ failIfStrict
+ (rawConTeXtEnvironment' >>= return . TeX)
+ <|> (rawLaTeXEnvironment' >>= return . TeX)
+ <|> rawLaTeXInline
+
+rawConTeXtEnvironment' :: GenParser Char st String
+rawConTeXtEnvironment' = try $ do
+ string "\\start"
+ completion <- inBrackets (letter <|> digit <|> spaceChar)
+ <|> (many1 letter)
+ contents <- manyTill (rawConTeXtEnvironment' <|> (count 1 anyChar))
+ (try $ string "\\stop" >> string completion)
+ return $ "\\start" ++ completion ++ concat contents ++ "\\stop" ++ completion
+
+inBrackets :: (GenParser Char st Char) -> GenParser Char st String
+inBrackets parser = do
+ char '['
+ contents <- many parser
+ char ']'
+ return $ "[" ++ contents ++ "]"
+
+rawHtmlInline' :: GenParser Char ParserState Inline
+rawHtmlInline' = do
+ st <- getState
+ result <- if stateStrict st
+ then choice [htmlBlockElement, anyHtmlTag, anyHtmlEndTag]
+ else anyHtmlInlineTag
+ return $ HtmlInline result
+
+#ifdef _CITEPROC
+inlineCitation :: GenParser Char ParserState Inline
+inlineCitation = try $ do
+ failIfStrict
+ cit <- citeMarker
+ let citations = readWith parseCitation defaultParserState cit
+ mr <- mapM chkCit citations
+ if catMaybes mr /= []
+ then return $ Cite citations []
+ else fail "no citation found"
+
+chkCit :: Target -> GenParser Char ParserState (Maybe Target)
+chkCit t = do
+ st <- getState
+ case lookupKeySrc (stateKeys st) [Str $ fst t] of
+ Just _ -> fail "This is a link"
+ Nothing -> if elem (fst t) $ stateCitations st
+ then return $ Just t
+ else return $ Nothing
+
+citeMarker :: GenParser Char ParserState String
+citeMarker = char '[' >> manyTill ( noneOf "\n" <|> (newline >>~ notFollowedBy blankline) ) (char ']')
+
+parseCitation :: GenParser Char ParserState [(String,String)]
+parseCitation = try $ sepBy (parseLabel) (oneOf ";")
+
+parseLabel :: GenParser Char ParserState (String,String)
+parseLabel = try $ do
+ res <- sepBy (skipSpaces >> optional newline >> skipSpaces >> many1 (noneOf "@;")) (oneOf "@")
+ case res of
+ [lab,loc] -> return (lab, loc)
+ [lab] -> return (lab, "" )
+ _ -> return ("" , "" )
+
+#endif