summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/Markdown.hs
diff options
context:
space:
mode:
authordr@jones.dk <dr@jones.dk>2009-12-14 12:57:35 +0100
committerdr@jones.dk <dr@jones.dk>2009-12-14 12:57:35 +0100
commit789d0772d8b5d9c066fb8624bd51576cbde5e30b (patch)
tree7141187124ecc41b13861c81c7b642076cb88078 /src/Text/Pandoc/Readers/Markdown.hs
parent88b315ccee666385e1a4c52e2eb5fb0b0ffe8d60 (diff)
Imported Upstream version 1.3
Diffstat (limited to 'src/Text/Pandoc/Readers/Markdown.hs')
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs219
1 files changed, 144 insertions, 75 deletions
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index ae682e72e..0de700537 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -42,13 +42,15 @@ import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' )
import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,
anyHtmlInlineTag, anyHtmlTag,
anyHtmlEndTag, htmlEndTag, extractTagType,
- htmlBlockElement, unsanitaryURI )
+ htmlBlockElement, htmlComment, unsanitaryURI )
import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
import Text.ParserCombinators.Parsec
-import Control.Monad (when)
+import Control.Monad (when, liftM, unless)
-- | Read markdown from an input string and return a Pandoc document.
-readMarkdown :: ParserState -> String -> Pandoc
+readMarkdown :: ParserState -- ^ Parser state, including options for parser
+ -> String -- ^ String to parse (assuming @'\n'@ line endings)
+ -> Pandoc
readMarkdown state s = (readWith parseMarkdown) state (s ++ "\n\n")
--
@@ -107,7 +109,7 @@ failUnlessBeginningOfLine = do
failUnlessSmart :: GenParser tok ParserState ()
failUnlessSmart = do
state <- getState
- if stateSmart state then return () else fail "Smart typography feature"
+ if stateSmart state then return () else pzero
-- | Parse a sequence of inline elements between square brackets,
-- including inlines between balanced pairs of square brackets.
@@ -116,9 +118,7 @@ inlinesInBalancedBrackets :: GenParser Char ParserState Inline
inlinesInBalancedBrackets parser = try $ do
char '['
result <- manyTill ( (do lookAhead $ try $ do (Str res) <- parser
- if res == "["
- then return ()
- else pzero
+ unless (res == "[") pzero
bal <- inlinesInBalancedBrackets parser
return $ [Str "["] ++ bal ++ [Str "]"])
<|> (count 1 parser))
@@ -162,23 +162,18 @@ parseMarkdown = do
-- markdown allows raw HTML
updateState (\state -> state { stateParseRaw = True })
startPos <- getPosition
- -- go through once just to get list of reference keys
- -- docMinusKeys is the raw document with blanks where the keys were...
- docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>=
- return . concat
+ -- go through once just to get list of reference keys and notes
+ -- docMinusKeys is the raw document with blanks where the keys/notes were...
+ st <- getState
+ let firstPassParser = referenceKey
+ <|> (if stateStrict st then pzero else noteBlock)
+ <|> lineClump
+ docMinusKeys <- liftM concat $ manyTill firstPassParser eof
setInput docMinusKeys
setPosition startPos
- st <- getState
- -- go through again for notes unless strict...
- if stateStrict st
- then return ()
- else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>=
- return . concat
- st' <- getState
- let reversedNotes = stateNotes st'
- updateState $ \s -> s { stateNotes = reverse reversedNotes }
- setInput docMinusNotes
- setPosition startPos
+ st' <- getState
+ let reversedNotes = stateNotes st'
+ updateState $ \s -> s { stateNotes = reverse reversedNotes }
-- now parse it for real...
(title, author, date) <- option ([],[],"") titleBlock
blocks <- parseBlocks
@@ -201,7 +196,7 @@ referenceKey = try $ do
tit <- option "" referenceTitle
blanklines
endPos <- getPosition
- let newkey = (lab, (intercalate "%20" $ words $ removeTrailingSpace src, tit))
+ let newkey = (lab, (intercalate "+" $ words $ removeTrailingSpace src, tit))
st <- getState
let oldkeys = stateKeys st
updateState $ \s -> s { stateKeys = newkey : oldkeys }
@@ -241,9 +236,7 @@ noteBlock = try $ do
raw <- sepBy rawLines (try (blankline >> indentSpaces))
optional blanklines
endPos <- getPosition
- -- parse the extracted text, which may contain various block elements:
- contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n"
- let newnote = (ref, contents)
+ let newnote = (ref, (intercalate "\n" raw) ++ "\n\n")
st <- getState
let oldnotes = stateNotes st
updateState $ \s -> s { stateNotes = newnote : oldnotes }
@@ -399,13 +392,15 @@ codeBlockIndented = do
l <- indentedLine
return $ b ++ l))
optional blanklines
- return $ CodeBlock ("",[],[]) $ stripTrailingNewlines $ concat contents
+ st <- getState
+ return $ CodeBlock ("", stateIndentedCodeClasses st, []) $
+ stripTrailingNewlines $ concat contents
lhsCodeBlock :: GenParser Char ParserState Block
lhsCodeBlock = do
failUnlessLHS
contents <- lhsCodeBlockBird <|> lhsCodeBlockLaTeX
- return $ CodeBlock ("",["sourceCode","haskell"],[]) contents
+ return $ CodeBlock ("",["sourceCode","literate","haskell"],[]) contents
lhsCodeBlockLaTeX :: GenParser Char ParserState String
lhsCodeBlockLaTeX = try $ do
@@ -502,8 +497,8 @@ listLine = try $ do
notFollowedBy' (do indentSpaces
many (spaceChar)
listStart)
- line <- manyTill anyChar newline
- return $ line ++ "\n"
+ chunks <- manyTill (htmlComment <|> count 1 anyChar) newline
+ return $ concat chunks ++ "\n"
-- parse raw text for one list item, excluding start marker and continuations
rawListItem :: GenParser Char ParserState [Char]
@@ -560,38 +555,61 @@ bulletList = try $ do
-- definition lists
-definitionListItem :: GenParser Char ParserState ([Inline], [Block])
+defListMarker :: GenParser Char ParserState ()
+defListMarker = do
+ sps <- nonindentSpaces
+ char ':' <|> char '~'
+ st <- getState
+ let tabStop = stateTabStop st
+ let remaining = tabStop - (length sps + 1)
+ if remaining > 0
+ then count remaining (char ' ') <|> string "\t"
+ else pzero
+ return ()
+
+definitionListItem :: GenParser Char ParserState ([Inline], [[Block]])
definitionListItem = try $ do
- notFollowedBy blankline
- notFollowedBy' indentSpaces
-- first, see if this has any chance of being a definition list:
- lookAhead (anyLine >> char ':')
+ lookAhead (anyLine >> optional blankline >> defListMarker)
term <- manyTill inline newline
+ optional blankline
raw <- many1 defRawBlock
state <- getState
let oldContext = stateParserContext state
-- parse the extracted block, which may contain various block elements:
- contents <- parseFromString parseBlocks $ concat raw
+ contents <- mapM (parseFromString parseBlocks) raw
updateState (\st -> st {stateParserContext = oldContext})
return ((normalizeSpaces term), contents)
defRawBlock :: GenParser Char ParserState [Char]
defRawBlock = try $ do
- char ':'
- state <- getState
- let tabStop = stateTabStop state
- try (count (tabStop - 1) (char ' ')) <|> (many (char ' ') >> string "\t")
+ defListMarker
firstline <- anyLine
rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine)
trailing <- option "" blanklines
- return $ firstline ++ "\n" ++ unlines rawlines ++ trailing
+ cont <- liftM concat $ many $ do
+ lns <- many1 $ notFollowedBy blankline >> indentSpaces >> anyLine
+ trl <- option "" blanklines
+ return $ unlines lns ++ trl
+ return $ firstline ++ "\n" ++ unlines rawlines ++ trailing ++ cont
definitionList :: GenParser Char ParserState Block
definitionList = do
items <- many1 definitionListItem
- let (terms, defs) = unzip items
- let defs' = compactify defs
- let items' = zip terms defs'
+ -- "compactify" the definition list:
+ let defs = map snd items
+ let defBlocks = reverse $ concat $ concat defs
+ let isPara (Para _) = True
+ isPara _ = False
+ let items' = case take 1 defBlocks of
+ [Para x] -> if not $ any isPara (drop 1 defBlocks)
+ then let (t,ds) = last items
+ lastDef = last ds
+ ds' = init ds ++
+ [init lastDef ++ [Plain x]]
+ in init items ++ [(t, ds')]
+ else items
+ _ -> items
return $ DefinitionList items'
--
@@ -681,26 +699,36 @@ dashedLine ch = do
return $ (length dashes, length $ dashes ++ sp)
-- Parse a table header with dashed lines of '-' preceded by
--- one line of text.
-simpleTableHeader :: GenParser Char ParserState ([[Char]], [Alignment], [Int])
-simpleTableHeader = try $ do
- rawContent <- anyLine
+-- one (or zero) line of text.
+simpleTableHeader :: Bool -- ^ Headerless table
+ -> GenParser Char ParserState ([[Char]], [Alignment], [Int])
+simpleTableHeader headless = try $ do
+ rawContent <- if headless
+ then return ""
+ else anyLine
initSp <- nonindentSpaces
dashes <- many1 (dashedLine '-')
newline
let (lengths, lines') = unzip dashes
let indices = scanl (+) (length initSp) lines'
- let rawHeads = tail $ splitByIndices (init indices) rawContent
+ -- If no header, calculate alignment on basis of first row of text
+ rawHeads <- liftM (tail . splitByIndices (init indices)) $
+ if headless
+ then lookAhead anyLine
+ else return rawContent
let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths
- return (rawHeads, aligns, indices)
+ let rawHeads' = if headless
+ then replicate (length dashes) ""
+ else rawHeads
+ return (rawHeads', aligns, indices)
-- Parse a table footer - dashed lines followed by blank line.
tableFooter :: GenParser Char ParserState [Char]
tableFooter = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> blanklines
-- Parse a table separator - dashed line.
-tableSep :: GenParser Char ParserState String
-tableSep = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> string "\n"
+tableSep :: GenParser Char ParserState Char
+tableSep = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> char '\n'
-- Parse a raw line and split it into chunks by indices.
rawTableLine :: [Int]
@@ -731,7 +759,17 @@ widthsFromIndices :: Int -- Number of columns on terminal
-> [Double] -- Fractional relative sizes of columns
widthsFromIndices _ [] = []
widthsFromIndices numColumns indices =
- let lengths = zipWith (-) indices (0:indices)
+ let lengths' = zipWith (-) indices (0:indices)
+ lengths = reverse $
+ case reverse lengths' of
+ [] -> []
+ [x] -> [x]
+ -- compensate for the fact that intercolumn
+ -- spaces are counted in widths of all columns
+ -- but the last...
+ (x:y:zs) -> if x < y && y - x <= 2
+ then y:y:zs
+ else x:y:zs
totLength = sum lengths
quotient = if totLength > numColumns
then fromIntegral totLength
@@ -765,30 +803,48 @@ tableWith headerParser lineParser footerParser = try $ do
return $ Table caption aligns widths heads lines'
-- Parse a simple table with '---' header and one line per row.
-simpleTable :: GenParser Char ParserState Block
-simpleTable = tableWith simpleTableHeader tableLine blanklines
+simpleTable :: Bool -- ^ Headerless table
+ -> GenParser Char ParserState Block
+simpleTable headless = do
+ Table c a _w h l <- tableWith (simpleTableHeader headless) tableLine
+ (if headless then tableFooter else tableFooter <|> blanklines)
+ -- Simple tables get 0s for relative column widths (i.e., use default)
+ return $ Table c a (replicate (length a) 0) h l
-- Parse a multiline table: starts with row of '-' on top, then header
-- (which may be multiline), then the rows,
-- which may be multiline, separated by blank lines, and
-- ending with a footer (dashed line followed by blank line).
-multilineTable :: GenParser Char ParserState Block
-multilineTable = tableWith multilineTableHeader multilineRow tableFooter
-
-multilineTableHeader :: GenParser Char ParserState ([String], [Alignment], [Int])
-multilineTableHeader = try $ do
- tableSep
- rawContent <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline)
+multilineTable :: Bool -- ^ Headerless table
+ -> GenParser Char ParserState Block
+multilineTable headless =
+ tableWith (multilineTableHeader headless) multilineRow tableFooter
+
+multilineTableHeader :: Bool -- ^ Headerless table
+ -> GenParser Char ParserState ([String], [Alignment], [Int])
+multilineTableHeader headless = try $ do
+ if headless
+ then return '\n'
+ else tableSep
+ rawContent <- if headless
+ then return $ repeat ""
+ else many1
+ (notFollowedBy tableSep >> many1Till anyChar newline)
initSp <- nonindentSpaces
dashes <- many1 (dashedLine '-')
newline
let (lengths, lines') = unzip dashes
let indices = scanl (+) (length initSp) lines'
- let rawHeadsList = transpose $ map
- (\ln -> tail $ splitByIndices (init indices) ln)
- rawContent
- let rawHeads = map (intercalate " ") rawHeadsList
+ rawHeadsList <- if headless
+ then liftM (map (:[]) . tail .
+ splitByIndices (init indices)) $ lookAhead anyLine
+ else return $ transpose $ map
+ (\ln -> tail $ splitByIndices (init indices) ln)
+ rawContent
let aligns = zipWith alignType rawHeadsList lengths
+ let rawHeads = if headless
+ then replicate (length dashes) ""
+ else map (intercalate " ") rawHeadsList
return ((map removeLeadingTrailingSpace rawHeads), aligns, indices)
-- Returns an alignment type for a table, based on a list of strings
@@ -810,7 +866,8 @@ alignType strLst len =
(False, False) -> AlignDefault
table :: GenParser Char ParserState Block
-table = simpleTable <|> multilineTable <?> "table"
+table = multilineTable False <|> simpleTable True <|>
+ simpleTable False <|> multilineTable True <?> "table"
--
-- inline
@@ -826,6 +883,7 @@ inlineParsers = [ str
, endline
, code
, charRef
+ , (fourOrMore '*' <|> fourOrMore '_')
, strong
, emph
, note
@@ -862,10 +920,10 @@ escapedChar = do
result <- option '\\' $ if stateStrict state
then oneOf "\\`*_{}[]()>#+-.!~"
else satisfy (not . isAlphaNum)
- let result' = if result == ' '
- then '\160' -- '\ ' is a nonbreaking space
- else result
- return $ Str [result']
+ return $ case result of
+ ' ' -> Str "\160" -- "\ " is a nonbreaking space
+ '\n' -> LineBreak -- "\[newline]" is a linebreak
+ _ -> Str [result]
ltSign :: GenParser Char ParserState Inline
ltSign = do
@@ -895,8 +953,13 @@ code = try $ do
return $ Code $ removeLeadingTrailingSpace $ concat result
mathWord :: GenParser Char st [Char]
-mathWord = many1 ((noneOf " \t\n\\$") <|>
- (try (char '\\') >>~ notFollowedBy (char '$')))
+mathWord = liftM concat $ many1 mathChunk
+
+mathChunk :: GenParser Char st [Char]
+mathChunk = do char '\\'
+ c <- anyChar
+ return ['\\',c]
+ <|> many1 (noneOf " \t\n\\$")
math :: GenParser Char ParserState Inline
math = (mathDisplay >>= return . Math DisplayMath)
@@ -918,6 +981,12 @@ mathInline = try $ do
notFollowedBy digit
return $ intercalate " " words'
+-- to avoid performance problems, treat 4 or more _ or * in a row as a literal
+-- rather than attempting to parse for emph/strong
+fourOrMore :: Char -> GenParser Char st Inline
+fourOrMore c = try $ count 4 (char c) >> many (char c) >>= \s ->
+ return (Str $ replicate 4 c ++ s)
+
emph :: GenParser Char ParserState Inline
emph = ((enclosed (char '*') (notFollowedBy' strong >> char '*') inline) <|>
(enclosed (char '_') (notFollowedBy' strong >> char '_' >>
@@ -1106,7 +1175,7 @@ source' = do
tit <- option "" linkTitle
skipSpaces
eof
- return (intercalate "%20" $ words $ removeTrailingSpace src, tit)
+ return (intercalate "+" $ words $ removeTrailingSpace src, tit)
linkTitle :: GenParser Char st String
linkTitle = try $ do
@@ -1167,8 +1236,8 @@ note = try $ do
state <- getState
let notes = stateNotes state
case lookup ref notes of
- Nothing -> fail "note not found"
- Just contents -> return $ Note contents
+ Nothing -> fail "note not found"
+ Just raw -> liftM Note $ parseFromString parseBlocks raw
inlineNote :: GenParser Char ParserState Inline
inlineNote = try $ do