diff options
author | dr@jones.dk <dr@jones.dk> | 2009-12-14 12:57:35 +0100 |
---|---|---|
committer | dr@jones.dk <dr@jones.dk> | 2009-12-14 12:57:35 +0100 |
commit | 789d0772d8b5d9c066fb8624bd51576cbde5e30b (patch) | |
tree | 7141187124ecc41b13861c81c7b642076cb88078 /src/Text/Pandoc/Readers/Markdown.hs | |
parent | 88b315ccee666385e1a4c52e2eb5fb0b0ffe8d60 (diff) |
Imported Upstream version 1.3
Diffstat (limited to 'src/Text/Pandoc/Readers/Markdown.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 219 |
1 files changed, 144 insertions, 75 deletions
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index ae682e72e..0de700537 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -42,13 +42,15 @@ import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' ) import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag, anyHtmlInlineTag, anyHtmlTag, anyHtmlEndTag, htmlEndTag, extractTagType, - htmlBlockElement, unsanitaryURI ) + htmlBlockElement, htmlComment, unsanitaryURI ) import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) import Text.ParserCombinators.Parsec -import Control.Monad (when) +import Control.Monad (when, liftM, unless) -- | Read markdown from an input string and return a Pandoc document. -readMarkdown :: ParserState -> String -> Pandoc +readMarkdown :: ParserState -- ^ Parser state, including options for parser + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> Pandoc readMarkdown state s = (readWith parseMarkdown) state (s ++ "\n\n") -- @@ -107,7 +109,7 @@ failUnlessBeginningOfLine = do failUnlessSmart :: GenParser tok ParserState () failUnlessSmart = do state <- getState - if stateSmart state then return () else fail "Smart typography feature" + if stateSmart state then return () else pzero -- | Parse a sequence of inline elements between square brackets, -- including inlines between balanced pairs of square brackets. @@ -116,9 +118,7 @@ inlinesInBalancedBrackets :: GenParser Char ParserState Inline inlinesInBalancedBrackets parser = try $ do char '[' result <- manyTill ( (do lookAhead $ try $ do (Str res) <- parser - if res == "[" - then return () - else pzero + unless (res == "[") pzero bal <- inlinesInBalancedBrackets parser return $ [Str "["] ++ bal ++ [Str "]"]) <|> (count 1 parser)) @@ -162,23 +162,18 @@ parseMarkdown = do -- markdown allows raw HTML updateState (\state -> state { stateParseRaw = True }) startPos <- getPosition - -- go through once just to get list of reference keys - -- docMinusKeys is the raw document with blanks where the keys were... - docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= - return . concat + -- go through once just to get list of reference keys and notes + -- docMinusKeys is the raw document with blanks where the keys/notes were... + st <- getState + let firstPassParser = referenceKey + <|> (if stateStrict st then pzero else noteBlock) + <|> lineClump + docMinusKeys <- liftM concat $ manyTill firstPassParser eof setInput docMinusKeys setPosition startPos - st <- getState - -- go through again for notes unless strict... - if stateStrict st - then return () - else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>= - return . concat - st' <- getState - let reversedNotes = stateNotes st' - updateState $ \s -> s { stateNotes = reverse reversedNotes } - setInput docMinusNotes - setPosition startPos + st' <- getState + let reversedNotes = stateNotes st' + updateState $ \s -> s { stateNotes = reverse reversedNotes } -- now parse it for real... (title, author, date) <- option ([],[],"") titleBlock blocks <- parseBlocks @@ -201,7 +196,7 @@ referenceKey = try $ do tit <- option "" referenceTitle blanklines endPos <- getPosition - let newkey = (lab, (intercalate "%20" $ words $ removeTrailingSpace src, tit)) + let newkey = (lab, (intercalate "+" $ words $ removeTrailingSpace src, tit)) st <- getState let oldkeys = stateKeys st updateState $ \s -> s { stateKeys = newkey : oldkeys } @@ -241,9 +236,7 @@ noteBlock = try $ do raw <- sepBy rawLines (try (blankline >> indentSpaces)) optional blanklines endPos <- getPosition - -- parse the extracted text, which may contain various block elements: - contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n" - let newnote = (ref, contents) + let newnote = (ref, (intercalate "\n" raw) ++ "\n\n") st <- getState let oldnotes = stateNotes st updateState $ \s -> s { stateNotes = newnote : oldnotes } @@ -399,13 +392,15 @@ codeBlockIndented = do l <- indentedLine return $ b ++ l)) optional blanklines - return $ CodeBlock ("",[],[]) $ stripTrailingNewlines $ concat contents + st <- getState + return $ CodeBlock ("", stateIndentedCodeClasses st, []) $ + stripTrailingNewlines $ concat contents lhsCodeBlock :: GenParser Char ParserState Block lhsCodeBlock = do failUnlessLHS contents <- lhsCodeBlockBird <|> lhsCodeBlockLaTeX - return $ CodeBlock ("",["sourceCode","haskell"],[]) contents + return $ CodeBlock ("",["sourceCode","literate","haskell"],[]) contents lhsCodeBlockLaTeX :: GenParser Char ParserState String lhsCodeBlockLaTeX = try $ do @@ -502,8 +497,8 @@ listLine = try $ do notFollowedBy' (do indentSpaces many (spaceChar) listStart) - line <- manyTill anyChar newline - return $ line ++ "\n" + chunks <- manyTill (htmlComment <|> count 1 anyChar) newline + return $ concat chunks ++ "\n" -- parse raw text for one list item, excluding start marker and continuations rawListItem :: GenParser Char ParserState [Char] @@ -560,38 +555,61 @@ bulletList = try $ do -- definition lists -definitionListItem :: GenParser Char ParserState ([Inline], [Block]) +defListMarker :: GenParser Char ParserState () +defListMarker = do + sps <- nonindentSpaces + char ':' <|> char '~' + st <- getState + let tabStop = stateTabStop st + let remaining = tabStop - (length sps + 1) + if remaining > 0 + then count remaining (char ' ') <|> string "\t" + else pzero + return () + +definitionListItem :: GenParser Char ParserState ([Inline], [[Block]]) definitionListItem = try $ do - notFollowedBy blankline - notFollowedBy' indentSpaces -- first, see if this has any chance of being a definition list: - lookAhead (anyLine >> char ':') + lookAhead (anyLine >> optional blankline >> defListMarker) term <- manyTill inline newline + optional blankline raw <- many1 defRawBlock state <- getState let oldContext = stateParserContext state -- parse the extracted block, which may contain various block elements: - contents <- parseFromString parseBlocks $ concat raw + contents <- mapM (parseFromString parseBlocks) raw updateState (\st -> st {stateParserContext = oldContext}) return ((normalizeSpaces term), contents) defRawBlock :: GenParser Char ParserState [Char] defRawBlock = try $ do - char ':' - state <- getState - let tabStop = stateTabStop state - try (count (tabStop - 1) (char ' ')) <|> (many (char ' ') >> string "\t") + defListMarker firstline <- anyLine rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine) trailing <- option "" blanklines - return $ firstline ++ "\n" ++ unlines rawlines ++ trailing + cont <- liftM concat $ many $ do + lns <- many1 $ notFollowedBy blankline >> indentSpaces >> anyLine + trl <- option "" blanklines + return $ unlines lns ++ trl + return $ firstline ++ "\n" ++ unlines rawlines ++ trailing ++ cont definitionList :: GenParser Char ParserState Block definitionList = do items <- many1 definitionListItem - let (terms, defs) = unzip items - let defs' = compactify defs - let items' = zip terms defs' + -- "compactify" the definition list: + let defs = map snd items + let defBlocks = reverse $ concat $ concat defs + let isPara (Para _) = True + isPara _ = False + let items' = case take 1 defBlocks of + [Para x] -> if not $ any isPara (drop 1 defBlocks) + then let (t,ds) = last items + lastDef = last ds + ds' = init ds ++ + [init lastDef ++ [Plain x]] + in init items ++ [(t, ds')] + else items + _ -> items return $ DefinitionList items' -- @@ -681,26 +699,36 @@ dashedLine ch = do return $ (length dashes, length $ dashes ++ sp) -- Parse a table header with dashed lines of '-' preceded by --- one line of text. -simpleTableHeader :: GenParser Char ParserState ([[Char]], [Alignment], [Int]) -simpleTableHeader = try $ do - rawContent <- anyLine +-- one (or zero) line of text. +simpleTableHeader :: Bool -- ^ Headerless table + -> GenParser Char ParserState ([[Char]], [Alignment], [Int]) +simpleTableHeader headless = try $ do + rawContent <- if headless + then return "" + else anyLine initSp <- nonindentSpaces dashes <- many1 (dashedLine '-') newline let (lengths, lines') = unzip dashes let indices = scanl (+) (length initSp) lines' - let rawHeads = tail $ splitByIndices (init indices) rawContent + -- If no header, calculate alignment on basis of first row of text + rawHeads <- liftM (tail . splitByIndices (init indices)) $ + if headless + then lookAhead anyLine + else return rawContent let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths - return (rawHeads, aligns, indices) + let rawHeads' = if headless + then replicate (length dashes) "" + else rawHeads + return (rawHeads', aligns, indices) -- Parse a table footer - dashed lines followed by blank line. tableFooter :: GenParser Char ParserState [Char] tableFooter = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> blanklines -- Parse a table separator - dashed line. -tableSep :: GenParser Char ParserState String -tableSep = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> string "\n" +tableSep :: GenParser Char ParserState Char +tableSep = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> char '\n' -- Parse a raw line and split it into chunks by indices. rawTableLine :: [Int] @@ -731,7 +759,17 @@ widthsFromIndices :: Int -- Number of columns on terminal -> [Double] -- Fractional relative sizes of columns widthsFromIndices _ [] = [] widthsFromIndices numColumns indices = - let lengths = zipWith (-) indices (0:indices) + let lengths' = zipWith (-) indices (0:indices) + lengths = reverse $ + case reverse lengths' of + [] -> [] + [x] -> [x] + -- compensate for the fact that intercolumn + -- spaces are counted in widths of all columns + -- but the last... + (x:y:zs) -> if x < y && y - x <= 2 + then y:y:zs + else x:y:zs totLength = sum lengths quotient = if totLength > numColumns then fromIntegral totLength @@ -765,30 +803,48 @@ tableWith headerParser lineParser footerParser = try $ do return $ Table caption aligns widths heads lines' -- Parse a simple table with '---' header and one line per row. -simpleTable :: GenParser Char ParserState Block -simpleTable = tableWith simpleTableHeader tableLine blanklines +simpleTable :: Bool -- ^ Headerless table + -> GenParser Char ParserState Block +simpleTable headless = do + Table c a _w h l <- tableWith (simpleTableHeader headless) tableLine + (if headless then tableFooter else tableFooter <|> blanklines) + -- Simple tables get 0s for relative column widths (i.e., use default) + return $ Table c a (replicate (length a) 0) h l -- Parse a multiline table: starts with row of '-' on top, then header -- (which may be multiline), then the rows, -- which may be multiline, separated by blank lines, and -- ending with a footer (dashed line followed by blank line). -multilineTable :: GenParser Char ParserState Block -multilineTable = tableWith multilineTableHeader multilineRow tableFooter - -multilineTableHeader :: GenParser Char ParserState ([String], [Alignment], [Int]) -multilineTableHeader = try $ do - tableSep - rawContent <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline) +multilineTable :: Bool -- ^ Headerless table + -> GenParser Char ParserState Block +multilineTable headless = + tableWith (multilineTableHeader headless) multilineRow tableFooter + +multilineTableHeader :: Bool -- ^ Headerless table + -> GenParser Char ParserState ([String], [Alignment], [Int]) +multilineTableHeader headless = try $ do + if headless + then return '\n' + else tableSep + rawContent <- if headless + then return $ repeat "" + else many1 + (notFollowedBy tableSep >> many1Till anyChar newline) initSp <- nonindentSpaces dashes <- many1 (dashedLine '-') newline let (lengths, lines') = unzip dashes let indices = scanl (+) (length initSp) lines' - let rawHeadsList = transpose $ map - (\ln -> tail $ splitByIndices (init indices) ln) - rawContent - let rawHeads = map (intercalate " ") rawHeadsList + rawHeadsList <- if headless + then liftM (map (:[]) . tail . + splitByIndices (init indices)) $ lookAhead anyLine + else return $ transpose $ map + (\ln -> tail $ splitByIndices (init indices) ln) + rawContent let aligns = zipWith alignType rawHeadsList lengths + let rawHeads = if headless + then replicate (length dashes) "" + else map (intercalate " ") rawHeadsList return ((map removeLeadingTrailingSpace rawHeads), aligns, indices) -- Returns an alignment type for a table, based on a list of strings @@ -810,7 +866,8 @@ alignType strLst len = (False, False) -> AlignDefault table :: GenParser Char ParserState Block -table = simpleTable <|> multilineTable <?> "table" +table = multilineTable False <|> simpleTable True <|> + simpleTable False <|> multilineTable True <?> "table" -- -- inline @@ -826,6 +883,7 @@ inlineParsers = [ str , endline , code , charRef + , (fourOrMore '*' <|> fourOrMore '_') , strong , emph , note @@ -862,10 +920,10 @@ escapedChar = do result <- option '\\' $ if stateStrict state then oneOf "\\`*_{}[]()>#+-.!~" else satisfy (not . isAlphaNum) - let result' = if result == ' ' - then '\160' -- '\ ' is a nonbreaking space - else result - return $ Str [result'] + return $ case result of + ' ' -> Str "\160" -- "\ " is a nonbreaking space + '\n' -> LineBreak -- "\[newline]" is a linebreak + _ -> Str [result] ltSign :: GenParser Char ParserState Inline ltSign = do @@ -895,8 +953,13 @@ code = try $ do return $ Code $ removeLeadingTrailingSpace $ concat result mathWord :: GenParser Char st [Char] -mathWord = many1 ((noneOf " \t\n\\$") <|> - (try (char '\\') >>~ notFollowedBy (char '$'))) +mathWord = liftM concat $ many1 mathChunk + +mathChunk :: GenParser Char st [Char] +mathChunk = do char '\\' + c <- anyChar + return ['\\',c] + <|> many1 (noneOf " \t\n\\$") math :: GenParser Char ParserState Inline math = (mathDisplay >>= return . Math DisplayMath) @@ -918,6 +981,12 @@ mathInline = try $ do notFollowedBy digit return $ intercalate " " words' +-- to avoid performance problems, treat 4 or more _ or * in a row as a literal +-- rather than attempting to parse for emph/strong +fourOrMore :: Char -> GenParser Char st Inline +fourOrMore c = try $ count 4 (char c) >> many (char c) >>= \s -> + return (Str $ replicate 4 c ++ s) + emph :: GenParser Char ParserState Inline emph = ((enclosed (char '*') (notFollowedBy' strong >> char '*') inline) <|> (enclosed (char '_') (notFollowedBy' strong >> char '_' >> @@ -1106,7 +1175,7 @@ source' = do tit <- option "" linkTitle skipSpaces eof - return (intercalate "%20" $ words $ removeTrailingSpace src, tit) + return (intercalate "+" $ words $ removeTrailingSpace src, tit) linkTitle :: GenParser Char st String linkTitle = try $ do @@ -1167,8 +1236,8 @@ note = try $ do state <- getState let notes = stateNotes state case lookup ref notes of - Nothing -> fail "note not found" - Just contents -> return $ Note contents + Nothing -> fail "note not found" + Just raw -> liftM Note $ parseFromString parseBlocks raw inlineNote :: GenParser Char ParserState Inline inlineNote = try $ do |