Imported Upstream version 1.3

author: dr@jones.dk <dr@jones.dk> 2009-12-14 12:57:35 +0100
committer: dr@jones.dk <dr@jones.dk> 2009-12-14 12:57:35 +0100
commit: 789d0772d8b5d9c066fb8624bd51576cbde5e30b (patch)
tree: 7141187124ecc41b13861c81c7b642076cb88078 /src/Text/Pandoc/Readers/Markdown.hs
parent: 88b315ccee666385e1a4c52e2eb5fb0b0ffe8d60 (diff)
1 files changed, 144 insertions, 75 deletions
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index ae682e72e..0de700537 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -42,13 +42,15 @@ import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' )
 import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag, 
                                   anyHtmlInlineTag, anyHtmlTag,
                                   anyHtmlEndTag, htmlEndTag, extractTagType,
-                                  htmlBlockElement, unsanitaryURI )
+                                  htmlBlockElement, htmlComment, unsanitaryURI )
 import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
 import Text.ParserCombinators.Parsec
-import Control.Monad (when)
+import Control.Monad (when, liftM, unless)
 
 -- | Read markdown from an input string and return a Pandoc document.
-readMarkdown :: ParserState -> String -> Pandoc
+readMarkdown :: ParserState -- ^ Parser state, including options for parser
+             -> String      -- ^ String to parse (assuming @'\n'@ line endings)
+             -> Pandoc
 readMarkdown state s = (readWith parseMarkdown) state (s ++ "\n\n")
 
 --
@@ -107,7 +109,7 @@ failUnlessBeginningOfLine = do
 failUnlessSmart :: GenParser tok ParserState ()
 failUnlessSmart = do
   state <- getState
-  if stateSmart state then return () else fail "Smart typography feature"
+  if stateSmart state then return () else pzero
 
 -- | Parse a sequence of inline elements between square brackets,
 -- including inlines between balanced pairs of square brackets.
@@ -116,9 +118,7 @@ inlinesInBalancedBrackets :: GenParser Char ParserState Inline
 inlinesInBalancedBrackets parser = try $ do
   char '['
   result <- manyTill ( (do lookAhead $ try $ do (Str res) <- parser
-                                                if res == "["
-                                                   then return ()
-                                                   else pzero
+                                                unless (res == "[") pzero
                            bal <- inlinesInBalancedBrackets parser
                            return $ [Str "["] ++ bal ++ [Str "]"])
                        <|> (count 1 parser))
@@ -162,23 +162,18 @@ parseMarkdown = do
   -- markdown allows raw HTML
   updateState (\state -> state { stateParseRaw = True })
   startPos <- getPosition
-  -- go through once just to get list of reference keys
-  -- docMinusKeys is the raw document with blanks where the keys were...
-  docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= 
-                  return . concat
+  -- go through once just to get list of reference keys and notes
+  -- docMinusKeys is the raw document with blanks where the keys/notes were...
+  st <- getState
+  let firstPassParser = referenceKey
+                     <|> (if stateStrict st then pzero else noteBlock)
+                     <|> lineClump
+  docMinusKeys <- liftM concat $ manyTill firstPassParser eof
   setInput docMinusKeys
   setPosition startPos
-  st <- getState
-  -- go through again for notes unless strict...
-  if stateStrict st
-     then return ()
-     else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>= 
-                              return . concat
-             st' <- getState
-             let reversedNotes = stateNotes st'
-             updateState $ \s -> s { stateNotes = reverse reversedNotes }
-             setInput docMinusNotes
-             setPosition startPos
+  st' <- getState
+  let reversedNotes = stateNotes st'
+  updateState $ \s -> s { stateNotes = reverse reversedNotes }
   -- now parse it for real...
   (title, author, date) <- option ([],[],"") titleBlock
   blocks <- parseBlocks
@@ -201,7 +196,7 @@ referenceKey = try $ do
   tit <- option "" referenceTitle
   blanklines
   endPos <- getPosition
-  let newkey = (lab, (intercalate "%20" $ words $ removeTrailingSpace src,  tit))
+  let newkey = (lab, (intercalate "+" $ words $ removeTrailingSpace src,  tit))
   st <- getState
   let oldkeys = stateKeys st
   updateState $ \s -> s { stateKeys = newkey : oldkeys }
@@ -241,9 +236,7 @@ noteBlock = try $ do
   raw <- sepBy rawLines (try (blankline >> indentSpaces))
   optional blanklines
   endPos <- getPosition
-  -- parse the extracted text, which may contain various block elements:
-  contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n"
-  let newnote = (ref, contents)
+  let newnote = (ref, (intercalate "\n" raw) ++ "\n\n")
   st <- getState
   let oldnotes = stateNotes st
   updateState $ \s -> s { stateNotes = newnote : oldnotes }
@@ -399,13 +392,15 @@ codeBlockIndented = do
                              l <- indentedLine
                              return $ b ++ l))
   optional blanklines
-  return $ CodeBlock ("",[],[]) $ stripTrailingNewlines $ concat contents
+  st <- getState
+  return $ CodeBlock ("", stateIndentedCodeClasses st, []) $
+           stripTrailingNewlines $ concat contents
 
 lhsCodeBlock :: GenParser Char ParserState Block
 lhsCodeBlock = do
   failUnlessLHS
   contents <- lhsCodeBlockBird <|> lhsCodeBlockLaTeX
-  return $ CodeBlock ("",["sourceCode","haskell"],[]) contents
+  return $ CodeBlock ("",["sourceCode","literate","haskell"],[]) contents
 
 lhsCodeBlockLaTeX :: GenParser Char ParserState String
 lhsCodeBlockLaTeX = try $ do
@@ -502,8 +497,8 @@ listLine = try $ do
   notFollowedBy' (do indentSpaces
                      many (spaceChar)
                      listStart)
-  line <- manyTill anyChar newline
-  return $ line ++ "\n"
+  chunks <- manyTill (htmlComment <|> count 1 anyChar) newline
+  return $ concat chunks ++ "\n"
 
 -- parse raw text for one list item, excluding start marker and continuations
 rawListItem :: GenParser Char ParserState [Char]
@@ -560,38 +555,61 @@ bulletList = try $ do
 
 -- definition lists
 
-definitionListItem :: GenParser Char ParserState ([Inline], [Block])
+defListMarker :: GenParser Char ParserState ()
+defListMarker = do
+  sps <- nonindentSpaces
+  char ':' <|> char '~'
+  st <- getState
+  let tabStop = stateTabStop st
+  let remaining = tabStop - (length sps + 1)
+  if remaining > 0
+     then count remaining (char ' ') <|> string "\t"
+     else pzero
+  return ()
+
+definitionListItem :: GenParser Char ParserState ([Inline], [[Block]])
 definitionListItem = try $ do
-  notFollowedBy blankline
-  notFollowedBy' indentSpaces
   -- first, see if this has any chance of being a definition list:
-  lookAhead (anyLine >> char ':')
+  lookAhead (anyLine >> optional blankline >> defListMarker)
   term <- manyTill inline newline
+  optional blankline
   raw <- many1 defRawBlock
   state <- getState
   let oldContext = stateParserContext state
   -- parse the extracted block, which may contain various block elements:
-  contents <- parseFromString parseBlocks $ concat raw
+  contents <- mapM (parseFromString parseBlocks) raw
   updateState (\st -> st {stateParserContext = oldContext})
   return ((normalizeSpaces term), contents)
 
 defRawBlock :: GenParser Char ParserState [Char]
 defRawBlock = try $ do
-  char ':'
-  state <- getState
-  let tabStop = stateTabStop state
-  try (count (tabStop - 1) (char ' ')) <|> (many (char ' ') >> string "\t")
+  defListMarker
   firstline <- anyLine
   rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine)
   trailing <- option "" blanklines
-  return $ firstline ++ "\n" ++ unlines rawlines ++ trailing
+  cont <- liftM concat $ many $ do
+            lns <- many1 $ notFollowedBy blankline >> indentSpaces >> anyLine
+            trl <- option "" blanklines
+            return $ unlines lns ++ trl
+  return $ firstline ++ "\n" ++ unlines rawlines ++ trailing ++ cont
 
 definitionList :: GenParser Char ParserState Block
 definitionList = do
   items <- many1 definitionListItem
-  let (terms, defs) = unzip items
-  let defs' = compactify defs
-  let items' = zip terms defs'
+  -- "compactify" the definition list:
+  let defs = map snd items
+  let defBlocks = reverse $ concat $ concat defs
+  let isPara (Para _) = True
+      isPara _        = False
+  let items' = case take 1 defBlocks of
+                [Para x]   -> if not $ any isPara (drop 1 defBlocks)
+                                 then let (t,ds) = last items
+                                          lastDef = last ds
+                                          ds' = init ds ++
+                                                [init lastDef ++ [Plain x]]
+                                       in init items ++ [(t, ds')]
+                                 else items
+                _          -> items
   return $ DefinitionList items'
 
 --
@@ -681,26 +699,36 @@ dashedLine ch = do
   return $ (length dashes, length $ dashes ++ sp)
 
 -- Parse a table header with dashed lines of '-' preceded by 
--- one line of text.
-simpleTableHeader :: GenParser Char ParserState ([[Char]], [Alignment], [Int])
-simpleTableHeader = try $ do
-  rawContent  <- anyLine
+-- one (or zero) line of text.
+simpleTableHeader :: Bool  -- ^ Headerless table 
+                  -> GenParser Char ParserState ([[Char]], [Alignment], [Int])
+simpleTableHeader headless = try $ do
+  rawContent  <- if headless
+                    then return ""
+                    else anyLine
   initSp      <- nonindentSpaces
   dashes      <- many1 (dashedLine '-')
   newline
   let (lengths, lines') = unzip dashes
   let indices  = scanl (+) (length initSp) lines'
-  let rawHeads = tail $ splitByIndices (init indices) rawContent
+  -- If no header, calculate alignment on basis of first row of text
+  rawHeads <- liftM (tail . splitByIndices (init indices)) $
+              if headless
+                 then lookAhead anyLine 
+                 else return rawContent
   let aligns   = zipWith alignType (map (\a -> [a]) rawHeads) lengths
-  return (rawHeads, aligns, indices)
+  let rawHeads' = if headless
+                     then replicate (length dashes) ""
+                     else rawHeads 
+  return (rawHeads', aligns, indices)
 
 -- Parse a table footer - dashed lines followed by blank line.
 tableFooter :: GenParser Char ParserState [Char]
 tableFooter = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> blanklines
 
 -- Parse a table separator - dashed line.
-tableSep :: GenParser Char ParserState String
-tableSep = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> string "\n"
+tableSep :: GenParser Char ParserState Char
+tableSep = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> char '\n'
 
 -- Parse a raw line and split it into chunks by indices.
 rawTableLine :: [Int]
@@ -731,7 +759,17 @@ widthsFromIndices :: Int      -- Number of columns on terminal
                   -> [Double] -- Fractional relative sizes of columns
 widthsFromIndices _ [] = []  
 widthsFromIndices numColumns indices = 
-  let lengths = zipWith (-) indices (0:indices)
+  let lengths' = zipWith (-) indices (0:indices)
+      lengths  = reverse $
+                 case reverse lengths' of
+                      []       -> []
+                      [x]      -> [x]
+                      -- compensate for the fact that intercolumn
+                      -- spaces are counted in widths of all columns
+                      -- but the last...
+                      (x:y:zs) -> if x < y && y - x <= 2
+                                     then y:y:zs
+                                     else x:y:zs
       totLength = sum lengths
       quotient = if totLength > numColumns
                    then fromIntegral totLength
@@ -765,30 +803,48 @@ tableWith headerParser lineParser footerParser = try $ do
     return $ Table caption aligns widths heads lines'
 
 -- Parse a simple table with '---' header and one line per row.
-simpleTable :: GenParser Char ParserState Block
-simpleTable = tableWith simpleTableHeader tableLine blanklines
+simpleTable :: Bool  -- ^ Headerless table
+            -> GenParser Char ParserState Block
+simpleTable headless = do
+  Table c a _w h l <- tableWith (simpleTableHeader headless) tableLine
+               (if headless then tableFooter else tableFooter <|> blanklines)
+  -- Simple tables get 0s for relative column widths (i.e., use default)
+  return $ Table c a (replicate (length a) 0) h l
 
 -- Parse a multiline table:  starts with row of '-' on top, then header
 -- (which may be multiline), then the rows,
 -- which may be multiline, separated by blank lines, and
 -- ending with a footer (dashed line followed by blank line).
-multilineTable :: GenParser Char ParserState Block
-multilineTable = tableWith multilineTableHeader multilineRow tableFooter
-
-multilineTableHeader :: GenParser Char ParserState ([String], [Alignment], [Int])
-multilineTableHeader = try $ do
-  tableSep 
-  rawContent  <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline)
+multilineTable :: Bool -- ^ Headerless table
+               -> GenParser Char ParserState Block
+multilineTable headless =
+  tableWith (multilineTableHeader headless) multilineRow tableFooter
+
+multilineTableHeader :: Bool -- ^ Headerless table
+                     -> GenParser Char ParserState ([String], [Alignment], [Int])
+multilineTableHeader headless = try $ do
+  if headless
+     then return '\n'
+     else tableSep
+  rawContent  <- if headless
+                    then return $ repeat "" 
+                    else many1
+                         (notFollowedBy tableSep >> many1Till anyChar newline)
   initSp      <- nonindentSpaces
   dashes      <- many1 (dashedLine '-')
   newline
   let (lengths, lines') = unzip dashes
   let indices  = scanl (+) (length initSp) lines'
-  let rawHeadsList = transpose $ map 
-                     (\ln -> tail $ splitByIndices (init indices) ln)
-                     rawContent
-  let rawHeads = map (intercalate " ") rawHeadsList
+  rawHeadsList <- if headless
+                     then liftM (map (:[]) . tail .
+                              splitByIndices (init indices)) $ lookAhead anyLine
+                     else return $ transpose $ map 
+                           (\ln -> tail $ splitByIndices (init indices) ln)
+                           rawContent
   let aligns   = zipWith alignType rawHeadsList lengths
+  let rawHeads = if headless
+                    then replicate (length dashes) ""
+                    else map (intercalate " ") rawHeadsList
   return ((map removeLeadingTrailingSpace rawHeads), aligns, indices)
 
 -- Returns an alignment type for a table, based on a list of strings
@@ -810,7 +866,8 @@ alignType strLst len =
         (False, False)   -> AlignDefault
 
 table :: GenParser Char ParserState Block
-table = simpleTable <|> multilineTable <?> "table"
+table = multilineTable False <|> simpleTable True <|>
+        simpleTable False <|> multilineTable True <?> "table"
 
 -- 
 -- inline
@@ -826,6 +883,7 @@ inlineParsers = [ str
                 , endline
                 , code
                 , charRef
+                , (fourOrMore '*' <|> fourOrMore '_')
                 , strong
                 , emph
                 , note
@@ -862,10 +920,10 @@ escapedChar = do
   result <- option '\\' $ if stateStrict state 
                              then oneOf "\\`*_{}[]()>#+-.!~"
                              else satisfy (not . isAlphaNum)
-  let result' = if result == ' '
-                   then '\160'  -- '\ ' is a nonbreaking space
-                   else result
-  return $ Str [result']
+  return $ case result of
+                ' '   -> Str "\160" -- "\ " is a nonbreaking space
+                '\n'  -> LineBreak  -- "\[newline]" is a linebreak
+                _     -> Str [result]
 
 ltSign :: GenParser Char ParserState Inline
 ltSign = do
@@ -895,8 +953,13 @@ code = try $ do
   return $ Code $ removeLeadingTrailingSpace $ concat result
 
 mathWord :: GenParser Char st [Char]
-mathWord = many1 ((noneOf " \t\n\\$") <|>
-                  (try (char '\\') >>~ notFollowedBy (char '$')))
+mathWord = liftM concat $ many1 mathChunk
+
+mathChunk :: GenParser Char st [Char]
+mathChunk = do char '\\'
+               c <- anyChar
+               return ['\\',c]
+        <|> many1 (noneOf " \t\n\\$")
 
 math :: GenParser Char ParserState Inline
 math = (mathDisplay >>= return . Math DisplayMath)
@@ -918,6 +981,12 @@ mathInline = try $ do
   notFollowedBy digit
   return $ intercalate " " words'
 
+-- to avoid performance problems, treat 4 or more _ or * in a row as a literal
+-- rather than attempting to parse for emph/strong
+fourOrMore :: Char -> GenParser Char st Inline
+fourOrMore c = try $ count 4 (char c) >> many (char c) >>= \s ->
+                       return (Str $ replicate 4 c ++ s)
+
 emph :: GenParser Char ParserState Inline
 emph = ((enclosed (char '*') (notFollowedBy' strong >> char '*') inline) <|>
         (enclosed (char '_') (notFollowedBy' strong >> char '_' >> 
@@ -1106,7 +1175,7 @@ source' = do
   tit <- option "" linkTitle
   skipSpaces
   eof
-  return (intercalate "%20" $ words $ removeTrailingSpace src, tit)
+  return (intercalate "+" $ words $ removeTrailingSpace src, tit)
 
 linkTitle :: GenParser Char st String
 linkTitle = try $ do 
@@ -1167,8 +1236,8 @@ note = try $ do
   state <- getState
   let notes = stateNotes state
   case lookup ref notes of
-    Nothing       -> fail "note not found"
-    Just contents -> return $ Note contents
+    Nothing   -> fail "note not found"
+    Just raw  -> liftM Note $ parseFromString parseBlocks raw
 
 inlineNote :: GenParser Char ParserState Inline
 inlineNote = try $ do
author	dr@jones.dk <dr@jones.dk>	2009-12-14 12:57:35 +0100
committer	dr@jones.dk <dr@jones.dk>	2009-12-14 12:57:35 +0100
commit	789d0772d8b5d9c066fb8624bd51576cbde5e30b (patch)
tree	7141187124ecc41b13861c81c7b642076cb88078 /src/Text/Pandoc/Readers/Markdown.hs
parent	88b315ccee666385e1a4c52e2eb5fb0b0ffe8d60 (diff)