diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 84 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/LaTeX.hs | 39 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 127 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/RST.hs | 178 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/TeXMath.hs | 253 |
5 files changed, 232 insertions, 449 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 5c188e3d9..f47309d3f 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -44,13 +44,14 @@ module Text.Pandoc.Readers.HTML ( import Text.ParserCombinators.Parsec import Text.Pandoc.Definition -import Text.Pandoc.Shared +import Text.Pandoc.Shared +import Text.Pandoc.Parsing import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) import Data.Maybe ( fromMaybe ) import Data.List ( isPrefixOf, isSuffixOf, intercalate ) import Data.Char ( toLower, isAlphaNum ) import Network.URI ( parseURIReference, URI (..) ) -import Control.Monad ( liftM ) +import Control.Monad ( liftM, when ) -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ParserState -- ^ Parser state @@ -198,11 +199,11 @@ inlinesTilEnd tag = manyTill inline (htmlEndTag tag) -- | Parse blocks between open and close tag. blocksIn :: String -> GenParser Char ParserState [Block] -blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag +blocksIn tag = try $ htmlOpenTag tag >> spaces >> blocksTilEnd tag -- | Parse inlines between open and close tag. inlinesIn :: String -> GenParser Char ParserState [Inline] -inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag +inlinesIn tag = try $ htmlOpenTag tag >> spaces >> inlinesTilEnd tag -- | Extract type from a tag: e.g. @br@ from @\<br\>@ extractTagType :: String -> String @@ -258,18 +259,33 @@ anyHtmlEndTag = try $ do then return $ "<!-- unsafe HTML removed -->" else return result -htmlTag :: String -> GenParser Char ParserState (String, [(String, String)]) -htmlTag tag = try $ do +htmlTag :: Bool + -> String + -> GenParser Char ParserState (String, [(String, String)]) +htmlTag selfClosing tag = try $ do char '<' spaces stringAnyCase tag attribs <- many htmlAttribute spaces - optional (string "/") - spaces + -- note: we want to handle both HTML and XHTML, + -- so we don't require the / + when selfClosing $ optional $ char '/' >> spaces char '>' return (tag, (map (\(name, content, _) -> (name, content)) attribs)) +htmlOpenTag :: String + -> GenParser Char ParserState (String, [(String, String)]) +htmlOpenTag = htmlTag False + +htmlCloseTag :: String + -> GenParser Char ParserState (String, [(String, String)]) +htmlCloseTag = htmlTag False . ('/':) + +htmlSelfClosingTag :: String + -> GenParser Char ParserState (String, [(String, String)]) +htmlSelfClosingTag = htmlTag True + -- parses a quoted html attribute value quoted :: Char -> GenParser Char st (String, String) quoted quoteChar = do @@ -344,7 +360,7 @@ anyHtmlInlineTag = try $ do -- Scripts must be treated differently, because they can contain '<>' etc. htmlScript :: GenParser Char ParserState [Char] htmlScript = try $ do - lookAhead $ htmlTag "script" + lookAhead $ htmlOpenTag "script" open <- anyHtmlTag rest <- liftM concat $ manyTill scriptChunk (htmlEndTag "script") st <- getState @@ -379,7 +395,7 @@ scriptChunk = jsComment <|> jsString <|> jsChars -- Style tags must be treated differently, because they can contain CSS htmlStyle :: GenParser Char ParserState [Char] htmlStyle = try $ do - lookAhead $ htmlTag "style" + lookAhead $ htmlOpenTag "style" open <- anyHtmlTag rest <- manyTill anyChar (htmlEndTag "style") st <- getState @@ -411,7 +427,8 @@ rawVerbatimBlock = try $ do -- We don't want to parse </body> or </html> as raw HTML, since these -- are handled in parseHtml. rawHtmlBlock' :: GenParser Char ParserState Block -rawHtmlBlock' = do notFollowedBy' (htmlTag "/body" <|> htmlTag "/html") +rawHtmlBlock' = do notFollowedBy' (htmlCloseTag "body" <|> + htmlCloseTag "html") rawHtmlBlock -- | Parses an HTML comment. @@ -441,13 +458,13 @@ definition = try $ do nonTitleNonHead :: GenParser Char ParserState Char nonTitleNonHead = try $ do - notFollowedBy $ (htmlTag "title" >> return ' ') <|> + notFollowedBy $ (htmlOpenTag "title" >> return ' ') <|> (htmlEndTag "head" >> return ' ') (rawHtmlBlock >> return ' ') <|> anyChar parseTitle :: GenParser Char ParserState [Inline] parseTitle = try $ do - (tag, _) <- htmlTag "title" + (tag, _) <- htmlOpenTag "title" contents <- inlinesTilEnd tag spaces return contents @@ -455,7 +472,7 @@ parseTitle = try $ do -- parse header and return meta-information (for now, just title) parseHead :: GenParser Char ParserState Meta parseHead = try $ do - htmlTag "head" + htmlOpenTag "head" spaces skipMany nonTitleNonHead contents <- option [] parseTitle @@ -463,13 +480,10 @@ parseHead = try $ do htmlEndTag "head" return $ Meta contents [] [] -skipHtmlTag :: String -> GenParser Char ParserState () -skipHtmlTag tag = optional (htmlTag tag) - -- h1 class="title" representation of title in body bodyTitle :: GenParser Char ParserState [Inline] bodyTitle = try $ do - (_, attribs) <- htmlTag "h1" + (_, attribs) <- htmlOpenTag "h1" case (extractAttribute "class" attribs) of Just "title" -> return "" _ -> fail "not title" @@ -487,11 +501,11 @@ parseHtml :: GenParser Char ParserState Pandoc parseHtml = do sepEndBy (choice [xmlDec, definition, htmlComment]) spaces spaces - skipHtmlTag "html" + optional $ htmlOpenTag "html" spaces meta <- option (Meta [] [] []) parseHead spaces - skipHtmlTag "body" + optional $ htmlOpenTag "body" spaces optional bodyTitle -- skip title in body, because it's represented in meta blocks <- parseBlocks @@ -527,7 +541,7 @@ header = choice (map headerLevel (enumFromTo 1 5)) <?> "header" headerLevel :: Int -> GenParser Char ParserState Block headerLevel n = try $ do let level = "h" ++ show n - htmlTag level + htmlOpenTag level contents <- inlinesTilEnd level return $ Header n (normalizeSpaces contents) @@ -537,7 +551,7 @@ headerLevel n = try $ do hrule :: GenParser Char ParserState Block hrule = try $ do - (_, attribs) <- htmlTag "hr" + (_, attribs) <- htmlSelfClosingTag "hr" state <- getState if not (null attribs) && stateParseRaw state then unexpected "attributes in hr" -- parse as raw in this case @@ -551,7 +565,7 @@ hrule = try $ do -- skipped, because they are not portable to output formats other than HTML. codeBlock :: GenParser Char ParserState Block codeBlock = try $ do - htmlTag "pre" + htmlOpenTag "pre" result <- manyTill (many1 (satisfy (/= '<')) <|> ((anyHtmlTag <|> anyHtmlEndTag) >> return "")) @@ -572,7 +586,7 @@ codeBlock = try $ do -- blockQuote :: GenParser Char ParserState Block -blockQuote = try $ htmlTag "blockquote" >> spaces >> +blockQuote = try $ htmlOpenTag "blockquote" >> spaces >> blocksTilEnd "blockquote" >>= (return . BlockQuote) -- @@ -584,7 +598,7 @@ list = choice [ bulletList, orderedList, definitionList ] <?> "list" orderedList :: GenParser Char ParserState Block orderedList = try $ do - (_, attribs) <- htmlTag "ol" + (_, attribs) <- htmlOpenTag "ol" (start, style) <- option (1, DefaultStyle) $ do failIfStrict let sta = fromMaybe "1" $ @@ -609,7 +623,7 @@ orderedList = try $ do bulletList :: GenParser Char ParserState Block bulletList = try $ do - htmlTag "ul" + htmlOpenTag "ul" spaces -- note: if they have an <ol> or <ul> not in scope of a <li>, -- treat it as a list item, though it's not valid xhtml... @@ -620,7 +634,7 @@ bulletList = try $ do definitionList :: GenParser Char ParserState Block definitionList = try $ do failIfStrict -- def lists not part of standard markdown - htmlTag "dl" + htmlOpenTag "dl" spaces items <- sepEndBy1 definitionListItem spaces htmlEndTag "dl" @@ -638,7 +652,7 @@ definitionListItem = try $ do -- para :: GenParser Char ParserState Block -para = try $ htmlTag "p" >> inlinesTilEnd "p" >>= +para = try $ htmlOpenTag "p" >> inlinesTilEnd "p" >>= return . Para . normalizeSpaces -- @@ -672,8 +686,8 @@ inline = choice [ charRef code :: GenParser Char ParserState Inline code = try $ do - htmlTag "code" - result <- manyTill anyChar (htmlEndTag "code") + result <- (htmlOpenTag "code" >> manyTill anyChar (htmlEndTag "code")) + <|> (htmlOpenTag "tt" >> manyTill anyChar (htmlEndTag "tt")) -- remove internal line breaks, leading and trailing space, -- and decode character references return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ @@ -686,7 +700,7 @@ rawHtmlInline = do if stateParseRaw state then return (HtmlInline result) else return (Str "") betweenTags :: [Char] -> GenParser Char ParserState [Inline] -betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>= +betweenTags tag = try $ htmlOpenTag tag >> inlinesTilEnd tag >>= return . normalizeSpaces emph :: GenParser Char ParserState Inline @@ -708,7 +722,7 @@ strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>= spanStrikeout :: GenParser Char ParserState Inline spanStrikeout = try $ do failIfStrict -- strict markdown has no strikeout, so treat as raw HTML - (_, attributes) <- htmlTag "span" + (_, attributes) <- htmlOpenTag "span" result <- case (extractAttribute "class" attributes) of Just "strikeout" -> inlinesTilEnd "span" _ -> fail "not a strikeout" @@ -719,7 +733,7 @@ whitespace = many1 space >> return Space -- hard line break linebreak :: GenParser Char ParserState Inline -linebreak = htmlTag "br" >> optional newline >> return LineBreak +linebreak = htmlSelfClosingTag "br" >> optional newline >> return LineBreak str :: GenParser Char st Inline str = many1 (noneOf "< \t\n&") >>= return . Str @@ -740,7 +754,7 @@ extractAttribute name ((attrName, contents):rest) = link :: GenParser Char ParserState Inline link = try $ do - (_, attributes) <- htmlTag "a" + (_, attributes) <- htmlOpenTag "a" url <- case (extractAttribute "href" attributes) of Just url -> return url Nothing -> fail "no href" @@ -750,7 +764,7 @@ link = try $ do image :: GenParser Char ParserState Inline image = try $ do - (_, attributes) <- htmlTag "img" + (_, attributes) <- htmlSelfClosingTag "img" url <- case (extractAttribute "src" attributes) of Just url -> return url Nothing -> fail "no src" diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 36940fab0..406809dfc 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -35,7 +35,8 @@ module Text.Pandoc.Readers.LaTeX ( import Text.ParserCombinators.Parsec import Text.Pandoc.Definition -import Text.Pandoc.Shared +import Text.Pandoc.Shared +import Text.Pandoc.Parsing import Data.Maybe ( fromMaybe ) import Data.Char ( chr ) import Data.List ( isPrefixOf, isSuffixOf ) @@ -167,16 +168,37 @@ block = choice [ hrule -- header :: GenParser Char ParserState Block -header = try $ do +header = section <|> chapter + +chapter :: GenParser Char ParserState Block +chapter = try $ do + string "\\chapter" + result <- headerWithLevel 1 + updateState $ \s -> s{ stateHasChapters = True } + return result + +section :: GenParser Char ParserState Block +section = try $ do char '\\' subs <- many (try (string "sub")) base <- try (string "section" >> return 1) <|> (string "paragraph" >> return 4) + st <- getState + let lev = if stateHasChapters st + then length subs + base + 1 + else length subs + base + headerWithLevel lev + +headerWithLevel :: Int -> GenParser Char ParserState Block +headerWithLevel lev = try $ do + spaces optional (char '*') + spaces optional $ bracketedText '[' ']' -- alt title + spaces char '{' title' <- manyTill inline (char '}') spaces - return $ Header (length subs + base) (normalizeSpaces title') + return $ Header lev (normalizeSpaces title') -- -- hrule block @@ -453,7 +475,7 @@ inline = choice [ str , accentedChar , nonbreakingSpace , specialChar - , rawLaTeXInline + , rawLaTeXInline' , escapedChar , unescapedChar ] <?> "inline" @@ -771,11 +793,16 @@ footnote = try $ do setInput rest return $ Note blocks +-- | Parse any LaTeX inline command and return it in a raw TeX inline element. +rawLaTeXInline' :: GenParser Char ParserState Inline +rawLaTeXInline' = do + notFollowedBy' $ oneOfStrings ["\\begin", "\\end", "\\item", "\\ignore", + "\\section"] + rawLaTeXInline + -- | Parse any LaTeX command and return it in a raw TeX inline element. rawLaTeXInline :: GenParser Char ParserState Inline rawLaTeXInline = try $ do - notFollowedBy' $ oneOfStrings ["\\begin", "\\end", "\\item", "\\ignore", - "\\section"] state <- getState if stateParseRaw state then do diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 13edd0586..33fb3d8e6 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -32,11 +32,13 @@ module Text.Pandoc.Readers.Markdown ( ) where import Data.List ( transpose, isSuffixOf, sortBy, findIndex, intercalate ) +import qualified Data.Map as M import Data.Ord ( comparing ) import Data.Char ( isAlphaNum ) import Data.Maybe import Text.Pandoc.Definition -import Text.Pandoc.Shared +import Text.Pandoc.Shared +import Text.Pandoc.Parsing import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' ) import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag, anyHtmlInlineTag, anyHtmlTag, @@ -67,7 +69,7 @@ setextHChars = "=-" -- treat these as potentially non-text when parsing inline: specialChars :: [Char] -specialChars = "\\[]*_~`<>$!^-.&'\"\8216\8217\8220\8221;" +specialChars = "\\[]*_~`<>$!^-.&@'\";" -- -- auxiliary functions @@ -183,7 +185,18 @@ parseMarkdown = do -- now parse it for real... (title, author, date) <- option ([],[],[]) titleBlock blocks <- parseBlocks - return $ Pandoc (Meta title author date) $ filter (/= Null) blocks + let doc = Pandoc (Meta title author date) $ filter (/= Null) blocks + -- if there are labeled examples, change references into numbers + examples <- liftM stateExamples getState + let handleExampleRef :: Inline -> Inline + handleExampleRef z@(Str ('@':xs)) = + case M.lookup xs examples of + Just n -> Str (show n) + Nothing -> z + handleExampleRef z = z + if M.null examples + then return doc + else return $ processWith handleExampleRef doc -- -- initial pass for references and notes @@ -202,10 +215,10 @@ referenceKey = try $ do tit <- option "" referenceTitle blanklines endPos <- getPosition - let newkey = (lab, (escapeURI $ removeTrailingSpace src, tit)) + let target = (escapeURI $ removeTrailingSpace src, tit) st <- getState let oldkeys = stateKeys st - updateState $ \s -> s { stateKeys = newkey : oldkeys } + updateState $ \s -> s { stateKeys = M.insert (Key lab) target oldkeys } -- return blanks so line count isn't affected return $ replicate (sourceLine endPos - sourceLine startPos) '\n' @@ -715,7 +728,7 @@ dashedLine ch = do -- Parse a table header with dashed lines of '-' preceded by -- one (or zero) line of text. simpleTableHeader :: Bool -- ^ Headerless table - -> GenParser Char ParserState ([[Char]], [Alignment], [Int]) + -> GenParser Char ParserState ([[Block]], [Alignment], [Int]) simpleTableHeader headless = try $ do rawContent <- if headless then return "" @@ -734,7 +747,9 @@ simpleTableHeader headless = try $ do let rawHeads' = if headless then replicate (length dashes) "" else rawHeads - return (rawHeads', aligns, indices) + heads <- mapM (parseFromString (many plain)) $ + map removeLeadingTrailingSpace rawHeads' + return (heads, aligns, indices) -- Parse a table footer - dashed lines followed by blank line. tableFooter :: GenParser Char ParserState [Char] @@ -763,65 +778,27 @@ multilineRow :: [Int] -> GenParser Char ParserState [[Block]] multilineRow indices = do colLines <- many1 (rawTableLine indices) - optional blanklines let cols = map unlines $ transpose colLines mapM (parseFromString (many plain)) cols --- Calculate relative widths of table columns, based on indices -widthsFromIndices :: Int -- Number of columns on terminal - -> [Int] -- Indices - -> [Double] -- Fractional relative sizes of columns -widthsFromIndices _ [] = [] -widthsFromIndices numColumns indices = - let lengths' = zipWith (-) indices (0:indices) - lengths = reverse $ - case reverse lengths' of - [] -> [] - [x] -> [x] - -- compensate for the fact that intercolumn - -- spaces are counted in widths of all columns - -- but the last... - (x:y:zs) -> if x < y && y - x <= 2 - then y:y:zs - else x:y:zs - totLength = sum lengths - quotient = if totLength > numColumns - then fromIntegral totLength - else fromIntegral numColumns - fracs = map (\l -> (fromIntegral l) / quotient) lengths in - tail fracs - -- Parses a table caption: inlines beginning with 'Table:' -- and followed by blank lines. tableCaption :: GenParser Char ParserState [Inline] tableCaption = try $ do skipNonindentSpaces - string "Table:" + string ":" <|> string "Table:" result <- many1 inline blanklines return $ normalizeSpaces result --- Parse a table using 'headerParser', 'lineParser', and 'footerParser'. -tableWith :: GenParser Char ParserState ([[Char]], [Alignment], [Int]) - -> ([Int] -> GenParser Char ParserState [[Block]]) - -> GenParser Char ParserState end - -> GenParser Char ParserState Block -tableWith headerParser lineParser footerParser = try $ do - (rawHeads, aligns, indices) <- headerParser - lines' <- many1Till (lineParser indices) footerParser - caption <- option [] tableCaption - heads <- mapM (parseFromString (many plain)) rawHeads - state <- getState - let numColumns = stateColumns state - let widths = widthsFromIndices numColumns indices - return $ Table caption aligns widths heads lines' - -- Parse a simple table with '---' header and one line per row. simpleTable :: Bool -- ^ Headerless table -> GenParser Char ParserState Block simpleTable headless = do Table c a _w h l <- tableWith (simpleTableHeader headless) tableLine - (if headless then tableFooter else tableFooter <|> blanklines) + (return ()) + (if headless then tableFooter else tableFooter <|> blanklines) + tableCaption -- Simple tables get 0s for relative column widths (i.e., use default) return $ Table c a (replicate (length a) 0) h l @@ -832,10 +809,10 @@ simpleTable headless = do multilineTable :: Bool -- ^ Headerless table -> GenParser Char ParserState Block multilineTable headless = - tableWith (multilineTableHeader headless) multilineRow tableFooter + tableWith (multilineTableHeader headless) multilineRow blanklines tableFooter tableCaption multilineTableHeader :: Bool -- ^ Headerless table - -> GenParser Char ParserState ([String], [Alignment], [Int]) + -> GenParser Char ParserState ([[Block]], [Alignment], [Int]) multilineTableHeader headless = try $ do if headless then return '\n' @@ -859,7 +836,9 @@ multilineTableHeader headless = try $ do let rawHeads = if headless then replicate (length dashes) "" else map (intercalate " ") rawHeadsList - return ((map removeLeadingTrailingSpace rawHeads), aligns, indices) + heads <- mapM (parseFromString (many plain)) $ + map removeLeadingTrailingSpace rawHeads + return (heads, aligns, indices) -- Returns an alignment type for a table, based on a list of strings -- (the rows of the column header) and a number (the length of the @@ -879,9 +858,14 @@ alignType strLst len = (True, True) -> AlignCenter (False, False) -> AlignDefault +gridTable :: Bool -- ^ Headerless table + -> GenParser Char ParserState Block +gridTable = gridTableWith block tableCaption + table :: GenParser Char ParserState Block table = multilineTable False <|> simpleTable True <|> - simpleTable False <|> multilineTable True <?> "table" + simpleTable False <|> multilineTable True <|> + gridTable False <|> gridTable True <?> "table" -- -- inline @@ -915,6 +899,7 @@ inlineParsers = [ str , rawHtmlInline' , rawLaTeXInline' , escapedChar + , exampleRef , symbol , ltSign ] @@ -950,6 +935,14 @@ ltSign = do specialCharsMinusLt :: [Char] specialCharsMinusLt = filter (/= '<') specialChars +exampleRef :: GenParser Char ParserState Inline +exampleRef = try $ do + char '@' + lab <- many1 (alphaNum <|> oneOf "-_") + -- We just return a Str. These are replaced with numbers + -- later. See the end of parseMarkdown. + return $ Str $ '@' : lab + symbol :: GenParser Char ParserState Inline symbol = do result <- oneOf specialCharsMinusLt @@ -1070,30 +1063,28 @@ failIfInQuoteContext context = do singleQuoteStart :: GenParser Char ParserState Char singleQuoteStart = do failIfInQuoteContext InSingleQuote - char '\8216' <|> - (try $ do char '\'' - notFollowedBy (oneOf ")!],.;:-? \t\n") - notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >> - satisfy (not . isAlphaNum))) - -- possess/contraction - return '\'') + try $ do char '\'' + notFollowedBy (oneOf ")!],.;:-? \t\n") + notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >> + satisfy (not . isAlphaNum))) + -- possess/contraction + return '\'' singleQuoteEnd :: GenParser Char st Char singleQuoteEnd = try $ do - char '\8217' <|> char '\'' + char '\'' notFollowedBy alphaNum return '\'' doubleQuoteStart :: GenParser Char ParserState Char doubleQuoteStart = do failIfInQuoteContext InDoubleQuote - char '\8220' <|> - (try $ do char '"' - notFollowedBy (oneOf " \t\n") - return '"') + try $ do char '"' + notFollowedBy (oneOf " \t\n") + return '"' doubleQuoteEnd :: GenParser Char st Char -doubleQuoteEnd = char '\8221' <|> char '"' +doubleQuoteEnd = char '"' ellipses :: GenParser Char st Inline ellipses = oneOfStrings ["...", " . . . ", ". . .", " . . ."] >> return Ellipses @@ -1218,7 +1209,7 @@ referenceLink lab = do optional (newline >> skipSpaces) >> reference)) let ref' = if null ref then lab else ref state <- getState - case lookupKeySrc (stateKeys state) ref' of + case lookupKeySrc (stateKeys state) (Key ref') of Nothing -> fail "no corresponding key" Just target -> return target @@ -1303,7 +1294,7 @@ inlineCitation = try $ do chkCit :: Target -> GenParser Char ParserState (Maybe Target) chkCit t = do st <- getState - case lookupKeySrc (stateKeys st) [Str $ fst t] of + case lookupKeySrc (stateKeys st) (Key [Str $ fst t]) of Just _ -> fail "This is a link" Nothing -> if elem (fst t) $ stateCitations st then return $ Just t diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index 5e7ea512e..13afe5053 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -31,10 +31,13 @@ module Text.Pandoc.Readers.RST ( readRST ) where import Text.Pandoc.Definition -import Text.Pandoc.Shared +import Text.Pandoc.Shared +import Text.Pandoc.Parsing import Text.ParserCombinators.Parsec -import Control.Monad ( when, unless, liftM ) -import Data.List ( findIndex, delete, intercalate, transpose ) +import Control.Monad ( when, unless ) +import Data.List ( findIndex, intercalate, transpose, sort ) +import qualified Data.Map as M +import Text.Printf ( printf ) -- | Parse reStructuredText string and return Pandoc document. readRST :: ParserState -- ^ Parser state, including options for parser @@ -93,9 +96,6 @@ parseRST = do docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= return . concat setInput docMinusKeys setPosition startPos - st <- getState - let reversedKeys = stateKeys st - updateState $ \s -> s { stateKeys = reverse reversedKeys } -- now parse it for real... blocks <- parseBlocks let blocks' = filter (/= Null) blocks @@ -425,7 +425,7 @@ bulletListStart = try $ do -- parses ordered list start and returns its length (inc following whitespace) orderedListStart :: ListNumberStyle -> ListNumberDelim - -> GenParser Char st Int + -> GenParser Char ParserState Int orderedListStart style delim = try $ do (_, markerLen) <- withHorizDisplacement (orderedListMarker style delim) white <- many1 spaceChar @@ -540,10 +540,10 @@ referenceName = quotedReferenceName <|> referenceKey :: GenParser Char ParserState [Char] referenceKey = do startPos <- getPosition - key <- choice [imageKey, anonymousKey, regularKey] + (key, target) <- choice [imageKey, anonymousKey, regularKey] st <- getState let oldkeys = stateKeys st - updateState $ \s -> s { stateKeys = key : oldkeys } + updateState $ \s -> s { stateKeys = M.insert key target oldkeys } optional blanklines endPos <- getPosition -- return enough blanks to replace key @@ -558,28 +558,29 @@ targetURI = do blanklines return $ escapeURI $ removeLeadingTrailingSpace $ contents -imageKey :: GenParser Char ParserState ([Inline], (String, [Char])) +imageKey :: GenParser Char ParserState (Key, Target) imageKey = try $ do string ".. |" ref <- manyTill inline (char '|') skipSpaces string "image::" src <- targetURI - return (normalizeSpaces ref, (src, "")) + return (Key (normalizeSpaces ref), (src, "")) -anonymousKey :: GenParser Char st ([Inline], (String, [Char])) +anonymousKey :: GenParser Char st (Key, Target) anonymousKey = try $ do oneOfStrings [".. __:", "__"] src <- targetURI - return ([Str "_"], (src, "")) + pos <- getPosition + return (Key [Str $ "_" ++ printf "%09d" (sourceLine pos)], (src, "")) -regularKey :: GenParser Char ParserState ([Inline], (String, [Char])) +regularKey :: GenParser Char ParserState (Key, Target) regularKey = try $ do string ".. _" ref <- referenceName char ':' src <- targetURI - return (normalizeSpaces ref, (src, "")) + return (Key (normalizeSpaces ref), (src, "")) -- -- tables @@ -607,41 +608,20 @@ dashedLine ch = do simpleDashedLines :: Char -> GenParser Char st [(Int,Int)] simpleDashedLines ch = try $ many1 (dashedLine ch) -gridPart :: Char -> GenParser Char st (Int, Int) -gridPart ch = do - dashes <- many1 (char ch) - char '+' - return (length dashes, length dashes + 1) - -gridDashedLines :: Char -> GenParser Char st [(Int,Int)] -gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) >>~ blankline - -- Parse a table row separator simpleTableSep :: Char -> GenParser Char ParserState Char simpleTableSep ch = try $ simpleDashedLines ch >> newline -gridTableSep :: Char -> GenParser Char ParserState Char -gridTableSep ch = try $ gridDashedLines ch >> return '\n' - -- Parse a table footer simpleTableFooter :: GenParser Char ParserState [Char] simpleTableFooter = try $ simpleTableSep '=' >> blanklines -gridTableFooter :: GenParser Char ParserState [Char] -gridTableFooter = blanklines - -- Parse a raw line and split it into chunks by indices. simpleTableRawLine :: [Int] -> GenParser Char ParserState [String] simpleTableRawLine indices = do line <- many1Till anyChar newline return (simpleTableSplitLine indices line) -gridTableRawLine :: [Int] -> GenParser Char ParserState [String] -gridTableRawLine indices = do - char '|' - line <- many1Till anyChar newline - return (gridTableSplitLine indices $ removeTrailingSpace line) - -- Parse a table row and return a list of blocks (columns). simpleTableRow :: [Int] -> GenParser Char ParserState [[Block]] simpleTableRow indices = do @@ -651,64 +631,13 @@ simpleTableRow indices = do let cols = map unlines . transpose $ firstLine : colLines mapM (parseFromString (many plain)) cols -gridTableRow :: [Int] - -> GenParser Char ParserState [[Block]] -gridTableRow indices = do - colLines <- many1 (gridTableRawLine indices) - let cols = map ((++ "\n") . unlines . removeOneLeadingSpace) $ - transpose colLines - mapM (liftM compactifyCell . parseFromString (many block)) cols - -compactifyCell :: [Block] -> [Block] -compactifyCell bs = head $ compactify [bs] - simpleTableSplitLine :: [Int] -> String -> [String] simpleTableSplitLine indices line = map removeLeadingTrailingSpace $ tail $ splitByIndices (init indices) line -gridTableSplitLine :: [Int] -> String -> [String] -gridTableSplitLine indices line = - map removeFinalBar $ tail $ splitByIndices (init indices) line - -removeFinalBar :: String -> String -removeFinalBar = reverse . dropWhile (=='|') . dropWhile (`elem` " \t") . - reverse - -removeOneLeadingSpace :: [String] -> [String] -removeOneLeadingSpace xs = - if all startsWithSpace xs - then map (drop 1) xs - else xs - where startsWithSpace "" = True - startsWithSpace (y:_) = y == ' ' - --- Calculate relative widths of table columns, based on indices -widthsFromIndices :: Int -- Number of columns on terminal - -> [Int] -- Indices - -> [Double] -- Fractional relative sizes of columns -widthsFromIndices _ [] = [] -widthsFromIndices numColumns indices = - let lengths' = zipWith (-) indices (0:indices) - lengths = reverse $ - case reverse lengths' of - [] -> [] - [x] -> [x] - -- compensate for the fact that intercolumn - -- spaces are counted in widths of all columns - -- but the last... - (x:y:zs) -> if x < y && y - x <= 2 - then y:y:zs - else x:y:zs - totLength = sum lengths - quotient = if totLength > numColumns - then fromIntegral totLength - else fromIntegral numColumns - fracs = map (\l -> (fromIntegral l) / quotient) lengths in - tail fracs - simpleTableHeader :: Bool -- ^ Headerless table - -> GenParser Char ParserState ([[Char]], [Alignment], [Int]) + -> GenParser Char ParserState ([[Block]], [Alignment], [Int]) simpleTableHeader headless = try $ do optional blanklines rawContent <- if headless @@ -722,64 +651,23 @@ simpleTableHeader headless = try $ do let rawHeads = if headless then replicate (length dashes) "" else simpleTableSplitLine indices rawContent - return (rawHeads, aligns, indices) + heads <- mapM (parseFromString (many plain)) $ + map removeLeadingTrailingSpace rawHeads + return (heads, aligns, indices) -gridTableHeader :: Bool -- ^ Headerless table - -> GenParser Char ParserState ([String], [Alignment], [Int]) -gridTableHeader headless = try $ do - optional blanklines - dashes <- gridDashedLines '-' - rawContent <- if headless - then return $ repeat "" - else many1 - (notFollowedBy (gridTableSep '=') >> char '|' >> many1Till anyChar newline) - if headless - then return () - else gridTableSep '=' >> return () - let lines' = map snd dashes - let indices = scanl (+) 0 lines' - let aligns = replicate (length lines') AlignDefault -- RST does not have a notion of alignments - let rawHeads = if headless - then replicate (length dashes) "" - else map (intercalate " ") $ transpose - $ map (gridTableSplitLine indices) rawContent - return (rawHeads, aligns, indices) - --- Parse a table using 'headerParser', 'lineParser', and 'footerParser'. -tableWith :: GenParser Char ParserState ([[Char]], [Alignment], [Int]) - -> ([Int] -> GenParser Char ParserState [[Block]]) - -> GenParser Char ParserState sep - -> GenParser Char ParserState end - -> GenParser Char ParserState Block -tableWith headerParser rowParser lineParser footerParser = try $ do - (rawHeads, aligns, indices) <- headerParser - lines' <- rowParser indices `sepEndBy` lineParser - footerParser - heads <- mapM (parseFromString (many plain)) rawHeads - state <- getState - let captions = [] -- no notion of captions in RST - let numColumns = stateColumns state - let widths = widthsFromIndices numColumns indices - return $ Table captions aligns widths heads lines' - --- Parse a simple table with '---' header and one line per row. +-- Parse a simple table. simpleTable :: Bool -- ^ Headerless table -> GenParser Char ParserState Block simpleTable headless = do - Table c a _w h l <- tableWith (simpleTableHeader headless) simpleTableRow sep simpleTableFooter + Table c a _w h l <- tableWith (simpleTableHeader headless) simpleTableRow sep simpleTableFooter (return []) -- Simple tables get 0s for relative column widths (i.e., use default) return $ Table c a (replicate (length a) 0) h l where sep = return () -- optional (simpleTableSep '-') --- Parse a grid table: starts with row of '-' on top, then header --- (which may be grid), then the rows, --- which may be grid, separated by blank lines, and --- ending with a footer (dashed line followed by blank line). gridTable :: Bool -- ^ Headerless table - -> GenParser Char ParserState Block -gridTable headless = - tableWith (gridTableHeader headless) gridTableRow (gridTableSep '-') gridTableFooter + -> GenParser Char ParserState Block +gridTable = gridTableWith block (return []) table :: GenParser Char ParserState Block table = gridTable False <|> simpleTable False <|> @@ -889,17 +777,21 @@ explicitLink = try $ do referenceLink :: GenParser Char ParserState Inline referenceLink = try $ do label' <- (quotedReferenceName <|> simpleReferenceName) >>~ char '_' - key <- option label' (do{char '_'; return [Str "_"]}) -- anonymous link state <- getState let keyTable = stateKeys state + let isAnonKey (Key [Str ('_':_)]) = True + isAnonKey _ = False + key <- option (Key label') $ + do char '_' + let anonKeys = sort $ filter isAnonKey $ M.keys keyTable + if null anonKeys + then pzero + else return (head anonKeys) (src,tit) <- case lookupKeySrc keyTable key of Nothing -> fail "no corresponding key" Just target -> return target - -- if anonymous link, remove first anon key so it won't be used again - let keyTable' = if (key == [Str "_"]) -- anonymous link? - then delete ([Str "_"], (src,tit)) keyTable -- remove first anon key - else keyTable - setState $ state { stateKeys = keyTable' } + -- if anonymous link, remove key so it won't be used again + when (isAnonKey key) $ updateState $ \s -> s{ stateKeys = M.delete key keyTable } return $ Link (normalizeSpaces label') (src, tit) autoURI :: GenParser Char ParserState Inline @@ -922,7 +814,7 @@ image = try $ do ref <- manyTill inline (char '|') state <- getState let keyTable = stateKeys state - (src,tit) <- case lookupKeySrc keyTable ref of + (src,tit) <- case lookupKeySrc keyTable (Key ref) of Nothing -> fail "no corresponding key" Just target -> return target return $ Image (normalizeSpaces ref) (src, tit) diff --git a/src/Text/Pandoc/Readers/TeXMath.hs b/src/Text/Pandoc/Readers/TeXMath.hs index 080354be1..b0c6e86d4 100644 --- a/src/Text/Pandoc/Readers/TeXMath.hs +++ b/src/Text/Pandoc/Readers/TeXMath.hs @@ -28,208 +28,67 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of TeX math to a list of 'Pandoc' inline elements. -} module Text.Pandoc.Readers.TeXMath ( - readTeXMath + readTeXMath ) where import Text.ParserCombinators.Parsec import Text.Pandoc.Definition +import Text.TeXMath.Parser --- | Converts a string of raw TeX math to a list of 'Pandoc' inlines. +-- | Converts a raw TeX math formula to a list of 'Pandoc' inlines. +-- Defaults to raw formula between @$@ characters if entire formula +-- can't be converted. readTeXMath :: String -- ^ String to parse (assumes @'\n'@ line endings) -> [Inline] -readTeXMath inp = case parse teXMath ("formula: " ++ inp) inp of - Left _ -> [Str inp] -- if unparseable, just include original - Right res -> res - -teXMath :: GenParser Char st [Inline] -teXMath = manyTill mathPart eof >>= return . concat - -mathPart :: GenParser Char st [Inline] -mathPart = whitespace <|> superscript <|> subscript <|> symbol <|> - argument <|> digits <|> letters <|> misc - -whitespace :: GenParser Char st [Inline] -whitespace = many1 space >> return [] - -symbol :: GenParser Char st [Inline] -symbol = try $ do - char '\\' - res <- many1 letter - case lookup res teXsymbols of - Just m -> return [Str m] - Nothing -> return [Str $ "\\" ++ res] - -argument :: GenParser Char st [Inline] -argument = try $ do - char '{' - res <- many mathPart - char '}' - return $ if null res - then [Str " "] - else [Str "{"] ++ concat res ++ [Str "}"] - -digits :: GenParser Char st [Inline] -digits = do - res <- many1 digit - return [Str res] - -letters :: GenParser Char st [Inline] -letters = do - res <- many1 letter - return [Emph [Str res]] - -misc :: GenParser Char st [Inline] -misc = do - res <- noneOf "}" - return [Str [res]] - -scriptArg :: GenParser Char st [Inline] -scriptArg = try $ do - (try (do{char '{'; r <- many mathPart; char '}'; return $ concat r})) - <|> symbol - <|> (do{c <- (letter <|> digit); return [Str [c]]}) - -superscript :: GenParser Char st [Inline] -superscript = try $ do - char '^' - arg <- scriptArg - return [Superscript arg] - -subscript :: GenParser Char st [Inline] -subscript = try $ do - char '_' - arg <- scriptArg - return [Subscript arg] - -withThinSpace :: String -> String -withThinSpace str = "\x2009" ++ str ++ "\x2009" - -teXsymbols :: [(String, String)] -teXsymbols = - [("alpha","\x3B1") - ,("beta", "\x3B2") - ,("chi", "\x3C7") - ,("delta", "\x3B4") - ,("Delta", "\x394") - ,("epsilon", "\x3B5") - ,("varepsilon", "\x25B") - ,("eta", "\x3B7") - ,("gamma", "\x3B3") - ,("Gamma", "\x393") - ,("iota", "\x3B9") - ,("kappa", "\x3BA") - ,("lambda", "\x3BB") - ,("Lambda", "\x39B") - ,("mu", "\x3BC") - ,("nu", "\x3BD") - ,("omega", "\x3C9") - ,("Omega", "\x3A9") - ,("phi", "\x3C6") - ,("varphi", "\x3D5") - ,("Phi", "\x3A6") - ,("pi", "\x3C0") - ,("Pi", "\x3A0") - ,("psi", "\x3C8") - ,("Psi", "\x3A8") - ,("rho", "\x3C1") - ,("sigma", "\x3C3") - ,("Sigma", "\x3A3") - ,("tau", "\x3C4") - ,("theta", "\x3B8") - ,("vartheta", "\x3D1") - ,("Theta", "\x398") - ,("upsilon", "\x3C5") - ,("xi", "\x3BE") - ,("Xi", "\x39E") - ,("zeta", "\x3B6") - ,("ne", "\x2260") - ,("lt", withThinSpace "<") - ,("le", withThinSpace "\x2264") - ,("leq", withThinSpace "\x2264") - ,("ge", withThinSpace "\x2265") - ,("geq", withThinSpace "\x2265") - ,("prec", withThinSpace "\x227A") - ,("succ", withThinSpace "\x227B") - ,("preceq", withThinSpace "\x2AAF") - ,("succeq", withThinSpace "\x2AB0") - ,("in", withThinSpace "\x2208") - ,("notin", withThinSpace "\x2209") - ,("subset", withThinSpace "\x2282") - ,("supset", withThinSpace "\x2283") - ,("subseteq", withThinSpace "\x2286") - ,("supseteq", withThinSpace "\x2287") - ,("equiv", withThinSpace "\x2261") - ,("cong", withThinSpace "\x2245") - ,("approx", withThinSpace "\x2248") - ,("propto", withThinSpace "\x221D") - ,("cdot", withThinSpace "\x22C5") - ,("star", withThinSpace "\x22C6") - ,("backslash", "\\") - ,("times", withThinSpace "\x00D7") - ,("divide", withThinSpace "\x00F7") - ,("circ", withThinSpace "\x2218") - ,("oplus", withThinSpace "\x2295") - ,("otimes", withThinSpace "\x2297") - ,("odot", withThinSpace "\x2299") - ,("sum", "\x2211") - ,("prod", "\x220F") - ,("wedge", withThinSpace "\x2227") - ,("bigwedge", withThinSpace "\x22C0") - ,("vee", withThinSpace "\x2228") - ,("bigvee", withThinSpace "\x22C1") - ,("cap", withThinSpace "\x2229") - ,("bigcap", withThinSpace "\x22C2") - ,("cup", withThinSpace "\x222A") - ,("bigcup", withThinSpace "\x22C3") - ,("neg", "\x00AC") - ,("implies", withThinSpace "\x21D2") - ,("iff", withThinSpace "\x21D4") - ,("forall", "\x2200") - ,("exists", "\x2203") - ,("bot", "\x22A5") - ,("top", "\x22A4") - ,("vdash", "\x22A2") - ,("models", withThinSpace "\x22A8") - ,("uparrow", "\x2191") - ,("downarrow", "\x2193") - ,("rightarrow", withThinSpace "\x2192") - ,("to", withThinSpace "\x2192") - ,("rightarrowtail", "\x21A3") - ,("twoheadrightarrow", withThinSpace "\x21A0") - ,("twoheadrightarrowtail", withThinSpace "\x2916") - ,("mapsto", withThinSpace "\x21A6") - ,("leftarrow", withThinSpace "\x2190") - ,("leftrightarrow", withThinSpace "\x2194") - ,("Rightarrow", withThinSpace "\x21D2") - ,("Leftarrow", withThinSpace "\x21D0") - ,("Leftrightarrow", withThinSpace "\x21D4") - ,("partial", "\x2202") - ,("nabla", "\x2207") - ,("pm", "\x00B1") - ,("emptyset", "\x2205") - ,("infty", "\x221E") - ,("aleph", "\x2135") - ,("ldots", "...") - ,("therefore", "\x2234") - ,("angle", "\x2220") - ,("quad", "\x00A0\x00A0") - ,("cdots", "\x22EF") - ,("vdots", "\x22EE") - ,("ddots", "\x22F1") - ,("diamond", "\x22C4") - ,("Box", "\x25A1") - ,("lfloor", "\x230A") - ,("rfloor", "\x230B") - ,("lceiling", "\x2308") - ,("rceiling", "\x2309") - ,("langle", "\x2329") - ,("rangle", "\x232A") - ,("int", "\8747") - ,("{", "{") - ,("}", "}") - ,("[", "[") - ,("]", "]") - ,("|", "|") - ,("||", "||") - ] +readTeXMath inp = case readTeXMath' inp of + Nothing -> [Str ("$" ++ inp ++ "$")] + Just res -> res + +-- | Like 'readTeXMath', but without the default. +readTeXMath' :: String -- ^ String to parse (assumes @'\n'@ line endings) + -> Maybe [Inline] +readTeXMath' inp = case parse formula "formula" inp of + Left _ -> Just [Str inp] + Right exps -> expsToInlines exps + +expsToInlines :: [Exp] -> Maybe [Inline] +expsToInlines xs = do + res <- mapM expToInlines xs + return (concat res) + +expToInlines :: Exp -> Maybe [Inline] +expToInlines (ENumber s) = Just [Str s] +expToInlines (EIdentifier s) = Just [Emph [Str s]] +expToInlines (EMathOperator s) = Just [Str s] +expToInlines (ESymbol t s) = Just $ addSpace t (Str s) + where addSpace Op x = [x, thinspace] + addSpace Bin x = [medspace, x, medspace] + addSpace Rel x = [widespace, x, widespace] + addSpace Pun x = [x, thinspace] + addSpace _ x = [x] + thinspace = Str "\x2006" + medspace = Str "\x2005" + widespace = Str "\x2004" +expToInlines (EStretchy x) = expToInlines x +expToInlines (EGrouped xs) = expsToInlines xs +expToInlines (ESpace _) = Just [Str " "] -- variable widths not supported +expToInlines (EBinary _ _ _) = Nothing +expToInlines (ESub x y) = do + x' <- expToInlines x + y' <- expToInlines y + return $ x' ++ [Subscript y'] +expToInlines (ESuper x y) = do + x' <- expToInlines x + y' <- expToInlines y + return $ x' ++ [Superscript y'] +expToInlines (ESubsup x y z) = do + x' <- expToInlines x + y' <- expToInlines y + z' <- expToInlines z + return $ x' ++ [Subscript y'] ++ [Superscript z'] +expToInlines (EDown x y) = expToInlines (ESub x y) +expToInlines (EUp x y) = expToInlines (ESuper x y) +expToInlines (EDownup x y z) = expToInlines (ESubsup x y z) +expToInlines (EText _ x) = Just [Emph [Str x]] +expToInlines _ = Nothing |