From 39e8b4276e6d88d5cbb943d04c866dde9bf6473c Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Sun, 22 May 2016 16:52:06 +0200 Subject: Org reader: extract inline parser to module Inline parsing code is moved to a separate module. Parsers for block starts are extracted as well, as those are used in the `endline` parser. This is part of the Org-mode reader cleanup effort. --- src/Text/Pandoc/Readers/Org.hs | 797 ++--------------------------- src/Text/Pandoc/Readers/Org/BlockStarts.hs | 112 ++++ src/Text/Pandoc/Readers/Org/Inlines.hs | 715 ++++++++++++++++++++++++++ src/Text/Pandoc/Readers/Org/Parsing.hs | 19 + 4 files changed, 887 insertions(+), 756 deletions(-) create mode 100644 src/Text/Pandoc/Readers/Org/BlockStarts.hs create mode 100644 src/Text/Pandoc/Readers/Org/Inlines.hs (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index fd811c078..605d2220e 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -1,5 +1,4 @@ {-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE OverloadedStrings #-} {- Copyright (C) 2014-2016 Albert Krewinkel @@ -29,6 +28,8 @@ Conversion of org-mode formatted plain text to 'Pandoc' document. -} module Text.Pandoc.Readers.Org ( readOrg ) where +import Text.Pandoc.Readers.Org.BlockStarts +import Text.Pandoc.Readers.Org.Inlines import Text.Pandoc.Readers.Org.ParserState import Text.Pandoc.Readers.Org.Parsing @@ -38,19 +39,16 @@ import Text.Pandoc.Definition import Text.Pandoc.Compat.Monoid ((<>)) import Text.Pandoc.Error import Text.Pandoc.Options -import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) -import Text.Pandoc.Shared (compactify', compactify'DL) -import Text.TeXMath (readTeX, writePandoc, DisplayType(..)) -import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap +import Text.Pandoc.Shared ( compactify', compactify'DL ) -import Control.Arrow (first) -import Control.Monad (foldM, guard, mplus, mzero, when) +import Control.Arrow ( first ) +import Control.Monad ( foldM, guard, mzero ) import Control.Monad.Reader ( runReader ) -import Data.Char (isAlphaNum, isSpace, toLower, toUpper) -import Data.List ( foldl', intersperse, isPrefixOf, isSuffixOf ) +import Data.Char ( toLower, toUpper) +import Data.List ( foldl', intersperse, isPrefixOf ) import qualified Data.Map as M import Data.Maybe ( fromMaybe, isNothing ) -import Network.HTTP (urlEncode) +import Network.HTTP ( urlEncode ) -- | Parse org-mode string and return a Pandoc document. @@ -59,54 +57,6 @@ readOrg :: ReaderOptions -- ^ Reader options -> Either PandocError Pandoc readOrg opts s = flip runReader def $ readWithM parseOrg def{ orgStateOptions = opts } (s ++ "\n\n") --- --- Functions acting on the parser state --- -recordAnchorId :: String -> OrgParser () -recordAnchorId i = updateState $ \s -> - s{ orgStateAnchorIds = i : (orgStateAnchorIds s) } - -pushToInlineCharStack :: Char -> OrgParser () -pushToInlineCharStack c = updateState $ \s -> - s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s } - -popInlineCharStack :: OrgParser () -popInlineCharStack = updateState $ \s -> - s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s } - -surroundingEmphasisChar :: OrgParser [Char] -surroundingEmphasisChar = - take 1 . drop 1 . orgStateEmphasisCharStack <$> getState - -startEmphasisNewlinesCounting :: Int -> OrgParser () -startEmphasisNewlinesCounting maxNewlines = updateState $ \s -> - s{ orgStateEmphasisNewlines = Just maxNewlines } - -decEmphasisNewlinesCount :: OrgParser () -decEmphasisNewlinesCount = updateState $ \s -> - s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s } - -newlinesCountWithinLimits :: OrgParser Bool -newlinesCountWithinLimits = do - st <- getState - return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True - -resetEmphasisNewlines :: OrgParser () -resetEmphasisNewlines = updateState $ \s -> - s{ orgStateEmphasisNewlines = Nothing } - -addLinkFormat :: String - -> (String -> String) - -> OrgParser () -addLinkFormat key formatter = updateState $ \s -> - let fs = orgStateLinkFormatters s - in s{ orgStateLinkFormatters = M.insert key formatter fs } - -addToNotesTable :: OrgNoteRecord -> OrgParser () -addToNotesTable note = do - oldnotes <- orgStateNotes' <$> getState - updateState $ \s -> s{ orgStateNotes' = note:oldnotes } - -- -- Export Settings -- @@ -259,7 +209,7 @@ block = choice [ mempty <$ blanklines , genericDrawer , specialLine , header - , return <$> hline + , horizontalRule , list , latexFragment , noteBlock @@ -457,9 +407,6 @@ indentWith num = do type SwitchOption = (Char, Maybe String) -orgArgWord :: OrgParser String -orgArgWord = many1 orgArgWordChar - -- | Parse code block arguments -- TODO: We currently don't handle switches. codeHeaderArgs :: OrgParser ([String], [(String, String)]) @@ -474,7 +421,10 @@ codeHeaderArgs = try $ do , map toRundocAttrib (("language", language) : parameters) ) else ([ pandocLang ], parameters) - where hasRundocParameters = not . null + where + hasRundocParameters = not . null + toRundocAttrib = first ("rundoc-" ++) + switch :: OrgParser SwitchOption switch = try $ simpleSwitch <|> lineNumbersSwitch @@ -508,17 +458,6 @@ blockOption = try $ do paramValue <- option "yes" orgParamValue return (argKey, paramValue) -inlineBlockOption :: OrgParser (String, String) -inlineBlockOption = try $ do - argKey <- orgArgKey - paramValue <- option "yes" orgInlineParamValue - return (argKey, paramValue) - -orgArgKey :: OrgParser String -orgArgKey = try $ - skipSpaces *> char ':' - *> many1 orgArgWordChar - orgParamValue :: OrgParser String orgParamValue = try $ skipSpaces @@ -526,19 +465,6 @@ orgParamValue = try $ *> many1 (noneOf "\t\n\r ") <* skipSpaces -orgInlineParamValue :: OrgParser String -orgInlineParamValue = try $ - skipSpaces - *> notFollowedBy (char ':') - *> many1 (noneOf "\t\n\r ]") - <* skipSpaces - -orgArgWordChar :: OrgParser Char -orgArgWordChar = alphaNum <|> oneOf "-_" - -toRundocAttrib :: (String, String) -> (String, String) -toRundocAttrib = first ("rundoc-" ++) - commaEscaped :: String -> String commaEscaped (',':cs@('*':_)) = cs commaEscaped (',':cs@('#':'+':_)) = cs @@ -552,7 +478,10 @@ exampleCode :: String -> Blocks exampleCode = B.codeBlockWith ("", ["example"], []) exampleLine :: OrgParser String -exampleLine = try $ skipSpaces *> string ": " *> anyLine +exampleLine = try $ exampleLineStart *> anyLine + +horizontalRule :: OrgParser (F Blocks) +horizontalRule = return B.horizontalRule <$ try hline -- @@ -582,11 +511,6 @@ genericDrawer = try $ do drawerDiv :: String -> F Blocks -> F Blocks drawerDiv drawerName = fmap $ B.divWith (mempty, [drawerName, "drawer"], mempty) -drawerStart :: OrgParser String -drawerStart = try $ - skipSpaces *> drawerName <* skipSpaces <* newline - where drawerName = char ':' *> manyTill nonspaceChar (char ':') - drawerLine :: OrgParser String drawerLine = anyLine @@ -639,31 +563,38 @@ figure = try $ do let attr = (mempty, mempty, figKeyVals) return $ (B.para . B.imageWith attr src (withFigPrefix figName) <$> figCaption) where + withFigPrefix :: String -> String withFigPrefix cs = if "fig:" `isPrefixOf` cs then cs else "fig:" ++ cs + selfTarget :: OrgParser String + selfTarget = try $ char '[' *> linkTarget <* char ']' + + -- -- Comments, Options and Metadata -- + +addLinkFormat :: String + -> (String -> String) + -> OrgParser () +addLinkFormat key formatter = updateState $ \s -> + let fs = orgStateLinkFormatters s + in s{ orgStateLinkFormatters = M.insert key formatter fs } + specialLine :: OrgParser (F Blocks) specialLine = fmap return . try $ metaLine <|> commentLine -metaLine :: OrgParser Blocks -metaLine = mempty <$ metaLineStart <* (optionLine <|> declarationLine) - -- The order, in which blocks are tried, makes sure that we're not looking at -- the beginning of a block, so we don't need to check for it -metaLineStart :: OrgParser () -metaLineStart = try $ skipSpaces <* string "#+" +metaLine :: OrgParser Blocks +metaLine = mempty <$ metaLineStart <* (optionLine <|> declarationLine) commentLine :: OrgParser Blocks commentLine = commentLineStart *> anyLine *> pure mempty -commentLineStart :: OrgParser () -commentLineStart = try $ skipSpaces <* string "# " - declarationLine :: OrgParser () declarationLine = try $ do key <- metaKey @@ -741,23 +672,6 @@ header = try $ do *> many1 tag <* skipSpaces -headerStart :: OrgParser Int -headerStart = try $ - (length <$> many1 (char '*')) <* many1 (char ' ') <* updateLastPreCharPos - - --- Don't use (or need) the reader wrapper here, we want hline to be --- @show@able. Otherwise we can't use it with @notFollowedBy'@. - --- | Horizontal Line (five -- dashes or more) -hline :: OrgParser Blocks -hline = try $ do - skipSpaces - string "-----" - many (char '-') - skipSpaces - newline - return B.horizontalRule -- -- Tables @@ -793,9 +707,6 @@ orgToPandocTable :: OrgTable orgToPandocTable (OrgTable aligns heads lns) caption = B.table caption (zip aligns $ repeat 0) heads lns -tableStart :: OrgParser Char -tableStart = try $ skipSpaces *> char '|' - tableRows :: OrgParser [OrgTableRow] tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow) @@ -899,25 +810,12 @@ latexFragment = try $ do , "\\end{", e, "}\n" ] -latexEnvStart :: OrgParser String -latexEnvStart = try $ do - skipSpaces *> string "\\begin{" - *> latexEnvName - <* string "}" - <* blankline - latexEnd :: String -> OrgParser () latexEnd envName = try $ () <$ skipSpaces <* string ("\\end{" ++ envName ++ "}") <* blankline --- | Parses a LaTeX environment name. -latexEnvName :: OrgParser String -latexEnvName = try $ do - mappend <$> many1 alphaNum - <*> option "" (string "*") - -- -- Footnote defintions @@ -942,7 +840,7 @@ paraOrPlain = try $ do -- is directly followed by a list item, in which case the block is read as -- plain text. try (guard nl - *> notFollowedBy (inList *> (orderedListStart <|> bulletListStart)) + *> notFollowedBy (inList *> (() <$ orderedListStart <|> bulletListStart)) *> return (B.para <$> ils)) <|> (return (B.plain <$> ils)) @@ -971,38 +869,21 @@ orderedList :: OrgParser (F Blocks) orderedList = fmap B.orderedList . fmap compactify' . sequence <$> many1 (listItem orderedListStart) -genericListStart :: OrgParser String - -> OrgParser Int -genericListStart listMarker = try $ - (+) <$> (length <$> many spaceChar) - <*> (length <$> listMarker <* many1 spaceChar) - --- parses bullet list marker. maybe we know the indent level -bulletListStart :: OrgParser Int -bulletListStart = bulletListStart' Nothing - bulletListStart' :: Maybe Int -> OrgParser Int -- returns length of bulletList prefix, inclusive of marker bulletListStart' Nothing = do ind <- length <$> many spaceChar - when (ind == 0) $ notFollowedBy (char '*') - oneOf bullets - many1 spaceChar + oneOf (bullets $ ind == 0) + skipSpaces1 return (ind + 1) - -- Unindented lists are legal, but they can't use '*' bullets - -- We return n to maintain compatibility with the generic listItem bulletListStart' (Just n) = do count (n-1) spaceChar - when (n == 1) $ notFollowedBy (char '*') - oneOf bullets + oneOf (bullets $ n == 1) many1 spaceChar return n -bullets :: String -bullets = "*+-" - -orderedListStart :: OrgParser Int -orderedListStart = genericListStart orderedListMarker - -- Ordered list markers allowed in org-mode - where orderedListMarker = mappend <$> many1 digit <*> (pure <$> oneOf ".)") +-- Unindented lists are legal, but they can't use '*' bullets. +-- We return n to maintain compatibility with the generic listItem. +bullets :: Bool -> String +bullets unindented = if unindented then "+-" else "*+-" definitionListItem :: OrgParser Int -> OrgParser (F (Inlines, [Blocks])) @@ -1040,602 +921,6 @@ listContinuation markerLength = try $ <*> many blankline) where listLine = try $ indentWith markerLength *> anyLineNewline +-- | Parse any line, include the final newline in the output. anyLineNewline :: OrgParser String anyLineNewline = (++ "\n") <$> anyLine - - --- --- inline --- - -inline :: OrgParser (F Inlines) -inline = - choice [ whitespace - , linebreak - , cite - , footnote - , linkOrImage - , anchor - , inlineCodeBlock - , str - , endline - , emph - , strong - , strikeout - , underline - , code - , math - , displayMath - , verbatim - , subscript - , superscript - , inlineLaTeX - , smart - , symbol - ] <* (guard =<< newlinesCountWithinLimits) - "inline" - -parseInlines :: OrgParser (F Inlines) -parseInlines = trimInlinesF . mconcat <$> many1 inline - --- treat these as potentially non-text when parsing inline: -specialChars :: [Char] -specialChars = "\"$'()*+-,./:<=>[\\]^_{|}~" - - -whitespace :: OrgParser (F Inlines) -whitespace = pure B.space <$ skipMany1 spaceChar - <* updateLastPreCharPos - <* updateLastForbiddenCharPos - "whitespace" - -linebreak :: OrgParser (F Inlines) -linebreak = try $ pure B.linebreak <$ string "\\\\" <* skipSpaces <* newline - -str :: OrgParser (F Inlines) -str = return . B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") - <* updateLastStrPos - --- | An endline character that can be treated as a space, not a structural --- break. This should reflect the values of the Emacs variable --- @org-element-pagaraph-separate@. -endline :: OrgParser (F Inlines) -endline = try $ do - newline - notFollowedBy blankline - notFollowedBy' exampleLine - notFollowedBy' hline - notFollowedBy' noteMarker - notFollowedBy' tableStart - notFollowedBy' drawerStart - notFollowedBy' headerStart - notFollowedBy' metaLineStart - notFollowedBy' latexEnvStart - notFollowedBy' commentLineStart - notFollowedBy' bulletListStart - notFollowedBy' orderedListStart - decEmphasisNewlinesCount - guard =<< newlinesCountWithinLimits - updateLastPreCharPos - return . return $ B.softbreak - -cite :: OrgParser (F Inlines) -cite = try $ do - guardEnabled Ext_citations - (cs, raw) <- withRaw normalCite - return $ (flip B.cite (B.text raw)) <$> cs - -normalCite :: OrgParser (F [Citation]) -normalCite = try $ char '[' - *> skipSpaces - *> citeList - <* skipSpaces - <* char ']' - -citeList :: OrgParser (F [Citation]) -citeList = sequence <$> sepBy1 citation (try $ char ';' *> skipSpaces) - -citation :: OrgParser (F Citation) -citation = try $ do - pref <- prefix - (suppress_author, key) <- citeKey - suff <- suffix - return $ do - x <- pref - y <- suff - return $ Citation{ citationId = key - , citationPrefix = B.toList x - , citationSuffix = B.toList y - , citationMode = if suppress_author - then SuppressAuthor - else NormalCitation - , citationNoteNum = 0 - , citationHash = 0 - } - where - prefix = trimInlinesF . mconcat <$> - manyTill inline (char ']' <|> (']' <$ lookAhead citeKey)) - suffix = try $ do - hasSpace <- option False (notFollowedBy nonspaceChar >> return True) - skipSpaces - rest <- trimInlinesF . mconcat <$> - many (notFollowedBy (oneOf ";]") *> inline) - return $ if hasSpace - then (B.space <>) <$> rest - else rest - -footnote :: OrgParser (F Inlines) -footnote = try $ inlineNote <|> referencedNote - -inlineNote :: OrgParser (F Inlines) -inlineNote = try $ do - string "[fn:" - ref <- many alphaNum - char ':' - note <- fmap B.para . trimInlinesF . mconcat <$> many1Till inline (char ']') - when (not $ null ref) $ - addToNotesTable ("fn:" ++ ref, note) - return $ B.note <$> note - -referencedNote :: OrgParser (F Inlines) -referencedNote = try $ do - ref <- noteMarker - return $ do - notes <- asksF orgStateNotes' - case lookup ref notes of - Nothing -> return $ B.str $ "[" ++ ref ++ "]" - Just contents -> do - st <- askF - let contents' = runF contents st{ orgStateNotes' = [] } - return $ B.note contents' - -noteMarker :: OrgParser String -noteMarker = try $ do - char '[' - choice [ many1Till digit (char ']') - , (++) <$> string "fn:" - <*> many1Till (noneOf "\n\r\t ") (char ']') - ] - -linkOrImage :: OrgParser (F Inlines) -linkOrImage = explicitOrImageLink - <|> selflinkOrImage - <|> angleLink - <|> plainLink - "link or image" - -explicitOrImageLink :: OrgParser (F Inlines) -explicitOrImageLink = try $ do - char '[' - srcF <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget - title <- enclosedRaw (char '[') (char ']') - title' <- parseFromString (mconcat <$> many inline) title - char ']' - return $ do - src <- srcF - if isImageFilename title - then pure $ B.link src "" $ B.image title mempty mempty - else linkToInlinesF src =<< title' - -selflinkOrImage :: OrgParser (F Inlines) -selflinkOrImage = try $ do - src <- char '[' *> linkTarget <* char ']' - return $ linkToInlinesF src (B.str src) - -plainLink :: OrgParser (F Inlines) -plainLink = try $ do - (orig, src) <- uri - returnF $ B.link src "" (B.str orig) - -angleLink :: OrgParser (F Inlines) -angleLink = try $ do - char '<' - link <- plainLink - char '>' - return link - -selfTarget :: OrgParser String -selfTarget = try $ char '[' *> linkTarget <* char ']' - -linkTarget :: OrgParser String -linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]") - -possiblyEmptyLinkTarget :: OrgParser String -possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]") - -applyCustomLinkFormat :: String -> OrgParser (F String) -applyCustomLinkFormat link = do - let (linkType, rest) = break (== ':') link - return $ do - formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters - return $ maybe link ($ drop 1 rest) formatter - --- | Take a link and return a function which produces new inlines when given --- description inlines. -linkToInlinesF :: String -> Inlines -> F Inlines -linkToInlinesF linkStr = - case linkStr of - "" -> pure . B.link mempty "" -- wiki link (empty by convention) - ('#':_) -> pure . B.link linkStr "" -- document-local fraction - _ -> case cleanLinkString linkStr of - (Just cleanedLink) -> if isImageFilename cleanedLink - then const . pure $ B.image cleanedLink "" "" - else pure . B.link cleanedLink "" - Nothing -> internalLink linkStr -- other internal link - --- | Cleanup and canonicalize a string describing a link. Return @Nothing@ if --- the string does not appear to be a link. -cleanLinkString :: String -> Maybe String -cleanLinkString s = - case s of - '/':_ -> Just $ "file://" ++ s -- absolute path - '.':'/':_ -> Just s -- relative path - '.':'.':'/':_ -> Just s -- relative path - -- Relative path or URL (file schema) - 'f':'i':'l':'e':':':s' -> Just $ if ("//" `isPrefixOf` s') then s else s' - _ | isUrl s -> Just s -- URL - _ -> Nothing - where - isUrl :: String -> Bool - isUrl cs = - let (scheme, path) = break (== ':') cs - in all (\c -> isAlphaNum c || c `elem` (".-"::String)) scheme - && not (null path) - -isImageFilename :: String -> Bool -isImageFilename filename = - any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions && - (any (\x -> (x++":") `isPrefixOf` filename) protocols || - ':' `notElem` filename) - where - imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ] - protocols = [ "file", "http", "https" ] - -internalLink :: String -> Inlines -> F Inlines -internalLink link title = do - anchorB <- (link `elem`) <$> asksF orgStateAnchorIds - if anchorB - then return $ B.link ('#':link) "" title - else return $ B.emph title - --- | Parse an anchor like @<>@ and return an empty span with --- @anchor-id@ set as id. Legal anchors in org-mode are defined through --- @org-target-regexp@, which is fairly liberal. Since no link is created if --- @anchor-id@ contains spaces, we are more restrictive in what is accepted as --- an anchor. - -anchor :: OrgParser (F Inlines) -anchor = try $ do - anchorId <- parseAnchor - recordAnchorId anchorId - returnF $ B.spanWith (solidify anchorId, [], []) mempty - where - parseAnchor = string "<<" - *> many1 (noneOf "\t\n\r<>\"' ") - <* string ">>" - <* skipSpaces - --- | Replace every char but [a-zA-Z0-9_.-:] with a hypen '-'. This mirrors --- the org function @org-export-solidify-link-text@. - -solidify :: String -> String -solidify = map replaceSpecialChar - where replaceSpecialChar c - | isAlphaNum c = c - | c `elem` ("_.-:" :: String) = c - | otherwise = '-' - --- | Parses an inline code block and marks it as an babel block. -inlineCodeBlock :: OrgParser (F Inlines) -inlineCodeBlock = try $ do - string "src_" - lang <- many1 orgArgWordChar - opts <- option [] $ enclosedByPair '[' ']' inlineBlockOption - inlineCode <- enclosedByPair '{' '}' (noneOf "\n\r") - let attrClasses = [translateLang lang, rundocBlockClass] - let attrKeyVal = map toRundocAttrib (("language", lang) : opts) - returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode - -enclosedByPair :: Char -- ^ opening char - -> Char -- ^ closing char - -> OrgParser a -- ^ parser - -> OrgParser [a] -enclosedByPair s e p = char s *> many1Till p (char e) - -emph :: OrgParser (F Inlines) -emph = fmap B.emph <$> emphasisBetween '/' - -strong :: OrgParser (F Inlines) -strong = fmap B.strong <$> emphasisBetween '*' - -strikeout :: OrgParser (F Inlines) -strikeout = fmap B.strikeout <$> emphasisBetween '+' - --- There is no underline, so we use strong instead. -underline :: OrgParser (F Inlines) -underline = fmap B.strong <$> emphasisBetween '_' - -verbatim :: OrgParser (F Inlines) -verbatim = return . B.code <$> verbatimBetween '=' - -code :: OrgParser (F Inlines) -code = return . B.code <$> verbatimBetween '~' - -subscript :: OrgParser (F Inlines) -subscript = fmap B.subscript <$> try (char '_' *> subOrSuperExpr) - -superscript :: OrgParser (F Inlines) -superscript = fmap B.superscript <$> try (char '^' *> subOrSuperExpr) - -math :: OrgParser (F Inlines) -math = return . B.math <$> choice [ math1CharBetween '$' - , mathStringBetween '$' - , rawMathBetween "\\(" "\\)" - ] - -displayMath :: OrgParser (F Inlines) -displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]" - , rawMathBetween "$$" "$$" - ] - -updatePositions :: Char - -> OrgParser (Char) -updatePositions c = do - when (c `elem` emphasisPreChars) updateLastPreCharPos - when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos - return c - -symbol :: OrgParser (F Inlines) -symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions) - -emphasisBetween :: Char - -> OrgParser (F Inlines) -emphasisBetween c = try $ do - startEmphasisNewlinesCounting emphasisAllowedNewlines - res <- enclosedInlines (emphasisStart c) (emphasisEnd c) - isTopLevelEmphasis <- null . orgStateEmphasisCharStack <$> getState - when isTopLevelEmphasis - resetEmphasisNewlines - return res - -verbatimBetween :: Char - -> OrgParser String -verbatimBetween c = try $ - emphasisStart c *> - many1TillNOrLessNewlines 1 (noneOf "\n\r") (emphasisEnd c) - --- | Parses a raw string delimited by @c@ using Org's math rules -mathStringBetween :: Char - -> OrgParser String -mathStringBetween c = try $ do - mathStart c - body <- many1TillNOrLessNewlines mathAllowedNewlines - (noneOf (c:"\n\r")) - (lookAhead $ mathEnd c) - final <- mathEnd c - return $ body ++ [final] - --- | Parse a single character between @c@ using math rules -math1CharBetween :: Char - -> OrgParser String -math1CharBetween c = try $ do - char c - res <- noneOf $ c:mathForbiddenBorderChars - char c - eof <|> () <$ lookAhead (oneOf mathPostChars) - return [res] - -rawMathBetween :: String - -> String - -> OrgParser String -rawMathBetween s e = try $ string s *> manyTill anyChar (try $ string e) - --- | Parses the start (opening character) of emphasis -emphasisStart :: Char -> OrgParser Char -emphasisStart c = try $ do - guard =<< afterEmphasisPreChar - guard =<< notAfterString - char c - lookAhead (noneOf emphasisForbiddenBorderChars) - pushToInlineCharStack c - return c - --- | Parses the closing character of emphasis -emphasisEnd :: Char -> OrgParser Char -emphasisEnd c = try $ do - guard =<< notAfterForbiddenBorderChar - char c - eof <|> () <$ lookAhead acceptablePostChars - updateLastStrPos - popInlineCharStack - return c - where acceptablePostChars = - surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars) - -mathStart :: Char -> OrgParser Char -mathStart c = try $ - char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars)) - -mathEnd :: Char -> OrgParser Char -mathEnd c = try $ do - res <- noneOf (c:mathForbiddenBorderChars) - char c - eof <|> () <$ lookAhead (oneOf mathPostChars) - return res - - -enclosedInlines :: OrgParser a - -> OrgParser b - -> OrgParser (F Inlines) -enclosedInlines start end = try $ - trimInlinesF . mconcat <$> enclosed start end inline - -enclosedRaw :: OrgParser a - -> OrgParser b - -> OrgParser String -enclosedRaw start end = try $ - start *> (onSingleLine <|> spanningTwoLines) - where onSingleLine = try $ many1Till (noneOf "\n\r") end - spanningTwoLines = try $ - anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine - --- | Like many1Till, but parses at most @n+1@ lines. @p@ must not consume --- newlines. -many1TillNOrLessNewlines :: Int - -> OrgParser Char - -> OrgParser a - -> OrgParser String -many1TillNOrLessNewlines n p end = try $ - nMoreLines (Just n) mempty >>= oneOrMore - where - nMoreLines Nothing cs = return cs - nMoreLines (Just 0) cs = try $ (cs ++) <$> finalLine - nMoreLines k cs = try $ (final k cs <|> rest k cs) - >>= uncurry nMoreLines - final _ cs = (\x -> (Nothing, cs ++ x)) <$> try finalLine - rest m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> try (manyTill p newline) - finalLine = try $ manyTill p end - minus1 k = k - 1 - oneOrMore cs = guard (not $ null cs) *> return cs - --- Org allows customization of the way it reads emphasis. We use the defaults --- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components` --- for details). - --- | Chars allowed to occur before emphasis (spaces and newlines are ok, too) -emphasisPreChars :: [Char] -emphasisPreChars = "\t \"'({" - --- | Chars allowed at after emphasis -emphasisPostChars :: [Char] -emphasisPostChars = "\t\n !\"'),-.:;?\\}" - --- | Chars not allowed at the (inner) border of emphasis -emphasisForbiddenBorderChars :: [Char] -emphasisForbiddenBorderChars = "\t\n\r \"'," - --- | The maximum number of newlines within -emphasisAllowedNewlines :: Int -emphasisAllowedNewlines = 1 - --- LaTeX-style math: see `org-latex-regexps` for details - --- | Chars allowed after an inline ($...$) math statement -mathPostChars :: [Char] -mathPostChars = "\t\n \"'),-.:;?" - --- | Chars not allowed at the (inner) border of math -mathForbiddenBorderChars :: [Char] -mathForbiddenBorderChars = "\t\n\r ,;.$" - --- | Maximum number of newlines in an inline math statement -mathAllowedNewlines :: Int -mathAllowedNewlines = 2 - --- | Whether we are right behind a char allowed before emphasis -afterEmphasisPreChar :: OrgParser Bool -afterEmphasisPreChar = do - pos <- getPosition - lastPrePos <- orgStateLastPreCharPos <$> getState - return . fromMaybe True $ (== pos) <$> lastPrePos - --- | Whether the parser is right after a forbidden border char -notAfterForbiddenBorderChar :: OrgParser Bool -notAfterForbiddenBorderChar = do - pos <- getPosition - lastFBCPos <- orgStateLastForbiddenCharPos <$> getState - return $ lastFBCPos /= Just pos - --- | Read a sub- or superscript expression -subOrSuperExpr :: OrgParser (F Inlines) -subOrSuperExpr = try $ - choice [ id <$> charsInBalanced '{' '}' (noneOf "\n\r") - , enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r") - , simpleSubOrSuperString - ] >>= parseFromString (mconcat <$> many inline) - where enclosing (left, right) s = left : s ++ [right] - -simpleSubOrSuperString :: OrgParser String -simpleSubOrSuperString = try $ do - state <- getState - guard . exportSubSuperscripts . orgStateExportSettings $ state - choice [ string "*" - , mappend <$> option [] ((:[]) <$> oneOf "+-") - <*> many1 alphaNum - ] - -inlineLaTeX :: OrgParser (F Inlines) -inlineLaTeX = try $ do - cmd <- inlineLaTeXCommand - maybe mzero returnF $ - parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` parseAsInlineLaTeX cmd - where - parseAsMath :: String -> Maybe Inlines - parseAsMath cs = B.fromList <$> texMathToPandoc cs - - parseAsInlineLaTeX :: String -> Maybe Inlines - parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs - - parseAsMathMLSym :: String -> Maybe Inlines - parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs) - -- drop initial backslash and any trailing "{}" - where clean = dropWhileEnd (`elem` ("{}" :: String)) . drop 1 - - state :: ParserState - state = def{ stateOptions = def{ readerParseRaw = True }} - - texMathToPandoc :: String -> Maybe [Inline] - texMathToPandoc cs = (maybeRight $ readTeX cs) >>= writePandoc DisplayInline - -maybeRight :: Either a b -> Maybe b -maybeRight = either (const Nothing) Just - -inlineLaTeXCommand :: OrgParser String -inlineLaTeXCommand = try $ do - rest <- getInput - case runParser rawLaTeXInline def "source" rest of - Right (RawInline _ cs) -> do - -- drop any trailing whitespace, those are not be part of the command as - -- far as org mode is concerned. - let cmdNoSpc = dropWhileEnd isSpace cs - let len = length cmdNoSpc - count len anyChar - return cmdNoSpc - _ -> mzero - --- Taken from Data.OldList. -dropWhileEnd :: (a -> Bool) -> [a] -> [a] -dropWhileEnd p = foldr (\x xs -> if p x && null xs then [] else x : xs) [] - -smart :: OrgParser (F Inlines) -smart = do - getOption readerSmart >>= guard - doubleQuoted <|> singleQuoted <|> - choice (map (return <$>) [orgApostrophe, orgDash, orgEllipses]) - where - orgDash = dash <* updatePositions '-' - orgEllipses = ellipses <* updatePositions '.' - orgApostrophe = - (char '\'' <|> char '\8217') <* updateLastPreCharPos - <* updateLastForbiddenCharPos - *> return (B.str "\x2019") - -singleQuoted :: OrgParser (F Inlines) -singleQuoted = try $ do - singleQuoteStart - updatePositions '\'' - withQuoteContext InSingleQuote $ - fmap B.singleQuoted . trimInlinesF . mconcat <$> - many1Till inline (singleQuoteEnd <* updatePositions '\'') - --- doubleQuoted will handle regular double-quoted sections, as well --- as dialogues with an open double-quote without a close double-quote --- in the same paragraph. -doubleQuoted :: OrgParser (F Inlines) -doubleQuoted = try $ do - doubleQuoteStart - updatePositions '"' - contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) - (withQuoteContext InDoubleQuote $ (doubleQuoteEnd <* updateLastForbiddenCharPos) >> return - (fmap B.doubleQuoted . trimInlinesF $ contents)) - <|> (return $ return (B.str "\8220") <> contents) diff --git a/src/Text/Pandoc/Readers/Org/BlockStarts.hs b/src/Text/Pandoc/Readers/Org/BlockStarts.hs new file mode 100644 index 000000000..e4dc31342 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/BlockStarts.hs @@ -0,0 +1,112 @@ +{- +Copyright (C) 2014-2016 Albert Krewinkel + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org.Options + Copyright : Copyright (C) 2014-2016 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel + +Parsers for Org-mode inline elements. +-} +module Text.Pandoc.Readers.Org.BlockStarts + ( exampleLineStart + , hline + , noteMarker + , tableStart + , drawerStart + , headerStart + , metaLineStart + , latexEnvStart + , commentLineStart + , bulletListStart + , orderedListStart + ) where + +import Text.Pandoc.Readers.Org.Parsing + +-- | Horizontal Line (five -- dashes or more) +hline :: OrgParser () +hline = try $ do + skipSpaces + string "-----" + many (char '-') + skipSpaces + newline + return () + +-- | Read the start of a header line, return the header level +headerStart :: OrgParser Int +headerStart = try $ + (length <$> many1 (char '*')) <* many1 (char ' ') <* updateLastPreCharPos + +tableStart :: OrgParser Char +tableStart = try $ skipSpaces *> char '|' + +latexEnvStart :: OrgParser String +latexEnvStart = try $ do + skipSpaces *> string "\\begin{" + *> latexEnvName + <* string "}" + <* blankline + where + latexEnvName :: OrgParser String + latexEnvName = try $ mappend <$> many1 alphaNum <*> option "" (string "*") + + +-- | Parses bullet list marker. +bulletListStart :: OrgParser () +bulletListStart = try $ + choice + [ () <$ skipSpaces <* oneOf "+-" <* skipSpaces1 + , () <$ skipSpaces1 <* char '*' <* skipSpaces1 + ] + +genericListStart :: OrgParser String + -> OrgParser Int +genericListStart listMarker = try $ + (+) <$> (length <$> many spaceChar) + <*> (length <$> listMarker <* many1 spaceChar) + +orderedListStart :: OrgParser Int +orderedListStart = genericListStart orderedListMarker + -- Ordered list markers allowed in org-mode + where orderedListMarker = mappend <$> many1 digit <*> (pure <$> oneOf ".)") + +drawerStart :: OrgParser String +drawerStart = try $ + skipSpaces *> drawerName <* skipSpaces <* newline + where drawerName = char ':' *> manyTill nonspaceChar (char ':') + +metaLineStart :: OrgParser () +metaLineStart = try $ skipSpaces <* string "#+" + +commentLineStart :: OrgParser () +commentLineStart = try $ skipSpaces <* string "# " + +exampleLineStart :: OrgParser () +exampleLineStart = () <$ try (skipSpaces *> string ": ") + +noteMarker :: OrgParser String +noteMarker = try $ do + char '[' + choice [ many1Till digit (char ']') + , (++) <$> string "fn:" + <*> many1Till (noneOf "\n\r\t ") (char ']') + ] diff --git a/src/Text/Pandoc/Readers/Org/Inlines.hs b/src/Text/Pandoc/Readers/Org/Inlines.hs new file mode 100644 index 000000000..0c3840979 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/Inlines.hs @@ -0,0 +1,715 @@ +{-# LANGUAGE OverloadedStrings #-} +{- +Copyright (C) 2014-2016 Albert Krewinkel + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org.Options + Copyright : Copyright (C) 2014-2016 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel + +Parsers for Org-mode inline elements. +-} +module Text.Pandoc.Readers.Org.Inlines + ( inline + , addToNotesTable + , parseInlines + , isImageFilename + , linkTarget + ) where + +import Text.Pandoc.Readers.Org.BlockStarts +import Text.Pandoc.Readers.Org.ParserState +import Text.Pandoc.Readers.Org.Parsing + +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder ( Inlines ) +import Text.Pandoc.Definition +import Text.Pandoc.Compat.Monoid ( (<>) ) +import Text.Pandoc.Options +import Text.Pandoc.Readers.LaTeX ( inlineCommand, rawLaTeXInline ) +import Text.TeXMath ( readTeX, writePandoc, DisplayType(..) ) +import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap + +import Control.Arrow ( first ) +import Control.Monad ( guard, mplus, mzero, when ) +import Data.Char ( isAlphaNum, isSpace ) +import Data.List ( isPrefixOf, isSuffixOf ) +import Data.Maybe ( fromMaybe ) +import qualified Data.Map as M + +-- | Prefix used for Rundoc classes and arguments. +rundocPrefix :: String +rundocPrefix = "rundoc-" + +-- | The class-name used to mark rundoc blocks. +rundocBlockClass :: String +rundocBlockClass = rundocPrefix ++ "block" + +toRundocAttrib :: (String, String) -> (String, String) +toRundocAttrib = first ("rundoc-" ++) + +translateLang :: String -> String +translateLang "C" = "c" +translateLang "C++" = "cpp" +translateLang "emacs-lisp" = "commonlisp" -- emacs lisp is not supported +translateLang "js" = "javascript" +translateLang "lisp" = "commonlisp" +translateLang "R" = "r" +translateLang "sh" = "bash" +translateLang "sqlite" = "sql" +translateLang cs = cs + +-- +-- Functions acting on the parser state +-- +recordAnchorId :: String -> OrgParser () +recordAnchorId i = updateState $ \s -> + s{ orgStateAnchorIds = i : (orgStateAnchorIds s) } + +pushToInlineCharStack :: Char -> OrgParser () +pushToInlineCharStack c = updateState $ \s -> + s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s } + +popInlineCharStack :: OrgParser () +popInlineCharStack = updateState $ \s -> + s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s } + +surroundingEmphasisChar :: OrgParser [Char] +surroundingEmphasisChar = + take 1 . drop 1 . orgStateEmphasisCharStack <$> getState + +startEmphasisNewlinesCounting :: Int -> OrgParser () +startEmphasisNewlinesCounting maxNewlines = updateState $ \s -> + s{ orgStateEmphasisNewlines = Just maxNewlines } + +decEmphasisNewlinesCount :: OrgParser () +decEmphasisNewlinesCount = updateState $ \s -> + s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s } + +newlinesCountWithinLimits :: OrgParser Bool +newlinesCountWithinLimits = do + st <- getState + return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True + +resetEmphasisNewlines :: OrgParser () +resetEmphasisNewlines = updateState $ \s -> + s{ orgStateEmphasisNewlines = Nothing } + +addToNotesTable :: OrgNoteRecord -> OrgParser () +addToNotesTable note = do + oldnotes <- orgStateNotes' <$> getState + updateState $ \s -> s{ orgStateNotes' = note:oldnotes } + +-- | Parse a single Org-mode inline element +inline :: OrgParser (F Inlines) +inline = + choice [ whitespace + , linebreak + , cite + , footnote + , linkOrImage + , anchor + , inlineCodeBlock + , str + , endline + , emph + , strong + , strikeout + , underline + , code + , math + , displayMath + , verbatim + , subscript + , superscript + , inlineLaTeX + , smart + , symbol + ] <* (guard =<< newlinesCountWithinLimits) + "inline" + +parseInlines :: OrgParser (F Inlines) +parseInlines = trimInlinesF . mconcat <$> many1 inline + +-- treat these as potentially non-text when parsing inline: +specialChars :: [Char] +specialChars = "\"$'()*+-,./:<=>[\\]^_{|}~" + + +whitespace :: OrgParser (F Inlines) +whitespace = pure B.space <$ skipMany1 spaceChar + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + "whitespace" + +linebreak :: OrgParser (F Inlines) +linebreak = try $ pure B.linebreak <$ string "\\\\" <* skipSpaces <* newline + +str :: OrgParser (F Inlines) +str = return . B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") + <* updateLastStrPos + +-- | An endline character that can be treated as a space, not a structural +-- break. This should reflect the values of the Emacs variable +-- @org-element-pagaraph-separate@. +endline :: OrgParser (F Inlines) +endline = try $ do + newline + notFollowedBy blankline + notFollowedBy' exampleLineStart + notFollowedBy' hline + notFollowedBy' noteMarker + notFollowedBy' tableStart + notFollowedBy' drawerStart + notFollowedBy' headerStart + notFollowedBy' metaLineStart + notFollowedBy' latexEnvStart + notFollowedBy' commentLineStart + notFollowedBy' bulletListStart + notFollowedBy' orderedListStart + decEmphasisNewlinesCount + guard =<< newlinesCountWithinLimits + updateLastPreCharPos + return . return $ B.softbreak + +cite :: OrgParser (F Inlines) +cite = try $ do + guardEnabled Ext_citations + (cs, raw) <- withRaw normalCite + return $ (flip B.cite (B.text raw)) <$> cs + +normalCite :: OrgParser (F [Citation]) +normalCite = try $ char '[' + *> skipSpaces + *> citeList + <* skipSpaces + <* char ']' + +citeList :: OrgParser (F [Citation]) +citeList = sequence <$> sepBy1 citation (try $ char ';' *> skipSpaces) + +citation :: OrgParser (F Citation) +citation = try $ do + pref <- prefix + (suppress_author, key) <- citeKey + suff <- suffix + return $ do + x <- pref + y <- suff + return $ Citation{ citationId = key + , citationPrefix = B.toList x + , citationSuffix = B.toList y + , citationMode = if suppress_author + then SuppressAuthor + else NormalCitation + , citationNoteNum = 0 + , citationHash = 0 + } + where + prefix = trimInlinesF . mconcat <$> + manyTill inline (char ']' <|> (']' <$ lookAhead citeKey)) + suffix = try $ do + hasSpace <- option False (notFollowedBy nonspaceChar >> return True) + skipSpaces + rest <- trimInlinesF . mconcat <$> + many (notFollowedBy (oneOf ";]") *> inline) + return $ if hasSpace + then (B.space <>) <$> rest + else rest + +footnote :: OrgParser (F Inlines) +footnote = try $ inlineNote <|> referencedNote + +inlineNote :: OrgParser (F Inlines) +inlineNote = try $ do + string "[fn:" + ref <- many alphaNum + char ':' + note <- fmap B.para . trimInlinesF . mconcat <$> many1Till inline (char ']') + when (not $ null ref) $ + addToNotesTable ("fn:" ++ ref, note) + return $ B.note <$> note + +referencedNote :: OrgParser (F Inlines) +referencedNote = try $ do + ref <- noteMarker + return $ do + notes <- asksF orgStateNotes' + case lookup ref notes of + Nothing -> return $ B.str $ "[" ++ ref ++ "]" + Just contents -> do + st <- askF + let contents' = runF contents st{ orgStateNotes' = [] } + return $ B.note contents' + +linkOrImage :: OrgParser (F Inlines) +linkOrImage = explicitOrImageLink + <|> selflinkOrImage + <|> angleLink + <|> plainLink + "link or image" + +explicitOrImageLink :: OrgParser (F Inlines) +explicitOrImageLink = try $ do + char '[' + srcF <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget + title <- enclosedRaw (char '[') (char ']') + title' <- parseFromString (mconcat <$> many inline) title + char ']' + return $ do + src <- srcF + if isImageFilename title + then pure $ B.link src "" $ B.image title mempty mempty + else linkToInlinesF src =<< title' + +selflinkOrImage :: OrgParser (F Inlines) +selflinkOrImage = try $ do + src <- char '[' *> linkTarget <* char ']' + return $ linkToInlinesF src (B.str src) + +plainLink :: OrgParser (F Inlines) +plainLink = try $ do + (orig, src) <- uri + returnF $ B.link src "" (B.str orig) + +angleLink :: OrgParser (F Inlines) +angleLink = try $ do + char '<' + link <- plainLink + char '>' + return link + +linkTarget :: OrgParser String +linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]") + +possiblyEmptyLinkTarget :: OrgParser String +possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]") + +applyCustomLinkFormat :: String -> OrgParser (F String) +applyCustomLinkFormat link = do + let (linkType, rest) = break (== ':') link + return $ do + formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters + return $ maybe link ($ drop 1 rest) formatter + +-- | Take a link and return a function which produces new inlines when given +-- description inlines. +linkToInlinesF :: String -> Inlines -> F Inlines +linkToInlinesF linkStr = + case linkStr of + "" -> pure . B.link mempty "" -- wiki link (empty by convention) + ('#':_) -> pure . B.link linkStr "" -- document-local fraction + _ -> case cleanLinkString linkStr of + (Just cleanedLink) -> if isImageFilename cleanedLink + then const . pure $ B.image cleanedLink "" "" + else pure . B.link cleanedLink "" + Nothing -> internalLink linkStr -- other internal link + +-- | Cleanup and canonicalize a string describing a link. Return @Nothing@ if +-- the string does not appear to be a link. +cleanLinkString :: String -> Maybe String +cleanLinkString s = + case s of + '/':_ -> Just $ "file://" ++ s -- absolute path + '.':'/':_ -> Just s -- relative path + '.':'.':'/':_ -> Just s -- relative path + -- Relative path or URL (file schema) + 'f':'i':'l':'e':':':s' -> Just $ if ("//" `isPrefixOf` s') then s else s' + _ | isUrl s -> Just s -- URL + _ -> Nothing + where + isUrl :: String -> Bool + isUrl cs = + let (scheme, path) = break (== ':') cs + in all (\c -> isAlphaNum c || c `elem` (".-"::String)) scheme + && not (null path) + +isImageFilename :: String -> Bool +isImageFilename filename = + any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions && + (any (\x -> (x++":") `isPrefixOf` filename) protocols || + ':' `notElem` filename) + where + imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ] + protocols = [ "file", "http", "https" ] + +internalLink :: String -> Inlines -> F Inlines +internalLink link title = do + anchorB <- (link `elem`) <$> asksF orgStateAnchorIds + if anchorB + then return $ B.link ('#':link) "" title + else return $ B.emph title + +-- | Parse an anchor like @<>@ and return an empty span with +-- @anchor-id@ set as id. Legal anchors in org-mode are defined through +-- @org-target-regexp@, which is fairly liberal. Since no link is created if +-- @anchor-id@ contains spaces, we are more restrictive in what is accepted as +-- an anchor. + +anchor :: OrgParser (F Inlines) +anchor = try $ do + anchorId <- parseAnchor + recordAnchorId anchorId + returnF $ B.spanWith (solidify anchorId, [], []) mempty + where + parseAnchor = string "<<" + *> many1 (noneOf "\t\n\r<>\"' ") + <* string ">>" + <* skipSpaces + +-- | Replace every char but [a-zA-Z0-9_.-:] with a hypen '-'. This mirrors +-- the org function @org-export-solidify-link-text@. + +solidify :: String -> String +solidify = map replaceSpecialChar + where replaceSpecialChar c + | isAlphaNum c = c + | c `elem` ("_.-:" :: String) = c + | otherwise = '-' + +-- | Parses an inline code block and marks it as an babel block. +inlineCodeBlock :: OrgParser (F Inlines) +inlineCodeBlock = try $ do + string "src_" + lang <- many1 orgArgWordChar + opts <- option [] $ enclosedByPair '[' ']' inlineBlockOption + inlineCode <- enclosedByPair '{' '}' (noneOf "\n\r") + let attrClasses = [translateLang lang, rundocBlockClass] + let attrKeyVal = map toRundocAttrib (("language", lang) : opts) + returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode + where + inlineBlockOption :: OrgParser (String, String) + inlineBlockOption = try $ do + argKey <- orgArgKey + paramValue <- option "yes" orgInlineParamValue + return (argKey, paramValue) + + orgInlineParamValue :: OrgParser String + orgInlineParamValue = try $ + skipSpaces + *> notFollowedBy (char ':') + *> many1 (noneOf "\t\n\r ]") + <* skipSpaces + + + +enclosedByPair :: Char -- ^ opening char + -> Char -- ^ closing char + -> OrgParser a -- ^ parser + -> OrgParser [a] +enclosedByPair s e p = char s *> many1Till p (char e) + +emph :: OrgParser (F Inlines) +emph = fmap B.emph <$> emphasisBetween '/' + +strong :: OrgParser (F Inlines) +strong = fmap B.strong <$> emphasisBetween '*' + +strikeout :: OrgParser (F Inlines) +strikeout = fmap B.strikeout <$> emphasisBetween '+' + +-- There is no underline, so we use strong instead. +underline :: OrgParser (F Inlines) +underline = fmap B.strong <$> emphasisBetween '_' + +verbatim :: OrgParser (F Inlines) +verbatim = return . B.code <$> verbatimBetween '=' + +code :: OrgParser (F Inlines) +code = return . B.code <$> verbatimBetween '~' + +subscript :: OrgParser (F Inlines) +subscript = fmap B.subscript <$> try (char '_' *> subOrSuperExpr) + +superscript :: OrgParser (F Inlines) +superscript = fmap B.superscript <$> try (char '^' *> subOrSuperExpr) + +math :: OrgParser (F Inlines) +math = return . B.math <$> choice [ math1CharBetween '$' + , mathStringBetween '$' + , rawMathBetween "\\(" "\\)" + ] + +displayMath :: OrgParser (F Inlines) +displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]" + , rawMathBetween "$$" "$$" + ] + +updatePositions :: Char + -> OrgParser (Char) +updatePositions c = do + when (c `elem` emphasisPreChars) updateLastPreCharPos + when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos + return c + +symbol :: OrgParser (F Inlines) +symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions) + +emphasisBetween :: Char + -> OrgParser (F Inlines) +emphasisBetween c = try $ do + startEmphasisNewlinesCounting emphasisAllowedNewlines + res <- enclosedInlines (emphasisStart c) (emphasisEnd c) + isTopLevelEmphasis <- null . orgStateEmphasisCharStack <$> getState + when isTopLevelEmphasis + resetEmphasisNewlines + return res + +verbatimBetween :: Char + -> OrgParser String +verbatimBetween c = try $ + emphasisStart c *> + many1TillNOrLessNewlines 1 (noneOf "\n\r") (emphasisEnd c) + +-- | Parses a raw string delimited by @c@ using Org's math rules +mathStringBetween :: Char + -> OrgParser String +mathStringBetween c = try $ do + mathStart c + body <- many1TillNOrLessNewlines mathAllowedNewlines + (noneOf (c:"\n\r")) + (lookAhead $ mathEnd c) + final <- mathEnd c + return $ body ++ [final] + +-- | Parse a single character between @c@ using math rules +math1CharBetween :: Char + -> OrgParser String +math1CharBetween c = try $ do + char c + res <- noneOf $ c:mathForbiddenBorderChars + char c + eof <|> () <$ lookAhead (oneOf mathPostChars) + return [res] + +rawMathBetween :: String + -> String + -> OrgParser String +rawMathBetween s e = try $ string s *> manyTill anyChar (try $ string e) + +-- | Parses the start (opening character) of emphasis +emphasisStart :: Char -> OrgParser Char +emphasisStart c = try $ do + guard =<< afterEmphasisPreChar + guard =<< notAfterString + char c + lookAhead (noneOf emphasisForbiddenBorderChars) + pushToInlineCharStack c + return c + +-- | Parses the closing character of emphasis +emphasisEnd :: Char -> OrgParser Char +emphasisEnd c = try $ do + guard =<< notAfterForbiddenBorderChar + char c + eof <|> () <$ lookAhead acceptablePostChars + updateLastStrPos + popInlineCharStack + return c + where acceptablePostChars = + surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars) + +mathStart :: Char -> OrgParser Char +mathStart c = try $ + char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars)) + +mathEnd :: Char -> OrgParser Char +mathEnd c = try $ do + res <- noneOf (c:mathForbiddenBorderChars) + char c + eof <|> () <$ lookAhead (oneOf mathPostChars) + return res + + +enclosedInlines :: OrgParser a + -> OrgParser b + -> OrgParser (F Inlines) +enclosedInlines start end = try $ + trimInlinesF . mconcat <$> enclosed start end inline + +enclosedRaw :: OrgParser a + -> OrgParser b + -> OrgParser String +enclosedRaw start end = try $ + start *> (onSingleLine <|> spanningTwoLines) + where onSingleLine = try $ many1Till (noneOf "\n\r") end + spanningTwoLines = try $ + anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine + +-- | Like many1Till, but parses at most @n+1@ lines. @p@ must not consume +-- newlines. +many1TillNOrLessNewlines :: Int + -> OrgParser Char + -> OrgParser a + -> OrgParser String +many1TillNOrLessNewlines n p end = try $ + nMoreLines (Just n) mempty >>= oneOrMore + where + nMoreLines Nothing cs = return cs + nMoreLines (Just 0) cs = try $ (cs ++) <$> finalLine + nMoreLines k cs = try $ (final k cs <|> rest k cs) + >>= uncurry nMoreLines + final _ cs = (\x -> (Nothing, cs ++ x)) <$> try finalLine + rest m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> try (manyTill p newline) + finalLine = try $ manyTill p end + minus1 k = k - 1 + oneOrMore cs = guard (not $ null cs) *> return cs + +-- Org allows customization of the way it reads emphasis. We use the defaults +-- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components` +-- for details). + +-- | Chars allowed to occur before emphasis (spaces and newlines are ok, too) +emphasisPreChars :: [Char] +emphasisPreChars = "\t \"'({" + +-- | Chars allowed at after emphasis +emphasisPostChars :: [Char] +emphasisPostChars = "\t\n !\"'),-.:;?\\}" + +-- | Chars not allowed at the (inner) border of emphasis +emphasisForbiddenBorderChars :: [Char] +emphasisForbiddenBorderChars = "\t\n\r \"'," + +-- | The maximum number of newlines within +emphasisAllowedNewlines :: Int +emphasisAllowedNewlines = 1 + +-- LaTeX-style math: see `org-latex-regexps` for details + +-- | Chars allowed after an inline ($...$) math statement +mathPostChars :: [Char] +mathPostChars = "\t\n \"'),-.:;?" + +-- | Chars not allowed at the (inner) border of math +mathForbiddenBorderChars :: [Char] +mathForbiddenBorderChars = "\t\n\r ,;.$" + +-- | Maximum number of newlines in an inline math statement +mathAllowedNewlines :: Int +mathAllowedNewlines = 2 + +-- | Whether we are right behind a char allowed before emphasis +afterEmphasisPreChar :: OrgParser Bool +afterEmphasisPreChar = do + pos <- getPosition + lastPrePos <- orgStateLastPreCharPos <$> getState + return . fromMaybe True $ (== pos) <$> lastPrePos + +-- | Whether the parser is right after a forbidden border char +notAfterForbiddenBorderChar :: OrgParser Bool +notAfterForbiddenBorderChar = do + pos <- getPosition + lastFBCPos <- orgStateLastForbiddenCharPos <$> getState + return $ lastFBCPos /= Just pos + +-- | Read a sub- or superscript expression +subOrSuperExpr :: OrgParser (F Inlines) +subOrSuperExpr = try $ + choice [ id <$> charsInBalanced '{' '}' (noneOf "\n\r") + , enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r") + , simpleSubOrSuperString + ] >>= parseFromString (mconcat <$> many inline) + where enclosing (left, right) s = left : s ++ [right] + +simpleSubOrSuperString :: OrgParser String +simpleSubOrSuperString = try $ do + state <- getState + guard . exportSubSuperscripts . orgStateExportSettings $ state + choice [ string "*" + , mappend <$> option [] ((:[]) <$> oneOf "+-") + <*> many1 alphaNum + ] + +inlineLaTeX :: OrgParser (F Inlines) +inlineLaTeX = try $ do + cmd <- inlineLaTeXCommand + maybe mzero returnF $ + parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` parseAsInlineLaTeX cmd + where + parseAsMath :: String -> Maybe Inlines + parseAsMath cs = B.fromList <$> texMathToPandoc cs + + parseAsInlineLaTeX :: String -> Maybe Inlines + parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs + + parseAsMathMLSym :: String -> Maybe Inlines + parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs) + -- drop initial backslash and any trailing "{}" + where clean = dropWhileEnd (`elem` ("{}" :: String)) . drop 1 + + state :: ParserState + state = def{ stateOptions = def{ readerParseRaw = True }} + + texMathToPandoc :: String -> Maybe [Inline] + texMathToPandoc cs = (maybeRight $ readTeX cs) >>= writePandoc DisplayInline + +maybeRight :: Either a b -> Maybe b +maybeRight = either (const Nothing) Just + +inlineLaTeXCommand :: OrgParser String +inlineLaTeXCommand = try $ do + rest <- getInput + case runParser rawLaTeXInline def "source" rest of + Right (RawInline _ cs) -> do + -- drop any trailing whitespace, those are not be part of the command as + -- far as org mode is concerned. + let cmdNoSpc = dropWhileEnd isSpace cs + let len = length cmdNoSpc + count len anyChar + return cmdNoSpc + _ -> mzero + +-- Taken from Data.OldList. +dropWhileEnd :: (a -> Bool) -> [a] -> [a] +dropWhileEnd p = foldr (\x xs -> if p x && null xs then [] else x : xs) [] + +smart :: OrgParser (F Inlines) +smart = do + getOption readerSmart >>= guard + doubleQuoted <|> singleQuoted <|> + choice (map (return <$>) [orgApostrophe, orgDash, orgEllipses]) + where + orgDash = dash <* updatePositions '-' + orgEllipses = ellipses <* updatePositions '.' + orgApostrophe = + (char '\'' <|> char '\8217') <* updateLastPreCharPos + <* updateLastForbiddenCharPos + *> return (B.str "\x2019") + +singleQuoted :: OrgParser (F Inlines) +singleQuoted = try $ do + singleQuoteStart + updatePositions '\'' + withQuoteContext InSingleQuote $ + fmap B.singleQuoted . trimInlinesF . mconcat <$> + many1Till inline (singleQuoteEnd <* updatePositions '\'') + +-- doubleQuoted will handle regular double-quoted sections, as well +-- as dialogues with an open double-quote without a close double-quote +-- in the same paragraph. +doubleQuoted :: OrgParser (F Inlines) +doubleQuoted = try $ do + doubleQuoteStart + updatePositions '"' + contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) + (withQuoteContext InDoubleQuote $ (doubleQuoteEnd <* updateLastForbiddenCharPos) >> return + (fmap B.doubleQuoted . trimInlinesF $ contents)) + <|> (return $ return (B.str "\8220") <> contents) diff --git a/src/Text/Pandoc/Readers/Org/Parsing.hs b/src/Text/Pandoc/Readers/Org/Parsing.hs index efe2ae25f..9a1420645 100644 --- a/src/Text/Pandoc/Readers/Org/Parsing.hs +++ b/src/Text/Pandoc/Readers/Org/Parsing.hs @@ -34,10 +34,14 @@ module Text.Pandoc.Readers.Org.Parsing , blanklines , newline , parseFromString + , skipSpaces1 , inList , withContext , updateLastForbiddenCharPos , updateLastPreCharPos + , orgArgKey + , orgArgWord + , orgArgWordChar -- * Re-exports from Text.Pandoc.Parser , ParserContext (..) , many1Till @@ -133,6 +137,10 @@ parseFromString parser str' = do updateState $ \s -> s{ orgStateLastPreCharPos = oldLastPreCharPos } return result +-- | Skip one or more tab or space characters. +skipSpaces1 :: OrgParser () +skipSpaces1 = skipMany1 spaceChar + -- | Like @Text.Parsec.Char.newline@, but causes additional state changes. newline :: OrgParser Char newline = @@ -180,3 +188,14 @@ updateLastForbiddenCharPos = getPosition >>= \p -> updateLastPreCharPos :: OrgParser () updateLastPreCharPos = getPosition >>= \p -> updateState $ \s -> s{ orgStateLastPreCharPos = Just p} + +orgArgKey :: OrgParser String +orgArgKey = try $ + skipSpaces *> char ':' + *> many1 orgArgWordChar + +orgArgWord :: OrgParser String +orgArgWord = many1 orgArgWordChar + +orgArgWordChar :: OrgParser Char +orgArgWordChar = alphaNum <|> oneOf "-_" -- cgit v1.2.3