Org reader: extract inline parser to module

Inline parsing code is moved to a separate module. Parsers for block starts are extracted as well, as those are used in the `endline` parser. This is part of the Org-mode reader cleanup effort.
author: Albert Krewinkel <albert@zeitkraut.de> 2016-05-22 16:52:06 +0200
committer: Albert Krewinkel <albert@zeitkraut.de> 2016-05-25 22:54:45 +0200
commit: 39e8b4276e6d88d5cbb943d04c866dde9bf6473c (patch)
tree: f20d8b1e508c39524fe7084c50172861df4afad4 /src/Text/Pandoc/Readers/Org.hs
parent: a340c7249f8e19d36ee4a68663b4c97e0893292b (diff)
1 files changed, 41 insertions, 756 deletions
diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs
index fd811c078..605d2220e 100644
--- a/src/Text/Pandoc/Readers/Org.hs
+++ b/src/Text/Pandoc/Readers/Org.hs
@@ -1,5 +1,4 @@
 {-# LANGUAGE FlexibleContexts #-}
-{-# LANGUAGE OverloadedStrings #-}
 {-
 Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de>
 
@@ -29,6 +28,8 @@ Conversion of org-mode formatted plain text to 'Pandoc' document.
 -}
 module Text.Pandoc.Readers.Org ( readOrg ) where
 
+import           Text.Pandoc.Readers.Org.BlockStarts
+import           Text.Pandoc.Readers.Org.Inlines
 import           Text.Pandoc.Readers.Org.ParserState
 import           Text.Pandoc.Readers.Org.Parsing
 
@@ -38,19 +39,16 @@ import           Text.Pandoc.Definition
 import           Text.Pandoc.Compat.Monoid ((<>))
 import           Text.Pandoc.Error
 import           Text.Pandoc.Options
-import           Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline)
-import           Text.Pandoc.Shared (compactify', compactify'DL)
-import           Text.TeXMath (readTeX, writePandoc, DisplayType(..))
-import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap
+import           Text.Pandoc.Shared ( compactify', compactify'DL )
 
-import           Control.Arrow (first)
-import           Control.Monad (foldM, guard, mplus, mzero, when)
+import           Control.Arrow ( first )
+import           Control.Monad ( foldM, guard, mzero )
 import           Control.Monad.Reader ( runReader )
-import           Data.Char (isAlphaNum, isSpace, toLower, toUpper)
-import           Data.List ( foldl', intersperse, isPrefixOf, isSuffixOf )
+import           Data.Char ( toLower, toUpper)
+import           Data.List ( foldl', intersperse, isPrefixOf )
 import qualified Data.Map as M
 import           Data.Maybe ( fromMaybe, isNothing )
-import           Network.HTTP (urlEncode)
+import           Network.HTTP ( urlEncode )
 
 
 -- | Parse org-mode string and return a Pandoc document.
@@ -60,54 +58,6 @@ readOrg :: ReaderOptions -- ^ Reader options
 readOrg opts s = flip runReader def $ readWithM parseOrg def{ orgStateOptions = opts } (s ++ "\n\n")
 
 --
--- Functions acting on the parser state
---
-recordAnchorId :: String -> OrgParser ()
-recordAnchorId i = updateState $ \s ->
-  s{ orgStateAnchorIds = i : (orgStateAnchorIds s) }
-
-pushToInlineCharStack :: Char -> OrgParser ()
-pushToInlineCharStack c = updateState $ \s ->
-  s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s }
-
-popInlineCharStack :: OrgParser ()
-popInlineCharStack = updateState $ \s ->
-  s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s }
-
-surroundingEmphasisChar :: OrgParser [Char]
-surroundingEmphasisChar =
-  take 1 . drop 1 . orgStateEmphasisCharStack <$> getState
-
-startEmphasisNewlinesCounting :: Int -> OrgParser ()
-startEmphasisNewlinesCounting maxNewlines = updateState $ \s ->
-  s{ orgStateEmphasisNewlines = Just maxNewlines }
-
-decEmphasisNewlinesCount :: OrgParser ()
-decEmphasisNewlinesCount = updateState $ \s ->
-  s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s }
-
-newlinesCountWithinLimits :: OrgParser Bool
-newlinesCountWithinLimits = do
-  st <- getState
-  return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True
-
-resetEmphasisNewlines :: OrgParser ()
-resetEmphasisNewlines = updateState $ \s ->
-  s{ orgStateEmphasisNewlines = Nothing }
-
-addLinkFormat :: String
-              -> (String -> String)
-              -> OrgParser ()
-addLinkFormat key formatter = updateState $ \s ->
-  let fs = orgStateLinkFormatters s
-  in s{ orgStateLinkFormatters = M.insert key formatter fs }
-
-addToNotesTable :: OrgNoteRecord -> OrgParser ()
-addToNotesTable note = do
-  oldnotes <- orgStateNotes' <$> getState
-  updateState $ \s -> s{ orgStateNotes' = note:oldnotes }
-
---
 -- Export Settings
 --
 exportSetting :: OrgParser ()
@@ -259,7 +209,7 @@ block = choice [ mempty <$ blanklines
                , genericDrawer
                , specialLine
                , header
-               , return <$> hline
+               , horizontalRule
                , list
                , latexFragment
                , noteBlock
@@ -457,9 +407,6 @@ indentWith num = do
 
 type SwitchOption = (Char, Maybe String)
 
-orgArgWord :: OrgParser String
-orgArgWord = many1 orgArgWordChar
-
 -- | Parse code block arguments
 -- TODO: We currently don't handle switches.
 codeHeaderArgs :: OrgParser ([String], [(String, String)])
@@ -474,7 +421,10 @@ codeHeaderArgs = try $ do
          , map toRundocAttrib (("language", language) : parameters)
          )
     else ([ pandocLang ], parameters)
- where hasRundocParameters = not . null
+ where
+   hasRundocParameters = not . null
+   toRundocAttrib = first ("rundoc-" ++)
+
 
 switch :: OrgParser SwitchOption
 switch = try $ simpleSwitch <|> lineNumbersSwitch
@@ -508,17 +458,6 @@ blockOption = try $ do
   paramValue <- option "yes" orgParamValue
   return (argKey, paramValue)
 
-inlineBlockOption :: OrgParser (String, String)
-inlineBlockOption = try $ do
-  argKey <- orgArgKey
-  paramValue <- option "yes" orgInlineParamValue
-  return (argKey, paramValue)
-
-orgArgKey :: OrgParser String
-orgArgKey = try $
-  skipSpaces *> char ':'
-             *> many1 orgArgWordChar
-
 orgParamValue :: OrgParser String
 orgParamValue = try $
   skipSpaces
@@ -526,19 +465,6 @@ orgParamValue = try $
     *> many1 (noneOf "\t\n\r ")
     <* skipSpaces
 
-orgInlineParamValue :: OrgParser String
-orgInlineParamValue = try $
-  skipSpaces
-    *> notFollowedBy (char ':')
-    *> many1 (noneOf "\t\n\r ]")
-    <* skipSpaces
-
-orgArgWordChar :: OrgParser Char
-orgArgWordChar = alphaNum <|> oneOf "-_"
-
-toRundocAttrib :: (String, String) -> (String, String)
-toRundocAttrib = first ("rundoc-" ++)
-
 commaEscaped :: String -> String
 commaEscaped (',':cs@('*':_))     = cs
 commaEscaped (',':cs@('#':'+':_)) = cs
@@ -552,7 +478,10 @@ exampleCode :: String -> Blocks
 exampleCode = B.codeBlockWith ("", ["example"], [])
 
 exampleLine :: OrgParser String
-exampleLine = try $ skipSpaces *> string ": " *> anyLine
+exampleLine = try $ exampleLineStart *> anyLine
+
+horizontalRule :: OrgParser (F Blocks)
+horizontalRule = return B.horizontalRule <$ try hline
 
 
 --
@@ -582,11 +511,6 @@ genericDrawer = try $ do
   drawerDiv :: String -> F Blocks -> F Blocks
   drawerDiv drawerName = fmap $ B.divWith (mempty, [drawerName, "drawer"], mempty)
 
-drawerStart :: OrgParser String
-drawerStart = try $
-  skipSpaces *> drawerName <* skipSpaces <* newline
- where drawerName = char ':' *> manyTill nonspaceChar (char ':')
-
 drawerLine :: OrgParser String
 drawerLine = anyLine
 
@@ -639,31 +563,38 @@ figure = try $ do
   let attr       = (mempty, mempty, figKeyVals)
   return $ (B.para . B.imageWith attr src (withFigPrefix figName) <$> figCaption)
  where
+   withFigPrefix :: String -> String
    withFigPrefix cs =
      if "fig:" `isPrefixOf` cs
      then cs
      else "fig:" ++ cs
 
+   selfTarget :: OrgParser String
+   selfTarget = try $ char '[' *> linkTarget <* char ']'
+
+
 --
 -- Comments, Options and Metadata
 --
+
+addLinkFormat :: String
+              -> (String -> String)
+              -> OrgParser ()
+addLinkFormat key formatter = updateState $ \s ->
+  let fs = orgStateLinkFormatters s
+  in s{ orgStateLinkFormatters = M.insert key formatter fs }
+
 specialLine :: OrgParser (F Blocks)
 specialLine = fmap return . try $ metaLine <|> commentLine
 
-metaLine :: OrgParser Blocks
-metaLine = mempty <$ metaLineStart <* (optionLine <|> declarationLine)
-
 -- The order, in which blocks are tried, makes sure that we're not looking at
 -- the beginning of a block, so we don't need to check for it
-metaLineStart :: OrgParser ()
-metaLineStart = try $ skipSpaces <* string "#+"
+metaLine :: OrgParser Blocks
+metaLine = mempty <$ metaLineStart <* (optionLine <|> declarationLine)
 
 commentLine :: OrgParser Blocks
 commentLine = commentLineStart *> anyLine *> pure mempty
 
-commentLineStart :: OrgParser ()
-commentLineStart = try $ skipSpaces <* string "# "
-
 declarationLine :: OrgParser ()
 declarationLine = try $ do
   key <- metaKey
@@ -741,23 +672,6 @@ header = try $ do
           *> many1 tag
           <* skipSpaces
 
-headerStart :: OrgParser Int
-headerStart = try $
-  (length <$> many1 (char '*')) <* many1 (char ' ') <* updateLastPreCharPos
-
-
--- Don't use (or need) the reader wrapper here, we want hline to be
--- @show@able.  Otherwise we can't use it with @notFollowedBy'@.
-
--- | Horizontal Line (five -- dashes or more)
-hline :: OrgParser Blocks
-hline = try $ do
-  skipSpaces
-  string "-----"
-  many (char '-')
-  skipSpaces
-  newline
-  return B.horizontalRule
 
 --
 -- Tables
@@ -793,9 +707,6 @@ orgToPandocTable :: OrgTable
 orgToPandocTable (OrgTable aligns heads lns) caption =
   B.table caption (zip aligns $ repeat 0) heads lns
 
-tableStart :: OrgParser Char
-tableStart = try $ skipSpaces *> char '|'
-
 tableRows :: OrgParser [OrgTableRow]
 tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow)
 
@@ -899,25 +810,12 @@ latexFragment = try $ do
                               , "\\end{", e, "}\n"
                               ]
 
-latexEnvStart :: OrgParser String
-latexEnvStart = try $ do
-  skipSpaces *> string "\\begin{"
-             *> latexEnvName
-             <* string "}"
-             <* blankline
-
 latexEnd :: String -> OrgParser ()
 latexEnd envName = try $
   () <$ skipSpaces
      <* string ("\\end{" ++ envName ++ "}")
      <* blankline
 
--- | Parses a LaTeX environment name.
-latexEnvName :: OrgParser String
-latexEnvName = try $ do
-  mappend <$> many1 alphaNum
-          <*> option "" (string "*")
-
 
 --
 -- Footnote defintions
@@ -942,7 +840,7 @@ paraOrPlain = try $ do
   -- is directly followed by a list item, in which case the block is read as
   -- plain text.
   try (guard nl
-       *> notFollowedBy (inList *> (orderedListStart <|> bulletListStart))
+       *> notFollowedBy (inList *> (() <$ orderedListStart <|> bulletListStart))
        *> return (B.para <$> ils))
     <|>  (return (B.plain <$> ils))
 
@@ -971,38 +869,21 @@ orderedList :: OrgParser (F Blocks)
 orderedList = fmap B.orderedList . fmap compactify' . sequence
               <$> many1 (listItem orderedListStart)
 
-genericListStart :: OrgParser String
-                 -> OrgParser Int
-genericListStart listMarker = try $
-  (+) <$> (length <$> many spaceChar)
-      <*> (length <$> listMarker <* many1 spaceChar)
-
--- parses bullet list marker. maybe we know the indent level
-bulletListStart :: OrgParser Int
-bulletListStart = bulletListStart' Nothing
-
 bulletListStart' :: Maybe Int -> OrgParser Int
 -- returns length of bulletList prefix, inclusive of marker
 bulletListStart' Nothing  = do ind <- length <$> many spaceChar
-                               when (ind == 0) $ notFollowedBy (char '*')
-                               oneOf bullets
-                               many1 spaceChar
+                               oneOf (bullets $ ind == 0)
+                               skipSpaces1
                                return (ind + 1)
- -- Unindented lists are legal, but they can't use '*' bullets
- -- We return n to maintain compatibility with the generic listItem
 bulletListStart' (Just n) = do count (n-1) spaceChar
-                               when (n == 1) $ notFollowedBy (char '*')
-                               oneOf bullets
+                               oneOf (bullets $ n == 1)
                                many1 spaceChar
                                return n
 
-bullets :: String
-bullets = "*+-"
-
-orderedListStart :: OrgParser Int
-orderedListStart = genericListStart orderedListMarker
-  -- Ordered list markers allowed in org-mode
-  where orderedListMarker = mappend <$> many1 digit <*> (pure <$> oneOf ".)")
+-- Unindented lists are legal, but they can't use '*' bullets.
+-- We return n to maintain compatibility with the generic listItem.
+bullets :: Bool -> String
+bullets unindented = if unindented then "+-" else "*+-"
 
 definitionListItem :: OrgParser Int
                    -> OrgParser (F (Inlines, [Blocks]))
@@ -1040,602 +921,6 @@ listContinuation markerLength = try $
               <*> many blankline)
  where listLine = try $ indentWith markerLength *> anyLineNewline
 
+-- | Parse any line, include the final newline in the output.
 anyLineNewline :: OrgParser String
 anyLineNewline = (++ "\n") <$> anyLine
-
-
---
--- inline
---
-
-inline :: OrgParser (F Inlines)
-inline =
-  choice [ whitespace
-         , linebreak
-         , cite
-         , footnote
-         , linkOrImage
-         , anchor
-         , inlineCodeBlock
-         , str
-         , endline
-         , emph
-         , strong
-         , strikeout
-         , underline
-         , code
-         , math
-         , displayMath
-         , verbatim
-         , subscript
-         , superscript
-         , inlineLaTeX
-         , smart
-         , symbol
-         ] <* (guard =<< newlinesCountWithinLimits)
-  <?> "inline"
-
-parseInlines :: OrgParser (F Inlines)
-parseInlines = trimInlinesF . mconcat <$> many1 inline
-
--- treat these as potentially non-text when parsing inline:
-specialChars :: [Char]
-specialChars = "\"$'()*+-,./:<=>[\\]^_{|}~"
-
-
-whitespace :: OrgParser (F Inlines)
-whitespace = pure B.space <$ skipMany1 spaceChar
-                          <* updateLastPreCharPos
-                          <* updateLastForbiddenCharPos
-             <?> "whitespace"
-
-linebreak :: OrgParser (F Inlines)
-linebreak = try $ pure B.linebreak <$ string "\\\\" <* skipSpaces <* newline
-
-str :: OrgParser (F Inlines)
-str = return . B.str <$> many1 (noneOf $ specialChars ++ "\n\r ")
-      <* updateLastStrPos
-
--- | An endline character that can be treated as a space, not a structural
--- break.  This should reflect the values of the Emacs variable
--- @org-element-pagaraph-separate@.
-endline :: OrgParser (F Inlines)
-endline = try $ do
-  newline
-  notFollowedBy blankline
-  notFollowedBy' exampleLine
-  notFollowedBy' hline
-  notFollowedBy' noteMarker
-  notFollowedBy' tableStart
-  notFollowedBy' drawerStart
-  notFollowedBy' headerStart
-  notFollowedBy' metaLineStart
-  notFollowedBy' latexEnvStart
-  notFollowedBy' commentLineStart
-  notFollowedBy' bulletListStart
-  notFollowedBy' orderedListStart
-  decEmphasisNewlinesCount
-  guard =<< newlinesCountWithinLimits
-  updateLastPreCharPos
-  return . return $ B.softbreak
-
-cite :: OrgParser (F Inlines)
-cite = try $ do
-  guardEnabled Ext_citations
-  (cs, raw) <- withRaw normalCite
-  return $ (flip B.cite (B.text raw)) <$> cs
-
-normalCite :: OrgParser (F [Citation])
-normalCite = try $  char '['
-                 *> skipSpaces
-                 *> citeList
-                 <* skipSpaces
-                 <* char ']'
-
-citeList :: OrgParser (F [Citation])
-citeList = sequence <$> sepBy1 citation (try $ char ';' *> skipSpaces)
-
-citation :: OrgParser (F Citation)
-citation = try $ do
-  pref <- prefix
-  (suppress_author, key) <- citeKey
-  suff <- suffix
-  return $ do
-    x <- pref
-    y <- suff
-    return $ Citation{ citationId      = key
-                     , citationPrefix  = B.toList x
-                     , citationSuffix  = B.toList y
-                     , citationMode    = if suppress_author
-                                            then SuppressAuthor
-                                            else NormalCitation
-                     , citationNoteNum = 0
-                     , citationHash    = 0
-                     }
- where
-   prefix = trimInlinesF . mconcat <$>
-            manyTill inline (char ']' <|> (']' <$ lookAhead citeKey))
-   suffix = try $ do
-     hasSpace <- option False (notFollowedBy nonspaceChar >> return True)
-     skipSpaces
-     rest <- trimInlinesF . mconcat <$>
-             many (notFollowedBy (oneOf ";]") *> inline)
-     return $ if hasSpace
-              then (B.space <>) <$> rest
-              else rest
-
-footnote :: OrgParser (F Inlines)
-footnote = try $ inlineNote <|> referencedNote
-
-inlineNote :: OrgParser (F Inlines)
-inlineNote = try $ do
-  string "[fn:"
-  ref <- many alphaNum
-  char ':'
-  note <- fmap B.para . trimInlinesF . mconcat <$> many1Till inline (char ']')
-  when (not $ null ref) $
-       addToNotesTable ("fn:" ++ ref, note)
-  return $ B.note <$> note
-
-referencedNote :: OrgParser (F Inlines)
-referencedNote = try $ do
-  ref <- noteMarker
-  return $ do
-    notes <- asksF orgStateNotes'
-    case lookup ref notes of
-      Nothing   -> return $ B.str $ "[" ++ ref ++ "]"
-      Just contents  -> do
-        st <- askF
-        let contents' = runF contents st{ orgStateNotes' = [] }
-        return $ B.note contents'
-
-noteMarker :: OrgParser String
-noteMarker = try $ do
-  char '['
-  choice [ many1Till digit (char ']')
-         , (++) <$> string "fn:"
-                <*> many1Till (noneOf "\n\r\t ") (char ']')
-         ]
-
-linkOrImage :: OrgParser (F Inlines)
-linkOrImage = explicitOrImageLink
-              <|> selflinkOrImage
-              <|> angleLink
-              <|> plainLink
-              <?> "link or image"
-
-explicitOrImageLink :: OrgParser (F Inlines)
-explicitOrImageLink = try $ do
-  char '['
-  srcF   <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget
-  title  <- enclosedRaw (char '[') (char ']')
-  title' <- parseFromString (mconcat <$> many inline) title
-  char ']'
-  return $ do
-    src <- srcF
-    if isImageFilename title
-      then pure $ B.link src "" $ B.image title mempty mempty
-      else linkToInlinesF src =<< title'
-
-selflinkOrImage :: OrgParser (F Inlines)
-selflinkOrImage = try $ do
-  src <- char '[' *> linkTarget <* char ']'
-  return $ linkToInlinesF src (B.str src)
-
-plainLink :: OrgParser (F Inlines)
-plainLink = try $ do
-  (orig, src) <- uri
-  returnF $ B.link src "" (B.str orig)
-
-angleLink :: OrgParser (F Inlines)
-angleLink = try $ do
-  char '<'
-  link <- plainLink
-  char '>'
-  return link
-
-selfTarget :: OrgParser String
-selfTarget = try $ char '[' *> linkTarget <* char ']'
-
-linkTarget :: OrgParser String
-linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]")
-
-possiblyEmptyLinkTarget :: OrgParser String
-possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]")
-
-applyCustomLinkFormat :: String -> OrgParser (F String)
-applyCustomLinkFormat link = do
-  let (linkType, rest) = break (== ':') link
-  return $ do
-    formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters
-    return $ maybe link ($ drop 1 rest) formatter
-
--- | Take a link and return a function which produces new inlines when given
--- description inlines.
-linkToInlinesF :: String -> Inlines -> F Inlines
-linkToInlinesF linkStr =
-  case linkStr of
-    ""      -> pure . B.link mempty ""       -- wiki link (empty by convention)
-    ('#':_) -> pure . B.link linkStr ""      -- document-local fraction
-    _       -> case cleanLinkString linkStr of
-                 (Just cleanedLink) -> if isImageFilename cleanedLink
-                                       then const . pure $ B.image cleanedLink "" ""
-                                       else pure . B.link cleanedLink ""
-                 Nothing -> internalLink linkStr  -- other internal link
-
--- | Cleanup and canonicalize a string describing a link.  Return @Nothing@ if
--- the string does not appear to be a link.
-cleanLinkString :: String -> Maybe String
-cleanLinkString s =
-  case s of
-    '/':_                  -> Just $ "file://" ++ s  -- absolute path
-    '.':'/':_              -> Just s                 -- relative path
-    '.':'.':'/':_          -> Just s                 -- relative path
-    -- Relative path or URL (file schema)
-    'f':'i':'l':'e':':':s' -> Just $ if ("//" `isPrefixOf` s') then s else s'
-    _ | isUrl s            -> Just s                 -- URL
-    _                      -> Nothing
- where
-   isUrl :: String -> Bool
-   isUrl cs =
-     let (scheme, path) = break (== ':') cs
-     in all (\c -> isAlphaNum c || c `elem` (".-"::String)) scheme
-          && not (null path)
-
-isImageFilename :: String -> Bool
-isImageFilename filename =
-  any (\x -> ('.':x)  `isSuffixOf` filename) imageExtensions &&
-  (any (\x -> (x++":") `isPrefixOf` filename) protocols ||
-   ':' `notElem` filename)
- where
-   imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ]
-   protocols = [ "file", "http", "https" ]
-
-internalLink :: String -> Inlines -> F Inlines
-internalLink link title = do
-  anchorB <- (link `elem`) <$> asksF orgStateAnchorIds
-  if anchorB
-    then return $ B.link ('#':link) "" title
-    else return $ B.emph title
-
--- | Parse an anchor like @<<anchor-id>>@ and return an empty span with
--- @anchor-id@ set as id.  Legal anchors in org-mode are defined through
--- @org-target-regexp@, which is fairly liberal.  Since no link is created if
--- @anchor-id@ contains spaces, we are more restrictive in what is accepted as
--- an anchor.
-
-anchor :: OrgParser (F Inlines)
-anchor =  try $ do
-  anchorId <- parseAnchor
-  recordAnchorId anchorId
-  returnF $ B.spanWith (solidify anchorId, [], []) mempty
- where
-       parseAnchor = string "<<"
-                     *> many1 (noneOf "\t\n\r<>\"' ")
-                     <* string ">>"
-                     <* skipSpaces
-
--- | Replace every char but [a-zA-Z0-9_.-:] with a hypen '-'.  This mirrors
--- the org function @org-export-solidify-link-text@.
-
-solidify :: String -> String
-solidify = map replaceSpecialChar
- where replaceSpecialChar c
-           | isAlphaNum c    = c
-           | c `elem` ("_.-:" :: String) = c
-           | otherwise       = '-'
-
--- | Parses an inline code block and marks it as an babel block.
-inlineCodeBlock :: OrgParser (F Inlines)
-inlineCodeBlock = try $ do
-  string "src_"
-  lang <- many1 orgArgWordChar
-  opts <- option [] $ enclosedByPair '[' ']' inlineBlockOption
-  inlineCode <- enclosedByPair '{' '}' (noneOf "\n\r")
-  let attrClasses = [translateLang lang, rundocBlockClass]
-  let attrKeyVal  = map toRundocAttrib (("language", lang) : opts)
-  returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode
-
-enclosedByPair :: Char          -- ^ opening char
-               -> Char          -- ^ closing char
-               -> OrgParser a   -- ^ parser
-               -> OrgParser [a]
-enclosedByPair s e p = char s *> many1Till p (char e)
-
-emph      :: OrgParser (F Inlines)
-emph      = fmap B.emph         <$> emphasisBetween '/'
-
-strong    :: OrgParser (F Inlines)
-strong    = fmap B.strong       <$> emphasisBetween '*'
-
-strikeout :: OrgParser (F Inlines)
-strikeout = fmap B.strikeout    <$> emphasisBetween '+'
-
--- There is no underline, so we use strong instead.
-underline :: OrgParser (F Inlines)
-underline = fmap B.strong       <$> emphasisBetween '_'
-
-verbatim  :: OrgParser (F Inlines)
-verbatim  = return . B.code     <$> verbatimBetween '='
-
-code      :: OrgParser (F Inlines)
-code      = return . B.code     <$> verbatimBetween '~'
-
-subscript   :: OrgParser (F Inlines)
-subscript   = fmap B.subscript   <$> try (char '_' *> subOrSuperExpr)
-
-superscript :: OrgParser (F Inlines)
-superscript = fmap B.superscript <$> try (char '^' *> subOrSuperExpr)
-
-math      :: OrgParser (F Inlines)
-math      = return . B.math      <$> choice [ math1CharBetween '$'
-                                            , mathStringBetween '$'
-                                            , rawMathBetween "\\(" "\\)"
-                                            ]
-
-displayMath :: OrgParser (F Inlines)
-displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]"
-                                                , rawMathBetween "$$"  "$$"
-                                                ]
-
-updatePositions :: Char
-                -> OrgParser (Char)
-updatePositions c = do
-  when (c `elem` emphasisPreChars) updateLastPreCharPos
-  when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos
-  return c
-
-symbol :: OrgParser (F Inlines)
-symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions)
-
-emphasisBetween :: Char
-                -> OrgParser (F Inlines)
-emphasisBetween c = try $ do
-  startEmphasisNewlinesCounting emphasisAllowedNewlines
-  res <- enclosedInlines (emphasisStart c) (emphasisEnd c)
-  isTopLevelEmphasis <- null . orgStateEmphasisCharStack <$> getState
-  when isTopLevelEmphasis
-       resetEmphasisNewlines
-  return res
-
-verbatimBetween :: Char
-                -> OrgParser String
-verbatimBetween c = try $
-  emphasisStart c *>
-  many1TillNOrLessNewlines 1 (noneOf "\n\r") (emphasisEnd c)
-
--- | Parses a raw string delimited by @c@ using Org's math rules
-mathStringBetween :: Char
-                  -> OrgParser String
-mathStringBetween c = try $ do
-  mathStart c
-  body <- many1TillNOrLessNewlines mathAllowedNewlines
-                                   (noneOf (c:"\n\r"))
-                                   (lookAhead $ mathEnd c)
-  final <- mathEnd c
-  return $ body ++ [final]
-
--- | Parse a single character between @c@ using math rules
-math1CharBetween :: Char
-                -> OrgParser String
-math1CharBetween c = try $ do
-  char c
-  res <- noneOf $ c:mathForbiddenBorderChars
-  char c
-  eof <|> () <$ lookAhead (oneOf mathPostChars)
-  return [res]
-
-rawMathBetween :: String
-               -> String
-               -> OrgParser String
-rawMathBetween s e = try $ string s *> manyTill anyChar (try $ string e)
-
--- | Parses the start (opening character) of emphasis
-emphasisStart :: Char -> OrgParser Char
-emphasisStart c = try $ do
-  guard =<< afterEmphasisPreChar
-  guard =<< notAfterString
-  char c
-  lookAhead (noneOf emphasisForbiddenBorderChars)
-  pushToInlineCharStack c
-  return c
-
--- | Parses the closing character of emphasis
-emphasisEnd :: Char -> OrgParser Char
-emphasisEnd c = try $ do
-  guard =<< notAfterForbiddenBorderChar
-  char c
-  eof <|> () <$ lookAhead acceptablePostChars
-  updateLastStrPos
-  popInlineCharStack
-  return c
- where acceptablePostChars =
-           surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars)
-
-mathStart :: Char -> OrgParser Char
-mathStart c = try $
-  char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars))
-
-mathEnd :: Char -> OrgParser Char
-mathEnd c = try $ do
-  res <- noneOf (c:mathForbiddenBorderChars)
-  char c
-  eof <|> () <$ lookAhead (oneOf mathPostChars)
-  return res
-
-
-enclosedInlines :: OrgParser a
-                -> OrgParser b
-                -> OrgParser (F Inlines)
-enclosedInlines start end = try $
-  trimInlinesF . mconcat <$> enclosed start end inline
-
-enclosedRaw :: OrgParser a
-            -> OrgParser b
-            -> OrgParser String
-enclosedRaw start end = try $
-  start *> (onSingleLine <|> spanningTwoLines)
- where onSingleLine = try $ many1Till (noneOf "\n\r") end
-       spanningTwoLines = try $
-         anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine
-
--- | Like many1Till, but parses at most @n+1@ lines.  @p@ must not consume
---   newlines.
-many1TillNOrLessNewlines :: Int
-                         -> OrgParser Char
-                         -> OrgParser a
-                         -> OrgParser String
-many1TillNOrLessNewlines n p end = try $
-  nMoreLines (Just n) mempty >>= oneOrMore
- where
-   nMoreLines Nothing  cs = return cs
-   nMoreLines (Just 0) cs = try $ (cs ++) <$> finalLine
-   nMoreLines k        cs = try $ (final k cs <|> rest k cs)
-                                  >>= uncurry nMoreLines
-   final _ cs = (\x -> (Nothing,      cs ++ x)) <$> try finalLine
-   rest  m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> try (manyTill p newline)
-   finalLine = try $ manyTill p end
-   minus1 k = k - 1
-   oneOrMore cs = guard (not $ null cs) *> return cs
-
--- Org allows customization of the way it reads emphasis.  We use the defaults
--- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components`
--- for details).
-
--- | Chars allowed to occur before emphasis (spaces and newlines are ok, too)
-emphasisPreChars :: [Char]
-emphasisPreChars = "\t \"'({"
-
--- | Chars allowed at after emphasis
-emphasisPostChars :: [Char]
-emphasisPostChars = "\t\n !\"'),-.:;?\\}"
-
--- | Chars not allowed at the (inner) border of emphasis
-emphasisForbiddenBorderChars :: [Char]
-emphasisForbiddenBorderChars = "\t\n\r \"',"
-
--- | The maximum number of newlines within
-emphasisAllowedNewlines :: Int
-emphasisAllowedNewlines = 1
-
--- LaTeX-style math: see `org-latex-regexps` for details
-
--- | Chars allowed after an inline ($...$) math statement
-mathPostChars :: [Char]
-mathPostChars = "\t\n \"'),-.:;?"
-
--- | Chars not allowed at the (inner) border of math
-mathForbiddenBorderChars :: [Char]
-mathForbiddenBorderChars = "\t\n\r ,;.$"
-
--- | Maximum number of newlines in an inline math statement
-mathAllowedNewlines :: Int
-mathAllowedNewlines = 2
-
--- | Whether we are right behind a char allowed before emphasis
-afterEmphasisPreChar :: OrgParser Bool
-afterEmphasisPreChar = do
-  pos <- getPosition
-  lastPrePos <- orgStateLastPreCharPos <$> getState
-  return . fromMaybe True $ (== pos) <$> lastPrePos
-
--- | Whether the parser is right after a forbidden border char
-notAfterForbiddenBorderChar :: OrgParser Bool
-notAfterForbiddenBorderChar = do
-  pos <- getPosition
-  lastFBCPos <- orgStateLastForbiddenCharPos <$> getState
-  return $ lastFBCPos /= Just pos
-
--- | Read a sub- or superscript expression
-subOrSuperExpr :: OrgParser (F Inlines)
-subOrSuperExpr = try $
-  choice [ id                   <$> charsInBalanced '{' '}' (noneOf "\n\r")
-         , enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r")
-         , simpleSubOrSuperString
-         ] >>= parseFromString (mconcat <$> many inline)
- where enclosing (left, right) s = left : s ++ [right]
-
-simpleSubOrSuperString :: OrgParser String
-simpleSubOrSuperString = try $ do
-  state <- getState
-  guard . exportSubSuperscripts . orgStateExportSettings $ state
-  choice [ string "*"
-         , mappend <$> option [] ((:[]) <$> oneOf "+-")
-                   <*> many1 alphaNum
-         ]
-
-inlineLaTeX :: OrgParser (F Inlines)
-inlineLaTeX = try $ do
-  cmd <- inlineLaTeXCommand
-  maybe mzero returnF $
-     parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` parseAsInlineLaTeX cmd
- where
-   parseAsMath :: String -> Maybe Inlines
-   parseAsMath cs = B.fromList <$> texMathToPandoc cs
-
-   parseAsInlineLaTeX :: String -> Maybe Inlines
-   parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs
-
-   parseAsMathMLSym :: String -> Maybe Inlines
-   parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs)
-    -- drop initial backslash and any trailing "{}"
-    where clean = dropWhileEnd (`elem` ("{}" :: String)) . drop 1
-
-   state :: ParserState
-   state = def{ stateOptions = def{ readerParseRaw = True }}
-
-   texMathToPandoc :: String -> Maybe [Inline]
-   texMathToPandoc cs = (maybeRight $ readTeX cs) >>= writePandoc DisplayInline
-
-maybeRight :: Either a b -> Maybe b
-maybeRight = either (const Nothing) Just
-
-inlineLaTeXCommand :: OrgParser String
-inlineLaTeXCommand = try $ do
-  rest <- getInput
-  case runParser rawLaTeXInline def "source" rest of
-    Right (RawInline _ cs) -> do
-      -- drop any trailing whitespace, those are not be part of the command as
-      -- far as org mode is concerned.
-      let cmdNoSpc = dropWhileEnd isSpace cs
-      let len = length cmdNoSpc
-      count len anyChar
-      return cmdNoSpc
-    _ -> mzero
-
--- Taken from Data.OldList.
-dropWhileEnd :: (a -> Bool) -> [a] -> [a]
-dropWhileEnd p = foldr (\x xs -> if p x && null xs then [] else x : xs) []
-
-smart :: OrgParser (F Inlines)
-smart = do
-  getOption readerSmart >>= guard
-  doubleQuoted <|> singleQuoted <|>
-    choice (map (return <$>) [orgApostrophe, orgDash, orgEllipses])
-  where
-    orgDash = dash <* updatePositions '-'
-    orgEllipses = ellipses <* updatePositions '.'
-    orgApostrophe =
-          (char '\'' <|> char '\8217') <* updateLastPreCharPos
-                                       <* updateLastForbiddenCharPos
-                                       *> return (B.str "\x2019")
-
-singleQuoted :: OrgParser (F Inlines)
-singleQuoted = try $ do
-  singleQuoteStart
-  updatePositions '\''
-  withQuoteContext InSingleQuote $
-    fmap B.singleQuoted . trimInlinesF . mconcat <$>
-      many1Till inline (singleQuoteEnd <* updatePositions '\'')
-
--- doubleQuoted will handle regular double-quoted sections, as well
--- as dialogues with an open double-quote without a close double-quote
--- in the same paragraph.
-doubleQuoted :: OrgParser (F Inlines)
-doubleQuoted = try $ do
-  doubleQuoteStart
-  updatePositions '"'
-  contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline)
-  (withQuoteContext InDoubleQuote $ (doubleQuoteEnd <* updateLastForbiddenCharPos) >> return
-       (fmap B.doubleQuoted . trimInlinesF $ contents))
-   <|> (return $ return (B.str "\8220") <> contents)
author	Albert Krewinkel <albert@zeitkraut.de>	2016-05-22 16:52:06 +0200
committer	Albert Krewinkel <albert@zeitkraut.de>	2016-05-25 22:54:45 +0200
commit	39e8b4276e6d88d5cbb943d04c866dde9bf6473c (patch)
tree	f20d8b1e508c39524fe7084c50172861df4afad4 /src/Text/Pandoc/Readers/Org.hs
parent	a340c7249f8e19d36ee4a68663b4c97e0893292b (diff)