summaryrefslogtreecommitdiff
path: root/src/Text
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2016-01-22 10:16:47 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2016-01-22 10:16:47 -0800
commit20170c328f12fc0214c2c50d14c8f7d03ee05e70 (patch)
treeeba75d165fe6aec92b5bcd5f63ec89b328478b1d /src/Text
parent2a2e3d99d226636166859e63d5259258ba759d5c (diff)
Changed type of Shared.uniqueIdent argument from [String] to Set String.
This avoids performance problems in documents with many identically named headers. Closes #2671.
Diffstat (limited to 'src/Text')
-rw-r--r--src/Text/Pandoc/Parsing.hs12
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs7
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs6
-rw-r--r--src/Text/Pandoc/Readers/MediaWiki.hs5
-rw-r--r--src/Text/Pandoc/Readers/Odt/ContentReader.hs3
-rw-r--r--src/Text/Pandoc/Readers/Org.hs5
-rw-r--r--src/Text/Pandoc/Shared.hs6
-rw-r--r--src/Text/Pandoc/Writers/Docx.hs7
-rw-r--r--src/Text/Pandoc/Writers/EPUB.hs5
-rw-r--r--src/Text/Pandoc/Writers/Markdown.hs7
-rw-r--r--src/Text/Pandoc/Writers/Texinfo.hs7
11 files changed, 39 insertions, 31 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 16fe75ed5..325231846 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -915,7 +915,7 @@ data ParserState = ParserState
stateMeta' :: F Meta, -- ^ Document metadata
stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used
stateHeaders :: M.Map Inlines String, -- ^ List of headers and ids (used for implicit ref links)
- stateIdentifiers :: [String], -- ^ List of header identifiers used
+ stateIdentifiers :: Set.Set String, -- ^ Header identifiers used
stateNextExample :: Int, -- ^ Number of next example
stateExamples :: M.Map String Int, -- ^ Map from example labels to numbers
stateHasChapters :: Bool, -- ^ True if \chapter encountered
@@ -973,8 +973,8 @@ instance HasHeaderMap ParserState where
updateHeaderMap f st = st{ stateHeaders = f $ stateHeaders st }
class HasIdentifierList st where
- extractIdentifierList :: st -> [String]
- updateIdentifierList :: ([String] -> [String]) -> st -> st
+ extractIdentifierList :: st -> Set.Set String
+ updateIdentifierList :: (Set.Set String -> Set.Set String) -> st -> st
instance HasIdentifierList ParserState where
extractIdentifierList = stateIdentifiers
@@ -1013,7 +1013,7 @@ defaultParserState =
stateMeta' = return nullMeta,
stateHeaderTable = [],
stateHeaders = M.empty,
- stateIdentifiers = [],
+ stateIdentifiers = Set.empty,
stateNextExample = 1,
stateExamples = M.empty,
stateHasChapters = False,
@@ -1092,8 +1092,8 @@ registerHeader (ident,classes,kvs) header' = do
let id'' = if Ext_ascii_identifiers `Set.member` exts
then catMaybes $ map toAsciiChar id'
else id'
- updateState $ updateIdentifierList $
- if id' == id'' then (id' :) else ([id', id''] ++)
+ updateState $ updateIdentifierList $ Set.insert id'
+ updateState $ updateIdentifierList $ Set.insert id''
updateState $ updateHeaderMap $ insert' header' id'
return (id'',classes,kvs)
else do
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index 44f67ce75..1b3269136 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -89,6 +89,7 @@ import Text.TeXMath (writeTeX)
import Data.Default (Default)
import qualified Data.ByteString.Lazy as B
import qualified Data.Map as M
+import qualified Data.Set as Set
import Control.Monad.Reader
import Control.Monad.State
import Data.Sequence (ViewL(..), viewl)
@@ -350,7 +351,7 @@ parPartToInlines (BookMark _ anchor) =
-- avoid an extra pass.
let newAnchor =
if not inHdrBool && anchor `elem` (M.elems anchorMap)
- then uniqueIdent [Str anchor] (M.elems anchorMap)
+ then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap)
else anchor
unless inHdrBool
(modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap})
@@ -393,7 +394,7 @@ makeHeaderAnchor' (Header n (_, classes, kvs) ils)
| (c:cs) <- filter isAnchorSpan ils
, (Span (ident, ["anchor"], _) _) <- c = do
hdrIDMap <- gets docxAnchorMap
- let newIdent = uniqueIdent ils (M.elems hdrIDMap)
+ let newIdent = uniqueIdent ils (Set.fromList $ M.elems hdrIDMap)
modify $ \s -> s {docxAnchorMap = M.insert ident newIdent hdrIDMap}
return $ Header n (newIdent, classes, kvs) (ils \\ (c:cs))
-- Otherwise we just give it a name, and register that name (associate
@@ -401,7 +402,7 @@ makeHeaderAnchor' (Header n (_, classes, kvs) ils)
makeHeaderAnchor' (Header n (_, classes, kvs) ils) =
do
hdrIDMap <- gets docxAnchorMap
- let newIdent = uniqueIdent ils (M.elems hdrIDMap)
+ let newIdent = uniqueIdent ils (Set.fromList $ M.elems hdrIDMap)
modify $ \s -> s {docxAnchorMap = M.insert newIdent newIdent hdrIDMap}
return $ Header n (newIdent, classes, kvs) ils
makeHeaderAnchor' blk = return blk
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index a34e2fb5c..c3be1f544 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -68,7 +68,7 @@ import Text.Pandoc.Error
import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
import Text.Pandoc.Compat.Monoid ((<>))
import Text.Parsec.Error
-
+import qualified Data.Set as Set
-- | Convert HTML-formatted string to 'Pandoc' document.
readHtml :: ReaderOptions -- ^ Reader options
@@ -77,7 +77,7 @@ readHtml :: ReaderOptions -- ^ Reader options
readHtml opts inp =
mapLeft (ParseFailure . getError) . flip runReader def $
runParserT parseDoc
- (HTMLState def{ stateOptions = opts } [] Nothing [] M.empty)
+ (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty)
"source" tags
where tags = stripPrefixes . canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } inp
@@ -104,7 +104,7 @@ data HTMLState =
{ parserState :: ParserState,
noteTable :: [(String, Blocks)],
baseHref :: Maybe String,
- identifiers :: [String],
+ identifiers :: Set.Set String,
headerMap :: M.Map Inlines String
}
diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs
index d29ec50e7..950497992 100644
--- a/src/Text/Pandoc/Readers/MediaWiki.hs
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@@ -52,6 +52,7 @@ import Text.HTML.TagSoup
import Data.Sequence (viewl, ViewL(..), (<|))
import qualified Data.Foldable as F
import qualified Data.Map as M
+import qualified Data.Set as Set
import Data.Char (isDigit, isSpace)
import Data.Maybe (fromMaybe)
import Text.Printf (printf)
@@ -69,7 +70,7 @@ readMediaWiki opts s =
, mwNextLinkNumber = 1
, mwCategoryLinks = []
, mwHeaderMap = M.empty
- , mwIdentifierList = []
+ , mwIdentifierList = Set.empty
}
(s ++ "\n")
@@ -78,7 +79,7 @@ data MWState = MWState { mwOptions :: ReaderOptions
, mwNextLinkNumber :: Int
, mwCategoryLinks :: [Inlines]
, mwHeaderMap :: M.Map Inlines String
- , mwIdentifierList :: [String]
+ , mwIdentifierList :: Set.Set String
}
type MWParser = Parser [Char] MWState
diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs
index 1f1c57646..8c475eefc 100644
--- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs
+++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs
@@ -61,6 +61,7 @@ import Text.Pandoc.Readers.Odt.Generic.XMLConverter
import Text.Pandoc.Readers.Odt.Generic.Fallible
import Text.Pandoc.Readers.Odt.Generic.Utils
+import qualified Data.Set as Set
--------------------------------------------------------------------------------
-- State
@@ -221,7 +222,7 @@ getPrettyAnchor = proc (baseIdent, uglyAnchor) -> do
getHeaderAnchor :: OdtReaderSafe Inlines Anchor
getHeaderAnchor = proc title -> do
state <- getExtraState -< ()
- let anchor = uniqueIdent (toList title) (usedAnchors state)
+ let anchor = uniqueIdent (toList title) (Set.fromList $ usedAnchors state)
modifyExtraState (putPrettyAnchor anchor anchor) -<< anchor
diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs
index c7906618c..d82541638 100644
--- a/src/Text/Pandoc/Readers/Org.hs
+++ b/src/Text/Pandoc/Readers/Org.hs
@@ -53,6 +53,7 @@ import Data.Char (isAlphaNum, toLower)
import Data.Default
import Data.List (intersperse, isPrefixOf, isSuffixOf)
import qualified Data.Map as M
+import qualified Data.Set as Set
import Data.Maybe (fromMaybe, isJust)
import Network.HTTP (urlEncode)
@@ -144,7 +145,7 @@ data OrgParserState = OrgParserState
, orgStateMeta' :: F Meta
, orgStateNotes' :: OrgNoteTable
, orgStateParserContext :: ParserContext
- , orgStateIdentifiers :: [String]
+ , orgStateIdentifiers :: Set.Set String
, orgStateHeaderMap :: M.Map Inlines String
}
@@ -186,7 +187,7 @@ defaultOrgParserState = OrgParserState
, orgStateMeta' = return nullMeta
, orgStateNotes' = []
, orgStateParserContext = NullState
- , orgStateIdentifiers = []
+ , orgStateIdentifiers = Set.empty
, orgStateHeaderMap = M.empty
}
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index aa07c81e1..b5efcf172 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -706,14 +706,14 @@ headerLtEq _ _ = False
-- | Generate a unique identifier from a list of inlines.
-- Second argument is a list of already used identifiers.
-uniqueIdent :: [Inline] -> [String] -> String
+uniqueIdent :: [Inline] -> Set.Set String -> String
uniqueIdent title' usedIdents
= let baseIdent = case inlineListToIdentifier title' of
"" -> "section"
x -> x
numIdent n = baseIdent ++ "-" ++ show n
- in if baseIdent `elem` usedIdents
- then case find (\x -> numIdent x `notElem` usedIdents) ([1..60000] :: [Int]) of
+ in if baseIdent `Set.member` usedIdents
+ then case find (\x -> not $ numIdent x `Set.member` usedIdents) ([1..60000] :: [Int]) of
Just x -> numIdent x
Nothing -> baseIdent -- if we have more than 60,000, allow repeats
else baseIdent
diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs
index 827d32620..150e19043 100644
--- a/src/Text/Pandoc/Writers/Docx.hs
+++ b/src/Text/Pandoc/Writers/Docx.hs
@@ -34,6 +34,7 @@ import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Lazy.Char8 as BL8
import qualified Data.Map as M
+import qualified Data.Set as Set
import qualified Text.Pandoc.UTF8 as UTF8
import Codec.Archive.Zip
import Data.Time.Clock.POSIX
@@ -95,7 +96,7 @@ data WriterState = WriterState{
stTextProperties :: [Element]
, stParaProperties :: [Element]
, stFootnotes :: [Element]
- , stSectionIds :: [String]
+ , stSectionIds :: Set.Set String
, stExternalLinks :: M.Map String String
, stImages :: M.Map FilePath (String, String, Maybe MimeType, Element, B.ByteString)
, stListLevel :: Int
@@ -117,7 +118,7 @@ defaultWriterState = WriterState{
stTextProperties = []
, stParaProperties = []
, stFootnotes = defaultFootnotes
- , stSectionIds = []
+ , stSectionIds = Set.empty
, stExternalLinks = M.empty
, stImages = M.empty
, stListLevel = -1
@@ -742,7 +743,7 @@ blockToOpenXML opts (Header lev (ident,_,_) lst) = do
let bookmarkName = if null ident
then uniqueIdent lst usedIdents
else ident
- modify $ \s -> s{ stSectionIds = bookmarkName : stSectionIds s }
+ modify $ \s -> s{ stSectionIds = Set.insert bookmarkName $ stSectionIds s }
id' <- getUniqueId
let bookmarkStart = mknode "w:bookmarkStart" [("w:id", id')
,("w:name",bookmarkName)] ()
diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs
index 64f94f41f..804dbb926 100644
--- a/src/Text/Pandoc/Writers/EPUB.hs
+++ b/src/Text/Pandoc/Writers/EPUB.hs
@@ -31,6 +31,7 @@ Conversion of 'Pandoc' documents to EPUB.
module Text.Pandoc.Writers.EPUB ( writeEPUB ) where
import Data.IORef ( IORef, newIORef, readIORef, modifyIORef )
import qualified Data.Map as M
+import qualified Data.Set as Set
import Data.Maybe ( fromMaybe, catMaybes )
import Data.List ( isPrefixOf, isInfixOf, intercalate )
import System.Environment ( getEnv )
@@ -916,13 +917,13 @@ showChapter = printf "ch%03d.xhtml"
-- Add identifiers to any headers without them.
addIdentifiers :: [Block] -> [Block]
-addIdentifiers bs = evalState (mapM go bs) []
+addIdentifiers bs = evalState (mapM go bs) Set.empty
where go (Header n (ident,classes,kvs) ils) = do
ids <- get
let ident' = if null ident
then uniqueIdent ils ids
else ident
- put $ ident' : ids
+ modify $ Set.insert ident'
return $ Header n (ident',classes,kvs) ils
go x = return x
diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs
index 5a92f3cdf..4c4675524 100644
--- a/src/Text/Pandoc/Writers/Markdown.hs
+++ b/src/Text/Pandoc/Writers/Markdown.hs
@@ -53,6 +53,7 @@ import Data.Yaml (Value(Object,String,Array,Bool,Number))
import qualified Data.HashMap.Strict as H
import qualified Data.Vector as V
import qualified Data.Text as T
+import qualified Data.Set as Set
type Notes = [[Block]]
type Ref = ([Inline], Target, Attr)
@@ -61,11 +62,11 @@ data WriterState = WriterState { stNotes :: Notes
, stRefs :: Refs
, stRefShortcutable :: Bool
, stInList :: Bool
- , stIds :: [String]
+ , stIds :: Set.Set String
, stPlain :: Bool }
instance Default WriterState
where def = WriterState{ stNotes = [], stRefs = [], stRefShortcutable = True,
- stInList = False, stIds = [], stPlain = False }
+ stInList = False, stIds = Set.empty, stPlain = False }
-- | Convert Pandoc to Markdown.
writeMarkdown :: WriterOptions -> Pandoc -> String
@@ -364,7 +365,7 @@ blockToMarkdown opts (Header level attr inlines) = do
-- so we know whether to print an explicit identifier
ids <- gets stIds
let autoId = uniqueIdent inlines ids
- modify $ \st -> st{ stIds = autoId : ids }
+ modify $ \st -> st{ stIds = Set.insert autoId ids }
let attr' = case attr of
("",[],[]) -> empty
(id',[],[]) | isEnabled Ext_auto_identifiers opts
diff --git a/src/Text/Pandoc/Writers/Texinfo.hs b/src/Text/Pandoc/Writers/Texinfo.hs
index 1aefaa678..8420704dc 100644
--- a/src/Text/Pandoc/Writers/Texinfo.hs
+++ b/src/Text/Pandoc/Writers/Texinfo.hs
@@ -43,13 +43,14 @@ import Text.Pandoc.Pretty
import Text.Pandoc.ImageSize
import Network.URI ( isURI, unEscapeString )
import System.FilePath
+import qualified Data.Set as Set
data WriterState =
WriterState { stStrikeout :: Bool -- document contains strikeout
, stSuperscript :: Bool -- document contains superscript
, stSubscript :: Bool -- document contains subscript
, stEscapeComma :: Bool -- in a context where we need @comma
- , stIdentifiers :: [String] -- header ids used already
+ , stIdentifiers :: Set.Set String -- header ids used already
, stOptions :: WriterOptions -- writer options
}
@@ -64,7 +65,7 @@ writeTexinfo options document =
evalState (pandocToTexinfo options $ wrapTop document) $
WriterState { stStrikeout = False, stSuperscript = False,
stEscapeComma = False, stSubscript = False,
- stIdentifiers = [], stOptions = options}
+ stIdentifiers = Set.empty, stOptions = options}
-- | Add a "Top" node around the document, needed by Texinfo.
wrapTop :: Pandoc -> Pandoc
@@ -215,7 +216,7 @@ blockToTexinfo (Header level _ lst) = do
txt <- inlineListToTexinfo lst
idsUsed <- gets stIdentifiers
let id' = uniqueIdent lst idsUsed
- modify $ \st -> st{ stIdentifiers = id' : idsUsed }
+ modify $ \st -> st{ stIdentifiers = Set.insert id' idsUsed }
return $ if (level > 0) && (level <= 4)
then blankline <> text "@node " <> node $$
text (seccmd level) <> txt $$