Changed the type of the second argument of Shared.uniqueIdent from [String] to Set String.

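This avoids performance problems in documents with many identically named headers: every candidate identifier was previously checked with a linear `elem`/`notElem` scan of the list of used identifiers, whereas `Set.member` is logarithmic. Closes #2671.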
author: John MacFarlane <jgm@berkeley.edu> 2016-01-22 10:16:47 -0800
committer: John MacFarlane <jgm@berkeley.edu> 2016-01-22 10:16:47 -0800
commit: 20170c328f12fc0214c2c50d14c8f7d03ee05e70 (patch)
tree: eba75d165fe6aec92b5bcd5f63ec89b328478b1d /src/Text
parent: 2a2e3d99d226636166859e63d5259258ba759d5c (diff)
11 files changed, 39 insertions, 31 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 16fe75ed5..325231846 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -915,7 +915,7 @@ data ParserState = ParserState
       stateMeta'           :: F Meta,        -- ^ Document metadata
       stateHeaderTable     :: [HeaderType],  -- ^ Ordered list of header types used
       stateHeaders         :: M.Map Inlines String, -- ^ List of headers and ids (used for implicit ref links)
-      stateIdentifiers     :: [String],      -- ^ List of header identifiers used
+      stateIdentifiers     :: Set.Set String, -- ^ Header identifiers used
       stateNextExample     :: Int,           -- ^ Number of next example
       stateExamples        :: M.Map String Int, -- ^ Map from example labels to numbers
       stateHasChapters     :: Bool,          -- ^ True if \chapter encountered
@@ -973,8 +973,8 @@ instance HasHeaderMap ParserState where
   updateHeaderMap f st = st{ stateHeaders = f $ stateHeaders st }
 
 class HasIdentifierList st where
-  extractIdentifierList  :: st -> [String]
-  updateIdentifierList   :: ([String] -> [String]) -> st -> st
+  extractIdentifierList  :: st -> Set.Set String
+  updateIdentifierList   :: (Set.Set String -> Set.Set String) -> st -> st
 
 instance HasIdentifierList ParserState where
   extractIdentifierList     = stateIdentifiers
@@ -1013,7 +1013,7 @@ defaultParserState =
                   stateMeta'           = return nullMeta,
                   stateHeaderTable     = [],
                   stateHeaders         = M.empty,
-                  stateIdentifiers     = [],
+                  stateIdentifiers     = Set.empty,
                   stateNextExample     = 1,
                   stateExamples        = M.empty,
                   stateHasChapters     = False,
@@ -1092,8 +1092,8 @@ registerHeader (ident,classes,kvs) header' = do
        let id'' = if Ext_ascii_identifiers `Set.member` exts
                      then catMaybes $ map toAsciiChar id'
                      else id'
-       updateState $ updateIdentifierList $
-         if id' == id'' then (id' :) else ([id', id''] ++)
+       updateState $ updateIdentifierList $ Set.insert id'
+       updateState $ updateIdentifierList $ Set.insert id''
        updateState $ updateHeaderMap $ insert' header' id'
        return (id'',classes,kvs)
      else do
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index 44f67ce75..1b3269136 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -89,6 +89,7 @@ import Text.TeXMath (writeTeX)
 import Data.Default (Default)
 import qualified Data.ByteString.Lazy as B
 import qualified Data.Map as M
+import qualified Data.Set as Set
 import Control.Monad.Reader
 import Control.Monad.State
 import Data.Sequence (ViewL(..), viewl)
@@ -350,7 +351,7 @@ parPartToInlines (BookMark _ anchor) =
     -- avoid an extra pass.
     let newAnchor =
           if not inHdrBool && anchor `elem` (M.elems anchorMap)
-          then uniqueIdent [Str anchor] (M.elems anchorMap)
+          then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap)
           else anchor
     unless inHdrBool
       (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap})
@@ -393,7 +394,7 @@ makeHeaderAnchor' (Header n (_, classes, kvs) ils)
   | (c:cs) <- filter isAnchorSpan ils
   , (Span (ident, ["anchor"], _) _) <- c = do
     hdrIDMap <- gets docxAnchorMap
-    let newIdent = uniqueIdent ils (M.elems hdrIDMap)
+    let newIdent = uniqueIdent ils (Set.fromList $ M.elems hdrIDMap)
     modify $ \s -> s {docxAnchorMap = M.insert ident newIdent hdrIDMap}
     return $ Header n (newIdent, classes, kvs) (ils \\ (c:cs))
 -- Otherwise we just give it a name, and register that name (associate
@@ -401,7 +402,7 @@ makeHeaderAnchor' (Header n (_, classes, kvs) ils)
 makeHeaderAnchor' (Header n (_, classes, kvs) ils) =
   do
     hdrIDMap <- gets docxAnchorMap
-    let newIdent = uniqueIdent ils (M.elems hdrIDMap)
+    let newIdent = uniqueIdent ils (Set.fromList $ M.elems hdrIDMap)
     modify $ \s -> s {docxAnchorMap = M.insert newIdent newIdent hdrIDMap}
     return $ Header n (newIdent, classes, kvs) ils
 makeHeaderAnchor' blk = return blk
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index a34e2fb5c..c3be1f544 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -68,7 +68,7 @@ import Text.Pandoc.Error
 import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
 import Text.Pandoc.Compat.Monoid ((<>))
 import Text.Parsec.Error
-
+import qualified Data.Set as Set
 
 -- | Convert HTML-formatted string to 'Pandoc' document.
 readHtml :: ReaderOptions -- ^ Reader options
@@ -77,7 +77,7 @@ readHtml :: ReaderOptions -- ^ Reader options
 readHtml opts inp =
     mapLeft (ParseFailure . getError) . flip runReader def $
       runParserT parseDoc
-        (HTMLState def{ stateOptions = opts } [] Nothing [] M.empty)
+        (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty)
         "source" tags
     where tags = stripPrefixes . canonicalizeTags $
                    parseTagsOptions parseOptions{ optTagPosition = True } inp
@@ -104,7 +104,7 @@ data HTMLState =
   {  parserState :: ParserState,
      noteTable   :: [(String, Blocks)],
      baseHref    :: Maybe String,
-     identifiers :: [String],
+     identifiers :: Set.Set String,
      headerMap   :: M.Map Inlines String
   }
 
diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs
index d29ec50e7..950497992 100644
--- a/src/Text/Pandoc/Readers/MediaWiki.hs
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@@ -52,6 +52,7 @@ import Text.HTML.TagSoup
 import Data.Sequence (viewl, ViewL(..), (<|))
 import qualified Data.Foldable as F
 import qualified Data.Map as M
+import qualified Data.Set as Set
 import Data.Char (isDigit, isSpace)
 import Data.Maybe (fromMaybe)
 import Text.Printf (printf)
@@ -69,7 +70,7 @@ readMediaWiki opts s =
                                        , mwNextLinkNumber  = 1
                                        , mwCategoryLinks = []
                                        , mwHeaderMap = M.empty
-                                       , mwIdentifierList = []
+                                       , mwIdentifierList = Set.empty
                                        }
            (s ++ "\n")
 
@@ -78,7 +79,7 @@ data MWState = MWState { mwOptions         :: ReaderOptions
                        , mwNextLinkNumber  :: Int
                        , mwCategoryLinks   :: [Inlines]
                        , mwHeaderMap       :: M.Map Inlines String
-                       , mwIdentifierList  :: [String]
+                       , mwIdentifierList  :: Set.Set String
                        }
 
 type MWParser = Parser [Char] MWState
diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs
index 1f1c57646..8c475eefc 100644
--- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs
+++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs
@@ -61,6 +61,7 @@ import           Text.Pandoc.Readers.Odt.Generic.XMLConverter
 import           Text.Pandoc.Readers.Odt.Generic.Fallible
 import           Text.Pandoc.Readers.Odt.Generic.Utils
 
+import qualified Data.Set as Set
 
 --------------------------------------------------------------------------------
 -- State
@@ -221,7 +222,7 @@ getPrettyAnchor = proc (baseIdent, uglyAnchor) -> do
 getHeaderAnchor :: OdtReaderSafe Inlines Anchor
 getHeaderAnchor = proc title -> do
   state <- getExtraState -< ()
-  let anchor = uniqueIdent (toList title) (usedAnchors state)
+  let anchor = uniqueIdent (toList title) (Set.fromList $ usedAnchors state)
   modifyExtraState (putPrettyAnchor anchor anchor) -<< anchor
 
 
diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs
index c7906618c..d82541638 100644
--- a/src/Text/Pandoc/Readers/Org.hs
+++ b/src/Text/Pandoc/Readers/Org.hs
@@ -53,6 +53,7 @@ import           Data.Char (isAlphaNum, toLower)
 import           Data.Default
 import           Data.List (intersperse, isPrefixOf, isSuffixOf)
 import qualified Data.Map as M
+import qualified Data.Set as Set
 import           Data.Maybe (fromMaybe, isJust)
 import           Network.HTTP (urlEncode)
 
@@ -144,7 +145,7 @@ data OrgParserState = OrgParserState
                       , orgStateMeta'                :: F Meta
                       , orgStateNotes'               :: OrgNoteTable
                       , orgStateParserContext        :: ParserContext
-                      , orgStateIdentifiers          :: [String]
+                      , orgStateIdentifiers          :: Set.Set String
                       , orgStateHeaderMap            :: M.Map Inlines String
                       }
 
@@ -186,7 +187,7 @@ defaultOrgParserState = OrgParserState
                         , orgStateMeta' = return nullMeta
                         , orgStateNotes' = []
                         , orgStateParserContext = NullState
-                        , orgStateIdentifiers = []
+                        , orgStateIdentifiers = Set.empty
                         , orgStateHeaderMap = M.empty
                         }
 
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index aa07c81e1..b5efcf172 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -706,14 +706,14 @@ headerLtEq _ _ = False
 
 -- | Generate a unique identifier from a list of inlines.
 -- Second argument is a list of already used identifiers.
-uniqueIdent :: [Inline] -> [String] -> String
+uniqueIdent :: [Inline] -> Set.Set String -> String
 uniqueIdent title' usedIdents
   =  let baseIdent = case inlineListToIdentifier title' of
                         ""   -> "section"
                         x    -> x
          numIdent n = baseIdent ++ "-" ++ show n
-     in  if baseIdent `elem` usedIdents
-           then case find (\x -> numIdent x `notElem` usedIdents) ([1..60000] :: [Int]) of
+     in  if baseIdent `Set.member` usedIdents
+           then case find (\x -> not $ numIdent x `Set.member` usedIdents) ([1..60000] :: [Int]) of
                   Just x  -> numIdent x
                   Nothing -> baseIdent   -- if we have more than 60,000, allow repeats
            else baseIdent
diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs
index 827d32620..150e19043 100644
--- a/src/Text/Pandoc/Writers/Docx.hs
+++ b/src/Text/Pandoc/Writers/Docx.hs
@@ -34,6 +34,7 @@ import qualified Data.ByteString as B
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.ByteString.Lazy.Char8 as BL8
 import qualified Data.Map as M
+import qualified Data.Set as Set
 import qualified Text.Pandoc.UTF8 as UTF8
 import Codec.Archive.Zip
 import Data.Time.Clock.POSIX
@@ -95,7 +96,7 @@ data WriterState = WriterState{
          stTextProperties :: [Element]
        , stParaProperties :: [Element]
        , stFootnotes      :: [Element]
-       , stSectionIds     :: [String]
+       , stSectionIds     :: Set.Set String
        , stExternalLinks  :: M.Map String String
        , stImages         :: M.Map FilePath (String, String, Maybe MimeType, Element, B.ByteString)
        , stListLevel      :: Int
@@ -117,7 +118,7 @@ defaultWriterState = WriterState{
         stTextProperties = []
       , stParaProperties = []
       , stFootnotes      = defaultFootnotes
-      , stSectionIds     = []
+      , stSectionIds     = Set.empty
       , stExternalLinks  = M.empty
       , stImages         = M.empty
       , stListLevel      = -1
@@ -742,7 +743,7 @@ blockToOpenXML opts (Header lev (ident,_,_) lst) = do
   let bookmarkName = if null ident
                         then uniqueIdent lst usedIdents
                         else ident
-  modify $ \s -> s{ stSectionIds = bookmarkName : stSectionIds s }
+  modify $ \s -> s{ stSectionIds = Set.insert bookmarkName $ stSectionIds s }
   id' <- getUniqueId
   let bookmarkStart = mknode "w:bookmarkStart" [("w:id", id')
                                                ,("w:name",bookmarkName)] ()
diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs
index 64f94f41f..804dbb926 100644
--- a/src/Text/Pandoc/Writers/EPUB.hs
+++ b/src/Text/Pandoc/Writers/EPUB.hs
@@ -31,6 +31,7 @@ Conversion of 'Pandoc' documents to EPUB.
 module Text.Pandoc.Writers.EPUB ( writeEPUB ) where
 import Data.IORef ( IORef, newIORef, readIORef, modifyIORef )
 import qualified Data.Map as M
+import qualified Data.Set as Set
 import Data.Maybe ( fromMaybe, catMaybes )
 import Data.List ( isPrefixOf, isInfixOf, intercalate )
 import System.Environment ( getEnv )
@@ -916,13 +917,13 @@ showChapter = printf "ch%03d.xhtml"
 
 -- Add identifiers to any headers without them.
 addIdentifiers :: [Block] -> [Block]
-addIdentifiers bs = evalState (mapM go bs) []
+addIdentifiers bs = evalState (mapM go bs) Set.empty
  where go (Header n (ident,classes,kvs) ils) = do
          ids <- get
          let ident' = if null ident
                          then uniqueIdent ils ids
                          else ident
-         put $ ident' : ids
+         modify $ Set.insert ident'
          return $ Header n (ident',classes,kvs) ils
        go x = return x
 
diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs
index 5a92f3cdf..4c4675524 100644
--- a/src/Text/Pandoc/Writers/Markdown.hs
+++ b/src/Text/Pandoc/Writers/Markdown.hs
@@ -53,6 +53,7 @@ import Data.Yaml (Value(Object,String,Array,Bool,Number))
 import qualified Data.HashMap.Strict as H
 import qualified Data.Vector as V
 import qualified Data.Text as T
+import qualified Data.Set as Set
 
 type Notes = [[Block]]
 type Ref   = ([Inline], Target, Attr)
@@ -61,11 +62,11 @@ data WriterState = WriterState { stNotes           :: Notes
                                , stRefs            :: Refs
                                , stRefShortcutable :: Bool
                                , stInList          :: Bool
-                               , stIds             :: [String]
+                               , stIds             :: Set.Set String
                                , stPlain           :: Bool }
 instance Default WriterState
   where def = WriterState{ stNotes = [], stRefs = [], stRefShortcutable = True,
-                           stInList = False, stIds = [], stPlain = False }
+                           stInList = False, stIds = Set.empty, stPlain = False }
 
 -- | Convert Pandoc to Markdown.
 writeMarkdown :: WriterOptions -> Pandoc -> String
@@ -364,7 +365,7 @@ blockToMarkdown opts (Header level attr inlines) = do
   -- so we know whether to print an explicit identifier
   ids <- gets stIds
   let autoId = uniqueIdent inlines ids
-  modify $ \st -> st{ stIds = autoId : ids }
+  modify $ \st -> st{ stIds = Set.insert autoId ids }
   let attr' = case attr of
                    ("",[],[]) -> empty
                    (id',[],[]) | isEnabled Ext_auto_identifiers opts
diff --git a/src/Text/Pandoc/Writers/Texinfo.hs b/src/Text/Pandoc/Writers/Texinfo.hs
index 1aefaa678..8420704dc 100644
--- a/src/Text/Pandoc/Writers/Texinfo.hs
+++ b/src/Text/Pandoc/Writers/Texinfo.hs
@@ -43,13 +43,14 @@ import Text.Pandoc.Pretty
 import Text.Pandoc.ImageSize
 import Network.URI ( isURI, unEscapeString )
 import System.FilePath
+import qualified Data.Set as Set
 
 data WriterState =
   WriterState { stStrikeout   :: Bool  -- document contains strikeout
               , stSuperscript :: Bool -- document contains superscript
               , stSubscript   :: Bool -- document contains subscript
               , stEscapeComma :: Bool -- in a context where we need @comma
-              , stIdentifiers :: [String] -- header ids used already
+              , stIdentifiers :: Set.Set String -- header ids used already
               , stOptions     :: WriterOptions -- writer options
               }
 
@@ -64,7 +65,7 @@ writeTexinfo options document =
   evalState (pandocToTexinfo options $ wrapTop document) $
   WriterState { stStrikeout = False, stSuperscript = False,
                 stEscapeComma = False, stSubscript = False,
-                stIdentifiers = [], stOptions = options}
+                stIdentifiers = Set.empty, stOptions = options}
 
 -- | Add a "Top" node around the document, needed by Texinfo.
 wrapTop :: Pandoc -> Pandoc
@@ -215,7 +216,7 @@ blockToTexinfo (Header level _ lst) = do
   txt <- inlineListToTexinfo lst
   idsUsed <- gets stIdentifiers
   let id' = uniqueIdent lst idsUsed
-  modify $ \st -> st{ stIdentifiers = id' : idsUsed }
+  modify $ \st -> st{ stIdentifiers = Set.insert id' idsUsed }
   return $ if (level > 0) && (level <= 4)
               then blankline <> text "@node " <> node $$
                    text (seccmd level) <> txt $$
author	John MacFarlane <jgm@berkeley.edu>	2016-01-22 10:16:47 -0800
committer	John MacFarlane <jgm@berkeley.edu>	2016-01-22 10:16:47 -0800
commit	20170c328f12fc0214c2c50d14c8f7d03ee05e70 (patch)
tree	eba75d165fe6aec92b5bcd5f63ec89b328478b1d /src/Text
parent	2a2e3d99d226636166859e63d5259258ba759d5c (diff)