summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Shared.hs
diff options
context:
space:
mode:
authorfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2007-01-27 03:04:40 +0000
committerfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2007-01-27 03:04:40 +0000
commitd06417125dd4d8cb177abd2d472c0c1cad4c49be (patch)
treea3a6d7991f01e4184b678a5fcce023947d015451 /src/Text/Pandoc/Shared.hs
parentf2de08864ee9870147839389554b9353631d21f8 (diff)
Changes in entity handling:
+ Entities are parsed (and unicode characters returned) in both Markdown and HTML readers. + Parsers characterEntity, namedEntity, decimalEntity, hexEntity added to Entities.hs; these parse a string and return a unicode character. + Changed 'entity' parser in HTML reader to use the 'characterEntity' parser from Entities.hs. + Added new 'entity' parser to Markdown reader, and added '&' as a special character. Adjusted test suite accordingly since now we get 'Str "AT",Str "&",Str "T"' instead of 'Str "AT&T"'. + stringToSGML moved to Entities.hs. escapeSGML removed as redundant, given encodeEntities. + stringToSGML, encodeEntities, and specialCharToEntity are given a boolean parameter that causes only numerical entities to be used. This is used in the docbook writer. The HTML writer uses named entities where possible, but not all docbook-consumers know about the named entities without special instructions, so it seems safer to use numerical entities there. + decodeEntities is rewritten in a way that avoids Text.Regex, using the new parsers. + charToEntity and charToNumericalEntity added to Entities.hs. + Moved specialCharToEntity from Shared.hs to Entities.hs. + Removed unneeded 'decodeEntities' from 'str' parser in HTML and Markdown readers. + Removed sgmlHexEntity, sgmlDecimalEntity, sgmlNamedEntity, and sgmlCharacterEntity from Shared.hs. + Modified Docbook writer so that it doesn't rely on Text.Regex for detecting "mailto" links. git-svn-id: https://pandoc.googlecode.com/svn/trunk@515 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text/Pandoc/Shared.hs')
-rw-r--r--src/Text/Pandoc/Shared.hs58
1 files changed, 3 insertions, 55 deletions
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index b82357d7a..f63ca4ce4 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -65,8 +65,6 @@ module Text.Pandoc.Shared (
replaceReferenceLinks,
replaceRefLinksBlockList,
-- * SGML
- escapeSGML,
- stringToSGML,
inTags,
selfClosingTag,
inTagsSimple,
@@ -74,7 +72,7 @@ module Text.Pandoc.Shared (
) where
import Text.Pandoc.Definition
import Text.ParserCombinators.Parsec as Parsec
-import Text.Pandoc.Entities ( decodeEntities, charToEntity )
+import Text.Pandoc.Entities ( decodeEntities, encodeEntities, stringToSGML )
import Text.PrettyPrint.HughesPJ as PP ( text, char, (<>),
($$), nest, Doc, isEmpty )
import Data.Char ( toLower, ord )
@@ -523,61 +521,11 @@ replaceRefLinksInline keytable (Quoted t lst) =
Quoted t (map (replaceRefLinksInline keytable) lst)
replaceRefLinksInline keytable other = other
--- | Parse SGML character entity.
-sgmlCharacterEntity :: GenParser Char st [Char]
-sgmlCharacterEntity = sgmlNamedEntity <|> sgmlDecimalEntity <|>
- sgmlHexEntity <?> "SGML entity"
-
--- | Parse SGML character entity.
-sgmlNamedEntity :: GenParser Char st [Char]
-sgmlNamedEntity = try $ do
- st <- Parsec.char '&'
- body <- many1 alphaNum
- end <- Parsec.char ';'
- return $ (st:body) ++ [end]
-
--- | Parse SGML decimal entity.
-sgmlDecimalEntity :: GenParser Char st [Char]
-sgmlDecimalEntity = try $ do
- st <- string "&#"
- body <- many1 (oneOf "0123456789")
- end <- Parsec.char ';'
- return $ st ++ body ++ [end]
-
--- | Parse SGML hexadecimal entity.
-sgmlHexEntity :: GenParser Char st [Char]
-sgmlHexEntity = try $ do
- st <- string "&#"
- hex <- oneOf "Xx"
- body <- many1 (oneOf "0123456789ABCDEFabcdef")
- end <- Parsec.char ';'
- return $ st ++ (hex:body) ++ [end]
-
--- | Escape special character to SGML entity.
-specialCharToEntity :: Char -> [Char]
-specialCharToEntity c = if (c `elem` "&<>\"") || (ord c > 127)
- then charToEntity c
- else [c]
-
--- | Escape string, preserving character entities.
-stringToSGML :: String -> String
-stringToSGML str =
- let segment = sgmlCharacterEntity <|>
- (do{c <- anyChar;
- return $ specialCharToEntity c})
- sgmlString = (do{segs <- many segment; return $ concat segs}) in
- case parse sgmlString str str of
- Left err -> error $ "\nError:\n" ++ show err
- Right result -> result
-
--- | Escape string as needed for SGML. Entity references are not preserved.
-escapeSGML :: String -> String
-escapeSGML = concatMap specialCharToEntity
-
-- | Return a text object with a string of formatted SGML attributes.
attributeList :: [(String, String)] -> Doc
attributeList = text . concatMap
- (\(a, b) -> " " ++ stringToSGML a ++ "=\"" ++ stringToSGML b ++ "\"")
+ (\(a, b) -> " " ++ stringToSGML True a ++ "=\"" ++
+ stringToSGML True b ++ "\"")
-- | Put the supplied contents between start and end tags of tagType,
-- with specified attributes and (if specified) indentation.