summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML.hs
diff options
context:
space:
mode:
authorfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2007-11-03 22:14:03 +0000
committerfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2007-11-03 22:14:03 +0000
commit4a841bfc5464907adea4cdd655485565565b40ae (patch)
tree36c0a21e3639614c8d25b5fb1909c32d0ab11dcd /src/Text/Pandoc/Readers/HTML.hs
parent3116d30133196e1bb258f7e74e03d4a85f3b21ae (diff)
Use template haskell to avoid the need for templates:
+ Added library Text.Pandoc.Include, with a template haskell function $(includeStrFrom fname) to include a file as a string constant at compile time. + This removes the need for the 'templates' directory or Makefile target. These have been removed. + The base source directory has been changed from src to . + A new 'data' directory has been added, containing the ASCIIMathML.js script, writer headers, and S5 files. + The src/wrappers directory has been moved to 'wrappers'. + The Text.Pandoc.ASCIIMathML library is no longer needed, since Text.Pandoc.Writers.HTML can use includeStrFrom to include the ASCIIMathML.js code directly. It has been removed. git-svn-id: https://pandoc.googlecode.com/svn/trunk@1063 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs496
1 files changed, 0 insertions, 496 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
deleted file mode 100644
index 70a071152..000000000
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ /dev/null
@@ -1,496 +0,0 @@
-{-
-Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
--}
-
-{- |
- Module : Text.Pandoc.Readers.HTML
- Copyright : Copyright (C) 2006-7 John MacFarlane
- License : GNU GPL, version 2 or above
-
- Maintainer : John MacFarlane <jgm@berkeley.edu>
- Stability : alpha
- Portability : portable
-
-Conversion of HTML to 'Pandoc' document.
--}
-module Text.Pandoc.Readers.HTML (
- readHtml,
- rawHtmlInline,
- rawHtmlBlock,
- anyHtmlBlockTag,
- anyHtmlInlineTag,
- anyHtmlTag,
- anyHtmlEndTag,
- htmlEndTag,
- extractTagType,
- htmlBlockElement
- ) where
-
-import Text.ParserCombinators.Parsec
-import Text.Pandoc.Definition
-import Text.Pandoc.Shared
-import Text.Pandoc.CharacterReferences ( characterReference,
- decodeCharacterReferences )
-import Data.Maybe ( fromMaybe )
-import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf )
-import Data.Char ( toUpper, toLower, isAlphaNum )
-
--- | Convert HTML-formatted string to 'Pandoc' document.
-readHtml :: ParserState -- ^ Parser state
- -> String -- ^ String to parse
- -> Pandoc
-readHtml = readWith parseHtml
-
---
--- Constants
---
-
-eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
- "map", "area", "object", "script"]
-
-inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
- "br", "cite", "code", "dfn", "em", "font", "i", "img",
- "input", "kbd", "label", "q", "s", "samp", "select",
- "small", "span", "strike", "strong", "sub", "sup",
- "textarea", "tt", "u", "var"] ++ eitherBlockOrInline
-
-blockHtmlTags = ["address", "blockquote", "center", "dir", "div",
- "dl", "fieldset", "form", "h1", "h2", "h3", "h4",
- "h5", "h6", "hr", "isindex", "menu", "noframes",
- "noscript", "ol", "p", "pre", "table", "ul", "dd",
- "dt", "frameset", "li", "tbody", "td", "tfoot",
- "th", "thead", "tr"] ++ eitherBlockOrInline
-
---
--- HTML utility functions
---
-
--- | Read blocks until end tag.
-blocksTilEnd tag = do
- blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
- return $ filter (/= Null) blocks
-
--- | Read inlines until end tag.
-inlinesTilEnd tag = manyTill inline (htmlEndTag tag)
-
--- | Parse blocks between open and close tag.
-blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag
-
--- | Parse inlines between open and close tag.
-inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag
-
--- | Extract type from a tag: e.g. @br@ from @\<br\>@
-extractTagType :: String -> String
-extractTagType ('<':rest) =
- let isSpaceOrSlash c = c `elem` "/ \n\t" in
- map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest
-extractTagType _ = ""
-
--- | Parse any HTML tag (opening or self-closing) and return text of tag
-anyHtmlTag = try $ do
- char '<'
- spaces
- tag <- many1 alphaNum
- attribs <- many htmlAttribute
- spaces
- ender <- option "" (string "/")
- let ender' = if null ender then "" else " /"
- spaces
- char '>'
- return $ "<" ++ tag ++
- concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
-
-anyHtmlEndTag = try $ do
- char '<'
- spaces
- char '/'
- spaces
- tagType <- many1 alphaNum
- spaces
- char '>'
- return $ "</" ++ tagType ++ ">"
-
-htmlTag :: String -> GenParser Char st (String, [(String, String)])
-htmlTag tag = try $ do
- char '<'
- spaces
- stringAnyCase tag
- attribs <- many htmlAttribute
- spaces
- optional (string "/")
- spaces
- char '>'
- return (tag, (map (\(name, content, raw) -> (name, content)) attribs))
-
--- parses a quoted html attribute value
-quoted quoteChar = do
- result <- between (char quoteChar) (char quoteChar)
- (many (noneOf [quoteChar]))
- return (result, [quoteChar])
-
-htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute
-
--- minimized boolean attribute
-htmlMinimizedAttribute = try $ do
- many1 space
- name <- many1 (choice [letter, oneOf ".-_:"])
- return (name, name, name)
-
-htmlRegularAttribute = try $ do
- many1 space
- name <- many1 (choice [letter, oneOf ".-_:"])
- spaces
- char '='
- spaces
- (content, quoteStr) <- choice [ (quoted '\''),
- (quoted '"'),
- (do
- a <- many (alphaNum <|> (oneOf "-._:"))
- return (a,"")) ]
- return (name, content,
- (name ++ "=" ++ quoteStr ++ content ++ quoteStr))
-
--- | Parse an end tag of type 'tag'
-htmlEndTag tag = try $ do
- char '<'
- spaces
- char '/'
- spaces
- stringAnyCase tag
- spaces
- char '>'
- return $ "</" ++ tag ++ ">"
-
--- | Returns @True@ if the tag is (or can be) an inline tag.
-isInline tag = (extractTagType tag) `elem` inlineHtmlTags
-
--- | Returns @True@ if the tag is (or can be) a block tag.
-isBlock tag = (extractTagType tag) `elem` blockHtmlTags
-
-anyHtmlBlockTag = try $ do
- tag <- anyHtmlTag <|> anyHtmlEndTag
- if isBlock tag then return tag else fail "inline tag"
-
-anyHtmlInlineTag = try $ do
- tag <- anyHtmlTag <|> anyHtmlEndTag
- if isInline tag then return tag else fail "not an inline tag"
-
--- | Parses material between script tags.
--- Scripts must be treated differently, because they can contain '<>' etc.
-htmlScript = try $ do
- open <- string "<script"
- rest <- manyTill anyChar (htmlEndTag "script")
- return $ open ++ rest ++ "</script>"
-
-htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ]
-
-rawHtmlBlock = try $ do
- notFollowedBy' (htmlTag "/body" <|> htmlTag "/html")
- body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag
- sp <- many space
- state <- getState
- if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null
-
--- | Parses an HTML comment.
-htmlComment = try $ do
- string "<!--"
- comment <- manyTill anyChar (try (string "-->"))
- return $ "<!--" ++ comment ++ "-->"
-
---
--- parsing documents
---
-
-xmlDec = try $ do
- string "<?"
- rest <- manyTill anyChar (char '>')
- return $ "<?" ++ rest ++ ">"
-
-definition = try $ do
- string "<!"
- rest <- manyTill anyChar (char '>')
- return $ "<!" ++ rest ++ ">"
-
-nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >>
- ((rawHtmlBlock >> return ' ') <|> anyChar)
-
-parseTitle = try $ do
- (tag, _) <- htmlTag "title"
- contents <- inlinesTilEnd tag
- spaces
- return contents
-
--- parse header and return meta-information (for now, just title)
-parseHead = try $ do
- htmlTag "head"
- spaces
- skipMany nonTitleNonHead
- contents <- option [] parseTitle
- skipMany nonTitleNonHead
- htmlTag "/head"
- return (contents, [], "")
-
-skipHtmlTag tag = optional (htmlTag tag)
-
--- h1 class="title" representation of title in body
-bodyTitle = try $ do
- (tag, attribs) <- htmlTag "h1"
- cl <- case (extractAttribute "class" attribs) of
- Just "title" -> return ""
- otherwise -> fail "not title"
- inlinesTilEnd "h1"
-
-parseHtml = do
- sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
- skipHtmlTag "html"
- spaces
- (title, authors, date) <- option ([], [], "") parseHead
- spaces
- skipHtmlTag "body"
- spaces
- optional bodyTitle -- skip title in body, because it's represented in meta
- blocks <- parseBlocks
- spaces
- optional (htmlEndTag "body")
- spaces
- optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html>
- eof
- return $ Pandoc (Meta title authors date) blocks
-
---
--- parsing blocks
---
-
-parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null))
-
-block = choice [ codeBlock
- , header
- , hrule
- , list
- , blockQuote
- , para
- , plain
- , rawHtmlBlock ] <?> "block"
-
---
--- header blocks
---
-
-header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
-
-headerLevel n = try $ do
- let level = "h" ++ show n
- (tag, attribs) <- htmlTag level
- contents <- inlinesTilEnd level
- return $ Header n (normalizeSpaces contents)
-
---
--- hrule block
---
-
-hrule = try $ do
- (tag, attribs) <- htmlTag "hr"
- state <- getState
- if not (null attribs) && stateParseRaw state
- then unexpected "attributes in hr" -- parse as raw in this case
- else return HorizontalRule
-
---
--- code blocks
---
-
--- Note: HTML tags in code blocks (e.g. for syntax highlighting) are
--- skipped, because they are not portable to output formats other than HTML.
-codeBlock = try $ do
- htmlTag "pre"
- result <- manyTill
- (many1 (satisfy (/= '<')) <|>
- ((anyHtmlTag <|> anyHtmlEndTag) >> return ""))
- (htmlEndTag "pre")
- let result' = concat result
- -- drop leading newline if any
- let result'' = if "\n" `isPrefixOf` result'
- then drop 1 result'
- else result'
- -- drop trailing newline if any
- let result''' = if "\n" `isSuffixOf` result''
- then init result''
- else result''
- return $ CodeBlock $ decodeCharacterReferences result'''
-
---
--- block quotes
---
-
-blockQuote = try $ htmlTag "blockquote" >> spaces >>
- blocksTilEnd "blockquote" >>= (return . BlockQuote)
-
---
--- list blocks
---
-
-list = choice [ bulletList, orderedList, definitionList ] <?> "list"
-
-orderedList = try $ do
- (_, attribs) <- htmlTag "ol"
- (start, style) <- option (1, DefaultStyle) $
- do failIfStrict
- let sta = fromMaybe "1" $
- lookup "start" attribs
- let sty = fromMaybe (fromMaybe "" $
- lookup "style" attribs) $
- lookup "class" attribs
- let sty' = case sty of
- "lower-roman" -> LowerRoman
- "upper-roman" -> UpperRoman
- "lower-alpha" -> LowerAlpha
- "upper-alpha" -> UpperAlpha
- "decimal" -> Decimal
- _ -> DefaultStyle
- return (read sta, sty')
- spaces
- items <- sepEndBy1 (blocksIn "li") spaces
- htmlEndTag "ol"
- return $ OrderedList (start, style, DefaultDelim) items
-
-bulletList = try $ do
- htmlTag "ul"
- spaces
- items <- sepEndBy1 (blocksIn "li") spaces
- htmlEndTag "ul"
- return $ BulletList items
-
-definitionList = try $ do
- failIfStrict -- def lists not part of standard markdown
- tag <- htmlTag "dl"
- spaces
- items <- sepEndBy1 definitionListItem spaces
- htmlEndTag "dl"
- return $ DefinitionList items
-
-definitionListItem = try $ do
- terms <- sepEndBy1 (inlinesIn "dt") spaces
- defs <- sepEndBy1 (blocksIn "dd") spaces
- let term = joinWithSep [LineBreak] terms
- return (term, concat defs)
-
---
--- paragraph block
---
-
-para = try $ htmlTag "p" >> inlinesTilEnd "p" >>=
- return . Para . normalizeSpaces
-
---
--- plain block
---
-
-plain = many1 inline >>= return . Plain . normalizeSpaces
-
---
--- inline
---
-
-inline = choice [ charRef
- , strong
- , emph
- , superscript
- , subscript
- , strikeout
- , spanStrikeout
- , code
- , str
- , linebreak
- , whitespace
- , link
- , image
- , rawHtmlInline
- ] <?> "inline"
-
-code = try $ do
- htmlTag "code"
- result <- manyTill anyChar (htmlEndTag "code")
- -- remove internal line breaks, leading and trailing space,
- -- and decode character references
- return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $
- joinWithSep " " $ lines result
-
-rawHtmlInline = do
- result <- htmlScript <|> htmlComment <|> anyHtmlInlineTag
- state <- getState
- if stateParseRaw state then return (HtmlInline result) else return (Str "")
-
-betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>=
- return . normalizeSpaces
-
-emph = (betweenTags "em" <|> betweenTags "it") >>= return . Emph
-
-strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong
-
-superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript
-
-subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript
-
-strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>=
- return . Strikeout
-
-spanStrikeout = try $ do
- failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
- (tag, attributes) <- htmlTag "span"
- result <- case (extractAttribute "class" attributes) of
- Just "strikeout" -> inlinesTilEnd "span"
- _ -> fail "not a strikeout"
- return $ Strikeout result
-
-whitespace = many1 space >> return Space
-
--- hard line break
-linebreak = htmlTag "br" >> optional newline >> return LineBreak
-
-str = many1 (noneOf "<& \t\n") >>= return . Str
-
---
--- links and images
---
-
--- extract contents of attribute (attribute names are case-insensitive)
-extractAttribute name [] = Nothing
-extractAttribute name ((attrName, contents):rest) =
- let name' = map toLower name
- attrName' = map toLower attrName
- in if attrName' == name'
- then Just (decodeCharacterReferences contents)
- else extractAttribute name rest
-
-link = try $ do
- (tag, attributes) <- htmlTag "a"
- url <- case (extractAttribute "href" attributes) of
- Just url -> return url
- Nothing -> fail "no href"
- let title = fromMaybe "" $ extractAttribute "title" attributes
- label <- inlinesTilEnd "a"
- return $ Link (normalizeSpaces label) (url, title)
-
-image = try $ do
- (tag, attributes) <- htmlTag "img"
- url <- case (extractAttribute "src" attributes) of
- Just url -> return url
- Nothing -> fail "no src"
- let title = fromMaybe "" $ extractAttribute "title" attributes
- let alt = fromMaybe "" (extractAttribute "alt" attributes)
- return $ Image [Str alt] (url, title)
-