| author    | dr@jones.dk <dr@jones.dk> | 2011-02-04 00:01:35 +0100 |
|-----------|---------------------------|---------------------------|
| committer | dr@jones.dk <dr@jones.dk> | 2011-02-04 00:01:35 +0100 |
| commit    | 91179df4907bec919e0884019da785be1ceb01b3 (patch) | |
| tree      | 2a6655fb4ec4655c554ea17ad074859d707b7709 | |
| parent    | 1f6b4aee268fefc72c84bd305b10d4f9103901eb (diff) | |
Imported Upstream version 1.8.0.1
141 files changed, 17135 insertions, 7287 deletions
diff --git a/Benchmark.hs b/Benchmark.hs new file mode 100644 index 000000000..1fd787945 --- /dev/null +++ b/Benchmark.hs @@ -0,0 +1,45 @@ +import Text.Pandoc +import Text.Pandoc.Shared (readDataFile, normalize) +import Criterion.Main +import Data.List (isSuffixOf) +import Text.JSON.Generic + +readerBench :: Pandoc + -> (String, ParserState -> String -> Pandoc) + -> Benchmark +readerBench doc (name, reader) = + let writer = case lookup name writers of + Just w -> w + Nothing -> error $ "Could not find writer for " ++ name + inp = writer defaultWriterOptions{ writerWrapText = True + , writerLiterateHaskell = + "+lhs" `isSuffixOf` name } doc + -- we compute the length to force full evaluation + getLength (Pandoc (Meta a b c) d) = + length a + length b + length c + length d + in bench (name ++ " reader") $ whnf (getLength . + reader defaultParserState{ stateSmart = True + , stateStandalone = True + , stateLiterateHaskell = + "+lhs" `isSuffixOf` name }) inp + +writerBench :: Pandoc + -> (String, WriterOptions -> Pandoc -> String) + -> Benchmark +writerBench doc (name, writer) = bench (name ++ " writer") $ nf + (writer defaultWriterOptions{ + writerWrapText = True + , writerLiterateHaskell = "+lhs" `isSuffixOf` name }) doc + +normalizeBench :: Pandoc -> [Benchmark] +normalizeBench doc = [ bench "normalize - with" $ nf (encodeJSON . normalize) doc + , bench "normalize - without" $ nf encodeJSON doc + ] + +main = do + inp <- readDataFile (Just ".") "README" + let ps = defaultParserState{ stateSmart = True } + let doc = readMarkdown ps inp + let readerBs = map (readerBench doc) readers + defaultMain $ map (writerBench doc) writers ++ readerBs ++ normalizeBench doc + @@ -44,8 +44,20 @@ Copyright (C) 2008-2010 Andrea Rossato and John MacFarlane Released under the GPL. ---------------------------------------------------------------------- +src/Text/Pandoc/Writers/Org.hs +Copyright (C) 2010 Puneeth Chaganti + +Released under the GPL. + +---------------------------------------------------------------------- +src/Text/Pandoc/Readers/Textile.hs +Copyright (C) 2010 Paul Rivier + +Released under the GPL. + +---------------------------------------------------------------------- src/Text/Pandoc/Biblio.hs -Copyright (C) 2008 Andrea Rossato +Copyright (C) 2008-2010 Andrea Rossato Released under the GPL. @@ -73,8 +73,6 @@ you will need [zip-archive] and (if you want syntax highlighting) - `wrappers`: build the wrapper `markdown2pdf` (default yes) - `highlighting`: compile with syntax highlighting support (increases the size of the executable) (default no) - - `citeproc`: compile with bibliographic support using `citeproc-hs` - (default no) So, for example, @@ -109,22 +107,3 @@ you will need [zip-archive] and (if you want syntax highlighting) [highlighting-kate]: http://hackage.haskell.org/package/highlighting-kate [Cabal User's Guide]: http://www.haskell.org/cabal/release/latest/doc/users-guide/builders.html#setup-configure-paths -Optional citeproc support -------------------------- - -Pandoc can optionally be compiled with support for bibliographic citations -using Andrea Rossato's [`citeproc-hs` library]. This allows you -to specify citations in markdown using an intuitive syntax (for example, -`[jones2005@p. 3; smith2006]`). These are automatically changed into -appropriately styled citations in the output, and a bibliography is -added. The bibliography data and style information are taken from XML -files that must be specified on the command line. 
(Note: `citeproc-hs` -support is experimental, and the interface may change in the future.) - -If you are using Cabal to compile pandoc, specify the `citeproc` flag in -the configure step: - - runhaskell Setup configure --flags="citeproc" - -[`citeproc-hs` library]: http://code.haskell.org/citeproc-hs/ - diff --git a/MakeManPage.hs b/MakeManPage.hs new file mode 100644 index 000000000..47b284380 --- /dev/null +++ b/MakeManPage.hs @@ -0,0 +1,97 @@ +-- Create pandoc.1 man page from README +import Text.Pandoc +import Data.ByteString.UTF8 (toString, fromString) +import Data.Char (toUpper) +import qualified Data.ByteString as B +import Control.Monad +import System.FilePath +import System.Environment (getArgs) +import Text.Pandoc.Shared (normalize) +import System.Directory (getModificationTime) +import System.IO.Error (isDoesNotExistError) +import System.Time (ClockTime(..)) +import Data.Maybe (catMaybes) + +main = do + rmContents <- liftM toString $ B.readFile "README" + let (Pandoc meta blocks) = readMarkdown defaultParserState rmContents + let manBlocks = removeSect [Str "Wrappers"] + $ removeSect [Str "Pandoc's",Space,Str "markdown"] blocks + let syntaxBlocks = extractSect [Str "Pandoc's",Space,Str "markdown"] blocks + args <- getArgs + let verbose = "--verbose" `elem` args + makeManPage verbose ("man" </> "man1" </> "pandoc.1") + meta manBlocks + makeManPage verbose ("man" </> "man5" </> "pandoc_markdown.5") + meta syntaxBlocks + let markdown2pdfpage = "man" </> "man1" </> "markdown2pdf.1" + modDeps <- modifiedDependencies markdown2pdfpage [markdown2pdfpage <.> "md"] + unless (null modDeps) $ do + mpdfContents <- liftM toString $ B.readFile $ markdown2pdfpage <.> "md" + templ <- liftM toString $ B.readFile $ "templates" </> "man.template" + let doc = readMarkdown defaultParserState{ stateStandalone = True } + mpdfContents + writeManPage markdown2pdfpage templ doc + when verbose $ + putStrLn $ "Created " ++ markdown2pdfpage + +makeManPage :: Bool -> FilePath -> Meta -> [Block] -> IO () +makeManPage verbose page meta blocks = do + let templ = page <.> "template" + modDeps <- modifiedDependencies page ["README", templ] + unless (null modDeps) $ do + manTemplate <- liftM toString $ B.readFile templ + writeManPage page manTemplate (Pandoc meta blocks) + when verbose $ + putStrLn $ "Created " ++ page + +writeManPage :: FilePath -> String -> Pandoc -> IO () +writeManPage page templ doc = do + let opts = defaultWriterOptions{ writerStandalone = True + , writerTemplate = templ } + let manPage = writeMan opts $ + bottomUp (concatMap removeLinks) $ + bottomUp capitalizeHeaders doc + B.writeFile page $ fromString manPage + +-- | Returns a list of 'dependencies' that have been modified after 'file'. 
+modifiedDependencies :: FilePath -> [FilePath] -> IO [FilePath] +modifiedDependencies file dependencies = do + fileModTime <- catch (getModificationTime file) $ + \e -> if isDoesNotExistError e + then return (TOD 0 0) -- the minimum ClockTime + else ioError e + depModTimes <- mapM getModificationTime dependencies + let modified = zipWith (\dep time -> if time > fileModTime then Just dep else Nothing) dependencies depModTimes + return $ catMaybes modified + +removeLinks :: Inline -> [Inline] +removeLinks (Link l _) = l +removeLinks x = [x] + +capitalizeHeaders :: Block -> Block +capitalizeHeaders (Header 1 xs) = Header 1 $ bottomUp capitalize xs +capitalizeHeaders x = x + +capitalize :: Inline -> Inline +capitalize (Str xs) = Str $ map toUpper xs +capitalize x = x + +removeSect :: [Inline] -> [Block] -> [Block] +removeSect ils (Header 1 x:xs) | normalize x == normalize ils = + dropWhile (not . isHeader1) xs +removeSect ils (x:xs) = x : removeSect ils xs +removeSect _ [] = [] + +extractSect :: [Inline] -> [Block] -> [Block] +extractSect ils (Header 1 z:xs) | normalize z == normalize ils = + bottomUp promoteHeader $ takeWhile (not . isHeader1) xs + where promoteHeader (Header n x) = Header (n-1) x + promoteHeader x = x +extractSect ils (x:xs) = extractSect ils xs +extractSect _ [] = [] + +isHeader1 :: Block -> Bool +isHeader1 (Header 1 _) = True +isHeader1 _ = False + @@ -1,21 +1,30 @@ % Pandoc User's Guide % John MacFarlane -% March 20, 2010 +% January 29, 2011 + +Synopsis +======== + +pandoc [*options*] [*input-file*]... + +Description +=========== Pandoc is a [Haskell] library for converting from one markup format to another, and a command-line tool that uses this library. It can read -[markdown] and (subsets of) [reStructuredText], [HTML], and [LaTeX]; and -it can write plain text, [markdown], [reStructuredText], [HTML], [LaTeX], -[ConTeXt], [RTF], [DocBook XML], [OpenDocument XML], [ODT], [GNU Texinfo], -[MediaWiki markup], [EPUB], [groff man] pages, and [Slidy] or [S5] -HTML slide shows. +[markdown] and (subsets of) [Textile], [reStructuredText], [HTML], +and [LaTeX]; and it can write plain text, [markdown], [reStructuredText], +[HTML], [LaTeX], [ConTeXt], [RTF], [DocBook XML], [OpenDocument XML], [ODT], +[GNU Texinfo], [MediaWiki markup], [EPUB], [Textile], [groff man] pages, +[Emacs Org-Mode], and [Slidy] or [S5] HTML slide shows. Pandoc's enhanced version of markdown includes syntax for footnotes, tables, flexible ordered lists, definition lists, delimited code blocks, superscript, subscript, strikeout, title blocks, automatic tables of -contents, embedded LaTeX math, and markdown inside HTML block elements. -(These enhancements can be disabled if a drop-in replacement for -`Markdown.pl` is desired.) +contents, embedded LaTeX math, citations, and markdown inside HTML block +elements. (These enhancements, described below under +[Pandoc's markdown](#pandocs-markdown), can be disabled using the `--strict` +option.) In contrast to most existing tools for converting markdown to HTML, which use regex substitutions, Pandoc has a modular design: it consists of a @@ -24,60 +33,25 @@ representation of the document, and a set of writers, which convert this native representation into a target format. Thus, adding an input or output format requires only adding a reader or writer. 
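For instance, a minimal library-level sketch of this reader/writer pipeline, using the 1.x API that appears in the `Benchmark.hs` and `MakeManPage.hs` files added by this patch (the `writeHtmlString` entry point is an assumption about the HTML writer, not something this patch introduces):

    -- Sketch only: a reader parses markdown into the native Pandoc AST,
    -- and a writer renders that same AST into the target format.
    import Text.Pandoc

    main :: IO ()
    main = do
      let doc = readMarkdown defaultParserState "Hello, *pandoc*!"  -- reader: String -> Pandoc
      putStrLn $ writeHtmlString defaultWriterOptions doc           -- writer: Pandoc -> String

Any other writer (for example `writeMan`, as used in `MakeManPage.hs` above) could be substituted without touching the reader, which is the point of the modular design.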
-[markdown]: http://daringfireball.net/projects/markdown/ -[reStructuredText]: http://docutils.sourceforge.net/docs/ref/rst/introduction.html -[S5]: http://meyerweb.com/eric/tools/s5/ -[Slidy]: http://www.w3.org/Talks/Tools/Slidy/ -[HTML]: http://www.w3.org/TR/html40/ -[LaTeX]: http://www.latex-project.org/ -[ConTeXt]: http://www.pragma-ade.nl/ -[RTF]: http://en.wikipedia.org/wiki/Rich_Text_Format -[DocBook XML]: http://www.docbook.org/ -[OpenDocument XML]: http://opendocument.xml.org/ -[ODT]: http://en.wikipedia.org/wiki/OpenDocument -[MediaWiki markup]: http://www.mediawiki.org/wiki/Help:Formatting -[groff man]: http://developer.apple.com/DOCUMENTATION/Darwin/Reference/ManPages/man7/groff_man.7.html -[Haskell]: http://www.haskell.org/ -[GNU Texinfo]: http://www.gnu.org/software/texinfo/ -[EPUB]: http://www.idpf.org/ - -© 2006-2010 John MacFarlane (jgm at berkeley dot edu). Released under the -[GPL], version 2 or greater. This software carries no warranty of -any kind. (See COPYRIGHT for full copyright and warranty notices.) -Other contributors include Recai Oktaş, Paulo Tanimoto, Peter Wang, -Andrea Rossato, Eric Kow, infinity0x, Luke Plant, shreevatsa.public, -rodja.trappe, Bradley Kuhn, thsutton, Justin Bogner. - -[GPL]: http://www.gnu.org/copyleft/gpl.html "GNU General Public License" - Using Pandoc -============ - -If you run `pandoc` without arguments, it will accept input from -stdin. If you run it with file names as arguments, it will take input -from those files. By default, `pandoc` writes its output to stdout.[^1] -If you want to write to a file, use the `-o` option: - - pandoc -o hello.html hello.txt - -[^1]: The exceptions are for `odt` and `epub`. Since these are - a binary output formats, an output file must be specified explicitly. - -Note that you can specify multiple input files on the command line. -`pandoc` will concatenate them all (with blank lines between them) -before parsing: +------------ - pandoc -s ch1.txt ch2.txt refs.txt > book.html +If no *input-file* is specified, input is read from *stdin*. +Otherwise, the *input-files* are concatenated (with a blank +line between each) and used as input. Output goes to *stdout* by +default (though output to *stdout* is disabled for the `odt` and +`epub` output formats). For output to a file, use the `-o` option: -(The `-s` option here tells `pandoc` to produce a standalone HTML file, -with a proper header, rather than a fragment. For more details on this -and many other command-line options, see below.) + pandoc -o output.html input.txt -Instead of a filename, you can specify an absolute URI. In this -case pandoc will attempt to download the content via HTTP: +Instead of a file, an absolute URI may be given. In this case +pandoc will fetch the content using HTTP: pandoc -f html -t markdown http://www.fsf.org +If multiple input files are given, `pandoc` will concatenate them all (with +blank lines between them) before parsing. + The format of the input and output can be specified explicitly using command-line options. 
The input format can be specified using the `-r/--read` or `-f/--from` options, the output format using the @@ -90,46 +64,29 @@ To convert `hello.html` from html to markdown: pandoc -f html -t markdown hello.html -Supported output formats include `markdown`, `latex`, `context` -(ConTeXt), `html`, `rtf` (rich text format), `rst` -(reStructuredText), `docbook` (DocBook XML), `opendocument` -(OpenDocument XML), `odt` (OpenOffice text document), `texinfo`, (GNU -Texinfo), `mediawiki` (MediaWiki markup), `epub` (EPUB ebook), -`man` (groff man), `slidy` (slidy HTML and javascript slide show), or -`s5` (S5 HTML and javascript slide show). - -Supported input formats include `markdown`, `html`, `latex`, and `rst`. -Note that the `rst` reader only parses a subset of reStructuredText -syntax. For example, it doesn't handle tables, option lists, or -footnotes. But for simple documents it should be adequate. The `latex` -and `html` readers are also limited in what they can do. - -If you don't specify a reader or writer explicitly, `pandoc` will -try to determine the input and output format from the extensions of +Supported output formats are listed below under the `-t/--to` option. +Supported input formats are listed below under the `-f/--from` option. Note +that the `rst`, `textile`, `latex`, and `html` readers are not complete; +there are some constructs that they do not parse. + +If the input or output format is not specified explicitly, `pandoc` +will attempt to guess it from the extensions of the input and output filenames. Thus, for example, pandoc -o hello.tex hello.txt will convert `hello.txt` from markdown to LaTeX. If no output file -is specified (so that output goes to stdout), or if the output file's +is specified (so that output goes to *stdout*), or if the output file's extension is unknown, the output format will default to HTML. -If no input file is specified (so that input comes from stdin), or +If no input file is specified (so that input comes from *stdin*), or if the input files' extensions are unknown, the input format will be assumed to be markdown unless explicitly specified. -Character encodings -------------------- - -All input is assumed to be in the UTF-8 encoding, and all output -is in UTF-8. If your local character encoding is not UTF-8 and you use -accented or foreign characters, you should pipe the input and output -through [`iconv`]. For example, +Pandoc uses the UTF-8 character encoding for both input and output. +If your local character encoding is not UTF-8, you +should pipe input and output through `iconv`: - iconv -t utf-8 source.txt | pandoc | iconv -f utf-8 > output.html - -will convert `source.txt` from the local encoding to UTF-8, then -convert it to HTML, then convert back to the local encoding, -putting the output in `output.html`. + iconv -t utf-8 input.txt | pandoc | iconv -f utf-8 Wrappers ======== @@ -151,7 +108,7 @@ name can be specified explicitly using the `-o` option: markdown2pdf -o book.pdf chap1 chap2 -If no input file is specified, input will be taken from stdin. +If no input file is specified, input will be taken from *stdin*. All of `pandoc`'s options will work with `markdown2pdf` as well. `markdown2pdf` assumes that `pdflatex` is in the path. It also @@ -179,140 +136,246 @@ problems with its simulation of symbolic links. 
[TeX Live]: http://www.tug.org/texlive/ [MacTeX]: http://www.tug.org/mactex/ -Command-line options -==================== +Options +======= + +`-f` *FORMAT*, `-r` *FORMAT*, `--from=`*FORMAT*, `--read=`*FORMAT* +: Specify input format. *FORMAT* can be `native` (native Haskell), + `json` (JSON version of native AST), `markdown` (markdown), + `textile` (Textile), `rst` (reStructuredText), `html` (HTML), + or `latex` (LaTeX). If `+lhs` is appended to `markdown`, `rst`, + or `latex`, the input will be treated as literate Haskell source: + see [Literate Haskell support](#literate-haskell-support), + below. + +`-t` *FORMAT*, `-w` *FORMAT*, `--to=`*FORMAT*, `--write=`*FORMAT* +: Specify output format. *FORMAT* can be `native` (native Haskell), + `json` (JSON version of native AST), `plain` (plain text), + `markdown` (markdown), `rst` (reStructuredText), + `html` (HTML), `latex` (LaTeX), `context` (ConTeXt), `man` (groff man), + `mediawiki` (MediaWiki markup), `textile` (Textile), `org` (Emacs + Org-Mode), `texinfo` (GNU Texinfo), `docbook` (DocBook XML), + `opendocument` (OpenDocument XML), `odt` (OpenOffice text document), + `epub` (EPUB book), `slidy` (Slidy HTML and javascript slide show), + `s5` (S5 HTML and javascript slide show), or `rtf` (rich text + format). Note that `odt` and `epub` output will not be directed to + *stdout*; an output filename must be specified using the `-o/--output` + option. If `+lhs` is appended to `markdown`, `rst`, `latex`, or `html`, + the output will be rendered as literate Haskell source: + see [Literate Haskell support](#literate-haskell-support), + below. + +`-s`, `--standalone` +: Produce output with an appropriate header and footer (e.g. a + standalone HTML, LaTeX, or RTF file, not a fragment). + +`-o` *FILE*, `--output=`*FILE* +: Write output to *FILE* instead of *stdout*. If *FILE* is + `-`, output will go to *stdout*. (Exception: if the output + format is `odt` or `epub`, output to stdout is disabled.) + +`-p`, `--preserve-tabs` +: Preserve tabs instead of converting them to spaces (the default). + +`--tab-stop=`*NUMBER* +: Specify the number of spaces per tab (default is 4). -Various command-line options can be used to customize the output. -For further documentation, see the `pandoc(1)` man page. +`--strict` +: Use strict markdown syntax, with no pandoc extensions or variants. + When the input format is HTML, this means that constructs that have no + equivalents in standard markdown (e.g. definition lists or strikeout + text) will be parsed as raw HTML. -`-f`, `--from`, `-r`, or `--read` *format* -: specifies the input format (the format Pandoc will be converting - *from*). *format* can be `native`, `markdown`, `rst`, `html`, or - `latex`. (`+lhs` can be appended to indicate that the input should - be treated as literate Haskell source. See - [Literate Haskell support](#literate-haskell-support), below.) +`--normalize` +: Normalize the document after reading: merge adjacent + `Str` or `Emph` elements, for example, and remove repeated `Space`s. -`-t`, `--to`, `-w`, or `--write` *format* -: specifies the output format -- the format Pandoc will - be converting *to*. *format* can be `native`, `html`, `slidy`, `s5`, - `docbook`, `opendocument`, `latex`, `context`, `markdown`, `man`, - `plain`, `rst`, and `rtf`. (`+lhs` can be appended to indicate that - the output should be treated as literate Haskell source. See - [Literate Haskell support](#literate-haskell-support), below.) 
+`--reference-links` +: Use reference-style links, rather than inline links, in writing markdown + or reStructuredText. By default inline links are used. + +`-R`, `--parse-raw` +: Parse untranslatable HTML codes and LaTeX environments as raw HTML + or LaTeX, instead of ignoring them. Affects only HTML and LaTeX + input. Raw HTML can be printed in markdown, reStructuredText, HTML, Slidy, + and S5 output; raw LaTeX can be printed in markdown, reStructuredText, + LaTeX, and ConTeXt output. The default is for the readers to omit + untranslatable HTML codes and LaTeX environments. (The LaTeX reader + does pass through untranslatable LaTeX *commands*, even if `-R` is not + specified.) + +`-S`, `--smart` +: Produce typographically correct output, converting straight quotes + to curly quotes, `---` and `--` to dashes, ande `...` to ellipses. + Nonbreaking spaces are inserted after certain abbreviations, such + as "Mr." (Note: This option is significant only when the input format is + `markdown` or `textile`. It is selected automatically when the input + format is `textile` or the output format is `latex` or `context`.) + +`-5`, `--html5` +: Produce HTML5 instead of HTML4. This option has no effect for writers + other than `html`. + +`-m` *URL*, `--latexmathml=`*URL* +: Use the [LaTeXMathML] script to display embedded TeX math in HTML output. + To insert a link to a local copy of the `LaTeXMathML.js` script, + provide a *URL*. If no *URL* is provided, the contents of the + script will be inserted directly into the HTML header, preserving + portability at the price of efficiency. If you plan to use math on + several pages, it is much better to link to a copy of the script, + so it can be cached. -`-s` or `--standalone` -: indicates that a standalone document is to be produced (with - appropriate headers and footers), rather than a fragment. +`--mathml` +: Convert TeX math to MathML. In standalone mode, a small javascript + will be inserted that allows the MathML to be viewed on some browsers. -`-o` or `--output` *filename* -: sends output to *filename*. If this option is not specified, - or if its argument is `-`, output will be sent to stdout. - (Exception: if the output format is `odt` or `epub`, output to - stdout is disabled.) +`--jsmath=`*URL* +: Use [jsMath] to display embedded TeX math in HTML output. + The *URL* should point to the jsMath load script (e.g. + `jsMath/easy/load.js`); if provided, it will be linked to in + the header of standalone HTML documents. -`-p` or `--preserve-tabs` -: causes tabs in the source text to be preserved, rather than converted - to spaces (the default). +`--mathjax=`*URL* +: Use [MathJax] to display embedded TeX math in HTML output. + The *URL* should point to the `MathJax.js` load script. -`--tab-stop` *tabstop* -: sets the number of spaces per tab to *tabstop* (defaults to 4). +`--gladtex` +: Enclose TeX math in `<eq>` tags in HTML output. These can then + be processed by [gladTeX] to produce links to images of the typeset + formulas. -`--strict` -: specifies that strict markdown syntax is to be used, without - pandoc's usual extensions and variants (described below). When the - input format is HTML, this means that constructs that have no - equivalents in standard markdown (e.g. definition lists or strikeout - text) will be parsed as raw HTML. +`--mimetex=`*URL* +: Render TeX math using the [mimeTeX] CGI script. If *URL* is not + specified, it is assumed that the script is at `/cgi-bin/mimetex.cgi`. 
-`--reference-links` -: causes reference-style links to be used in markdown - and reStructuredText output. By default inline links are used. - -`-R` or `--parse-raw` -: causes the HTML and LaTeX readers to parse HTML codes and LaTeX - environments that it can't translate as raw HTML or LaTeX. Raw HTML can - be printed in markdown, reStructuredText, HTML, Slidy, and S5 - output; raw LaTeX can be printed in markdown, reStructuredText, - LaTeX, and ConTeXt output. The default is for the readers to omit - untranslatable HTML codes and LaTeX environments. (The LaTeX reader - does pass through untranslatable LaTeX *commands*, even if `-R` is - not specified.) +`--webtex=`*URL* +: Render TeX formulas using an external script that converts TeX + formulas to images. The formula will be concatenated with the URL + provided. If *URL* is not specified, the Google Chart API will be used. + +`-i`, `--incremental` +: Make list items in Slidy or S5 display incrementally (one by one). + The default is for lists to be displayed all at once. + +`--offline` +: Include all the CSS and javascript needed for a Slidy or S5 slide + show in the output, so that the slide show will work even when no + internet connection is available. + +`--xetex` +: Create LaTeX outut suitable for processing by XeTeX. + +`--chapters` +: Treat top-level headers as chapters in LaTeX, ConTeXt, and DocBook + output. + +`-N`, `--number-sections` +: Number section headings in LaTeX, ConTeXt, or HTML output. + By default, sections are not numbered. + +`--listings` +: Use listings package for LaTeX code blocks + +`--section-divs` +: Wrap sections in `<div>` tags (or `<section>` tags in HTML5), + and attach identifiers to the enclosing `<div>` (or `<section>`) + rather than the header itself. + See [Section identifiers](#header-identifiers-in-html), below. + +`--no-wrap` +: Disable text wrapping in output. By default, text is wrapped + appropriately for the output format. + +`--columns`=*NUMBER* +: Specify length of lines in characters (for text wrapping). + +`--email-obfuscation=`*none|javascript|references* +: Specify a method for obfuscating `mailto:` links in HTML documents. + *none* leaves `mailto:` links as they are. *javascript* obfuscates + them using javascript. *references* obfuscates them by printing their + letters as decimal or hexadecimal character references. + If `--strict` is specified, *references* is used regardless of the + presence of this option. + +`--id-prefix`=*STRING* +: Specify a prefix to be added to all automatically generated identifiers + in HTML output. This is useful for preventing duplicate identifiers + when generating fragments to be included in other pages. -`-C` or `--custom-header` *filename* -: can be used to specify a custom document header. Implies `--standalone`. - *Note: this option is deprecated. Use of `--template` is preferred.* +`--indented-code-classes=`*CLASSES* +: Specify classes to use for indented code blocks--for example, + `perl,numberLines` or `haskell`. Multiple classes may be separated + by spaces or commas. -`--toc` or `--table-of-contents` -: includes an automatically generated table of contents (or, in the - case of `latex`, `context`, and `rst`, an instruction to create - one) in the output document. This option has no effect with `man`, - `docbook`, `slidy`, or `s5` output formats. +`--toc`, `--table-of-contents` +: Include an automatically generated table of contents (or, in + the case of `latex`, `context`, and `rst`, an instruction to create + one) in the output document. 
This option has no effect on `man`, + `docbook`, `slidy`, or `s5` output. -`--base-header-level` *level* -: specifies the base level for headers (defaults to 1). +`--base-header-level=`*NUMBER* +: Specify the base level for headers (defaults to 1). -`--template=`*file* -: uses *file* as a custom template for the generated document. Implies - `-s`. See [Templates](#templates) below for a description +`--template=`*FILE* +: Use *FILE* as a custom template for the generated document. Implies + `--standalone`. See [Templates](#templates) below for a description of template syntax. If this option is not used, a default template appropriate for the output format will be used. See also `-D/--print-default-template`. -`-V` *key=val*, `--variable=`*key:val* -: sets the template variable *key* to the value *val* when rendering the +`-V` *KEY=VAL*, `--variable=`*KEY:VAL* +: Set the template variable *KEY* to the value *VAL* when rendering the document in standalone mode. This is only useful when the `--template` option is used to specify a custom template, since pandoc automatically sets the variables used in the default templates. -`-c` or `--css` *filename* -: allows the user to specify a custom stylesheet that will be linked to - in HTML, Slidy, and S5 output. This option can be used repeatedly - to include multiple stylesheets. They will be included in the order - specified. Implies `--standalone`. +`-c` *URL*, `--css=`*URL* +: Link to a CSS style sheet. -`-H` or `--include-in-header` *filename* -: includes the contents of *filename* (verbatim) at the end of the - document header. This can be used, for example, to include special +`-H` *FILE*, `--include-in-header=`*FILE* +: Include contents of *FILE*, verbatim, at the end of the header. + This can be used, for example, to include special CSS or javascript in HTML documents. This option can be used repeatedly to include multiple files in the header. They will be included in the order specified. Implies `--standalone`. -`-B` or `--include-before-body` *filename* -: includes the contents of *filename* (verbatim) at the beginning of - the document body (e.g. after the `<body>` tag in HTML, or the +`-B` *FILE*, `--include-before-body=`*FILE* +: Include contents of *FILE*, verbatim, at the beginning of the + document body (e.g. after the `<body>` tag in HTML, or the `\begin{document}` command in LaTeX). This can be used to include navigation bars or banners in HTML documents. This option can be used repeatedly to include multiple files. They will be included in the order specified. Implies `--standalone`. -`-A` or `--include-after-body` *filename* -: includes the contents of *filename* (verbatim) at the end of - the document body (before the `</body>` tag in HTML, or the +`-A` *FILE*, `--include-after-body=`*FILE* +: Include contents of *FILE*, verbatim, at the end of the document + body (before the `</body>` tag in HTML, or the `\end{document}` command in LaTeX). This option can be be used repeatedly to include multiple files. They will be included in the order specified. Implies `--standalone`. -`--reference-odt` *filename* -: uses the specified file as a style reference in producing an ODT. +`--reference-odt=`*FILE* +: Use the specified file as a style reference in producing an ODT. For best results, the reference ODT should be a modified version of an ODT produced using pandoc. The contents of the reference ODT are ignored, but its stylesheets are used in the new ODT. 
If no reference ODT is specified on the command line, pandoc will look for a file `reference.odt` in the user data directory (see - `--data-dir`, below). If it is not found there, sensible defaults - will be used. + `--data-dir`). If this is not found either, sensible defaults will be + used. -`--epub-stylesheet` *filename* -: uses the specified CSS file to style the EPUB. If no stylesheet +`--epub-stylesheet=`*FILE* +: Use the specified CSS file to style the EPUB. If no stylesheet is specified, pandoc will look for a file `epub.css` in the user data directory (see `--data-dir`, below). If it is not found there, sensible defaults will be used. -`--epub-metadata` *filename* -: looks in the specified XML file for metadata for the EPUB. - The file should contain a series of [Dublin Core elements], - for example: +`--epub-metadata=`*FILE* +: Look in the specified XML file for metadata for the EPUB. + The file should contain a series of Dublin Core elements, + as documented at <http://dublincore.org/documents/dces/>. + For example: <dc:rights>Creative Commons</dc:rights> <dc:language>es-AR</dc:language> @@ -323,123 +386,45 @@ For further documentation, see the `pandoc(1)` man page. `<dc:identifier id="BookId">` (a randomly generated UUID). Any of these may be overridden by elements in the metadata file. -`-D` or `--print-default-template` *format* -: prints the default template for an output *format*. (See `-t` - for a list of possible *format*s.) - -`-T` or `--title-prefix` *string* -: includes *string* as a prefix at the beginning of the title that - appears in the HTML header (but not in the title as it appears at - the beginning of the HTML body). (See below on - [Title Blocks](#title-blocks).) Implies `--standalone`. - -`-S` or `--smart` -: causes `pandoc` to produce typographically correct output, along the - lines of John Gruber's [Smartypants]. Straight quotes are converted - to curly quotes, `---` to dashes, and `...` to ellipses. Nonbreaking - spaces are inserted after certain abbreviations, such as "Mr." - (Note: This option is only significant when the input format is - `markdown`. It is selected automatically when the output format is - `latex` or `context`.) - -`-m`*[url]* or `--latexmathml`*[=url]* -: causes `pandoc` to use the [LaTeXMathML] script to display - TeX math in HTML, Slidy, or S5. If a local copy of `LaTeXMathML.js` - is available on the webserver where the page will be viewed, provide - a *url* and a link will be inserted in the generated HTML. If - no *url* is provided, the contents of the script will be inserted - directly; this provides portability at the price of efficiency. If - you plan to use math on several pages, it is much better to link to - a copy of `LaTeXMathML.js`, which can be cached. (See `--jsmath`, - `--gladtex`, `--webtex`, and `--mimetex` for alternative ways of - dealing with math in HTML.) - -`--mathml` -: causes `pandoc` to convert all TeX math to MathML. - In standalone mode, a small javascript will be inserted that allows - the MathML to be viewed on some browsers. - -`--jsmath`*=[url]* -: causes `pandoc` to use the [jsMath] script to display - TeX math in HTML, Slidy, or S5. The *url* should point to the jsMath - load script (e.g. `jsMath/easy/load.js`). If it is provided, a link - to it will be included in the header of standalone HTML documents. - (See `--latexmathml`, `--mimetex`, `--webtex`, and `--gladtex` for - alternative ways of dealing with math in HTML.) 
- -`--gladtex`*[=url]* -: causes TeX formulas to be enclosed in `<eq>` tags in HTML, Slidy, or - S5 output. This output can then be processed by [gladTeX] to produce - links to images with the typeset formulas. (See `--latexmathml`, - `--jsmath`, `--webtex`, and `--mimetex` for alternative ways of - dealing with math in HTML.) - -`--mimetex`*[=url]* -: causes TeX formulas to be replaced by `<img>` tags linking to the - [mimeTeX] CGI script, which will produce images with the typeset - formulas. (See `--latexmathml`, `--jsmath`, `--webtex`, and - `--gladtex` for alternative ways of dealing with math in HTML.) - -`--webtex`*[=url]* -: causes TeX formulas to be replaced by `<img>` tags linking to an - external service that converts TeX formulas to images. The formula - will be concatenated with the URL provided. If no URL - is specified, the Google Chart API is used. (See `--latexmathml`, - `--jsmath`, `--mimetex`, and `--gladtex` for alternative ways of - dealing with math in HTML.) - -`-i` or `--incremental` -: causes all lists in Slidy or S5 output to be displayed incrementally by - default (one item at a time). The normal default is for lists to be - displayed all at once. - -`--offline` -: causes all the CSS and javascript needed for a Slidy or S5 slide show - to be included in the output, so that the slide show will work even - when no internet connection is available. - -`--xetex` -: creates LaTeX outut suitable for processing by XeTeX. +`-D` *FORMAT*, `--print-default-template=`*FORMAT* +: Print the default template for an output *FORMAT*. (See `-t` + for a list of possible *FORMAT*s.) + +`-T` *STRING*, `--title-prefix=`*STRING* +: Specify *STRING* as a prefix at the beginning of the title + that appears in the HTML header (but not in the title as it + appears at the beginning of the HTML body). Implies + `--standalone`. + +`--bibliography=`*FILE* +: Specify bibliography database to be used in resolving + citations. The database type will be determined from the + extension of *FILE*, which may be `.mods` (MODS format), + `.bib` (BibTeX format), `.bbx` (BibLaTeX format), + `.ris` (RIS format), `.enl` (EndNote format), + `.xml` (EndNote XML format), `.wos` (ISI format), + `.medline` (MEDLINE format), `.copac` (Copac format), + or `.json` (citeproc JSON). If you want to use multiple + bibliographies, just use this option repeatedly. + +`--csl=`*FILE* +: Specify [CSL] style to be used in formatting citations and + the bibliography. If *FILE* is not found, pandoc will look + for it in + + $HOME/.csl -`-N` or `--number-sections` -: causes sections to be numbered in LaTeX, ConTeXt, or HTML output. - By default, sections are not numbered. - -`--section-divs` -: causes sections to be wrapped in `<div>` tags. In this case, - [section identifiers](#header-identifiers-in-html) - are attached to the enclosing `<div>` rather than the header itself. - -`--no-wrap` -: disables text-wrapping in output. By default, text is wrapped - appropriately for the output format. + in unix and -`--sanitize-html` -: sanitizes HTML (in markdown or HTML input) using a whitelist. - Unsafe tags are replaced by HTML comments; unsafe attributes - are omitted. URIs in links and images are also checked against a - whitelist of URI schemes. + C:\Documents And Settings\USERNAME\Application Data\csl -`--email-obfuscation`*=none|javascript|references* -: specifies a method for obfuscating `mailto:` links in HTML documents. - *none* leaves `mailto:` links as they are. *javascript* obfuscates - them using javascript. 
*references* obfuscates them by printing their - letters as decimal or hexadecimal character references. If `--strict` - is specified, *references* is used regardless of the presence - of this option. + in Windows. If the `--csl` option is not specified, pandoc + will use a default style: either `default.csl` in the + user data directory (see `--data-dir`), or, if that is + not present, the Chicago author-date style. -`--id-prefix`*=string* -: specifies a prefix to be added to all automatically generated identifiers - in HTML output. This is useful for preventing duplicate identifiers - when generating fragments to be included in other pages. - -`--indented-code-classes`*=classes* -: specifies classes to use for indented code blocks--for example, - `perl,numberLines` or `haskell`. Multiple classes may be separated - by spaces or commas. - -`--data-dir`*=directory* -: specifies the user data directory to search for pandoc data files. +`--data-dir=`*DIRECTORY* +: Specify the user data directory to search for pandoc data files. If this option is not specified, the default user data directory will be used: @@ -454,24 +439,17 @@ For further documentation, see the `pandoc(1)` man page. normal defaults. `--dump-args` -: is intended to make it easier to create wrapper scripts that use - Pandoc. It causes Pandoc to dump information about the arguments - with which it was called to stdout, then exit. The first line - printed is the name of the output file specified using the `-o` - or `--output` option, or `-` if output would go to stdout. The - remaining lines, if any, list command-line arguments. These will - include the names of input files and any special options passed - after ` -- ` on the command line. So, for example, - - pandoc --dump-args -o foo.html -s foo.txt \ - appendix.txt -- -e latin1 - - will cause the following to be printed to stdout: - - foo.html foo.txt appendix.txt -e latin1 +: Print information about command-line arguments to *stdout*, then exit. + This option is intended primarily for use in wrapper scripts. + The first line of output contains the name of the output file specified + with the `-o` option, or `-` (for *stdout*) if no output file was + specified. The remaining lines contain the command-line arguments, + one per line, in the order they appear. These do not include regular + Pandoc options and their arguments, but do include any options appearing + after a `--` separator at the end of the line. `--ignore-args` -: causes Pandoc to ignore all command-line arguments. +: Ignore command-line arguments (for use in wrapper scripts). Regular Pandoc options are not ignored. Thus, for example, pandoc --ignore-args -o foo.html -s foo.txt -- -e latin1 @@ -480,18 +458,18 @@ For further documentation, see the `pandoc(1)` man page. pandoc -o foo.html -s -`-v` or `--version` -: prints the version number to STDERR. +`-v`, `--version` +: Print version. -`-h` or `--help` -: prints a usage message to STDERR. +`-h`, `--help` +: Show usage message. -[Smartypants]: http://daringfireball.net/projects/smartypants/ [LaTeXMathML]: http://math.etsu.edu/LaTeXMathML/ [jsMath]: http://www.math.union.edu/~dpvc/jsmath/ +[MathJax]: http://www.mathjax.org/ [gladTeX]: http://www.math.uio.no/~martingu/gladtex/index.html [mimeTeX]: http://www.forkosh.com/mimetex.html -[Dublin Core elements]: http://dublincore.org/documents/dces/ +[CSL]: http://CitationStyles.org Templates ========= @@ -522,8 +500,6 @@ To write a literal `$` in a template, use `$$`. Some variables are set automatically by pandoc. 
These vary somewhat depending on the output format, but include: -`legacy-header` -: contents specified by `-C/--custom-header` `header-includes` : contents specified by `-H/--include-in-header` (may have multiple values) @@ -544,6 +520,8 @@ depending on the output format, but include: multiple values) `date` : date of document, as specified in title block +`lang` +: language code for HTML documents Variables may be set at the command line using the `-V/--variable` option. This allows users to include custom variables in their @@ -574,103 +552,347 @@ consecutive items: $for(author)$$author$$sep$, $endfor$ -Pandoc's markdown vs. standard markdown -======================================= +Pandoc's markdown +================= -In parsing markdown, Pandoc departs from and extends [standard markdown] -in a few respects. Except where noted, these differences can -be suppressed by specifying the `--strict` command-line option. +Pandoc understands an extended and slightly revised version of +John Gruber's [markdown] syntax. This document explains the syntax, +noting differences from standard markdown. Except where noted, these +differences can be suppressed by specifying the `--strict` command-line +option. -[standard markdown]: http://daringfireball.net/projects/markdown/syntax - "Markdown syntax description" +Philosophy +---------- -Backslash escapes ------------------ +Markdown is designed to be easy to write, and, even more importantly, +easy to read: -Except inside a code block or inline code, any punctuation or space -character preceded by a backslash will be treated literally, even if it -would normally indicate formatting. Thus, for example, if one writes +> A Markdown-formatted document should be publishable as-is, as plain +> text, without looking like it's been marked up with tags or formatting +> instructions. +> -- [John Gruber](http://daringfireball.net/projects/markdown/syntax#philosophy) - *\*hello\** +This principle has guided pandoc's decisions in finding syntax for +tables, footnotes, and other extensions. -one will get +There is, however, one respect in which pandoc's aims are different +from the original aims of markdown. Whereas markdown was originally +designed with HTML generation in mind, pandoc is designed for multiple +output formats. Thus, while pandoc allows the embedding of raw HTML, +it discourages it, and provides other, non-HTMLish ways of representing +important document elements like definition lists, tables, mathematics, and +footnotes. - <em>*hello*</em> +Paragraphs +---------- -instead of +A paragraph is one or more lines of text followed by one or more blank line. +Newlines are treated as spaces, so you can reflow your paragraphs as you like. +If you need a hard line break, put two or more spaces at the end of a line, +or or type a backslash followed by a newline. - <strong>hello</strong> +Headers +------- -This rule is easier to remember than standard markdown's rule, -which allows only the following characters to be backslash-escaped: +There are two kinds of headers, Setext and atx. - \`*_{}[]()>#+-.! +### Setext-style headers ### -A backslash-escaped space is parsed as a nonbreaking space. It will -appear in TeX output as `~` and in HTML and XML as `\ ` or -`\ `. +A setext-style header is a line of text "underlined" with a row of `=` signs +(for a level one header) of `-` signs (for a level two header): -A backslash-escaped newline (i.e. a backslash occurring at the end of -a line) is parsed as a hard line break. 
It will appear in TeX output as -`\\` and in HTML as `<br />`. This is a nice alternative to -markdown's "invisible" way of indicating hard line breaks using -two trailing spaces on a line. + A level-one header + ================== -Subscripts and superscripts ---------------------------- + A level-two header + ------------------ -Superscripts may be written by surrounding the superscripted text by `^` -characters; subscripts may be written by surrounding the subscripted -text by `~` characters. Thus, for example, +The header text can contain inline formatting, such as emphasis (see +[Inline formatting](#inline-formatting), below). - H~2~O is a liquid. 2^10^ is 1024. -If the superscripted or subscripted text contains spaces, these spaces -must be escaped with backslashes. (This is to prevent accidental -superscripting and subscripting through the ordinary use of `~` and `^`.) -Thus, if you want the letter P with 'a cat' in subscripts, use -`P~a\ cat~`, not `P~a cat~`. +### Atx-style headers ### -Strikeout ---------- +An Atx-style header consists of one to six `#` signs and a line of +text, optionally followed by any number of `#` signs. The number of +`#` signs at the beginning of the line is the header level: -To strikeout a section of text with a horizontal line, begin and end it -with `~~`. Thus, for example, + ## A level-two header - This ~~is deleted text.~~ + ### A level-three header ### -Nested Lists ------------- +As with setext-style headers, the header text can contain formatting: -Pandoc behaves differently from standard markdown on some "edge -cases" involving lists. Consider this source: + # A level-one header with a [link](/url) and *emphasis* - 1. First - 2. Second: - - Fee - - Fie - - Foe +Standard markdown syntax does not require a blank line before a header. +Pandoc does require this (except, of course, at the beginning of the +document). The reason for the requirement is that it is all too easy for a +`#` to end up at the beginning of a line by accident (perhaps through line +wrapping). Consider, for example: - 3. Third + I like several of their flavors of ice cream: + #22, for example, and #5. + + +### Header identifiers in HTML ### + +*Pandoc extension*. + +Each header element in pandoc's HTML output is given a unique +identifier. This identifier is based on the text of the header. To +derive the identifier from the header text, + + - Remove all formatting, links, etc. + - Remove all punctuation, except underscores, hyphens, and periods. + - Replace all spaces and newlines with hyphens. + - Convert all alphabetic characters to lowercase. + - Remove everything up to the first letter (identifiers may + not begin with a number or punctuation mark). + - If nothing is left after this, use the identifier `section`. + +Thus, for example, + + Header Identifier + ------------------------------- ---------------------------- + Header identifiers in HTML `header-identifiers-in-html` + *Dogs*?--in *my* house? `dogs--in-my-house` + [HTML], [S5], or [RTF]? `html-s5-or-rtf` + 3. Applications `applications` + 33 `section` + +These rules should, in most cases, allow one to determine the identifier +from the header text. The exception is when several headers have the +same text; in this case, the first will get an identifier as described +above; the second will get the same identifier with `-1` appended; the +third with `-2`; and so on. + +These identifiers are used to provide link targets in the table of +contents generated by the `--toc|--table-of-contents` option. 
They +also make it easy to provide links from one section of a document to +another. A link to this section, for example, might look like this: + + See the section on + [header identifiers](#header-identifiers-in-html). + +Note, however, that this method of providing links to sections works +only in HTML. + +If the `--section-divs` option is specified, then each section will +be wrapped in a `div` (or a `section`, if `--html5` was specified), +and the identifier will be attached to the enclosing `<div>` +(or `<section>`) tag rather than the header itself. This allows entire +sections to be manipulated using javascript or treated differently in +CSS. -Pandoc transforms this into a "compact list" (with no `<p>` tags around -"First", "Second", or "Third"), while markdown puts `<p>` tags around -"Second" and "Third" (but not "First"), because of the blank space -around "Third". Pandoc follows a simple rule: if the text is followed by -a blank line, it is treated as a paragraph. Since "Second" is followed -by a list, and not a blank line, it isn't treated as a paragraph. The -fact that the list is followed by a blank line is irrelevant. (Note: -Pandoc works this way even when the `--strict` option is specified. This -behavior is consistent with the official markdown syntax description, -even though it is different from that of `Markdown.pl`.) -Ordered Lists -------------- +Block quotations +---------------- + +Markdown uses email conventions for quoting blocks of text. +A block quotation is one or more paragraphs or other block elements +(such as lists or headers), with each line preceded by a `>` character +and a space. (The `>` need not start at the left margin, but it should +not be indented more than three spaces.) + + > This is a block quote. This + > paragraph has two lines. + > + > 1. This is a list inside a block quote. + > 2. Second item. + +A "lazy" form, which requires the `>` character only on the first +line of each block, is also allowed: + + > This is a block quote. This + paragraph has two lines. + + > 1. This is a list inside a block quote. + 2. Second item. + +Among the block elements that can be contained in a block quote are +other block quotes. That is, block quotes can be nested: + + > This is a block quote. + > + > > A block quote within a block quote. + +Standard markdown syntax does not require a blank line before a block +quote. Pandoc does require this (except, of course, at the beginning of the +document). The reason for the requirement is that it is all too easy for a +`>` to end up at the beginning of a line by accident (perhaps through line +wrapping). So, unless `--strict` is used, the following does not produce +a nested block quote in pandoc: + + > This is a block quote. + >> Nested. + + +Verbatim (code) blocks +---------------------- + +### Indented code blocks ### + +A block of text indented four spaces (or one tab) is treated as verbatim +text: that is, special characters do not trigger special formatting, +and all spaces and line breaks are preserved. For example, + + if (a > 3) { + moveShip(5 * gravity, DOWN); + } + +The initial (four space or one tab) indentation is not considered part +of the verbatim text, and is removed in the output. + +Note: blank lines in the verbatim text need not begin with four spaces. + + +### Delimited code blocks ### + +*Pandoc extension*. + +In addition to standard indented code blocks, Pandoc supports +*delimited* code blocks. 
These begin with a row of three or more +tildes (`~`) and end with a row of tildes that must be at least +as long as the starting row. Everything between the tilde-lines +is treated as code. No indentation is necessary: + + ~~~~~~~ + if (a > 3) { + moveShip(5 * gravity, DOWN); + } + ~~~~~~~ + +Like regular code blocks, delimited code blocks must be separated +from surrounding text by blank lines. + +If the code itself contains a row of tildes, just use a longer +row of tildes at the start and end: + + ~~~~~~~~~~~~~~~~ + ~~~~~~~~~~ + code including tildes + ~~~~~~~~~~ + ~~~~~~~~~~~~~~~~ + +Optionally, you may specify the language of the code block using +this syntax: + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ {.haskell .numberLines} + qsort [] = [] + qsort (x:xs) = qsort (filter (< x) xs) ++ [x] ++ + qsort (filter (>= x) xs) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some output formats can use this information to do syntax highlighting. +Currently, the only output format that uses this information is HTML. + +If pandoc has been compiled with syntax highlighting support, then the +code block above will appear highlighted, with numbered lines. (To see +which languages are supported, do `pandoc --version`.) + +If pandoc has not been compiled with syntax highlighting support, the +code block above will appear as follows: + + <pre class="haskell"> + <code> + ... + </code> + </pre> + + +Lists +----- + +### Bullet lists ### + +A bullet list is a list of bulleted list items. A bulleted list +item begins with a bullet (`*`, `+`, or `-`). Here is a simple +example: + + * one + * two + * three + +This will produce a "compact" list. If you want a "loose" list, in which +each item is formatted as a paragraph, put spaces between the items: + + * one + + * two + + * three + +The bullets need not be flush with the left margin; they may be +indented one, two, or three spaces. The bullet must be followed +by whitespace. + +A list item may contain multiple paragraphs and other block-level +content. Subsequent paragraphs must be preceded by a blank line +and indented four spaces or a tab. The list will look better if +the first paragraph is aligned with the rest: + + * First paragraph. + + Continued. + + * Second paragraph. With a code block, which must be indented + eight spaces: + + { code } + +List items may include other lists. In this case the preceding blank +line is optional. The nested list must be indented four spaces or +one tab: + + * fruits + + apples + - macintosh + - red delicious + + pears + + peaches + * vegetables + + brocolli + + chard + +Markdown allows you to write list items "lazily," instead of +indenting continuation lines. However, if there are multiple paragraphs +or other blocks in a list item, the first line of each must be indented. + + + A lazy, lazy, list + item. + + + Another one; this looks + bad but is legal. + + Second paragraph of second + list item. + + +### Ordered lists ### + +Ordered lists work just like bulleted lists, except that the items +begin with enumerators rather than bullets. + +In standard markdown, enumerators are decimal numbers followed +by a period and a space. The numbers themselves are ignored, so +there is no difference between this list: + + 1. one + 2. two + 3. three + +and this one: + + 5. one + 7. two + 1. three + +*Pandoc extension*. Unlike standard markdown, Pandoc allows ordered list items to be marked with uppercase and lowercase letters and roman numerals, in addition to -arabic numerals. 
(This behavior can be turned off using the `--strict` -option.) List markers may be enclosed in parentheses or followed by a +arabic numerals. List markers may be enclosed in parentheses or followed by a single right-parentheses or period. They must be separated from the text that follows by at least one space, and, if the list marker is a capital letter with a period, by at least two spaces.[^2] @@ -718,34 +940,10 @@ If default list markers are desired, use `#.`: #. two #. three -Numbered examples ------------------ - -The special list marker `@` can be used for sequentially numbered -examples. The first list item with a `@` marker will be numbered '1', -the next '2', and so on, throughout the document. The numbered examples -need not occur in a single list; each new list using `@` will take up -where the last stopped. So, for example: - - (@) My first example will be numbered (1). - (@) My second example will be numbered (2). - Explanation of examples. - - (@) My third example will be numbered (3). - -Numbered examples can be labeled and referred to elsewhere in the -document: - - (@good) This is a good example. - - As (@good) illustrates, ... - -The label can be any string of alphanumeric characters, underscores, -or hyphens. +### Definition lists ### -Definition lists ----------------- +*Pandoc extension*. Pandoc supports definition lists, using a syntax inspired by [PHP Markdown Extra] and [reStructuredText]:[^3] @@ -785,63 +983,114 @@ definition and the next term: [PHP Markdown Extra]: http://www.michelf.com/projects/php-markdown/extra/ -Reference links ---------------- -Pandoc allows implicit reference links with just a single set of -brackets. So, the following links are equivalent: +### Numbered example lists ### - 1. Here's my [link] - 2. Here's my [link][] +*Pandoc extension*. - [link]: linky.com +The special list marker `@` can be used for sequentially numbered +examples. The first list item with a `@` marker will be numbered '1', +the next '2', and so on, throughout the document. The numbered examples +need not occur in a single list; each new list using `@` will take up +where the last stopped. So, for example: -(Note: Pandoc works this way even if `--strict` is specified, because -`Markdown.pl` 1.0.2b7 allows single-bracket links.) + (@) My first example will be numbered (1). + (@) My second example will be numbered (2). -Footnotes ---------- + Explanation of examples. -Pandoc's markdown allows footnotes, using the following syntax: + (@) My third example will be numbered (3). - Here is a footnote reference,[^1] and another.[^longnote] +Numbered examples can be labeled and referred to elsewhere in the +document: - [^1]: Here is the footnote. + (@good) This is a good example. - [^longnote]: Here's one with multiple blocks. + As (@good) illustrates, ... - Subsequent paragraphs are indented to show that they - belong to the previous footnote. +The label can be any string of alphanumeric characters, underscores, +or hyphens. - { some.code } - The whole paragraph can be indented, or just the first - line. In this way, multi-paragraph footnotes work like - multi-paragraph list items. +### Compact and loose lists ### - This paragraph won't be part of the note, because it isn't indented. +Pandoc behaves differently from `Markdown.pl` on some "edge +cases" involving lists. Consider this source: -The identifiers in footnote references may not contain spaces, tabs, -or newlines. 
These identifiers are used only to correlate the -footnote reference with the note itself; in the output, footnotes -will be numbered sequentially. + + First + + Second: + - Fee + - Fie + - Foe -The footnotes themselves need not be placed at the end of the -document. They may appear anywhere except inside other block elements -(lists, block quotes, tables, etc.). + + Third -Inline footnotes are also allowed (though, unlike regular notes, -they cannot contain multiple paragraphs). The syntax is as follows: +Pandoc transforms this into a "compact list" (with no `<p>` tags around +"First", "Second", or "Third"), while markdown puts `<p>` tags around +"Second" and "Third" (but not "First"), because of the blank space +around "Third". Pandoc follows a simple rule: if the text is followed by +a blank line, it is treated as a paragraph. Since "Second" is followed +by a list, and not a blank line, it isn't treated as a paragraph. The +fact that the list is followed by a blank line is irrelevant. (Note: +Pandoc works this way even when the `--strict` option is specified. This +behavior is consistent with the official markdown syntax description, +even though it is different from that of `Markdown.pl`.) - Here is an inline note.^[Inlines notes are easier to write, since - you don't have to pick an identifier and move down to type the - note.] -Inline and regular footnotes may be mixed freely. +### Ending a list ### + +What if you want to put an indented code block after a list? + + - item one + - item two + + { my code block } + +Trouble! Here pandoc (like other markdown implementations) will treat +`{ my code block }` as the second paragraph of item two, and not as +a code block. + +To "cut off" the list after item two, you can insert some non-indented +content, like an HTML comment, which won't produce visible output in +any format: + + - item one + - item two + + <!-- end of list --> + + { my code block } + +You can use the same trick if you want two consecutive lists instead +of one big list: + + 1. one + 2. two + 3. three + + <!-- --> + + a. uno + b. dos + c. tres + + +Horizontal rules +---------------- + +A line containing a row of three or more `*`, `-`, or `_` characters +(optionally separated by spaces) produces a horizontal rule: + + * * * * + + --------------- + Tables ------ +*Pandoc extension*. + Three kinds of tables may be used. All three kinds presuppose the use of a fixed-width font, such as Courier. @@ -891,7 +1140,8 @@ of the first line of the table body. So, in the tables above, the columns would be right, left, center, and right aligned, respectively. **Multiline tables** allow headers and table rows to span multiple lines -of text. Here is an example: +of text (but cells that span multiple columns or rows of the table are +not supported). Here is an example: ------------------------------------------------------------- Centered Default Right Left @@ -940,7 +1190,7 @@ the table), or the table may be interpreted as a simple table. **Grid tables** look like this: : Sample grid table. - + +---------------+---------------+--------------------+ | Fruit | Price | Advantages | +===============+===============+====================+ @@ -954,84 +1204,16 @@ the table), or the table may be interpreted as a simple table. The row of `=`s separates the header from the table body, and can be omitted for a headerless table. The cells of grid tables may contain arbitrary block elements (multiple paragraphs, code blocks, lists, -etc.). 
Alignments are not supported, nor are multi-column or multi-row -cells. Grid tables can be created easily using [Emacs table mode]. +etc.). Alignments are not supported, nor are cells that span multiple +columns or rows. Grid tables can be created easily using [Emacs table mode]. [Emacs table mode]: http://table.sourceforge.net/ -Delimited Code blocks ---------------------- - -In addition to standard indented code blocks, Pandoc supports -*delimited* code blocks. These begin with a row of three or more -tildes (`~`) and end with a row of tildes that must be at least -as long as the starting row. Everything between the tilde-lines -is treated as code. No indentation is necessary: - - ~~~~~~~ - {code here} - ~~~~~~~ -Like regular code blocks, delimited code blocks must be separated -from surrounding text by blank lines. +Title block +----------- -If the code itself contains a row of tildes, just use a longer -row of tildes at the start and end: - - ~~~~~~~~~~~~~~~~ - ~~~~~~~~~~ - code including tildes - ~~~~~~~~~~ - ~~~~~~~~~~~~~~~~ - -Optionally, you may specify the language of the code block using -this syntax: - - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ {.haskell .numberLines} - qsort [] = [] - qsort (x:xs) = qsort (filter (< x) xs) ++ [x] ++ - qsort (filter (>= x) xs) - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Some output formats can use this information to do syntax highlighting. -Currently, the only output format that uses this information is HTML. - -If pandoc has been compiled with syntax highlighting support, then the -code block above will appear highlighted, with numbered lines. (To see -which languages are supported, do `pandoc --version`.) - -If pandoc has not been compiled with syntax highlighting support, the -code block above will appear as follows: - - <pre class="haskell"> - <code> - ... - </code> - </pre> - -Images with captions --------------------- - -An image occurring by itself in a paragraph will be rendered as -a figure with a caption.[^5] (In LaTeX, a figure environment will be -used; in HTML, the image will be placed in a `div` with class -`figure`, together with a caption in a `p` with class `caption`.) -The image's alt text will be used as the caption. - -  - -[^5]: This feature is not yet implemented for RTF, OpenDocument, or - ODT. In those formats, you'll just get an image in a paragraph by - itself, with no caption. - -If you just want a regular inline image, just make sure it is not -the only thing in the paragraph. One way to do this is to insert a -nonbreaking space after the image: - - \ - -Title blocks ------------- +*Pandoc extension*. If the file begins with a title block @@ -1107,12 +1289,233 @@ will also have "Pandoc User Manuals" in the footer. will also have "Version 4.0" in the header. -Markdown in HTML blocks ------------------------ -While standard markdown leaves HTML blocks exactly as they are, Pandoc -treats text between HTML tags as markdown. Thus, for example, Pandoc -will turn +Backslash escapes +----------------- + +Except inside a code block or inline code, any punctuation or space +character preceded by a backslash will be treated literally, even if it +would normally indicate formatting. Thus, for example, if one writes + + *\*hello\** + +one will get + + <em>*hello*</em> + +instead of + + <strong>hello</strong> + +This rule is easier to remember than standard markdown's rule, +which allows only the following characters to be backslash-escaped: + + \`*_{}[]()>#+-.! 
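
The same rule can be exercised through pandoc's Haskell API. The following is
a minimal sketch (not part of the pandoc sources), assuming the pandoc 1.8
signatures `readMarkdown :: ParserState -> String -> Pandoc` and
`writeHtmlString :: WriterOptions -> Pandoc -> String`; the rendered fragment
may include an enclosing `<p>` tag:

    -- EscapeDemo.hs: illustrative sketch only
    import Text.Pandoc

    main :: IO ()
    main = do
      -- The markdown input is *\*hello\**: the inner asterisks are
      -- backslash-escaped, so only the outer pair triggers emphasis.
      let doc = readMarkdown defaultParserState "*\\*hello\\**\n"
      -- Expected to print something like <em>*hello*</em>, as described above.
      putStrLn $ writeHtmlString defaultWriterOptions doc
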
+ +(However, if the `--strict` option is supplied, the standard +markdown rule will be used.) + +A backslash-escaped space is parsed as a nonbreaking space. It will +appear in TeX output as `~` and in HTML and XML as `\ ` or +`\ `. + +A backslash-escaped newline (i.e. a backslash occurring at the end of +a line) is parsed as a hard line break. It will appear in TeX output as +`\\` and in HTML as `<br />`. This is a nice alternative to +markdown's "invisible" way of indicating hard line breaks using +two trailing spaces on a line. + +Backslash escapes do not work in verbatim contexts. + +Smart punctuation +----------------- + +If the `--smart` option is specified, pandoc will produce typographically +correct output, converting straight quotes to curly quotes, `---` and `--` +to Em-dashes, and `...` to ellipses. Nonbreaking spaces are inserted after +certain abbreviations, such as "Mr." + +Inline formatting +----------------- + +### Emphasis ### + +To *emphasize* some text, surround it with `*`s or `_`, like this: + + This text is _emphasized with underscores_, and this + is *emphasized with asterisks*. + +Double `*` or `_` produces **strong emphasis**: + + This is **strong emphasis** and __with underscores__. + +A `*` or `_` character surrounded by spaces, or backslash-escaped, +will not trigger emphasis: + + This is * not emphasized *, and \*neither is this\*. + +Because `_` is sometimes used inside words and identifiers, +pandoc does not interpret a `_` surrounded by alphanumeric +characters as an emphasis marker. If you want to emphasize +just part of a word, use `*`: + + feas*ible*, not feas*able*. + + +### Strikeout ### + +*Pandoc extension*. + +To strikeout a section of text with a horizontal line, begin and end it +with `~~`. Thus, for example, + + This ~~is deleted text.~~ + + +### Superscripts and subscripts ### + +*Pandoc extension*. + +Superscripts may be written by surrounding the superscripted text by `^` +characters; subscripts may be written by surrounding the subscripted +text by `~` characters. Thus, for example, + + H~2~O is a liquid. 2^10^ is 1024. + +If the superscripted or subscripted text contains spaces, these spaces +must be escaped with backslashes. (This is to prevent accidental +superscripting and subscripting through the ordinary use of `~` and `^`.) +Thus, if you want the letter P with 'a cat' in subscripts, use +`P~a\ cat~`, not `P~a cat~`. + + +### Verbatim ### + +To make a short span of text verbatim, put it inside backticks: + + What is the difference between `>>=` and `>>`? + +If the verbatim text includes a backtick, use double backticks: + + Here is a literal backtick `` ` ``. + +(The spaces after the opening backticks and before the closing +backticks will be ignored.) + +The general rule is that a verbatim span starts with a string +of consecutive backticks (optionally followed by a space) +and ends with a string of the same number of backticks (optionally +preceded by a space). + +Note that backslash-escapes (and other markdown constructs) do not +work in verbatim contexts: + + This is a backslash followed by an asterisk: `\*`. + + +Math +---- + +*Pandoc extension*. + +Anything between two `$` characters will be treated as TeX math. The +opening `$` must have a character immediately to its right, while the +closing `$` must have a character immediately to its left. Thus, +`$20,000 and $30,000` won't parse as math. If for some reason +you need to enclose text in literal `$` characters, backslash-escape +them and they won't be treated as math delimiters. 
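
The delimiter rules above can also be checked at the library level. This
sketch (not part of the pandoc sources) assumes the pandoc 1.8 `readMarkdown`
signature and the `Math InlineMath` constructor from the Pandoc AST; the
comments describe the expected shape of the parse, not literal `show` output:

    -- MathDemo.hs: illustrative sketch only
    import Text.Pandoc

    main :: IO ()
    main = do
      -- The opening $ has a character to its right and the closing $ has a
      -- character to its left, so this parses as inline TeX math,
      -- roughly: Math InlineMath "e = mc^2"
      print $ readMarkdown defaultParserState "Einstein wrote $e = mc^2$.\n"
      -- Neither condition is satisfied here, so the dollar amounts remain
      -- ordinary Str/Space inlines and no Math element is produced.
      print $ readMarkdown defaultParserState "$20,000 and $30,000\n"
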
+ +TeX math will be printed in all output formats. How it is rendered +depends on the output format: + +Markdown, reStructuredText, LaTeX, Org-Mode, ConTeXt + ~ It will appear verbatim between `$` characters. + +reStructuredText + ~ It will be rendered using an interpreted text role `:math:`, as described + [here](http://www.american.edu/econ/itex2mml/mathhack.rst). + +Texinfo + ~ It will be rendered inside a `@math` command. + +groff man + ~ It will be rendered verbatim without `$`'s. + +MediaWiki + ~ It will be rendered inside `<math>` tags. + +Textile + ~ It will be rendered inside `<span class="math">` tags. + +RTF, Docbook, OpenDocument, ODT + ~ It will be rendered, if possible, using unicode characters, + and will otherwise appear verbatim. + +HTML, Slidy, S5, EPUB + ~ The way math is rendered in HTML will depend on the + command-line options selected: + + 1. The default is to render TeX math as far as possible using unicode + characters, as with RTF, Docbook, and OpenDocument output. Formulas + are put inside a `span` with `class="math"`, so that they may be + styled differently from the surrounding text if needed. + + 2. If the `--latexmathml` option is used, TeX math will be displayed + between $ or $$ characters and put in `<span>` tags with class `LaTeX`. + The [LaTeXMathML] script will be used to render it as formulas. + (This trick does not work in all browsers, but it works in Firefox. + In browsers that do not support LaTeXMathML, TeX math will appear + verbatim between $ characters.) + + 3. If the `--jsmath` option is used, TeX math will be put inside + `<span>` tags (for inline math) or `<div>` tags (for display math) + with class `math`. The [jsMath] script will be used to render + it. + + 4. If the `--mimetex` option is used, the [mimeTeX] CGI script will + be called to generate images for each TeX formula. This should + work in all browsers. The `--mimetex` option takes an optional URL + as argument. If no URL is specified, it will be assumed that the + mimeTeX CGI script is at `/cgi-bin/mimetex.cgi`. + + 5. If the `--gladtex` option is used, TeX formulas will be enclosed + in `<eq>` tags in the HTML output. The resulting `htex` file may then + be processed by [gladTeX], which will produce image files for each + formula and an `html` file with links to these images. So, the + procedure is: + + pandoc -s --gladtex myfile.txt -o myfile.htex + gladtex -d myfile-images myfile.htex + # produces myfile.html and images in myfile-images + + 6. If the `--webtex` option is used, TeX formulas will be converted + to `<img>` tags that link to an external script that converts + formulas to images. The formula will be URL-encoded and concatenated + with the URL provided. If no URL is specified, the Google Chart + API will be used (`http://chart.apis.google.com/chart?cht=tx&chl=`). + + +Raw HTML +-------- + +Markdown allows you to insert raw HTML anywhere in a document +(except verbatim contexts, where `<`, `>`, and `&` are interpreted +literally). + +The raw HTML is passed through unchanged in HTML, S5, Slidy, EPUB, +Markdown, and Textile output, and suppressed in other formats. + +*Pandoc extension*. + +Standard markdown allows you to include HTML "blocks": blocks +of HTML between balanced tags that are separated from the surrounding text +with blank lines, and start and end at the left margin. Within +these blocks, everything is interpreted as HTML, not markdown; +so (for example), `*` does not signify emphasis. 
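
At the library level, a block treated this way surfaces in the document AST as
raw HTML rather than as parsed markdown. The sketch below (not part of the
pandoc sources) assumes the `RawBlock "html"` constructor introduced in pandoc
1.8 (see the changelog later in this document) and a `stateStrict` field in
`ParserState` corresponding to the `--strict` option; both names are
assumptions of this sketch:

    -- RawHtmlDemo.hs: illustrative sketch only
    import Text.Pandoc

    main :: IO ()
    main = do
      let inp = "<div>\n*one*\n</div>\n"
      -- With strict (standard-markdown) behavior, the block should come back
      -- as raw HTML, roughly [RawBlock "html" "<div>\n*one*\n</div>"], and
      -- *one* is not turned into emphasis.
      print $ readMarkdown defaultParserState{ stateStrict = True } inp
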
+ +Pandoc behaves this way when `--strict` is specified; but by default, +pandoc interprets material between HTML block tags as markdown. +Thus, for example, Pandoc will turn <table> <tr> @@ -1133,168 +1536,272 @@ into whereas `Markdown.pl` will preserve it as is. There is one exception to this rule: text between `<script>` and -`</script>` tags is not interpreted as markdown. +`<style>` tags is not interpreted as markdown. This departure from standard markdown should make it easier to mix markdown with HTML block elements. For example, one can surround a block of markdown text with `<div>` tags without preventing it from being interpreted as markdown. -Header identifiers in HTML --------------------------- -Each header element in pandoc's HTML output is given a unique -identifier. This identifier is based on the text of the header. To -derive the identifier from the header text, +Raw TeX +------- - - Remove all formatting, links, etc. - - Remove all punctuation, except underscores, hyphens, and periods. - - Replace all spaces and newlines with hyphens. - - Convert all alphabetic characters to lowercase. - - Remove everything up to the first letter (identifiers may - not begin with a number or punctuation mark). - - If nothing is left after this, use the identifier `section`. +*Pandoc extension*. -Thus, for example, +In addition to raw HTML, pandoc allows raw LaTeX, TeX, and ConTeXt to be +included in a document. Inline TeX commands will be preserved and passed +unchanged to the LaTeX and ConTeXt writers. Thus, for example, you can use +LaTeX to include BibTeX citations: - Header Identifier - ------------------------------------- --------------------------- - Header identifiers in HTML `header-identifiers-in-html` - *Dogs*?--in *my* house? `dogs--in-my-house` - [HTML], [S5], or [RTF]? `html-s5-or-rtf` - 3. Applications `applications` - 33 `section` + This result was proved in \cite{jones.1967}. -These rules should, in most cases, allow one to determine the identifier -from the header text. The exception is when several headers have the -same text; in this case, the first will get an identifier as described -above; the second will get the same identifier with `-1` appended; the -third with `-2`; and so on. +Note that in LaTeX environments, like -These identifiers are used to provide link targets in the table of -contents generated by the `--toc|--table-of-contents` option. They -also make it easy to provide links from one section of a document to -another. A link to this section, for example, might look like this: + \begin{tabular}{|l|l|}\hline + Age & Frequency \\ \hline + 18--25 & 15 \\ + 26--35 & 33 \\ + 36--45 & 22 \\ \hline + \end{tabular} - See the section on [header identifiers](#header-identifiers-in-html). +the material between the begin and end tags will be interpreted as raw +LaTeX, not as markdown. -Note, however, that this method of providing links to sections works -only in HTML. +Inline LaTeX is ignored in output formats other than Markdown, LaTeX, +and ConTeXt. -If the `--section-divs` option is specified, then each section will -be wrapped in a `div`, and the identifier will be attached to the -enclosing `<div>` tag rather than the header itself. This allows entire -sections to be manipulated using javascript or treated differently in -CSS. 
+### Macros ### -Blank lines before headers and blockquotes ------------------------------------------- +For output formats other than LaTeX, pandoc will parse LaTeX `\newcommand` and +`\renewcommand` definitions and apply the resulting macros to all LaTeX +math. So, for example, the following will work in all output formats, +not just LaTeX: -Standard markdown syntax does not require a blank line before a header -or blockquote. Pandoc does require this (except, of course, at the -beginning of the document). The reason for the requirement is that -it is all too easy for a `>` or `#` to end up at the beginning of a -line by accident (perhaps through line wrapping). Consider, for -example: + \newcommand{\tuple}[1]{\langle #1 \rangle} - I like several of their flavors of ice cream: #22, for example, and - #5. + $\tuple{a, b, c}$ -Math ----- +In LaTeX output, the `\newcommand` definition will simply be passed +unchanged to the output. -Anything between two $ characters will be treated as TeX math. The -opening $ must have a character immediately to its right, while the -closing $ must have a character immediately to its left. Thus, -`$20,000 and $30,000` won't parse as math. If for some reason -you need to enclose text in literal $ characters, backslash-escape -them and they won't be treated as math delimiters. -TeX math will be printed in all output formats. In Markdown, -reStructuredText, LaTeX, and ConTeXt output, it will appear verbatim -between $ characters. - -In reStructuredText output, it will be rendered using an interpreted -text role `:math:`, as described -[here](http://www.american.edu/econ/itex2mml/mathhack.rst). - -In Texinfo output, it will be rendered inside a `@math` command. - -In groff man output, it will be rendered verbatim without $'s. - -In MediaWiki output, it will be rendered inside `<math>` tags. - -In RTF, Docbook, and OpenDocument output, it will be rendered, as far as -possible, using unicode characters, and will otherwise appear verbatim. -Unknown commands and symbols, and commands that cannot be dealt with -this way (like `\frac`), will be rendered verbatim. So the results may -be a mix of raw TeX code and properly rendered unicode math. - -In HTML, Slidy, and S5 output, the way math is rendered will depend on the -command-line options selected: - -1. The default is to render TeX math as far as possible using unicode - characters, as with RTF, Docbook, and OpenDocument output. Formulas - are put inside a `span` with `class="math"`, so that they may be - styled differently from the surrounding text if needed. - -2. If the `--latexmathml` option is used, TeX math will be displayed - between $ or $$ characters and put in `<span>` tags with class `LaTeX`. - The [LaTeXMathML] script will be used to render it as formulas. - (This trick does not work in all browsers, but it works in Firefox. - In browsers that do not support LaTeXMathML, TeX math will appear - verbatim between $ characters.) - -3. If the `--jsmath` option is used, TeX math will be put inside - `<span>` tags (for inline math) or `<div>` tags (for display math) - with class `math`. The [jsMath] script will be used to render - it. - -4. If the `--mimetex` option is used, the [mimeTeX] CGI script will - be called to generate images for each TeX formula. This should - work in all browsers. The `--mimetex` option takes an optional URL - as argument. If no URL is specified, it will be assumed that the - mimeTeX CGI script is at `/cgi-bin/mimetex.cgi`. - -5. 
If the `--gladtex` option is used, TeX formulas will be enclosed - in `<eq>` tags in the HTML output. The resulting `htex` file may then - be processed by [gladTeX], which will produce image files for each - formula and an `html` file with links to these images. So, the - procedure is: - - pandoc -s --gladtex myfile.txt -o myfile.htex - gladtex -d myfile-images myfile.htex - # produces myfile.html and images in myfile-images - -6. If the `--webtex` option is used, TeX formulas will be converted - to `<img>` tags that link to an external script that converts - formulas to images. The formula will be URL-encoded and concatenated - with the URL provided. If no URL is specified, the Google Chart - API will be used (`http://chart.apis.google.com/chart?cht=tx&chl=`). - -Inline TeX ----------- +Links +----- -Inline TeX commands will be preserved and passed unchanged to the -LaTeX and ConTeXt writers. Thus, for example, you can use LaTeX to -include BibTeX citations: +Markdown allows links to be specified in several ways. - This result was proved in \cite{jones.1967}. +### Automatic links ### -Note that in LaTeX environments, like +If you enclose a URL or email address in pointy brackets, it +will become a link: - \begin{tabular}{|l|l|}\hline - Age & Frequency \\ \hline - 18--25 & 15 \\ - 26--35 & 33 \\ - 36--45 & 22 \\ \hline - \end{tabular} + <http://google.com> + <sam@green.eggs.ham> -the material between the begin and end tags will be interpreted as raw -LaTeX, not as markdown. -Inline LaTeX is ignored in output formats other than Markdown, LaTeX, -and ConTeXt. +### Inline links ### + +An inline link consists of the link text in square brackets, +followed by the URL in parentheses. (Optionally, the URL can +be followed by a link title, in quotes.) + + This is an [inline link](/url), and here's [one with + a title](http://fsf.org "click here for a good time!"). + +There can be no space between the bracketed part and the parenthesized part. +The link text can contain formatting (such as emphasis), but the title cannot. + + +### Reference links ### + +An *explicit* reference link has two parts, the link itself and the link +definition, which may occur elsewhere in the document (either +before or after the link). + +The link consists of link text in square brackets, followed by a label in +square brackets. (There can be space between the two.) The link definition +must begin at the left margin or indented no more than three spaces. It +consists of the bracketed label, followed by a colon and a space, followed by +the URL, and optionally (after a space) a link title either in quotes or in +parentheses. + +Here are some examples: + + [my label 1]: /foo/bar.html "My title, optional" + [my label 2]: /foo + [my label 3]: http://fsf.org (The free software foundation) + [my label 4]: /bar#special 'A title in single quotes' + +The URL may optionally be surrounded by angle brackets: + + [my label 5]: <http://foo.bar.baz> + +The title may go on the next line: + + [my label 3]: http://fsf.org + "The free software foundation" + +Note that link labels are not case sensitive. So, this will work: + + Here is [my link][FOO] + + [Foo]: /bar/baz + +In an *implicit* reference link, the second pair of brackets is +empty, or omitted entirely: + + See [my website][], or [my website]. + + [my website]: http://foo.bar.baz + + +Images +------ + +A link immediately preceded by a `!` will be treated as an image. 
+The link text will be used as the image's alt text: + +  + + ![movie reel] + + [movie reel]: movie.gif + +### Pictures with captions ### + +*Pandoc extension*. + +An image occurring by itself in a paragraph will be rendered as +a figure with a caption.[^5] (In LaTeX, a figure environment will be +used; in HTML, the image will be placed in a `div` with class +`figure`, together with a caption in a `p` with class `caption`.) +The image's alt text will be used as the caption. + +  + +[^5]: This feature is not yet implemented for RTF, OpenDocument, or + ODT. In those formats, you'll just get an image in a paragraph by + itself, with no caption. + +If you just want a regular inline image, just make sure it is not +the only thing in the paragraph. One way to do this is to insert a +nonbreaking space after the image: + + \ + + +Footnotes +--------- + +*Pandoc extension*. + +Pandoc's markdown allows footnotes, using the following syntax: + + Here is a footnote reference,[^1] and another.[^longnote] + + [^1]: Here is the footnote. + + [^longnote]: Here's one with multiple blocks. + + Subsequent paragraphs are indented to show that they + belong to the previous footnote. + + { some.code } + + The whole paragraph can be indented, or just the first + line. In this way, multi-paragraph footnotes work like + multi-paragraph list items. + + This paragraph won't be part of the note, because it + isn't indented. + +The identifiers in footnote references may not contain spaces, tabs, +or newlines. These identifiers are used only to correlate the +footnote reference with the note itself; in the output, footnotes +will be numbered sequentially. + +The footnotes themselves need not be placed at the end of the +document. They may appear anywhere except inside other block elements +(lists, block quotes, tables, etc.). + +Inline footnotes are also allowed (though, unlike regular notes, +they cannot contain multiple paragraphs). The syntax is as follows: + + Here is an inline note.^[Inlines notes are easier to write, since + you don't have to pick an identifier and move down to type the + note.] + +Inline and regular footnotes may be mixed freely. + + +Citations +--------- + +*Pandoc extension*. + +Pandoc can automatically generate citations and a bibliography in a number of +styles (using Andrea Rossato's `hs-citeproc`). In order to use this feature, +you will need a bibliographic database in one of the following formats: + + Format File extension + ------------ -------------- + MODS .mods + BibTeX .bib + BibLaTeX .bbx + RIS .ris + EndNote .enl + EndNote XML .xml + ISI .wos + MEDLINE .medline + Copac .copac + JSON citeproc .json + +You will need to specify the bibliography file using the `--bibliography` +command-line option (which may be repeated if you have several +bibliographies). + +By default, pandoc will use a Chicago author-date format for citations +and references. To use another style, you will need to use the +`--csl` option to specify a [CSL] 1.0 style file. A primer on +creating and modifying CSL styles can be found at +<http://citationstyles.org/downloads/primer.html>. +A repository of CSL styles can be found at +<https://github.com/citation-style-language/styles>. + +Citations go inside square brackets and are separated by semicolons. +Each citation must have a key, composed of '@' + the citation +identifier from the database, and may optionally have a prefix, +a locator, and a suffix. Here are some examples: + + Blah blah [see @doe99, pp. 33-35; also @smith04, ch. 1]. + + Blah blah [@doe99, pp. 
33-35, 38-39 and *passim*]. + + Blah blah [@smith04; @doe99]. + +A minus sign (`-`) before the `@` will suppress mention of +the author in the citation. This can be useful when the +author is already mentioned in the text: + + Smith says blah [-@smith04]. + +You can also write an in-text citation, as follows: + + @smith04 says blah. + + @smith04 [p. 33] says blah. + +If the style calls for a list of works cited, it will be placed +at the end of the document. Normally, you will want to end your +document with an appropriate header: + + last paragraph... + + # References + +The bibliography will be inserted after this header. + Producing HTML slide shows with Pandoc ====================================== @@ -1410,3 +1917,34 @@ ordinary HTML (without bird tracks). writes HTML with the Haskell code in bird tracks, so it can be copied and pasted as literate Haskell source. +Authors +======= + +© 2006-2011 John MacFarlane (jgm at berkeley dot edu). Released under the +[GPL], version 2 or greater. This software carries no warranty of +any kind. (See COPYRIGHT for full copyright and warranty notices.) +Other contributors include Recai Oktaş, Paulo Tanimoto, Peter Wang, +Andrea Rossato, Eric Kow, infinity0x, Luke Plant, shreevatsa.public, +Puneeth Chaganti, Paul Rivier, rodja.trappe, Bradley Kuhn, thsutton, +Nathan Gass, Jonathan Daugherty, Jérémy Bobbio, Justin Bogner. + +[markdown]: http://daringfireball.net/projects/markdown/ +[reStructuredText]: http://docutils.sourceforge.net/docs/ref/rst/introduction.html +[S5]: http://meyerweb.com/eric/tools/s5/ +[Slidy]: http://www.w3.org/Talks/Tools/Slidy/ +[HTML]: http://www.w3.org/TR/html40/ +[LaTeX]: http://www.latex-project.org/ +[ConTeXt]: http://www.pragma-ade.nl/ +[RTF]: http://en.wikipedia.org/wiki/Rich_Text_Format +[DocBook XML]: http://www.docbook.org/ +[OpenDocument XML]: http://opendocument.xml.org/ +[ODT]: http://en.wikipedia.org/wiki/OpenDocument +[Textile]: http://redcloth.org/textile +[MediaWiki markup]: http://www.mediawiki.org/wiki/Help:Formatting +[groff man]: http://developer.apple.com/DOCUMENTATION/Darwin/Reference/ManPages/man7/groff_man.7.html +[Haskell]: http://www.haskell.org/ +[GNU Texinfo]: http://www.gnu.org/software/texinfo/ +[Emacs Org-Mode]: http://org-mode.org +[EPUB]: http://www.idpf.org/ +[GPL]: http://www.gnu.org/copyleft/gpl.html "GNU General Public License" + @@ -10,10 +10,9 @@ import Distribution.Simple.InstallDirs (mandir, bindir, CopyDest (NoCopyDest)) import Distribution.Simple.Utils (copyFiles) import Control.Exception ( bracket_ ) import Control.Monad ( unless ) -import System.Process ( runCommand, runProcess, waitForProcess ) -import System.FilePath ( (</>), (<.>) ) +import System.Process ( rawSystem, runCommand, waitForProcess ) +import System.FilePath ( (</>) ) import System.Directory -import System.IO ( stderr ) import System.Exit import System.Time import System.IO.Error ( isDoesNotExistError ) @@ -38,41 +37,41 @@ main = do -- | Run test suite. runTestSuite :: Args -> Bool -> PackageDescription -> LocalBuildInfo -> IO a -runTestSuite _ _ pkg _ = do - let isHighlightingKate (Dependency (PackageName "highlighting-kate") _) = True - isHighlightingKate _ = False - let highlightingSupport = any isHighlightingKate $ buildDepends pkg - let testArgs = ["lhs" | highlightingSupport] - let testCmd = "runhaskell -i.. 
RunTests.hs " ++ unwords testArgs - inDirectory "tests" $ runCommand testCmd >>= waitForProcess >>= exitWith +runTestSuite args _ pkg lbi = do + let testDir = buildDir lbi </> "test-pandoc" + testDir' <- canonicalizePath testDir + let testArgs = concatMap (\arg -> ["-t",arg]) args + if any id [buildable (buildInfo exe) | exe <- executables pkg, exeName exe == "test-pandoc"] + then inDirectory "tests" $ rawSystem (testDir' </> "test-pandoc") testArgs >>= exitWith + else do + putStrLn "Build pandoc with the 'tests' flag to run tests" + exitWith $ ExitFailure 3 --- | Build man pages from markdown sources in man/man1/. +-- | Build man pages from markdown sources in man/ makeManPages :: Args -> BuildFlags -> PackageDescription -> LocalBuildInfo -> IO () -makeManPages _ flags _ buildInfo = - mapM_ (makeManPage pandocPath (fromFlag $ buildVerbosity flags)) manpages - where pandocPath = (buildDir buildInfo) </> "pandoc" </> "pandoc" +makeManPages _ flags _ _ = do + let verbosity = fromFlag $ buildVerbosity flags + ds1 <- modifiedDependencies (manDir </> "man1" </> "pandoc.1") + ["README", manDir </> "man1" </> "pandoc.1.template"] + ds2 <- modifiedDependencies (manDir </> "man1" </> "markdown2pdf.1") + [manDir </> "man1" </> "markdown2pdf.1.md"] + ds3 <- modifiedDependencies (manDir </> "man5" </> "pandoc_markdown.5") + ["README", manDir </> "man5" </> "pandoc_markdown.5.template"] + let cmd = "runghc -package-conf=dist/package.conf.inplace MakeManPage.hs" + let cmd' = if verbosity == silent + then cmd + else cmd ++ " --verbose" + -- Don't run MakeManPage.hs unless we have to + unless (null ds1 && null ds2 && null ds3) $ + runCommand cmd' >>= waitForProcess >>= exitWith manpages :: [FilePath] -manpages = ["pandoc.1", "markdown2pdf.1"] +manpages = ["man1" </> "pandoc.1" + ,"man1" </> "markdown2pdf.1" + ,"man5" </> "pandoc_markdown.5"] manDir :: FilePath -manDir = "man" </> "man1" - --- | Build a man page from markdown source in man/man1. -makeManPage :: FilePath -> Verbosity -> FilePath -> IO () -makeManPage pandoc verbosity manpage = do - let page = manDir </> manpage - let source = page <.> "md" - modifiedDeps <- modifiedDependencies page [source] - unless (null modifiedDeps) $ do - ec <- runProcess pandoc ["-s", "-S", "-r", "markdown", "-w", "man", - "--template=templates/man.template", "-o", page, source] - Nothing Nothing Nothing Nothing (Just stderr) >>= waitForProcess - case ec of - ExitSuccess -> unless (verbosity == silent) $ - putStrLn $ "Created " ++ page - ExitFailure n -> putStrLn ("Error creating " ++ page ++ - ". Exit code = " ++ show n) >> exitWith ec +manDir = "man" installScripts :: PackageDescription -> LocalBuildInfo -> Verbosity -> CopyDest -> IO () @@ -86,7 +85,7 @@ installScripts pkg lbi verbosity copy = installManpages :: PackageDescription -> LocalBuildInfo -> Verbosity -> CopyDest -> IO () installManpages pkg lbi verbosity copy = - copyFiles verbosity (mandir (absoluteInstallDirs pkg lbi copy) </> "man1") + copyFiles verbosity (mandir (absoluteInstallDirs pkg lbi copy)) (zip (repeat manDir) manpages) -- | Returns a list of 'dependencies' that have been modified after 'file'. @@ -1,3 +1,412 @@ +pando (1.8.0.1) + + * Revised Interact.hs so that it works with the CPP macros + in the UTF8 module. + + * Revised Setup.hs so that we don't call MakeManPage.hs unless + the man pages are out of date. + +pandoc (1.8) + + [new features] + + * Support for citations using Andrea Rossato's `citeproc-hs` 0.3. + You can now write, for example, + + Water is wet [see @doe99, pp. 
33-35; also @smith04, ch. 1]. + + and, when you process your document using `pandoc`, specifying + a citation style using `--csl` and a bibliography using `--bibliography`, + the citation will be replaced by an appropriately formatted + citation, and a list of works cited will be added to the end + of the document. + + This means that you can switch effortlessly between different citation + and bibliography styles, including footnote, numerical, and author-date + formats. The bibliography can be in any of the following formats: MODS, + BibTeX, BibLaTeX, RIS, EndNote, EndNote XML, ISI, MEDLINE, Copac, or JSON. + See the README for further details. + + Citations are supported in the markdown reader, using a special + syntax, and in the LaTeX reader, using natbib or biblatex syntax. + (Thanks to Nathan Gass for the natbib and biblatex support.) + + * New `textile` reader and writer. Thanks to Paul Rivier for contributing + the `textile` reader, an almost complete implementation of the textile + syntax used by the ruby [RedCloth library](http://redcloth.org/textile). + Resolves Issue #51. + + * New `org` writer, for Emacs Org-mode, contributed by Puneeth Chaganti. + + * New `json` reader and writer, for reading and writing a JSON + representation of the native Pandoc AST. These are much faster + than the `native` reader and writer, and should be used for + serializing Pandoc to text. To convert between the JSON representation + and native Pandoc, use `encodeJSON` and `decodeJSON` from + `Text.JSON.Generic`. + + * A new `jsonFilter` function in `Text.Pandoc` makes it easy + to write scripts that transform a JSON-encoded pandoc document. + For example: + + -- removelinks.hs - removes links from document + import Text.Pandoc + main = interact $ jsonFilter $ bottomUp removeLink + where removeLink (Link xs _) = Emph xs + removeLink x = x + + To use this to remove links while translating markdown to LaTeX: + + pandoc -t json | runghc removelinks.hs | pandoc -f json -t latex + + * Attributes are now allowed in inline `Code` elements, for example: + + In this code, `ulist ! [theclass "special"] << elts`{.haskell} is... + + The attribute syntax is the same as for delimited code blocks. + `Code` inline has an extra argument place for attributes, just like + `CodeBlock`. Inline code will be highlighted in HTML output, if pandoc + is compiled with highlighting support. Resolves Issue #119. + + * New `RawBlock` and `RawInline` elements (replacing `RawHtml`, + `HtmlInline`, and `TeX`) provide lots of flexibility in writing + scripts to transform Pandoc documents. Scripts can now change + how each element is rendered in each output format. + + * You can now define LaTeX macros in markdown documents, and pandoc + will apply them to TeX math. For example, + + \newcommand{\plus}[2]{#1 + #2} + $\plus{3}{4}$ + + yields `3+4`. Since the macros are applied in the reader, they + will work in every output format, not just LaTeX. + + * LaTeX macros can also be used in LaTeX documents (both in math + and in non-math contexts). + + * A new `--mathjax` option has been added for displaying + math in HTML using MathJax. Resolves issue #259. + + * Footnotes are now supported in the RST reader. (Note, however, + that unlike docutils, pandoc ignores the numeral or symbol used in + the note; footnotes are put in an auto-numbered ordered list.) + Resolves Issue #258. + + * A new `--normalize` option causes pandoc to normalize the AST + before writing the document. 
This means that, for example, + `*hi**there*` will be rendered as `<em>hithere</em>` + instead of `<em>hi</em><em>there</em>`. This is not the default, + because there is a significant performance penalty. + + * A new `--chapters` command-line option causes headers + in DocBook, LaTeX, and ConTeXt to start with "chapter" (level one). + Resolves Issue #265. + + * In DocBook output, `<chapter>` is now used for top-level + headers if the template contains `<book>`. Resolves Issue #265. + + * A new `--listings` option in `pandoc` and `markdown2pdf` causes + the LaTeX writer to use the listings package for code blocks. + (Thanks to Josef Svennigsson for the pandoc patch, and Etienne + Millon for the markdown2pdf patch.) + + * `markdown2pdf` now supports `--data-dir`. + + * URLs in autolinks now have class "url" so they can be styled. + + * Improved prettyprinting in most formats. Lines will be wrapped + more evenly and duplicate blank lines avoided. + + * New `--columns` command-line option sets the column width for + line wrapping and relative width calculations for tables. + + * Made `--smart` work in HTML, RST, and Textile readers, as well + as markdown. + + * Added `--html5` option for HTML5 output. + + * Added support for listings package in LaTeX reader + (Puneeth Chaganti). + + * Added support for simple tables in the LaTeX reader. + + * Added support for simple tables in the HTML reader. + + * Significant performance improvements in many readers and writers. + + [API and program changes] + + * Moved `Text.Pandoc.Definition` from the `pandoc` package to a new + auxiliary package, `pandoc-types`. This will make it possible for other + programs to supply output in Pandoc format, without depending on the whole + pandoc package. + + * Added `Attr` field to `Code`. + + * Removed `RawHtml`, `HtmlInline`, and `TeX` elements; added generic + `RawBlock` and `RawInline`. + + * Moved generic functions to `Text.Pandoc.Generic`. Deprecated + `processWith`, replacing it with two functions, `bottomUp` and `topDown`. + Removed previously deprecated functions `processPandoc` and `queryPandoc`. + + * Added `Text.Pandoc.Builder`, for building `Pandoc` structures. + + * `Text.Pandoc` now exports association lists `readers` and `writers`. + + * Added `Text.Pandoc.Readers.Native`, which exports `readNative`. + `readNative` can now read full pandoc documents, block lists, blocks, + inline lists, or inlines. It will interpret `Str "hi"` + as if it were `Pandoc (Meta [] [] []) [Plain [Str "hi"]]`. + This should make testing easier. + + * Removed deprecated `-C/--custom-header` option. + Use `--template` instead. + + * `--biblio-file` has been replaced by `--bibliography`. + `--biblio-format` has been removed; pandoc now guesses the format + from the file extension (see README). + + * pandoc will treat an argument as a URI only if it has an + `http(s)` scheme. Previously pandoc would treat some + Windows pathnames beginning with `C:/` as URIs. + + * The `--sanitize-html` option and the `stateSanitize` field in + `ParserState` have been removed. Sanitization is better done in the + resulting HTML using `xss-sanitize`, which is based on pandoc's + sanitization, but improved. + + * pandoc now adds a newline to the end of its output in fragment + mode (= not `--standalone`). + + * Added support for `lang` in `html` tag in the HTML template, + so you can do `pandoc -s -V lang=es`, for example. 
+ + * `highlightHtml` in `Text.Pandoc.Highlighting` now takes + a boolean argument that selects between "inline" and + "block" HTML. + + * `Text.Pandoc.Writers.RTF` now exports `rtfEmbedImage`. + Images are embedded in RTF output when possible (png, jpeg). + Resolves Issue #275. + + * Added `Text.Pandoc.Pretty`. This is better suited for pandoc than the + `pretty` package. Changed all writers that used + `Text.PrettyPrint.HughesPJ` to use `Text.Pandoc.Pretty` instead. + + * Rewrote `writeNative` using the new prettyprinting module. It is + now much faster. The output has been made more consistent and compressed. + `writeNative` is also now sensitive to writerStandalone`, and will simply + `print a block list if writerStandalone` is False. + + * Removed `Text.Pandoc.Blocks`. `Text.Pandoc.Pretty` allows you to define + blocks and concatenate them, so a separate module is no longer needed. + + * `Text.Pandoc.Shared`: + + + Added `writerColumns`, `writerChapters`, and `writerHtml5` to + `WriterOptions`. + + Added `normalize`. + + Removed unneeded prettyprinting functions: + `wrapped`, `wrapIfNeeded`, `wrappedTeX`, `wrapTeXIfNeeded`, `hang'`, + `BlockWrapper`, `wrappedBlocksToDoc`. + + Made `splitBy` take a test instead of an element. + + Added `findDataFile`, refactored `readDataFile`. + + Added `stringify`. Rewrote `inlineListToIdentifier` using `stringify`. + + Fixed `inlineListToIdentifier` to treat '\160' as ' '. + + * `Text.Pandoc.Readers.HTML`: + + + Removed `rawHtmlBlock`, `anyHtmlBlockTag`, `anyHtmlInlineTag`, + `anyHtmlTag`, `anyHtmlEndTag`, `htmlEndTag`, `extractTagType`, + `htmlBlockElement`, `htmlComment` + + Added `htmlTag`, `htmlInBalanced`, `isInlineTag`, `isBlockTag`, + `isTextTag` + + * Moved `smartPunctuation` from `Text.Pandoc.Readers.Markdown` + to `Text.Pandoc.Readers.Parsing`, and parameterized it with + an inline parser. + + * Ellipses are no longer allowed to contain spaces. + Previously we allowed '. . .', ' . . . ', etc. This caused + too many complications, and removed author's flexibility in + combining ellipses with spaces and periods. + + * Allow linebreaks in URLs (treat as spaces). Also, a string of + consecutive spaces or tabs is now parsed as a single space. If you have + multiple spaces in your URL, use `%20%20`. + + * `Text.Pandoc.Parsing`: + + + Removed `refsMatch`. + + Hid `Key` constructor. + + Removed custom `Ord` and `Eq` instances for `Key`. + + Added `toKey` and `fromKey` to convert between `Key` and `[Inline]`. + + Generalized type on `readWith`. + + * Small change in calculation of relative widths of table columns. + If the size of the header > the specified column width, use + the header size as 100% for purposes of calculating + relative widths of columns. + + * Markdown writer now uses some pandoc-specific features when `--strict` + is not specified: `\` newline is used for a hard linebreak instead of + two spaces then a newline. And delimited code blocks are used when + there are attributes. + + * HTML writer: improved gladTeX output by setting ENV appropriately + for display or inline math (Jonathan Daugherty). + + * LaTeX writer: Use `\paragraph`, `\subparagraph` for level 4,5 headers. + + * LaTeX reader: + + + `\label{foo}` and `\ref{foo}` now become `{foo}` instead of `(foo)`. + + `\index{}` commands are skipped. + + * Added `fontsize` variable to default LaTeX template. + This makes it easy to set the font size using `markdown2pdf`: + `markdown2pdf -V fontsize=12pt input.txt`. 
+ + * Fixed problem with strikeout in LaTeX headers when using + hyperref, by adding a command to the default LaTeX template + that disables `\sout` inside pdf strings. Thanks to Joost Kremers + for the fix. + + * The `COLUMNS` environment variable no longer has any effect. + + [under-the-hood improvements] + + * Pandoc now compiles with GHC 7. (This alone leads to a + significant performance improvement, 15-20%.) + + * Completely rewrote HTML reader using tagsoup as a lexer. The + new reader is faster and more accurate. Unlike the + old reader, it does not get bogged down on some input + (Issues #277, 255). And it handles namespaces in tags + (Issue #274). + + * Replaced `escapeStringAsXML` with a faster version. + + * Rewrote `spaceChar` and some other parsers in Text.Pandoc.Parsing + for a significant performance boost. + + * Improved performance of all readers by rewriting parsers. + + * Simplified Text.Pandoc.CharacterReferences by using + entity lookup functions from TagSoup. + + * `Text.Pandoc.UTF8` now uses the unicode-aware IO functions + from `System.IO` if base >= 4.2. This gives support for + windows line endings on windows. + + * Remove duplications in documentation by generating the + pandoc man page from README, using `MakeManPage.hs`. + + * README now includes a full description of markdown syntax, + including non-pandoc-specific parts. A new `pandoc_markdown` + man page is extracted from this, so you can look up markdown + syntax by doing `man pandoc_markdown`. + + * Completely revised test framework (with help from Nathan Gass). + The new test framework is built when the `tests` Cabal flag is set. It + includes the old integration tests, but also some new unit and quickcheck + tests. Test output has been much improved, and you can now specify a glob + pattern after `cabal test` to indicate which tests should be run; + for example `cabal test citations` will run all the citation tests. + + * Added a shell script, `stripansi.sh`, for filtering ANSI control + sequences from test output: `cabal test | ./stripansi.sh > test.log`. + + * Added `Interact.hs` to make it easier to use ghci while developing. + `Interact.hs` loads `ghci` from the `src` directory, specifying + all the options needed to load pandoc modules (including + specific package dependencies, which it gets by parsing + dist/setup-config). + + * Added `Benchmark.hs`, testing all readers + writers using criterion. + + * Added `stats.sh`, to make it easier to collect and archive + benchmark and lines-of-code stats. + + * Added upper bounds to all cabal dependencies. + + * Include man pages in extra-source-files. This allows users to + install pandoc from the tarball without needing to build the man + pages. + + [bug fixes] + + * Filenames are encoded as UTF8. Resolves Issue #252. + + * Handle curly quotes better in `--smart` mode. Previously, curly quotes + were just parsed literally, leading to problems in some output formats. + Now they are parsed as `Quoted` inlines, if `--smart` is specified. + Resolves Issue #270. + + * Text.Pandoc.Parsing: Fixed bug in grid table parser. + Spaces at end of line were not being stripped properly, + resulting in unintended LineBreaks. + + * Markdown reader: + + + Allow HTML comments as inline elements in markdown. + So, `aaa <!-- comment --> bbb` can be a single paragraph. + + Fixed superscripts with links: `^[link](/foo)^` gets + recognized as a superscripted link, not an inline note followed by + garbage. 
+ + Fixed regression, making markdown reference keys case-insensitive again. + Resolves Issue #272. + + Properly handle abbreviations (like `Mr.`) at the end of a line. + + Better handling of intraword underscores, avoiding exponential + slowdowns in some cases. Resolves Issue #182. + + Fixed bug in alignments in tables with blank rows in the header. + + * RST reader: + + + Field lists now allow spaces in field names, and + block content in field values. (Thanks to Lachlan Musicman + for pointing out the bug.) + + Definition list items are now always `Para` instead of + `Plain`, matching behavior of `rst2xml.py`. + + In image blocks, the description is parsed properly and + used for the alt attribute, not also the title. + + Skip blank lines at beginning of file. Resolves + Debian #611328. + + * LaTeX reader: + + + Improved parsing of preamble. + Previously you'd get unexpected behavior on a document that + contained `\begin{document}` in, say, a verbatim block. + + Allow spaces between `\begin` or `\end` and `{`. + + Support `\L` and `\l`. + + Skip comments inside paragraphs. + + * LaTeX writer: + + + Escape strings in `\href{..}`. + + In nonsimple tables, put cells in `\parbox`. + + * OpenDocument writer: don't print raw TeX. + + * Markdown writer: + + + Fixed bug in `Image`. URI was getting unescaped twice! + + Avoid printing extra blank lines at the end if there are + no notes or references. + + * LaTeX and ConTeXt: Escape `[` and `]` as `{[}` and `{]}`. + This avoids unwanted interpretation as an optional argument. + + * ConTeXt writer: Fixed problem with inline code. Previously + `}` would be rendered `\type{}}`. Now we check the string for '}' and '{'. + If it contains neither, use `\type{}`; otherwise use `\mono{}` + with an escaped version of the string. + + * `:` now allowed in HTML tags. Resolves Issue #274. + pandoc (1.6) [ John MacFarlane ] @@ -408,7 +817,8 @@ pandoc (1.5) + Removed stLink, link template variable. Reason: we now always include hyperref in the template. - * Latex template: + * LaTeX template: + + Only show \author if there are some. + Always include hyperref package. It is used not just for links but for toc, section heading bookmarks, footnotes, etc. Also added diff --git a/default.csl b/default.csl new file mode 100644 index 000000000..f16f82305 --- /dev/null +++ b/default.csl @@ -0,0 +1,369 @@ +<?xml version="1.0" encoding="utf-8"?> +<style xmlns="http://purl.org/net/xbiblio/csl" class="in-text" version="1.0" page-range-format="chicago"> + <info> + <title>Chicago Manual of Style (Author-Date format)</title> + <id>http://www.zotero.org/styles/chicago-author-date</id> + <link href="http://www.zotero.org/styles/chicago-author-date" rel="self"/> + <author> + <name>Julian Onions</name> + <email>julian.onions@gmail.com</email> + </author> + <category citation-format="author-date"/> + <category field="generic-base"/> + <updated>2009-12-04T20:22:16+00:00</updated> + <summary>The author-date variant of the Chicago style</summary> + <link href="http://www.chicagomanualofstyle.org/tools_citationguide.html" rel="documentation"/> + </info> + <macro name="secondary-contributors"> + <choose> + <if match="none" type="chapter"> + <group delimiter=". "> + <choose> + <if variable="author"> + <names variable="editor"> + <label form="verb-short" prefix=" " suffix=". 
" text-case="capitalize-first" /> + <name and="text" delimiter=", " /> + </names> + </if> + </choose> + <choose> + <if match="any" variable="author editor"> + <names variable="translator"> + <label form="verb-short" prefix=" " suffix=". " text-case="capitalize-first" /> + <name and="text" delimiter=", " /> + </names> + </if> + </choose> + </group> + </if> + </choose> + </macro> + <macro name="container-contributors"> + <choose> + <if type="chapter"> + <group delimiter=", " prefix=","> + <choose> + <if variable="author"> + <names variable="editor"> + <label form="verb-short" prefix=" " suffix=". " text-case="lowercase" /> + <name and="text" delimiter=", " /> + </names> + </if> + </choose> + <choose> + <if match="any" variable="author editor"> + <names variable="translator"> + <label form="verb-short" prefix=" " suffix=". " text-case="lowercase" /> + <name and="text" delimiter=", " /> + </names> + </if> + </choose> + </group> + </if> + </choose> + </macro> + <macro name="anon"> + <choose> + <if match="none" variable="author editor translator"> + <text form="short" term="anonymous" text-case="capitalize-first" /> + </if> + </choose> + </macro> + <macro name="editor"> + <names variable="editor"> + <name and="text" delimiter=", " delimiter-precedes-last="always" name-as-sort-order="first" sort-separator=", " /> + <label form="short" prefix=", " suffix="." /> + </names> + </macro> + <macro name="translator"> + <names variable="translator"> + <name and="text" delimiter=", " delimiter-precedes-last="always" name-as-sort-order="first" sort-separator=", " /> + <label form="verb-short" prefix=", " suffix="." /> + </names> + </macro> + <macro name="recipient"> + <choose> + <if type="personal_communication"> + <choose> + <if variable="genre"> + <text text-case="capitalize-first" variable="genre" /> + </if> + <else> + <text term="letter" text-case="capitalize-first" /> + </else> + </choose> + </if> + </choose> + <names delimiter=", " variable="recipient"> + <label form="verb" prefix=" " suffix=" " text-case="lowercase" /> + <name and="text" delimiter=", " /> + </names> + </macro> + <macro name="contributors"> + <names variable="author"> + <name and="text" delimiter=", " delimiter-precedes-last="always" name-as-sort-order="first" sort-separator=", " /> + <label form="verb-short" prefix=", " suffix="." text-case="lowercase" /> + <substitute> + <text macro="editor" /> + <text macro="translator" /> + </substitute> + </names> + <text macro="anon" /> + <text macro="recipient" /> + </macro> + <macro name="contributors-short"> + <names variable="author"> + <name and="text" delimiter=", " form="short" /> + <substitute> + <names variable="editor" /> + <names variable="translator" /> + </substitute> + </names> + <text macro="anon" /> + </macro> + <macro name="interviewer"> + <names delimiter=", " variable="interviewer"> + <label form="verb" prefix=" " suffix=" " text-case="capitalize-first" /> + <name and="text" delimiter=", " /> + </names> + </macro> + <macro name="archive"> + <group delimiter=". "> + <text text-case="capitalize-first" variable="archive_location" /> + <text variable="archive" /> + <text variable="archive-place" /> + </group> + </macro> + <macro name="access"> + <group delimiter=". 
"> + <choose> + <if match="any" type="graphic report"> + <text macro="archive" /> + </if> + <else-if match="none" type="book thesis chapter article-journal article-newspaper article-magazine"> + <text macro="archive" /> + </else-if> + </choose> + <text prefix="doi:" variable="DOI" /> + <text variable="URL" /> + </group> + </macro> + <macro name="title"> + <choose> + <if match="none" variable="title"> + <choose> + <if match="none" type="personal_communication"> + <text text-case="capitalize-first" variable="genre" /> + </if> + </choose> + </if> + <else-if type="book"> + <text font-style="italic" variable="title" /> + </else-if> + <else> + <text variable="title" /> + </else> + </choose> + </macro> + <macro name="edition"> + <choose> + <if match="any" type="book chapter"> + <choose> + <if is-numeric="edition"> + <group delimiter=" "> + <number form="ordinal" variable="edition" /> + <text form="short" suffix="." term="edition" /> + </group> + </if> + <else> + <text suffix="." variable="edition" /> + </else> + </choose> + </if> + </choose> + </macro> + <macro name="locators"> + <choose> + <if type="article-journal"> + <text prefix=" " variable="volume" /> + <text prefix=", no. " variable="issue" /> + </if> + <else-if type="book"> + <group delimiter=". " prefix=". "> + <group> + <text form="short" suffix=". " term="volume" text-case="capitalize-first" /> + <number form="numeric" variable="volume" /> + </group> + <group> + <number form="numeric" variable="number-of-volumes" /> + <text form="short" plural="true" prefix=" " suffix="." term="volume" /> + </group> + </group> + </else-if> + </choose> + </macro> + <macro name="locators-chapter"> + <choose> + <if type="chapter"> + <group prefix=", "> + <text suffix=":" variable="volume" /> + <text variable="page" /> + </group> + </if> + </choose> + </macro> + <macro name="locators-article"> + <choose> + <if type="article-newspaper"> + <group delimiter=", " prefix=", "> + <group> + <text suffix=" " variable="edition" /> + <text prefix=" " term="edition" /> + </group> + <group> + <text form="short" suffix=". " term="section" /> + <text variable="section" /> + </group> + </group> + </if> + <else-if type="article-journal"> + <text prefix=": " variable="page" /> + </else-if> + </choose> + </macro> + <macro name="point-locators"> + <group> + <choose> + <if locator="page" match="none"> + <label form="short" suffix=" " variable="locator" /> + </if> + </choose> + <text variable="locator" /> + </group> + </macro> + <macro name="container-prefix"> + <text term="in" text-case="capitalize-first" /> + </macro> + <macro name="container-title"> + <choose> + <if type="chapter"> + <text macro="container-prefix" suffix=" " /> + </if> + </choose> + <text font-style="italic" variable="container-title" /> + </macro> + <macro name="publisher"> + <group delimiter=": "> + <text variable="publisher-place" /> + <text variable="publisher" /> + </group> + </macro> + <macro name="date"> + <date variable="issued"> + <date-part name="year" /> + </date> + </macro> + <macro name="day-month"> + <date variable="issued"> + <date-part name="month" /> + <date-part name="day" prefix=" " /> + </date> + </macro> + <macro name="collection-title"> + <text variable="collection-title" /> + <text prefix=" " variable="collection-number" /> + </macro> + <macro name="event"> + <group> + <text suffix=" " term="presented at" /> + <text variable="event" /> + </group> + </macro> + <macro name="description"> + <group delimiter=". 
"> + <text macro="interviewer" /> + <text text-case="capitalize-first" variable="medium" /> + </group> + <choose> + <if match="none" variable="title"> </if> + <else-if type="thesis"> </else-if> + <else> + <text prefix=". " text-case="capitalize-first" variable="genre" /> + </else> + </choose> + </macro> + <macro name="issue"> + <choose> + <if type="article-journal"> + <text macro="day-month" prefix=" (" suffix=")" /> + </if> + <else-if type="speech"> + <group delimiter=", " prefix=" "> + <text macro="event" /> + <text macro="day-month" /> + <text variable="event-place" /> + </group> + </else-if> + <else-if match="any" type="article-newspaper article-magazine"> + <text macro="day-month" prefix=", " /> + </else-if> + <else> + <group delimiter=", " prefix=". "> + <choose> + <if type="thesis"> + <text text-case="capitalize-first" variable="genre" /> + </if> + </choose> + <text macro="publisher" /> + <text macro="day-month" /> + </group> + </else> + </choose> + </macro> + <citation + disambiguate-add-givenname="true" + disambiguate-add-names="true" + disambiguate-add-year-suffix="true" + et-al-min="4" + et-al-subsequent-min="4" + et-al-subsequent-use-first="1" + et-al-use-first="1"> + <layout delimiter="; " prefix="(" suffix=")"> + <group delimiter=", "> + <group delimiter=" "> + <text macro="contributors-short" /> + <text macro="date" /> + </group> + <text macro="point-locators" /> + </group> + </layout> + </citation> + <bibliography + entry-spacing="0" + et-al-min="11" + et-al-use-first="7" + hanging-indent="true" + subsequent-author-substitute="---"> + <sort> + <key macro="contributors" /> + <key variable="issued" /> + <key variable="title" /> + </sort> + <layout suffix="."> + <text macro="contributors" suffix=". " /> + <text macro="date" suffix=". " /> + <text macro="title" /> + <text macro="description" /> + <text macro="secondary-contributors" prefix=". " /> + <text macro="container-title" prefix=". " /> + <text macro="container-contributors" /> + <text macro="locators-chapter" /> + <text macro="edition" prefix=". " /> + <text macro="locators" /> + <text macro="collection-title" prefix=". " /> + <text macro="issue" /> + <text macro="locators-article" /> + <text macro="access" prefix=". " /> + </layout> + </bibliography> +</style> diff --git a/man/man1/markdown2pdf.1 b/man/man1/markdown2pdf.1 new file mode 100644 index 000000000..11c0e7ce7 --- /dev/null +++ b/man/man1/markdown2pdf.1 @@ -0,0 +1,165 @@ +.TH MARKDOWN2PDF 1 "January 29, 2011" "Pandoc User Manuals" +.SH NAME +.PP +markdown2pdf - converts markdown-formatted text to PDF, using pdflatex +.SH SYNOPSIS +.PP +markdown2pdf [\f[I]options\f[]] [\f[I]input-file\f[]]... +.SH DESCRIPTION +.PP +\f[C]markdown2pdf\f[] converts \f[I]input-file\f[] (or text from +standard input) from markdown-formatted plain text to PDF, using +\f[C]pandoc\f[] and \f[C]pdflatex\f[]. +If no output filename is specified (using the \f[C]-o\f[] option), the +name of the output file is derived from the input file; thus, for +example, if the input file is \f[I]hello.txt\f[], the output file will +be \f[I]hello.pdf\f[]. +If the input is read from STDIN and no output filename is specified, the +output file will be named \f[I]stdin.pdf\f[]. +If multiple input files are specified, they will be concatenated before +conversion, and the name of the output file will be derived from the +first input file. +.PP +Input is assumed to be in the UTF-8 character encoding. 
+If your local character encoding is not UTF-8, you should pipe input +through \f[C]iconv\f[]: +.IP +.nf +\f[C] +iconv\ -t\ utf-8\ input.txt\ |\ markdown2pdf +\f[] +.fi +.PP +\f[C]markdown2pdf\f[] assumes that the \f[C]unicode\f[], \f[C]array\f[], +\f[C]fancyvrb\f[], \f[C]graphicx\f[], and \f[C]ulem\f[] packages are in +latex\[aq]s search path. +If these packages are not included in your latex setup, they can be +obtained from \f[C]http://ctan.org\f[]. +.SH OPTIONS +.TP +.B -o \f[I]FILE\f[], --output=\f[I]FILE\f[] +Write output to \f[I]FILE\f[]. +.RS +.RE +.TP +.B --strict +Use strict markdown syntax, with no extensions or variants. +.RS +.RE +.TP +.B --xetex +Use xelatex instead of pdflatex to create the PDF. +.RS +.RE +.TP +.B -N, --number-sections +Number section headings in LaTeX output. +(Default is not to number them.) +.RS +.RE +.TP +.B --listings +Use listings package for LaTeX code blocks +.RS +.RE +.TP +.B --template=\f[I]FILE\f[] +Use \f[I]FILE\f[] as a custom template for the generated document. +Implies \f[C]-s\f[]. +See the section TEMPLATES in \f[C]pandoc\f[](1) for information about +template syntax. +Use \f[C]pandoc\ -D\ latex\f[] to print the default LaTeX template. +.RS +.RE +.TP +.B -V KEY=VAL, --variable=\f[I]KEY:VAL\f[] +Set the template variable KEY to the value VAL when rendering the +document in standalone mode. +Use this to set the font size when using the default LaTeX template: +\f[C]-V\ fontsize=12pt\f[]. +.RS +.RE +.TP +.B -H \f[I]FILE\f[], --include-in-header=\f[I]FILE\f[] +Include (LaTeX) contents of \f[I]FILE\f[] at the end of the header. +Implies \f[C]-s\f[]. +.RS +.RE +.TP +.B -B \f[I]FILE\f[], --include-before-body=\f[I]FILE\f[] +Include (LaTeX) contents of \f[I]FILE\f[] at the beginning of the +document body. +.RS +.RE +.TP +.B -A \f[I]FILE\f[], --include-after-body=\f[I]FILE\f[] +Include (LaTeX) contents of \f[I]FILE\f[] at the end of the document +body. +.RS +.RE +.TP +.B --bibliography=\f[I]FILE\f[] +Specify bibliography database to be used in resolving citations. +The database type will be determined from the extension of +\f[I]FILE\f[], which may be \f[C].xml\f[] (MODS format), \f[C].bib\f[] +(BibTeX format), or \f[C].json\f[] (citeproc JSON). +.RS +.RE +.TP +.B --csl=\f[I]FILE\f[] +Specify CSL style to be used in formatting citations and the +bibliography. +If \f[I]FILE\f[] is not found, pandoc will look for it in +.RS +.IP +.nf +\f[C] +$HOME/.csl +\f[] +.fi +.PP +in unix and +.IP +.nf +\f[C] +C:\\Documents\ And\ Settings\\USERNAME\\Application\ Data\\csl +\f[] +.fi +.PP +in Windows. +If the \f[C]--csl\f[] option is not specified, pandoc will use a default +style: either \f[C]default.csl\f[] in the user data directory (see +\f[C]--data-dir\f[]), or, if that is not present, the Chicago +author-date style. +.RE +.TP +.B --data-dir\f[I]=DIRECTORY\f[] +Specify the user data directory to search for pandoc data files. +If this option is not specified, the default user data directory will be +used: +.RS +.IP +.nf +\f[C] +$HOME/.pandoc +\f[] +.fi +.PP +in unix and +.IP +.nf +\f[C] +C:\\Documents\ And\ Settings\\USERNAME\\Application\ Data\\pandoc +\f[] +.fi +.PP +in Windows. +A \f[C]reference.odt\f[], \f[C]epub.css\f[], \f[C]templates\f[] +directory, or \f[C]s5\f[] directory placed in this directory will +override pandoc\[aq]s normal defaults. +.RE +.SH SEE ALSO +.PP +\f[C]pandoc\f[](1), \f[C]pdflatex\f[](1) +.SH AUTHORS +John MacFarlane, Paulo Tanimoto, and Recai Oktas. 
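
As a quick illustration of how the options documented above combine (the
file names `paper.txt`, `refs.bib`, and `chicago-author-date.csl` are only
placeholders for a markdown source, a bibliography database, and a CSL
style of the kinds described under `--bibliography` and `--csl`), a PDF
with styled citations and a 12pt base font could be produced with:

    # markdown source -> PDF, resolving citations against refs.bib
    markdown2pdf --bibliography=refs.bib --csl=chicago-author-date.csl \
        -V fontsize=12pt -o paper.pdf paper.txt
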
diff --git a/man/man1/markdown2pdf.1.md b/man/man1/markdown2pdf.1.md index 3947ef8da..efbdc8184 100644 --- a/man/man1/markdown2pdf.1.md +++ b/man/man1/markdown2pdf.1.md @@ -1,6 +1,6 @@ % MARKDOWN2PDF(1) Pandoc User Manuals -% John MacFarlane and Recai Oktas -% January 8, 2008 +% John MacFarlane, Paulo Tanimoto, and Recai Oktas +% January 29, 2011 # NAME @@ -48,6 +48,9 @@ packages are not included in your latex setup, they can be obtained from -N, \--number-sections : Number section headings in LaTeX output. (Default is not to number them.) +\--listings +: Use listings package for LaTeX code blocks + \--template=*FILE* : Use *FILE* as a custom template for the generated document. Implies `-s`. See the section TEMPLATES in `pandoc`(1) for information about @@ -56,10 +59,8 @@ packages are not included in your latex setup, they can be obtained from -V KEY=VAL, \--variable=*KEY:VAL* : Set the template variable KEY to the value VAL when rendering the - document in standalone mode. This is only useful when the - `--template` option is used to specify a custom template, since - pandoc automatically sets the variables used in the default - templates. + document in standalone mode. Use this to set the font size when + using the default LaTeX template: `-V fontsize=12pt`. -H *FILE*, \--include-in-header=*FILE* : Include (LaTeX) contents of *FILE* at the end of the header. Implies @@ -71,10 +72,46 @@ packages are not included in your latex setup, they can be obtained from -A *FILE*, \--include-after-body=*FILE* : Include (LaTeX) contents of *FILE* at the end of the document body. --C *FILE*, \--custom-header=*FILE* -: Use contents of *FILE* as the document header. *Note: This option is - deprecated. Users should transition to using `--template` instead.* +\--bibliography=*FILE* +: Specify bibliography database to be used in resolving + citations. The database type will be determined from the + extension of *FILE*, which may be `.xml` (MODS format), + `.bib` (BibTeX format), or `.json` (citeproc JSON). + +\--csl=*FILE* +: Specify [CSL] style to be used in formatting citations and + the bibliography. If *FILE* is not found, pandoc will look + for it in + + $HOME/.csl + + in unix and + + C:\Documents And Settings\USERNAME\Application Data\csl + + in Windows. If the `--csl` option is not specified, pandoc + will use a default style: either `default.csl` in the + user data directory (see `--data-dir`), or, if that is + not present, the Chicago author-date style. + +\--data-dir*=DIRECTORY* +: Specify the user data directory to search for pandoc data files. + If this option is not specified, the default user data directory + will be used: + + $HOME/.pandoc + + in unix and + + C:\Documents And Settings\USERNAME\Application Data\pandoc + + in Windows. A `reference.odt`, `epub.css`, `templates` directory, + or `s5` directory placed in this directory will override pandoc's + normal defaults. # SEE ALSO `pandoc`(1), `pdflatex`(1) + +[CSL]: CitationStyles.org + diff --git a/man/man1/pandoc.1 b/man/man1/pandoc.1 new file mode 100644 index 000000000..a9fcfb08d --- /dev/null +++ b/man/man1/pandoc.1 @@ -0,0 +1,905 @@ +.TH PANDOC 1 "January 29, 2011" "Pandoc" +.SH NAME +pandoc - general markup converter +.SH SYNOPSIS +.PP +pandoc [\f[I]options\f[]] [\f[I]input-file\f[]]... +.SH DESCRIPTION +.PP +Pandoc is a Haskell library for converting from one markup format to +another, and a command-line tool that uses this library. 
+It can read markdown and (subsets of) Textile, reStructuredText, HTML, +and LaTeX; and it can write plain text, markdown, reStructuredText, +HTML, LaTeX, ConTeXt, RTF, DocBook XML, OpenDocument XML, ODT, GNU +Texinfo, MediaWiki markup, EPUB, Textile, groff man pages, Emacs +Org-Mode, and Slidy or S5 HTML slide shows. +.PP +Pandoc\[aq]s enhanced version of markdown includes syntax for footnotes, +tables, flexible ordered lists, definition lists, delimited code blocks, +superscript, subscript, strikeout, title blocks, automatic tables of +contents, embedded LaTeX math, citations, and markdown inside HTML block +elements. +(These enhancements, described below under Pandoc\[aq]s markdown, can be +disabled using the \f[C]--strict\f[] option.) +.PP +In contrast to most existing tools for converting markdown to HTML, +which use regex substitutions, Pandoc has a modular design: it consists +of a set of readers, which parse text in a given format and produce a +native representation of the document, and a set of writers, which +convert this native representation into a target format. +Thus, adding an input or output format requires only adding a reader or +writer. +.SS Using Pandoc +.PP +If no \f[I]input-file\f[] is specified, input is read from +\f[I]stdin\f[]. +Otherwise, the \f[I]input-files\f[] are concatenated (with a blank line +between each) and used as input. +Output goes to \f[I]stdout\f[] by default (though output to +\f[I]stdout\f[] is disabled for the \f[C]odt\f[] and \f[C]epub\f[] +output formats). +For output to a file, use the \f[C]-o\f[] option: +.IP +.nf +\f[C] +pandoc\ -o\ output.html\ input.txt +\f[] +.fi +.PP +Instead of a file, an absolute URI may be given. +In this case pandoc will fetch the content using HTTP: +.IP +.nf +\f[C] +pandoc\ -f\ html\ -t\ markdown\ http://www.fsf.org +\f[] +.fi +.PP +If multiple input files are given, \f[C]pandoc\f[] will concatenate them +all (with blank lines between them) before parsing. +.PP +The format of the input and output can be specified explicitly using +command-line options. +The input format can be specified using the \f[C]-r/--read\f[] or +\f[C]-f/--from\f[] options, the output format using the +\f[C]-w/--write\f[] or \f[C]-t/--to\f[] options. +Thus, to convert \f[C]hello.txt\f[] from markdown to LaTeX, you could +type: +.IP +.nf +\f[C] +pandoc\ -f\ markdown\ -t\ latex\ hello.txt +\f[] +.fi +.PP +To convert \f[C]hello.html\f[] from html to markdown: +.IP +.nf +\f[C] +pandoc\ -f\ html\ -t\ markdown\ hello.html +\f[] +.fi +.PP +Supported output formats are listed below under the \f[C]-t/--to\f[] +option. +Supported input formats are listed below under the \f[C]-f/--from\f[] +option. +Note that the \f[C]rst\f[], \f[C]textile\f[], \f[C]latex\f[], and +\f[C]html\f[] readers are not complete; there are some constructs that +they do not parse. +.PP +If the input or output format is not specified explicitly, +\f[C]pandoc\f[] will attempt to guess it from the extensions of the +input and output filenames. +Thus, for example, +.IP +.nf +\f[C] +pandoc\ -o\ hello.tex\ hello.txt +\f[] +.fi +.PP +will convert \f[C]hello.txt\f[] from markdown to LaTeX. +If no output file is specified (so that output goes to \f[I]stdout\f[]), +or if the output file\[aq]s extension is unknown, the output format will +default to HTML. +If no input file is specified (so that input comes from \f[I]stdin\f[]), +or if the input files\[aq] extensions are unknown, the input format will +be assumed to be markdown unless explicitly specified. 
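+.PP
+These mechanisms can be combined.
+For example (the output name \f[C]fsf.txt\f[] is only an illustration),
+the following fetches a web page, converts it with an explicitly
+specified reader and writer, and writes the result to a file:
+.IP
+.nf
+\f[C]
+pandoc\ -f\ html\ -t\ markdown\ -o\ fsf.txt\ http://www.fsf.org
+\f[]
+.fi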
+.PP +Pandoc uses the UTF-8 character encoding for both input and output. +If your local character encoding is not UTF-8, you should pipe input and +output through \f[C]iconv\f[]: +.IP +.nf +\f[C] +iconv\ -t\ utf-8\ input.txt\ |\ pandoc\ |\ iconv\ -f\ utf-8 +\f[] +.fi +.SH OPTIONS +.TP +.B \f[C]-f\f[] \f[I]FORMAT\f[], \f[C]-r\f[] \f[I]FORMAT\f[], +\f[C]--from=\f[]\f[I]FORMAT\f[], \f[C]--read=\f[]\f[I]FORMAT\f[] +Specify input format. +\f[I]FORMAT\f[] can be \f[C]native\f[] (native Haskell), \f[C]json\f[] +(JSON version of native AST), \f[C]markdown\f[] (markdown), +\f[C]textile\f[] (Textile), \f[C]rst\f[] (reStructuredText), +\f[C]html\f[] (HTML), or \f[C]latex\f[] (LaTeX). +If \f[C]+lhs\f[] is appended to \f[C]markdown\f[], \f[C]rst\f[], or +\f[C]latex\f[], the input will be treated as literate Haskell source: +see Literate Haskell support, below. +.RS +.RE +.TP +.B \f[C]-t\f[] \f[I]FORMAT\f[], \f[C]-w\f[] \f[I]FORMAT\f[], +\f[C]--to=\f[]\f[I]FORMAT\f[], \f[C]--write=\f[]\f[I]FORMAT\f[] +Specify output format. +\f[I]FORMAT\f[] can be \f[C]native\f[] (native Haskell), \f[C]json\f[] +(JSON version of native AST), \f[C]plain\f[] (plain text), +\f[C]markdown\f[] (markdown), \f[C]rst\f[] (reStructuredText), +\f[C]html\f[] (HTML), \f[C]latex\f[] (LaTeX), \f[C]context\f[] +(ConTeXt), \f[C]man\f[] (groff man), \f[C]mediawiki\f[] (MediaWiki +markup), \f[C]textile\f[] (Textile), \f[C]org\f[] (Emacs Org-Mode), +\f[C]texinfo\f[] (GNU Texinfo), \f[C]docbook\f[] (DocBook XML), +\f[C]opendocument\f[] (OpenDocument XML), \f[C]odt\f[] (OpenOffice text +document), \f[C]epub\f[] (EPUB book), \f[C]slidy\f[] (Slidy HTML and +javascript slide show), \f[C]s5\f[] (S5 HTML and javascript slide show), +or \f[C]rtf\f[] (rich text format). +Note that \f[C]odt\f[] and \f[C]epub\f[] output will not be directed to +\f[I]stdout\f[]; an output filename must be specified using the +\f[C]-o/--output\f[] option. +If \f[C]+lhs\f[] is appended to \f[C]markdown\f[], \f[C]rst\f[], +\f[C]latex\f[], or \f[C]html\f[], the output will be rendered as +literate Haskell source: see Literate Haskell support, below. +.RS +.RE +.TP +.B \f[C]-s\f[], \f[C]--standalone\f[] +Produce output with an appropriate header and footer (e.g. +a standalone HTML, LaTeX, or RTF file, not a fragment). +.RS +.RE +.TP +.B \f[C]-o\f[] \f[I]FILE\f[], \f[C]--output=\f[]\f[I]FILE\f[] +Write output to \f[I]FILE\f[] instead of \f[I]stdout\f[]. +If \f[I]FILE\f[] is \f[C]-\f[], output will go to \f[I]stdout\f[]. +(Exception: if the output format is \f[C]odt\f[] or \f[C]epub\f[], +output to stdout is disabled.) +.RS +.RE +.TP +.B \f[C]-p\f[], \f[C]--preserve-tabs\f[] +Preserve tabs instead of converting them to spaces (the default). +.RS +.RE +.TP +.B \f[C]--tab-stop=\f[]\f[I]NUMBER\f[] +Specify the number of spaces per tab (default is 4). +.RS +.RE +.TP +.B \f[C]--strict\f[] +Use strict markdown syntax, with no pandoc extensions or variants. +When the input format is HTML, this means that constructs that have no +equivalents in standard markdown (e.g. +definition lists or strikeout text) will be parsed as raw HTML. +.RS +.RE +.TP +.B \f[C]--normalize\f[] +Normalize the document after reading: merge adjacent \f[C]Str\f[] or +\f[C]Emph\f[] elements, for example, and remove repeated +\f[C]Space\f[]s. +.RS +.RE +.TP +.B \f[C]--reference-links\f[] +Use reference-style links, rather than inline links, in writing markdown +or reStructuredText. +By default inline links are used. 
+.RS
+.RE
+.TP
+.B \f[C]-R\f[], \f[C]--parse-raw\f[]
+Parse untranslatable HTML codes and LaTeX environments as raw HTML or
+LaTeX, instead of ignoring them.
+Affects only HTML and LaTeX input.
+Raw HTML can be printed in markdown, reStructuredText, HTML, Slidy, and
+S5 output; raw LaTeX can be printed in markdown, reStructuredText,
+LaTeX, and ConTeXt output.
+The default is for the readers to omit untranslatable HTML codes and
+LaTeX environments.
+(The LaTeX reader does pass through untranslatable LaTeX
+\f[I]commands\f[], even if \f[C]-R\f[] is not specified.)
+.RS
+.RE
+.TP
+.B \f[C]-S\f[], \f[C]--smart\f[]
+Produce typographically correct output, converting straight quotes to
+curly quotes, \f[C]---\f[] and \f[C]--\f[] to dashes, and \f[C]...\f[]
+to ellipses.
+Nonbreaking spaces are inserted after certain abbreviations, such as
+"Mr." (Note: This option is significant only when the input format is
+\f[C]markdown\f[] or \f[C]textile\f[].
+It is selected automatically when the input format is \f[C]textile\f[]
+or the output format is \f[C]latex\f[] or \f[C]context\f[].)
+.RS
+.RE
+.TP
+.B \f[C]-5\f[], \f[C]--html5\f[]
+Produce HTML5 instead of HTML4.
+This option has no effect for writers other than \f[C]html\f[].
+.RS
+.RE
+.TP
+.B \f[C]-m\f[] \f[I]URL\f[], \f[C]--latexmathml=\f[]\f[I]URL\f[]
+Use the LaTeXMathML script to display embedded TeX math in HTML output.
+To insert a link to a local copy of the \f[C]LaTeXMathML.js\f[] script,
+provide a \f[I]URL\f[].
+If no \f[I]URL\f[] is provided, the contents of the script will be
+inserted directly into the HTML header, preserving portability at the
+price of efficiency.
+If you plan to use math on several pages, it is much better to link to a
+copy of the script, so it can be cached.
+.RS
+.RE
+.TP
+.B \f[C]--mathml\f[]
+Convert TeX math to MathML.
+In standalone mode, a small javascript will be inserted that allows the
+MathML to be viewed on some browsers.
+.RS
+.RE
+.TP
+.B \f[C]--jsmath=\f[]\f[I]URL\f[]
+Use jsMath to display embedded TeX math in HTML output.
+The \f[I]URL\f[] should point to the jsMath load script (e.g.
+\f[C]jsMath/easy/load.js\f[]); if provided, it will be linked to in the
+header of standalone HTML documents.
+.RS
+.RE
+.TP
+.B \f[C]--mathjax=\f[]\f[I]URL\f[]
+Use MathJax to display embedded TeX math in HTML output.
+The \f[I]URL\f[] should point to the \f[C]MathJax.js\f[] load script.
+.RS
+.RE
+.TP
+.B \f[C]--gladtex\f[]
+Enclose TeX math in \f[C]<eq>\f[] tags in HTML output.
+These can then be processed by gladTeX to produce links to images of the
+typeset formulas.
+.RS
+.RE
+.TP
+.B \f[C]--mimetex=\f[]\f[I]URL\f[]
+Render TeX math using the mimeTeX CGI script.
+If \f[I]URL\f[] is not specified, it is assumed that the script is at
+\f[C]/cgi-bin/mimetex.cgi\f[].
+.RS
+.RE
+.TP
+.B \f[C]--webtex=\f[]\f[I]URL\f[]
+Render TeX formulas using an external script that converts TeX formulas
+to images.
+The formula will be concatenated with the URL provided.
+If \f[I]URL\f[] is not specified, the Google Chart API will be used.
+.RS
+.RE
+.TP
+.B \f[C]-i\f[], \f[C]--incremental\f[]
+Make list items in Slidy or S5 display incrementally (one by one).
+The default is for lists to be displayed all at once.
+.RS
+.RE
+.TP
+.B \f[C]--offline\f[]
+Include all the CSS and javascript needed for a Slidy or S5 slide show
+in the output, so that the slide show will work even when no internet
+connection is available.
+.RS
+.RE
+.TP
+.B \f[C]--xetex\f[]
+Create LaTeX output suitable for processing by XeTeX.
+.RS +.RE +.TP +.B \f[C]--chapters\f[] +Treat top-level headers as chapters in LaTeX, ConTeXt, and DocBook +output. +.RS +.RE +.TP +.B \f[C]-N\f[], \f[C]--number-sections\f[] +Number section headings in LaTeX, ConTeXt, or HTML output. +By default, sections are not numbered. +.RS +.RE +.TP +.B \f[C]--listings\f[] +Use listings package for LaTeX code blocks +.RS +.RE +.TP +.B \f[C]--section-divs\f[] +Wrap sections in \f[C]<div>\f[] tags (or \f[C]<section>\f[] tags in +HTML5), and attach identifiers to the enclosing \f[C]<div>\f[] (or +\f[C]<section>\f[]) rather than the header itself. +See Section identifiers, below. +.RS +.RE +.TP +.B \f[C]--no-wrap\f[] +Disable text wrapping in output. +By default, text is wrapped appropriately for the output format. +.RS +.RE +.TP +.B \f[C]--columns\f[]=\f[I]NUMBER\f[] +Specify length of lines in characters (for text wrapping). +.RS +.RE +.TP +.B \f[C]--email-obfuscation=\f[]\f[I]none|javascript|references\f[] +Specify a method for obfuscating \f[C]mailto:\f[] links in HTML +documents. +\f[I]none\f[] leaves \f[C]mailto:\f[] links as they are. +\f[I]javascript\f[] obfuscates them using javascript. +\f[I]references\f[] obfuscates them by printing their letters as decimal +or hexadecimal character references. +If \f[C]--strict\f[] is specified, \f[I]references\f[] is used +regardless of the presence of this option. +.RS +.RE +.TP +.B \f[C]--id-prefix\f[]=\f[I]STRING\f[] +Specify a prefix to be added to all automatically generated identifiers +in HTML output. +This is useful for preventing duplicate identifiers when generating +fragments to be included in other pages. +.RS +.RE +.TP +.B \f[C]--indented-code-classes=\f[]\f[I]CLASSES\f[] +Specify classes to use for indented code blocks--for example, +\f[C]perl,numberLines\f[] or \f[C]haskell\f[]. +Multiple classes may be separated by spaces or commas. +.RS +.RE +.TP +.B \f[C]--toc\f[], \f[C]--table-of-contents\f[] +Include an automatically generated table of contents (or, in the case of +\f[C]latex\f[], \f[C]context\f[], and \f[C]rst\f[], an instruction to +create one) in the output document. +This option has no effect on \f[C]man\f[], \f[C]docbook\f[], +\f[C]slidy\f[], or \f[C]s5\f[] output. +.RS +.RE +.TP +.B \f[C]--base-header-level=\f[]\f[I]NUMBER\f[] +Specify the base level for headers (defaults to 1). +.RS +.RE +.TP +.B \f[C]--template=\f[]\f[I]FILE\f[] +Use \f[I]FILE\f[] as a custom template for the generated document. +Implies \f[C]--standalone\f[]. +See Templates below for a description of template syntax. +If this option is not used, a default template appropriate for the +output format will be used. +See also \f[C]-D/--print-default-template\f[]. +.RS +.RE +.TP +.B \f[C]-V\f[] \f[I]KEY=VAL\f[], \f[C]--variable=\f[]\f[I]KEY:VAL\f[] +Set the template variable \f[I]KEY\f[] to the value \f[I]VAL\f[] when +rendering the document in standalone mode. +This is only useful when the \f[C]--template\f[] option is used to +specify a custom template, since pandoc automatically sets the variables +used in the default templates. +.RS +.RE +.TP +.B \f[C]-c\f[] \f[I]URL\f[], \f[C]--css=\f[]\f[I]URL\f[] +Link to a CSS style sheet. +.RS +.RE +.TP +.B \f[C]-H\f[] \f[I]FILE\f[], \f[C]--include-in-header=\f[]\f[I]FILE\f[] +Include contents of \f[I]FILE\f[], verbatim, at the end of the header. +This can be used, for example, to include special CSS or javascript in +HTML documents. +This option can be used repeatedly to include multiple files in the +header. +They will be included in the order specified. 
+Implies \f[C]--standalone\f[].
+.RS
+.RE
+.TP
+.B \f[C]-B\f[] \f[I]FILE\f[],
+\f[C]--include-before-body=\f[]\f[I]FILE\f[]
+Include contents of \f[I]FILE\f[], verbatim, at the beginning of the
+document body (e.g.
+after the \f[C]<body>\f[] tag in HTML, or the \f[C]\\begin{document}\f[]
+command in LaTeX).
+This can be used to include navigation bars or banners in HTML
+documents.
+This option can be used repeatedly to include multiple files.
+They will be included in the order specified.
+Implies \f[C]--standalone\f[].
+.RS
+.RE
+.TP
+.B \f[C]-A\f[] \f[I]FILE\f[],
+\f[C]--include-after-body=\f[]\f[I]FILE\f[]
+Include contents of \f[I]FILE\f[], verbatim, at the end of the document
+body (before the \f[C]</body>\f[] tag in HTML, or the
+\f[C]\\end{document}\f[] command in LaTeX).
+This option can be used repeatedly to include multiple files.
+They will be included in the order specified.
+Implies \f[C]--standalone\f[].
+.RS
+.RE
+.TP
+.B \f[C]--reference-odt=\f[]\f[I]FILE\f[]
+Use the specified file as a style reference in producing an ODT.
+For best results, the reference ODT should be a modified version of an
+ODT produced using pandoc.
+The contents of the reference ODT are ignored, but its stylesheets are
+used in the new ODT.
+If no reference ODT is specified on the command line, pandoc will look
+for a file \f[C]reference.odt\f[] in the user data directory (see
+\f[C]--data-dir\f[]).
+If this is not found either, sensible defaults will be used.
+.RS
+.RE
+.TP
+.B \f[C]--epub-stylesheet=\f[]\f[I]FILE\f[]
+Use the specified CSS file to style the EPUB.
+If no stylesheet is specified, pandoc will look for a file
+\f[C]epub.css\f[] in the user data directory (see \f[C]--data-dir\f[],
+below).
+If it is not found there, sensible defaults will be used.
+.RS
+.RE
+.TP
+.B \f[C]--epub-metadata=\f[]\f[I]FILE\f[]
+Look in the specified XML file for metadata for the EPUB.
+The file should contain a series of Dublin Core elements, as documented
+at \f[C]http://dublincore.org/documents/dces/\f[].
+For example:
+.RS
+.IP
+.nf
+\f[C]
+\ <dc:rights>Creative\ Commons</dc:rights>
+\ <dc:language>es-AR</dc:language>
+\f[]
+.fi
+.PP
+By default, pandoc will include the following metadata elements:
+\f[C]<dc:title>\f[] (from the document title), \f[C]<dc:creator>\f[]
+(from the document authors), \f[C]<dc:language>\f[] (from the locale),
+and \f[C]<dc:identifier\ id="BookId">\f[] (a randomly generated UUID).
+Any of these may be overridden by elements in the metadata file.
+.RE
+.TP
+.B \f[C]-D\f[] \f[I]FORMAT\f[],
+\f[C]--print-default-template=\f[]\f[I]FORMAT\f[]
+Print the default template for an output \f[I]FORMAT\f[].
+(See \f[C]-t\f[] for a list of possible \f[I]FORMAT\f[]s.)
+.RS
+.RE
+.TP
+.B \f[C]-T\f[] \f[I]STRING\f[], \f[C]--title-prefix=\f[]\f[I]STRING\f[]
+Specify \f[I]STRING\f[] as a prefix at the beginning of the title that
+appears in the HTML header (but not in the title as it appears at the
+beginning of the HTML body).
+Implies \f[C]--standalone\f[].
+.RS
+.RE
+.TP
+.B \f[C]--bibliography=\f[]\f[I]FILE\f[]
+Specify bibliography database to be used in resolving citations.
+The database type will be determined from the extension of
+\f[I]FILE\f[], which may be \f[C].mods\f[] (MODS format), \f[C].bib\f[]
+(BibTeX format), \f[C].bbx\f[] (BibLaTeX format), \f[C].ris\f[] (RIS
+format), \f[C].enl\f[] (EndNote format), \f[C].xml\f[] (EndNote XML
+format), \f[C].wos\f[] (ISI format), \f[C].medline\f[] (MEDLINE format),
+\f[C].copac\f[] (Copac format), or \f[C].json\f[] (citeproc JSON).
+If you want to use multiple bibliographies, just use this option +repeatedly. +.RS +.RE +.TP +.B \f[C]--csl=\f[]\f[I]FILE\f[] +Specify CSL style to be used in formatting citations and the +bibliography. +If \f[I]FILE\f[] is not found, pandoc will look for it in +.RS +.IP +.nf +\f[C] +$HOME/.csl +\f[] +.fi +.PP +in unix and +.IP +.nf +\f[C] +C:\\Documents\ And\ Settings\\USERNAME\\Application\ Data\\csl +\f[] +.fi +.PP +in Windows. +If the \f[C]--csl\f[] option is not specified, pandoc will use a default +style: either \f[C]default.csl\f[] in the user data directory (see +\f[C]--data-dir\f[]), or, if that is not present, the Chicago +author-date style. +.RE +.TP +.B \f[C]--data-dir=\f[]\f[I]DIRECTORY\f[] +Specify the user data directory to search for pandoc data files. +If this option is not specified, the default user data directory will be +used: +.RS +.IP +.nf +\f[C] +$HOME/.pandoc +\f[] +.fi +.PP +in unix and +.IP +.nf +\f[C] +C:\\Documents\ And\ Settings\\USERNAME\\Application\ Data\\pandoc +\f[] +.fi +.PP +in Windows. +A \f[C]reference.odt\f[], \f[C]epub.css\f[], \f[C]templates\f[] +directory, or \f[C]s5\f[] directory placed in this directory will +override pandoc\[aq]s normal defaults. +.RE +.TP +.B \f[C]--dump-args\f[] +Print information about command-line arguments to \f[I]stdout\f[], then +exit. +This option is intended primarily for use in wrapper scripts. +The first line of output contains the name of the output file specified +with the \f[C]-o\f[] option, or \f[C]-\f[] (for \f[I]stdout\f[]) if no +output file was specified. +The remaining lines contain the command-line arguments, one per line, in +the order they appear. +These do not include regular Pandoc options and their arguments, but do +include any options appearing after a \f[C]--\f[] separator at the end +of the line. +.RS +.RE +.TP +.B \f[C]--ignore-args\f[] +Ignore command-line arguments (for use in wrapper scripts). +Regular Pandoc options are not ignored. +Thus, for example, +.RS +.IP +.nf +\f[C] +pandoc\ --ignore-args\ -o\ foo.html\ -s\ foo.txt\ --\ -e\ latin1 +\f[] +.fi +.PP +is equivalent to +.IP +.nf +\f[C] +pandoc\ -o\ foo.html\ -s +\f[] +.fi +.RE +.TP +.B \f[C]-v\f[], \f[C]--version\f[] +Print version. +.RS +.RE +.TP +.B \f[C]-h\f[], \f[C]--help\f[] +Show usage message. +.RS +.RE +.SH TEMPLATES +.PP +When the \f[C]-s/--standalone\f[] option is used, pandoc uses a template +to add header and footer material that is needed for a self-standing +document. +To see the default template that is used, just type +.IP +.nf +\f[C] +pandoc\ -D\ FORMAT +\f[] +.fi +.PP +where \f[C]FORMAT\f[] is the name of the output format. +A custom template can be specified using the \f[C]--template\f[] option. +You can also override the system default templates for a given output +format \f[C]FORMAT\f[] by putting a file +\f[C]templates/FORMAT.template\f[] in the user data directory (see +\f[C]--data-dir\f[], above). +.PP +Templates may contain \f[I]variables\f[]. +Variable names are sequences of alphanumerics, \f[C]-\f[], and +\f[C]_\f[], starting with a letter. +A variable name surrounded by \f[C]$\f[] signs will be replaced by its +value. +For example, the string \f[C]$title$\f[] in +.IP +.nf +\f[C] +<title>$title$</title> +\f[] +.fi +.PP +will be replaced by the document title. +.PP +To write a literal \f[C]$\f[] in a template, use \f[C]$$\f[]. +.PP +Some variables are set automatically by pandoc. 
+These vary somewhat depending on the output format, but include: +.TP +.B \f[C]header-includes\f[] +contents specified by \f[C]-H/--include-in-header\f[] (may have multiple +values) +.RS +.RE +.TP +.B \f[C]toc\f[] +non-null value if \f[C]--toc/--table-of-contents\f[] was specified +.RS +.RE +.TP +.B \f[C]include-before\f[] +contents specified by \f[C]-B/--include-before-body\f[] (may have +multiple values) +.RS +.RE +.TP +.B \f[C]include-after\f[] +contents specified by \f[C]-A/--include-after-body\f[] (may have +multiple values) +.RS +.RE +.TP +.B \f[C]body\f[] +body of document +.RS +.RE +.TP +.B \f[C]title\f[] +title of document, as specified in title block +.RS +.RE +.TP +.B \f[C]author\f[] +author of document, as specified in title block (may have multiple +values) +.RS +.RE +.TP +.B \f[C]date\f[] +date of document, as specified in title block +.RS +.RE +.TP +.B \f[C]lang\f[] +language code for HTML documents +.RS +.RE +.PP +Variables may be set at the command line using the +\f[C]-V/--variable\f[] option. +This allows users to include custom variables in their templates. +.PP +Templates may contain conditionals. +The syntax is as follows: +.IP +.nf +\f[C] +$if(variable)$ +X +$else$ +Y +$endif$ +\f[] +.fi +.PP +This will include \f[C]X\f[] in the template if \f[C]variable\f[] has a +non-null value; otherwise it will include \f[C]Y\f[]. +\f[C]X\f[] and \f[C]Y\f[] are placeholders for any valid template text, +and may include interpolated variables or other conditionals. +The \f[C]$else$\f[] section may be omitted. +.PP +When variables can have multiple values (for example, \f[C]author\f[] in +a multi-author document), you can use the \f[C]$for$\f[] keyword: +.IP +.nf +\f[C] +$for(author)$ +<meta\ name="author"\ content="$author$"\ /> +$endfor$ +\f[] +.fi +.PP +You can optionally specify a separator to be used between consecutive +items: +.IP +.nf +\f[C] +$for(author)$$author$$sep$,\ $endfor$ +\f[] +.fi +.SH PRODUCING HTML SLIDE SHOWS WITH PANDOC +.PP +You can use Pandoc to produce an HTML + javascript slide presentation +that can be viewed via a web browser. +There are two ways to do this, using S5 or Slidy. +.PP +Here\[aq]s the markdown source for a simple slide show, +\f[C]eating.txt\f[]: +.IP +.nf +\f[C] +%\ Eating\ Habits +%\ John\ Doe +%\ March\ 22,\ 2005 + +#\ In\ the\ morning + +-\ Eat\ eggs +-\ Drink\ coffee + +#\ In\ the\ evening + +-\ Eat\ spaghetti +-\ Drink\ wine + +-------------------------- + + +\f[] +.fi +.PP +To produce the slide show, simply type +.IP +.nf +\f[C] +pandoc\ -w\ s5\ -s\ eating.txt\ >\ eating.html +\f[] +.fi +.PP +for S5, or +.IP +.nf +\f[C] +pandoc\ -w\ slidy\ -s\ eating.txt\ >\ eating.html +\f[] +.fi +.PP +for Slidy. +.PP +A title page is constructed automatically from the document\[aq]s title +block. +Each level-one header and horizontal rule begins a new slide. +.PP +The file produced by pandoc with the \f[C]-s/--standalone\f[] option +embeds a link to javascripts and CSS files, which are assumed to be +available at the relative path \f[C]ui/default\f[] (for S5) or at the +Slidy website at \f[C]w3.org\f[] (for Slidy). +If the \f[C]--offline\f[] option is specified, the scripts and CSS will +be included directly in the generated file, so that it may be used +offline. +.PP +You can change the style of the slides by putting customized CSS files +in \f[C]$DATADIR/s5/default\f[] (for S5) or \f[C]$DATADIR/slidy\f[] (for +Slidy), where \f[C]$DATADIR\f[] is the user data directory (see +\f[C]--data-dir\f[], above). 
+The originals may be found in pandoc\[aq]s system data directory
+(generally \f[C]$CABALDIR/pandoc-VERSION/s5/default\f[]).
+Pandoc will look there for any files it does not find in the user data
+directory.
+.SS Incremental lists
+.PP
+By default, these writers produce lists that display "all at once." If
+you want your lists to display incrementally (one item at a time), use
+the \f[C]-i\f[] option.
+If you want a particular list to depart from the default (that is, to
+display incrementally without the \f[C]-i\f[] option and all at once
+with the \f[C]-i\f[] option), put it in a block quote:
+.IP
+.nf
+\f[C]
+>\ -\ Eat\ spaghetti
+>\ -\ Drink\ wine
+\f[]
+.fi
+.PP
+In this way incremental and nonincremental lists can be mixed in a
+single document.
+.SH LITERATE HASKELL SUPPORT
+.PP
+If you append \f[C]+lhs\f[] to an appropriate input or output format
+(\f[C]markdown\f[], \f[C]rst\f[], or \f[C]latex\f[] for input or output;
+\f[C]html\f[] for output only), pandoc will treat the document as
+literate Haskell source.
+This means that
+.IP \[bu] 2
+In markdown input, "bird track" sections will be parsed as Haskell code
+rather than block quotations.
+Text between \f[C]\\begin{code}\f[] and \f[C]\\end{code}\f[] will also
+be treated as Haskell code.
+.IP \[bu] 2
+In markdown output, code blocks with class \f[C]haskell\f[] will be
+rendered using bird tracks, and block quotations will be indented one
+space, so they will not be treated as Haskell code.
+In addition, headers will be rendered setext-style (with underlines)
+rather than atx-style (with \[aq]#\[aq] characters).
+(This is because ghc treats \[aq]#\[aq] characters in column 1 as
+introducing line numbers.)
+.IP \[bu] 2
+In restructured text input, "bird track" sections will be parsed as
+Haskell code.
+.IP \[bu] 2
+In restructured text output, code blocks with class \f[C]haskell\f[]
+will be rendered using bird tracks.
+.IP \[bu] 2
+In LaTeX input, text in \f[C]code\f[] environments will be parsed as
+Haskell code.
+.IP \[bu] 2
+In LaTeX output, code blocks with class \f[C]haskell\f[] will be
+rendered inside \f[C]code\f[] environments.
+.IP \[bu] 2
+In HTML output, code blocks with class \f[C]haskell\f[] will be rendered
+with class \f[C]literatehaskell\f[] and bird tracks.
+.PP
+Examples:
+.IP
+.nf
+\f[C]
+pandoc\ -f\ markdown+lhs\ -t\ html
+\f[]
+.fi
+.PP
+reads literate Haskell source formatted with markdown conventions and
+writes ordinary HTML (without bird tracks).
+.IP
+.nf
+\f[C]
+pandoc\ -f\ markdown+lhs\ -t\ html+lhs
+\f[]
+.fi
+.PP
+writes HTML with the Haskell code in bird tracks, so it can be copied
+and pasted as literate Haskell source.
+.SH AUTHORS
+.PP
+© 2006-2011 John MacFarlane (jgm at berkeley dot edu).
+Released under the GPL, version 2 or greater.
+This software carries no warranty of any kind.
+(See COPYRIGHT for full copyright and warranty notices.)
+ Other contributors include Recai Oktaş, Paulo Tanimoto, Peter Wang,
+Andrea Rossato, Eric Kow, infinity0x, Luke Plant, shreevatsa.public,
+Puneeth Chaganti, Paul Rivier, rodja.trappe, Bradley Kuhn, thsutton,
+Nathan Gass, Jonathan Daugherty, Jérémy Bobbio, Justin Bogner.
+.SH PANDOC'S MARKDOWN
+For a complete description of pandoc's extensions to standard markdown,
+see \f[C]pandoc_markdown\f[] (5).
+.SH SEE ALSO
+.PP
+\f[C]markdown2pdf\f[] (1), \f[C]pandoc_markdown\f[] (5).
+.PP
+The Pandoc source code and all documentation may be downloaded
+from <http://johnmacfarlane.net/pandoc/>.
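
To make the literate Haskell support described above concrete (the file
name `Sort.lhs` is only a placeholder for a markdown-formatted literate
Haskell source), the two commands shown in that section would typically be
run against a file like this:

    # render the module as ordinary HTML documentation
    pandoc -f markdown+lhs -t html -s -o Sort.html Sort.lhs

    # produce HTML that still carries the code in bird tracks
    pandoc -f markdown+lhs -t html+lhs -s -o Sort-lhs.html Sort.lhs
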
diff --git a/man/man1/pandoc.1.md b/man/man1/pandoc.1.md deleted file mode 100644 index 502b0b98d..000000000 --- a/man/man1/pandoc.1.md +++ /dev/null @@ -1,394 +0,0 @@ -% PANDOC(1) Pandoc User Manuals -% John MacFarlane -% January 8, 2008 - -# NAME - -pandoc - general markup converter - -# SYNOPSIS - -pandoc [*options*] [*input-file*]... - -# DESCRIPTION - -Pandoc converts files from one markup format to another. It can -read markdown and (subsets of) reStructuredText, HTML, and LaTeX, and -it can write plain text, markdown, reStructuredText, HTML, LaTeX, -ConTeXt, Texinfo, groff man, MediaWiki markup, RTF, OpenDocument XML, -ODT, DocBook XML, EPUB, and Slidy or S5 HTML slide shows. - -If no *input-file* is specified, input is read from *stdin*. -Otherwise, the *input-files* are concatenated (with a blank -line between each) and used as input. Output goes to *stdout* by -default (though output to *stdout* is disabled for the `odt` and -`epub` output formats). For output to a file, use the `-o` option: - - pandoc -o output.html input.txt - -Instead of a file, an absolute URI may be given. In this case -pandoc will fetch the content using HTTP: - - pandoc -f html -t markdown http://www.fsf.org - -The input and output formats may be specified using command-line options -(see **OPTIONS**, below, for details). If these formats are not -specified explicitly, Pandoc will attempt to determine them -from the extensions of the input and output filenames. If input comes -from *stdin* or from a file with an unknown extension, the input is assumed -to be markdown. If no output filename is specified using the `-o` -option, or if a filename is specified but its extension is unknown, -the output will default to HTML. Thus, for example, - - pandoc -o chap1.tex chap1.txt - -converts *chap1.txt* from markdown to LaTeX. And - - pandoc README - -converts *README* from markdown to HTML. - -Pandoc's version of markdown is an extended variant of standard -markdown: the differences are described in the *README* file in -the user documentation. If standard markdown syntax is desired, the -`--strict` option may be used. - -Pandoc uses the UTF-8 character encoding for both input and output. -If your local character encoding is not UTF-8, you -should pipe input and output through `iconv`: - - iconv -t utf-8 input.txt | pandoc | iconv -f utf-8 - -# OPTIONS - --f *FORMAT*, -r *FORMAT*, \--from=*FORMAT*, \--read=*FORMAT* -: Specify input format. *FORMAT* can be - `native` (native Haskell), `markdown` (markdown or plain text), - `rst` (reStructuredText), `html` (HTML), or `latex` (LaTeX). - If `+lhs` is appended to `markdown`, `rst`, or `latex`, the input - will be treated as literate Haskell source. - --t *FORMAT*, -w *FORMAT*, \--to=*FORMAT*, \--write=*FORMAT* -: Specify output format. *FORMAT* can be `native` (native Haskell), - `plain` (plain text), `markdown` (markdown), `rst` (reStructuredText), - `html` (HTML), `latex` (LaTeX), `context` (ConTeXt), `man` (groff man), - `mediawiki` (MediaWiki markup), `texinfo` (GNU Texinfo), - `docbook` (DocBook XML), `opendocument` (OpenDocument XML), - `odt` (OpenOffice text document), `epub` (EPUB book), - `slidy` (Slidy HTML and javascript slide show), - `s5` (S5 HTML and javascript slide show), or `rtf` (rich text - format). Note that `odt` and `epub` output will not be directed to - *stdout*; an output filename must be specified using the `-o/--output` - option. 
If `+lhs` is appended to `markdown`, `rst`, `latex`, or `html`, - the output will be rendered as literate Haskell source. - --s, \--standalone -: Produce output with an appropriate header and footer (e.g. a - standalone HTML, LaTeX, or RTF file, not a fragment). - --o *FILE*, \--output=*FILE* -: Write output to *FILE* instead of *stdout*. If *FILE* is - \``-`', output will go to *stdout*. - --p, \--preserve-tabs -: Preserve tabs instead of converting them to spaces. - -\--tab-stop=*TABSTOP* -: Specify tab stop (default is 4). - -\--strict -: Use strict markdown syntax, with no extensions or variants. - -\--reference-links -: Use reference-style links, rather than inline links, in writing markdown - or reStructuredText. - --R, \--parse-raw -: Parse untranslatable HTML codes and LaTeX environments as raw HTML - or LaTeX, instead of ignoring them. - --S, \--smart -: Use smart quotes, dashes, and ellipses. (This option is significant - only when the input format is `markdown`. It is selected automatically - when the output format is `latex` or `context`.) - --m*URL*, \--latexmathml=*URL* -: Use LaTeXMathML to display embedded TeX math in HTML output. - To insert a link to a local copy of the `LaTeXMathML.js` script, - provide a *URL*. If no *URL* is provided, the contents of the - script will be inserted directly into the HTML header. - -\--mathml -: Convert TeX math to MathML. In standalone mode, a small javascript - will be inserted that allows the MathML to be viewed on some browsers. - -\--jsmath=*URL* -: Use jsMath to display embedded TeX math in HTML output. - The *URL* should point to the jsMath load script; if provided, - it will be linked to in the header of standalone HTML documents. - -\--gladtex -: Enclose TeX math in `<eq>` tags in HTML output. These can then - be processed by gladTeX to produce links to images of the typeset - formulas. - -\--mimetex=*URL* -: Render TeX math using the mimeTeX CGI script. If *URL* is not specified, - it is assumed that the script is at `/cgi-bin/mimetex.cgi`. - -\--webtex=*URL* -: Render TeX math using an external script. The formula will be - concatenated with the URL provided. If *URL* is not specified, the - Google Chart API will be used. - --i, \--incremental -: Make list items in Slidy or S5 display incrementally (one by one). - -\--offline -: Include all the CSS and javascript needed for a Slidy or S5 slide - show in the output, so that the slide show will work even when no - internet connection is available. - -\--xetex -: Create LaTeX outut suitable for processing by XeTeX. - --N, \--number-sections -: Number section headings in LaTeX, ConTeXt, or HTML output. - (Default is not to number them.) - -\--section-divs -: Wrap sections in `<div>` tags, and attach identifiers to the - enclosing `<div>` rather than the header itself. - -\--no-wrap -: Disable text wrapping in output. (Default is to wrap text.) - -\--sanitize-html -: Sanitizes HTML (in markdown or HTML input) using a whitelist. - Unsafe tags are replaced by HTML comments; unsafe attributes - are omitted. URIs in links and images are also checked against a - whitelist of URI schemes. - -\--email-obfuscation=*none|javascript|references* -: Specify a method for obfuscating `mailto:` links in HTML documents. - *none* leaves `mailto:` links as they are. *javascript* obfuscates - them using javascript. *references* obfuscates them by printing their - letters as decimal or hexadecimal character references. 
- If `--strict` is specified, *references* is used regardless of the - presence of this option. - -\--id-prefix*=string* -: Specify a prefix to be added to all automatically generated identifiers - in HTML output. This is useful for preventing duplicate identifiers - when generating fragments to be included in other pages. - -\--indented-code-classes*=classes* -: Specify classes to use for indented code blocks--for example, - `perl,numberLines` or `haskell`. Multiple classes may be separated - by spaces or commas. - -\--toc, \--table-of-contents -: Include an automatically generated table of contents (HTML, markdown, - RTF) or an instruction to create one (LaTeX, reStructuredText). - This option has no effect on man, DocBook, Slidy, or S5 output. - -\--base-header-level=*LEVEL* -: Specify the base level for headers (defaults to 1). - -\--template=*FILE* -: Use *FILE* as a custom template for the generated document. Implies - `-s`. See TEMPLATES below for a description of template syntax. If - this option is not used, a default template appropriate for the - output format will be used. See also `-D/--print-default-template`. - --V KEY=VAL, \--variable=*KEY:VAL* -: Set the template variable KEY to the value VAL when rendering the - document in standalone mode. This is only useful when the - `--template` option is used to specify a custom template, since - pandoc automatically sets the variables used in the default - templates. - --c *CSS*, \--css=*CSS* -: Link to a CSS style sheet. *CSS* is the pathname of the style sheet. - --H *FILE*, \--include-in-header=*FILE* -: Include contents of *FILE* at the end of the header. Implies `-s`. - --B *FILE*, \--include-before-body=*FILE* -: Include contents of *FILE* at the beginning of the document body. - Implies `-s`. - --A *FILE*, \--include-after-body=*FILE* -: Include contents of *FILE* at the end of the document body. - Implies `-s`. - --C *FILE*, \--custom-header=*FILE* -: Use contents of *FILE* as the document header. *Note: This option is - deprecated. Users should transition to using `--template` instead.* - -\--reference-odt=*filename* -: Use the specified file as a style reference in producing an ODT. - For best results, the reference ODT should be a modified version - of an ODT produced using pandoc. The contents of the reference ODT - are ignored, but its stylesheets are used in the new ODT. If no - reference ODT is specified on the command line, pandoc will look - for a file `reference.odt` in the user data directory (see - `--data-dir`). If this is not found either, sensible defaults will be - used. - -\--epub-stylesheet=*filename* -: Use the specified CSS file to style the EPUB. If no stylesheet - is specified, pandoc will look for a file `epub.css` in the - user data directory (see `--data-dir`, below). If it is not - found there, sensible defaults will be used. - -\--epub-metadata=*filename* -: Look in the specified XML file for metadata for the EPUB. - The file should contain a series of Dublin Core elements - (http://dublincore.org/documents/dces/), for example: - - <dc:rights>Creative Commons</dc:rights> - <dc:language>es-AR</dc:language> - - By default, pandoc will include the following metadata elements: - `<dc:title>` (from the document title), `<dc:creator>` (from the - document authors), `<dc:language>` (from the locale), and - `<dc:identifier id="BookId">` (a randomly generated UUID). Any of - these may be overridden by elements in the metadata file. 
- --D *FORMAT*, \--print-default-template=*FORMAT* -: Print the default template for an output *FORMAT*. (See `-t` - for a list of possible *FORMAT*s.) - --T *STRING*, \--title-prefix=*STRING* -: Specify *STRING* as a prefix to the HTML window title. - -\--data-dir*=DIRECTORY* -: Specify the user data directory to search for pandoc data files. - If this option is not specified, the default user data directory - will be used: - - $HOME/.pandoc - - in unix and - - C:\Documents And Settings\USERNAME\Application Data\pandoc - - in Windows. A `reference.odt`, `epub.css`, `templates` directory, - or `s5` directory placed in this directory will override pandoc's - normal defaults. - -\--dump-args -: Print information about command-line arguments to *stdout*, then exit. - The first line of output contains the name of the output file specified - with the `-o` option, or \``-`' (for *stdout*) if no output file was - specified. The remaining lines contain the command-line arguments, - one per line, in the order they appear. These do not include regular - Pandoc options and their arguments, but do include any options appearing - after a \``--`' separator at the end of the line. - This option is intended primarily for use in wrapper scripts. - -\--ignore-args -: Ignore command-line arguments (for use in wrapper scripts). - Regular Pandoc options are not ignored. Thus, for example, - - pandoc --ignore-args -o foo.html -s foo.txt -- -e latin1 - - is equivalent to - - pandoc -o foo.html -s - --v, \--version -: Print version. - --h, \--help -: Show usage message. - -# TEMPLATES - -When the `-s/--standalone` option is used, pandoc uses a template to -add header and footer material that is needed for a self-standing -document. To see the default template that is used, just type - - pandoc --print-default-template=FORMAT - -where `FORMAT` is the name of the output format. A custom template -can be specified using the `--template` option. You can also override -the system default templates for a given output format `FORMAT` -by putting a file `templates/FORMAT.template` in the user data -directory (see `--data-dir`, below). - -Templates may contain *variables*. Variable names are sequences of -alphanumerics, `-`, and `_`, starting with a letter. A variable name -surrounded by `$` signs will be replaced by its value. For example, -the string `$title$` in - - <title>$title$</title> - -will be replaced by the document title. - -To write a literal `$` in a template, use `$$`. - -Some variables are set automatically by pandoc. These vary somewhat -depending on the output format, but include: - -`legacy-header` -: contents specified by `-C/--custom-header` -`header-includes` -: contents specified by `-H/--include-in-header` (may have multiple - values) -`toc` -: non-null value if `--toc/--table-of-contents` was specified -`include-before` -: contents specified by `-B/--include-before-body` (may have - multiple values) -`include-after` -: contents specified by `-A/--include-after-body` (may have - multiple values) -`body` -: body of document -`title` -: title of document, as specified in title block -`author` -: author of document, as specified in title block (may have - multiple values) -`date` -: date of document, as specified in title block - -Variables may be set at the command line using the `-V/--variable` -option. This allows users to include custom variables in their -templates. - -Templates may contain conditionals. 
The syntax is as follows:
-
-    $if(variable)$
-    X
-    $else$
-    Y
-    $endif$
-
-This will include `X` in the template if `variable` has a non-null
-value; otherwise it will include `Y`. `X` and `Y` are placeholders for
-any valid template text, and may include interpolated variables or other
-conditionals. The `$else$` section may be omitted.
-
-When variables can have multiple values (for example, `author` in
-a multi-author document), you can use the `$for$` keyword:
-
-    $for(author)$
-    <meta name="author" content="$author$" />
-    $endfor$
-
-You can optionally specify a separator to be used between
-consecutive items:
-
-    $for(author)$$author$$sep$, $endfor$
-
-# SEE ALSO
-
-`markdown2pdf` (1).
-The *README* file distributed with Pandoc contains full documentation.
-
-The Pandoc source code and all documentation may be downloaded from
-<http://johnmacfarlane.net/pandoc/>.
-
diff --git a/man/man1/pandoc.1.template b/man/man1/pandoc.1.template
new file mode 100644
index 000000000..c9b2b20f8
--- /dev/null
+++ b/man/man1/pandoc.1.template
@@ -0,0 +1,16 @@
+$if(has-tables)$
+.\"t
+$endif$
+.TH PANDOC 1 "$date$" "$title$"
+.SH NAME
+pandoc - general markup converter
+$body$
+.SH PANDOC'S MARKDOWN
+For a complete description of pandoc's extensions to standard markdown,
+see \f[C]pandoc_markdown\f[] (5).
+.SH SEE ALSO
+.PP
+\f[C]markdown2pdf\f[] (1), \f[C]pandoc_markdown\f[] (5).
+.PP
+The Pandoc source code and all documentation may be downloaded
+from <http://johnmacfarlane.net/pandoc/>.
diff --git a/man/man5/pandoc_markdown.5 b/man/man5/pandoc_markdown.5
new file mode 100644
index 000000000..1b5c483c1
--- /dev/null
+++ b/man/man5/pandoc_markdown.5
@@ -0,0 +1,1692 @@
+.\"t
+.TH PANDOC_MARKDOWN 5 "January 29, 2011" "Pandoc"
+.SH NAME
+pandoc_markdown - markdown syntax for pandoc(1)
+.SH DESCRIPTION
+.PP
+Pandoc understands an extended and slightly revised version of John
+Gruber\[aq]s markdown syntax.
+This document explains the syntax, noting differences from standard
+markdown.
+Except where noted, these differences can be suppressed by specifying
+the \f[C]--strict\f[] command-line option.
+.SH PHILOSOPHY
+.PP
+Markdown is designed to be easy to write, and, even more importantly,
+easy to read:
+.RS
+.PP
+A Markdown-formatted document should be publishable as-is, as plain
+text, without looking like it\[aq]s been marked up with tags or
+formatting instructions.
+-- John Gruber
+.RE
+.PP
+This principle has guided pandoc\[aq]s decisions in finding syntax for
+tables, footnotes, and other extensions.
+.PP
+There is, however, one respect in which pandoc\[aq]s aims are different
+from the original aims of markdown.
+Whereas markdown was originally designed with HTML generation in mind,
+pandoc is designed for multiple output formats.
+Thus, while pandoc allows the embedding of raw HTML, it discourages it,
+and provides other, non-HTMLish ways of representing important document
+elements like definition lists, tables, mathematics, and footnotes.
+.SH PARAGRAPHS
+.PP
+A paragraph is one or more lines of text followed by one or more blank
+lines.
+Newlines are treated as spaces, so you can reflow your paragraphs as you
+like.
+If you need a hard line break, put two or more spaces at the end of a
+line, or type a backslash followed by a newline.
+.SH HEADERS
+.PP
+There are two kinds of headers, Setext and atx.
+.SS Setext-style headers
+.PP
+A setext-style header is a line of text "underlined" with a row of
+\f[C]=\f[] signs (for a level one header) or \f[C]-\f[] signs (for a
+level two header):
+.IP
+.nf
+\f[C]
+A\ level-one\ header
+==================
+
+A\ level-two\ header
+------------------
+\f[]
+.fi
+.PP
+The header text can contain inline formatting, such as emphasis (see
+Inline formatting, below).
+.SS Atx-style headers
+.PP
+An Atx-style header consists of one to six \f[C]#\f[] signs and a line
+of text, optionally followed by any number of \f[C]#\f[] signs.
+The number of \f[C]#\f[] signs at the beginning of the line is the
+header level:
+.IP
+.nf
+\f[C]
+##\ A\ level-two\ header
+
+###\ A\ level-three\ header\ ###
+\f[]
+.fi
+.PP
+As with setext-style headers, the header text can contain formatting:
+.IP
+.nf
+\f[C]
+#\ A\ level-one\ header\ with\ a\ [link](/url)\ and\ *emphasis*
+\f[]
+.fi
+.PP
+Standard markdown syntax does not require a blank line before a header.
+Pandoc does require this (except, of course, at the beginning of the
+document).
+The reason for the requirement is that it is all too easy for a
+\f[C]#\f[] to end up at the beginning of a line by accident (perhaps
+through line wrapping).
+Consider, for example:
+.IP
+.nf
+\f[C]
+I\ like\ several\ of\ their\ flavors\ of\ ice\ cream:
+#22,\ for\ example,\ and\ #5.
+\f[]
+.fi
+.SS Header identifiers in HTML
+.PP
+\f[I]Pandoc extension\f[].
+.PP
+Each header element in pandoc\[aq]s HTML output is given a unique
+identifier.
+This identifier is based on the text of the header.
+To derive the identifier from the header text,
+.IP \[bu] 2
+Remove all formatting, links, etc.
+.IP \[bu] 2
+Remove all punctuation, except underscores, hyphens, and periods.
+.IP \[bu] 2
+Replace all spaces and newlines with hyphens.
+.IP \[bu] 2
+Convert all alphabetic characters to lowercase.
+.IP \[bu] 2
+Remove everything up to the first letter (identifiers may not begin with
+a number or punctuation mark).
+.IP \[bu] 2
+If nothing is left after this, use the identifier \f[C]section\f[].
+.PP
+Thus, for example,
+.PP
+.TS
+tab(@);
+l l.
+T{
+Header
+T}@T{
+Identifier
+T}
+_
+T{
+Header identifiers in HTML
+T}@T{
+\f[C]header-identifiers-in-html\f[]
+T}
+T{
+\f[I]Dogs\f[]?--in \f[I]my\f[] house?
+T}@T{
+\f[C]dogs--in-my-house\f[]
+T}
+T{
+HTML, S5, or RTF?
+T}@T{
+\f[C]html-s5-or-rtf\f[]
+T}
+T{
+3.
+Applications
+T}@T{
+\f[C]applications\f[]
+T}
+T{
+33
+T}@T{
+\f[C]section\f[]
+T}
+.TE
+.PP
+These rules should, in most cases, allow one to determine the identifier
+from the header text.
+The exception is when several headers have the same text; in this case,
+the first will get an identifier as described above; the second will get
+the same identifier with \f[C]-1\f[] appended; the third with
+\f[C]-2\f[]; and so on.
+.PP
+These identifiers are used to provide link targets in the table of
+contents generated by the \f[C]--toc|--table-of-contents\f[] option.
+They also make it easy to provide links from one section of a document
+to another.
+A link to this section, for example, might look like this:
+.IP
+.nf
+\f[C]
+See\ the\ section\ on
+[header\ identifiers](#header-identifiers-in-html).
+\f[]
+.fi
+.PP
+Note, however, that this method of providing links to sections works
+only in HTML.
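+.PP
+For instance (the file names \f[C]notes.txt\f[] and \f[C]notes.html\f[]
+are only placeholders), a standalone HTML page whose table of contents
+links to these automatically generated identifiers can be produced with:
+.IP
+.nf
+\f[C]
+pandoc\ -s\ --toc\ notes.txt\ -o\ notes.html
+\f[]
+.fi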
+.PP +If the \f[C]--section-divs\f[] option is specified, then each section +will be wrapped in a \f[C]div\f[] (or a \f[C]section\f[], if +\f[C]--html5\f[] was specified), and the identifier will be attached to +the enclosing \f[C]<div>\f[] (or \f[C]<section>\f[]) tag rather than the +header itself. +This allows entire sections to be manipulated using javascript or +treated differently in CSS. +.SH BLOCK QUOTATIONS +.PP +Markdown uses email conventions for quoting blocks of text. +A block quotation is one or more paragraphs or other block elements +(such as lists or headers), with each line preceded by a \f[C]>\f[] +character and a space. +(The \f[C]>\f[] need not start at the left margin, but it should not be +indented more than three spaces.) +.IP +.nf +\f[C] +>\ This\ is\ a\ block\ quote.\ This +>\ paragraph\ has\ two\ lines. +> +>\ 1.\ This\ is\ a\ list\ inside\ a\ block\ quote. +>\ 2.\ Second\ item. +\f[] +.fi +.PP +A "lazy" form, which requires the \f[C]>\f[] character only on the first +line of each block, is also allowed: +.IP +.nf +\f[C] +>\ This\ is\ a\ block\ quote.\ This +paragraph\ has\ two\ lines. + +>\ 1.\ This\ is\ a\ list\ inside\ a\ block\ quote. +2.\ Second\ item. +\f[] +.fi +.PP +Among the block elements that can be contained in a block quote are +other block quotes. +That is, block quotes can be nested: +.IP +.nf +\f[C] +>\ This\ is\ a\ block\ quote. +> +>\ >\ A\ block\ quote\ within\ a\ block\ quote. +\f[] +.fi +.PP +Standard markdown syntax does not require a blank line before a block +quote. +Pandoc does require this (except, of course, at the beginning of the +document). +The reason for the requirement is that it is all too easy for a +\f[C]>\f[] to end up at the beginning of a line by accident (perhaps +through line wrapping). +So, unless \f[C]--strict\f[] is used, the following does not produce a +nested block quote in pandoc: +.IP +.nf +\f[C] +>\ This\ is\ a\ block\ quote. +>>\ Nested. +\f[] +.fi +.SH VERBATIM (CODE) BLOCKS +.SS Indented code blocks +.PP +A block of text indented four spaces (or one tab) is treated as verbatim +text: that is, special characters do not trigger special formatting, and +all spaces and line breaks are preserved. +For example, +.IP +.nf +\f[C] +\ \ \ \ if\ (a\ >\ 3)\ { +\ \ \ \ \ \ moveShip(5\ *\ gravity,\ DOWN); +\ \ \ \ } +\f[] +.fi +.PP +The initial (four space or one tab) indentation is not considered part +of the verbatim text, and is removed in the output. +.PP +Note: blank lines in the verbatim text need not begin with four spaces. +.SS Delimited code blocks +.PP +\f[I]Pandoc extension\f[]. +.PP +In addition to standard indented code blocks, Pandoc supports +\f[I]delimited\f[] code blocks. +These begin with a row of three or more tildes (\f[C]~\f[]) and end with +a row of tildes that must be at least as long as the starting row. +Everything between the tilde-lines is treated as code. +No indentation is necessary: +.IP +.nf +\f[C] +~~~~~~~ +if\ (a\ >\ 3)\ { +\ \ moveShip(5\ *\ gravity,\ DOWN); +} +~~~~~~~ +\f[] +.fi +.PP +Like regular code blocks, delimited code blocks must be separated from +surrounding text by blank lines. 
+.PP +If the code itself contains a row of tildes, just use a longer row of +tildes at the start and end: +.IP +.nf +\f[C] +~~~~~~~~~~~~~~~~ +~~~~~~~~~~ +code\ including\ tildes +~~~~~~~~~~ +~~~~~~~~~~~~~~~~ +\f[] +.fi +.PP +Optionally, you may specify the language of the code block using this +syntax: +.IP +.nf +\f[C] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\ {.haskell\ .numberLines} +qsort\ []\ \ \ \ \ =\ [] +qsort\ (x:xs)\ =\ qsort\ (filter\ (<\ x)\ xs)\ ++\ [x]\ ++ +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ qsort\ (filter\ (>=\ x)\ xs)\ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +\f[] +.fi +.PP +Some output formats can use this information to do syntax highlighting. +Currently, the only output format that uses this information is HTML. +.PP +If pandoc has been compiled with syntax highlighting support, then the +code block above will appear highlighted, with numbered lines. +(To see which languages are supported, do \f[C]pandoc\ --version\f[].) +.PP +If pandoc has not been compiled with syntax highlighting support, the +code block above will appear as follows: +.IP +.nf +\f[C] +<pre\ class="haskell"> +\ \ <code> +\ \ ... +\ \ </code> +</pre> +\f[] +.fi +.SH LISTS +.SS Bullet lists +.PP +A bullet list is a list of bulleted list items. +A bulleted list item begins with a bullet (\f[C]*\f[], \f[C]+\f[], or +\f[C]-\f[]). +Here is a simple example: +.IP +.nf +\f[C] +*\ one +*\ two +*\ three +\f[] +.fi +.PP +This will produce a "compact" list. +If you want a "loose" list, in which each item is formatted as a +paragraph, put spaces between the items: +.IP +.nf +\f[C] +*\ one + +*\ two + +*\ three +\f[] +.fi +.PP +The bullets need not be flush with the left margin; they may be indented +one, two, or three spaces. +The bullet must be followed by whitespace. +.PP +A list item may contain multiple paragraphs and other block-level +content. +Subsequent paragraphs must be preceded by a blank line and indented four +spaces or a tab. +The list will look better if the first paragraph is aligned with the +rest: +.IP +.nf +\f[C] +\ \ *\ First\ paragraph. + +\ \ \ \ Continued. + +\ \ *\ Second\ paragraph.\ With\ a\ code\ block,\ which\ must\ be\ indented +\ \ \ \ eight\ spaces: + +\ \ \ \ \ \ \ \ {\ code\ } +\f[] +.fi +.PP +List items may include other lists. +In this case the preceding blank line is optional. +The nested list must be indented four spaces or one tab: +.IP +.nf +\f[C] +*\ fruits +\ \ \ \ +\ apples +\ \ \ \ \ \ \ \ -\ macintosh +\ \ \ \ \ \ \ \ -\ red\ delicious +\ \ \ \ +\ pears +\ \ \ \ +\ peaches +*\ vegetables +\ \ \ \ +\ brocolli +\ \ \ \ +\ chard +\f[] +.fi +.PP +Markdown allows you to write list items "lazily," instead of indenting +continuation lines. +However, if there are multiple paragraphs or other blocks in a list +item, the first line of each must be indented. +.IP +.nf +\f[C] ++\ A\ lazy,\ lazy,\ list +item. + ++\ Another\ one;\ this\ looks +bad\ but\ is\ legal. + +\ \ \ \ Second\ paragraph\ of\ second +list\ item. +\f[] +.fi +.SS Ordered lists +.PP +Ordered lists work just like bulleted lists, except that the items begin +with enumerators rather than bullets. +.PP +In standard markdown, enumerators are decimal numbers followed by a +period and a space. +The numbers themselves are ignored, so there is no difference between +this list: +.IP +.nf +\f[C] +1.\ \ one +2.\ \ two +3.\ \ three +\f[] +.fi +.PP +and this one: +.IP +.nf +\f[C] +5.\ \ one +7.\ \ two +1.\ \ three +\f[] +.fi +.PP +\f[I]Pandoc extension\f[]. 
+.PP +Unlike standard markdown, Pandoc allows ordered list items to be marked +with uppercase and lowercase letters and roman numerals, in addition to +arabic numerals. +List markers may be enclosed in parentheses or followed by a single +right-parentheses or period. +They must be separated from the text that follows by at least one space, +and, if the list marker is a capital letter with a period, by at least +two spaces.[1] +.PP +Pandoc also pays attention to the type of list marker used, and to the +starting number, and both of these are preserved where possible in the +output format. +Thus, the following yields a list with numbers followed by a single +parenthesis, starting with 9, and a sublist with lowercase roman +numerals: +.IP +.nf +\f[C] +\ 9)\ \ Ninth +10)\ \ Tenth +11)\ \ Eleventh +\ \ \ \ \ \ \ i.\ subone +\ \ \ \ \ \ ii.\ subtwo +\ \ \ \ \ iii.\ subthree +\f[] +.fi +.PP +Note that Pandoc pays attention only to the \f[I]starting\f[] marker in +a list. +So, the following yields a list numbered sequentially starting from 2: +.IP +.nf +\f[C] +(2)\ Two +(5)\ Three +1.\ \ Four +*\ \ \ Five +\f[] +.fi +.PP +If default list markers are desired, use \f[C]#.\f[]: +.IP +.nf +\f[C] +#.\ \ one +#.\ \ two +#.\ \ three +\f[] +.fi +.SS Definition lists +.PP +\f[I]Pandoc extension\f[]. +.PP +Pandoc supports definition lists, using a syntax inspired by PHP +Markdown Extra and reStructuredText:[2] +.IP +.nf +\f[C] +Term\ 1 + +:\ \ \ Definition\ 1 + +Term\ 2\ with\ *inline\ markup* + +:\ \ \ Definition\ 2 + +\ \ \ \ \ \ \ \ {\ some\ code,\ part\ of\ Definition\ 2\ } + +\ \ \ \ Third\ paragraph\ of\ definition\ 2. +\f[] +.fi +.PP +Each term must fit on one line, which may optionally be followed by a +blank line, and must be followed by one or more definitions. +A definition begins with a colon or tilde, which may be indented one or +two spaces. +A term may have multiple definitions, and each definition may consist of +one or more block elements (paragraph, code block, list, etc.) +, each indented four spaces or one tab stop. +.PP +If you leave space after the definition (as in the example above), the +blocks of the definitions will be considered paragraphs. +In some output formats, this will mean greater spacing between +term/definition pairs. +For a compact definition list, do not leave space between the definition +and the next term: +.IP +.nf +\f[C] +Term\ 1 +\ \ ~\ Definition\ 1 +Term\ 2 +\ \ ~\ Definition\ 2a +\ \ ~\ Definition\ 2b +\f[] +.fi +.SS Numbered example lists +.PP +\f[I]Pandoc extension\f[]. +.PP +The special list marker \f[C]\@\f[] can be used for sequentially +numbered examples. +The first list item with a \f[C]\@\f[] marker will be numbered +\[aq]1\[aq], the next \[aq]2\[aq], and so on, throughout the document. +The numbered examples need not occur in a single list; each new list +using \f[C]\@\f[] will take up where the last stopped. +So, for example: +.IP +.nf +\f[C] +(\@)\ \ My\ first\ example\ will\ be\ numbered\ (1). +(\@)\ \ My\ second\ example\ will\ be\ numbered\ (2). + +Explanation\ of\ examples. + +(\@)\ \ My\ third\ example\ will\ be\ numbered\ (3). +\f[] +.fi +.PP +Numbered examples can be labeled and referred to elsewhere in the +document: +.IP +.nf +\f[C] +(\@good)\ \ This\ is\ a\ good\ example. + +As\ (\@good)\ illustrates,\ ... +\f[] +.fi +.PP +The label can be any string of alphanumeric characters, underscores, or +hyphens. +.SS Compact and loose lists +.PP +Pandoc behaves differently from \f[C]Markdown.pl\f[] on some "edge +cases" involving lists. 
+Consider this source: +.IP +.nf +\f[C] ++\ \ \ First ++\ \ \ Second: +\ -\ \ \ Fee +\ -\ \ \ Fie +\ -\ \ \ Foe + ++\ \ \ Third +\f[] +.fi +.PP +Pandoc transforms this into a "compact list" (with no \f[C]<p>\f[] tags +around "First", "Second", or "Third"), while markdown puts \f[C]<p>\f[] +tags around "Second" and "Third" (but not "First"), because of the blank +space around "Third". +Pandoc follows a simple rule: if the text is followed by a blank line, +it is treated as a paragraph. +Since "Second" is followed by a list, and not a blank line, it isn\[aq]t +treated as a paragraph. +The fact that the list is followed by a blank line is irrelevant. +(Note: Pandoc works this way even when the \f[C]--strict\f[] option is +specified. +This behavior is consistent with the official markdown syntax +description, even though it is different from that of +\f[C]Markdown.pl\f[].) +.SS Ending a list +.PP +What if you want to put an indented code block after a list? +.IP +.nf +\f[C] +-\ \ \ item\ one +-\ \ \ item\ two + +\ \ \ \ {\ my\ code\ block\ } +\f[] +.fi +.PP +Trouble! Here pandoc (like other markdown implementations) will treat +\f[C]{\ my\ code\ block\ }\f[] as the second paragraph of item two, and +not as a code block. +.PP +To "cut off" the list after item two, you can insert some non-indented +content, like an HTML comment, which won\[aq]t produce visible output in +any format: +.IP +.nf +\f[C] +-\ \ \ item\ one +-\ \ \ item\ two + +<!--\ end\ of\ list\ --> + +\ \ \ \ {\ my\ code\ block\ } +\f[] +.fi +.PP +You can use the same trick if you want two consecutive lists instead of +one big list: +.IP +.nf +\f[C] +1.\ \ one +2.\ \ two +3.\ \ three + +<!--\ --> + +a.\ \ uno +b.\ \ dos +c.\ \ tres +\f[] +.fi +.SH HORIZONTAL RULES +.PP +A line containing a row of three or more \f[C]*\f[], \f[C]-\f[], or +\f[C]_\f[] characters (optionally separated by spaces) produces a +horizontal rule: +.IP +.nf +\f[C] +*\ \ *\ \ *\ \ * + +--------------- +\f[] +.fi +.SH TABLES +.PP +\f[I]Pandoc extension\f[]. +.PP +Three kinds of tables may be used. +All three kinds presuppose the use of a fixed-width font, such as +Courier. +.PP +\f[B]Simple tables\f[] look like this: +.IP +.nf +\f[C] +\ \ Right\ \ \ \ \ Left\ \ \ \ \ Center\ \ \ \ \ Default +-------\ \ \ \ \ ------\ ----------\ \ \ ------- +\ \ \ \ \ 12\ \ \ \ \ 12\ \ \ \ \ \ \ \ 12\ \ \ \ \ \ \ \ \ \ \ \ 12 +\ \ \ \ 123\ \ \ \ \ 123\ \ \ \ \ \ \ 123\ \ \ \ \ \ \ \ \ \ 123 +\ \ \ \ \ \ 1\ \ \ \ \ 1\ \ \ \ \ \ \ \ \ \ 1\ \ \ \ \ \ \ \ \ \ \ \ \ 1 + +Table:\ \ Demonstration\ of\ simple\ table\ syntax. +\f[] +.fi +.PP +The headers and table rows must each fit on one line. +Column alignments are determined by the position of the header text +relative to the dashed line below it:[3] +.IP \[bu] 2 +If the dashed line is flush with the header text on the right side but +extends beyond it on the left, the column is right-aligned. +.IP \[bu] 2 +If the dashed line is flush with the header text on the left side but +extends beyond it on the right, the column is left-aligned. +.IP \[bu] 2 +If the dashed line extends beyond the header text on both sides, the +column is centered. +.IP \[bu] 2 +If the dashed line is flush with the header text on both sides, the +default alignment is used (in most cases, this will be left). +.PP +The table must end with a blank line, or a line of dashes followed by a +blank line. +A caption may optionally be provided (as illustrated in the example +above). 
+A caption is a paragraph beginning with the string \f[C]Table:\f[] (or +just \f[C]:\f[]), which will be stripped off. +It may appear either before or after the table. +.PP +The column headers may be omitted, provided a dashed line is used to end +the table. +For example: +.IP +.nf +\f[C] +-------\ \ \ \ \ ------\ ----------\ \ \ ------- +\ \ \ \ \ 12\ \ \ \ \ 12\ \ \ \ \ \ \ \ 12\ \ \ \ \ \ \ \ \ \ \ \ \ 12 +\ \ \ \ 123\ \ \ \ \ 123\ \ \ \ \ \ \ 123\ \ \ \ \ \ \ \ \ \ \ 123 +\ \ \ \ \ \ 1\ \ \ \ \ 1\ \ \ \ \ \ \ \ \ \ 1\ \ \ \ \ \ \ \ \ \ \ \ \ \ 1 +-------\ \ \ \ \ ------\ ----------\ \ \ ------- +\f[] +.fi +.PP +When headers are omitted, column alignments are determined on the basis +of the first line of the table body. +So, in the tables above, the columns would be right, left, center, and +right aligned, respectively. +.PP +\f[B]Multiline tables\f[] allow headers and table rows to span multiple +lines of text (but cells that span multiple columns or rows of the table +are not supported). +Here is an example: +.IP +.nf +\f[C] +------------------------------------------------------------- +\ Centered\ \ \ Default\ \ \ \ \ \ \ \ \ \ \ Right\ Left +\ \ Header\ \ \ \ Aligned\ \ \ \ \ \ \ \ \ Aligned\ Aligned +-----------\ -------\ ---------------\ ------------------------- +\ \ \ First\ \ \ \ row\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ 12.0\ Example\ of\ a\ row\ that +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ spans\ multiple\ lines. + +\ \ Second\ \ \ \ row\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ 5.0\ Here\[aq]s\ another\ one.\ Note +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ the\ blank\ line\ between +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ rows. +------------------------------------------------------------- + +Table:\ Here\[aq]s\ the\ caption.\ It,\ too,\ may\ span +multiple\ lines. +\f[] +.fi +.PP +These work like simple tables, but with the following differences: +.IP \[bu] 2 +They must begin with a row of dashes, before the header text (unless the +headers are omitted). +.IP \[bu] 2 +They must end with a row of dashes, then a blank line. +.IP \[bu] 2 +The rows must be separated by blank lines. +.PP +In multiline tables, the table parser pays attention to the widths of +the columns, and the writers try to reproduce these relative widths in +the output. +So, if you find that one of the columns is too narrow in the output, try +widening it in the markdown source. +.PP +Headers may be omitted in multiline tables as well as simple tables: +.IP +.nf +\f[C] +-----------\ -------\ ---------------\ ------------------------- +\ \ \ First\ \ \ \ row\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ 12.0\ Example\ of\ a\ row\ that +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ spans\ multiple\ lines. + +\ \ Second\ \ \ \ row\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ 5.0\ Here\[aq]s\ another\ one.\ Note +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ the\ blank\ line\ between +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ rows. +------------------------------------------------------------- + +:\ Here\[aq]s\ a\ multiline\ table\ without\ headers. +\f[] +.fi +.PP +It is possible for a multiline table to have just one row, but the row +should be followed by a blank line (and then the row of dashes that ends +the table), or the table may be interpreted as a simple table. +.PP +\f[B]Grid tables\f[] look like this: +.IP +.nf +\f[C] +:\ Sample\ grid\ table. 
+ ++---------------+---------------+--------------------+ +|\ Fruit\ \ \ \ \ \ \ \ \ |\ Price\ \ \ \ \ \ \ \ \ |\ Advantages\ \ \ \ \ \ \ \ \ | ++===============+===============+====================+ +|\ Bananas\ \ \ \ \ \ \ |\ $1.34\ \ \ \ \ \ \ \ \ |\ -\ built-in\ wrapper\ | +|\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ |\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ |\ -\ bright\ color\ \ \ \ \ | ++---------------+---------------+--------------------+ +|\ Oranges\ \ \ \ \ \ \ |\ $2.10\ \ \ \ \ \ \ \ \ |\ -\ cures\ scurvy\ \ \ \ \ | +|\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ |\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ |\ -\ tasty\ \ \ \ \ \ \ \ \ \ \ \ | ++---------------+---------------+--------------------+ +\f[] +.fi +.PP +The row of \f[C]=\f[]s separates the header from the table body, and can +be omitted for a headerless table. +The cells of grid tables may contain arbitrary block elements (multiple +paragraphs, code blocks, lists, etc.) +\&. +Alignments are not supported, nor are cells that span multiple columns +or rows. +Grid tables can be created easily using Emacs table mode. +.SH TITLE BLOCK +.PP +\f[I]Pandoc extension\f[]. +.PP +If the file begins with a title block +.IP +.nf +\f[C] +%\ title +%\ author(s)\ (separated\ by\ semicolons) +%\ date +\f[] +.fi +.PP +it will be parsed as bibliographic information, not regular text. +(It will be used, for example, in the title of standalone LaTeX or HTML +output.) + The block may contain just a title, a title and an author, or all three +elements. +If you want to include an author but no title, or a title and a date but +no author, you need a blank line: +.IP +.nf +\f[C] +% +%\ Author + +%\ My\ title +% +%\ June\ 15,\ 2006 +\f[] +.fi +.PP +The title may occupy multiple lines, but continuation lines must begin +with leading space, thus: +.IP +.nf +\f[C] +%\ My\ title +\ \ on\ multiple\ lines +\f[] +.fi +.PP +If a document has multiple authors, the authors may be put on separate +lines with leading space, or separated by semicolons, or both. +So, all of the following are equivalent: +.IP +.nf +\f[C] +%\ Author\ One +\ \ Author\ Two + +%\ Author\ One;\ Author\ Two + +%\ Author\ One; +\ \ Author\ Two +\f[] +.fi +.PP +The date must fit on one line. +.PP +All three metadata fields may contain standard inline formatting +(italics, links, footnotes, etc.) +\&. +.PP +Title blocks will always be parsed, but they will affect the output only +when the \f[C]--standalone\f[] (\f[C]-s\f[]) option is chosen. +In HTML output, titles will appear twice: once in the document head -- +this is the title that will appear at the top of the window in a browser +-- and once at the beginning of the document body. +The title in the document head can have an optional prefix attached +(\f[C]--title-prefix\f[] or \f[C]-T\f[] option). +The title in the body appears as an H1 element with class "title", so it +can be suppressed or reformatted with CSS. +If a title prefix is specified with \f[C]-T\f[] and no title block +appears in the document, the title prefix will be used by itself as the +HTML title. +.PP +The man page writer extracts a title, man page section number, and other +header and footer information from the title line. +The title is assumed to be the first word on the title line, which may +optionally end with a (single-digit) section number in parentheses. +(There should be no space between the title and the parentheses.) + Anything after this is assumed to be additional footer and header text. +A single pipe character (\f[C]|\f[]) should be used to separate the +footer text from the header text. 
+Thus, +.IP +.nf +\f[C] +%\ PANDOC(1) +\f[] +.fi +.PP +will yield a man page with the title \f[C]PANDOC\f[] and section 1. +.IP +.nf +\f[C] +%\ PANDOC(1)\ Pandoc\ User\ Manuals +\f[] +.fi +.PP +will also have "Pandoc User Manuals" in the footer. +.IP +.nf +\f[C] +%\ PANDOC(1)\ Pandoc\ User\ Manuals\ |\ Version\ 4.0 +\f[] +.fi +.PP +will also have "Version 4.0" in the header. +.SH BACKSLASH ESCAPES +.PP +Except inside a code block or inline code, any punctuation or space +character preceded by a backslash will be treated literally, even if it +would normally indicate formatting. +Thus, for example, if one writes +.IP +.nf +\f[C] +*\\*hello\\** +\f[] +.fi +.PP +one will get +.IP +.nf +\f[C] +<em>*hello*</em> +\f[] +.fi +.PP +instead of +.IP +.nf +\f[C] +<strong>hello</strong> +\f[] +.fi +.PP +This rule is easier to remember than standard markdown\[aq]s rule, which +allows only the following characters to be backslash-escaped: +.IP +.nf +\f[C] +\\`*_{}[]()>#+-.! +\f[] +.fi +.PP +(However, if the \f[C]--strict\f[] option is supplied, the standard +markdown rule will be used.) +.PP +A backslash-escaped space is parsed as a nonbreaking space. +It will appear in TeX output as \f[C]~\f[] and in HTML and XML as +\f[C]\\ \f[] or \f[C]\\ \f[]. +.PP +A backslash-escaped newline (i.e. +a backslash occurring at the end of a line) is parsed as a hard line +break. +It will appear in TeX output as \f[C]\\\\\f[] and in HTML as +\f[C]<br\ />\f[]. +This is a nice alternative to markdown\[aq]s "invisible" way of +indicating hard line breaks using two trailing spaces on a line. +.PP +Backslash escapes do not work in verbatim contexts. +.SH SMART PUNCTUATION +.PP +If the \f[C]--smart\f[] option is specified, pandoc will produce +typographically correct output, converting straight quotes to curly +quotes, \f[C]---\f[] and \f[C]--\f[] to Em-dashes, and \f[C]...\f[] to +ellipses. +Nonbreaking spaces are inserted after certain abbreviations, such as +"Mr." +.SH INLINE FORMATTING +.SS Emphasis +.PP +To \f[I]emphasize\f[] some text, surround it with \f[C]*\f[]s or +\f[C]_\f[], like this: +.IP +.nf +\f[C] +This\ text\ is\ _emphasized\ with\ underscores_,\ and\ this +is\ *emphasized\ with\ asterisks*. +\f[] +.fi +.PP +Double \f[C]*\f[] or \f[C]_\f[] produces \f[B]strong emphasis\f[]: +.IP +.nf +\f[C] +This\ is\ **strong\ emphasis**\ and\ __with\ underscores__. +\f[] +.fi +.PP +A \f[C]*\f[] or \f[C]_\f[] character surrounded by spaces, or +backslash-escaped, will not trigger emphasis: +.IP +.nf +\f[C] +This\ is\ *\ not\ emphasized\ *,\ and\ \\*neither\ is\ this\\*. +\f[] +.fi +.PP +Because \f[C]_\f[] is sometimes used inside words and identifiers, +pandoc does not interpret a \f[C]_\f[] surrounded by alphanumeric +characters as an emphasis marker. +If you want to emphasize just part of a word, use \f[C]*\f[]: +.IP +.nf +\f[C] +feas*ible*,\ not\ feas*able*. +\f[] +.fi +.SS Strikeout +.PP +\f[I]Pandoc extension\f[]. +.PP +To strikeout a section of text with a horizontal line, begin and end it +with \f[C]~~\f[]. +Thus, for example, +.IP +.nf +\f[C] +This\ ~~is\ deleted\ text.~~ +\f[] +.fi +.SS Superscripts and subscripts +.PP +\f[I]Pandoc extension\f[]. +.PP +Superscripts may be written by surrounding the superscripted text by +\f[C]^\f[] characters; subscripts may be written by surrounding the +subscripted text by \f[C]~\f[] characters. +Thus, for example, +.IP +.nf +\f[C] +H~2~O\ is\ a\ liquid.\ \ 2^10^\ is\ 1024. 
+\f[] +.fi +.PP +If the superscripted or subscripted text contains spaces, these spaces +must be escaped with backslashes. +(This is to prevent accidental superscripting and subscripting through +the ordinary use of \f[C]~\f[] and \f[C]^\f[].) + Thus, if you want the letter P with \[aq]a cat\[aq] in subscripts, use +\f[C]P~a\\\ cat~\f[], not \f[C]P~a\ cat~\f[]. +.SS Verbatim +.PP +To make a short span of text verbatim, put it inside backticks: +.IP +.nf +\f[C] +What\ is\ the\ difference\ between\ `>>=`\ and\ `>>`? +\f[] +.fi +.PP +If the verbatim text includes a backtick, use double backticks: +.IP +.nf +\f[C] +Here\ is\ a\ literal\ backtick\ ``\ `\ ``. +\f[] +.fi +.PP +(The spaces after the opening backticks and before the closing backticks +will be ignored.) +.PP +The general rule is that a verbatim span starts with a string of +consecutive backticks (optionally followed by a space) and ends with a +string of the same number of backticks (optionally preceded by a space). +.PP +Note that backslash-escapes (and other markdown constructs) do not work +in verbatim contexts: +.IP +.nf +\f[C] +This\ is\ a\ backslash\ followed\ by\ an\ asterisk:\ `\\*`. +\f[] +.fi +.SH MATH +.PP +\f[I]Pandoc extension\f[]. +.PP +Anything between two \f[C]$\f[] characters will be treated as TeX math. +The opening \f[C]$\f[] must have a character immediately to its right, +while the closing \f[C]$\f[] must have a character immediately to its +left. +Thus, \f[C]$20,000\ and\ $30,000\f[] won\[aq]t parse as math. +If for some reason you need to enclose text in literal \f[C]$\f[] +characters, backslash-escape them and they won\[aq]t be treated as math +delimiters. +.PP +TeX math will be printed in all output formats. +How it is rendered depends on the output format: +.TP +.B Markdown, reStructuredText, LaTeX, Org-Mode, ConTeXt +It will appear verbatim between \f[C]$\f[] characters. +.RS +.RE +.TP +.B reStructuredText +It will be rendered using an interpreted text role \f[C]:math:\f[], as +described here. +.RS +.RE +.TP +.B Texinfo +It will be rendered inside a \f[C]\@math\f[] command. +.RS +.RE +.TP +.B groff man +It will be rendered verbatim without \f[C]$\f[]\[aq]s. +.RS +.RE +.TP +.B MediaWiki +It will be rendered inside \f[C]<math>\f[] tags. +.RS +.RE +.TP +.B Textile +It will be rendered inside \f[C]<span\ class="math">\f[] tags. +.RS +.RE +.TP +.B RTF, Docbook, OpenDocument, ODT +It will be rendered, if possible, using unicode characters, and will +otherwise appear verbatim. +.RS +.RE +.TP +.B HTML, Slidy, S5, EPUB +The way math is rendered in HTML will depend on the command-line options +selected: +.RS +.IP "1." 3 +The default is to render TeX math as far as possible using unicode +characters, as with RTF, Docbook, and OpenDocument output. +Formulas are put inside a \f[C]span\f[] with \f[C]class="math"\f[], so +that they may be styled differently from the surrounding text if needed. +.IP "2." 3 +If the \f[C]--latexmathml\f[] option is used, TeX math will be displayed +between $ or $$ characters and put in \f[C]<span>\f[] tags with class +\f[C]LaTeX\f[]. +The LaTeXMathML script will be used to render it as formulas. +(This trick does not work in all browsers, but it works in Firefox. +In browsers that do not support LaTeXMathML, TeX math will appear +verbatim between $ characters.) +.IP "3." 3 +If the \f[C]--jsmath\f[] option is used, TeX math will be put inside +\f[C]<span>\f[] tags (for inline math) or \f[C]<div>\f[] tags (for +display math) with class \f[C]math\f[]. 
+The jsMath script will be used to render it. +.IP "4." 3 +If the \f[C]--mimetex\f[] option is used, the mimeTeX CGI script will be +called to generate images for each TeX formula. +This should work in all browsers. +The \f[C]--mimetex\f[] option takes an optional URL as argument. +If no URL is specified, it will be assumed that the mimeTeX CGI script +is at \f[C]/cgi-bin/mimetex.cgi\f[]. +.IP "5." 3 +If the \f[C]--gladtex\f[] option is used, TeX formulas will be enclosed +in \f[C]<eq>\f[] tags in the HTML output. +The resulting \f[C]htex\f[] file may then be processed by gladTeX, which +will produce image files for each formula and an \f[C]html\f[] file with +links to these images. +So, the procedure is: +.RS 4 +.IP +.nf +\f[C] +pandoc\ -s\ --gladtex\ myfile.txt\ -o\ myfile.htex +gladtex\ -d\ myfile-images\ myfile.htex +#\ produces\ myfile.html\ and\ images\ in\ myfile-images +\f[] +.fi +.RE +.IP "6." 3 +If the \f[C]--webtex\f[] option is used, TeX formulas will be converted +to \f[C]<img>\f[] tags that link to an external script that converts +formulas to images. +The formula will be URL-encoded and concatenated with the URL provided. +If no URL is specified, the Google Chart API will be used +(\f[C]http://chart.apis.google.com/chart?cht=tx&chl=\f[]). +.RE +.SH RAW HTML +.PP +Markdown allows you to insert raw HTML anywhere in a document (except +verbatim contexts, where \f[C]<\f[], \f[C]>\f[], and \f[C]&\f[] are +interpreted literally). +.PP +The raw HTML is passed through unchanged in HTML, S5, Slidy, EPUB, +Markdown, and Textile output, and suppressed in other formats. +.PP +\f[I]Pandoc extension\f[]. +.PP +Standard markdown allows you to include HTML "blocks": blocks of HTML +between balanced tags that are separated from the surrounding text with +blank lines, and start and end at the left margin. +Within these blocks, everything is interpreted as HTML, not markdown; so +(for example), \f[C]*\f[] does not signify emphasis. +.PP +Pandoc behaves this way when \f[C]--strict\f[] is specified; but by +default, pandoc interprets material between HTML block tags as markdown. +Thus, for example, Pandoc will turn +.IP +.nf +\f[C] +<table> +\ <tr> +\ \ <td>*one*</td> +\ \ <td>[a\ link](http://google.com)</td> +\ </tr> +</table> +\f[] +.fi +.PP +into +.IP +.nf +\f[C] +<table> +\ <tr> +\ \ <td><em>one</em></td> +\ \ <td><a\ href="http://google.com">a\ link</a></td> +\ </tr> +</table> +\f[] +.fi +.PP +whereas \f[C]Markdown.pl\f[] will preserve it as is. +.PP +There is one exception to this rule: text between \f[C]<script>\f[] and +\f[C]<style>\f[] tags is not interpreted as markdown. +.PP +This departure from standard markdown should make it easier to mix +markdown with HTML block elements. +For example, one can surround a block of markdown text with +\f[C]<div>\f[] tags without preventing it from being interpreted as +markdown. +.SH RAW TEX +.PP +\f[I]Pandoc extension\f[]. +.PP +In addition to raw HTML, pandoc allows raw LaTeX, TeX, and ConTeXt to be +included in a document. +Inline TeX commands will be preserved and passed unchanged to the LaTeX +and ConTeXt writers. +Thus, for example, you can use LaTeX to include BibTeX citations: +.IP +.nf +\f[C] +This\ result\ was\ proved\ in\ \\cite{jones.1967}. 
+\f[] +.fi +.PP +Note that in LaTeX environments, like +.IP +.nf +\f[C] +\\begin{tabular}{|l|l|}\\hline +Age\ &\ Frequency\ \\\\\ \\hline +18--25\ \ &\ 15\ \\\\ +26--35\ \ &\ 33\ \\\\\ +36--45\ \ &\ 22\ \\\\\ \\hline +\\end{tabular} +\f[] +.fi +.PP +the material between the begin and end tags will be interpreted as raw +LaTeX, not as markdown. +.PP +Inline LaTeX is ignored in output formats other than Markdown, LaTeX, +and ConTeXt. +.SS Macros +.PP +For output formats other than LaTeX, pandoc will parse LaTeX +\f[C]\\newcommand\f[] and \f[C]\\renewcommand\f[] definitions and apply +the resulting macros to all LaTeX math. +So, for example, the following will work in all output formats, not just +LaTeX: +.IP +.nf +\f[C] +\\newcommand{\\tuple}[1]{\\langle\ #1\ \\rangle} + +$\\tuple{a,\ b,\ c}$ +\f[] +.fi +.PP +In LaTeX output, the \f[C]\\newcommand\f[] definition will simply be +passed unchanged to the output. +.SH LINKS +.PP +Markdown allows links to be specified in several ways. +.SS Automatic links +.PP +If you enclose a URL or email address in pointy brackets, it will become +a link: +.IP +.nf +\f[C] +<http://google.com> +<sam\@green.eggs.ham> +\f[] +.fi +.SS Inline links +.PP +An inline link consists of the link text in square brackets, followed by +the URL in parentheses. +(Optionally, the URL can be followed by a link title, in quotes.) +.IP +.nf +\f[C] +This\ is\ an\ [inline\ link](/url),\ and\ here\[aq]s\ [one\ with +a\ title](http://fsf.org\ "click\ here\ for\ a\ good\ time!"). +\f[] +.fi +.PP +There can be no space between the bracketed part and the parenthesized +part. +The link text can contain formatting (such as emphasis), but the title +cannot. +.SS Reference links +.PP +An \f[I]explicit\f[] reference link has two parts, the link itself and +the link definition, which may occur elsewhere in the document (either +before or after the link). +.PP +The link consists of link text in square brackets, followed by a label +in square brackets. +(There can be space between the two.) + The link definition must begin at the left margin or indented no more +than three spaces. +It consists of the bracketed label, followed by a colon and a space, +followed by the URL, and optionally (after a space) a link title either +in quotes or in parentheses. +.PP +Here are some examples: +.IP +.nf +\f[C] +[my\ label\ 1]:\ /foo/bar.html\ \ "My\ title,\ optional" +[my\ label\ 2]:\ /foo +[my\ label\ 3]:\ http://fsf.org\ (The\ free\ software\ foundation) +[my\ label\ 4]:\ /bar#special\ \ \[aq]A\ title\ in\ single\ quotes\[aq] +\f[] +.fi +.PP +The URL may optionally be surrounded by angle brackets: +.IP +.nf +\f[C] +[my\ label\ 5]:\ <http://foo.bar.baz> +\f[] +.fi +.PP +The title may go on the next line: +.IP +.nf +\f[C] +[my\ label\ 3]:\ http://fsf.org +\ \ "The\ free\ software\ foundation" +\f[] +.fi +.PP +Note that link labels are not case sensitive. +So, this will work: +.IP +.nf +\f[C] +Here\ is\ [my\ link][FOO] + +[Foo]:\ /bar/baz +\f[] +.fi +.PP +In an \f[I]implicit\f[] reference link, the second pair of brackets is +empty, or omitted entirely: +.IP +.nf +\f[C] +See\ [my\ website][],\ or\ [my\ website]. + +[my\ website]:\ http://foo.bar.baz +\f[] +.fi +.SH IMAGES +.PP +A link immediately preceded by a \f[C]!\f[] will be treated as an image. +The link text will be used as the image\[aq]s alt text: +.IP +.nf +\f[C] + + +![movie\ reel] + +[movie\ reel]:\ movie.gif +\f[] +.fi +.SS Pictures with captions +.PP +\f[I]Pandoc extension\f[]. 
+.PP +An image occurring by itself in a paragraph will be rendered as a figure +with a caption.[4] (In LaTeX, a figure environment will be used; in +HTML, the image will be placed in a \f[C]div\f[] with class +\f[C]figure\f[], together with a caption in a \f[C]p\f[] with class +\f[C]caption\f[].) + The image\[aq]s alt text will be used as the caption. +.IP +.nf +\f[C] + +\f[] +.fi +.PP +If you just want a regular inline image, just make sure it is not the +only thing in the paragraph. +One way to do this is to insert a nonbreaking space after the image: +.IP +.nf +\f[C] +![This\ image\ won\[aq]t\ be\ a\ figure](/url/of/image.png)\\\ +\f[] +.fi +.SH FOOTNOTES +.PP +\f[I]Pandoc extension\f[]. +.PP +Pandoc\[aq]s markdown allows footnotes, using the following syntax: +.IP +.nf +\f[C] +Here\ is\ a\ footnote\ reference,[^1]\ and\ another.[^longnote] + +[^1]:\ Here\ is\ the\ footnote. + +[^longnote]:\ Here\[aq]s\ one\ with\ multiple\ blocks. + +\ \ \ \ Subsequent\ paragraphs\ are\ indented\ to\ show\ that\ they\ +belong\ to\ the\ previous\ footnote. + +\ \ \ \ \ \ \ \ {\ some.code\ } + +\ \ \ \ The\ whole\ paragraph\ can\ be\ indented,\ or\ just\ the\ first +\ \ \ \ line.\ \ In\ this\ way,\ multi-paragraph\ footnotes\ work\ like +\ \ \ \ multi-paragraph\ list\ items. + +This\ paragraph\ won\[aq]t\ be\ part\ of\ the\ note,\ because\ it +isn\[aq]t\ indented. +\f[] +.fi +.PP +The identifiers in footnote references may not contain spaces, tabs, or +newlines. +These identifiers are used only to correlate the footnote reference with +the note itself; in the output, footnotes will be numbered sequentially. +.PP +The footnotes themselves need not be placed at the end of the document. +They may appear anywhere except inside other block elements (lists, +block quotes, tables, etc.) +\&. +.PP +Inline footnotes are also allowed (though, unlike regular notes, they +cannot contain multiple paragraphs). +The syntax is as follows: +.IP +.nf +\f[C] +Here\ is\ an\ inline\ note.^[Inlines\ notes\ are\ easier\ to\ write,\ since +you\ don\[aq]t\ have\ to\ pick\ an\ identifier\ and\ move\ down\ to\ type\ the +note.] +\f[] +.fi +.PP +Inline and regular footnotes may be mixed freely. +.SH CITATIONS +.PP +\f[I]Pandoc extension\f[]. +.PP +Pandoc can automatically generate citations and a bibliography in a +number of styles (using Andrea Rossato\[aq]s \f[C]hs-citeproc\f[]). +In order to use this feature, you will need a bibliographic database in +one of the following formats: +.PP +.TS +tab(@); +l l. +T{ +Format +T}@T{ +File extension +T} +_ +T{ +MODS +T}@T{ +\&.mods +T} +T{ +BibTeX +T}@T{ +\&.bib +T} +T{ +BibLaTeX +T}@T{ +\&.bbx +T} +T{ +RIS +T}@T{ +\&.ris +T} +T{ +EndNote +T}@T{ +\&.enl +T} +T{ +EndNote XML +T}@T{ +\&.xml +T} +T{ +ISI +T}@T{ +\&.wos +T} +T{ +MEDLINE +T}@T{ +\&.medline +T} +T{ +Copac +T}@T{ +\&.copac +T} +T{ +JSON citeproc +T}@T{ +\&.json +T} +.TE +.PP +You will need to specify the bibliography file using the +\f[C]--bibliography\f[] command-line option (which may be repeated if +you have several bibliographies). +.PP +By default, pandoc will use a Chicago author-date format for citations +and references. +To use another style, you will need to use the \f[C]--csl\f[] option to +specify a CSL 1.0 style file. +A primer on creating and modifying CSL styles can be found at +\f[C]http://citationstyles.org/downloads/primer.html\f[]. +A repository of CSL styles can be found at +\f[C]https://github.com/citation-style-language/styles\f[]. +.PP +Citations go inside square brackets and are separated by semicolons. 
+Each citation must have a key, composed of \[aq]\@\[aq] + the citation +identifier from the database, and may optionally have a prefix, a +locator, and a suffix. +Here are some examples: +.IP +.nf +\f[C] +Blah\ blah\ [see\ \@doe99,\ pp.\ 33-35;\ also\ \@smith04,\ ch.\ 1]. + +Blah\ blah\ [\@doe99,\ pp.\ 33-35,\ 38-39\ and\ *passim*]. + +Blah\ blah\ [\@smith04;\ \@doe99]. +\f[] +.fi +.PP +A minus sign (\f[C]-\f[]) before the \f[C]\@\f[] will suppress mention +of the author in the citation. +This can be useful when the author is already mentioned in the text: +.IP +.nf +\f[C] +Smith\ says\ blah\ [-\@smith04]. +\f[] +.fi +.PP +You can also write an in-text citation, as follows: +.IP +.nf +\f[C] +\@smith04\ says\ blah. + +\@smith04\ [p.\ 33]\ says\ blah. +\f[] +.fi +.PP +If the style calls for a list of works cited, it will be placed at the +end of the document. +Normally, you will want to end your document with an appropriate header: +.IP +.nf +\f[C] +last\ paragraph... + +#\ References +\f[] +.fi +.PP +The bibliography will be inserted after this header. +.SH NOTES +.SS [1] +.PP +The point of this rule is to ensure that normal paragraphs starting with +people\[aq]s initials, like +.IP +.nf +\f[C] +B.\ Russell\ was\ an\ English\ philosopher. +\f[] +.fi +.PP +do not get treated as list items. +.PP +This rule will not prevent +.IP +.nf +\f[C] +(C)\ 2007\ Joe\ Smith +\f[] +.fi +.PP +from being interpreted as a list item. +In this case, a backslash escape can be used: +.IP +.nf +\f[C] +(C\\)\ 2007\ Joe\ Smith +\f[] +.fi +.SS [2] +.PP +I have also been influenced by the suggestions of David Wheeler. +.SS [3] +.PP +This scheme is due to Michel Fortin, who proposed it on the Markdown +discussion list. +.SS [4] +.PP +This feature is not yet implemented for RTF, OpenDocument, or ODT. +In those formats, you\[aq]ll just get an image in a paragraph by itself, +with no caption. +.SH SEE ALSO +.PP +\f[C]pandoc\f[] (1). diff --git a/man/man5/pandoc_markdown.5.template b/man/man5/pandoc_markdown.5.template new file mode 100644 index 000000000..f775a4683 --- /dev/null +++ b/man/man5/pandoc_markdown.5.template @@ -0,0 +1,11 @@ +$if(has-tables)$ +.\"t +$endif$ +.TH PANDOC_MARKDOWN 5 "$date$" "$title$" +.SH NAME +pandoc_markdown - markdown syntax for pandoc(1) +.SH DESCRIPTION +$body$ +.SH SEE ALSO +.PP +\f[C]pandoc\f[] (1). diff --git a/pandoc.cabal b/pandoc.cabal index 60e719d12..9d75c6062 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -1,6 +1,6 @@ Name: pandoc -Version: 1.6 -Cabal-Version: >= 1.2 +Version: 1.8.0.1 +Cabal-Version: >= 1.6 Build-Type: Custom License: GPL License-File: COPYING @@ -11,15 +11,16 @@ Bug-Reports: http://code.google.com/p/pandoc/issues/list Stability: alpha Homepage: http://johnmacfarlane.net/pandoc Category: Text -Tested-With: GHC == 6.12.1 +Tested-With: GHC == 6.12.1, GHC == 6.12.3, GHC == 7.0.1 Synopsis: Conversion between markup formats Description: Pandoc is a Haskell library for converting from one markup format to another, and a command-line tool that uses this library. It can read markdown and (subsets of) - reStructuredText, HTML, and LaTeX, and it can write + reStructuredText, HTML, LaTeX and Textile, and it can write markdown, reStructuredText, HTML, LaTeX, ConTeXt, Docbook, - OpenDocument, ODT, RTF, MediaWiki, groff man pages, EPUB, - and S5 and Slidy HTML slide shows. + OpenDocument, ODT, RTF, MediaWiki, Textile, groff man pages, + plain text, Emacs Org-Mode, EPUB, and S5 and Slidy HTML + slide shows. . 
Pandoc extends standard markdown syntax with footnotes, embedded LaTeX, definition lists, tables, and other @@ -43,6 +44,7 @@ Data-Files: templates/rst.template, templates/plain.template, templates/mediawiki.template, templates/rtf.template, templates/s5.template, templates/slidy.template, + templates/textile.template, templates/org.template -- data for ODT writer reference.odt, -- stylesheet for EPUB writer @@ -62,11 +64,23 @@ Data-Files: -- data for slidy writer slidy/slidy.min.css, slidy/slidy.min.js, + -- data for citeproc + default.csl, -- documentation README, INSTALL, COPYRIGHT, BUGS, changelog Extra-Source-Files: -- sources for man pages - man/man1/pandoc.1.md, man/man1/markdown2pdf.1.md, + man/man1/markdown2pdf.1.md, + -- code to create pandoc.1 man page + MakeManPage.hs, + man/man1/pandoc.1.template, + man/man5/pandoc_markdown.5.template, + -- generated man pages (produced post-build) + man/man1/markdown2pdf.1, + man/man1/pandoc.1, + man/man5/pandoc_markdown.5, + -- benchmarks + Benchmark.hs, -- tests tests/bodybg.gif, tests/html-reader.html, @@ -74,10 +88,21 @@ Extra-Source-Files: tests/insert, tests/lalune.jpg, tests/movie.jpg, + tests/biblio.bib, + tests/chicago-author-date.csl, + tests/ieee.csl, + tests/mhra.csl, tests/latex-reader.latex, tests/latex-reader.native, + tests/textile-reader.textile, + tests/textile-reader.native, tests/markdown-reader-more.txt, tests/markdown-reader-more.native, + tests/markdown-citations.txt, + tests/markdown-citations.chicago-author-date.txt, + tests/markdown-citations.mhra.txt, + tests/markdown-citations.ieee.txt, + tests/textile-reader.textile, tests/rst-reader.native, tests/rst-reader.rst, tests/s5.basic.html, @@ -93,8 +118,10 @@ Extra-Source-Files: tests/tables.plain, tests/tables.markdown, tests/tables.mediawiki, + tests/tables.textile, tests/tables.native, tests/tables.opendocument, + tests/tables.org, tests/tables.texinfo, tests/tables.rst, tests/tables.rtf, @@ -110,8 +137,10 @@ Extra-Source-Files: tests/writer.markdown, tests/writer.plain, tests/writer.mediawiki, + tests/writer.textile, tests/writer.native, tests/writer.opendocument, + tests/writer.org, tests/writer.rst, tests/writer.rtf, tests/writer.texinfo, @@ -124,9 +153,12 @@ Extra-Source-Files: tests/lhs-test.latex+lhs, tests/lhs-test.html, tests/lhs-test.html+lhs, - tests/lhs-test.fragment.html+lhs, - tests/RunTests.hs -Extra-Tmp-Files: man/man1/pandoc.1, man/man1/markdown2pdf.1 + tests/lhs-test.nohl.html, + tests/lhs-test.nohl.html+lhs, + tests/lhs-test.fragment.html+lhs +Extra-Tmp-Files: man/man1/pandoc.1, + man/man1/markdown2pdf.1, + man/man5/pandoc_markdown.5 Flag threaded Description: Compile markdown2pdf with -threaded option. @@ -143,44 +175,57 @@ Flag library Flag wrappers Description: Build the wrappers (markdown2pdf). Default: True -Flag citeproc - Description: Compile in support for citeproc-hs bibliographic formatting. +Flag tests + Description: Build test-pandoc. + Default: False +Flag benchmarks + Description: Build benchmark-pandoc. Default: False Library -- Note: the following material must be in both Library and Executable stanzas. -- It needs to be duplicated because of the library & executable flags. 
-- BEGIN DUPLICATED SECTION - Build-Depends: pretty >= 1, containers >= 0.1, - parsec >= 2.1, xhtml >= 3000.0, - mtl >= 1.1, network >= 2, filepath >= 1.1, - process >= 1, directory >= 1, - bytestring >= 0.9, zip-archive >= 0.1.1.4, - utf8-string >= 0.3, old-time >= 1, - HTTP >= 4000.0.5, texmath >= 0.3, xml >= 1.3.5 && < 1.4, - random, extensible-exceptions + Build-Depends: containers >= 0.1 && < 0.5, + parsec >= 2.1 && < 3.2, + xhtml >= 3000.0 && < 3000.3, + mtl >= 1.1 && < 2.1, + network >= 2 && < 2.4, + filepath >= 1.1 && < 1.3, + process >= 1 && < 1.1, + directory >= 1 && < 1.2, + bytestring >= 0.9 && < 1.0, + zip-archive >= 0.1.1.7 && < 0.2, + utf8-string >= 0.3 && < 0.4, + old-time >= 1 && < 1.1, + HTTP >= 4000.0.5 && < 4000.2, + texmath >= 0.5 && < 0.6, + xml >= 1.3.5 && < 1.4, + random >= 1 && < 1.1, + extensible-exceptions >= 0.1 && < 0.2, + citeproc-hs >= 0.3.1 && < 0.4, + pandoc-types == 1.8.*, + json >= 0.4 && < 0.5, + dlist >= 0.4 && < 0.6, + tagsoup >= 0.12 && < 0.13 if impl(ghc >= 6.10) - Build-depends: base >= 4 && < 5, syb + Build-depends: base >= 4 && < 5, syb >= 0.1 && < 0.4 else Build-depends: base >= 3 && < 4 if flag(highlighting) - Build-depends: highlighting-kate >= 0.2.7.1 + Build-depends: highlighting-kate >= 0.2.9 && < 0.3 cpp-options: -D_HIGHLIGHTING - if flag(citeproc) - Build-depends: citeproc-hs >= 0.2 - cpp-options: -D_CITEPROC if impl(ghc >= 6.12) Ghc-Options: -O2 -Wall -fno-warn-unused-do-bind else Ghc-Options: -O2 -Wall - Ghc-Prof-Options: -auto-all -caf-all + Ghc-Prof-Options: -auto-all -caf-all Extensions: CPP Hs-Source-Dirs: src -- END DUPLICATED SECTION Exposed-Modules: Text.Pandoc, - Text.Pandoc.Blocks, - Text.Pandoc.Definition, + Text.Pandoc.Pretty, Text.Pandoc.CharacterReferences, Text.Pandoc.Shared, Text.Pandoc.Parsing, @@ -190,6 +235,8 @@ Library Text.Pandoc.Readers.Markdown, Text.Pandoc.Readers.RST, Text.Pandoc.Readers.TeXMath, + Text.Pandoc.Readers.Textile, + Text.Pandoc.Readers.Native, Text.Pandoc.Writers.Native, Text.Pandoc.Writers.Docbook, Text.Pandoc.Writers.HTML, @@ -200,19 +247,20 @@ Library Text.Pandoc.Writers.Man, Text.Pandoc.Writers.Markdown, Text.Pandoc.Writers.RST, + Text.Pandoc.Writers.Org, + Text.Pandoc.Writers.Textile, Text.Pandoc.Writers.MediaWiki, Text.Pandoc.Writers.RTF, Text.Pandoc.Writers.ODT, Text.Pandoc.Writers.EPUB, Text.Pandoc.S5, Text.Pandoc.Templates + Text.Pandoc.Biblio Other-Modules: Text.Pandoc.XML, Text.Pandoc.UTF8, Text.Pandoc.UUID, Paths_pandoc - if flag(citeproc) - Exposed-Modules: Text.Pandoc.Biblio if flag(library) Buildable: True else @@ -222,38 +270,49 @@ Executable pandoc -- Note: the following material must be in both Library and Executable stanzas. -- It needs to be duplicated because of the library & executable flags. 
-- BEGIN DUPLICATED SECTION - Build-Depends: pretty >= 1, containers >= 0.1, - parsec >= 2.1, xhtml >= 3000.0, - mtl >= 1.1, network >= 2, filepath >= 1.1, - process >= 1, directory >= 1, - bytestring >= 0.9, zip-archive >= 0.1.1.4, - utf8-string >= 0.3, old-time >= 1, - HTTP >= 4000.0.5, texmath, xml >= 1.3.5 && < 1.4, - random, extensible-exceptions + Build-Depends: containers >= 0.1 && < 0.5, + parsec >= 2.1 && < 3.2, + xhtml >= 3000.0 && < 3000.3, + mtl >= 1.1 && < 2.1, + network >= 2 && < 2.4, + filepath >= 1.1 && < 1.3, + process >= 1 && < 1.1, + directory >= 1 && < 1.2, + bytestring >= 0.9 && < 1.0, + zip-archive >= 0.1.1.7 && < 0.2, + utf8-string >= 0.3 && < 0.4, + old-time >= 1 && < 1.1, + HTTP >= 4000.0.5 && < 4000.2, + texmath >= 0.5 && < 0.6, + xml >= 1.3.5 && < 1.4, + random >= 1 && < 1.1, + extensible-exceptions >= 0.1 && < 0.2, + citeproc-hs >= 0.3.1 && < 0.4, + pandoc-types == 1.8.*, + json >= 0.4 && < 0.5, + dlist >= 0.4 && < 0.6, + tagsoup >= 0.12 && < 0.13 if impl(ghc >= 6.10) - Build-depends: base >= 4 && < 5, syb + Build-depends: base >= 4 && < 5, syb >= 0.1 && < 0.4 else Build-depends: base >= 3 && < 4 if flag(highlighting) - Build-depends: highlighting-kate >= 0.2.7.1 + Build-depends: highlighting-kate >= 0.2.9 && < 0.3 cpp-options: -D_HIGHLIGHTING - if flag(citeproc) - Build-depends: citeproc-hs >= 0.2 - cpp-options: -D_CITEPROC if impl(ghc >= 6.12) Ghc-Options: -O2 -Wall -fno-warn-unused-do-bind else Ghc-Options: -O2 -Wall - Ghc-Prof-Options: -auto-all -caf-all + Ghc-Prof-Options: -auto-all -caf-all Extensions: CPP Hs-Source-Dirs: src -- END DUPLICATED SECTION Main-Is: pandoc.hs if flag(executable) || flag(wrappers) - Buildable: True + Buildable: True else - Buildable: False + Buildable: False Executable markdown2pdf Hs-Source-Dirs: src @@ -265,7 +324,41 @@ Executable markdown2pdf Ghc-Prof-Options: -auto-all Extensions: CPP if flag(wrappers) - Buildable: True + Buildable: True else - Buildable: False + Buildable: False +Executable test-pandoc + Hs-Source-Dirs: src + Main-Is: test-pandoc.hs + if flag(highlighting) + cpp-options: -D_HIGHLIGHTING + if impl(ghc >= 7) + cpp-options: -D_LIT=lit + else + cpp-options: -D_LIT=$lit + if !flag(tests) + Buildable: False + else + if impl(ghc >= 6.12) + Ghc-Options: -Wall -fno-warn-unused-do-bind + else + Ghc-Options: -Wall + Extensions: CPP + Build-Depends: base >= 4 && < 5, Diff, test-framework >= 0.3 && < 0.4, + test-framework-hunit >= 0.2 && < 0.3, + test-framework-quickcheck2 >= 0.2.9 && < 0.3, + QuickCheck >= 2.4 && < 2.6, + HUnit >= 1.2 && < 1.3, + template-haskell >= 2.4 && < 2.6, + ansi-terminal == 0.5.* + Other-Modules: Tests.Old + Tests.Helpers + Tests.Arbitrary + Tests.Shared + Tests.Readers.LaTeX + Tests.Readers.Markdown + Tests.Readers.RST + Tests.Writers.Native + Tests.Writers.ConTeXt + Tests.Writers.HTML diff --git a/src/Tests/Arbitrary.hs b/src/Tests/Arbitrary.hs new file mode 100644 index 000000000..978717bef --- /dev/null +++ b/src/Tests/Arbitrary.hs @@ -0,0 +1,181 @@ +{-# OPTIONS_GHC -fno-warn-orphans #-} +{-# LANGUAGE TypeSynonymInstances #-} +-- provides Arbitrary instance for Pandoc types +module Tests.Arbitrary () +where +import Test.QuickCheck.Gen +import Test.QuickCheck.Arbitrary +import Control.Monad (liftM, liftM2) +import Text.Pandoc +import Text.Pandoc.Shared +import Text.Pandoc.Builder + +realString :: Gen String +realString = resize 8 arbitrary -- elements wordlist + +{- +wordlist :: [String] +wordlist = ["foo","Bar","baz","\\","/",":","\"","'","féé"] +-} + +instance Arbitrary Inlines where + 
arbitrary = liftM fromList arbitrary + +instance Arbitrary Blocks where + arbitrary = liftM fromList arbitrary + +instance Arbitrary Inline where + arbitrary = resize 3 $ arbInline 3 + +-- restrict to 3 levels of nesting max; otherwise we get +-- bogged down in indefinitely large structures +arbInline :: Int -> Gen Inline +arbInline n = frequency $ [ (60, liftM Str realString) + , (60, return Space) + , (10, liftM2 Code arbitrary realString) + , (5, return EmDash) + , (5, return EnDash) + , (5, return Apostrophe) + , (5, return Ellipses) + , (5, elements [ RawInline "html" "<a>*&*</a>" + , RawInline "latex" "\\my{command}" ]) + ] ++ [ x | x <- nesters, n > 1] + where nesters = [ (10, liftM Emph $ listOf $ arbInline (n-1)) + , (10, liftM Strong $ listOf $ arbInline (n-1)) + , (10, liftM Strikeout $ listOf $ arbInline (n-1)) + , (10, liftM Superscript $ listOf $ arbInline (n-1)) + , (10, liftM Subscript $ listOf $ arbInline (n-1)) + , (10, liftM SmallCaps $ listOf $ arbInline (n-1)) + , (10, do x1 <- arbitrary + x2 <- listOf $ arbInline (n-1) + return $ Quoted x1 x2) + , (10, do x1 <- arbitrary + x2 <- realString + return $ Math x1 x2) + , (10, do x1 <- listOf $ arbInline (n-1) + x3 <- realString + x2 <- realString + return $ Link x1 (x2,x3)) + , (10, do x1 <- listOf $ arbInline (n-1) + x3 <- realString + x2 <- realString + return $ Image x1 (x2,x3)) + , (2, liftM Note $ resize 3 $ listOf1 arbitrary) + ] + +instance Arbitrary Block where + arbitrary = resize 3 $ arbBlock 3 + +arbBlock :: Int -> Gen Block +arbBlock n = frequency $ [ (10, liftM Plain arbitrary) + , (15, liftM Para arbitrary) + , (5, liftM2 CodeBlock arbitrary realString) + , (2, elements [ RawBlock "html" + "<div>\n*&*\n</div>" + , RawBlock "latex" + "\\begin[opt]{env}\nhi\n{\\end{env}" + ]) + , (5, do x1 <- choose (1 :: Int, 6) + x2 <- arbitrary + return (Header x1 x2)) + , (2, return HorizontalRule) + ] ++ [x | x <- nesters, n > 0] + where nesters = [ (5, liftM BlockQuote $ listOf $ arbBlock (n-1)) + , (5, liftM2 OrderedList arbitrary + $ (listOf1 $ listOf1 $ arbBlock (n-1))) + , (5, liftM BulletList $ (listOf1 $ listOf1 $ arbBlock (n-1))) + , (5, do x1 <- listOf $ listOf1 $ listOf1 $ arbBlock (n-1) + x2 <- arbitrary + return (DefinitionList $ zip x2 x1)) + , (2, do rs <- choose (1 :: Int, 4) + cs <- choose (1 :: Int, 4) + x1 <- arbitrary + x2 <- vector cs + x3 <- vectorOf cs $ elements [0, 0.25] + x4 <- vectorOf cs $ listOf $ arbBlock (n-1) + x5 <- vectorOf rs $ vectorOf cs + $ listOf $ arbBlock (n-1) + return (Table x1 x2 x3 x4 x5)) + ] + +instance Arbitrary Pandoc where + arbitrary = resize 8 $ liftM normalize + $ liftM2 Pandoc arbitrary arbitrary + +{- +instance Arbitrary CitationMode where + arbitrary + = do x <- choose (0 :: Int, 2) + case x of + 0 -> return AuthorInText + 1 -> return SuppressAuthor + 2 -> return NormalCitation + _ -> error "FATAL ERROR: Arbitrary instance, logic bug" + +instance Arbitrary Citation where + arbitrary + = do x1 <- liftM (filter (`notElem` ",;]@ \t\n")) arbitrary + x2 <- arbitrary + x3 <- arbitrary + x4 <- arbitrary + x5 <- arbitrary + x6 <- arbitrary + return (Citation x1 x2 x3 x4 x5 x6) +-} + +instance Arbitrary MathType where + arbitrary + = do x <- choose (0 :: Int, 1) + case x of + 0 -> return DisplayMath + 1 -> return InlineMath + _ -> error "FATAL ERROR: Arbitrary instance, logic bug" + +instance Arbitrary QuoteType where + arbitrary + = do x <- choose (0 :: Int, 1) + case x of + 0 -> return SingleQuote + 1 -> return DoubleQuote + _ -> error "FATAL ERROR: Arbitrary instance, logic 
bug" + +instance Arbitrary Meta where + arbitrary + = do x1 <- arbitrary + x2 <- liftM (filter (not . null)) arbitrary + x3 <- arbitrary + return (Meta x1 x2 x3) + +instance Arbitrary Alignment where + arbitrary + = do x <- choose (0 :: Int, 3) + case x of + 0 -> return AlignLeft + 1 -> return AlignRight + 2 -> return AlignCenter + 3 -> return AlignDefault + _ -> error "FATAL ERROR: Arbitrary instance, logic bug" + +instance Arbitrary ListNumberStyle where + arbitrary + = do x <- choose (0 :: Int, 6) + case x of + 0 -> return DefaultStyle + 1 -> return Example + 2 -> return Decimal + 3 -> return LowerRoman + 4 -> return UpperRoman + 5 -> return LowerAlpha + 6 -> return UpperAlpha + _ -> error "FATAL ERROR: Arbitrary instance, logic bug" + +instance Arbitrary ListNumberDelim where + arbitrary + = do x <- choose (0 :: Int, 3) + case x of + 0 -> return DefaultDelim + 1 -> return Period + 2 -> return OneParen + 3 -> return TwoParens + _ -> error "FATAL ERROR: Arbitrary instance, logic bug" + diff --git a/src/Tests/Helpers.hs b/src/Tests/Helpers.hs new file mode 100644 index 000000000..b8d6b83a7 --- /dev/null +++ b/src/Tests/Helpers.hs @@ -0,0 +1,116 @@ +{-# LANGUAGE TypeSynonymInstances, FlexibleInstances, TemplateHaskell #-} +-- Utility functions for the test suite. + +module Tests.Helpers ( lit + , file + , test + , (=?>) + , property + , ToString(..) + , ToPandoc(..) + ) + where + +import Text.Pandoc.Definition +import Text.Pandoc.Builder (Inlines, Blocks, doc, plain) +import Test.Framework +import Test.Framework.Providers.HUnit +import Test.Framework.Providers.QuickCheck2 +import Test.HUnit (assertBool) +import Text.Pandoc.Shared (normalize, defaultWriterOptions, + WriterOptions(..), removeTrailingSpace) +import Text.Pandoc.Writers.Native (writeNative) +import Language.Haskell.TH.Quote (QuasiQuoter(..)) +import Language.Haskell.TH.Syntax (Q, runIO) +import qualified Test.QuickCheck.Property as QP +import System.Console.ANSI +import Data.Algorithm.Diff + +lit :: QuasiQuoter +lit = QuasiQuoter { + quoteExp = (\a -> let b = rnl a in [|b|]) . 
filter (/= '\r') + , quotePat = error "Cannot use lit as a pattern" + } + where rnl ('\n':xs) = xs + rnl xs = xs + +file :: QuasiQuoter +file = quoteFile lit + +-- adapted from TH 2.5 code +quoteFile :: QuasiQuoter -> QuasiQuoter +quoteFile (QuasiQuoter { quoteExp = qe, quotePat = qp }) = + QuasiQuoter { quoteExp = get qe, quotePat = get qp } + where + get :: (String -> Q a) -> String -> Q a + get old_quoter file_name = do { file_cts <- runIO (readFile file_name) + ; old_quoter file_cts } + +test :: (ToString a, ToString b, ToString c) + => (a -> b) -- ^ function to test + -> String -- ^ name of test case + -> (a, c) -- ^ (input, expected value) + -> Test +test fn name (input, expected) = + testCase name $ assertBool msg (actual' == expected') + where msg = nl ++ dashes "input" ++ nl ++ input' ++ nl ++ + dashes "expected" ++ nl ++ expected'' ++ + dashes "got" ++ nl ++ actual'' ++ + dashes "" + nl = "\n" + input' = toString input + actual' = toString $ fn input + expected' = toString expected + diff = getDiff (lines expected') (lines actual') + expected'' = unlines $ map vividize $ filter (\(d,_) -> d /= S) diff + actual'' = unlines $ map vividize $ filter (\(d,_) -> d /= F) diff + dashes "" = replicate 72 '-' + dashes x = replicate (72 - length x - 5) '-' ++ " " ++ x ++ " ---" + +vividize :: (DI,String) -> String +vividize (B,s) = s +vividize (_,s) = vivid s + +property :: QP.Testable a => TestName -> a -> Test +property = testProperty + +vivid :: String -> String +vivid s = setSGRCode [SetColor Background Dull Red + , SetColor Foreground Vivid White] ++ s + ++ setSGRCode [Reset] + +infix 6 =?> +(=?>) :: a -> b -> (a,b) +x =?> y = (x, y) + +class ToString a where + toString :: a -> String + +instance ToString Pandoc where + toString d = writeNative defaultWriterOptions{ writerStandalone = s } + $ toPandoc d + where s = case d of + (Pandoc (Meta [] [] []) _) -> False + _ -> True + +instance ToString Blocks where + toString = writeNative defaultWriterOptions . toPandoc + +instance ToString Inlines where + toString = removeTrailingSpace . writeNative defaultWriterOptions . + toPandoc + +instance ToString String where + toString = id + +class ToPandoc a where + toPandoc :: a -> Pandoc + +instance ToPandoc Pandoc where + toPandoc = normalize + +instance ToPandoc Blocks where + toPandoc = normalize . doc + +instance ToPandoc Inlines where + toPandoc = normalize . doc . plain diff --git a/src/Tests/Old.hs b/src/Tests/Old.hs new file mode 100644 index 000000000..cb1417ffa --- /dev/null +++ b/src/Tests/Old.hs @@ -0,0 +1,201 @@ +module Tests.Old (tests) where + +import Test.Framework (testGroup, Test ) +import Test.Framework.Providers.HUnit +import Test.HUnit ( assertBool ) + +import System.IO ( openTempFile, stderr ) +import System.Process ( runProcess, waitForProcess ) +import System.FilePath ( (</>), (<.>) ) +import System.Directory +import System.Exit +import Data.Algorithm.Diff +import Text.Pandoc.Shared ( normalize, defaultWriterOptions ) +import Text.Pandoc.Writers.Native ( writeNative ) +import Text.Pandoc.Readers.Native ( readNative ) +import Text.Pandoc.Highlighting ( languages ) +import Prelude hiding ( readFile ) +import qualified Data.ByteString.Lazy as B +import Data.ByteString.Lazy.UTF8 (toString) +import Text.Printf + +readFileUTF8 :: FilePath -> IO String +readFileUTF8 f = B.readFile f >>= return . toString + +pandocPath :: FilePath +pandocPath = ".." 
</> "dist" </> "build" </> "pandoc" </> "pandoc" + +data TestResult = TestPassed + | TestError ExitCode + | TestFailed String FilePath [(DI, String)] + deriving (Eq) + +instance Show TestResult where + show TestPassed = "PASSED" + show (TestError ec) = "ERROR " ++ show ec + show (TestFailed cmd file d) = '\n' : dash ++ + "\n--- " ++ file ++ + "\n+++ " ++ cmd ++ "\n" ++ showDiff (1,1) d ++ + dash + where dash = replicate 72 '-' + +showDiff :: (Int,Int) -> [(DI, String)] -> String +showDiff _ [] = "" +showDiff (l,r) ((F, ln) : ds) = + printf "+%4d " l ++ ln ++ "\n" ++ showDiff (l+1,r) ds +showDiff (l,r) ((S, ln) : ds) = + printf "-%4d " r ++ ln ++ "\n" ++ showDiff (l,r+1) ds +showDiff (l,r) ((B, _ ) : ds) = + showDiff (l+1,r+1) ds + +tests :: [Test] +tests = [ testGroup "markdown" + [ testGroup "writer" + $ writerTests "markdown" ++ lhsWriterTests "markdown" + , testGroup "reader" + [ test "basic" ["-r", "markdown", "-w", "native", "-s", "-S"] + "testsuite.txt" "testsuite.native" + , test "tables" ["-r", "markdown", "-w", "native", "--columns=80"] + "tables.txt" "tables.native" + , test "more" ["-r", "markdown", "-w", "native", "-S"] + "markdown-reader-more.txt" "markdown-reader-more.native" + , lhsReaderTest "markdown+lhs" + ] + , testGroup "citations" markdownCitationTests + ] + , testGroup "rst" + [ testGroup "writer" (writerTests "rst" ++ lhsWriterTests "rst") + , testGroup "reader" + [ test "basic" ["-r", "rst", "-w", "native", + "-s", "-S", "--columns=80"] "rst-reader.rst" "rst-reader.native" + , test "tables" ["-r", "rst", "-w", "native", "--columns=80"] + "tables.rst" "tables-rstsubset.native" + , lhsReaderTest "rst+lhs" + ] + ] + , testGroup "latex" + [ testGroup "writer" (writerTests "latex" ++ lhsWriterTests "latex") + , testGroup "reader" + [ test "basic" ["-r", "latex", "-w", "native", "-s", "-R"] + "latex-reader.latex" "latex-reader.native" + , lhsReaderTest "latex+lhs" + ] + ] + , testGroup "html" + [ testGroup "writer" (writerTests "html" ++ lhsWriterTests "html") + , test "reader" ["-r", "html", "-w", "native", "-s"] + "html-reader.html" "html-reader.native" + ] + , testGroup "s5" + [ s5WriterTest "basic" ["-s"] "s5" + , s5WriterTest "fancy" ["-s","-m","-i"] "s5" + , s5WriterTest "fragment" [] "html" + , s5WriterTest "inserts" ["-s", "-H", "insert", + "-B", "insert", "-A", "insert", "-c", "main.css"] "html" + ] + , testGroup "textile" + [ testGroup "writer" $ writerTests "textile" + , test "reader" ["-r", "textile", "-w", "native", "-s"] + "textile-reader.textile" "textile-reader.native" + ] + , testGroup "native" + [ testGroup "writer" $ writerTests "native" + , test "reader" ["-r", "native", "-w", "native", "-s"] + "testsuite.native" "testsuite.native" + ] + , testGroup "other writers" $ map (\f -> testGroup f $ writerTests f) + [ "docbook", "opendocument" , "context" , "texinfo" + , "man" , "plain" , "mediawiki", "rtf", "org" + ] + ] + +-- makes sure file is fully closed after reading +readFile' :: FilePath -> IO String +readFile' f = do s <- readFileUTF8 f + return $! 
(length s `seq` s) + +lhsWriterTests :: String -> [Test] +lhsWriterTests format + = [ t "lhs to normal" format + , t "lhs to lhs" (format ++ "+lhs") + ] + where + t n f = test n ["--columns=78", "-r", "native", "-s", "-w", f] + "lhs-test.native" ("lhs-test" <.> ext f) + ext f = if null languages && format == "html" + then "nohl" <.> f + else f + +lhsReaderTest :: String -> Test +lhsReaderTest format = + testWithNormalize normalizer "lhs" ["-r", format, "-w", "native"] + ("lhs-test" <.> format) "lhs-test.native" + where normalizer = writeNative defaultWriterOptions . normalize . readNative + +writerTests :: String -> [Test] +writerTests format + = [ test "basic" (opts ++ ["-s"]) "testsuite.native" ("writer" <.> format) + , test "tables" opts "tables.native" ("tables" <.> format) + ] + where + opts = ["-r", "native", "-w", format, "--columns=78"] + +s5WriterTest :: String -> [String] -> String -> Test +s5WriterTest modifier opts format + = test (format ++ " writer (" ++ modifier ++ ")") + (["-r", "native", "-w", format] ++ opts) + "s5.native" ("s5." ++ modifier <.> "html") + +markdownCitationTests :: [Test] +markdownCitationTests + = map styleToTest ["chicago-author-date","ieee","mhra"] + ++ [test "natbib" wopts "markdown-citations.txt" + "markdown-citations.txt"] + where + ropts = ["-r", "markdown", "-w", "markdown", "--bibliography", + "biblio.bib", "--no-wrap"] + wopts = ropts ++ ["--natbib"] + styleToTest style = test style (ropts ++ ["--csl", style ++ ".csl"]) + "markdown-citations.txt" + ("markdown-citations." ++ style ++ ".txt") + +-- | Run a test without normalize function, return True if test passed. +test :: String -- ^ Title of test + -> [String] -- ^ Options to pass to pandoc + -> String -- ^ Input filepath + -> FilePath -- ^ Norm (for test results) filepath + -> Test +test = testWithNormalize id + +-- | Run a test with normalize function, return True if test passed. +testWithNormalize :: (String -> String) -- ^ Normalize function for output + -> String -- ^ Title of test + -> [String] -- ^ Options to pass to pandoc + -> String -- ^ Input filepath + -> FilePath -- ^ Norm (for test results) filepath + -> Test +testWithNormalize normalizer testname opts inp norm = testCase testname $ do + (outputPath, hOut) <- openTempFile "" "pandoc-test" + let inpPath = inp + let normPath = norm + let options = ["--data-dir", ".."] ++ [inpPath] ++ opts + let cmd = pandocPath ++ " " ++ unwords options + ph <- runProcess pandocPath options Nothing + (Just [("LANG","en_US.UTF-8"),("HOME", "./")]) Nothing (Just hOut) + (Just stderr) + ec <- waitForProcess ph + result <- if ec == ExitSuccess + then do + -- filter \r so the tests will work on Windows machines + outputContents <- readFile' outputPath >>= + return . filter (/='\r') . normalizer + normContents <- readFile' normPath >>= + return . filter (/='\r') . 
normalizer + if outputContents == normContents + then return TestPassed + else return + $ TestFailed cmd normPath + $ getDiff (lines outputContents) (lines normContents) + else return $ TestError ec + removeFile outputPath + assertBool (show result) (result == TestPassed) diff --git a/src/Tests/Readers/LaTeX.hs b/src/Tests/Readers/LaTeX.hs new file mode 100644 index 000000000..6d28441f8 --- /dev/null +++ b/src/Tests/Readers/LaTeX.hs @@ -0,0 +1,161 @@ +{-# LANGUAGE OverloadedStrings #-} +module Tests.Readers.LaTeX (tests) where + +import Text.Pandoc.Definition +import Test.Framework +import Tests.Helpers +import Tests.Arbitrary() +import Text.Pandoc.Builder +import Text.Pandoc + +latex :: String -> Pandoc +latex = readLaTeX defaultParserState + +infix 5 =: +(=:) :: ToString c + => String -> (String, c) -> Test +(=:) = test latex + +tests :: [Test] +tests = [ testGroup "basic" + [ "simple" =: + "word" =?> para "word" + , "space" =: + "some text" =?> para ("some text") + , "emphasized" =: + "\\emph{emphasized}" =?> para (emph "emphasized") + ] + + , testGroup "headers" + [ "level 1" =: + "\\section{header}" =?> header 1 "header" + , "level 2" =: + "\\subsection{header}" =?> header 2 "header" + , "level 3" =: + "\\subsubsection{header}" =?> header 3 "header" + , "emph" =: + "\\section{text \\emph{emph}}" =?> + header 1 ("text" +++ space +++ emph "emph") + , "link" =: + "\\section{text \\href{/url}{link}}" =?> + header 1 ("text" +++ space +++ link "/url" "" "link") + ] + + , testGroup "space and comments" + [ "blank lines + space at beginning" =: + "\n \n hi" =?> para "hi" + , "blank lines + space + comments" =: + "% my comment\n\n \n % another\n\nhi" =?> para "hi" + , "comment in paragraph" =: + "hi % this is a comment\nthere\n" =?> para "hi there" + ] + + , testGroup "citations" + [ natbibCitations + , biblatexCitations + ] + ] + +baseCitation :: Citation +baseCitation = Citation{ citationId = "item1" + , citationPrefix = [] + , citationSuffix = [] + , citationMode = AuthorInText + , citationNoteNum = 0 + , citationHash = 0 } + +natbibCitations :: Test +natbibCitations = testGroup "natbib" + [ "citet" =: "\\citet{item1}" + =?> para (cite [baseCitation] empty) + , "suffix" =: "\\citet[p.~30]{item1}" + =?> para + (cite [baseCitation{ citationSuffix = toList $ text "p.\160\&30" }] empty) + , "suffix long" =: "\\citet[p.~30, with suffix]{item1}" + =?> para (cite [baseCitation{ citationSuffix = + toList $ text "p.\160\&30, with suffix" }] empty) + , "multiple" =: "\\citeauthor{item1} \\citetext{\\citeyear{item1}; \\citeyear[p.~30]{item2}; \\citealp[see also][]{item3}}" + =?> para (cite [baseCitation{ citationMode = AuthorInText } + ,baseCitation{ citationMode = SuppressAuthor + , citationSuffix = [Str "p.\160\&30"] + , citationId = "item2" } + ,baseCitation{ citationId = "item3" + , citationPrefix = [Str "see",Space,Str "also"] + , citationMode = NormalCitation } + ] empty) + , "group" =: "\\citetext{\\citealp[see][p.~34--35]{item1}; \\citealp[also][chap. 
3]{item3}}" + =?> para (cite [baseCitation{ citationMode = NormalCitation + , citationPrefix = [Str "see"] + , citationSuffix = [Str "p.\160\&34",EnDash,Str "35"] } + ,baseCitation{ citationMode = NormalCitation + , citationId = "item3" + , citationPrefix = [Str "also"] + , citationSuffix = [Str "chap.",Space,Str "3"] } + ] empty) + , "suffix and locator" =: "\\citep[pp.~33, 35--37, and nowhere else]{item1}" + =?> para (cite [baseCitation{ citationMode = NormalCitation + , citationSuffix = [Str "pp.\160\&33,",Space,Str "35",EnDash,Str "37,",Space,Str "and",Space,Str "nowhere",Space, Str "else"] }] empty) + , "suffix only" =: "\\citep[and nowhere else]{item1}" + =?> para (cite [baseCitation{ citationMode = NormalCitation + , citationSuffix = toList $ text "and nowhere else" }] empty) + , "no author" =: "\\citeyearpar{item1}, and now Doe with a locator \\citeyearpar[p.~44]{item2}" + =?> para (cite [baseCitation{ citationMode = SuppressAuthor }] empty +++ + text ", and now Doe with a locator " +++ + cite [baseCitation{ citationMode = SuppressAuthor + , citationSuffix = [Str "p.\160\&44"] + , citationId = "item2" }] empty) + , "markup" =: "\\citep[\\emph{see}][p. \\textbf{32}]{item1}" + =?> para (cite [baseCitation{ citationMode = NormalCitation + , citationPrefix = [Emph [Str "see"]] + , citationSuffix = [Str "p.",Space, + Strong [Str "32"]] }] empty) + ] + +biblatexCitations :: Test +biblatexCitations = testGroup "biblatex" + [ "textcite" =: "\\textcite{item1}" + =?> para (cite [baseCitation] empty) + , "suffix" =: "\\textcite[p.~30]{item1}" + =?> para + (cite [baseCitation{ citationSuffix = toList $ text "p.\160\&30" }] empty) + , "suffix long" =: "\\textcite[p.~30, with suffix]{item1}" + =?> para (cite [baseCitation{ citationSuffix = + toList $ text "p.\160\&30, with suffix" }] empty) + , "multiple" =: "\\textcites{item1}[p.~30]{item2}[see also][]{item3}" + =?> para (cite [baseCitation{ citationMode = AuthorInText } + ,baseCitation{ citationMode = NormalCitation + , citationSuffix = [Str "p.\160\&30"] + , citationId = "item2" } + ,baseCitation{ citationId = "item3" + , citationPrefix = [Str "see",Space,Str "also"] + , citationMode = NormalCitation } + ] empty) + , "group" =: "\\autocites[see][p.~34--35]{item1}[also][chap. 3]{item3}" + =?> para (cite [baseCitation{ citationMode = NormalCitation + , citationPrefix = [Str "see"] + , citationSuffix = [Str "p.\160\&34",EnDash,Str "35"] } + ,baseCitation{ citationMode = NormalCitation + , citationId = "item3" + , citationPrefix = [Str "also"] + , citationSuffix = [Str "chap.",Space,Str "3"] } + ] empty) + , "suffix and locator" =: "\\autocite[pp.~33, 35--37, and nowhere else]{item1}" + =?> para (cite [baseCitation{ citationMode = NormalCitation + , citationSuffix = [Str "pp.\160\&33,",Space,Str "35",EnDash,Str "37,",Space,Str "and",Space,Str "nowhere",Space, Str "else"] }] empty) + , "suffix only" =: "\\autocite[and nowhere else]{item1}" + =?> para (cite [baseCitation{ citationMode = NormalCitation + , citationSuffix = toList $ text "and nowhere else" }] empty) + , "no author" =: "\\autocite*{item1}, and now Doe with a locator \\autocite*[p.~44]{item2}" + =?> para (cite [baseCitation{ citationMode = SuppressAuthor }] empty +++ + text ", and now Doe with a locator " +++ + cite [baseCitation{ citationMode = SuppressAuthor + , citationSuffix = [Str "p.\160\&44"] + , citationId = "item2" }] empty) + , "markup" =: "\\autocite[\\emph{see}][p. 
\\textbf{32}]{item1}" + =?> para (cite [baseCitation{ citationMode = NormalCitation + , citationPrefix = [Emph [Str "see"]] + , citationSuffix = [Str "p.",Space, + Strong [Str "32"]] }] empty) + , "parencite" =: "\\parencite{item1}" + =?> para (cite [baseCitation{ citationMode = NormalCitation }] empty) + ] diff --git a/src/Tests/Readers/Markdown.hs b/src/Tests/Readers/Markdown.hs new file mode 100644 index 000000000..722a45bdb --- /dev/null +++ b/src/Tests/Readers/Markdown.hs @@ -0,0 +1,29 @@ +{-# LANGUAGE OverloadedStrings, QuasiQuotes #-} +module Tests.Readers.Markdown (tests) where + +import Text.Pandoc.Definition +import Test.Framework +import Tests.Helpers +import Tests.Arbitrary() +import Text.Pandoc.Builder +import Text.Pandoc + +markdown :: String -> Pandoc +markdown = readMarkdown defaultParserState{ stateStandalone = True } + +infix 5 =: +(=:) :: ToString c + => String -> (String, c) -> Test +(=:) = test markdown + +tests :: [Test] +tests = [ testGroup "inline code" + [ "with attribute" =: + "`document.write(\"Hello\");`{.javascript}" + =?> para + (codeWith ("",["javascript"],[]) "document.write(\"Hello\");") + , "with attribute space" =: + "`*` {.haskell .special x=\"7\"}" + =?> para (codeWith ("",["haskell","special"],[("x","7")]) "*") + ] + ] diff --git a/src/Tests/Readers/RST.hs b/src/Tests/Readers/RST.hs new file mode 100644 index 000000000..c0f60ff51 --- /dev/null +++ b/src/Tests/Readers/RST.hs @@ -0,0 +1,46 @@ +{-# LANGUAGE OverloadedStrings, QuasiQuotes #-} +module Tests.Readers.RST (tests) where + +import Text.Pandoc.Definition +import Test.Framework +import Tests.Helpers +import Tests.Arbitrary() +import Text.Pandoc.Builder +import Text.Pandoc + +rst :: String -> Pandoc +rst = readRST defaultParserState{ stateStandalone = True } + +infix 5 =: +(=:) :: ToString c + => String -> (String, c) -> Test +(=:) = test rst + +tests :: [Test] +tests = [ "field list" =: + [_LIT| +:Hostname: media08 +:IP address: 10.0.0.19 +:Size: 3ru +:Date: 2001-08-16 +:Version: 1 +:Authors: - Me + - Myself + - I +:Indentation: Since the field marker may be quite long, the second + and subsequent lines of the field body do not have to line up + with the first line, but they must be indented relative to the + field name marker, and they must line up with each other. 
+:Parameter i: integer +|] =?> ( setAuthors ["Me","Myself","I"] + $ setDate "2001-08-16" + $ doc + $ definitionList [ (str "Hostname", [para "media08"]) + , (str "IP address", [para "10.0.0.19"]) + , (str "Size", [para "3ru"]) + , (str "Version", [para "1"]) + , (str "Indentation", [para "Since the field marker may be quite long, the second and subsequent lines of the field body do not have to line up with the first line, but they must be indented relative to the field name marker, and they must line up with each other."]) + , (str "Parameter i", [para "integer"]) + ]) + ] + diff --git a/src/Tests/Shared.hs b/src/Tests/Shared.hs new file mode 100644 index 000000000..c35a158c1 --- /dev/null +++ b/src/Tests/Shared.hs @@ -0,0 +1,21 @@ +module Tests.Shared (tests) where + +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Test.Framework +import Tests.Helpers +import Tests.Arbitrary() + +tests :: [Test] +tests = [ testGroup "normalize" + [ property "p_normalize_blocks_rt" p_normalize_blocks_rt + , property "p_normalize_inlines_rt" p_normalize_inlines_rt + ] + ] + +p_normalize_blocks_rt :: [Block] -> Bool +p_normalize_blocks_rt bs = normalize bs == normalize (normalize bs) + +p_normalize_inlines_rt :: [Inline] -> Bool +p_normalize_inlines_rt ils = normalize ils == normalize (normalize ils) + diff --git a/src/Tests/Writers/ConTeXt.hs b/src/Tests/Writers/ConTeXt.hs new file mode 100644 index 000000000..704571e95 --- /dev/null +++ b/src/Tests/Writers/ConTeXt.hs @@ -0,0 +1,72 @@ +{-# LANGUAGE OverloadedStrings, QuasiQuotes #-} +module Tests.Writers.ConTeXt (tests) where + +import Test.Framework +import Text.Pandoc.Builder +import Text.Pandoc +import Tests.Helpers +import Tests.Arbitrary() + +context :: (ToString a, ToPandoc a) => a -> String +context = writeConTeXt defaultWriterOptions . toPandoc + +context' :: (ToString a, ToPandoc a) => a -> String +context' = writeConTeXt defaultWriterOptions{ writerWrapText = False } + . 
toPandoc + +{- + "my test" =: X =?> Y + +is shorthand for + + test context "my test" $ X =?> Y + +which is in turn shorthand for + + test context "my test" (X,Y) +-} + +infix 5 =: +(=:) :: (ToString a, ToPandoc a) + => String -> (a, String) -> Test +(=:) = test context + +tests :: [Test] +tests = [ testGroup "inline code" + [ "with '}'" =: code "}" =?> "\\mono{\\letterclosebrace{}}" + , "without '}'" =: code "]" =?> "\\type{]}" + , property "code property" $ \s -> null s || + if '{' `elem` s || '}' `elem` s + then (context' $ code s) == "\\mono{" ++ + (context' $ str s) ++ "}" + else (context' $ code s) == "\\type{" ++ s ++ "}" + ] + , testGroup "headers" + [ "level 1" =: + header 1 "My header" =?> "\\subject{My header}" + , property "header 1 property" $ \ils -> + context' (header 1 ils) == "\\subject{" ++ context' ils ++ "}" + ] + , testGroup "bullet lists" + [ "nested" =: + bulletList [plain (text "top") + ,bulletList [plain (text "next") + ,bulletList [plain (text "bot")]]] + =?> [_LIT| +\startitemize +\item + top +\item + \startitemize + \item + next + \item + \startitemize + \item + bot + \stopitemize + \stopitemize +\stopitemize|] + ] + ] + diff --git a/src/Tests/Writers/HTML.hs b/src/Tests/Writers/HTML.hs new file mode 100644 index 000000000..e13d0dc87 --- /dev/null +++ b/src/Tests/Writers/HTML.hs @@ -0,0 +1,41 @@ +{-# LANGUAGE OverloadedStrings, QuasiQuotes #-} +module Tests.Writers.HTML (tests) where + +import Test.Framework +import Text.Pandoc.Builder +import Text.Pandoc +import Tests.Helpers +import Tests.Arbitrary() +import Text.Pandoc.Highlighting (languages) -- null if no hl support + +html :: (ToString a, ToPandoc a) => a -> String +html = writeHtmlString defaultWriterOptions{ writerWrapText = False } . toPandoc + +{- + "my test" =: X =?> Y + +is shorthand for + + test html "my test" $ X =?> Y + +which is in turn shorthand for + + test html "my test" (X,Y) +-} + +infix 5 =: +(=:) :: (ToString a, ToPandoc a) + => String -> (a, String) -> Test +(=:) = test html + +tests :: [Test] +tests = [ testGroup "inline code" + [ "basic" =: code "@&" =?> "<code>@&</code>" + , "haskell" =: codeWith ("",["haskell"],[]) ">>=" + =?> if null languages + then "<code class=\"haskell\">>>=</code>" + else "<code class=\"sourceCode haskell\"><span class=\"fu\">>>=</span></code>" + , "nolanguage" =: codeWith ("",["nolanguage"],[]) ">>=" + =?> "<code class=\"nolanguage\">>>=</code>" + ] + ] diff --git a/src/Tests/Writers/Native.hs b/src/Tests/Writers/Native.hs new file mode 100644 index 000000000..234fe938a --- /dev/null +++ b/src/Tests/Writers/Native.hs @@ -0,0 +1,20 @@ +module Tests.Writers.Native (tests) where + +import Test.Framework +import Text.Pandoc.Builder +import Text.Pandoc +import Tests.Helpers +import Tests.Arbitrary() + +p_write_rt :: Pandoc -> Bool +p_write_rt d = + read (writeNative defaultWriterOptions{ writerStandalone = True } d) == d + +p_write_blocks_rt :: [Block] -> Bool +p_write_blocks_rt bs = + read (writeNative defaultWriterOptions (Pandoc (Meta [] [] []) bs)) == bs + +tests :: [Test] +tests = [ property "p_write_rt" p_write_rt + , property "p_write_blocks_rt" p_write_blocks_rt + ] diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs index ad429bc93..ef8560284 100644 --- a/src/Text/Pandoc.hs +++ b/src/Text/Pandoc.hs @@ -57,11 +57,18 @@ module Text.Pandoc ( -- * Definitions module Text.Pandoc.Definition + -- * Generics + , module Text.Pandoc.Generic + -- * Lists of readers and writers + , readers + , writers -- * Readers: converting /to/ Pandoc format , readMarkdown , readRST 
, readLaTeX , readHtml + , readTextile + , readNative -- * Parser state used in readers , ParserState (..) , defaultParserState @@ -84,25 +91,34 @@ module Text.Pandoc , writeOpenDocument , writeMan , writeMediaWiki + , writeTextile , writeRTF , writeODT , writeEPUB + , writeOrg -- * Writer options used in writers , WriterOptions (..) , HTMLSlideVariant (..) , HTMLMathMethod (..) + , CiteMethod (..) , defaultWriterOptions -- * Rendering templates and default templates , module Text.Pandoc.Templates -- * Version , pandocVersion + -- * Miscellaneous + , rtfEmbedImage + , jsonFilter ) where import Text.Pandoc.Definition +import Text.Pandoc.Generic import Text.Pandoc.Readers.Markdown import Text.Pandoc.Readers.RST import Text.Pandoc.Readers.LaTeX import Text.Pandoc.Readers.HTML +import Text.Pandoc.Readers.Textile +import Text.Pandoc.Readers.Native import Text.Pandoc.Writers.Native import Text.Pandoc.Writers.Markdown import Text.Pandoc.Writers.RST @@ -117,12 +133,69 @@ import Text.Pandoc.Writers.OpenDocument import Text.Pandoc.Writers.Man import Text.Pandoc.Writers.RTF import Text.Pandoc.Writers.MediaWiki +import Text.Pandoc.Writers.Textile +import Text.Pandoc.Writers.Org import Text.Pandoc.Templates import Text.Pandoc.Parsing import Text.Pandoc.Shared import Data.Version (showVersion) +import Text.JSON.Generic import Paths_pandoc (version) -- | Version number of pandoc library. pandocVersion :: String pandocVersion = showVersion version + +-- | Association list of formats and readers. +readers :: [(String, ParserState -> String -> Pandoc)] +readers = [("native" , \_ -> readNative) + ,("json" , \_ -> decodeJSON) + ,("markdown" , readMarkdown) + ,("markdown+lhs" , \st -> + readMarkdown st{ stateLiterateHaskell = True}) + ,("rst" , readRST) + ,("rst+lhs" , \st -> + readRST st{ stateLiterateHaskell = True}) + ,("textile" , readTextile) -- TODO : textile+lhs + ,("html" , readHtml) + ,("latex" , readLaTeX) + ,("latex+lhs" , \st -> + readLaTeX st{ stateLiterateHaskell = True}) + ] + +-- | Association list of formats and writers (omitting the +-- binary writers, odt and epub). +writers :: [ ( String, WriterOptions -> Pandoc -> String ) ] +writers = [("native" , writeNative) + ,("json" , \_ -> encodeJSON) + ,("html" , writeHtmlString) + ,("html+lhs" , \o -> + writeHtmlString o{ writerLiterateHaskell = True }) + ,("s5" , writeHtmlString) + ,("slidy" , writeHtmlString) + ,("docbook" , writeDocbook) + ,("opendocument" , writeOpenDocument) + ,("latex" , writeLaTeX) + ,("latex+lhs" , \o -> + writeLaTeX o{ writerLiterateHaskell = True }) + ,("context" , writeConTeXt) + ,("texinfo" , writeTexinfo) + ,("man" , writeMan) + ,("markdown" , writeMarkdown) + ,("markdown+lhs" , \o -> + writeMarkdown o{ writerLiterateHaskell = True }) + ,("plain" , writePlain) + ,("rst" , writeRST) + ,("rst+lhs" , \o -> + writeRST o{ writerLiterateHaskell = True }) + ,("mediawiki" , writeMediaWiki) + ,("textile" , writeTextile) + ,("rtf" , writeRTF) + ,("org" , writeOrg) + ] + +-- | Converts a transformation on the Pandoc AST into a function +-- that reads and writes a JSON-encoded string. This is useful +-- for writing small scripts. +jsonFilter :: (Pandoc -> Pandoc) -> String -> String +jsonFilter f = encodeJSON . f . 
decodeJSON diff --git a/src/Text/Pandoc/Biblio.hs b/src/Text/Pandoc/Biblio.hs index 436eadd68..d65c9de1c 100644 --- a/src/Text/Pandoc/Biblio.hs +++ b/src/Text/Pandoc/Biblio.hs @@ -19,48 +19,203 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Biblio - Copyright : Copyright (C) 2008 Andrea Rossato + Copyright : Copyright (C) 2008-2010 Andrea Rossato License : GNU GPL, version 2 or above - Maintainer : Andrea Rossato <andrea.rossato@ing.unitn.it> + Maintainer : Andrea Rossato <andrea.rossato@unitn.it> Stability : alpha Portability : portable -} module Text.Pandoc.Biblio ( processBiblio ) where -import Control.Monad ( when ) import Data.List -import Text.CSL +import Data.Unique +import Data.Char ( isDigit, isPunctuation ) +import qualified Data.Map as M +import Text.CSL hiding ( Cite(..), Citation(..) ) +import qualified Text.CSL as CSL ( Cite(..) ) import Text.Pandoc.Definition +import Text.Pandoc.Generic +import Text.Pandoc.Shared (stringify) +import Text.ParserCombinators.Parsec +import Control.Monad -- | Process a 'Pandoc' document by adding citations formatted -- according to a CSL style, using 'citeproc' from citeproc-hs. -processBiblio :: String -> [Reference] -> Pandoc -> IO Pandoc -processBiblio cf r p +processBiblio :: FilePath -> [Reference] -> Pandoc -> IO Pandoc +processBiblio cslfile r p = if null r then return p else do - when (null cf) $ error "Missing the needed citation style file" - csl <- readCSLFile cf - let groups = queryWith getCite p - result = citeproc csl r groups - cits_map = zip groups (citations result) - biblioList = map (read . renderPandoc' csl) (bibliography result) - Pandoc m b = processWith (processCite csl cits_map) p - return $ Pandoc m $ b ++ biblioList + csl <- readCSLFile cslfile + p' <- bottomUpM setHash p + let (nts,grps) = if styleClass csl == "note" + then let cits = queryWith getCite p' + ncits = map (queryWith getCite) $ queryWith getNote p' + needNt = cits \\ concat ncits + in (,) needNt $ getNoteCitations needNt p' + else (,) [] $ queryWith getCitation p' + result = citeproc procOpts csl r (setNearNote csl $ + map (map toCslCite) grps) + cits_map = M.fromList $ zip grps (citations result) + biblioList = map (renderPandoc' csl) (bibliography result) + Pandoc m b = bottomUp (procInlines $ processCite csl cits_map) p' + return . generateNotes nts . Pandoc m $ b ++ biblioList -- | Substitute 'Cite' elements with formatted citations. -processCite :: Style -> [([Target],[FormattedOutput])] -> Inline -> Inline -processCite s cs il - | Cite t _ <- il = Cite t (process t) - | otherwise = il +processCite :: Style -> M.Map [Citation] [FormattedOutput] -> [Inline] -> [Inline] +processCite _ _ [] = [] +processCite s cs (i:is) + | Cite t _ <- i = process t ++ processCite s cs is + | otherwise = i : processCite s cs is where - process t = case elemIndex t (map fst cs) of - Just i -> read . renderPandoc s $ snd (cs !! i) + addNt t x = if null x then [] else [Cite t $ renderPandoc s x] + process t = case M.lookup t cs of + Just x -> if isTextualCitation t && x /= [] + then renderPandoc s [head x] ++ + if tail x /= [] + then Space : addNt t (tail x) + else [] + else [Cite t $ renderPandoc s x] Nothing -> [Str ("Error processing " ++ show t)] +isTextualCitation :: [Citation] -> Bool +isTextualCitation (c:_) = citationMode c == AuthorInText +isTextualCitation _ = False + -- | Retrieve all citations from a 'Pandoc' docuument. To be used with -- 'queryWith'. 
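The `jsonFilter` helper added to Text.Pandoc in this patch is meant for small standalone scripts that transform the Pandoc AST via its JSON serialization (the same JSON produced by the new "json" reader and writer entries). A minimal sketch, assuming only what the patch exports from Text.Pandoc; the upcasing transform and the name `upcaseStrs` are purely illustrative:

    import Data.Char (toUpper)
    import Text.Pandoc

    -- Upcase the contents of every Str node; bottomUp comes from
    -- Text.Pandoc.Generic, which Text.Pandoc now re-exports.
    upcaseStrs :: Inline -> Inline
    upcaseStrs (Str s) = Str (map toUpper s)
    upcaseStrs x       = x

    -- Read a JSON-encoded document on stdin, write the transformed
    -- JSON on stdout.
    main :: IO ()
    main = interact (jsonFilter (bottomUp upcaseStrs))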
-getCite :: Inline -> [[(String,String)]] -getCite i | Cite t _ <- i = [t] +getCitation :: Inline -> [[Citation]] +getCitation i | Cite t _ <- i = [t] + | otherwise = [] + +getNote :: Inline -> [Inline] +getNote i | Note _ <- i = [i] + | otherwise = [] + +getCite :: Inline -> [Inline] +getCite i | Cite _ _ <- i = [i] | otherwise = [] + +getNoteCitations :: [Inline] -> Pandoc -> [[Citation]] +getNoteCitations needNote + = let mvCite i = if i `elem` needNote then Note [Para [i]] else i + setNote = bottomUp mvCite + getCits = concat . flip (zipWith $ setCiteNoteNum) [1..] . + map (queryWith getCite) . queryWith getNote . setNote + in queryWith getCitation . getCits + +setHash :: Citation -> IO Citation +setHash (Citation i p s cm nn _) + = hashUnique `fmap` newUnique >>= return . Citation i p s cm nn + +generateNotes :: [Inline] -> Pandoc -> Pandoc +generateNotes needNote = bottomUp (mvCiteInNote needNote) + +procInlines :: ([Inline] -> [Inline]) -> Block -> Block +procInlines f b + | Plain inls <- b = Plain $ f inls + | Para inls <- b = Para $ f inls + | Header i inls <- b = Header i $ f inls + | otherwise = b + +mvCiteInNote :: [Inline] -> Block -> Block +mvCiteInNote is = procInlines mvCite + where + mvCite :: [Inline] -> [Inline] + mvCite inls + | x:i:xs <- inls, startWithPunct xs + , x == Space, i `elem_` is = switch i xs ++ mvCite (tailFirstInlineStr xs) + | x:i:xs <- inls + , x == Space, i `elem_` is = mvInNote i : mvCite xs + | i:xs <- inls, i `elem_` is + , startWithPunct xs = switch i xs ++ mvCite (tailFirstInlineStr xs) + | i:xs <- inls, Note _ <- i = checkNt i : mvCite xs + | i:xs <- inls = i : mvCite xs + | otherwise = [] + elem_ x xs = case x of Cite cs _ -> (Cite cs []) `elem` xs; _ -> False + switch i xs = Str (headInline xs) : mvInNote i : [] + mvInNote i + | Cite t o <- i = Note [Para [Cite t $ sanitize o]] + | otherwise = Note [Para [i ]] + sanitize i + | endWithPunct i = toCapital i + | otherwise = toCapital (i ++ [Str "."]) + + checkPt i + | Cite c o : xs <- i + , endWithPunct o, startWithPunct xs + , endWithPunct o = Cite c (initInline o) : checkPt xs + | x:xs <- i = x : checkPt xs + | otherwise = [] + checkNt = bottomUp $ procInlines checkPt + +setCiteNoteNum :: [Inline] -> Int -> [Inline] +setCiteNoteNum ((Cite cs o):xs) n = Cite (setCitationNoteNum n cs) o : setCiteNoteNum xs n +setCiteNoteNum _ _ = [] + +setCitationNoteNum :: Int -> [Citation] -> [Citation] +setCitationNoteNum i = map $ \c -> c { citationNoteNum = i} + +toCslCite :: Citation -> CSL.Cite +toCslCite c + = let (l, s) = locatorWords $ citationSuffix c + (la,lo) = parseLocator l + citMode = case citationMode c of + AuthorInText -> (True, False) + SuppressAuthor -> (False,True ) + NormalCitation -> (False,False) + s' = case s of + [] -> [] + (Str (y:_) : _) | isPunctuation y -> s + _ -> Str "," : Space : s + in emptyCite { CSL.citeId = citationId c + , CSL.citePrefix = PandocText $ citationPrefix c + , CSL.citeSuffix = PandocText $ s' + , CSL.citeLabel = la + , CSL.citeLocator = lo + , CSL.citeNoteNumber = show $ citationNoteNum c + , CSL.authorInText = fst citMode + , CSL.suppressAuthor = snd citMode + , CSL.citeHash = citationHash c + } + +locatorWords :: [Inline] -> (String, [Inline]) +locatorWords inp = + case parse pLocatorWords "suffix" inp of + Right r -> r + Left _ -> ("",inp) + +pLocatorWords :: GenParser Inline st (String, [Inline]) +pLocatorWords = do + l <- pLocator + s <- getInput -- rest is suffix + if length l > 0 && last l == ',' + then return (init l, Str "," : s) + else return (l, s) + 
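The `getCitation` and `getNote` extractors near the top of this hunk are written to be run with `queryWith` (per the comment above them). A minimal sketch of that usage, assuming it lives somewhere `getCitation` is in scope (it is internal to this module); the name `citedKeys` is illustrative only:

    -- List every citation key used in a document.
    citedKeys :: Pandoc -> [String]
    citedKeys = map citationId . concat . queryWith getCitation

`queryWith getCitation` yields one `[Citation]` group per Cite inline, so the `concat` flattens the groups before the keys are read off.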
+pMatch :: (Inline -> Bool) -> GenParser Inline st Inline +pMatch condition = try $ do + t <- anyToken + guard $ condition t + return t + +pSpace :: GenParser Inline st Inline +pSpace = pMatch (== Space) + +pLocator :: GenParser Inline st String +pLocator = try $ do + optional $ pMatch (== Str ",") + optional pSpace + f <- many1 (notFollowedBy pSpace >> anyToken) + gs <- many1 pWordWithDigits + return $ stringify f ++ (' ' : unwords gs) + +pWordWithDigits :: GenParser Inline st String +pWordWithDigits = try $ do + pSpace + r <- many1 (notFollowedBy pSpace >> anyToken) + let s = stringify r + guard $ any isDigit s + return s + diff --git a/src/Text/Pandoc/Blocks.hs b/src/Text/Pandoc/Blocks.hs deleted file mode 100644 index 122931773..000000000 --- a/src/Text/Pandoc/Blocks.hs +++ /dev/null @@ -1,146 +0,0 @@ -{- -Copyright (C) 2007 John MacFarlane <jgm@berkeley.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Blocks - Copyright : Copyright (C) 2007 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Functions for the manipulation of fixed-width blocks of text. -These are used in the construction of plain-text tables. --} - -module Text.Pandoc.Blocks - ( - TextBlock (..), - docToBlock, - blockToDoc, - widthOfBlock, - heightOfBlock, - hcatBlocks, - hsepBlocks, - centerAlignBlock, - leftAlignBlock, - rightAlignBlock - ) -where -import Text.PrettyPrint -import Data.List ( intersperse ) - --- | A fixed-width block of text. Parameters are width of block, --- height of block, and list of lines. -data TextBlock = TextBlock Int Int [String] -instance Show TextBlock where - show x = show $ blockToDoc x - --- | Break lines in a list of lines so that none are greater than --- a given width. -breakLines :: Int -- ^ Maximum length of lines. - -> [String] -- ^ List of lines. - -> [String] -breakLines _ [] = [] -breakLines width (l:ls) = - if length l > width - then (take width l):(breakLines width ((drop width l):ls)) - else l:(breakLines width ls) - --- | Convert a @Doc@ element into a @TextBlock@ with a specified width. -docToBlock :: Int -- ^ Width of text block. - -> Doc -- ^ @Doc@ to convert. - -> TextBlock -docToBlock width doc = - let rendered = renderStyle (style {lineLength = width, - ribbonsPerLine = 1}) doc - lns = breakLines width $ lines rendered - in TextBlock width (length lns) lns - --- | Convert a @TextBlock@ to a @Doc@ element. -blockToDoc :: TextBlock -> Doc -blockToDoc (TextBlock _ _ lns) = - if null lns - then empty - else vcat $ map text lns - --- | Returns width of a @TextBlock@ (number of columns). -widthOfBlock :: TextBlock -> Int -widthOfBlock (TextBlock width _ _) = width - --- | Returns height of a @TextBlock@ (number of rows). 
-heightOfBlock :: TextBlock -> Int -heightOfBlock (TextBlock _ height _) = height - --- | Pads a string out to a given width using spaces. -hPad :: Int -- ^ Desired width. - -> String -- ^ String to pad. - -> String -hPad width line = - let linelen = length line - in if linelen <= width - then line ++ replicate (width - linelen) ' ' - else take width line - --- | Concatenates a list of @TextBlock@s into a new @TextBlock@ in --- which they appear side by side. -hcatBlocks :: [TextBlock] -> TextBlock -hcatBlocks [] = TextBlock 0 0 [] -hcatBlocks [x] = x -- This is not redundant! We don't want last item hPad'd. -hcatBlocks ((TextBlock width1 height1 lns1):xs) = - let (TextBlock width2 height2 lns2) = hcatBlocks xs - height = max height1 height2 - width = width1 + width2 - lns1' = map (hPad width1) $ lns1 ++ replicate (height - height1) "" - lns2' = lns2 ++ replicate (height - height2) "" - lns = zipWith (++) lns1' lns2' - in TextBlock width height lns - --- | Like @hcatBlocks@, but inserts space between the @TextBlock@s. -hsepBlocks :: [TextBlock] -> TextBlock -hsepBlocks = hcatBlocks . (intersperse (TextBlock 1 1 [" "])) - -isWhitespace :: Char -> Bool -isWhitespace x = x `elem` " \t" - --- | Left-aligns the contents of a @TextBlock@ within the block. -leftAlignBlock :: TextBlock -> TextBlock -leftAlignBlock (TextBlock width height lns) = - TextBlock width height $ map (dropWhile isWhitespace) lns - --- | Right-aligns the contents of a @TextBlock@ within the block. -rightAlignBlock :: TextBlock -> TextBlock -rightAlignBlock (TextBlock width height lns) = - let rightAlignLine ln = - let (spaces, rest) = span isWhitespace $ reverse $ hPad width ln - in reverse (rest ++ spaces) - in TextBlock width height $ map rightAlignLine lns - --- | Centers the contents of a @TextBlock@ within the block. -centerAlignBlock :: TextBlock -> TextBlock -centerAlignBlock (TextBlock width height lns) = - let centerAlignLine ln = - let ln' = hPad width ln - (startSpaces, rest) = span isWhitespace ln' - endSpaces = takeWhile isWhitespace (reverse ln') - numSpaces = length (startSpaces ++ endSpaces) - startSpaces' = replicate (quot numSpaces 2) ' ' - in startSpaces' ++ rest - in TextBlock width height $ map centerAlignLine lns - diff --git a/src/Text/Pandoc/CharacterReferences.hs b/src/Text/Pandoc/CharacterReferences.hs index 8ac55fc61..8157d94d3 100644 --- a/src/Text/Pandoc/CharacterReferences.hs +++ b/src/Text/Pandoc/CharacterReferences.hs @@ -31,9 +31,9 @@ module Text.Pandoc.CharacterReferences ( characterReference, decodeCharacterReferences, ) where -import Data.Char ( chr ) import Text.ParserCombinators.Parsec -import qualified Data.Map as Map +import Text.HTML.TagSoup.Entity ( lookupNamedEntity, lookupNumericEntity ) +import Data.Maybe ( fromMaybe ) -- | Parse character entity. characterReference :: GenParser Char st Char @@ -47,18 +47,21 @@ numRef :: GenParser Char st Char numRef = do char '#' num <- hexNum <|> decNum - return $ chr $ num + return $ fromMaybe '?' $ lookupNumericEntity num -hexNum :: GenParser Char st Int -hexNum = oneOf "Xx" >> many1 hexDigit >>= return . read . (\xs -> '0':'x':xs) +hexNum :: GenParser Char st [Char] +hexNum = do + x <- oneOf "Xx" + num <- many1 hexDigit + return (x:num) -decNum :: GenParser Char st Int -decNum = many1 digit >>= return . read +decNum :: GenParser Char st [Char] +decNum = many1 digit entity :: GenParser Char st Char entity = do body <- many1 alphaNum - return $ Map.findWithDefault '?' body entityTable + return $ fromMaybe '?' 
$ lookupNamedEntity body -- | Convert entities in a string to characters. decodeCharacterReferences :: String -> String @@ -67,261 +70,3 @@ decodeCharacterReferences str = Left err -> error $ "\nError: " ++ show err Right result -> result -entityTable :: Map.Map String Char -entityTable = Map.fromList entityTableList - -entityTableList :: [(String, Char)] -entityTableList = [ - ("quot", chr 34), - ("amp", chr 38), - ("lt", chr 60), - ("gt", chr 62), - ("nbsp", chr 160), - ("iexcl", chr 161), - ("cent", chr 162), - ("pound", chr 163), - ("curren", chr 164), - ("yen", chr 165), - ("brvbar", chr 166), - ("sect", chr 167), - ("uml", chr 168), - ("copy", chr 169), - ("ordf", chr 170), - ("laquo", chr 171), - ("not", chr 172), - ("shy", chr 173), - ("reg", chr 174), - ("macr", chr 175), - ("deg", chr 176), - ("plusmn", chr 177), - ("sup2", chr 178), - ("sup3", chr 179), - ("acute", chr 180), - ("micro", chr 181), - ("para", chr 182), - ("middot", chr 183), - ("cedil", chr 184), - ("sup1", chr 185), - ("ordm", chr 186), - ("raquo", chr 187), - ("frac14", chr 188), - ("frac12", chr 189), - ("frac34", chr 190), - ("iquest", chr 191), - ("Agrave", chr 192), - ("Aacute", chr 193), - ("Acirc", chr 194), - ("Atilde", chr 195), - ("Auml", chr 196), - ("Aring", chr 197), - ("AElig", chr 198), - ("Ccedil", chr 199), - ("Egrave", chr 200), - ("Eacute", chr 201), - ("Ecirc", chr 202), - ("Euml", chr 203), - ("Igrave", chr 204), - ("Iacute", chr 205), - ("Icirc", chr 206), - ("Iuml", chr 207), - ("ETH", chr 208), - ("Ntilde", chr 209), - ("Ograve", chr 210), - ("Oacute", chr 211), - ("Ocirc", chr 212), - ("Otilde", chr 213), - ("Ouml", chr 214), - ("times", chr 215), - ("Oslash", chr 216), - ("Ugrave", chr 217), - ("Uacute", chr 218), - ("Ucirc", chr 219), - ("Uuml", chr 220), - ("Yacute", chr 221), - ("THORN", chr 222), - ("szlig", chr 223), - ("agrave", chr 224), - ("aacute", chr 225), - ("acirc", chr 226), - ("atilde", chr 227), - ("auml", chr 228), - ("aring", chr 229), - ("aelig", chr 230), - ("ccedil", chr 231), - ("egrave", chr 232), - ("eacute", chr 233), - ("ecirc", chr 234), - ("euml", chr 235), - ("igrave", chr 236), - ("iacute", chr 237), - ("icirc", chr 238), - ("iuml", chr 239), - ("eth", chr 240), - ("ntilde", chr 241), - ("ograve", chr 242), - ("oacute", chr 243), - ("ocirc", chr 244), - ("otilde", chr 245), - ("ouml", chr 246), - ("divide", chr 247), - ("oslash", chr 248), - ("ugrave", chr 249), - ("uacute", chr 250), - ("ucirc", chr 251), - ("uuml", chr 252), - ("yacute", chr 253), - ("thorn", chr 254), - ("yuml", chr 255), - ("OElig", chr 338), - ("oelig", chr 339), - ("Scaron", chr 352), - ("scaron", chr 353), - ("Yuml", chr 376), - ("fnof", chr 402), - ("circ", chr 710), - ("tilde", chr 732), - ("Alpha", chr 913), - ("Beta", chr 914), - ("Gamma", chr 915), - ("Delta", chr 916), - ("Epsilon", chr 917), - ("Zeta", chr 918), - ("Eta", chr 919), - ("Theta", chr 920), - ("Iota", chr 921), - ("Kappa", chr 922), - ("Lambda", chr 923), - ("Mu", chr 924), - ("Nu", chr 925), - ("Xi", chr 926), - ("Omicron", chr 927), - ("Pi", chr 928), - ("Rho", chr 929), - ("Sigma", chr 931), - ("Tau", chr 932), - ("Upsilon", chr 933), - ("Phi", chr 934), - ("Chi", chr 935), - ("Psi", chr 936), - ("Omega", chr 937), - ("alpha", chr 945), - ("beta", chr 946), - ("gamma", chr 947), - ("delta", chr 948), - ("epsilon", chr 949), - ("zeta", chr 950), - ("eta", chr 951), - ("theta", chr 952), - ("iota", chr 953), - ("kappa", chr 954), - ("lambda", chr 955), - ("mu", chr 956), - ("nu", chr 957), - ("xi", chr 958), - 
("omicron", chr 959), - ("pi", chr 960), - ("rho", chr 961), - ("sigmaf", chr 962), - ("sigma", chr 963), - ("tau", chr 964), - ("upsilon", chr 965), - ("phi", chr 966), - ("chi", chr 967), - ("psi", chr 968), - ("omega", chr 969), - ("thetasym", chr 977), - ("upsih", chr 978), - ("piv", chr 982), - ("ensp", chr 8194), - ("emsp", chr 8195), - ("thinsp", chr 8201), - ("zwnj", chr 8204), - ("zwj", chr 8205), - ("lrm", chr 8206), - ("rlm", chr 8207), - ("ndash", chr 8211), - ("mdash", chr 8212), - ("lsquo", chr 8216), - ("rsquo", chr 8217), - ("sbquo", chr 8218), - ("ldquo", chr 8220), - ("rdquo", chr 8221), - ("bdquo", chr 8222), - ("dagger", chr 8224), - ("Dagger", chr 8225), - ("bull", chr 8226), - ("hellip", chr 8230), - ("permil", chr 8240), - ("prime", chr 8242), - ("Prime", chr 8243), - ("lsaquo", chr 8249), - ("rsaquo", chr 8250), - ("oline", chr 8254), - ("frasl", chr 8260), - ("euro", chr 8364), - ("image", chr 8465), - ("weierp", chr 8472), - ("real", chr 8476), - ("trade", chr 8482), - ("alefsym", chr 8501), - ("larr", chr 8592), - ("uarr", chr 8593), - ("rarr", chr 8594), - ("darr", chr 8595), - ("harr", chr 8596), - ("crarr", chr 8629), - ("lArr", chr 8656), - ("uArr", chr 8657), - ("rArr", chr 8658), - ("dArr", chr 8659), - ("hArr", chr 8660), - ("forall", chr 8704), - ("part", chr 8706), - ("exist", chr 8707), - ("empty", chr 8709), - ("nabla", chr 8711), - ("isin", chr 8712), - ("notin", chr 8713), - ("ni", chr 8715), - ("prod", chr 8719), - ("sum", chr 8721), - ("minus", chr 8722), - ("lowast", chr 8727), - ("radic", chr 8730), - ("prop", chr 8733), - ("infin", chr 8734), - ("ang", chr 8736), - ("and", chr 8743), - ("or", chr 8744), - ("cap", chr 8745), - ("cup", chr 8746), - ("int", chr 8747), - ("there4", chr 8756), - ("sim", chr 8764), - ("cong", chr 8773), - ("asymp", chr 8776), - ("ne", chr 8800), - ("equiv", chr 8801), - ("le", chr 8804), - ("ge", chr 8805), - ("sub", chr 8834), - ("sup", chr 8835), - ("nsub", chr 8836), - ("sube", chr 8838), - ("supe", chr 8839), - ("oplus", chr 8853), - ("otimes", chr 8855), - ("perp", chr 8869), - ("sdot", chr 8901), - ("lceil", chr 8968), - ("rceil", chr 8969), - ("lfloor", chr 8970), - ("rfloor", chr 8971), - ("lang", chr 9001), - ("rang", chr 9002), - ("loz", chr 9674), - ("spades", chr 9824), - ("clubs", chr 9827), - ("hearts", chr 9829), - ("diams", chr 9830) - ] diff --git a/src/Text/Pandoc/Definition.hs b/src/Text/Pandoc/Definition.hs deleted file mode 100644 index fffca3b2e..000000000 --- a/src/Text/Pandoc/Definition.hs +++ /dev/null @@ -1,151 +0,0 @@ -{-# OPTIONS_GHC -fglasgow-exts #-} -- for deriving Typeable -{- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Definition - Copyright : Copyright (C) 2006-2010 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Definition of 'Pandoc' data structure for format-neutral representation -of documents. --} -module Text.Pandoc.Definition where - -import Data.Generics - -data Pandoc = Pandoc Meta [Block] deriving (Eq, Ord, Read, Show, Typeable, Data) - --- | Bibliographic information for the document: title, authors, date. -data Meta = Meta { docTitle :: [Inline] - , docAuthors :: [[Inline]] - , docDate :: [Inline] } - deriving (Eq, Ord, Show, Read, Typeable, Data) - --- | Alignment of a table column. -data Alignment = AlignLeft - | AlignRight - | AlignCenter - | AlignDefault deriving (Eq, Ord, Show, Read, Typeable, Data) - --- | List attributes. -type ListAttributes = (Int, ListNumberStyle, ListNumberDelim) - --- | Style of list numbers. -data ListNumberStyle = DefaultStyle - | Example - | Decimal - | LowerRoman - | UpperRoman - | LowerAlpha - | UpperAlpha deriving (Eq, Ord, Show, Read, Typeable, Data) - --- | Delimiter of list numbers. -data ListNumberDelim = DefaultDelim - | Period - | OneParen - | TwoParens deriving (Eq, Ord, Show, Read, Typeable, Data) - --- | Attributes: identifier, classes, key-value pairs -type Attr = (String, [String], [(String, String)]) - --- | Block element. -data Block - = Plain [Inline] -- ^ Plain text, not a paragraph - | Para [Inline] -- ^ Paragraph - | CodeBlock Attr String -- ^ Code block (literal) with attributes - | RawHtml String -- ^ Raw HTML block (literal) - | BlockQuote [Block] -- ^ Block quote (list of blocks) - | OrderedList ListAttributes [[Block]] -- ^ Ordered list (attributes - -- and a list of items, each a list of blocks) - | BulletList [[Block]] -- ^ Bullet list (list of items, each - -- a list of blocks) - | DefinitionList [([Inline],[[Block]])] -- ^ Definition list - -- Each list item is a pair consisting of a - -- term (a list of inlines) and one or more - -- definitions (each a list of blocks) - | Header Int [Inline] -- ^ Header - level (integer) and text (inlines) - | HorizontalRule -- ^ Horizontal rule - | Table [Inline] [Alignment] [Double] [[Block]] [[[Block]]] -- ^ Table, - -- with caption, column alignments, - -- relative column widths (0 = default), - -- column headers (each a list of blocks), and - -- rows (each a list of lists of blocks) - | Null -- ^ Nothing - deriving (Eq, Ord, Read, Show, Typeable, Data) - --- | Type of quotation marks to use in Quoted inline. -data QuoteType = SingleQuote | DoubleQuote deriving (Show, Eq, Ord, Read, Typeable, Data) - --- | Link target (URL, title). -type Target = (String, String) - --- | Type of math element (display or inline). -data MathType = DisplayMath | InlineMath deriving (Show, Eq, Ord, Read, Typeable, Data) - --- | Inline elements. 
-data Inline - = Str String -- ^ Text (string) - | Emph [Inline] -- ^ Emphasized text (list of inlines) - | Strong [Inline] -- ^ Strongly emphasized text (list of inlines) - | Strikeout [Inline] -- ^ Strikeout text (list of inlines) - | Superscript [Inline] -- ^ Superscripted text (list of inlines) - | Subscript [Inline] -- ^ Subscripted text (list of inlines) - | SmallCaps [Inline] -- ^ Small caps text (list of inlines) - | Quoted QuoteType [Inline] -- ^ Quoted text (list of inlines) - | Cite [Target] [Inline] -- ^ Citation (list of inlines) - | Code String -- ^ Inline code (literal) - | Space -- ^ Inter-word space - | EmDash -- ^ Em dash - | EnDash -- ^ En dash - | Apostrophe -- ^ Apostrophe - | Ellipses -- ^ Ellipses - | LineBreak -- ^ Hard line break - | Math MathType String -- ^ TeX math (literal) - | TeX String -- ^ LaTeX code (literal) - | HtmlInline String -- ^ HTML code (literal) - | Link [Inline] Target -- ^ Hyperlink: text (list of inlines), target - | Image [Inline] Target -- ^ Image: alt text (list of inlines), target - -- and target - | Note [Block] -- ^ Footnote or endnote - deriving (Show, Eq, Ord, Read, Typeable, Data) - --- | Applies a transformation on @a@s to matching elements in a @b@. -processWith :: (Data a, Data b) => (a -> a) -> b -> b -processWith f = everywhere (mkT f) - --- | Like 'processWith', but with monadic transformations. -processWithM :: (Monad m, Data a, Data b) => (a -> m a) -> b -> m b -processWithM f = everywhereM (mkM f) - --- | Runs a query on matching @a@ elements in a @c@. -queryWith :: (Data a, Data c) => (a -> [b]) -> c -> [b] -queryWith f = everything (++) ([] `mkQ` f) - -{-# DEPRECATED processPandoc "Use processWith instead" #-} -processPandoc :: Data a => (a -> a) -> Pandoc -> Pandoc -processPandoc = processWith - -{-# DEPRECATED queryPandoc "Use queryWith instead" #-} -queryPandoc :: Data a => (a -> [b]) -> Pandoc -> [b] -queryPandoc = queryWith - diff --git a/src/Text/Pandoc/Highlighting.hs b/src/Text/Pandoc/Highlighting.hs index f29106262..5ddaf1379 100644 --- a/src/Text/Pandoc/Highlighting.hs +++ b/src/Text/Pandoc/Highlighting.hs @@ -37,10 +37,14 @@ import Data.List (find) import Data.Maybe (fromMaybe) import Data.Char (toLower) -highlightHtml :: Attr -> String -> Either String Html -highlightHtml (_, classes, keyvals) rawCode = +highlightHtml :: Bool -- ^ True if inline HTML + -> Attr -- ^ Attributes of the Code or CodeBlock + -> String -- ^ Raw contents of the Code or CodeBlock + -> Either String Html -- ^ An error or the formatted Html +highlightHtml inline (_, classes, keyvals) rawCode = let firstNum = read $ fromMaybe "1" $ lookup "startFrom" keyvals fmtOpts = [OptNumberFrom firstNum] ++ + [OptInline | inline] ++ case find (`elem` ["number","numberLines","number-lines"]) classes of Nothing -> [] Just _ -> [OptNumberLines] @@ -65,6 +69,6 @@ languages = [] languagesByExtension :: String -> [String] languagesByExtension _ = [] -highlightHtml :: Attr -> String -> Either String Html -highlightHtml _ _ = Left "Pandoc was not compiled with support for highlighting" +highlightHtml :: Bool -> Attr -> String -> Either String Html +highlightHtml _ _ _ = Left "Pandoc was not compiled with support for highlighting" #endif diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 3678fc22a..9ce064f91 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -64,21 +64,27 @@ module Text.Pandoc.Parsing ( (>>~), QuoteContext (..), NoteTable, KeyTable, - Key (..), + Key, + toKey, + fromKey, lookupKeySrc, - refsMatch 
) + smartPunctuation, + macro, + applyMacros' ) where import Text.Pandoc.Definition +import Text.Pandoc.Generic import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn) import Text.ParserCombinators.Parsec import Text.Pandoc.CharacterReferences ( characterReference ) -import Data.Char ( toLower, toUpper, ord, isAscii ) +import Data.Char ( toLower, toUpper, ord, isAscii, isAlphaNum, isDigit ) import Data.List ( intercalate, transpose ) import Network.URI ( parseURI, URI (..), isAllowedInURI ) -import Control.Monad ( join, liftM ) +import Control.Monad ( join, liftM, guard ) import Text.Pandoc.Shared import qualified Data.Map as M +import Text.TeXMath.Macros (applyMacros, Macro, parseMacroDefinitions) -- | Like >>, but returns the operation on the left. -- (Suggested by Tillmann Rendel on Haskell-cafe list.) @@ -114,7 +120,7 @@ oneOfStrings listOfStrings = choice $ map (try . string) listOfStrings -- | Parses a space or tab. spaceChar :: CharParser st Char -spaceChar = char ' ' <|> char '\t' +spaceChar = satisfy $ \c -> c == ' ' || c == '\t' -- | Skips zero or more spaces or tabs. skipSpaces :: GenParser Char st () @@ -169,7 +175,8 @@ lineClump = blanklines charsInBalanced :: Char -> Char -> GenParser Char st String charsInBalanced open close = try $ do char open - raw <- many $ (many1 (noneOf [open, close, '\n'])) + raw <- many $ (many1 (satisfy $ \c -> + c /= open && c /= close && c /= '\n')) <|> (do res <- charsInBalanced open close return $ [open] ++ res ++ [close]) <|> try (string "\n" >>~ notFollowedBy' blanklines) @@ -180,7 +187,7 @@ charsInBalanced open close = try $ do charsInBalanced' :: Char -> Char -> GenParser Char st String charsInBalanced' open close = try $ do char open - raw <- many $ (many1 (noneOf [open, close])) + raw <- many $ (many1 (satisfy $ \c -> c /= open && c /= close)) <|> (do res <- charsInBalanced' open close return $ [open] ++ res ++ [close]) char close @@ -201,7 +208,7 @@ romanNumeral upperCase = do let romanDigits = if upperCase then uppercaseRomanDigits else lowercaseRomanDigits - lookAhead $ oneOf romanDigits + lookAhead $ oneOf romanDigits let [one, five, ten, fifty, hundred, fivehundred, thousand] = map char romanDigits thousands <- many thousand >>= (return . (1000 *) . length) @@ -227,7 +234,8 @@ romanNumeral upperCase = do -- Parsers for email addresses and URIs emailChar :: GenParser Char st Char -emailChar = alphaNum <|> oneOf "-+_." +emailChar = alphaNum <|> + satisfy (\c -> c == '-' || c == '+' || c == '_' || c == '.') domainChar :: GenParser Char st Char domainChar = alphaNum <|> char '-' @@ -283,7 +291,7 @@ nullBlock :: GenParser Char st Block nullBlock = anyChar >> return Null -- | Fail if reader is in strict markdown syntax mode. 
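For reference, the rewritten `charsInBalanced` above behaves as before: it strips the outermost pair of delimiters and keeps any nested pairs. An illustrative GHCi check, assuming `charsInBalanced` is exported and with Parsec's `parseTest` in scope (the parser is polymorphic in its state type, so the unit state is fine):

    parseTest (charsInBalanced '(' ')') "(a (b) c)"
    -- prints: "a (b) c"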
-failIfStrict :: GenParser Char ParserState () +failIfStrict :: GenParser a ParserState () failIfStrict = do state <- getState if stateStrict state then fail "strict mode" else return () @@ -327,7 +335,7 @@ decimal = do exampleNum :: GenParser Char ParserState (ListNumberStyle, Int) exampleNum = do char '@' - lab <- many (alphaNum <|> oneOf "_-") + lab <- many (alphaNum <|> satisfy (\c -> c == '_' || c == '-')) st <- getState let num = stateNextExample st let newlabels = if null lab @@ -450,8 +458,9 @@ widthsFromIndices :: Int -- Number of columns on terminal -> [Int] -- Indices -> [Double] -- Fractional relative sizes of columns widthsFromIndices _ [] = [] -widthsFromIndices numColumns indices = - let lengths' = zipWith (-) indices (0:indices) +widthsFromIndices numColumns' indices = + let numColumns = max numColumns' (if null indices then 0 else last indices) + lengths' = zipWith (-) indices (0:indices) lengths = reverse $ case reverse lengths' of [] -> [] @@ -481,8 +490,8 @@ gridTableWith block tableCaption headless = tableWith (gridTableHeader headless block) (gridTableRow block) (gridTableSep '-') gridTableFooter tableCaption gridTableSplitLine :: [Int] -> String -> [String] -gridTableSplitLine indices line = - map removeFinalBar $ tail $ splitByIndices (init indices) line +gridTableSplitLine indices line = map removeFinalBar $ tail $ + splitByIndices (init indices) $ removeTrailingSpace line gridPart :: Char -> GenParser Char st (Int, Int) gridPart ch = do @@ -494,8 +503,8 @@ gridDashedLines :: Char -> GenParser Char st [(Int,Int)] gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) >>~ blankline removeFinalBar :: String -> String -removeFinalBar = reverse . dropWhile (=='|') . dropWhile (`elem` " \t") . - reverse +removeFinalBar = + reverse . dropWhile (`elem` " \t") . dropWhile (=='|') . reverse -- | Separator between rows of grid table. gridTableSep :: Char -> GenParser Char ParserState Char @@ -532,7 +541,7 @@ gridTableRawLine :: [Int] -> GenParser Char ParserState [String] gridTableRawLine indices = do char '|' line <- many1Till anyChar newline - return (gridTableSplitLine indices $ removeTrailingSpace line) + return (gridTableSplitLine indices line) -- | Parse row of grid table. gridTableRow :: GenParser Char ParserState Block @@ -562,9 +571,9 @@ gridTableFooter = blanklines --- -- | Parse a string with a given parser and state. -readWith :: GenParser Char ParserState a -- ^ parser +readWith :: GenParser t ParserState a -- ^ parser -> ParserState -- ^ initial state - -> String -- ^ input string + -> [t] -- ^ input -> a readWith parser state input = case runParser parser state "source" input of @@ -583,11 +592,8 @@ data ParserState = ParserState { stateParseRaw :: Bool, -- ^ Parse raw HTML and LaTeX? stateParserContext :: ParserContext, -- ^ Inside list? stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment? - stateSanitizeHTML :: Bool, -- ^ Sanitize HTML? stateKeys :: KeyTable, -- ^ List of reference keys -#ifdef _CITEPROC stateCitations :: [String], -- ^ List of available citations -#endif stateNotes :: NoteTable, -- ^ List of notes stateTabStop :: Int, -- ^ Tab stop stateStandalone :: Bool, -- ^ Parse bibliographic info? 
@@ -602,7 +608,9 @@ data ParserState = ParserState stateIndentedCodeClasses :: [String], -- ^ Classes to use for indented code blocks stateNextExample :: Int, -- ^ Number of next example stateExamples :: M.Map String Int, -- ^ Map from example labels to numbers - stateHasChapters :: Bool -- ^ True if \chapter encountered + stateHasChapters :: Bool, -- ^ True if \chapter encountered + stateApplyMacros :: Bool, -- ^ Apply LaTeX macros? + stateMacros :: [Macro] -- ^ List of macros defined so far } deriving Show @@ -611,11 +619,8 @@ defaultParserState = ParserState { stateParseRaw = False, stateParserContext = NullState, stateQuoteContext = NoQuote, - stateSanitizeHTML = False, stateKeys = M.empty, -#ifdef _CITEPROC stateCitations = [], -#endif stateNotes = [], stateTabStop = 4, stateStandalone = False, @@ -630,7 +635,9 @@ defaultParserState = stateIndentedCodeClasses = [], stateNextExample = 1, stateExamples = M.empty, - stateHasChapters = False } + stateHasChapters = False, + stateApplyMacros = True, + stateMacros = []} data HeaderType = SingleHeader Char -- ^ Single line of characters underneath @@ -650,13 +657,20 @@ data QuoteContext type NoteTable = [(String, String)] -newtype Key = Key [Inline] deriving (Show, Read) +newtype Key = Key [Inline] deriving (Show, Read, Eq, Ord) -instance Eq Key where - Key a == Key b = refsMatch a b +toKey :: [Inline] -> Key +toKey = Key . bottomUp lowercase + where lowercase :: Inline -> Inline + lowercase (Str xs) = Str (map toLower xs) + lowercase (Math t xs) = Math t (map toLower xs) + lowercase (Code attr xs) = Code attr (map toLower xs) + lowercase (RawInline f xs) = RawInline f (map toLower xs) + lowercase LineBreak = Space + lowercase x = x -instance Ord Key where - compare (Key a) (Key b) = if a == b then EQ else compare a b +fromKey :: Key -> [Inline] +fromKey (Key xs) = xs type KeyTable = M.Map Key Target @@ -668,33 +682,130 @@ lookupKeySrc table key = case M.lookup key table of Nothing -> Nothing Just src -> Just src --- | Returns @True@ if keys match (case insensitive). -refsMatch :: [Inline] -> [Inline] -> Bool -refsMatch ((Str x):restx) ((Str y):resty) = - ((map toLower x) == (map toLower y)) && refsMatch restx resty -refsMatch ((Emph x):restx) ((Emph y):resty) = - refsMatch x y && refsMatch restx resty -refsMatch ((Strong x):restx) ((Strong y):resty) = - refsMatch x y && refsMatch restx resty -refsMatch ((Strikeout x):restx) ((Strikeout y):resty) = - refsMatch x y && refsMatch restx resty -refsMatch ((Superscript x):restx) ((Superscript y):resty) = - refsMatch x y && refsMatch restx resty -refsMatch ((Subscript x):restx) ((Subscript y):resty) = - refsMatch x y && refsMatch restx resty -refsMatch ((SmallCaps x):restx) ((SmallCaps y):resty) = - refsMatch x y && refsMatch restx resty -refsMatch ((Quoted t x):restx) ((Quoted u y):resty) = - t == u && refsMatch x y && refsMatch restx resty -refsMatch ((Code x):restx) ((Code y):resty) = - ((map toLower x) == (map toLower y)) && refsMatch restx resty -refsMatch ((Math t x):restx) ((Math u y):resty) = - ((map toLower x) == (map toLower y)) && t == u && refsMatch restx resty -refsMatch ((TeX x):restx) ((TeX y):resty) = - ((map toLower x) == (map toLower y)) && refsMatch restx resty -refsMatch ((HtmlInline x):restx) ((HtmlInline y):resty) = - ((map toLower x) == (map toLower y)) && refsMatch restx resty -refsMatch (x:restx) (y:resty) = (x == y) && refsMatch restx resty -refsMatch [] x = null x -refsMatch x [] = null x +-- | Fail unless we're in "smart typography" mode. 
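`toKey` above is what lets the old `refsMatch` comparison be dropped: reference keys are lowercased once (with line breaks collapsed to spaces), after which the derived `Eq`/`Ord` on `Key` suffices for the `Data.Map` key table. An illustrative equality, nothing more:

    toKey [Str "My", Space, Str "Link"] == toKey [Str "my", Space, Str "LINK"]
    -- True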
+failUnlessSmart :: GenParser tok ParserState () +failUnlessSmart = getState >>= guard . stateSmart + +smartPunctuation :: GenParser Char ParserState Inline + -> GenParser Char ParserState Inline +smartPunctuation inlineParser = do + failUnlessSmart + choice [ quoted inlineParser, apostrophe, dash, ellipses ] + +apostrophe :: GenParser Char ParserState Inline +apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe + +quoted :: GenParser Char ParserState Inline + -> GenParser Char ParserState Inline +quoted inlineParser = doubleQuoted inlineParser <|> singleQuoted inlineParser + +withQuoteContext :: QuoteContext + -> (GenParser Char ParserState Inline) + -> GenParser Char ParserState Inline +withQuoteContext context parser = do + oldState <- getState + let oldQuoteContext = stateQuoteContext oldState + setState oldState { stateQuoteContext = context } + result <- parser + newState <- getState + setState newState { stateQuoteContext = oldQuoteContext } + return result + +singleQuoted :: GenParser Char ParserState Inline + -> GenParser Char ParserState Inline +singleQuoted inlineParser = try $ do + singleQuoteStart + withQuoteContext InSingleQuote $ many1Till inlineParser singleQuoteEnd >>= + return . Quoted SingleQuote . normalizeSpaces + +doubleQuoted :: GenParser Char ParserState Inline + -> GenParser Char ParserState Inline +doubleQuoted inlineParser = try $ do + doubleQuoteStart + withQuoteContext InDoubleQuote $ do + contents <- manyTill inlineParser doubleQuoteEnd + return . Quoted DoubleQuote . normalizeSpaces $ contents + +failIfInQuoteContext :: QuoteContext -> GenParser tok ParserState () +failIfInQuoteContext context = do + st <- getState + if stateQuoteContext st == context + then fail "already inside quotes" + else return () + +charOrRef :: [Char] -> GenParser Char st Char +charOrRef cs = + oneOf cs <|> try (do c <- characterReference + guard (c `elem` cs) + return c) + +singleQuoteStart :: GenParser Char ParserState () +singleQuoteStart = do + failIfInQuoteContext InSingleQuote + try $ do charOrRef "'\8216" + notFollowedBy (oneOf ")!],.;:-? \t\n") + notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >> + satisfy (not . isAlphaNum))) + -- possess/contraction + return () + +singleQuoteEnd :: GenParser Char st () +singleQuoteEnd = try $ do + charOrRef "'\8217" + notFollowedBy alphaNum + +doubleQuoteStart :: GenParser Char ParserState () +doubleQuoteStart = do + failIfInQuoteContext InDoubleQuote + try $ do charOrRef "\"\8220" + notFollowedBy (satisfy (\c -> c == ' ' || c == '\t' || c == '\n')) + +doubleQuoteEnd :: GenParser Char st () +doubleQuoteEnd = do + charOrRef "\"\8221" + return () + +ellipses :: GenParser Char st Inline +ellipses = do + try (charOrRef "…") <|> try (string "..." >> return '…') + return Ellipses + +dash :: GenParser Char st Inline +dash = enDash <|> emDash + +enDash :: GenParser Char st Inline +enDash = do + try (charOrRef "–") <|> + try (char '-' >> lookAhead (satisfy isDigit) >> return '–') + return EnDash + +emDash :: GenParser Char st Inline +emDash = do + try (charOrRef "—") <|> (try $ string "--" >> optional (char '-') >> return '—') + return EmDash + +-- +-- Macros +-- + +-- | Parse a \newcommand or \renewcommand macro definition. +macro :: GenParser Char ParserState Block +macro = do + getState >>= guard . 
stateApplyMacros + inp <- getInput + case parseMacroDefinitions inp of + ([], _) -> pzero + (ms, rest) -> do count (length inp - length rest) anyChar + updateState $ \st -> + st { stateMacros = ms ++ stateMacros st } + return Null + +-- | Apply current macros to string. +applyMacros' :: String -> GenParser Char ParserState String +applyMacros' target = do + apply <- liftM stateApplyMacros getState + if apply + then do macros <- liftM stateMacros getState + return $ applyMacros macros target + else return target diff --git a/src/Text/Pandoc/Pretty.hs b/src/Text/Pandoc/Pretty.hs new file mode 100644 index 000000000..54d65af6f --- /dev/null +++ b/src/Text/Pandoc/Pretty.hs @@ -0,0 +1,429 @@ +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{- +Copyright (C) 2010 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111(-1)307 USA +-} + +{- | + Module : Text.Pandoc.Pretty + Copyright : Copyright (C) 2010 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +A prettyprinting library for the production of text documents, +including wrapped text, indentated blocks, and tables. +-} + +module Text.Pandoc.Pretty ( + Doc + , render + , cr + , blankline + , space + , text + , char + , prefixed + , flush + , nest + , hang + , nowrap + , offset + , height + , lblock + , cblock + , rblock + , (<>) + , (<+>) + , ($$) + , ($+$) + , isEmpty + , empty + , cat + , hcat + , hsep + , vcat + , vsep + , inside + , braces + , brackets + , parens + , quotes + , doubleQuotes + ) + +where +import Data.DList (DList, fromList, toList, cons, singleton) +import Data.List (intercalate) +import Data.Monoid +import Data.String +import Control.Monad.State +import Data.Char (isSpace) + +data Monoid a => + RenderState a = RenderState{ + output :: [a] -- ^ In reverse order + , prefix :: String + , usePrefix :: Bool + , lineLength :: Maybe Int -- ^ 'Nothing' means no wrapping + , column :: Int + , newlines :: Int -- ^ Number of preceding newlines + } + +type DocState a = State (RenderState a) () + +data D = Text Int String + | Block Int [String] + | Prefixed String Doc + | Flush Doc + | BreakingSpace + | CarriageReturn + | NewLine + | BlankLine + deriving (Show) + +newtype Doc = Doc { unDoc :: DList D } + deriving (Monoid) + +instance Show Doc where + show = render Nothing + +instance IsString Doc where + fromString = text + +-- | True if the document is empty. +isEmpty :: Doc -> Bool +isEmpty = null . toList . unDoc + +-- | The empty document. +empty :: Doc +empty = mempty + +-- | @a <> b@ is the result of concatenating @a@ with @b@. +(<>) :: Doc -> Doc -> Doc +(<>) = mappend + +-- | Concatenate a list of 'Doc's. +cat :: [Doc] -> Doc +cat = mconcat + +-- | Same as 'cat'. +hcat :: [Doc] -> Doc +hcat = mconcat + +-- | Concatenate a list of 'Doc's, putting breakable spaces +-- between them. 
+(<+>) :: Doc -> Doc -> Doc +(<+>) x y = if isEmpty x + then y + else if isEmpty y + then x + else x <> space <> y + +-- | Same as 'cat', but putting breakable spaces between the +-- 'Doc's. +hsep :: [Doc] -> Doc +hsep = foldr (<+>) empty + +-- | @a $$ b@ puts @a@ above @b@. +($$) :: Doc -> Doc -> Doc +($$) x y = if isEmpty x + then y + else if isEmpty y + then x + else x <> cr <> y + +-- | @a $$ b@ puts @a@ above @b@, with a blank line between. +($+$) :: Doc -> Doc -> Doc +($+$) x y = if isEmpty x + then y + else if isEmpty y + then x + else x <> blankline <> y + +-- | List version of '$$'. +vcat :: [Doc] -> Doc +vcat = foldr ($$) empty + +-- | List version of '$+$'. +vsep :: [Doc] -> Doc +vsep = foldr ($+$) empty + +outp :: (IsString a, Monoid a) + => Int -> String -> DocState a +outp off s | off <= 0 = do + st' <- get + let rawpref = prefix st' + when (column st' == 0 && usePrefix st' && not (null rawpref)) $ do + let pref = reverse $ dropWhile isSpace $ reverse rawpref + modify $ \st -> st{ output = fromString pref : output st + , column = column st + length pref } + when (off < 0) $ do + modify $ \st -> st { output = fromString s : output st + , column = 0 + , newlines = newlines st + 1 } +outp off s = do + st' <- get + let pref = prefix st' + when (column st' == 0 && usePrefix st' && not (null pref)) $ do + modify $ \st -> st{ output = fromString pref : output st + , column = column st + length pref } + modify $ \st -> st{ output = fromString s : output st + , column = column st + off + , newlines = 0 } + +-- | Renders a 'Doc'. @render (Just n)@ will use +-- a line length of @n@ to reflow text on breakable spaces. +-- @render Nothing@ will not reflow text. +render :: (Monoid a, IsString a) + => Maybe Int -> Doc -> a +render linelen doc = fromString . mconcat . reverse . output $ + execState (renderDoc doc) startingState + where startingState = RenderState{ + output = mempty + , prefix = "" + , usePrefix = True + , lineLength = linelen + , column = 0 + , newlines = 2 } + +renderDoc :: (IsString a, Monoid a) + => Doc -> DocState a +renderDoc = renderList . toList . 
unDoc + +renderList :: (IsString a, Monoid a) + => [D] -> DocState a +renderList [] = return () +renderList (Text off s : xs) = do + outp off s + renderList xs + +renderList (Prefixed pref d : xs) = do + st <- get + let oldPref = prefix st + put st{ prefix = prefix st ++ pref } + renderDoc d + modify $ \s -> s{ prefix = oldPref } + renderList xs + +renderList (Flush d : xs) = do + st <- get + let oldUsePrefix = usePrefix st + put st{ usePrefix = False } + renderDoc d + modify $ \s -> s{ usePrefix = oldUsePrefix } + renderList xs + +renderList (BlankLine : xs) = do + st <- get + case output st of + _ | newlines st > 1 || null xs -> return () + _ | column st == 0 -> do + outp (-1) "\n" + _ -> do + outp (-1) "\n" + outp (-1) "\n" + renderList xs + +renderList (CarriageReturn : xs) = do + st <- get + if newlines st > 0 || null xs + then renderList xs + else do + outp (-1) "\n" + renderList xs + +renderList (NewLine : xs) = do + outp (-1) "\n" + renderList xs + +renderList (BreakingSpace : CarriageReturn : xs) = renderList (CarriageReturn:xs) +renderList (BreakingSpace : NewLine : xs) = renderList (NewLine:xs) +renderList (BreakingSpace : BlankLine : xs) = renderList (BlankLine:xs) +renderList (BreakingSpace : BreakingSpace : xs) = renderList (BreakingSpace:xs) +renderList (BreakingSpace : xs) = do + let isText (Text _ _) = True + isText (Block _ _) = True + isText _ = False + let isBreakingSpace BreakingSpace = True + isBreakingSpace _ = False + let xs' = dropWhile isBreakingSpace xs + let next = takeWhile isText xs' + st <- get + let off = sum $ map offsetOf next + case lineLength st of + Just l | column st + 1 + off > l -> do + outp (-1) "\n" + renderList xs' + _ -> do + outp 1 " " + renderList xs' + +renderList (b1@Block{} : b2@Block{} : xs) = + renderList (mergeBlocks False b1 b2 : xs) + +renderList (b1@Block{} : BreakingSpace : b2@Block{} : xs) = + renderList (mergeBlocks True b1 b2 : xs) + +renderList (Block width lns : xs) = do + st <- get + let oldPref = prefix st + case column st - length oldPref of + n | n > 0 -> modify $ \s -> s{ prefix = oldPref ++ replicate n ' ' } + _ -> return () + renderDoc $ blockToDoc width lns + modify $ \s -> s{ prefix = oldPref } + renderList xs + +mergeBlocks :: Bool -> D -> D -> D +mergeBlocks addSpace (Block w1 lns1) (Block w2 lns2) = + Block (w1 + w2 + if addSpace then 1 else 0) $ + zipWith (\l1 l2 -> pad w1 l1 ++ l2) (lns1 ++ empties) (map sp lns2 ++ empties) + where empties = replicate (abs $ length lns1 - length lns2) "" + pad n s = s ++ replicate (n - length s) ' ' + sp "" = "" + sp xs = if addSpace then (' ' : xs) else xs +mergeBlocks _ _ _ = error "mergeBlocks tried on non-Block!" + +blockToDoc :: Int -> [String] -> Doc +blockToDoc _ lns = text $ intercalate "\n" lns + +offsetOf :: D -> Int +offsetOf (Text o _) = o +offsetOf (Block w _) = w +offsetOf BreakingSpace = 1 +offsetOf _ = 0 + +-- | A literal string. +text :: String -> Doc +text = Doc . toChunks + where toChunks :: String -> DList D + toChunks [] = mempty + toChunks s = case break (=='\n') s of + ([], _:ys) -> NewLine `cons` toChunks ys + (xs, _:ys) -> Text (length xs) xs `cons` + NewLine `cons` toChunks ys + (xs, []) -> singleton $ Text (length xs) xs + +-- | A character. +char :: Char -> Doc +char c = text [c] + +-- | A breaking (reflowable) space. +space :: Doc +space = Doc $ singleton BreakingSpace + +-- | A carriage return. Does nothing if we're at the beginning of +-- a line; otherwise inserts a newline. 
+cr :: Doc +cr = Doc $ singleton CarriageReturn + +-- | Inserts a blank line unless one exists already. +-- (@blankline <> blankline@ has the same effect as @blankline@. +-- If you want multiple blank lines, use @text "\\n\\n"@. +blankline :: Doc +blankline = Doc $ singleton BlankLine + +-- | Uses the specified string as a prefix for every line of +-- the inside document (except the first, if not at the beginning +-- of the line). +prefixed :: String -> Doc -> Doc +prefixed pref doc = Doc $ singleton $ Prefixed pref doc + +-- | Makes a 'Doc' flush against the left margin. +flush :: Doc -> Doc +flush doc = Doc $ singleton $ Flush doc + +-- | Indents a 'Doc' by the specified number of spaces. +nest :: Int -> Doc -> Doc +nest ind = prefixed (replicate ind ' ') + +-- | A hanging indent. @hang ind start doc@ prints @start@, +-- then @doc@, leaving an indent of @ind@ spaces on every +-- line but the first. +hang :: Int -> Doc -> Doc -> Doc +hang ind start doc = start <> nest ind doc + +-- | Makes a 'Doc' non-reflowable. +nowrap :: Doc -> Doc +nowrap doc = Doc $ fromList $ map replaceSpace $ toList $ unDoc doc + where replaceSpace BreakingSpace = Text 1 " " + replaceSpace x = x + +-- | Returns the width of a 'Doc'. +offset :: Doc -> Int +offset d = case map length . lines . render Nothing $ d of + [] -> 0 + os -> maximum os + +block :: (String -> String) -> Int -> Doc -> Doc +block filler width = Doc . singleton . Block width . + map filler . chop width . render (Just width) + +-- | @lblock n d@ is a block of width @n@ characters, with +-- text derived from @d@ and aligned to the left. +lblock :: Int -> Doc -> Doc +lblock = block id + +-- | Like 'lblock' but aligned to the right. +rblock :: Int -> Doc -> Doc +rblock w = block (\s -> replicate (w - length s) ' ' ++ s) w + +-- | Like 'lblock' but aligned centered. +cblock :: Int -> Doc -> Doc +cblock w = block (\s -> replicate ((w - length s) `div` 2) ' ' ++ s) w + +-- | Returns the height of a block or other 'Doc'. +height :: Doc -> Int +height = length . lines . render Nothing + +chop :: Int -> String -> [String] +chop _ [] = [] +chop n cs = case break (=='\n') cs of + (xs, ys) -> if len <= n + then case ys of + [] -> [xs] + (_:[]) -> [xs, ""] + (_:zs) -> xs : chop n zs + else take n xs : chop n (drop n xs ++ ys) + where len = length xs + +-- | Encloses a 'Doc' inside a start and end 'Doc'. +inside :: Doc -> Doc -> Doc -> Doc +inside start end contents = + start <> contents <> end + +-- | Puts a 'Doc' in curly braces. +braces :: Doc -> Doc +braces = inside (char '{') (char '}') + +-- | Puts a 'Doc' in square brackets. +brackets :: Doc -> Doc +brackets = inside (char '[') (char ']') + +-- | Puts a 'Doc' in parentheses. +parens :: Doc -> Doc +parens = inside (char '(') (char ')') + +-- | Wraps a 'Doc' in single quotes. +quotes :: Doc -> Doc +quotes = inside (char '\'') (char '\'') + +-- | Wraps a 'Doc' in double quotes. +doubleQuotes :: Doc -> Doc +doubleQuotes = inside (char '"') (char '"') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index f47309d3f..18e3113d3 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -27,43 +27,397 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of HTML to 'Pandoc' document. 
-} -module Text.Pandoc.Readers.HTML ( - readHtml, - rawHtmlInline, - rawHtmlBlock, - anyHtmlBlockTag, - anyHtmlInlineTag, - anyHtmlTag, - anyHtmlEndTag, - htmlEndTag, - extractTagType, - htmlBlockElement, - htmlComment, - unsanitaryURI +module Text.Pandoc.Readers.HTML ( readHtml + , htmlTag + , htmlInBalanced + , isInlineTag + , isBlockTag + , isTextTag + , isCommentTag ) where import Text.ParserCombinators.Parsec +import Text.ParserCombinators.Parsec.Pos +import Text.HTML.TagSoup +import Text.HTML.TagSoup.Match import Text.Pandoc.Definition +import Text.Pandoc.Builder (text, toList) import Text.Pandoc.Shared import Text.Pandoc.Parsing -import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) -import Data.Maybe ( fromMaybe ) -import Data.List ( isPrefixOf, isSuffixOf, intercalate ) -import Data.Char ( toLower, isAlphaNum ) -import Network.URI ( parseURIReference, URI (..) ) -import Control.Monad ( liftM, when ) +import Data.Maybe ( fromMaybe, isJust ) +import Data.List ( intercalate ) +import Data.Char ( isSpace, isDigit ) +import Control.Monad ( liftM, guard ) -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ParserState -- ^ Parser state -> String -- ^ String to parse (assumes @'\n'@ line endings) -> Pandoc -readHtml = readWith parseHtml +readHtml st inp = Pandoc meta blocks + where blocks = readWith parseBody st rest + tags = canonicalizeTags $ + parseTagsOptions parseOptions{ optTagPosition = True } inp + hasHeader = any (~== TagOpen "head" []) tags + (meta, rest) = if hasHeader + then parseHeader tags + else (Meta [] [] [], tags) + +type TagParser = GenParser (Tag String) ParserState + +-- TODO - fix this - not every header has a title tag +parseHeader :: [Tag String] -> (Meta, [Tag String]) +parseHeader tags = (Meta{docTitle = tit'', docAuthors = [], docDate = []}, rest) + where (tit,_) = break (~== TagClose "title") $ drop 1 $ + dropWhile (\t -> not $ t ~== TagOpen "title" []) tags + tit' = concatMap fromTagText $ filter isTagText tit + tit'' = normalizeSpaces $ toList $ text tit' + rest = drop 1 $ dropWhile (\t -> not $ t ~== TagClose "head" || + t ~== TagOpen "body" []) tags + +parseBody :: TagParser [Block] +parseBody = liftM concat $ manyTill block eof + +block :: TagParser [Block] +block = choice + [ pPara + , pHeader + , pBlockQuote + , pCodeBlock + , pList + , pHrule + , pSimpleTable + , pPlain + , pRawHtmlBlock + ] + +renderTags' :: [Tag String] -> String +renderTags' = renderTagsOptions + renderOptions{ optMinimize = (`elem` ["hr","br","img"]) } + +pList :: TagParser [Block] +pList = pBulletList <|> pOrderedList <|> pDefinitionList + +pBulletList :: TagParser [Block] +pBulletList = try $ do + pSatisfy (~== TagOpen "ul" []) + let nonItem = pSatisfy (\t -> + not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && + not (t ~== TagClose "ul")) + -- note: if they have an <ol> or <ul> not in scope of a <li>, + -- treat it as a list item, though it's not valid xhtml... 
+ skipMany nonItem + items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul") + return [BulletList items] + +pOrderedList :: TagParser [Block] +pOrderedList = try $ do + TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" []) + st <- getState + let (start, style) = if stateStrict st + then (1, DefaultStyle) + else (sta', sty') + where sta = fromMaybe "1" $ + lookup "start" attribs + sta' = if all isDigit sta + then read sta + else 1 + sty = fromMaybe (fromMaybe "" $ + lookup "style" attribs) $ + lookup "class" attribs + sty' = case sty of + "lower-roman" -> LowerRoman + "upper-roman" -> UpperRoman + "lower-alpha" -> LowerAlpha + "upper-alpha" -> UpperAlpha + "decimal" -> Decimal + _ -> DefaultStyle + let nonItem = pSatisfy (\t -> + not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && + not (t ~== TagClose "ol")) + -- note: if they have an <ol> or <ul> not in scope of a <li>, + -- treat it as a list item, though it's not valid xhtml... + skipMany nonItem + items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol") + return [OrderedList (start, style, DefaultDelim) items] + +pDefinitionList :: TagParser [Block] +pDefinitionList = try $ do + pSatisfy (~== TagOpen "dl" []) + items <- manyTill pDefListItem (pCloses "dl") + return [DefinitionList items] + +pDefListItem :: TagParser ([Inline],[[Block]]) +pDefListItem = try $ do + let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) && + not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl")) + terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline) + defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block) + skipMany nonItem + let term = intercalate [LineBreak] terms + return (term, defs) + +pRawTag :: TagParser String +pRawTag = do + tag <- pAnyTag + let ignorable x = x `elem` ["html","head","body"] + if tagOpen ignorable (const True) tag || tagClose ignorable tag + then return [] + else return $ renderTags' [tag] + +pRawHtmlBlock :: TagParser [Block] +pRawHtmlBlock = do + raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag + state <- getState + if stateParseRaw state && not (null raw) + then return [RawBlock "html" raw] + else return [] + +pHtmlBlock :: String -> TagParser String +pHtmlBlock t = try $ do + open <- pSatisfy (~== TagOpen t []) + contents <- manyTill pAnyTag (pSatisfy (~== TagClose t)) + return $ renderTags' $ [open] ++ contents ++ [TagClose t] + +pHeader :: TagParser [Block] +pHeader = try $ do + TagOpen tagtype attr <- pSatisfy $ + tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) + (const True) + let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")] + let level = read (drop 1 tagtype) + contents <- liftM concat $ manyTill inline (pCloses tagtype <|> eof) + return $ if bodyTitle + then [] -- skip a representation of the title in the body + else [Header level $ normalizeSpaces contents] + +pHrule :: TagParser [Block] +pHrule = do + pSelfClosing (=="hr") (const True) + return [HorizontalRule] + +pSimpleTable :: TagParser [Block] +pSimpleTable = try $ do + TagOpen _ _ <- pSatisfy (~== TagOpen "table" []) + skipMany pBlank + head' <- option [] $ pInTags "th" pTd + rows <- many1 $ try $ + skipMany pBlank >> pInTags "tr" pTd + skipMany pBlank + TagClose _ <- pSatisfy (~== TagClose "table") + let cols = maximum $ map length rows + let aligns = replicate cols AlignLeft + let widths = replicate cols 0 + return [Table [] aligns widths head' rows] + +pTd :: TagParser [TableCell] +pTd = try $ do + skipMany pBlank + res <- pInTags "td" pPlain + skipMany 
pBlank + return [res] + +pBlockQuote :: TagParser [Block] +pBlockQuote = do + contents <- pInTags "blockquote" block + return [BlockQuote contents] + +pPlain :: TagParser [Block] +pPlain = do + contents <- liftM (normalizeSpaces . concat) $ many1 inline + if null contents + then return [] + else return [Plain contents] + +pPara :: TagParser [Block] +pPara = do + contents <- pInTags "p" inline + return [Para $ normalizeSpaces contents] + +pCodeBlock :: TagParser [Block] +pCodeBlock = try $ do + TagOpen _ attr <- pSatisfy (~== TagOpen "pre" []) + contents <- manyTill pAnyTag (pCloses "pre" <|> eof) + let rawText = concatMap fromTagText $ filter isTagText contents + -- drop leading newline if any + let result' = case rawText of + '\n':xs -> xs + _ -> rawText + -- drop trailing newline if any + let result = case reverse result' of + '\n':_ -> init result' + _ -> result' + let attribsId = fromMaybe "" $ lookup "id" attr + let attribsClasses = words $ fromMaybe "" $ lookup "class" attr + let attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr + st <- getState + let attribs = if stateStrict st + then ("",[],[]) + else (attribsId, attribsClasses, attribsKV) + return [CodeBlock attribs result] + +inline :: TagParser [Inline] +inline = choice + [ pTagText + , pEmph + , pStrong + , pSuperscript + , pSubscript + , pStrikeout + , pLineBreak + , pLink + , pImage + , pCode + , pRawHtmlInline + ] + +pLocation :: TagParser () +pLocation = do + (TagPosition r c) <- pSat isTagPosition + setPosition $ newPos "input" r c + +pSat :: (Tag String -> Bool) -> TagParser (Tag String) +pSat f = do + pos <- getPosition + token show (const pos) (\x -> if f x then Just x else Nothing) + +pSatisfy :: (Tag String -> Bool) -> TagParser (Tag String) +pSatisfy f = try $ optional pLocation >> pSat f + +pAnyTag :: TagParser (Tag String) +pAnyTag = pSatisfy (const True) + +pSelfClosing :: (String -> Bool) -> ([Attribute String] -> Bool) + -> TagParser (Tag String) +pSelfClosing f g = do + open <- pSatisfy (tagOpen f g) + optional $ pSatisfy (tagClose f) + return open + +pEmph :: TagParser [Inline] +pEmph = pInlinesInTags "em" Emph <|> pInlinesInTags "i" Emph + +pStrong :: TagParser [Inline] +pStrong = pInlinesInTags "strong" Strong <|> pInlinesInTags "b" Strong + +pSuperscript :: TagParser [Inline] +pSuperscript = failIfStrict >> pInlinesInTags "sup" Superscript + +pSubscript :: TagParser [Inline] +pSubscript = failIfStrict >> pInlinesInTags "sub" Subscript + +pStrikeout :: TagParser [Inline] +pStrikeout = do + failIfStrict + pInlinesInTags "s" Strikeout <|> + pInlinesInTags "strike" Strikeout <|> + pInlinesInTags "del" Strikeout <|> + try (do pSatisfy (~== TagOpen "span" [("class","strikeout")]) + contents <- liftM concat $ manyTill inline (pCloses "span") + return [Strikeout contents]) + +pLineBreak :: TagParser [Inline] +pLineBreak = do + pSelfClosing (=="br") (const True) + return [LineBreak] + +pLink :: TagParser [Inline] +pLink = try $ do + tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href")) + let url = fromAttrib "href" tag + let title = fromAttrib "title" tag + lab <- liftM concat $ manyTill inline (pCloses "a") + return [Link (normalizeSpaces lab) (escapeURI url, title)] + +pImage :: TagParser [Inline] +pImage = do + tag <- pSelfClosing (=="img") (isJust . 
lookup "src") + let url = fromAttrib "src" tag + let title = fromAttrib "title" tag + let alt = fromAttrib "alt" tag + return [Image (toList $ text alt) (escapeURI url, title)] + +pCode :: TagParser [Inline] +pCode = try $ do + (TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) + result <- manyTill pAnyTag (pCloses open) + let ident = fromMaybe "" $ lookup "id" attr + let classes = words $ fromMaybe [] $ lookup "class" attr + let rest = filter (\(x,_) -> x /= "id" && x /= "class") attr + return [Code (ident,classes,rest) + $ intercalate " " $ lines $ innerText result] + +pRawHtmlInline :: TagParser [Inline] +pRawHtmlInline = do + result <- pSatisfy (tagComment (const True)) <|> pSatisfy isInlineTag + state <- getState + if stateParseRaw state + then return [RawInline "html" $ renderTags' [result]] + else return [] + +pInlinesInTags :: String -> ([Inline] -> Inline) + -> TagParser [Inline] +pInlinesInTags tagtype f = do + contents <- pInTags tagtype inline + return [f contents] + +pInTags :: String -> TagParser [a] + -> TagParser [a] +pInTags tagtype parser = try $ do + pSatisfy (~== TagOpen tagtype []) + liftM concat $ manyTill parser (pCloses tagtype <|> eof) + +pCloses :: String -> TagParser () +pCloses tagtype = try $ do + t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag + case t of + (TagClose t') | t' == tagtype -> pAnyTag >> return () + (TagOpen t' _) | t' `closes` tagtype -> return () + (TagClose "ul") | tagtype == "li" -> return () + (TagClose "ol") | tagtype == "li" -> return () + (TagClose "dl") | tagtype == "li" -> return () + _ -> pzero + +pTagText :: TagParser [Inline] +pTagText = try $ do + (TagText str) <- pSatisfy isTagText + st <- getState + case runParser (many pTagContents) st "text" str of + Left _ -> fail $ "Could not parse `" ++ str ++ "'" + Right result -> return result + +pBlank :: TagParser () +pBlank = try $ do + (TagText str) <- pSatisfy isTagText + guard $ all isSpace str + +pTagContents :: GenParser Char ParserState Inline +pTagContents = pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol + +pStr :: GenParser Char ParserState Inline +pStr = liftM Str $ many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) + +isSpecial :: Char -> Bool +isSpecial '"' = True +isSpecial '\'' = True +isSpecial '.' = True +isSpecial '-' = True +isSpecial '\8216' = True +isSpecial '\8217' = True +isSpecial '\8220' = True +isSpecial '\8221' = True +isSpecial _ = False + +pSymbol :: GenParser Char ParserState Inline +pSymbol = satisfy isSpecial >>= return . Str . 
(:[]) + +pSpace :: GenParser Char ParserState Inline +pSpace = many1 (satisfy isSpace) >> return Space -- -- Constants -- -eitherBlockOrInline :: [[Char]] +eitherBlockOrInline :: [String] eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", "map", "area", "object"] @@ -76,57 +430,41 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", "textarea", "tt", "u", "var"] -} -blockHtmlTags :: [[Char]] +blockHtmlTags :: [String] blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div", "dl", "fieldset", "form", "h1", "h2", "h3", "h4", - "h5", "h6", "head", "hr", "html", "isindex", "menu", "noframes", - "noscript", "ol", "p", "pre", "table", "ul", "dd", + "h5", "h6", "head", "hr", "html", "isindex", "menu", + "noframes", "noscript", "ol", "p", "pre", "table", "ul", "dd", "dt", "frameset", "li", "tbody", "td", "tfoot", "th", "thead", "tr", "script", "style"] -sanitaryTags :: [[Char]] -sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big", - "blockquote", "br", "button", "caption", "center", - "cite", "code", "col", "colgroup", "dd", "del", "dfn", - "dir", "div", "dl", "dt", "em", "fieldset", "font", - "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr", - "i", "img", "input", "ins", "kbd", "label", "legend", - "li", "map", "menu", "ol", "optgroup", "option", "p", - "pre", "q", "s", "samp", "select", "small", "span", - "strike", "strong", "sub", "sup", "table", "tbody", - "td", "textarea", "tfoot", "th", "thead", "tr", "tt", - "u", "ul", "var"] - -sanitaryAttributes :: [[Char]] -sanitaryAttributes = ["abbr", "accept", "accept-charset", - "accesskey", "action", "align", "alt", "axis", - "border", "cellpadding", "cellspacing", "char", - "charoff", "charset", "checked", "cite", "class", - "clear", "cols", "colspan", "color", "compact", - "coords", "datetime", "dir", "disabled", - "enctype", "for", "frame", "headers", "height", - "href", "hreflang", "hspace", "id", "ismap", - "label", "lang", "longdesc", "maxlength", "media", - "method", "multiple", "name", "nohref", "noshade", - "nowrap", "prompt", "readonly", "rel", "rev", - "rows", "rowspan", "rules", "scope", "selected", - "shape", "size", "span", "src", "start", - "summary", "tabindex", "target", "title", "type", - "usemap", "valign", "value", "vspace", "width"] +isInlineTag :: Tag String -> Bool +isInlineTag t = tagOpen (`notElem` blockHtmlTags) (const True) t || + tagClose (`notElem` blockHtmlTags) t || + tagComment (const True) t + +isBlockTag :: Tag String -> Bool +isBlockTag t = tagOpen (`elem` blocktags) (const True) t || + tagClose (`elem` blocktags) t || + tagComment (const True) t + where blocktags = blockHtmlTags ++ eitherBlockOrInline + +isTextTag :: Tag String -> Bool +isTextTag = tagText (const True) + +isCommentTag :: Tag String -> Bool +isCommentTag = tagComment (const True) -- taken from HXT and extended closes :: String -> String -> Bool -"EOF" `closes` _ = True _ `closes` "body" = False _ `closes` "html" = False "a" `closes` "a" = True "li" `closes` "li" = True "th" `closes` t | t `elem` ["th","td"] = True -"td" `closes` t | t `elem` ["th","td"] = True "tr" `closes` t | t `elem` ["th","td","tr"] = True "dt" `closes` t | t `elem` ["dt","dd"] = True -"dd" `closes` t | t `elem` ["dt","dd"] = True "hr" `closes` "p" = True "p" `closes` "p" = True "meta" `closes` "meta" = True @@ -148,627 +486,34 @@ t1 `closes` t2 | t2 `notElem` (blockHtmlTags ++ eitherBlockOrInline) = True _ `closes` _ = False --- --- HTML utility functions --- - --- | Returns @True@ if sanitization is 
specified and the specified tag is --- not on the sanitized tag list. -unsanitaryTag :: [Char] - -> GenParser tok ParserState Bool -unsanitaryTag tag = do - st <- getState - return $ stateSanitizeHTML st && tag `notElem` sanitaryTags - --- | returns @True@ if sanitization is specified and the specified attribute --- is not on the sanitized attribute list. -unsanitaryAttribute :: ([Char], String, t) - -> GenParser tok ParserState Bool -unsanitaryAttribute (attr, val, _) = do - st <- getState - return $ stateSanitizeHTML st && - (attr `notElem` sanitaryAttributes || - (attr `elem` ["href","src"] && unsanitaryURI val)) - --- | Returns @True@ if the specified URI is potentially a security risk. -unsanitaryURI :: String -> Bool -unsanitaryURI u = - let safeURISchemes = [ "", "http:", "https:", "ftp:", "mailto:", "file:", - "telnet:", "gopher:", "aaa:", "aaas:", "acap:", "cap:", "cid:", - "crid:", "dav:", "dict:", "dns:", "fax:", "go:", "h323:", "im:", - "imap:", "ldap:", "mid:", "news:", "nfs:", "nntp:", "pop:", - "pres:", "sip:", "sips:", "snmp:", "tel:", "urn:", "wais:", - "xmpp:", "z39.50r:", "z39.50s:", "aim:", "callto:", "cvs:", - "ed2k:", "feed:", "fish:", "gg:", "irc:", "ircs:", "lastfm:", - "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:", - "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:", - "snews:", "webcal:", "ymsgr:"] - in case parseURIReference (escapeURI u) of - Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes - Nothing -> True - --- | Read blocks until end tag. -blocksTilEnd :: String -> GenParser Char ParserState [Block] -blocksTilEnd tag = do - blocks <- manyTill (block >>~ spaces) (htmlEndTag tag) - return $ filter (/= Null) blocks - --- | Read inlines until end tag. -inlinesTilEnd :: String -> GenParser Char ParserState [Inline] -inlinesTilEnd tag = manyTill inline (htmlEndTag tag) - --- | Parse blocks between open and close tag. -blocksIn :: String -> GenParser Char ParserState [Block] -blocksIn tag = try $ htmlOpenTag tag >> spaces >> blocksTilEnd tag - --- | Parse inlines between open and close tag. -inlinesIn :: String -> GenParser Char ParserState [Inline] -inlinesIn tag = try $ htmlOpenTag tag >> spaces >> inlinesTilEnd tag - --- | Extract type from a tag: e.g. 
@br@ from @\<br\>@ -extractTagType :: String -> String -extractTagType ('<':rest) = - let isSpaceOrSlash c = c `elem` "/ \n\t" in - map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest -extractTagType _ = "" - --- Parse any HTML tag (opening or self-closing) and return tag type -anyOpener :: GenParser Char ParserState [Char] -anyOpener = try $ do - char '<' - spaces - tag <- many1 alphaNum - skipMany htmlAttribute - spaces - option "" (string "/") - spaces - char '>' - return $ map toLower tag - --- | Parse any HTML tag (opening or self-closing) and return text of tag -anyHtmlTag :: GenParser Char ParserState [Char] -anyHtmlTag = try $ do - char '<' - spaces - tag <- many1 alphaNum - attribs <- many htmlAttribute - spaces - ender <- option "" (string "/") - let ender' = if null ender then "" else " /" - spaces - char '>' - let result = "<" ++ tag ++ - concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" - unsanitary <- unsanitaryTag tag - if unsanitary - then return $ "<!-- unsafe HTML removed -->" - else return result - -anyHtmlEndTag :: GenParser Char ParserState [Char] -anyHtmlEndTag = try $ do - char '<' - spaces - char '/' - spaces - tag <- many1 alphaNum - spaces - char '>' - let result = "</" ++ tag ++ ">" - unsanitary <- unsanitaryTag tag - if unsanitary - then return $ "<!-- unsafe HTML removed -->" - else return result - -htmlTag :: Bool - -> String - -> GenParser Char ParserState (String, [(String, String)]) -htmlTag selfClosing tag = try $ do - char '<' - spaces - stringAnyCase tag - attribs <- many htmlAttribute - spaces - -- note: we want to handle both HTML and XHTML, - -- so we don't require the / - when selfClosing $ optional $ char '/' >> spaces - char '>' - return (tag, (map (\(name, content, _) -> (name, content)) attribs)) - -htmlOpenTag :: String - -> GenParser Char ParserState (String, [(String, String)]) -htmlOpenTag = htmlTag False - -htmlCloseTag :: String - -> GenParser Char ParserState (String, [(String, String)]) -htmlCloseTag = htmlTag False . 
('/':) - -htmlSelfClosingTag :: String - -> GenParser Char ParserState (String, [(String, String)]) -htmlSelfClosingTag = htmlTag True - --- parses a quoted html attribute value -quoted :: Char -> GenParser Char st (String, String) -quoted quoteChar = do - result <- between (char quoteChar) (char quoteChar) - (many (noneOf [quoteChar])) - return (result, [quoteChar]) - -nullAttribute :: ([Char], [Char], [Char]) -nullAttribute = ("", "", "") - -htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char]) -htmlAttribute = do - attr <- htmlRegularAttribute <|> htmlMinimizedAttribute - unsanitary <- unsanitaryAttribute attr - if unsanitary - then return nullAttribute - else return attr - --- minimized boolean attribute -htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char]) -htmlMinimizedAttribute = try $ do - many1 space - name <- many1 (choice [letter, oneOf ".-_:"]) - return (name, name, name) - -htmlRegularAttribute :: GenParser Char st ([Char], [Char], [Char]) -htmlRegularAttribute = try $ do - many1 space - name <- many1 (choice [letter, oneOf ".-_:"]) - spaces - char '=' - spaces - (content, quoteStr) <- choice [ (quoted '\''), - (quoted '"'), - (do - a <- many (noneOf " \t\n\r\"'<>") - return (a,"")) ] - return (name, content, - (name ++ "=" ++ quoteStr ++ content ++ quoteStr)) - --- | Parse an end tag of type 'tag' -htmlEndTag :: [Char] -> GenParser Char ParserState [Char] -htmlEndTag tag = try $ do - closedByNext <- lookAhead $ option False $ liftM (`closes` tag) $ - anyOpener <|> (eof >> return "EOF") - if closedByNext - then return "" - else do char '<' - spaces - char '/' - spaces - stringAnyCase tag - spaces - char '>' - return $ "</" ++ tag ++ ">" - --- | Returns @True@ if the tag is (or can be) a block tag. -isBlock :: String -> Bool -isBlock tag = (extractTagType tag) `elem` (blockHtmlTags ++ eitherBlockOrInline) - -anyHtmlBlockTag :: GenParser Char ParserState [Char] -anyHtmlBlockTag = try $ do - tag <- anyHtmlTag <|> anyHtmlEndTag - if isBlock tag then return tag else fail "not a block tag" - -anyHtmlInlineTag :: GenParser Char ParserState [Char] -anyHtmlInlineTag = try $ do - tag <- anyHtmlTag <|> anyHtmlEndTag - if not (isBlock tag) then return tag else fail "not an inline tag" - --- | Parses material between script tags. --- Scripts must be treated differently, because they can contain '<>' etc. 
-htmlScript :: GenParser Char ParserState [Char] -htmlScript = try $ do - lookAhead $ htmlOpenTag "script" - open <- anyHtmlTag - rest <- liftM concat $ manyTill scriptChunk (htmlEndTag "script") - st <- getState - if stateSanitizeHTML st && not ("script" `elem` sanitaryTags) - then return "<!-- unsafe HTML removed -->" - else return $ open ++ rest ++ "</script>" - -scriptChunk :: GenParser Char ParserState [Char] -scriptChunk = jsComment <|> jsString <|> jsChars - where jsComment = jsEndlineComment <|> jsMultilineComment - jsString = jsSingleQuoteString <|> jsDoubleQuoteString - jsChars = many1 (noneOf "<\"'*/") <|> count 1 anyChar - jsEndlineComment = try $ do - string "//" - res <- manyTill anyChar newline - return ("//" ++ res) - jsMultilineComment = try $ do - string "/*" - res <- manyTill anyChar (try $ string "*/") - return ("/*" ++ res ++ "*/") - jsSingleQuoteString = stringwith '\'' - jsDoubleQuoteString = stringwith '"' - charWithEsc escapable = try $ - (try $ char '\\' >> oneOf ('\\':escapable) >>= \x -> return ['\\',x]) - <|> count 1 anyChar - stringwith c = try $ do - char c - res <- liftM concat $ manyTill (charWithEsc [c]) (char c) - return (c : (res ++ [c])) - --- | Parses material between style tags. --- Style tags must be treated differently, because they can contain CSS -htmlStyle :: GenParser Char ParserState [Char] -htmlStyle = try $ do - lookAhead $ htmlOpenTag "style" - open <- anyHtmlTag - rest <- manyTill anyChar (htmlEndTag "style") - st <- getState - if stateSanitizeHTML st && not ("style" `elem` sanitaryTags) - then return "<!-- unsafe HTML removed -->" - else return $ open ++ rest ++ "</style>" - -htmlBlockElement :: GenParser Char ParserState [Char] -htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ] - -rawHtmlBlock :: GenParser Char ParserState Block -rawHtmlBlock = try $ do - body <- htmlBlockElement <|> rawVerbatimBlock <|> anyHtmlBlockTag - state <- getState - if stateParseRaw state then return (RawHtml body) else return Null - --- This is a block whose contents should be passed through verbatim, not interpreted. -rawVerbatimBlock :: GenParser Char ParserState [Char] -rawVerbatimBlock = try $ do - start <- anyHtmlBlockTag - let tagtype = extractTagType start - if tagtype `elem` ["pre"] - then do - contents <- many (notFollowedBy' (htmlEndTag tagtype) >> anyChar) - end <- htmlEndTag tagtype - return $ start ++ contents ++ end - else fail "Not a verbatim block" - --- We don't want to parse </body> or </html> as raw HTML, since these --- are handled in parseHtml. -rawHtmlBlock' :: GenParser Char ParserState Block -rawHtmlBlock' = do notFollowedBy' (htmlCloseTag "body" <|> - htmlCloseTag "html") - rawHtmlBlock - --- | Parses an HTML comment. -htmlComment :: GenParser Char st [Char] -htmlComment = try $ do - string "<!--" - comment <- many $ noneOf "-" - <|> try (char '-' >>~ notFollowedBy (try (char '-' >> char '>'))) - string "-->" - return $ "<!--" ++ comment ++ "-->" - --- --- parsing documents --- - -xmlDec :: GenParser Char st [Char] -xmlDec = try $ do - string "<?" - rest <- manyTill anyChar (char '>') - return $ "<?" ++ rest ++ ">" - -definition :: GenParser Char st [Char] -definition = try $ do - string "<!" - rest <- manyTill anyChar (char '>') - return $ "<!" 
++ rest ++ ">" - -nonTitleNonHead :: GenParser Char ParserState Char -nonTitleNonHead = try $ do - notFollowedBy $ (htmlOpenTag "title" >> return ' ') <|> - (htmlEndTag "head" >> return ' ') - (rawHtmlBlock >> return ' ') <|> anyChar - -parseTitle :: GenParser Char ParserState [Inline] -parseTitle = try $ do - (tag, _) <- htmlOpenTag "title" - contents <- inlinesTilEnd tag - spaces - return contents - --- parse header and return meta-information (for now, just title) -parseHead :: GenParser Char ParserState Meta -parseHead = try $ do - htmlOpenTag "head" - spaces - skipMany nonTitleNonHead - contents <- option [] parseTitle - skipMany nonTitleNonHead - htmlEndTag "head" - return $ Meta contents [] [] - --- h1 class="title" representation of title in body -bodyTitle :: GenParser Char ParserState [Inline] -bodyTitle = try $ do - (_, attribs) <- htmlOpenTag "h1" - case (extractAttribute "class" attribs) of - Just "title" -> return "" - _ -> fail "not title" - inlinesTilEnd "h1" - -endOfDoc :: GenParser Char ParserState () -endOfDoc = try $ do - spaces - optional (htmlEndTag "body") - spaces - optional (htmlEndTag "html" >> many anyChar) -- ignore stuff after </html> - eof - -parseHtml :: GenParser Char ParserState Pandoc -parseHtml = do - sepEndBy (choice [xmlDec, definition, htmlComment]) spaces - spaces - optional $ htmlOpenTag "html" - spaces - meta <- option (Meta [] [] []) parseHead - spaces - optional $ htmlOpenTag "body" - spaces - optional bodyTitle -- skip title in body, because it's represented in meta - blocks <- parseBlocks - endOfDoc - return $ Pandoc meta blocks - --- --- parsing blocks --- - -parseBlocks :: GenParser Char ParserState [Block] -parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null)) - -block :: GenParser Char ParserState Block -block = choice [ codeBlock - , header - , hrule - , list - , blockQuote - , para - , plain - , rawHtmlBlock' - , notFollowedBy' endOfDoc >> char '<' >> return Null - ] <?> "block" - --- --- header blocks --- - -header :: GenParser Char ParserState Block -header = choice (map headerLevel (enumFromTo 1 5)) <?> "header" - -headerLevel :: Int -> GenParser Char ParserState Block -headerLevel n = try $ do - let level = "h" ++ show n - htmlOpenTag level - contents <- inlinesTilEnd level - return $ Header n (normalizeSpaces contents) - --- --- hrule block --- - -hrule :: GenParser Char ParserState Block -hrule = try $ do - (_, attribs) <- htmlSelfClosingTag "hr" - state <- getState - if not (null attribs) && stateParseRaw state - then unexpected "attributes in hr" -- parse as raw in this case - else return HorizontalRule - --- --- code blocks --- - --- Note: HTML tags in code blocks (e.g. for syntax highlighting) are --- skipped, because they are not portable to output formats other than HTML. -codeBlock :: GenParser Char ParserState Block -codeBlock = try $ do - htmlOpenTag "pre" - result <- manyTill - (many1 (satisfy (/= '<')) <|> - ((anyHtmlTag <|> anyHtmlEndTag) >> return "")) - (htmlEndTag "pre") - let result' = concat result - -- drop leading newline if any - let result'' = if "\n" `isPrefixOf` result' - then drop 1 result' - else result' - -- drop trailing newline if any - let result''' = if "\n" `isSuffixOf` result'' - then init result'' - else result'' - return $ CodeBlock ("",[],[]) $ decodeCharacterReferences result''' - --- --- block quotes --- - -blockQuote :: GenParser Char ParserState Block -blockQuote = try $ htmlOpenTag "blockquote" >> spaces >> - blocksTilEnd "blockquote" >>= (return . 
BlockQuote) - --- --- list blocks --- - -list :: GenParser Char ParserState Block -list = choice [ bulletList, orderedList, definitionList ] <?> "list" - -orderedList :: GenParser Char ParserState Block -orderedList = try $ do - (_, attribs) <- htmlOpenTag "ol" - (start, style) <- option (1, DefaultStyle) $ - do failIfStrict - let sta = fromMaybe "1" $ - lookup "start" attribs - let sty = fromMaybe (fromMaybe "" $ - lookup "style" attribs) $ - lookup "class" attribs - let sty' = case sty of - "lower-roman" -> LowerRoman - "upper-roman" -> UpperRoman - "lower-alpha" -> LowerAlpha - "upper-alpha" -> UpperAlpha - "decimal" -> Decimal - _ -> DefaultStyle - return (read sta, sty') - spaces - -- note: if they have an <ol> or <ul> not in scope of a <li>, - -- treat it as a list item, though it's not valid xhtml... - items <- sepEndBy1 (blocksIn "li" <|> liftM (:[]) list) spaces - htmlEndTag "ol" - return $ OrderedList (start, style, DefaultDelim) items - -bulletList :: GenParser Char ParserState Block -bulletList = try $ do - htmlOpenTag "ul" - spaces - -- note: if they have an <ol> or <ul> not in scope of a <li>, - -- treat it as a list item, though it's not valid xhtml... - items <- sepEndBy1 (blocksIn "li" <|> liftM (:[]) list) spaces - htmlEndTag "ul" - return $ BulletList items - -definitionList :: GenParser Char ParserState Block -definitionList = try $ do - failIfStrict -- def lists not part of standard markdown - htmlOpenTag "dl" - spaces - items <- sepEndBy1 definitionListItem spaces - htmlEndTag "dl" - return $ DefinitionList items - -definitionListItem :: GenParser Char ParserState ([Inline], [[Block]]) -definitionListItem = try $ do - terms <- sepEndBy1 (inlinesIn "dt") spaces - defs <- sepEndBy1 (blocksIn "dd") spaces - let term = intercalate [LineBreak] terms - return (term, defs) - --- --- paragraph block --- - -para :: GenParser Char ParserState Block -para = try $ htmlOpenTag "p" >> inlinesTilEnd "p" >>= - return . Para . normalizeSpaces - --- --- plain block --- - -plain :: GenParser Char ParserState Block -plain = many1 inline >>= return . Plain . normalizeSpaces - --- --- inline --- - -inline :: GenParser Char ParserState Inline -inline = choice [ charRef - , strong - , emph - , superscript - , subscript - , strikeout - , spanStrikeout - , code - , str - , linebreak - , whitespace - , link - , image - , rawHtmlInline - , char '&' >> return (Str "&") -- common HTML error - ] <?> "inline" - -code :: GenParser Char ParserState Inline -code = try $ do - result <- (htmlOpenTag "code" >> manyTill anyChar (htmlEndTag "code")) - <|> (htmlOpenTag "tt" >> manyTill anyChar (htmlEndTag "tt")) - -- remove internal line breaks, leading and trailing space, - -- and decode character references - return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ - intercalate " " $ lines result - -rawHtmlInline :: GenParser Char ParserState Inline -rawHtmlInline = do - result <- anyHtmlInlineTag <|> htmlComment - state <- getState - if stateParseRaw state then return (HtmlInline result) else return (Str "") - -betweenTags :: [Char] -> GenParser Char ParserState [Inline] -betweenTags tag = try $ htmlOpenTag tag >> inlinesTilEnd tag >>= - return . normalizeSpaces - -emph :: GenParser Char ParserState Inline -emph = (betweenTags "em" <|> betweenTags "i") >>= return . Emph - -strong :: GenParser Char ParserState Inline -strong = (betweenTags "b" <|> betweenTags "strong") >>= return . 
Strong - -superscript :: GenParser Char ParserState Inline -superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript - -subscript :: GenParser Char ParserState Inline -subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript - -strikeout :: GenParser Char ParserState Inline -strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>= - return . Strikeout - -spanStrikeout :: GenParser Char ParserState Inline -spanStrikeout = try $ do - failIfStrict -- strict markdown has no strikeout, so treat as raw HTML - (_, attributes) <- htmlOpenTag "span" - result <- case (extractAttribute "class" attributes) of - Just "strikeout" -> inlinesTilEnd "span" - _ -> fail "not a strikeout" - return $ Strikeout result - -whitespace :: GenParser Char st Inline -whitespace = many1 space >> return Space - --- hard line break -linebreak :: GenParser Char ParserState Inline -linebreak = htmlSelfClosingTag "br" >> optional newline >> return LineBreak - -str :: GenParser Char st Inline -str = many1 (noneOf "< \t\n&") >>= return . Str - --- --- links and images --- - --- extract contents of attribute (attribute names are case-insensitive) -extractAttribute :: [Char] -> [([Char], String)] -> Maybe String -extractAttribute _ [] = Nothing -extractAttribute name ((attrName, contents):rest) = - let name' = map toLower name - attrName' = map toLower attrName - in if attrName' == name' - then Just (decodeCharacterReferences contents) - else extractAttribute name rest - -link :: GenParser Char ParserState Inline -link = try $ do - (_, attributes) <- htmlOpenTag "a" - url <- case (extractAttribute "href" attributes) of - Just url -> return url - Nothing -> fail "no href" - let title = fromMaybe "" $ extractAttribute "title" attributes - lab <- inlinesTilEnd "a" - return $ Link (normalizeSpaces lab) (escapeURI url, title) - -image :: GenParser Char ParserState Inline -image = try $ do - (_, attributes) <- htmlSelfClosingTag "img" - url <- case (extractAttribute "src" attributes) of - Just url -> return url - Nothing -> fail "no src" - let title = fromMaybe "" $ extractAttribute "title" attributes - let alt = fromMaybe "" (extractAttribute "alt" attributes) - return $ Image [Str alt] (escapeURI url, title) - +--- parsers for use in markdown, textile readers + +-- | Matches a stretch of HTML in balanced tags. +htmlInBalanced :: (Tag String -> Bool) -> GenParser Char ParserState String +htmlInBalanced f = try $ do + (TagOpen t _, tag) <- htmlTag f + guard $ '/' `notElem` tag -- not a self-closing tag + let nonTagChunk = many1 $ satisfy (/= '<') + let stopper = htmlTag (~== TagClose t) + let anytag = liftM snd $ htmlTag (const True) + contents <- many $ notFollowedBy' stopper >> + (nonTagChunk <|> htmlInBalanced (const True) <|> anytag) + endtag <- liftM snd stopper + return $ tag ++ concat contents ++ endtag + +-- | Matches a tag meeting a certain condition. +htmlTag :: (Tag String -> Bool) -> GenParser Char ParserState (Tag String, String) +htmlTag f = try $ do + lookAhead (char '<') + (next : _) <- getInput >>= return . canonicalizeTags . 
parseTags + guard $ f next + -- advance the parser + case next of + TagComment s -> do + count (length s + 4) anyChar + skipMany (satisfy (/='>')) + char '>' + return (next, "<!--" ++ s ++ "-->") + _ -> do + rendered <- manyTill anyChar (char '>') + return (next, rendered ++ ">") diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 406809dfc..dca745b56 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -38,9 +38,9 @@ import Text.Pandoc.Definition import Text.Pandoc.Shared import Text.Pandoc.Parsing import Data.Maybe ( fromMaybe ) -import Data.Char ( chr ) -import Data.List ( isPrefixOf, isSuffixOf ) -import Control.Monad ( when ) +import Data.Char ( chr, toUpper ) +import Data.List ( intercalate, isPrefixOf, isSuffixOf ) +import Control.Monad -- | Parse LaTeX from string and return 'Pandoc' document. readLaTeX :: ParserState -- ^ Parser state, including options for parser @@ -50,7 +50,7 @@ readLaTeX = readWith parseLaTeX -- characters with special meaning specialChars :: [Char] -specialChars = "\\`$%^&_~#{}\n \t|<>'\"-" +specialChars = "\\`$%^&_~#{}[]\n \t|<>'\"-" -- -- utility functions @@ -64,7 +64,7 @@ bracketedText openB closeB = do -- | Returns an option or argument of a LaTeX command. optOrArg :: GenParser Char st [Char] -optOrArg = bracketedText '{' '}' <|> bracketedText '[' ']' +optOrArg = try $ spaces >> (bracketedText '{' '}' <|> bracketedText '[' ']') -- | True if the string begins with '{'. isArg :: [Char] -> Bool @@ -86,14 +86,22 @@ command = do begin :: [Char] -> GenParser Char st [Char] begin name = try $ do - string $ "\\begin{" ++ name ++ "}" + string "\\begin" + spaces + char '{' + string name + char '}' optional commandArgs spaces return name end :: [Char] -> GenParser Char st [Char] end name = try $ do - string $ "\\end{" ++ name ++ "}" + string "\\end" + spaces + char '{' + string name + char '}' return name -- | Returns a list of block elements containing the contents of an @@ -103,7 +111,9 @@ environment name = try $ begin name >> spaces >> manyTill block (end name) >>~ s anyEnvironment :: GenParser Char ParserState Block anyEnvironment = try $ do - string "\\begin{" + string "\\begin" + spaces + char '{' name <- many letter star <- option "" (string "*") -- some environments have starred variants char '}' @@ -119,22 +129,17 @@ anyEnvironment = try $ do -- | Process LaTeX preamble, extracting metadata. processLaTeXPreamble :: GenParser Char ParserState () -processLaTeXPreamble = try $ manyTill - (choice [bibliographic, comment, unknownCommand, nullBlock]) - (try (string "\\begin{document}")) >> - spaces +processLaTeXPreamble = do + try $ string "\\documentclass" + skipMany $ bibliographic <|> macro <|> commentBlock <|> skipChar -- | Parse LaTeX and return 'Pandoc'. 
parseLaTeX :: GenParser Char ParserState Pandoc parseLaTeX = do - optional processLaTeXPreamble -- preamble might not be present (fragment) - spaces - blocks <- parseBlocks spaces - optional $ try (string "\\end{document}" >> many anyChar) - -- might not be present (fragment) - spaces - eof + skipMany $ comment >> spaces + blocks <- try (processLaTeXPreamble >> environment "document") + <|> (many block >>~ (spaces >> eof)) state <- getState let blocks' = filter (/= Null) blocks let title' = stateTitle state @@ -155,13 +160,16 @@ block = choice [ hrule , header , list , blockQuote - , comment + , simpleTable + , commentBlock + , macro , bibliographic , para , itemBlock , unknownEnvironment , ignore - , unknownCommand ] <?> "block" + , unknownCommand + ] <?> "block" -- -- header blocks @@ -208,20 +216,77 @@ hrule :: GenParser Char st Block hrule = oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n", "\\newpage" ] >> spaces >> return HorizontalRule +-- tables + +simpleTable :: GenParser Char ParserState Block +simpleTable = try $ do + string "\\begin" + spaces + string "{tabular}" + spaces + aligns <- parseAligns + let cols = length aligns + optional hline + header' <- option [] $ parseTableHeader cols + rows <- many (parseTableRow cols >>~ optional hline) + spaces + end "tabular" + spaces + let header'' = if null header' + then replicate cols [] + else header' + return $ Table [] aligns (replicate cols 0) header'' rows + +hline :: GenParser Char st () +hline = try $ spaces >> string "\\hline" >> return () + +parseAligns :: GenParser Char ParserState [Alignment] +parseAligns = try $ do + char '{' + optional $ char '|' + let cAlign = char 'c' >> return AlignCenter + let lAlign = char 'l' >> return AlignLeft + let rAlign = char 'r' >> return AlignRight + let alignChar = cAlign <|> lAlign <|> rAlign + aligns' <- sepEndBy alignChar (optional $ char '|') + char '}' + spaces + return aligns' + +parseTableHeader :: Int -- ^ number of columns + -> GenParser Char ParserState [TableCell] +parseTableHeader cols = try $ do + cells' <- parseTableRow cols + hline + return cells' + +parseTableRow :: Int -- ^ number of columns + -> GenParser Char ParserState [TableCell] +parseTableRow cols = try $ do + let tableCellInline = notFollowedBy (char '&' <|> + (try $ char '\\' >> char '\\')) >> inline + cells' <- sepBy (spaces >> liftM ((:[]) . Plain . normalizeSpaces) + (many tableCellInline)) (char '&') + guard $ length cells' == cols + spaces + (try $ string "\\\\" >> spaces) <|> + (lookAhead (end "tabular") >> return ()) + return cells' + -- -- code blocks -- codeBlock :: GenParser Char ParserState Block -codeBlock = codeBlockWith "verbatim" <|> codeBlockWith "Verbatim" <|> lhsCodeBlock +codeBlock = codeBlockWith "verbatim" <|> codeBlockWith "Verbatim" <|> codeBlockWith "lstlisting" <|> lhsCodeBlock -- Note: Verbatim is from fancyvrb. 
codeBlockWith :: String -> GenParser Char st Block codeBlockWith env = try $ do - string ("\\begin{" ++ env ++ "}") -- don't use begin function because it - -- gobbles whitespace - optional blanklines -- we want to gobble blank lines, but not - -- leading space + string "\\begin" + spaces -- don't use begin function because it + string $ "{" ++ env ++ "}" -- gobbles whitespace; we want to gobble + optional blanklines -- blank lines, but not leading space contents <- manyTill anyChar (try (string $ "\\end{" ++ env ++ "}")) spaces let classes = if env == "code" then ["haskell"] else [] @@ -265,7 +330,10 @@ listItem = try $ do orderedList :: GenParser Char ParserState Block orderedList = try $ do - string "\\begin{enumerate}" + string "\\begin" + spaces + string "{enumerate}" + spaces (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) $ try $ do failIfStrict char '[' @@ -293,7 +361,6 @@ orderedList = try $ do bulletList :: GenParser Char ParserState Block bulletList = try $ do begin "itemize" - spaces items <- many listItem end "itemize" spaces @@ -302,7 +369,6 @@ bulletList = try $ do definitionList :: GenParser Char ParserState Block definitionList = try $ do begin "description" - spaces items <- many listItem end "description" spaces @@ -342,7 +408,7 @@ authors :: GenParser Char ParserState Block authors = try $ do string "\\author{" raw <- many1 (notFollowedBy (char '}') >> inline) - let authors' = map normalizeSpaces $ splitBy LineBreak raw + let authors' = map normalizeSpaces $ splitBy (== LineBreak) raw char '}' spaces updateState (\s -> s { stateAuthors = authors' }) @@ -382,13 +448,15 @@ rawLaTeXEnvironment :: GenParser Char st Block rawLaTeXEnvironment = do contents <- rawLaTeXEnvironment' spaces - return $ Para [TeX contents] + return $ RawBlock "latex" contents -- | Parse any LaTeX environment and return a string containing -- the whole literal environment as raw TeX. rawLaTeXEnvironment' :: GenParser Char st String rawLaTeXEnvironment' = try $ do - string "\\begin{" + string "\\begin" + spaces + char '{' name <- many1 letter star <- option "" (string "*") -- for starred variants let name' = name ++ star @@ -418,31 +486,49 @@ ignore = try $ do spaces return Null +demacro :: (String, String, [String]) -> GenParser Char ParserState Inline +demacro (n,st,args) = try $ do + let raw = "\\" ++ n ++ st ++ concat args + s' <- applyMacros' raw + if raw == s' + then return $ RawInline "latex" raw + else do + inp <- getInput + setInput $ s' ++ inp + return $ Str "" + unknownCommand :: GenParser Char ParserState Block unknownCommand = try $ do - notFollowedBy' $ choice $ map end ["itemize", "enumerate", "description", - "document"] + spaces + notFollowedBy' $ oneOfStrings ["\\begin","\\end","\\item"] state <- getState when (stateParserContext state == ListItemState) $ notFollowedBy' (string "\\item") if stateParseRaw state - then do - (name, star, args) <- command - spaces - return $ Plain [TeX ("\\" ++ name ++ star ++ concat args)] + then command >>= demacro >>= return . Plain . 
(:[]) else do (name, _, args) <- command spaces - if name `elem` commandsToIgnore - then return Null - else return $ Plain [Str $ concat args] + unless (name `elem` commandsToIgnore) $ do + -- put arguments back in input to be parsed + inp <- getInput + setInput $ intercalate " " args ++ inp + return Null commandsToIgnore :: [String] -commandsToIgnore = ["special","pdfannot","pdfstringdef"] +commandsToIgnore = ["special","pdfannot","pdfstringdef", "index","bibliography"] + +skipChar :: GenParser Char ParserState Block +skipChar = do + satisfy (/='\\') <|> + (notFollowedBy' (try $ + string "\\begin" >> spaces >> string "{document}") >> + anyChar) + spaces + return Null --- latex comment -comment :: GenParser Char st Block -comment = try $ char '%' >> manyTill anyChar newline >> spaces >> return Null +commentBlock :: GenParser Char st Block +commentBlock = many1 (comment >> spaces) >> return Null -- -- inline @@ -464,8 +550,6 @@ inline = choice [ str , strikeout , superscript , subscript - , ref - , lab , code , url , link @@ -474,12 +558,20 @@ inline = choice [ str , linebreak , accentedChar , nonbreakingSpace + , cite , specialChar + , ensureMath , rawLaTeXInline' , escapedChar , unescapedChar + , comment ] <?> "inline" + +-- latex comment +comment :: GenParser Char st Inline +comment = try $ char '%' >> manyTill anyChar newline >> spaces >> return (Str "") + accentedChar :: GenParser Char st Inline accentedChar = normalAccentedChar <|> specialAccentedChar @@ -512,7 +604,7 @@ accentTable = ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ] specialAccentedChar :: GenParser Char st Inline -specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig, +specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig, lslash, oslash, pound, euro, copyright, sect ] ccedil :: GenParser Char st Inline @@ -543,6 +635,13 @@ oslash = try $ do let num = if letter' == 'o' then 248 else 216 return $ Str [chr num] +lslash :: GenParser Char st Inline +lslash = try $ do + cmd <- oneOfStrings ["{\\L}","{\\l}","\\L ","\\l "] + return $ if 'l' `elem` cmd + then Str "\x142" + else Str "\x141" + aelig :: GenParser Char st Inline aelig = try $ do char '\\' @@ -569,7 +668,7 @@ escapedChar = do -- nonescaped special characters unescapedChar :: GenParser Char st Inline -unescapedChar = oneOf "`$^&_#{}|<>" >>= return . (\c -> Str [c]) +unescapedChar = oneOf "`$^&_#{}[]|<>" >>= return . 
(\c -> Str [c]) specialChar :: GenParser Char st Inline specialChar = choice [ spacer, interwordSpace, @@ -604,27 +703,34 @@ doubleQuote :: GenParser Char st Inline doubleQuote = char '"' >> return (Str "\"") code :: GenParser Char ParserState Inline -code = code1 <|> code2 <|> lhsInlineCode +code = code1 <|> code2 <|> code3 <|> lhsInlineCode code1 :: GenParser Char st Inline code1 = try $ do string "\\verb" marker <- anyChar result <- manyTill anyChar (char marker) - return $ Code $ removeLeadingTrailingSpace result + return $ Code nullAttr $ removeLeadingTrailingSpace result code2 :: GenParser Char st Inline code2 = try $ do string "\\texttt{" result <- manyTill (noneOf "\\\n~$%^&{}") (char '}') - return $ Code result + return $ Code nullAttr result + +code3 :: GenParser Char st Inline +code3 = try $ do + string "\\lstinline" + marker <- anyChar + result <- manyTill anyChar (char marker) + return $ Code nullAttr $ removeLeadingTrailingSpace result lhsInlineCode :: GenParser Char ParserState Inline lhsInlineCode = try $ do failUnlessLHS char '|' result <- manyTill (noneOf "|\n") (char '|') - return $ Code result + return $ Code ("",["haskell"],[]) result emph :: GenParser Char ParserState Inline emph = try $ oneOfStrings [ "\\emph{", "\\textit{" ] >> @@ -683,15 +789,6 @@ emDash = try (string "---") >> return EmDash hyphen :: GenParser Char st Inline hyphen = char '-' >> return (Str "-") -lab :: GenParser Char st Inline -lab = try $ do - string "\\label{" - result <- manyTill anyChar (char '}') - return $ Str $ "(" ++ result ++ ")" - -ref :: GenParser Char st Inline -ref = try (string "\\ref{") >> manyTill anyChar (char '}') >>= return . Str - strong :: GenParser Char ParserState Inline strong = try (string "\\textbf{") >> manyTill inline (char '}') >>= return . Strong @@ -714,13 +811,13 @@ endline :: GenParser Char st Inline endline = try $ newline >> notFollowedBy blankline >> return Space -- math -math :: GenParser Char st Inline -math = (math3 >>= return . Math DisplayMath) - <|> (math1 >>= return . Math InlineMath) - <|> (math2 >>= return . Math InlineMath) - <|> (math4 >>= return . Math DisplayMath) - <|> (math5 >>= return . Math DisplayMath) - <|> (math6 >>= return . Math DisplayMath) +math :: GenParser Char ParserState Inline +math = (math3 >>= applyMacros' >>= return . Math DisplayMath) + <|> (math1 >>= applyMacros' >>= return . Math InlineMath) + <|> (math2 >>= applyMacros' >>= return . Math InlineMath) + <|> (math4 >>= applyMacros' >>= return . Math DisplayMath) + <|> (math5 >>= applyMacros' >>= return . Math DisplayMath) + <|> (math6 >>= applyMacros' >>= return . 
Math DisplayMath) <?> "math" math1 :: GenParser Char st String @@ -737,7 +834,6 @@ math4 = try $ do name <- begin "displaymath" <|> begin "equation" <|> begin "equation*" <|> begin "gather" <|> begin "gather*" <|> begin "gathered" <|> begin "multline" <|> begin "multline*" - spaces manyTill anyChar (end name) math5 :: GenParser Char st String @@ -748,10 +844,15 @@ math6 = try $ do name <- begin "eqnarray" <|> begin "eqnarray*" <|> begin "align" <|> begin "align*" <|> begin "alignat" <|> begin "alignat*" <|> begin "split" <|> begin "aligned" <|> begin "alignedat" - spaces res <- manyTill anyChar (end name) return $ filter (/= '&') res -- remove alignment codes +ensureMath :: GenParser Char st Inline +ensureMath = try $ do + (n, _, args) <- command + guard $ n == "ensuremath" && not (null args) + return $ Math InlineMath $ tail $ init $ head args + -- -- links and images -- @@ -760,7 +861,7 @@ url :: GenParser Char ParserState Inline url = try $ do string "\\url" url' <- charsInBalanced '{' '}' - return $ Link [Code url'] (escapeURI url', "") + return $ Link [Code ("",["url"],[]) url'] (escapeURI url', "") link :: GenParser Char ParserState Inline link = try $ do @@ -793,6 +894,103 @@ footnote = try $ do setInput rest return $ Note blocks +-- | citations +cite :: GenParser Char ParserState Inline +cite = simpleCite <|> complexNatbibCites + +simpleCiteArgs :: GenParser Char ParserState [Citation] +simpleCiteArgs = try $ do + first <- optionMaybe $ (char '[') >> manyTill inline (char ']') + second <- optionMaybe $ (char '[') >> manyTill inline (char ']') + char '{' + keys <- many1Till citationLabel (char '}') + let (pre, suf) = case (first , second ) of + (Just s , Nothing) -> ([], s ) + (Just s , Just t ) -> (s , t ) + _ -> ([], []) + conv k = Citation { citationId = k + , citationPrefix = [] + , citationSuffix = [] + , citationMode = NormalCitation + , citationHash = 0 + , citationNoteNum = 0 + } + return $ addPrefix pre $ addSuffix suf $ map conv keys + + +simpleCite :: GenParser Char ParserState Inline +simpleCite = try $ do + char '\\' + let biblatex = [a ++ "cite" | a <- ["auto", "foot", "paren", "super", ""]] + ++ ["footcitetext"] + normal = ["cite" ++ a ++ b | a <- ["al", ""], b <- ["p", "p*", ""]] + ++ biblatex + supress = ["citeyearpar", "citeyear", "autocite*", "cite*", "parencite*"] + intext = ["textcite"] ++ ["cite" ++ a ++ b | a <- ["al", ""], b <- ["t", "t*"]] + mintext = ["textcites"] + mnormal = map (++ "s") biblatex + cmdend = notFollowedBy (letter <|> char '*') + capit [] = [] + capit (x:xs) = toUpper x : xs + addUpper xs = xs ++ map capit xs + toparser l t = try $ oneOfStrings (addUpper l) >> cmdend >> return t + (mode, multi) <- toparser normal (NormalCitation, False) + <|> toparser supress (SuppressAuthor, False) + <|> toparser intext (AuthorInText , False) + <|> toparser mnormal (NormalCitation, True ) + <|> toparser mintext (AuthorInText , True ) + cits <- if multi then + many1 simpleCiteArgs + else + simpleCiteArgs >>= \c -> return [c] + let (c:cs) = concat cits + cits' = case mode of + AuthorInText -> c {citationMode = mode} : cs + _ -> map (\a -> a {citationMode = mode}) (c:cs) + return $ Cite cits' [] + +complexNatbibCites :: GenParser Char ParserState Inline +complexNatbibCites = complexNatbibTextual <|> complexNatbibParenthetical + +complexNatbibTextual :: GenParser Char ParserState Inline +complexNatbibTextual = try $ do + string "\\citeauthor{" + manyTill (noneOf "}") (char '}') + skipSpaces + Cite (c:cs) _ <- complexNatbibParenthetical + return $ Cite (c 
{citationMode = AuthorInText} : cs) [] + + +complexNatbibParenthetical :: GenParser Char ParserState Inline +complexNatbibParenthetical = try $ do + string "\\citetext{" + cits <- many1Till parseOne (char '}') + return $ Cite (concat cits) [] + where + parseOne = do + skipSpaces + pref <- many (notFollowedBy (oneOf "\\}") >> inline) + (Cite cites _) <- simpleCite + suff <- many (notFollowedBy (oneOf "\\};") >> inline) + skipSpaces + optional $ char ';' + return $ addPrefix pref $ addSuffix suff $ cites + +addPrefix :: [Inline] -> [Citation] -> [Citation] +addPrefix p (k:ks) = k {citationPrefix = p ++ citationPrefix k} : ks +addPrefix _ _ = [] + +addSuffix :: [Inline] -> [Citation] -> [Citation] +addSuffix s ks@(_:_) = let k = last ks + in init ks ++ [k {citationSuffix = citationSuffix k ++ s}] +addSuffix _ _ = [] + +citationLabel :: GenParser Char ParserState String +citationLabel = do + res <- many1 $ noneOf ",}" + optional $ char ',' + return $ removeLeadingTrailingSpace res + -- | Parse any LaTeX inline command and return it in a raw TeX inline element. rawLaTeXInline' :: GenParser Char ParserState Inline rawLaTeXInline' = do @@ -805,12 +1003,11 @@ rawLaTeXInline :: GenParser Char ParserState Inline rawLaTeXInline = try $ do state <- getState if stateParseRaw state - then do - (name, star, args) <- command - return $ TeX ("\\" ++ name ++ star ++ concat args) + then command >>= demacro else do - (name, _, args) <- command - spaces - if name `elem` commandsToIgnore - then return $ Str "" - else return $ Str (concat args) + (name,st,args) <- command + x <- demacro (name,st,args) + unless (x == Str "" || name `elem` commandsToIgnore) $ do + inp <- getInput + setInput $ intercalate " " args ++ inp + return $ Str "" diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 33fb3d8e6..58d2158bf 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -27,26 +27,25 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of markdown-formatted plain text to 'Pandoc' document. -} -module Text.Pandoc.Readers.Markdown ( - readMarkdown - ) where +module Text.Pandoc.Readers.Markdown ( readMarkdown ) where -import Data.List ( transpose, isSuffixOf, sortBy, findIndex, intercalate ) +import Data.List ( transpose, sortBy, findIndex, intercalate ) import qualified Data.Map as M import Data.Ord ( comparing ) import Data.Char ( isAlphaNum ) import Data.Maybe import Text.Pandoc.Definition +import Text.Pandoc.Generic import Text.Pandoc.Shared import Text.Pandoc.Parsing import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' ) -import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag, - anyHtmlInlineTag, anyHtmlTag, - anyHtmlEndTag, htmlEndTag, extractTagType, - htmlBlockElement, htmlComment, unsanitaryURI ) +import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockTag, + isTextTag, isCommentTag ) import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) import Text.ParserCombinators.Parsec -import Control.Monad (when, liftM, unless) +import Control.Monad (when, liftM, guard) +import Text.HTML.TagSoup +import Text.HTML.TagSoup.Match (tagOpen) -- | Read markdown from an input string and return a Pandoc document. 
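-- (A hedged usage sketch, not part of the patch: with the new cite parser
-- added below, keys listed in stateCitations are recognized inside bracketed
-- citations.)
--
-- > readMarkdown defaultParserState{ stateCitations = ["doe99"] }
-- >              "As argued elsewhere [see @doe99, pp. 33-35]."
--
-- should yield a Cite inline with citationId "doe99", the bracketed prefix and
-- suffix ending up in citationPrefix and citationSuffix.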
readMarkdown :: ParserState -- ^ Parser state, including options for parser @@ -58,18 +57,26 @@ readMarkdown state s = (readWith parseMarkdown) state (s ++ "\n\n") -- Constants and data structure definitions -- -bulletListMarkers :: [Char] -bulletListMarkers = "*+-" +isBulletListMarker :: Char -> Bool +isBulletListMarker '*' = True +isBulletListMarker '+' = True +isBulletListMarker '-' = True +isBulletListMarker _ = False -hruleChars :: [Char] -hruleChars = "*-_" +isHruleChar :: Char -> Bool +isHruleChar '*' = True +isHruleChar '-' = True +isHruleChar '_' = True +isHruleChar _ = False setextHChars :: [Char] setextHChars = "=-" --- treat these as potentially non-text when parsing inline: -specialChars :: [Char] -specialChars = "\\[]*_~`<>$!^-.&@'\";" +isBlank :: Char -> Bool +isBlank ' ' = True +isBlank '\t' = True +isBlank '\n' = True +isBlank _ = False -- -- auxiliary functions @@ -106,12 +113,6 @@ failUnlessBeginningOfLine = do pos <- getPosition if sourceColumn pos == 1 then return () else fail "not beginning of line" --- | Fail unless we're in "smart typography" mode. -failUnlessSmart :: GenParser tok ParserState () -failUnlessSmart = do - state <- getState - if stateSmart state then return () else pzero - -- | Parse a sequence of inline elements between square brackets, -- including inlines between balanced pairs of square brackets. inlinesInBalancedBrackets :: GenParser Char ParserState Inline @@ -119,7 +120,7 @@ inlinesInBalancedBrackets :: GenParser Char ParserState Inline inlinesInBalancedBrackets parser = try $ do char '[' result <- manyTill ( (do lookAhead $ try $ do (Str res) <- parser - unless (res == "[") pzero + guard (res == "[") bal <- inlinesInBalancedBrackets parser return $ [Str "["] ++ bal ++ [Str "]"]) <|> (count 1 parser)) @@ -143,7 +144,8 @@ authorsLine :: GenParser Char ParserState [[Inline]] authorsLine = try $ do char '%' skipSpaces - authors <- sepEndBy (many (notFollowedBy (oneOf ";\n") >> inline)) + authors <- sepEndBy (many (notFollowedBy (satisfy $ \c -> + c == ';' || c == '\n') >> inline)) (char ';' <|> try (newline >> notFollowedBy blankline >> spaceChar)) newline @@ -196,7 +198,7 @@ parseMarkdown = do handleExampleRef z = z if M.null examples then return doc - else return $ processWith handleExampleRef doc + else return $ bottomUp handleExampleRef doc -- -- initial pass for references and notes @@ -209,16 +211,24 @@ referenceKey = try $ do lab <- reference char ':' skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[') - let sourceURL excludes = many $ - optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' referenceTitle >> char ' ')) - src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n" + let nl = char '\n' >> notFollowedBy blankline >> return ' ' + let sourceURL = liftM unwords $ many $ try $ do + notFollowedBy' referenceTitle + skipMany spaceChar + optional nl + skipMany spaceChar + notFollowedBy' reference + many1 (satisfy $ not . 
isBlank) + let betweenAngles = try $ char '<' >> + manyTill (noneOf ">\n" <|> nl) (char '>') + src <- try betweenAngles <|> sourceURL tit <- option "" referenceTitle blanklines endPos <- getPosition let target = (escapeURI $ removeTrailingSpace src, tit) st <- getState let oldkeys = stateKeys st - updateState $ \s -> s { stateKeys = M.insert (Key lab) target oldkeys } + updateState $ \s -> s { stateKeys = M.insert (toKey lab) target oldkeys } -- return blanks so line count isn't affected return $ replicate (sourceLine endPos - sourceLine startPos) '\n' @@ -232,12 +242,12 @@ referenceTitle = try $ do return $ decodeCharacterReferences tit noteMarker :: GenParser Char ParserState [Char] -noteMarker = skipNonindentSpaces >> string "[^" >> manyTill (noneOf " \t\n") (char ']') +noteMarker = string "[^" >> many1Till (satisfy $ not . isBlank) (char ']') rawLine :: GenParser Char ParserState [Char] rawLine = do notFollowedBy blankline - notFollowedBy' noteMarker + notFollowedBy' $ try $ skipNonindentSpaces >> noteMarker contents <- many1 nonEndline end <- option "" (newline >> optional indentSpaces >> return "\n") return $ contents ++ end @@ -248,6 +258,7 @@ rawLines = many1 rawLine >>= return . concat noteBlock :: GenParser Char ParserState [Char] noteBlock = try $ do startPos <- getPosition + skipNonindentSpaces ref <- noteMarker char ':' optional blankline @@ -284,6 +295,7 @@ block = do , plain , nullBlock ] else [ codeBlockDelimited + , macro , header , table , codeBlockIndented @@ -293,6 +305,7 @@ block = do , bulletList , orderedList , definitionList + , rawTeXBlock , para , rawHtmlBlocks , plain @@ -318,6 +331,9 @@ atxClosing = try $ skipMany (char '#') >> blanklines setextHeader :: GenParser Char ParserState Block setextHeader = try $ do + -- This lookahead prevents us from wasting time parsing Inlines + -- unless necessary -- it gives a significant performance boost. 
+ lookAhead $ anyLine >> many1 (oneOf setextHChars) >> blankline text <- many1Till inline newline underlineChar <- oneOf setextHChars many (char underlineChar) @@ -332,7 +348,7 @@ setextHeader = try $ do hrule :: GenParser Char st Block hrule = try $ do skipSpaces - start <- oneOf hruleChars + start <- satisfy isHruleChar count 2 (skipSpaces >> char start) skipMany (spaceChar <|> char start) newline @@ -371,6 +387,7 @@ attributes = try $ do attribute :: GenParser Char st ([Char], [[Char]], [([Char], [Char])]) attribute = identifierAttr <|> classAttr <|> keyValAttr + identifier :: GenParser Char st [Char] identifier = do first <- letter @@ -394,7 +411,7 @@ keyValAttr = try $ do key <- identifier char '=' char '"' - val <- manyTill (noneOf "\n") (char '"') + val <- manyTill (satisfy (/='\n')) (char '"') return ("",[],[(key,val)]) codeBlockDelimited :: GenParser Char st Block @@ -489,7 +506,7 @@ bulletListStart = try $ do optional newline -- if preceded by a Plain block in a list context skipNonindentSpaces notFollowedBy' hrule -- because hrules start out just like lists - oneOf bulletListMarkers + satisfy isBulletListMarker spaceChar skipSpaces @@ -524,7 +541,7 @@ listLine = try $ do notFollowedBy' (do indentSpaces many (spaceChar) listStart) - chunks <- manyTill (htmlComment <|> count 1 anyChar) newline + chunks <- manyTill (liftM snd (htmlTag isCommentTag) <|> count 1 anyChar) newline return $ concat chunks ++ "\n" -- parse raw text for one list item, excluding start marker and continuations @@ -644,23 +661,21 @@ definitionList = do -- isHtmlOrBlank :: Inline -> Bool -isHtmlOrBlank (HtmlInline _) = True -isHtmlOrBlank (Space) = True -isHtmlOrBlank (LineBreak) = True -isHtmlOrBlank _ = False +isHtmlOrBlank (RawInline "html" _) = True +isHtmlOrBlank (Space) = True +isHtmlOrBlank (LineBreak) = True +isHtmlOrBlank _ = False para :: GenParser Char ParserState Block para = try $ do - result <- many1 inline - if all isHtmlOrBlank result - then fail "treat as raw HTML" - else return () - newline - blanklines <|> do st <- getState - if stateStrict st - then lookAhead (blockQuote <|> header) >> return "" - else pzero - return $ Para $ normalizeSpaces result + result <- liftM normalizeSpaces $ many1 inline + guard $ not . all isHtmlOrBlank $ result + option (Plain result) $ try $ do + newline + blanklines <|> + (getState >>= guard . stateStrict >> + lookAhead (blockQuote <|> header) >> return "") + return $ Para result plain :: GenParser Char ParserState Block plain = many1 inline >>~ spaces >>= return . Plain . normalizeSpaces @@ -670,7 +685,7 @@ plain = many1 inline >>~ spaces >>= return . Plain . normalizeSpaces -- htmlElement :: GenParser Char ParserState [Char] -htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element" +htmlElement = strictHtmlBlock <|> liftM snd (htmlTag isBlockTag) htmlBlock :: GenParser Char ParserState Block htmlBlock = try $ do @@ -678,27 +693,33 @@ htmlBlock = try $ do first <- htmlElement finalSpace <- many spaceChar finalNewlines <- many newline - return $ RawHtml $ first ++ finalSpace ++ finalNewlines - --- True if tag is self-closing -isSelfClosing :: [Char] -> Bool -isSelfClosing tag = - isSuffixOf "/>" $ filter (not . 
(`elem` " \n\t")) tag + return $ RawBlock "html" $ first ++ finalSpace ++ finalNewlines strictHtmlBlock :: GenParser Char ParserState [Char] -strictHtmlBlock = try $ do - tag <- anyHtmlBlockTag - let tag' = extractTagType tag - if isSelfClosing tag || tag' == "hr" - then return tag - else do contents <- many (notFollowedBy' (htmlEndTag tag') >> - (htmlElement <|> (count 1 anyChar))) - end <- htmlEndTag tag' - return $ tag ++ concat contents ++ end +strictHtmlBlock = do + failUnlessBeginningOfLine + htmlInBalanced (not . isInlineTag) + +rawVerbatimBlock :: GenParser Char ParserState String +rawVerbatimBlock = try $ do + (TagOpen tag _, open) <- htmlTag (tagOpen (\t -> + t == "pre" || t == "style" || t == "script") + (const True)) + contents <- manyTill anyChar (htmlTag (~== TagClose tag)) + return $ open ++ contents ++ renderTags [TagClose tag] + +rawTeXBlock :: GenParser Char ParserState Block +rawTeXBlock = do + failIfStrict + result <- liftM (RawBlock "latex") rawLaTeXEnvironment' + <|> liftM (RawBlock "context") rawConTeXtEnvironment' + spaces + return result rawHtmlBlocks :: GenParser Char ParserState Block rawHtmlBlocks = do - htmlBlocks <- many1 $ do (RawHtml blk) <- rawHtmlBlock + htmlBlocks <- many1 $ do blk <- rawVerbatimBlock <|> + liftM snd (htmlTag isBlockTag) sps <- do sp1 <- many spaceChar sp2 <- option "" (blankline >> return "\n") sp3 <- many spaceChar @@ -710,7 +731,7 @@ rawHtmlBlocks = do return $ blk ++ sps let combined = concat htmlBlocks let combined' = if last combined == '\n' then init combined else combined - return $ RawHtml combined' + return $ RawBlock "html" combined' -- -- Tables @@ -848,10 +869,11 @@ alignType :: [String] -> Alignment alignType [] _ = AlignDefault alignType strLst len = - let s = head $ sortBy (comparing length) $ - map removeTrailingSpace strLst - leftSpace = if null s then False else (s !! 0) `elem` " \t" - rightSpace = length s < len || (s !! (len - 1)) `elem` " \t" + let nonempties = filter (not . null) $ map removeTrailingSpace strLst + (leftSpace, rightSpace) = + case sortBy (comparing length) nonempties of + (x:_) -> (head x `elem` " \t", length x < len) + [] -> (False, False) in case (leftSpace, rightSpace) of (True, False) -> AlignRight (False, True) -> AlignLeft @@ -875,31 +897,29 @@ inline :: GenParser Char ParserState Inline inline = choice inlineParsers <?> "inline" inlineParsers :: [GenParser Char ParserState Inline] -inlineParsers = [ str - , smartPunctuation - , whitespace +inlineParsers = [ whitespace + , str , endline , code - , charRef , (fourOrMore '*' <|> fourOrMore '_') , strong , emph , note - , inlineNote , link -#ifdef _CITEPROC - , inlineCitation -#endif + , cite , image , math , strikeout , superscript , subscript + , inlineNote -- after superscript because of ^[link](/foo)^ , autoLink - , rawHtmlInline' + , rawHtmlInline , rawLaTeXInline' , escapedChar , exampleRef + , smartPunctuation inline + , charRef , symbol , ltSign ] @@ -913,12 +933,12 @@ failIfLink (Link _ _) = pzero failIfLink elt = return elt escapedChar :: GenParser Char ParserState Inline -escapedChar = do +escapedChar = try $ do char '\\' state <- getState - result <- option '\\' $ if stateStrict state - then oneOf "\\`*_{}[]()>#+-.!~" - else satisfy (not . isAlphaNum) + result <- if stateStrict state + then oneOf "\\`*_{}[]()>#+-.!~" + else satisfy (not . 
isAlphaNum) return $ case result of ' ' -> Str "\160" -- "\ " is a nonbreaking space '\n' -> LineBreak -- "\[newline]" is a linebreak @@ -932,9 +952,6 @@ ltSign = do else notFollowedBy' rawHtmlBlocks >> char '<' -- unless it starts html return $ Str ['<'] -specialCharsMinusLt :: [Char] -specialCharsMinusLt = filter (/= '<') specialChars - exampleRef :: GenParser Char ParserState Inline exampleRef = try $ do char '@' @@ -945,7 +962,11 @@ exampleRef = try $ do symbol :: GenParser Char ParserState Inline symbol = do - result <- oneOf specialCharsMinusLt + result <- noneOf "<\\\n\t " + <|> try (do lookAhead $ char '\\' + notFollowedBy' $ rawLaTeXEnvironment' + <|> rawConTeXtEnvironment' + char '\\') return $ Str [result] -- parses inline code, between n `s and n `s @@ -957,7 +978,8 @@ code = try $ do (char '\n' >> notFollowedBy' blankline >> return " ")) (try (skipSpaces >> count (length starts) (char '`') >> notFollowedBy (char '`'))) - return $ Code $ removeLeadingTrailingSpace $ concat result + attr <- option ([],[],[]) (try $ optional whitespace >> attributes) + return $ Code attr $ removeLeadingTrailingSpace $ concat result mathWord :: GenParser Char st [Char] mathWord = liftM concat $ many1 mathChunk @@ -966,11 +988,11 @@ mathChunk :: GenParser Char st [Char] mathChunk = do char '\\' c <- anyChar return ['\\',c] - <|> many1 (noneOf " \t\n\\$") + <|> many1 (satisfy $ \c -> not (isBlank c || c == '\\' || c == '$')) math :: GenParser Char ParserState Inline -math = (mathDisplay >>= return . Math DisplayMath) - <|> (mathInline >>= return . Math InlineMath) +math = (mathDisplay >>= applyMacros' >>= return . Math DisplayMath) + <|> (mathInline >>= applyMacros' >>= return . Math InlineMath) mathDisplay :: GenParser Char ParserState String mathDisplay = try $ do @@ -1019,85 +1041,6 @@ subscript = failIfStrict >> enclosed (char '~') (char '~') (notFollowedBy spaceChar >> inline) >>= -- may not contain Space return . Subscript -smartPunctuation :: GenParser Char ParserState Inline -smartPunctuation = failUnlessSmart >> - choice [ quoted, apostrophe, dash, ellipses ] - -apostrophe :: GenParser Char ParserState Inline -apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe - -quoted :: GenParser Char ParserState Inline -quoted = doubleQuoted <|> singleQuoted - -withQuoteContext :: QuoteContext - -> (GenParser Char ParserState Inline) - -> GenParser Char ParserState Inline -withQuoteContext context parser = do - oldState <- getState - let oldQuoteContext = stateQuoteContext oldState - setState oldState { stateQuoteContext = context } - result <- parser - newState <- getState - setState newState { stateQuoteContext = oldQuoteContext } - return result - -singleQuoted :: GenParser Char ParserState Inline -singleQuoted = try $ do - singleQuoteStart - withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>= - return . Quoted SingleQuote . normalizeSpaces - -doubleQuoted :: GenParser Char ParserState Inline -doubleQuoted = try $ do - doubleQuoteStart - withQuoteContext InDoubleQuote $ many1Till inline doubleQuoteEnd >>= - return . Quoted DoubleQuote . normalizeSpaces - -failIfInQuoteContext :: QuoteContext -> GenParser tok ParserState () -failIfInQuoteContext context = do - st <- getState - if stateQuoteContext st == context - then fail "already inside quotes" - else return () - -singleQuoteStart :: GenParser Char ParserState Char -singleQuoteStart = do - failIfInQuoteContext InSingleQuote - try $ do char '\'' - notFollowedBy (oneOf ")!],.;:-? 
\t\n") - notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >> - satisfy (not . isAlphaNum))) - -- possess/contraction - return '\'' - -singleQuoteEnd :: GenParser Char st Char -singleQuoteEnd = try $ do - char '\'' - notFollowedBy alphaNum - return '\'' - -doubleQuoteStart :: GenParser Char ParserState Char -doubleQuoteStart = do - failIfInQuoteContext InDoubleQuote - try $ do char '"' - notFollowedBy (oneOf " \t\n") - return '"' - -doubleQuoteEnd :: GenParser Char st Char -doubleQuoteEnd = char '"' - -ellipses :: GenParser Char st Inline -ellipses = oneOfStrings ["...", " . . . ", ". . .", " . . ."] >> return Ellipses - -dash :: GenParser Char st Inline -dash = enDash <|> emDash - -enDash :: GenParser Char st Inline -enDash = try $ char '-' >> notFollowedBy (noneOf "0123456789") >> return EnDash - -emDash :: GenParser Char st Inline -emDash = oneOfStrings ["---", "--"] >> return EmDash - whitespace :: GenParser Char ParserState Inline whitespace = spaceChar >> ( (spaceChar >> skipMany spaceChar >> option Space (endline >> return LineBreak)) @@ -1106,20 +1049,19 @@ whitespace = spaceChar >> nonEndline :: GenParser Char st Char nonEndline = satisfy (/='\n') -strChar :: GenParser Char st Char -strChar = noneOf (specialChars ++ " \t\n") - str :: GenParser Char ParserState Inline str = do - result <- many1 strChar + a <- alphaNum + as <- many $ alphaNum <|> (try $ char '_' >>~ lookAhead alphaNum) + let result = a:as state <- getState let spacesToNbr = map (\c -> if c == ' ' then '\160' else c) if stateSmart state then case likelyAbbrev result of [] -> return $ Str result xs -> choice (map (\x -> - try (string x >> char ' ' >> - notFollowedBy spaceChar >> + try (string x >> oneOf " \n" >> + lookAhead alphaNum >> return (Str $ result ++ spacesToNbr x ++ "\160"))) xs) <|> (return $ Str result) else return $ Str result @@ -1142,15 +1084,13 @@ endline = try $ do newline notFollowedBy blankline st <- getState - if stateStrict st - then do notFollowedBy emailBlockQuoteStart - notFollowedBy (char '#') -- atx header - else return () + when (stateStrict st) $ do + notFollowedBy emailBlockQuoteStart + notFollowedBy (char '#') -- atx header -- parse potential list-starts differently if in a list: - if stateParserContext st == ListItemState - then notFollowedBy' (bulletListStart <|> - (anyOrderedListStart >> return ())) - else return () + when (stateParserContext st == ListItemState) $ do + notFollowedBy' bulletListStart + notFollowedBy' anyOrderedListStart return Space -- @@ -1175,9 +1115,16 @@ source = source' :: GenParser Char st (String, [Char]) source' = do skipSpaces - let sourceURL excludes = many $ - optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' linkTitle >> char ' ')) - src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n" + let nl = char '\n' >>~ notFollowedBy blankline + let sourceURL = liftM unwords $ many $ try $ do + notFollowedBy' linkTitle + skipMany spaceChar + optional nl + skipMany spaceChar + many1 (satisfy $ not . isBlank) + let betweenAngles = try $ char '<' >> + manyTill (noneOf ">\n" <|> nl) (char '>') + src <- try betweenAngles <|> sourceURL tit <- option "" linkTitle skipSpaces eof @@ -1196,10 +1143,7 @@ link :: GenParser Char ParserState Inline link = try $ do lab <- reference (src, tit) <- source <|> referenceLink lab - sanitize <- getState >>= return . 
stateSanitizeHTML - if sanitize && unsanitaryURI src - then fail "Unsanitary URI" - else return $ Link lab (src, tit) + return $ Link lab (src, tit) -- a link like [this][ref] or [this][] or [this] referenceLink :: [Inline] @@ -1209,7 +1153,7 @@ referenceLink lab = do optional (newline >> skipSpaces) >> reference)) let ref' = if null ref then lab else ref state <- getState - case lookupKeySrc (stateKeys state) (Key ref') of + case lookupKeySrc (stateKeys state) (toKey ref') of Nothing -> fail "no corresponding key" Just target -> return target @@ -1219,12 +1163,9 @@ autoLink = try $ do (orig, src) <- uri <|> emailAddress char '>' st <- getState - let sanitize = stateSanitizeHTML st - if sanitize && unsanitaryURI src - then fail "Unsanitary URI" - else return $ if stateStrict st - then Link [Str orig] (src, "") - else Link [Code orig] (src, "") + return $ if stateStrict st + then Link [Str orig] (src, "") + else Link [Code ("",["url"],[]) orig] (src, "") image :: GenParser Char ParserState Inline image = try $ do @@ -1250,11 +1191,13 @@ inlineNote = try $ do return $ Note [Para contents] rawLaTeXInline' :: GenParser Char ParserState Inline -rawLaTeXInline' = do +rawLaTeXInline' = try $ do failIfStrict - (rawConTeXtEnvironment' >>= return . TeX) - <|> (rawLaTeXEnvironment' >>= return . TeX) - <|> rawLaTeXInline + lookAhead $ char '\\' + notFollowedBy' $ rawLaTeXEnvironment' + <|> rawConTeXtEnvironment' + RawInline _ s <- rawLaTeXInline + return $ RawInline "tex" s -- "tex" because it might be context or latex rawConTeXtEnvironment' :: GenParser Char st String rawConTeXtEnvironment' = try $ do @@ -1272,46 +1215,98 @@ inBrackets parser = do char ']' return $ "[" ++ contents ++ "]" -rawHtmlInline' :: GenParser Char ParserState Inline -rawHtmlInline' = do +rawHtmlInline :: GenParser Char ParserState Inline +rawHtmlInline = do st <- getState - result <- if stateStrict st - then choice [htmlBlockElement, anyHtmlTag, anyHtmlEndTag] - else anyHtmlInlineTag - return $ HtmlInline result - -#ifdef _CITEPROC -inlineCitation :: GenParser Char ParserState Inline -inlineCitation = try $ do + (_,result) <- if stateStrict st + then htmlTag (not . 
isTextTag) + else htmlTag isInlineTag + return $ RawInline "html" result + +-- Citations + +cite :: GenParser Char ParserState Inline +cite = do failIfStrict - cit <- citeMarker - let citations = readWith parseCitation defaultParserState cit - mr <- mapM chkCit citations - if catMaybes mr /= [] - then return $ Cite citations [] - else fail "no citation found" - -chkCit :: Target -> GenParser Char ParserState (Maybe Target) -chkCit t = do + citations <- textualCite <|> normalCite + return $ Cite citations [] + +spnl :: GenParser Char st () +spnl = try $ do + skipSpaces + optional newline + skipSpaces + notFollowedBy (char '\n') + +textualCite :: GenParser Char ParserState [Citation] +textualCite = try $ do + (_, key) <- citeKey + let first = Citation{ citationId = key + , citationPrefix = [] + , citationSuffix = [] + , citationMode = AuthorInText + , citationNoteNum = 0 + , citationHash = 0 + } + rest <- option [] $ try $ spnl >> normalCite + if null rest + then option [first] $ bareloc first + else return $ first : rest + +bareloc :: Citation -> GenParser Char ParserState [Citation] +bareloc c = try $ do + spnl + char '[' + suff <- suffix + rest <- option [] $ try $ char ';' >> citeList + spnl + char ']' + return $ c{ citationSuffix = suff } : rest + +normalCite :: GenParser Char ParserState [Citation] +normalCite = try $ do + char '[' + spnl + citations <- citeList + spnl + char ']' + return citations + +citeKey :: GenParser Char ParserState (Bool, String) +citeKey = try $ do + suppress_author <- option False (char '-' >> return True) + char '@' + first <- letter + rest <- many $ (noneOf ",;]@ \t\n") + let key = first:rest st <- getState - case lookupKeySrc (stateKeys st) (Key [Str $ fst t]) of - Just _ -> fail "This is a link" - Nothing -> if elem (fst t) $ stateCitations st - then return $ Just t - else return $ Nothing - -citeMarker :: GenParser Char ParserState String -citeMarker = char '[' >> manyTill ( noneOf "\n" <|> (newline >>~ notFollowedBy blankline) ) (char ']') - -parseCitation :: GenParser Char ParserState [(String,String)] -parseCitation = try $ sepBy (parseLabel) (oneOf ";") - -parseLabel :: GenParser Char ParserState (String,String) -parseLabel = try $ do - res <- sepBy (skipSpaces >> optional newline >> skipSpaces >> many1 (noneOf "@;")) (oneOf "@") - case res of - [lab,loc] -> return (lab, loc) - [lab] -> return (lab, "" ) - _ -> return ("" , "" ) - -#endif + guard $ key `elem` stateCitations st + return (suppress_author, key) + +suffix :: GenParser Char ParserState [Inline] +suffix = try $ do + spnl + liftM normalizeSpaces $ many $ notFollowedBy (oneOf ";]") >> inline + +prefix :: GenParser Char ParserState [Inline] +prefix = liftM normalizeSpaces $ + manyTill inline (char ']' <|> liftM (const ']') (lookAhead citeKey)) + +citeList :: GenParser Char ParserState [Citation] +citeList = sepBy1 citation (try $ char ';' >> spnl) + +citation :: GenParser Char ParserState Citation +citation = try $ do + pref <- prefix + (suppress_author, key) <- citeKey + suff <- suffix + return $ Citation{ citationId = key + , citationPrefix = pref + , citationSuffix = suff + , citationMode = if suppress_author + then SuppressAuthor + else NormalCitation + , citationNoteNum = 0 + , citationHash = 0 + } + diff --git a/src/Text/Pandoc/Readers/Native.hs b/src/Text/Pandoc/Readers/Native.hs new file mode 100644 index 000000000..2c6fcc6e6 --- /dev/null +++ b/src/Text/Pandoc/Readers/Native.hs @@ -0,0 +1,81 @@ +{- +Copyright (C) 2011 John MacFarlane <jgm@berkeley.edu> + +This program is free 
software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Native + Copyright : Copyright (C) 2011 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of a string representation of a pandoc type (@Pandoc@, +@[Block]@, @Block@, @[Inline]@, or @Inline@) to a @Pandoc@ document. +-} +module Text.Pandoc.Readers.Native ( readNative ) where + +import Text.Pandoc.Definition + +nullMeta :: Meta +nullMeta = Meta{ docTitle = [] + , docAuthors = [] + , docDate = [] + } + +-- | Read native formatted text and return a Pandoc document. +-- The input may be a full pandoc document, a block list, a block, +-- an inline list, or an inline. Thus, for example, +-- +-- > Str "hi" +-- +-- will be treated as if it were +-- +-- > Pandoc (Meta [] [] []) [Plain [Str "hi"]] +-- +readNative :: String -- ^ String to parse (assuming @'\n'@ line endings) + -> Pandoc +readNative s = + case reads s of + (d,_):_ -> d + [] -> Pandoc nullMeta $ readBlocks s + +readBlocks :: String -> [Block] +readBlocks s = + case reads s of + (d,_):_ -> d + [] -> [readBlock s] + +readBlock :: String -> Block +readBlock s = + case reads s of + (d,_):_ -> d + [] -> Plain $ readInlines s + +readInlines :: String -> [Inline] +readInlines s = + case reads s of + (d,_):_ -> d + [] -> [readInline s] + +readInline :: String -> Inline +readInline s = + case reads s of + (d,_):_ -> d + [] -> error "Cannot parse document" + diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index 13afe5053..32fae5ee7 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -34,10 +34,11 @@ import Text.Pandoc.Definition import Text.Pandoc.Shared import Text.Pandoc.Parsing import Text.ParserCombinators.Parsec -import Control.Monad ( when, unless ) -import Data.List ( findIndex, intercalate, transpose, sort ) +import Control.Monad ( when ) +import Data.List ( findIndex, intercalate, transpose, sort, deleteFirstsBy ) import qualified Data.Map as M import Text.Printf ( printf ) +import Data.Maybe ( catMaybes ) -- | Parse reStructuredText string and return Pandoc document. 
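-- (A hedged sketch, not part of the patch: the reworked field list parser below
-- routes bibliographic fields into parser state instead of emitting them as
-- definition-list entries.)
--
-- > readRST defaultParserState ":Author: Jane Doe\n:Date: 2011-02-03\n\nSome text.\n"
--
-- should record Jane Doe in stateAuthors and the date in stateDate, so neither
-- field appears as a DefinitionList item in the document body.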
readRST :: ParserState -- ^ Parser state, including options for parser @@ -57,7 +58,7 @@ underlineChars = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" -- treat these as potentially non-text when parsing inline: specialChars :: [Char] -specialChars = "\\`|*_<>$:[-" +specialChars = "\\`|*_<>$:[-.\"'\8216\8217\8220\8221" -- -- parsing documents @@ -90,12 +91,17 @@ titleTransform blocks = (blocks, []) parseRST :: GenParser Char ParserState Pandoc parseRST = do + optional blanklines -- skip blank lines at beginning of file startPos <- getPosition - -- go through once just to get list of reference keys + -- go through once just to get list of reference keys and notes -- docMinusKeys is the raw document with blanks where the keys were... - docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= return . concat + docMinusKeys <- manyTill (referenceKey <|> noteBlock <|> lineClump) eof >>= + return . concat setInput docMinusKeys setPosition startPos + st' <- getState + let reversedNotes = stateNotes st' + updateState $ \s -> s { stateNotes = reverse reversedNotes } -- now parse it for real... blocks <- parseBlocks let blocks' = filter (/= Null) blocks @@ -117,10 +123,9 @@ parseBlocks = manyTill block eof block :: GenParser Char ParserState Block block = choice [ codeBlock - , rawHtmlBlock - , rawLaTeXBlock - , fieldList + , rawBlock , blockQuote + , fieldList , imageBlock , customCodeBlock , unknownDirective @@ -138,46 +143,54 @@ block = choice [ codeBlock -- field list -- -fieldListItem :: String -> GenParser Char st ([Char], [Char]) -fieldListItem indent = try $ do +rawFieldListItem :: String -> GenParser Char ParserState (String, String) +rawFieldListItem indent = try $ do string indent char ':' - name <- many1 alphaNum + name <- many1 $ alphaNum <|> spaceChar string ": " skipSpaces first <- manyTill anyChar newline - rest <- option "" $ try $ lookAhead (string indent >> oneOf " \t") >> - indentedBlock - return (name, intercalate " " (first:(lines rest))) + rest <- option "" $ try $ do lookAhead (string indent >> spaceChar) + indentedBlock + let raw = first ++ "\n" ++ rest ++ "\n" + return (name, raw) + +fieldListItem :: String + -> GenParser Char ParserState (Maybe ([Inline], [[Block]])) +fieldListItem indent = try $ do + (name, raw) <- rawFieldListItem indent + let term = [Str name] + contents <- parseFromString (many block) raw + case (name, contents) of + ("Author", x) -> do + updateState $ \st -> + st{ stateAuthors = stateAuthors st ++ [extractContents x] } + return Nothing + ("Authors", [BulletList auths]) -> do + updateState $ \st -> st{ stateAuthors = map extractContents auths } + return Nothing + ("Date", x) -> do + updateState $ \st -> st{ stateDate = extractContents x } + return Nothing + ("Title", x) -> do + updateState $ \st -> st{ stateTitle = extractContents x } + return Nothing + _ -> return $ Just (term, [contents]) + +extractContents :: [Block] -> [Inline] +extractContents [Plain auth] = auth +extractContents [Para auth] = auth +extractContents _ = [] fieldList :: GenParser Char ParserState Block fieldList = try $ do - indent <- lookAhead $ many (oneOf " \t") + indent <- lookAhead $ many spaceChar items <- many1 $ fieldListItem indent blanklines - let authors = case lookup "Authors" items of - Just auth -> [auth] - Nothing -> map snd (filter (\(x,_) -> x == "Author") items) - unless (null authors) $ do - authors' <- mapM (parseFromString (many inline)) authors - updateState $ \st -> st {stateAuthors = map normalizeSpaces authors'} - case (lookup "Date" items) of - Just dat -> do - 
dat' <- parseFromString (many inline) dat - updateState $ \st -> st{ stateDate = normalizeSpaces dat' } - Nothing -> return () - case (lookup "Title" items) of - Just tit -> parseFromString (many inline) tit >>= - \t -> updateState $ \st -> st {stateTitle = t} - Nothing -> return () - let remaining = filter (\(x,_) -> (x /= "Authors") && (x /= "Author") && - (x /= "Date") && (x /= "Title")) items - if null remaining - then return Null - else do terms <- mapM (return . (:[]) . Str . fst) remaining - defs <- mapM (parseFromString (many block) . snd) - remaining - return $ DefinitionList $ zip terms $ map (:[]) defs + if null items + then return Null + else return $ DefinitionList $ catMaybes items -- -- line block @@ -186,7 +199,7 @@ fieldList = try $ do lineBlockLine :: GenParser Char ParserState [Inline] lineBlockLine = try $ do string "| " - white <- many (oneOf " \t") + white <- many spaceChar line <- many $ (notFollowedBy newline >> inline) <|> (try $ endline >>~ char ' ') optional endline return $ normalizeSpaces $ (if null white then [] else [Str white]) ++ line @@ -231,15 +244,16 @@ plain = many1 inline >>= return . Plain . normalizeSpaces -- image block -- -imageBlock :: GenParser Char st Block +imageBlock :: GenParser Char ParserState Block imageBlock = try $ do string ".. image:: " src <- manyTill anyChar newline - fields <- option [] $ do indent <- lookAhead $ many (oneOf " /t") - many1 $ fieldListItem indent + fields <- try $ do indent <- lookAhead $ many (oneOf " /t") + many $ rawFieldListItem indent optional blanklines case lookup "alt" fields of - Just alt -> return $ Plain [Image [Str alt] (src, alt)] + Just alt -> return $ Plain [Image [Str $ removeTrailingSpace alt] + (src, "")] Nothing -> return $ Plain [Image [Str "image"] (src, "")] -- -- header blocks @@ -314,20 +328,19 @@ hrule = try $ do indentedLine :: String -> GenParser Char st [Char] indentedLine indents = try $ do string indents - result <- manyTill anyChar newline - return $ result ++ "\n" + manyTill anyChar newline -- two or more indented lines, possibly separated by blank lines. -- any amount of indentation will work. indentedBlock :: GenParser Char st [Char] -indentedBlock = do - indents <- lookAhead $ many1 (oneOf " \t") +indentedBlock = try $ do + indents <- lookAhead $ many1 spaceChar lns <- many $ choice $ [ indentedLine indents, try $ do b <- blanklines l <- indentedLine indents return (b ++ l) ] - optional blanklines - return $ concat lns + optional blanklines + return $ unlines lns codeBlock :: GenParser Char st Block codeBlock = try $ do @@ -365,23 +378,16 @@ birdTrackLine = do manyTill anyChar newline -- --- raw html +-- raw html/latex/etc -- -rawHtmlBlock :: GenParser Char st Block -rawHtmlBlock = try $ string ".. raw:: html" >> blanklines >> - indentedBlock >>= return . RawHtml - --- --- raw latex --- - -rawLaTeXBlock :: GenParser Char st Block -rawLaTeXBlock = try $ do - string ".. raw:: latex" +rawBlock :: GenParser Char st Block +rawBlock = try $ do + string ".. 
raw:: " + lang <- many1 (letter <|> digit) blanklines result <- indentedBlock - return $ Para [(TeX result)] + return $ RawBlock lang result -- -- block quotes @@ -408,7 +414,7 @@ definitionListItem = try $ do term <- many1Till inline endline raw <- indentedBlock -- parse the extracted block, which may contain various block elements: - contents <- parseFromString parseBlocks $ raw ++ "\n\n" + contents <- parseFromString parseBlocks $ raw ++ "\n" return (normalizeSpaces term, [contents]) definitionList :: GenParser Char ParserState Block @@ -505,9 +511,35 @@ unknownDirective = try $ do string ".." notFollowedBy (noneOf " \t\n") manyTill anyChar newline - many $ blanklines <|> (oneOf " \t" >> manyTill anyChar newline) + many $ blanklines <|> (spaceChar >> manyTill anyChar newline) return Null +--- +--- note block +--- + +noteBlock :: GenParser Char ParserState [Char] +noteBlock = try $ do + startPos <- getPosition + string ".." + spaceChar >> skipMany spaceChar + ref <- noteMarker + spaceChar >> skipMany spaceChar + first <- anyLine + blanks <- option "" blanklines + rest <- option "" indentedBlock + endPos <- getPosition + let raw = first ++ "\n" ++ blanks ++ rest ++ "\n" + let newnote = (ref, raw) + st <- getState + let oldnotes = stateNotes st + updateState $ \s -> s { stateNotes = newnote : oldnotes } + -- return blanks so line count isn't affected + return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +noteMarker :: GenParser Char ParserState [Char] +noteMarker = char '[' >> (many1 digit <|> count 1 (oneOf "#*")) >>~ char ']' + -- -- reference key -- @@ -565,14 +597,14 @@ imageKey = try $ do skipSpaces string "image::" src <- targetURI - return (Key (normalizeSpaces ref), (src, "")) + return (toKey (normalizeSpaces ref), (src, "")) anonymousKey :: GenParser Char st (Key, Target) anonymousKey = try $ do oneOfStrings [".. 
__:", "__"] src <- targetURI pos <- getPosition - return (Key [Str $ "_" ++ printf "%09d" (sourceLine pos)], (src, "")) + return (toKey [Str $ "_" ++ printf "%09d" (sourceLine pos)], (src, "")) regularKey :: GenParser Char ParserState (Key, Target) regularKey = try $ do @@ -580,7 +612,7 @@ regularKey = try $ do ref <- referenceName char ':' src <- targetURI - return (Key (normalizeSpaces ref), (src, "")) + return (toKey (normalizeSpaces ref), (src, "")) -- -- tables @@ -679,17 +711,19 @@ table = gridTable False <|> simpleTable False <|> -- inline :: GenParser Char ParserState Inline -inline = choice [ link +inline = choice [ whitespace + , link , str - , whitespace , endline , strong , emph , code , image - , hyphens , superscript , subscript + , note + , smartPunctuation inline + , hyphens , escapedChar , symbol ] <?> "inline" @@ -713,7 +747,8 @@ code :: GenParser Char ParserState Inline code = try $ do string "``" result <- manyTill anyChar (try (string "``")) - return $ Code $ removeLeadingTrailingSpace $ intercalate " " $ lines result + return $ Code nullAttr + $ removeLeadingTrailingSpace $ intercalate " " $ lines result emph :: GenParser Char ParserState Inline emph = enclosed (char '*') (char '*') inline >>= @@ -779,9 +814,10 @@ referenceLink = try $ do label' <- (quotedReferenceName <|> simpleReferenceName) >>~ char '_' state <- getState let keyTable = stateKeys state - let isAnonKey (Key [Str ('_':_)]) = True - isAnonKey _ = False - key <- option (Key label') $ + let isAnonKey x = case fromKey x of + [Str ('_':_)] -> True + _ -> False + key <- option (toKey label') $ do char '_' let anonKeys = sort $ filter isAnonKey $ M.keys keyTable if null anonKeys @@ -814,7 +850,24 @@ image = try $ do ref <- manyTill inline (char '|') state <- getState let keyTable = stateKeys state - (src,tit) <- case lookupKeySrc keyTable (Key ref) of + (src,tit) <- case lookupKeySrc keyTable (toKey ref) of Nothing -> fail "no corresponding key" Just target -> return target return $ Image (normalizeSpaces ref) (src, tit) + +note :: GenParser Char ParserState Inline +note = try $ do + ref <- noteMarker + char '_' + state <- getState + let notes = stateNotes state + case lookup ref notes of + Nothing -> fail "note not found" + Just raw -> do + contents <- parseFromString parseBlocks raw + when (ref == "*" || ref == "#") $ do -- auto-numbered + -- delete the note so the next auto-numbered note + -- doesn't get the same contents: + let newnotes = deleteFirstsBy (==) notes [(ref,raw)] + updateState $ \st -> st{ stateNotes = newnotes } + return $ Note contents diff --git a/src/Text/Pandoc/Readers/TeXMath.hs b/src/Text/Pandoc/Readers/TeXMath.hs index b0c6e86d4..b9a46e8ff 100644 --- a/src/Text/Pandoc/Readers/TeXMath.hs +++ b/src/Text/Pandoc/Readers/TeXMath.hs @@ -27,12 +27,10 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of TeX math to a list of 'Pandoc' inline elements. -} -module Text.Pandoc.Readers.TeXMath ( - readTeXMath - ) where +module Text.Pandoc.Readers.TeXMath ( readTeXMath ) where -import Text.ParserCombinators.Parsec import Text.Pandoc.Definition +import Text.TeXMath.Types import Text.TeXMath.Parser -- | Converts a raw TeX math formula to a list of 'Pandoc' inlines. @@ -40,16 +38,17 @@ import Text.TeXMath.Parser -- can't be converted. 
readTeXMath :: String -- ^ String to parse (assumes @'\n'@ line endings) -> [Inline] -readTeXMath inp = case readTeXMath' inp of - Nothing -> [Str ("$" ++ inp ++ "$")] - Just res -> res +readTeXMath inp = case texMathToPandoc inp of + Left _ -> [Str ("$" ++ inp ++ "$")] + Right res -> res --- | Like 'readTeXMath', but without the default. -readTeXMath' :: String -- ^ String to parse (assumes @'\n'@ line endings) - -> Maybe [Inline] -readTeXMath' inp = case parse formula "formula" inp of - Left _ -> Just [Str inp] - Right exps -> expsToInlines exps +texMathToPandoc :: String -> Either String [Inline] +texMathToPandoc inp = inp `seq` + case parseFormula inp of + Left err -> Left err + Right exps -> case expsToInlines exps of + Nothing -> Left "Formula too complex for [Inline]" + Just r -> Right r expsToInlines :: [Exp] -> Maybe [Inline] expsToInlines xs = do @@ -89,6 +88,26 @@ expToInlines (ESubsup x y z) = do expToInlines (EDown x y) = expToInlines (ESub x y) expToInlines (EUp x y) = expToInlines (ESuper x y) expToInlines (EDownup x y z) = expToInlines (ESubsup x y z) -expToInlines (EText _ x) = Just [Emph [Str x]] +expToInlines (EText "normal" x) = Just [Str x] +expToInlines (EText "bold" x) = Just [Strong [Str x]] +expToInlines (EText "monospace" x) = Just [Code nullAttr x] +expToInlines (EText "italic" x) = Just [Emph [Str x]] +expToInlines (EText _ x) = Just [Str x] +expToInlines (EOver (EGrouped [EIdentifier [c]]) (ESymbol Accent [accent])) = + case accent of + '\x203E' -> Just [Emph [Str [c,'\x0304']]] -- bar + '\x00B4' -> Just [Emph [Str [c,'\x0301']]] -- acute + '\x0060' -> Just [Emph [Str [c,'\x0300']]] -- grave + '\x02D8' -> Just [Emph [Str [c,'\x0306']]] -- breve + '\x02C7' -> Just [Emph [Str [c,'\x030C']]] -- check + '.' -> Just [Emph [Str [c,'\x0307']]] -- dot + '\x00B0' -> Just [Emph [Str [c,'\x030A']]] -- ring + '\x20D7' -> Just [Emph [Str [c,'\x20D7']]] -- arrow right + '\x20D6' -> Just [Emph [Str [c,'\x20D6']]] -- arrow left + '\x005E' -> Just [Emph [Str [c,'\x0302']]] -- hat + '\x0302' -> Just [Emph [Str [c,'\x0302']]] -- hat + '~' -> Just [Emph [Str [c,'\x0303']]] -- tilde + _ -> Nothing expToInlines _ = Nothing + diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs new file mode 100644 index 000000000..19357b343 --- /dev/null +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -0,0 +1,523 @@ +{- +Copyright (C) 2010 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@' + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Textile + Copyright : Copyright (C) 2010-2011 Paul Rivier and John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : Paul Rivier <paul*rivier#demotera*com> + Stability : alpha + Portability : portable + +Conversion from Textile to 'Pandoc' document, based on the spec +available at http://redcloth.org/textile. 
+ +Implemented and parsed: + - Paragraphs + - Code blocks + - Lists + - blockquote + - Inlines : strong, emph, cite, code, deleted, superscript, + subscript, links + - footnotes + +Implemented but discarded: + - HTML-specific and CSS-specific attributes + +Left to be implemented: + - dimension sign + - all caps + - continued blocks (ex bq..) + +TODO : refactor common patterns across readers : + - autolink + - smartPunctuation + - more ... + +-} + + +module Text.Pandoc.Readers.Textile ( readTextile) where + +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.Pandoc.Parsing +import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag ) +import Text.ParserCombinators.Parsec +import Text.HTML.TagSoup.Match +import Data.Char ( digitToInt, isLetter ) +import Control.Monad ( guard, liftM ) + +-- | Parse a Textile text and return a Pandoc document. +readTextile :: ParserState -- ^ Parser state, including options for parser + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> Pandoc +readTextile state s = (readWith parseTextile) state (s ++ "\n\n") + + +-- +-- Constants and data structure definitions +-- + +-- | Special chars border strings parsing +specialChars :: [Char] +specialChars = "\\[]<>*#_@~-+^&,.;:!?|\"'%()" + +-- | Generate a Pandoc ADT from a textile document +parseTextile :: GenParser Char ParserState Pandoc +parseTextile = do + -- textile allows raw HTML and does smart punctuation by default + updateState (\state -> state { stateParseRaw = True, stateSmart = True }) + many blankline + startPos <- getPosition + -- go through once just to get list of reference keys and notes + -- docMinusKeys is the raw document with blanks where the keys/notes were... + let firstPassParser = noteBlock <|> lineClump + manyTill firstPassParser eof >>= setInput . concat + setPosition startPos + st' <- getState + let reversedNotes = stateNotes st' + updateState $ \s -> s { stateNotes = reverse reversedNotes } + -- now parse it for real... + blocks <- parseBlocks + return $ Pandoc (Meta [] [] []) blocks -- FIXME + +noteMarker :: GenParser Char ParserState [Char] +noteMarker = skipMany spaceChar >> string "fn" >> manyTill digit (char '.') + +noteBlock :: GenParser Char ParserState [Char] +noteBlock = try $ do + startPos <- getPosition + ref <- noteMarker + optional blankline + contents <- liftM unlines $ many1Till anyLine (blanklines <|> noteBlock) + endPos <- getPosition + let newnote = (ref, contents ++ "\n") + st <- getState + let oldnotes = stateNotes st + updateState $ \s -> s { stateNotes = newnote : oldnotes } + -- return blanks so line count isn't affected + return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +-- | Parse document blocks +parseBlocks :: GenParser Char ParserState [Block] +parseBlocks = manyTill block eof + +-- | Block parsers list tried in definition order +blockParsers :: [GenParser Char ParserState Block] +blockParsers = [ codeBlock + , header + , blockQuote + , hrule + , anyList + , rawHtmlBlock + , maybeExplicitBlock "table" table + , maybeExplicitBlock "p" para + , nullBlock ] + +-- | Any block in the order of definition of blockParsers +block :: GenParser Char ParserState Block +block = choice blockParsers <?> "block" + +codeBlock :: GenParser Char ParserState Block +codeBlock = codeBlockBc <|> codeBlockPre + +codeBlockBc :: GenParser Char ParserState Block +codeBlockBc = try $ do + string "bc. 
" + contents <- manyTill anyLine blanklines + return $ CodeBlock ("",[],[]) $ unlines contents + +-- | Code Blocks in Textile are between <pre> and </pre> +codeBlockPre :: GenParser Char ParserState Block +codeBlockPre = try $ do + htmlTag (tagOpen (=="pre") null) + result' <- manyTill anyChar (try $ htmlTag (tagClose (=="pre")) >> blockBreak) + -- drop leading newline if any + let result'' = case result' of + '\n':xs -> xs + _ -> result' + -- drop trailing newline if any + let result''' = case reverse result'' of + '\n':_ -> init result'' + _ -> result'' + return $ CodeBlock ("",[],[]) result''' + +-- | Header of the form "hN. content" with N in 1..6 +header :: GenParser Char ParserState Block +header = try $ do + char 'h' + level <- oneOf "123456" >>= return . digitToInt + optional attributes + char '.' + whitespace + name <- manyTill inline blockBreak + return $ Header level (normalizeSpaces name) + +-- | Blockquote of the form "bq. content" +blockQuote :: GenParser Char ParserState Block +blockQuote = try $ do + string "bq" + optional attributes + char '.' + whitespace + para >>= return . BlockQuote . (:[]) + +-- Horizontal rule + +hrule :: GenParser Char st Block +hrule = try $ do + skipSpaces + start <- oneOf "-*" + count 2 (skipSpaces >> char start) + skipMany (spaceChar <|> char start) + newline + optional blanklines + return HorizontalRule + +-- Lists handling + +-- | Can be a bullet list or an ordered list. This implementation is +-- strict in the nesting, sublist must start at exactly "parent depth +-- plus one" +anyList :: GenParser Char ParserState Block +anyList = try $ do + l <- anyListAtDepth 1 + blanklines + return l + +-- | This allow one type of list to be nested into an other type, +-- provided correct nesting +anyListAtDepth :: Int -> GenParser Char ParserState Block +anyListAtDepth depth = choice [ bulletListAtDepth depth, + orderedListAtDepth depth, + definitionList ] + +-- | Bullet List of given depth, depth being the number of leading '*' +bulletListAtDepth :: Int -> GenParser Char ParserState Block +bulletListAtDepth depth = try $ do + items <- many1 (bulletListItemAtDepth depth) + return (BulletList items) + +-- | Bullet List Item of given depth, depth being the number of +-- leading '*' +bulletListItemAtDepth :: Int -> GenParser Char ParserState [Block] +bulletListItemAtDepth depth = try $ do + count depth (char '*') + optional attributes + whitespace + p <- inlines >>= return . Plain + sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[])) + return (p:sublist) + +-- | Ordered List of given depth, depth being the number of +-- leading '#' +orderedListAtDepth :: Int -> GenParser Char ParserState Block +orderedListAtDepth depth = try $ do + items <- many1 (orderedListItemAtDepth depth) + return (OrderedList (1, DefaultStyle, DefaultDelim) items) + +-- | Ordered List Item of given depth, depth being the number of +-- leading '#' +orderedListItemAtDepth :: Int -> GenParser Char ParserState [Block] +orderedListItemAtDepth depth = try $ do + count depth (char '#') + optional attributes + whitespace + p <- inlines >>= return . Plain + sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[])) + return (p:sublist) + +-- | A definition list is a set of consecutive definition items +definitionList :: GenParser Char ParserState Block +definitionList = try $ do + items <- many1 definitionListItem + return $ DefinitionList items + +-- | A definition list item in textile begins with '- ', followed by +-- the term defined, then spaces and ":=". 
The definition follows, on +-- the same single line, or spaned on multiple line, after a line +-- break. +definitionListItem :: GenParser Char ParserState ([Inline], [[Block]]) +definitionListItem = try $ do + string "- " + term <- many1Till inline (try (whitespace >> string ":=")) + def <- inlineDef <|> multilineDef + return (term, def) + where inlineDef :: GenParser Char ParserState [[Block]] + inlineDef = liftM (\d -> [[Plain d]]) $ try (whitespace >> inlines) + multilineDef :: GenParser Char ParserState [[Block]] + multilineDef = try $ do + optional whitespace >> newline + s <- many1Till anyChar (try (string "=:" >> newline)) + -- this ++ "\n\n" does not look very good + ds <- parseFromString parseBlocks (s ++ "\n\n") + return [ds] + +-- | This terminates a block such as a paragraph. Because of raw html +-- blocks support, we have to lookAhead for a rawHtmlBlock. +blockBreak :: GenParser Char ParserState () +blockBreak = try (newline >> blanklines >> return ()) <|> + (lookAhead rawHtmlBlock >> return ()) + +-- | A raw Html Block, optionally followed by blanklines +rawHtmlBlock :: GenParser Char ParserState Block +rawHtmlBlock = try $ do + (_,b) <- htmlTag isBlockTag + optional blanklines + return $ RawBlock "html" b + +-- | In textile, paragraphs are separated by blank lines. +para :: GenParser Char ParserState Block +para = try $ do + content <- manyTill inline blockBreak + return $ Para $ normalizeSpaces content + + +-- Tables + +-- | A table cell spans until a pipe | +tableCell :: GenParser Char ParserState TableCell +tableCell = do + c <- many1 (noneOf "|\n") + content <- parseFromString (many1 inline) c + return $ [ Plain $ normalizeSpaces content ] + +-- | A table row is made of many table cells +tableRow :: GenParser Char ParserState [TableCell] +tableRow = try $ do + char '|' + cells <- endBy1 tableCell (char '|') + newline + return cells + +-- | Many table rows +tableRows :: GenParser Char ParserState [[TableCell]] +tableRows = many1 tableRow + +-- | Table headers are made of cells separated by a tag "|_." +tableHeaders :: GenParser Char ParserState [TableCell] +tableHeaders = try $ do + let separator = (try $ string "|_.") + separator + headers <- sepBy1 tableCell separator + char '|' + newline + return headers + +-- | A table with an optional header. Current implementation can +-- handle tables with and without header, but will parse cells +-- alignment attributes as content. +table :: GenParser Char ParserState Block +table = try $ do + headers <- option [] tableHeaders + rows <- tableRows + blanklines + let nbOfCols = max (length headers) (length $ head rows) + return $ Table [] + (replicate nbOfCols AlignDefault) + (replicate nbOfCols 0.0) + headers + rows + + +-- | Blocks like 'p' and 'table' do not need explicit block tag. +-- However, they can be used to set HTML/CSS attributes when needed. +maybeExplicitBlock :: String -- ^ block tag name + -> GenParser Char ParserState Block -- ^ implicit block + -> GenParser Char ParserState Block +maybeExplicitBlock name blk = try $ do + optional $ try $ string name >> optional attributes >> char '.' 
>> + ((try whitespace) <|> endline) + blk + + + +---------- +-- Inlines +---------- + + +-- | Any inline element +inline :: GenParser Char ParserState Inline +inline = choice inlineParsers <?> "inline" + +-- | List of consecutive inlines before a newline +inlines :: GenParser Char ParserState [Inline] +inlines = manyTill inline newline + +-- | Inline parsers tried in order +inlineParsers :: [GenParser Char ParserState Inline] +inlineParsers = [ autoLink + , str + , whitespace + , endline + , code + , htmlSpan + , rawHtmlInline + , note + , simpleInline (string "??") (Cite []) + , simpleInline (string "**") Strong + , simpleInline (string "__") Emph + , simpleInline (char '*') Strong + , simpleInline (char '_') Emph + , simpleInline (char '-') Strikeout + , simpleInline (char '^') Superscript + , simpleInline (char '~') Subscript + , link + , image + , mark + , smartPunctuation inline + , symbol + ] + +-- | Trademark, registered, copyright +mark :: GenParser Char st Inline +mark = try $ char '(' >> (try tm <|> try reg <|> copy) + +reg :: GenParser Char st Inline +reg = do + oneOf "Rr" + char ')' + return $ Str "\174" + +tm :: GenParser Char st Inline +tm = do + oneOf "Tt" + oneOf "Mm" + char ')' + return $ Str "\8482" + +copy :: GenParser Char st Inline +copy = do + oneOf "Cc" + char ')' + return $ Str "\169" + +note :: GenParser Char ParserState Inline +note = try $ do + char '[' + ref <- many1 digit + char ']' + state <- getState + let notes = stateNotes state + case lookup ref notes of + Nothing -> fail "note not found" + Just raw -> liftM Note $ parseFromString parseBlocks raw + +-- | Any string +str :: GenParser Char ParserState Inline +str = do + xs <- many1 (noneOf (specialChars ++ "\t\n ")) + optional $ try $ do + lookAhead (char '(') + notFollowedBy' mark + getInput >>= setInput . (' ':) -- add space before acronym explanation + -- parse a following hyphen if followed by a letter + -- (this prevents unwanted interpretation as starting a strikeout section) + result <- option xs $ try $ do + char '-' + next <- lookAhead letter + guard $ isLetter (last xs) || isLetter next + return $ xs ++ "-" + return $ Str result + +-- | Textile allows HTML span infos, we discard them +htmlSpan :: GenParser Char ParserState Inline +htmlSpan = try $ do + char '%' + _ <- attributes + content <- manyTill anyChar (char '%') + return $ Str content + +-- | Some number of space chars +whitespace :: GenParser Char ParserState Inline +whitespace = many1 spaceChar >> return Space <?> "whitespace" + +-- | In Textile, an isolated endline character is a line break +endline :: GenParser Char ParserState Inline +endline = try $ do + newline >> notFollowedBy blankline + return LineBreak + +rawHtmlInline :: GenParser Char ParserState Inline +rawHtmlInline = liftM (RawInline "html" . snd) + $ htmlTag isInlineTag + +-- | Textile standard link syntax is "label":target +link :: GenParser Char ParserState Inline +link = try $ do + name <- surrounded (char '"') inline + char ':' + url <- manyTill (anyChar) (lookAhead $ (space <|> try (oneOf ".;," >> (space <|> newline)))) + return $ Link name (url, "") + +-- | Detect plain links to http or email. +autoLink :: GenParser Char ParserState Inline +autoLink = do + (orig, src) <- (try uri <|> try emailAddress) + return $ Link [Str orig] (src, "") + +-- | image embedding +image :: GenParser Char ParserState Inline +image = try $ do + char '!' 
>> notFollowedBy space + src <- manyTill anyChar (lookAhead $ oneOf "!(") + alt <- option "" (try $ (char '(' >> manyTill anyChar (char ')'))) + char '!' + return $ Image [Str alt] (src, alt) + +-- | Any special symbol defined in specialChars +symbol :: GenParser Char ParserState Inline +symbol = do + result <- oneOf specialChars + return $ Str [result] + +-- | Inline code +code :: GenParser Char ParserState Inline +code = code1 <|> code2 + +code1 :: GenParser Char ParserState Inline +code1 = surrounded (char '@') anyChar >>= return . Code nullAttr + +code2 :: GenParser Char ParserState Inline +code2 = do + htmlTag (tagOpen (=="tt") null) + result' <- manyTill anyChar (try $ htmlTag $ tagClose (=="tt")) + return $ Code nullAttr result' + +-- | Html / CSS attributes +attributes :: GenParser Char ParserState String +attributes = choice [ enclosed (char '(') (char ')') anyChar, + enclosed (char '{') (char '}') anyChar, + enclosed (char '[') (char ']') anyChar] + +-- | Parses material surrounded by a parser. +surrounded :: GenParser Char st t -- ^ surrounding parser + -> GenParser Char st a -- ^ content parser (to be used repeatedly) + -> GenParser Char st [a] +surrounded border = enclosed border border + +-- | Inlines are most of the time of the same form +simpleInline :: GenParser Char ParserState t -- ^ surrounding parser + -> ([Inline] -> Inline) -- ^ Inline constructor + -> GenParser Char ParserState Inline -- ^ content parser (to be used repeatedly) +simpleInline border construct = surrounded border (inlineWithAttribute) >>= + return . construct . normalizeSpaces + where inlineWithAttribute = (try $ optional attributes) >> inline diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 633708046..b1d5de63f 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -46,17 +46,11 @@ module Text.Pandoc.Shared ( escapeURI, unescapeURI, tabFilter, - -- * Prettyprinting - wrapped, - wrapIfNeeded, - wrappedTeX, - wrapTeXIfNeeded, - BlockWrapper (..), - wrappedBlocksToDoc, - hang', -- * Pandoc block and inline list processing orderedListMarkers, normalizeSpaces, + normalize, + stringify, compactify, Element (..), hierarchicalize, @@ -65,19 +59,20 @@ module Text.Pandoc.Shared ( headerShift, -- * Writer options HTMLMathMethod (..), + CiteMethod (..), ObfuscationMethod (..), HTMLSlideVariant (..), WriterOptions (..), defaultWriterOptions, -- * File handling inDirectory, + findDataFile, readDataFile ) where import Text.Pandoc.Definition +import Text.Pandoc.Generic import qualified Text.Pandoc.UTF8 as UTF8 (readFile) -import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text, nest ) -import qualified Text.PrettyPrint.HughesPJ as PP import Data.Char ( toLower, isLower, isUpper, isAlpha, isAscii, isLetter, isDigit ) import Data.List ( find, isPrefixOf, intercalate ) @@ -94,12 +89,12 @@ import Paths_pandoc (getDataFileName) -- -- | Split list by groups of one or more sep. -splitBy :: (Eq a) => a -> [a] -> [[a]] +splitBy :: (a -> Bool) -> [a] -> [[a]] splitBy _ [] = [] -splitBy sep lst = - let (first, rest) = break (== sep) lst - rest' = dropWhile (== sep) rest - in first:(splitBy sep rest') +splitBy isSep lst = + let (first, rest) = break isSep lst + rest' = dropWhile isSep rest + in first:(splitBy isSep rest') -- | Split list into chunks divided at specified indices. splitByIndices :: [Int] -> [a] -> [[a]] @@ -218,83 +213,6 @@ tabFilter tabStop = in go tabStop -- --- Prettyprinting --- - --- | Wrap inlines to line length. 
-wrapped :: Monad m => ([Inline] -> m Doc) -> [Inline] -> m Doc -wrapped listWriter sect = (mapM listWriter $ splitBy Space sect) >>= - return . fsep - --- | Wrap inlines if the text wrap option is selected. -wrapIfNeeded :: Monad m => WriterOptions -> ([Inline] -> m Doc) -> - [Inline] -> m Doc -wrapIfNeeded opts = if writerWrapText opts - then wrapped - else ($) - --- auxiliary function for wrappedTeX -isNote :: Inline -> Bool -isNote (Note _) = True -isNote _ = False - --- | Wrap inlines to line length, treating footnotes in a way that --- makes sense in LaTeX and ConTeXt. -wrappedTeX :: Monad m - => Bool - -> ([Inline] -> m Doc) - -> [Inline] - -> m Doc -wrappedTeX includePercent listWriter sect = do - let (firstpart, rest) = break isNote sect - firstpartWrapped <- wrapped listWriter firstpart - if null rest - then return firstpartWrapped - else do let (note:rest') = rest - let (rest1, rest2) = break (== Space) rest' - -- rest1 is whatever comes between the note and a Space. - -- if the note is followed directly by a Space, rest1 is null. - -- rest1 is printed after the note but before the line break, - -- to avoid spurious blank space the note and immediately - -- following punctuation. - rest1Out <- if null rest1 - then return empty - else listWriter rest1 - rest2Wrapped <- if null rest2 - then return empty - else wrappedTeX includePercent listWriter (tail rest2) - noteText <- listWriter [note] - return $ (firstpartWrapped <> if includePercent then PP.char '%' else empty) $$ - (noteText <> rest1Out) $$ - rest2Wrapped - --- | Wrap inlines if the text wrap option is selected, specialized --- for LaTeX and ConTeXt. -wrapTeXIfNeeded :: Monad m - => WriterOptions - -> Bool - -> ([Inline] -> m Doc) - -> [Inline] - -> m Doc -wrapTeXIfNeeded opts includePercent = if writerWrapText opts - then wrappedTeX includePercent - else ($) - --- | Indicates whether block should be surrounded by blank lines (@Pad@) or not (@Reg@). -data BlockWrapper = Pad Doc | Reg Doc - --- | Converts a list of wrapped blocks to a Doc, with appropriate spaces around blocks. -wrappedBlocksToDoc :: [BlockWrapper] -> Doc -wrappedBlocksToDoc = foldr addBlock empty - where addBlock (Pad d) accum | isEmpty accum = d - addBlock (Pad d) accum = d $$ text "" $$ accum - addBlock (Reg d) accum = d $$ accum - --- | A version of hang that works like the version in pretty-1.0.0.0 -hang' :: Doc -> Int -> Doc -> Doc -hang' d1 n d2 = d1 $$ (nest n d2) - --- -- Pandoc block and inline list processing -- @@ -324,20 +242,96 @@ orderedListMarkers (start, numstyle, numdelim) = -- @Space@ elements, collapse double @Space@s into singles, and -- remove empty Str elements. normalizeSpaces :: [Inline] -> [Inline] -normalizeSpaces [] = [] -normalizeSpaces list = - let removeDoubles [] = [] - removeDoubles (Space:Space:rest) = removeDoubles (Space:rest) - removeDoubles (Space:(Str ""):Space:rest) = removeDoubles (Space:rest) - removeDoubles ((Str ""):rest) = removeDoubles rest - removeDoubles (x:rest) = x:(removeDoubles rest) - removeLeading (Space:xs) = removeLeading xs - removeLeading x = x - removeTrailing [] = [] - removeTrailing lst = if (last lst == Space) - then init lst - else lst - in removeLeading $ removeTrailing $ removeDoubles list +normalizeSpaces = cleanup . 
dropWhile isSpaceOrEmpty + where isSpaceOrEmpty Space = True + isSpaceOrEmpty (Str "") = True + isSpaceOrEmpty _ = False + cleanup [] = [] + cleanup (Space:rest) = let rest' = dropWhile isSpaceOrEmpty rest + in case rest' of + [] -> [] + _ -> Space : cleanup rest' + cleanup ((Str ""):rest) = cleanup rest + cleanup (x:rest) = x : cleanup rest + +-- | Normalize @Pandoc@ document, consolidating doubled 'Space's, +-- combining adjacent 'Str's and 'Emph's, remove 'Null's and +-- empty elements, etc. +normalize :: (Eq a, Data a) => a -> a +normalize = topDown removeEmptyBlocks . + topDown consolidateInlines . + bottomUp removeEmptyInlines + +removeEmptyBlocks :: [Block] -> [Block] +removeEmptyBlocks (Null : xs) = removeEmptyBlocks xs +removeEmptyBlocks (BulletList [] : xs) = removeEmptyBlocks xs +removeEmptyBlocks (OrderedList _ [] : xs) = removeEmptyBlocks xs +removeEmptyBlocks (DefinitionList [] : xs) = removeEmptyBlocks xs +removeEmptyBlocks (RawBlock _ [] : xs) = removeEmptyBlocks xs +removeEmptyBlocks (x:xs) = x : removeEmptyBlocks xs +removeEmptyBlocks [] = [] + +removeEmptyInlines :: [Inline] -> [Inline] +removeEmptyInlines (Emph [] : zs) = removeEmptyInlines zs +removeEmptyInlines (Strong [] : zs) = removeEmptyInlines zs +removeEmptyInlines (Subscript [] : zs) = removeEmptyInlines zs +removeEmptyInlines (Superscript [] : zs) = removeEmptyInlines zs +removeEmptyInlines (SmallCaps [] : zs) = removeEmptyInlines zs +removeEmptyInlines (Strikeout [] : zs) = removeEmptyInlines zs +removeEmptyInlines (RawInline _ [] : zs) = removeEmptyInlines zs +removeEmptyInlines (Code _ [] : zs) = removeEmptyInlines zs +removeEmptyInlines (Str "" : zs) = removeEmptyInlines zs +removeEmptyInlines (x : xs) = x : removeEmptyInlines xs +removeEmptyInlines [] = [] + +consolidateInlines :: [Inline] -> [Inline] +consolidateInlines (Str x : ys) = + case concat (x : map fromStr strs) of + "" -> consolidateInlines rest + n -> Str n : consolidateInlines rest + where + (strs, rest) = span isStr ys + isStr (Str _) = True + isStr _ = False + fromStr (Str z) = z + fromStr _ = error "consolidateInlines - fromStr - not a Str" +consolidateInlines (Space : ys) = Space : rest + where isSpace Space = True + isSpace _ = False + rest = consolidateInlines $ dropWhile isSpace ys +consolidateInlines (Emph xs : Emph ys : zs) = consolidateInlines $ + Emph (xs ++ ys) : zs +consolidateInlines (Strong xs : Strong ys : zs) = consolidateInlines $ + Strong (xs ++ ys) : zs +consolidateInlines (Subscript xs : Subscript ys : zs) = consolidateInlines $ + Subscript (xs ++ ys) : zs +consolidateInlines (Superscript xs : Superscript ys : zs) = consolidateInlines $ + Superscript (xs ++ ys) : zs +consolidateInlines (SmallCaps xs : SmallCaps ys : zs) = consolidateInlines $ + SmallCaps (xs ++ ys) : zs +consolidateInlines (Strikeout xs : Strikeout ys : zs) = consolidateInlines $ + Strikeout (xs ++ ys) : zs +consolidateInlines (RawInline f x : RawInline f' y : zs) | f == f' = + consolidateInlines $ RawInline f (x ++ y) : zs +consolidateInlines (Code a1 x : Code a2 y : zs) | a1 == a2 = + consolidateInlines $ Code a1 (x ++ y) : zs +consolidateInlines (x : xs) = x : consolidateInlines xs +consolidateInlines [] = [] + +-- | Convert list of inlines to a string with formatting removed. +stringify :: [Inline] -> String +stringify = queryWith go + where go :: Inline -> [Char] + go Space = " " + go (Str x) = x + go (Code _ x) = x + go (Math _ x) = x + go EmDash = "--" + go EnDash = "-" + go Apostrophe = "'" + go Ellipses = "..." 
+ go LineBreak = " " + go _ = "" -- | Change final list item from @Para@ to @Plain@ if the list contains -- no other @Para@ blocks. @@ -370,32 +364,12 @@ data Element = Blk Block -- letters, digits, and the characters _-. inlineListToIdentifier :: [Inline] -> String inlineListToIdentifier = - dropWhile (not . isAlpha) . intercalate "-" . words . map toLower . - filter (\c -> isLetter c || isDigit c || c `elem` "_-. ") . - concatMap extractText - where extractText x = case x of - Str s -> s - Emph lst -> concatMap extractText lst - Strikeout lst -> concatMap extractText lst - Superscript lst -> concatMap extractText lst - SmallCaps lst -> concatMap extractText lst - Subscript lst -> concatMap extractText lst - Strong lst -> concatMap extractText lst - Quoted _ lst -> concatMap extractText lst - Cite _ lst -> concatMap extractText lst - Code s -> s - Space -> " " - EmDash -> "---" - EnDash -> "--" - Apostrophe -> "" - Ellipses -> "..." - LineBreak -> " " - Math _ s -> s - TeX _ -> "" - HtmlInline _ -> "" - Link lst _ -> concatMap extractText lst - Image lst _ -> concatMap extractText lst - Note _ -> "" + dropWhile (not . isAlpha) . intercalate "-" . words . + map (nbspToSp . toLower) . + filter (\c -> isLetter c || isDigit c || c `elem` "_-. ") . + stringify + where nbspToSp '\160' = ' ' + nbspToSp x = x -- | Convert list of Pandoc blocks into (hierarchical) list of Elements hierarchicalize :: [Block] -> [Element] @@ -444,7 +418,7 @@ isHeaderBlock _ = False -- | Shift header levels up or down. headerShift :: Int -> Pandoc -> Pandoc -headerShift n = processWith shift +headerShift n = bottomUp shift where shift :: Block -> Block shift (Header level inner) = Header (level + n) inner shift x = x @@ -459,8 +433,14 @@ data HTMLMathMethod = PlainMath | GladTeX | WebTeX String -- url of TeX->image script. | MathML (Maybe String) -- url of MathMLinHTML.js + | MathJax String -- url of MathJax.js deriving (Show, Read, Eq) +data CiteMethod = Citeproc -- use citeproc to render them + | Natbib -- output natbib cite commands + | Biblatex -- output biblatex cite commands + deriving (Show, Read, Eq) + -- | Methods for obfuscating email addresses in HTML. data ObfuscationMethod = NoObfuscation | ReferenceObfuscation @@ -491,11 +471,17 @@ data WriterOptions = WriterOptions , writerStrictMarkdown :: Bool -- ^ Use strict markdown syntax , writerReferenceLinks :: Bool -- ^ Use reference links in writing markdown, rst , writerWrapText :: Bool -- ^ Wrap text to line length + , writerColumns :: Int -- ^ Characters in a line (for text wrapping) , writerLiterateHaskell :: Bool -- ^ Write as literate haskell , writerEmailObfuscation :: ObfuscationMethod -- ^ How to obfuscate emails , writerIdentifierPrefix :: String -- ^ Prefix for section & note ids in HTML , writerSourceDirectory :: FilePath -- ^ Directory path of 1st source file , writerUserDataDir :: Maybe FilePath -- ^ Path of user data directory + , writerCiteMethod :: CiteMethod -- ^ How to print cites + , writerBiblioFiles :: [FilePath] -- ^ Biblio files to use for citations + , writerHtml5 :: Bool -- ^ Produce HTML5 + , writerChapters :: Bool -- ^ Use "chapter" for top-level sects + , writerListings :: Bool -- ^ Use listings package for code } deriving Show -- | Default writer options. 
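-- Editorial sketch, not part of the patch: the new fields are intended
-- to be set by record update on defaultWriterOptions.  For instance, a
-- hypothetical HTML5, book-style configuration with MathJax, natbib
-- citations and 80-column wrapping (the MathJax URL is a placeholder):

import Text.Pandoc.Shared

bookOpts :: WriterOptions
bookOpts = defaultWriterOptions
  { writerHtml5          = True
  , writerChapters       = True
  , writerColumns        = 80
  , writerCiteMethod     = Natbib
  , writerHTMLMathMethod = MathJax "https://example.org/MathJax.js"
  }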
@@ -517,11 +503,17 @@ defaultWriterOptions = , writerStrictMarkdown = False , writerReferenceLinks = False , writerWrapText = True + , writerColumns = 72 , writerLiterateHaskell = False , writerEmailObfuscation = JavascriptObfuscation , writerIdentifierPrefix = "" , writerSourceDirectory = "." , writerUserDataDir = Nothing + , writerCiteMethod = Citeproc + , writerBiblioFiles = [] + , writerHtml5 = False + , writerChapters = False + , writerListings = False } -- @@ -537,11 +529,17 @@ inDirectory path action = do setCurrentDirectory oldDir return result +-- | Get file path for data file, either from specified user data directory, +-- or, if not found there, from Cabal data directory. +findDataFile :: Maybe FilePath -> FilePath -> IO FilePath +findDataFile Nothing f = getDataFileName f +findDataFile (Just u) f = do + ex <- doesFileExist (u </> f) + if ex + then return (u </> f) + else getDataFileName f + -- | Read file from specified user data directory or, if not found there, from -- Cabal data directory. readDataFile :: Maybe FilePath -> FilePath -> IO String -readDataFile userDir fname = - case userDir of - Nothing -> getDataFileName fname >>= UTF8.readFile - Just u -> catch (UTF8.readFile $ u </> fname) - (\_ -> getDataFileName fname >>= UTF8.readFile) +readDataFile userDir fname = findDataFile userDir fname >>= UTF8.readFile diff --git a/src/Text/Pandoc/Templates.hs b/src/Text/Pandoc/Templates.hs index c8ddc3abf..b03e8c73f 100644 --- a/src/Text/Pandoc/Templates.hs +++ b/src/Text/Pandoc/Templates.hs @@ -72,7 +72,6 @@ import Text.ParserCombinators.Parsec import Control.Monad (liftM, when, forM) import System.FilePath import Data.List (intercalate, intersperse) -import Text.PrettyPrint (text, Doc) import Text.XHtml (primHtml, Html) import Data.ByteString.Lazy.UTF8 (ByteString, fromString) import Text.Pandoc.Shared (readDataFile) @@ -112,9 +111,6 @@ instance TemplateTarget ByteString where instance TemplateTarget Html where toTarget = primHtml -instance TemplateTarget Doc where - toTarget = text - -- | Renders a template renderTemplate :: TemplateTarget a => [(String,String)] -- ^ Assoc. list of values for variables diff --git a/src/Text/Pandoc/UTF8.hs b/src/Text/Pandoc/UTF8.hs index 96d6e6218..a77f92cdc 100644 --- a/src/Text/Pandoc/UTF8.hs +++ b/src/Text/Pandoc/UTF8.hs @@ -25,7 +25,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Stability : alpha Portability : portable -UTF-8 aware string IO functions that will work with GHC 6.10 or 6.12. +UTF-8 aware string IO functions that will work with GHC 6.10, 6.12, or 7. 
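An editorial usage sketch, not part of this header: callers import the
module qualified, and the functions shadow their Prelude/System.IO
namesakes, so a UTF-8 round trip looks roughly like

    import qualified Text.Pandoc.UTF8 as UTF8

    main :: IO ()
    main = UTF8.readFile "in.txt" >>= UTF8.writeFile "out.txt"

On newer GHCs the implementation below also tolerates a leading byte
order mark via the utf8_bom encoding.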
-} module Text.Pandoc.UTF8 ( readFile , writeFile @@ -34,15 +34,54 @@ module Text.Pandoc.UTF8 ( readFile , putStrLn , hPutStr , hPutStrLn + , hGetContents ) where + +#if MIN_VERSION_base(4,2,0) + +import System.IO hiding (readFile, writeFile, getContents, + putStr, putStrLn, hPutStr, hPutStrLn, hGetContents) +import Prelude hiding (readFile, writeFile, getContents, putStr, putStrLn ) +import qualified System.IO as IO + +readFile :: FilePath -> IO String +readFile f = do + h <- openFile f ReadMode + hGetContents h + +writeFile :: FilePath -> String -> IO () +writeFile f s = withFile f WriteMode $ \h -> hPutStr h s + +getContents :: IO String +getContents = hGetContents stdin + +putStr :: String -> IO () +putStr s = hPutStr stdout s + +putStrLn :: String -> IO () +putStrLn s = hPutStrLn stdout s + +hPutStr :: Handle -> String -> IO () +hPutStr h s = hSetEncoding h utf8 >> IO.hPutStr h s + +hPutStrLn :: Handle -> String -> IO () +hPutStrLn h s = hSetEncoding h utf8 >> IO.hPutStrLn h s + +hGetContents :: Handle -> IO String +hGetContents h = hSetEncoding h utf8_bom >> IO.hGetContents h + +#else + import qualified Data.ByteString as B +import Codec.Binary.UTF8.String (encodeString) import Data.ByteString.UTF8 (toString, fromString) import Prelude hiding (readFile, writeFile, getContents, putStr, putStrLn) import System.IO (Handle) import Control.Monad (liftM) + bom :: B.ByteString bom = B.pack [0xEF, 0xBB, 0xBF] @@ -51,14 +90,17 @@ stripBOM s | bom `B.isPrefixOf` s = B.drop 3 s stripBOM s = s readFile :: FilePath -> IO String -readFile = liftM (toString . stripBOM) . B.readFile +readFile = liftM (toString . stripBOM) . B.readFile . encodeString writeFile :: FilePath -> String -> IO () -writeFile f = B.writeFile f . fromString +writeFile f = B.writeFile (encodeString f) . fromString getContents :: IO String getContents = liftM (toString . stripBOM) B.getContents +hGetContents :: Handle -> IO String +hGetContents h = liftM (toString . stripBOM) (B.hGetContents h) + putStr :: String -> IO () putStr = B.putStr . fromString @@ -70,3 +112,5 @@ hPutStr h = B.hPutStr h . fromString hPutStrLn :: Handle -> String -> IO () hPutStrLn h s = hPutStr h (s ++ "\n") + +#endif diff --git a/src/Text/Pandoc/Writers/ConTeXt.hs b/src/Text/Pandoc/Writers/ConTeXt.hs index 395bc2d30..0f6e00a3b 100644 --- a/src/Text/Pandoc/Writers/ConTeXt.hs +++ b/src/Text/Pandoc/Writers/ConTeXt.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} {- Copyright (C) 2007-2010 John MacFarlane <jgm@berkeley.edu> @@ -31,9 +32,9 @@ module Text.Pandoc.Writers.ConTeXt ( writeConTeXt ) where import Text.Pandoc.Definition import Text.Pandoc.Shared import Text.Printf ( printf ) -import Data.List ( isSuffixOf, intercalate, intersperse ) +import Data.List ( intercalate ) import Control.Monad.State -import Text.PrettyPrint.HughesPJ hiding ( Str ) +import Text.Pandoc.Pretty import Text.Pandoc.Templates ( renderTemplate ) data WriterState = @@ -56,15 +57,18 @@ writeConTeXt options document = pandocToConTeXt :: WriterOptions -> Pandoc -> State WriterState String pandocToConTeXt options (Pandoc (Meta title authors date) blocks) = do + let colwidth = if writerWrapText options + then Just $ writerColumns options + else Nothing titletext <- if null title then return "" - else liftM render $ inlineListToConTeXt title - authorstext <- mapM (liftM render . inlineListToConTeXt) authors + else liftM (render colwidth) $ inlineListToConTeXt title + authorstext <- mapM (liftM (render colwidth) . 
inlineListToConTeXt) authors datetext <- if null date then return "" - else liftM render $ inlineListToConTeXt date - body <- blockListToConTeXt blocks - let main = render $ body $$ text "" + else liftM (render colwidth) $ inlineListToConTeXt date + body <- blockListToConTeXt blocks + let main = render colwidth $ body let context = writerVariables options ++ [ ("toc", if writerTableOfContents options then "yes" else "") , ("body", main) @@ -92,6 +96,8 @@ escapeCharForConTeXt ch = '#' -> "\\#" '<' -> "\\letterless{}" '>' -> "\\lettermore{}" + '[' -> "{[}" + ']' -> "{]}" '_' -> "\\letterunderscore{}" '\160' -> "~" x -> [x] @@ -102,32 +108,27 @@ stringToConTeXt = concatMap escapeCharForConTeXt -- | Convert Pandoc block element to ConTeXt. blockToConTeXt :: Block - -> State WriterState BlockWrapper -blockToConTeXt Null = return $ Reg empty -blockToConTeXt (Plain lst) = do - st <- get - let options = stOptions st - contents <- wrapTeXIfNeeded options False inlineListToConTeXt lst - return $ Reg contents + -> State WriterState Doc +blockToConTeXt Null = return empty +blockToConTeXt (Plain lst) = inlineListToConTeXt lst blockToConTeXt (Para [Image txt (src,_)]) = do capt <- inlineListToConTeXt txt - return $ Pad $ text "\\placefigure[here,nonumber]{" <> capt <> - text "}{\\externalfigure[" <> text src <> text "]}" + return $ blankline $$ "\\placefigure[here,nonumber]" <> braces capt <> + braces ("\\externalfigure" <> brackets (text src)) <> blankline blockToConTeXt (Para lst) = do - st <- get - let options = stOptions st - contents <- wrapTeXIfNeeded options False inlineListToConTeXt lst - return $ Pad contents + contents <- inlineListToConTeXt lst + return $ contents <> blankline blockToConTeXt (BlockQuote lst) = do contents <- blockListToConTeXt lst - return $ Pad $ text "\\startblockquote" $$ contents $$ text "\\stopblockquote" -blockToConTeXt (CodeBlock _ str) = - return $ Reg $ text $ "\\starttyping\n" ++ str ++ "\n\\stoptyping\n" - -- \n because \stoptyping can't have anything after it, inc. } -blockToConTeXt (RawHtml _) = return $ Reg empty -blockToConTeXt (BulletList lst) = do + return $ "\\startblockquote" $$ nest 0 contents $$ "\\stopblockquote" <> blankline +blockToConTeXt (CodeBlock _ str) = + return $ "\\starttyping" <> cr <> flush (text str) <> cr <> "\\stoptyping" $$ blankline + -- blankline because \stoptyping can't have anything after it, inc. '}' +blockToConTeXt (RawBlock "context" str) = return $ text str <> blankline +blockToConTeXt (RawBlock _ _ ) = return empty +blockToConTeXt (BulletList lst) = do contents <- mapM listItemToConTeXt lst - return $ Pad $ text "\\startitemize" $$ vcat contents $$ text "\\stopitemize" + return $ "\\startitemize" $$ vcat contents $$ text "\\stopitemize" <> blankline blockToConTeXt (OrderedList (start, style', delim) lst) = do st <- get let level = stOrderedListLevel st @@ -159,20 +160,23 @@ blockToConTeXt (OrderedList (start, style', delim) lst) = do LowerAlpha -> "[a]" UpperAlpha -> "[A]" let specs = style'' ++ specs2 - return $ Pad $ text ("\\startitemize" ++ specs) $$ vcat contents $$ - text "\\stopitemize" + return $ "\\startitemize" <> text specs $$ vcat contents $$ + "\\stopitemize" <> blankline blockToConTeXt (DefinitionList lst) = - mapM defListItemToConTeXt lst >>= return . Pad . 
wrappedBlocksToDoc -blockToConTeXt HorizontalRule = return $ Pad $ text "\\thinrule" + liftM vcat $ mapM defListItemToConTeXt lst +blockToConTeXt HorizontalRule = return $ "\\thinrule" <> blankline blockToConTeXt (Header level lst) = do contents <- inlineListToConTeXt lst st <- get let opts = stOptions st let base = if writerNumberSections opts then "section" else "subject" - return $ Pad $ if level >= 1 && level <= 5 - then char '\\' <> text (concat (replicate (level - 1) "sub")) <> - text base <> char '{' <> contents <> char '}' - else contents + let level' = if writerChapters opts then level - 1 else level + return $ if level' >= 1 && level' <= 5 + then char '\\' <> text (concat (replicate (level' - 1) "sub")) <> + text base <> char '{' <> contents <> char '}' <> blankline + else if level' == 0 + then "\\chapter{" <> contents <> "}" + else contents <> blankline blockToConTeXt (Table caption aligns widths heads rows) = do let colDescriptor colWidth alignment = (case alignment of AlignLeft -> 'l' @@ -186,81 +190,87 @@ blockToConTeXt (Table caption aligns widths heads rows) = do zipWith colDescriptor widths aligns) headers <- if all null heads then return empty - else liftM ($$ text "\\HL") $ tableRowToConTeXt heads + else liftM ($$ "\\HL") $ tableRowToConTeXt heads captionText <- inlineListToConTeXt caption let captionText' = if null caption then text "none" else captionText rows' <- mapM tableRowToConTeXt rows - return $ Pad $ text "\\placetable[here]{" <> captionText' <> char '}' $$ - text "\\starttable[" <> text colDescriptors <> char ']' $$ - text "\\HL" $$ headers $$ - vcat rows' $$ text "\\HL\n\\stoptable" + return $ "\\placetable[here]" <> braces captionText' $$ + "\\starttable" <> brackets (text colDescriptors) $$ + "\\HL" $$ headers $$ + vcat rows' $$ "\\HL" $$ "\\stoptable" <> blankline tableRowToConTeXt :: [[Block]] -> State WriterState Doc tableRowToConTeXt cols = do cols' <- mapM blockListToConTeXt cols - return $ (vcat (map (text "\\NC " <>) cols')) $$ - text "\\NC\\AR" + return $ (vcat (map ("\\NC " <>) cols')) $$ "\\NC\\AR" listItemToConTeXt :: [Block] -> State WriterState Doc listItemToConTeXt list = blockListToConTeXt list >>= - return . (text "\\item" $$) . (nest 2) + return . ("\\item" $$) . (nest 2) -defListItemToConTeXt :: ([Inline], [[Block]]) -> State WriterState BlockWrapper +defListItemToConTeXt :: ([Inline], [[Block]]) -> State WriterState Doc defListItemToConTeXt (term, defs) = do term' <- inlineListToConTeXt term - def' <- liftM (vcat . intersperse (text "")) $ mapM blockListToConTeXt defs - return $ Pad $ text "\\startdescr{" <> term' <> char '}' $$ def' $$ text "\\stopdescr" + def' <- liftM vsep $ mapM blockListToConTeXt defs + return $ "\\startdescr" <> braces term' $$ nest 2 def' $$ + "\\stopdescr" <> blankline -- | Convert list of block elements to ConTeXt. blockListToConTeXt :: [Block] -> State WriterState Doc -blockListToConTeXt lst = mapM blockToConTeXt lst >>= return . wrappedBlocksToDoc +blockListToConTeXt lst = liftM vcat $ mapM blockToConTeXt lst -- | Convert list of inline elements to ConTeXt. inlineListToConTeXt :: [Inline] -- ^ Inlines to convert -> State WriterState Doc -inlineListToConTeXt lst = mapM inlineToConTeXt lst >>= return . 
hcat +inlineListToConTeXt lst = liftM hcat $ mapM inlineToConTeXt lst -- | Convert inline element to ConTeXt inlineToConTeXt :: Inline -- ^ Inline to convert -> State WriterState Doc inlineToConTeXt (Emph lst) = do contents <- inlineListToConTeXt lst - return $ text "{\\em " <> contents <> char '}' + return $ braces $ "\\em " <> contents inlineToConTeXt (Strong lst) = do contents <- inlineListToConTeXt lst - return $ text "{\\bf " <> contents <> char '}' + return $ braces $ "\\bf " <> contents inlineToConTeXt (Strikeout lst) = do contents <- inlineListToConTeXt lst - return $ text "\\overstrikes{" <> contents <> char '}' + return $ "\\overstrikes" <> braces contents inlineToConTeXt (Superscript lst) = do contents <- inlineListToConTeXt lst - return $ text "\\high{" <> contents <> char '}' + return $ "\\high" <> braces contents inlineToConTeXt (Subscript lst) = do contents <- inlineListToConTeXt lst - return $ text "\\low{" <> contents <> char '}' + return $ "\\low" <> braces contents inlineToConTeXt (SmallCaps lst) = do contents <- inlineListToConTeXt lst - return $ text "{\\sc " <> contents <> char '}' -inlineToConTeXt (Code str) = return $ text $ "\\type{" ++ str ++ "}" + return $ braces $ "\\sc " <> contents +inlineToConTeXt (Code _ str) | not ('{' `elem` str || '}' `elem` str) = + return $ "\\type" <> braces (text str) +inlineToConTeXt (Code _ str) = + return $ "\\mono" <> braces (text $ stringToConTeXt str) inlineToConTeXt (Quoted SingleQuote lst) = do contents <- inlineListToConTeXt lst - return $ text "\\quote{" <> contents <> char '}' + return $ "\\quote" <> braces contents inlineToConTeXt (Quoted DoubleQuote lst) = do contents <- inlineListToConTeXt lst - return $ text "\\quotation{" <> contents <> char '}' + return $ "\\quotation" <> braces contents inlineToConTeXt (Cite _ lst) = inlineListToConTeXt lst inlineToConTeXt Apostrophe = return $ char '\'' -inlineToConTeXt EmDash = return $ text "---" -inlineToConTeXt EnDash = return $ text "--" -inlineToConTeXt Ellipses = return $ text "\\ldots{}" +inlineToConTeXt EmDash = return "---" +inlineToConTeXt EnDash = return "--" +inlineToConTeXt Ellipses = return "\\ldots{}" inlineToConTeXt (Str str) = return $ text $ stringToConTeXt str -inlineToConTeXt (Math InlineMath str) = return $ char '$' <> text str <> char '$' -inlineToConTeXt (Math DisplayMath str) = return $ text "\\startformula " <> text str <> text " \\stopformula" -inlineToConTeXt (TeX str) = return $ text str -inlineToConTeXt (HtmlInline _) = return empty -inlineToConTeXt (LineBreak) = return $ text "\\crlf\n" -inlineToConTeXt Space = return $ char ' ' -inlineToConTeXt (Link [Code str] (src, tit)) = -- since ConTeXt has its own +inlineToConTeXt (Math InlineMath str) = + return $ char '$' <> text str <> char '$' +inlineToConTeXt (Math DisplayMath str) = + return $ text "\\startformula " <> text str <> text " \\stopformula" +inlineToConTeXt (RawInline "context" str) = return $ text str +inlineToConTeXt (RawInline "tex" str) = return $ text str +inlineToConTeXt (RawInline _ _) = return empty +inlineToConTeXt (LineBreak) = return $ text "\\crlf" <> cr +inlineToConTeXt Space = return space +inlineToConTeXt (Link [Code _ str] (src, tit)) = -- since ConTeXt has its own inlineToConTeXt (Link [Str str] (src, tit)) -- way of printing links... 
inlineToConTeXt (Link txt (src, _)) = do st <- get @@ -268,15 +278,12 @@ inlineToConTeXt (Link txt (src, _)) = do put $ st {stNextRef = next + 1} let ref = show next label <- inlineListToConTeXt txt - return $ text "\\useURL[" <> text ref <> text "][" <> text src <> - text "][][" <> label <> text "]\\from[" <> text ref <> char ']' + return $ "\\useURL" <> brackets (text ref) <> brackets (text src) <> + brackets empty <> brackets label <> + "\\from" <> brackets (text ref) inlineToConTeXt (Image _ (src, _)) = do - return $ text "{\\externalfigure[" <> text src <> text "]}" + return $ braces $ "\\externalfigure" <> brackets (text src) inlineToConTeXt (Note contents) = do contents' <- blockListToConTeXt contents - let rawnote = stripTrailingNewlines $ render contents' - -- note: a \n before } is needed when note ends with a \stoptyping - let optNewline = "\\stoptyping" `isSuffixOf` rawnote - return $ text "\\footnote{" <> - text rawnote <> (if optNewline then char '\n' else empty) <> char '}' - + return $ text "\\footnote{" <> + nest 2 contents' <> char '}' diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs index 5223259eb..9d09d46e3 100644 --- a/src/Text/Pandoc/Writers/Docbook.hs +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -33,15 +33,15 @@ import Text.Pandoc.XML import Text.Pandoc.Shared import Text.Pandoc.Templates (renderTemplate) import Text.Pandoc.Readers.TeXMath -import Data.List ( isPrefixOf, intercalate ) +import Data.List ( isPrefixOf, intercalate, isSuffixOf ) import Data.Char ( toLower ) -import Text.PrettyPrint.HughesPJ hiding ( Str ) import Text.Pandoc.Highlighting (languages, languagesByExtension) +import Text.Pandoc.Pretty -- | Convert list of authors to a docbook <author> section authorToDocbook :: WriterOptions -> [Inline] -> Doc authorToDocbook opts name' = - let name = render $ inlinesToDocbook opts name' + let name = render Nothing $ inlinesToDocbook opts name' in if ',' `elem` name then -- last name first let (lastname, rest) = break (==',') name @@ -61,16 +61,24 @@ authorToDocbook opts name' = -- | Convert Pandoc document to string in Docbook format. 
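-- Editorial aside, not part of the patch: the writers ported to
-- Text.Pandoc.Pretty all share one wrapping idiom, sketched here on the
-- assumption that Pretty's render takes an optional line width:
--
--   let colwidth = if writerWrapText opts
--                     then Just $ writerColumns opts
--                     else Nothing
--   in render colwidth doc   -- Nothing disables wrapping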
writeDocbook :: WriterOptions -> Pandoc -> String writeDocbook opts (Pandoc (Meta tit auths dat) blocks) = - let title = wrap opts tit + let title = inlinesToDocbook opts tit authors = map (authorToDocbook opts) auths date = inlinesToDocbook opts dat elements = hierarchicalize blocks - main = render $ vcat (map (elementToDocbook opts) elements) + colwidth = if writerWrapText opts + then Just $ writerColumns opts + else Nothing + render' = render colwidth + opts' = if "</book>" `isSuffixOf` + (removeTrailingSpace $ writerTemplate opts) + then opts{ writerChapters = True } + else opts + main = render' $ vcat (map (elementToDocbook opts') elements) context = writerVariables opts ++ [ ("body", main) - , ("title", render title) - , ("date", render date) ] ++ - [ ("author", render a) | a <- authors ] + , ("title", render' title) + , ("date", render' date) ] ++ + [ ("author", render' a) | a <- authors ] in if writerStandalone opts then renderTemplate context $ writerTemplate opts else main @@ -83,9 +91,12 @@ elementToDocbook opts (Sec _ _num id' title elements) = let elements' = if null elements then [Blk (Para [])] else elements - in inTags True "section" [("id",id')] $ - inTagsSimple "title" (wrap opts title) $$ - vcat (map (elementToDocbook opts) elements') + tag = if writerChapters opts + then "chapter" + else "section" + in inTags True tag [("id",id')] $ + inTagsSimple "title" (inlinesToDocbook opts title) $$ + vcat (map (elementToDocbook opts{ writerChapters = False }) elements') -- | Convert a list of Pandoc blocks to Docbook. blocksToDocbook :: WriterOptions -> [Block] -> Doc @@ -123,7 +134,7 @@ listItemToDocbook opts item = blockToDocbook :: WriterOptions -> Block -> Doc blockToDocbook _ Null = empty blockToDocbook _ (Header _ _) = empty -- should not occur after hierarchicalize -blockToDocbook opts (Plain lst) = wrap opts lst +blockToDocbook opts (Plain lst) = inlinesToDocbook opts lst blockToDocbook opts (Para [Image txt (src,_)]) = let capt = inlinesToDocbook opts txt in inTagsIndented "figure" $ @@ -132,12 +143,13 @@ blockToDocbook opts (Para [Image txt (src,_)]) = (inTagsIndented "imageobject" (selfClosingTag "imagedata" [("fileref",src)])) $$ inTagsSimple "textobject" (inTagsSimple "phrase" capt)) -blockToDocbook opts (Para lst) = inTagsIndented "para" $ wrap opts lst +blockToDocbook opts (Para lst) = + inTagsIndented "para" $ inlinesToDocbook opts lst blockToDocbook opts (BlockQuote blocks) = inTagsIndented "blockquote" $ blocksToDocbook opts blocks blockToDocbook _ (CodeBlock (_,classes,_) str) = - text ("<screen" ++ lang ++ ">\n") <> - text (escapeStringForXML str) <> text "\n</screen>" + text ("<screen" ++ lang ++ ">") <> cr <> + flush (text (escapeStringForXML str) <> cr <> text "</screen>") where lang = if null langs then "" else " language=\"" ++ escapeStringForXML (head langs) ++ @@ -167,7 +179,10 @@ blockToDocbook opts (OrderedList (start, numstyle, _) (first:rest)) = in inTags True "orderedlist" attribs items blockToDocbook opts (DefinitionList lst) = inTagsIndented "variablelist" $ deflistItemsToDocbook opts lst -blockToDocbook _ (RawHtml str) = text str -- raw XML block +blockToDocbook _ (RawBlock "docbook" str) = text str -- raw XML block +-- we allow html for compatibility with earlier versions of pandoc +blockToDocbook _ (RawBlock "html" str) = text str -- raw XML block +blockToDocbook _ (RawBlock _ _) = empty blockToDocbook _ HorizontalRule = empty -- not semantic blockToDocbook opts (Table caption aligns widths headers rows) = let alignStrings = map 
alignmentToString aligns @@ -214,12 +229,6 @@ tableItemToDocbook opts tag align item = let attrib = [("align", align)] in inTags True tag attrib $ vcat $ map (blockToDocbook opts) item --- | Take list of inline elements and return wrapped doc. -wrap :: WriterOptions -> [Inline] -> Doc -wrap opts lst = if writerWrapText opts - then fsep $ map (inlinesToDocbook opts) (splitBy Space lst) - else inlinesToDocbook opts lst - -- | Convert a list of inline elements to Docbook. inlinesToDocbook :: WriterOptions -> [Inline] -> Doc inlinesToDocbook opts lst = hcat $ map (inlineToDocbook opts) lst @@ -249,22 +258,21 @@ inlineToDocbook _ Apostrophe = char '\'' inlineToDocbook _ Ellipses = text "…" inlineToDocbook _ EmDash = text "—" inlineToDocbook _ EnDash = text "–" -inlineToDocbook _ (Code str) = +inlineToDocbook _ (Code _ str) = inTagsSimple "literal" $ text (escapeStringForXML str) inlineToDocbook opts (Math _ str) = inlinesToDocbook opts $ readTeXMath str -inlineToDocbook _ (TeX _) = empty -inlineToDocbook _ (HtmlInline _) = empty -inlineToDocbook _ LineBreak = text $ "<literallayout></literallayout>" -inlineToDocbook _ Space = char ' ' +inlineToDocbook _ (RawInline _ _) = empty +inlineToDocbook _ LineBreak = inTagsSimple "literallayout" empty +inlineToDocbook _ Space = space inlineToDocbook opts (Link txt (src, _)) = if isPrefixOf "mailto:" src then let src' = drop 7 src emailLink = inTagsSimple "email" $ text $ escapeStringForXML $ src' - in if txt == [Code src'] - then emailLink - else inlinesToDocbook opts txt <+> char '(' <> emailLink <> - char ')' + in case txt of + [Code _ s] | s == src' -> emailLink + _ -> inlinesToDocbook opts txt <+> + char '(' <> emailLink <> char ')' else (if isPrefixOf "#" src then inTags False "link" [("linkend", drop 1 src)] else inTags False "ulink" [("url", src)]) $ @@ -275,6 +283,6 @@ inlineToDocbook _ (Image _ (src, tit)) = else inTagsIndented "objectinfo" $ inTagsIndented "title" (text $ escapeStringForXML tit) in inTagsIndented "inlinemediaobject" $ inTagsIndented "imageobject" $ - titleDoc $$ selfClosingTag "imagedata" [("fileref", src)] + titleDoc $$ selfClosingTag "imagedata" [("fileref", src)] inlineToDocbook opts (Note contents) = inTagsIndented "footnote" $ blocksToDocbook opts contents diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index deaa2fe33..33b8aa76a 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -39,6 +39,7 @@ import Codec.Archive.Zip import System.Time import Text.Pandoc.Shared hiding ( Element ) import Text.Pandoc.Definition +import Text.Pandoc.Generic import Control.Monad (liftM) import Text.XML.Light hiding (ppTopElement) import Text.Pandoc.UUID @@ -69,7 +70,7 @@ writeEPUB mbStylesheet opts doc@(Pandoc meta _) = do -- handle pictures picsRef <- newIORef [] - Pandoc _ blocks <- liftM (processWith transformBlock) $ processWithM + Pandoc _ blocks <- liftM (bottomUp transformBlock) $ bottomUpM (transformInlines (writerHTMLMathMethod opts) sourceDir picsRef) doc pics <- readIORef picsRef let readPicEntry (oldsrc, newsrc) = readEntry [] oldsrc >>= \e -> @@ -232,13 +233,13 @@ transformInlines (MathML _) _ _ (x@(Math _ _) : xs) = do mathml ++ "</ops:case><ops:default>" ++ fallback ++ "</ops:default>" ++ "</ops:switch>" result = if "<math" `isPrefixOf` mathml then inOps else mathml - return $ HtmlInline result : xs -transformInlines _ _ _ (HtmlInline _ : xs) = return $ Str "" : xs + return $ RawInline "html" result : xs +transformInlines _ _ _ (RawInline _ _ : xs) = return $ Str 
"" : xs transformInlines _ _ _ (Link lab (_,_) : xs) = return $ lab ++ xs transformInlines _ _ _ xs = return xs transformBlock :: Block -> Block -transformBlock (RawHtml _) = Null +transformBlock (RawBlock _ _) = Null transformBlock x = x (!) :: Node t => (t -> Element) -> [(String, String)] -> t -> Element diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index d2a400c5c..ef14b6809 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -105,8 +105,8 @@ pandocToHtml opts (Pandoc (Meta title' authors' date') blocks) = do toc <- if writerTableOfContents opts then tableOfContents opts sects else return Nothing - let startSlide = RawHtml "<div class=\"slide\">\n" - endSlide = RawHtml "</div>\n" + let startSlide = RawBlock "html" "<div class=\"slide\">\n" + endSlide = RawBlock "html" "</div>\n" let cutUp (HorizontalRule : Header 1 ys : xs) = cutUp (Header 1 ys : xs) cutUp (HorizontalRule : xs) = [endSlide, startSlide] ++ cutUp xs cutUp (Header 1 ys : xs) = [endSlide, startSlide] ++ @@ -134,6 +134,8 @@ pandocToHtml opts (Pandoc (Meta title' authors' date') blocks) = do MathML (Just url) -> script ! [src url, thetype "text/javascript"] $ noHtml + MathJax url -> + script ! [src url, thetype "text/javascript"] $ noHtml JsMath (Just url) -> script ! [src url, thetype "text/javascript"] $ noHtml @@ -168,6 +170,7 @@ inTemplate opts tit auths date toc body' newvars = , ("pagetitle", topTitle') , ("title", renderHtmlFragment tit) , ("date", date') ] ++ + [ ("html5","true") | writerHtml5 opts ] ++ (case toc of Just t -> [ ("toc", renderHtmlFragment t)] Nothing -> []) ++ @@ -187,7 +190,12 @@ tableOfContents opts sects = do let tocList = catMaybes contents return $ if null tocList then Nothing - else Just $ thediv ! [prefixedId opts' "TOC"] $ unordList tocList + else Just $ + if writerHtml5 opts + then tag "nav" ! [prefixedId opts' "TOC"] $ + unordList tocList + else thediv ! [prefixedId opts' "TOC"] $ + unordList tocList -- | Convert section number to string showSecNum :: [Int] -> String @@ -224,7 +232,10 @@ elementToHtml opts (Sec level num id' title' elements) = do return $ if slides -- S5 gets confused by the extra divs around sections then toHtmlFromList stuff else if writerSectionDivs opts - then thediv ! [prefixedId opts id'] << stuff + then if writerHtml5 opts + then tag "section" ! [prefixedId opts id'] + << stuff + else thediv ! [prefixedId opts id'] << stuff else toHtmlFromList stuff -- | Convert list of Note blocks to a footnote <div>. @@ -287,6 +298,12 @@ obfuscateChar char = obfuscateString :: String -> String obfuscateString = concatMap obfuscateChar . decodeCharacterReferences +attrsToHtml :: WriterOptions -> Attr -> [HtmlAttr] +attrsToHtml opts (id',classes',keyvals) = + [theclass (unwords classes') | not (null classes')] ++ + [prefixedId opts id' | not (null id')] ++ + map (\(x,y) -> strAttr x y) keyvals + -- | Convert Pandoc block element to HTML. blockToHtml :: WriterOptions -> Block -> State WriterState Html blockToHtml _ Null = return $ noHtml @@ -294,22 +311,24 @@ blockToHtml opts (Plain lst) = inlineListToHtml opts lst blockToHtml opts (Para [Image txt (s,tit)]) = do img <- inlineToHtml opts (Image txt (s,tit)) capt <- inlineListToHtml opts txt - return $ thediv ! [theclass "figure"] << - [img, paragraph ! [theclass "caption"] << capt] + return $ if writerHtml5 opts + then tag "figure" << + [img, tag "figcaption" << capt] + else thediv ! [theclass "figure"] << + [img, paragraph ! 
[theclass "caption"] << capt] blockToHtml opts (Para lst) = inlineListToHtml opts lst >>= (return . paragraph) -blockToHtml _ (RawHtml str) = return $ primHtml str +blockToHtml _ (RawBlock "html" str) = return $ primHtml str +blockToHtml _ (RawBlock _ _) = return noHtml blockToHtml _ (HorizontalRule) = return $ hr blockToHtml opts (CodeBlock (id',classes,keyvals) rawCode) = do let classes' = if writerLiterateHaskell opts then classes else filter (/= "literate") classes - case highlightHtml (id',classes',keyvals) rawCode of + case highlightHtml False (id',classes',keyvals) rawCode of Left _ -> -- change leading newlines into <br /> tags, because some -- browsers ignore leading newlines in pre blocks let (leadingBreaks, rawCode') = span (=='\n') rawCode - attrs = [theclass (unwords classes') | not (null classes')] ++ - [prefixedId opts id' | not (null id')] ++ - map (\(x,y) -> strAttr x y) keyvals + attrs = attrsToHtml opts (id', classes', keyvals) addBird = if "literate" `elem` classes' then unlines . map ("> " ++) . lines else unlines . lines @@ -366,7 +385,17 @@ blockToHtml opts (OrderedList (startnum, numstyle, _) lst) = do then [start startnum] else []) ++ (if numstyle /= DefaultStyle - then [thestyle $ "list-style-type: " ++ numstyle' ++ ";"] + then if writerHtml5 opts + then [strAttr "type" $ + case numstyle of + Decimal -> "1" + LowerAlpha -> "a" + UpperAlpha -> "A" + LowerRoman -> "i" + UpperRoman -> "I" + _ -> "1"] + else [thestyle $ "list-style-type: " ++ + numstyle'] else []) return $ ordList ! attribs $ contents blockToHtml opts (DefinitionList lst) = do @@ -379,28 +408,30 @@ blockToHtml opts (DefinitionList lst) = do else [] return $ dlist ! attribs << concat contents blockToHtml opts (Table capt aligns widths headers rows') = do - let alignStrings = map alignmentToString aligns captionDoc <- if null capt then return noHtml else inlineListToHtml opts capt >>= return . caption let percent w = show (truncate (100*w) :: Integer) ++ "%" + let widthAttrs w = if writerHtml5 opts + then [thestyle $ "width: " ++ percent w] + else [width $ percent w] let coltags = if all (== 0.0) widths then noHtml else concatHtml $ map - (\w -> col ! [width $ percent w] $ noHtml) widths + (\w -> col ! (widthAttrs w) $ noHtml) widths head' <- if all null headers then return noHtml - else liftM (thead <<) $ tableRowToHtml opts alignStrings 0 headers + else liftM (thead <<) $ tableRowToHtml opts aligns 0 headers body' <- liftM (tbody <<) $ - zipWithM (tableRowToHtml opts alignStrings) [1..] rows' + zipWithM (tableRowToHtml opts aligns) [1..] rows' return $ table $ captionDoc +++ coltags +++ head' +++ body' tableRowToHtml :: WriterOptions - -> [String] + -> [Alignment] -> Int -> [[Block]] -> State WriterState Html -tableRowToHtml opts alignStrings rownum cols' = do +tableRowToHtml opts aligns rownum cols' = do let mkcell = if rownum == 0 then th else td let rowclass = case rownum of 0 -> "header" @@ -408,7 +439,7 @@ tableRowToHtml opts alignStrings rownum cols' = do _ -> "even" cols'' <- sequence $ zipWith (\alignment item -> tableItemToHtml opts mkcell alignment item) - alignStrings cols' + aligns cols' return $ tr ! [theclass rowclass] $ toHtmlFromList cols'' alignmentToString :: Alignment -> [Char] @@ -420,12 +451,15 @@ alignmentToString alignment = case alignment of tableItemToHtml :: WriterOptions -> (Html -> Html) - -> [Char] + -> Alignment -> [Block] -> State WriterState Html tableItemToHtml opts tag' align' item = do contents <- blockListToHtml opts item - return $ tag' ! 
[align align'] $ contents + let alignAttrs = if writerHtml5 opts + then [thestyle $ "align: " ++ alignmentToString align'] + else [align $ alignmentToString align'] + return $ tag' ! alignAttrs $ contents blockListToHtml :: WriterOptions -> [Block] -> State WriterState Html blockListToHtml opts lst = @@ -449,7 +483,11 @@ inlineToHtml opts inline = (Apostrophe) -> return $ stringToHtml "’" (Emph lst) -> inlineListToHtml opts lst >>= return . emphasize (Strong lst) -> inlineListToHtml opts lst >>= return . strong - (Code str) -> return $ thecode << str + (Code attr str) -> case highlightHtml True attr str of + Left _ -> return + $ thecode ! (attrsToHtml opts attr) + $ stringToHtml str + Right h -> return h (Strikeout lst) -> inlineListToHtml opts lst >>= return . (thespan ! [thestyle "text-decoration: line-through;"]) (SmallCaps lst) -> inlineListToHtml opts lst >>= @@ -464,8 +502,7 @@ inlineToHtml opts inline = stringToHtml "”") in do contents <- inlineListToHtml opts lst return $ leftQuote +++ contents +++ rightQuote - (Math t str) -> - modify (\st -> st {stMath = True}) >> + (Math t str) -> modify (\st -> st {stMath = True}) >> (case writerHTMLMathMethod opts of LaTeXMathML _ -> -- putting LaTeXMathML in container with class "LaTeX" prevents @@ -487,7 +524,9 @@ inlineToHtml opts inline = InlineMath -> m DisplayMath -> br +++ m +++ br GladTeX -> - return $ primHtml $ "<EQ>" ++ str ++ "</EQ>" + return $ case t of + InlineMath -> primHtml $ "<EQ ENV=\"math\">" ++ str ++ "</EQ>" + DisplayMath -> primHtml $ "<EQ ENV=\"displaymath\">" ++ str ++ "</EQ>" MathML _ -> do let dt = if t == InlineMath then DisplayInline @@ -500,18 +539,23 @@ inlineToHtml opts inline = Left _ -> inlineListToHtml opts (readTeXMath str) >>= return . (thespan ! [theclass "math"]) + MathJax _ -> return $ primHtml $ + case t of + InlineMath -> "\\(" ++ str ++ "\\)" + DisplayMath -> "\\[" ++ str ++ "\\]" PlainMath -> do x <- inlineListToHtml opts (readTeXMath str) let m = thespan ! [theclass "math"] $ x return $ case t of InlineMath -> m DisplayMath -> br +++ m +++ br ) - (TeX str) -> case writerHTMLMathMethod opts of - LaTeXMathML _ -> do modify (\st -> st {stMath = True}) - return $ primHtml str - _ -> return noHtml - (HtmlInline str) -> return $ primHtml str - (Link [Code str] (s,_)) | "mailto:" `isPrefixOf` s -> + (RawInline "latex" str) -> case writerHTMLMathMethod opts of + LaTeXMathML _ -> do modify (\st -> st {stMath = True}) + return $ primHtml str + _ -> return noHtml + (RawInline "html" str) -> return $ primHtml str + (RawInline _ _) -> return noHtml + (Link [Code _ str] (s,_)) | "mailto:" `isPrefixOf` s -> return $ obfuscateLink opts str s (Link txt (s,_)) | "mailto:" `isPrefixOf` s -> do linkText <- inlineListToHtml opts txt @@ -551,7 +595,7 @@ blockListToNote :: WriterOptions -> String -> [Block] -> State WriterState Html blockListToNote opts ref blocks = -- If last block is Para or Plain, include the backlink at the end of -- that block. Otherwise, insert a new Plain block with the backlink. 
- let backlink = [HtmlInline $ " <a href=\"#" ++ writerIdentifierPrefix opts ++ "fnref" ++ ref ++ + let backlink = [RawInline "html" $ " <a href=\"#" ++ writerIdentifierPrefix opts ++ "fnref" ++ ref ++ "\" class=\"footnoteBackLink\"" ++ " title=\"Jump back to footnote " ++ ref ++ "\">↩</a>"] blocks' = if null blocks diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs index 720c00ac8..28a1e7174 100644 --- a/src/Text/Pandoc/Writers/LaTeX.hs +++ b/src/Text/Pandoc/Writers/LaTeX.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} {- Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> @@ -29,13 +30,15 @@ Conversion of 'Pandoc' format into LaTeX. -} module Text.Pandoc.Writers.LaTeX ( writeLaTeX ) where import Text.Pandoc.Definition +import Text.Pandoc.Generic import Text.Pandoc.Shared import Text.Pandoc.Templates import Text.Printf ( printf ) -import Data.List ( (\\), isSuffixOf, isPrefixOf, intersperse ) -import Data.Char ( toLower ) +import Data.List ( (\\), isSuffixOf, isPrefixOf, intercalate, intersperse ) +import Data.Char ( toLower, isPunctuation ) import Control.Monad.State -import Text.PrettyPrint.HughesPJ hiding ( Str ) +import Text.Pandoc.Pretty +import System.FilePath (dropExtension) data WriterState = WriterState { stInNote :: Bool -- @True@ if we're in a note @@ -60,7 +63,7 @@ writeLaTeX options document = stVerbInNote = False, stEnumerate = False, stTable = False, stStrikeout = False, stSubscript = False, stUrl = False, stGraphics = False, - stLHS = False, stBook = False } + stLHS = False, stBook = writerChapters options } pandocToLaTeX :: WriterOptions -> Pandoc -> State WriterState String pandocToLaTeX options (Pandoc (Meta title authors date) blocks) = do @@ -70,13 +73,34 @@ pandocToLaTeX options (Pandoc (Meta title authors date) blocks) = do "{report}" `isSuffixOf` x) when (any usesBookClass (lines template)) $ modify $ \s -> s{stBook = True} - titletext <- liftM render $ inlineListToLaTeX title - authorsText <- mapM (liftM render . inlineListToLaTeX) authors - dateText <- liftM render $ inlineListToLaTeX date - body <- blockListToLaTeX blocks - let main = render body + opts <- liftM stOptions get + let colwidth = if writerWrapText opts + then Just $ writerColumns opts + else Nothing + titletext <- liftM (render colwidth) $ inlineListToLaTeX title + authorsText <- mapM (liftM (render colwidth) . 
inlineListToLaTeX) authors + dateText <- liftM (render colwidth) $ inlineListToLaTeX date + let (blocks', lastHeader) = if writerCiteMethod options == Citeproc then + (blocks, []) + else case last blocks of + Header 1 il -> (init blocks, il) + _ -> (blocks, []) + body <- blockListToLaTeX blocks' + biblioTitle <- liftM (render colwidth) $ inlineListToLaTeX lastHeader + let main = render colwidth body st <- get - let context = writerVariables options ++ + let biblioFiles = intercalate "," $ map dropExtension $ writerBiblioFiles options + citecontext = case writerCiteMethod options of + Natbib -> [ ("biblio-files", biblioFiles) + , ("biblio-title", biblioTitle) + , ("natbib", "yes") + ] + Biblatex -> [ ("biblio-files", biblioFiles) + , ("biblio-title", biblioTitle) + , ("biblatex", "yes") + ] + _ -> [] + context = writerVariables options ++ [ ("toc", if writerTableOfContents options then "yes" else "") , ("body", main) , ("title", titletext) @@ -91,7 +115,10 @@ pandocToLaTeX options (Pandoc (Meta title authors date) blocks) = do [ ("url", "yes") | stUrl st ] ++ [ ("numbersections", "yes") | writerNumberSections options ] ++ [ ("lhs", "yes") | stLHS st ] ++ - [ ("graphics", "yes") | stGraphics st ] + [ ("graphics", "yes") | stGraphics st ] ++ + [ ("book-class", "yes") | stBook st] ++ + [ ("listings", "yes") | writerListings options ] ++ + citecontext return $ if writerStandalone options then renderTemplate context template else main @@ -107,7 +134,13 @@ stringToLaTeX = escapeStringUsing latexEscapes , ('|', "\\textbar{}") , ('<', "\\textless{}") , ('>', "\\textgreater{}") + , ('[', "{[}") -- to avoid interpretation as + , (']', "{]}") -- optional arguments , ('\160', "~") + , ('\x2018', "`") + , ('\x2019', "'") + , ('\x201C', "``") + , ('\x201D', "''") ] -- | Puts contents into LaTeX command. @@ -118,49 +151,73 @@ inCmd cmd contents = char '\\' <> text cmd <> braces contents -- (because it's illegal to have verbatim inside some command arguments) deVerb :: [Inline] -> [Inline] deVerb [] = [] -deVerb ((Code str):rest) = - (TeX $ "\\texttt{" ++ stringToLaTeX str ++ "}"):(deVerb rest) +deVerb ((Code _ str):rest) = + (RawInline "latex" $ "\\texttt{" ++ stringToLaTeX str ++ "}"):(deVerb rest) deVerb (other:rest) = other:(deVerb rest) -- | Convert Pandoc block element to LaTeX. 
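As a worked sketch of the widened escape table above (illustrative, not part of the patch): the two bracket entries keep stray [ and ] from being read as optional arguments to a preceding command, and the smart-quote entries map Unicode quotes back to TeX's ASCII quote ligatures.

-- stringToLaTeX "[see] \8216quoted\8217"  ==  "{[}see{]} `quoted'"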
blockToLaTeX :: Block -- ^ Block to convert -> State WriterState Doc blockToLaTeX Null = return empty -blockToLaTeX (Plain lst) = do - st <- get - let opts = stOptions st - wrapTeXIfNeeded opts True inlineListToLaTeX lst +blockToLaTeX (Plain lst) = inlineListToLaTeX lst blockToLaTeX (Para [Image txt (src,tit)]) = do capt <- inlineListToLaTeX txt img <- inlineToLaTeX (Image txt (src,tit)) - return $ text "\\begin{figure}[htb]" $$ text "\\centering" $$ img $$ - (text "\\caption{" <> capt <> char '}') $$ text "\\end{figure}\n" + return $ "\\begin{figure}[htb]" $$ "\\centering" $$ img $$ + ("\\caption{" <> capt <> char '}') $$ "\\end{figure}" $$ blankline blockToLaTeX (Para lst) = do - st <- get - let opts = stOptions st - result <- wrapTeXIfNeeded opts True inlineListToLaTeX lst - return $ result <> char '\n' + result <- inlineListToLaTeX lst + return $ result <> blankline blockToLaTeX (BlockQuote lst) = do contents <- blockListToLaTeX lst - return $ text "\\begin{quote}" $$ contents $$ text "\\end{quote}" -blockToLaTeX (CodeBlock (_,classes,_) str) = do + return $ "\\begin{quote}" $$ contents $$ "\\end{quote}" +blockToLaTeX (CodeBlock (_,classes,keyvalAttr) str) = do st <- get env <- if writerLiterateHaskell (stOptions st) && "haskell" `elem` classes && "literate" `elem` classes then do modify $ \s -> s{ stLHS = True } return "code" - else if stInNote st - then do - modify $ \s -> s{ stVerbInNote = True } - return "Verbatim" - else return "verbatim" - return $ text ("\\begin{" ++ env ++ "}\n") <> text str <> - text ("\n\\end{" ++ env ++ "}") -blockToLaTeX (RawHtml _) = return empty + else if writerListings (stOptions st) + then return "lstlisting" + else if stInNote st + then do + modify $ \s -> s{ stVerbInNote = True } + return "Verbatim" + else return "verbatim" + let params = if writerListings (stOptions st) + then take 1 + [ "language=" ++ lang | lang <- classes + , lang `elem` ["ABAP","IDL","Plasm","ACSL","inform" + ,"POV","Ada","Java","Prolog","Algol" + ,"JVMIS","Promela","Ant","ksh","Python" + ,"Assembler","Lisp","R","Awk","Logo" + ,"Reduce","bash","make","Rexx","Basic" + ,"Mathematica","RSL","C","Matlab","Ruby" + ,"C++","Mercury","S","Caml","MetaPost" + ,"SAS","Clean","Miranda","Scilab","Cobol" + ,"Mizar","sh","Comal","ML","SHELXL","csh" + ,"Modula-2","Simula","Delphi","MuPAD" + ,"SQL","Eiffel","NASTRAN","tcl","Elan" + ,"Oberon-2","TeX","erlang","OCL" + ,"VBScript","Euphoria","Octave","Verilog" + ,"Fortran","Oz","VHDL","GCL","Pascal" + ,"VRML","Gnuplot","Perl","XML","Haskell" + ,"PHP","XSLT","HTML","PL/I"] + ] ++ + [ key ++ "=" ++ attr | (key,attr) <- keyvalAttr ] + else [] + printParams + | null params = empty + | otherwise = "[" <> hsep (intersperse "," (map text params)) <> + "]" + return $ "\\begin{" <> text env <> "}" <> printParams $$ flush (text str) $$ + "\\end{" <> text env <> "}" $$ cr -- final cr needed because of footnotes +blockToLaTeX (RawBlock "latex" x) = return $ text x <> blankline +blockToLaTeX (RawBlock _ _) = return empty blockToLaTeX (BulletList lst) = do items <- mapM listItemToLaTeX lst - return $ text "\\begin{itemize}" $$ vcat items $$ text "\\end{itemize}" + return $ "\\begin{itemize}" $$ vcat items $$ "\\end{itemize}" blockToLaTeX (OrderedList (start, numstyle, numdelim) lst) = do st <- get let oldlevel = stOLLevel st @@ -179,20 +236,19 @@ blockToLaTeX (OrderedList (start, numstyle, numdelim) lst) = do map toLower (toRomanNumeral oldlevel) ++ "}{" ++ show (start - 1) ++ "}" else empty - return $ text "\\begin{enumerate}" <> exemplar $$ resetcounter $$ - 
vcat items $$ text "\\end{enumerate}" + return $ "\\begin{enumerate}" <> exemplar $$ resetcounter $$ + vcat items $$ "\\end{enumerate}" blockToLaTeX (DefinitionList lst) = do items <- mapM defListItemToLaTeX lst - return $ text "\\begin{description}" $$ vcat items $$ - text "\\end{description}" -blockToLaTeX HorizontalRule = return $ text $ - "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n" + return $ "\\begin{description}" $$ vcat items $$ "\\end{description}" +blockToLaTeX HorizontalRule = return $ + "\\begin{center}\\rule{3in}{0.4pt}\\end{center}" $$ blankline blockToLaTeX (Header level lst) = do let lst' = deVerb lst txt <- inlineListToLaTeX lst' let noNote (Note _) = Str "" noNote x = x - let lstNoNotes = processWith noNote lst' + let lstNoNotes = bottomUp noNote lst' -- footnotes in sections don't work unless you specify an optional -- argument: \section[mysec]{mysec\footnote{blah}} optional <- if lstNoNotes == lst' @@ -202,30 +258,31 @@ blockToLaTeX (Header level lst) = do return $ char '[' <> res <> char ']' let stuffing = optional <> char '{' <> txt <> char '}' book <- liftM stBook get - return $ case (book, level) of - (True, 1) -> text "\\chapter" <> stuffing <> char '\n' - (True, 2) -> text "\\section" <> stuffing <> char '\n' - (True, 3) -> text "\\subsection" <> stuffing <> char '\n' - (True, 4) -> text "\\subsubsection" <> stuffing <> char '\n' - (False, 1) -> text "\\section" <> stuffing <> char '\n' - (False, 2) -> text "\\subsection" <> stuffing <> char '\n' - (False, 3) -> text "\\subsubsection" <> stuffing <> char '\n' - _ -> txt <> char '\n' + let level' = if book then level - 1 else level + let headerWith x y = text x <> y $$ blankline + return $ case level' of + 0 -> headerWith "\\chapter" stuffing + 1 -> headerWith "\\section" stuffing + 2 -> headerWith "\\subsection" stuffing + 3 -> headerWith "\\subsubsection" stuffing + 4 -> headerWith "\\paragraph" stuffing + 5 -> headerWith "\\subparagraph" stuffing + _ -> txt $$ blankline blockToLaTeX (Table caption aligns widths heads rows) = do headers <- if all null heads then return empty - else liftM ($$ text "\\hline") $ tableRowToLaTeX heads + else liftM ($$ "\\hline") $ (tableRowToLaTeX widths) heads captionText <- inlineListToLaTeX caption - rows' <- mapM tableRowToLaTeX rows + rows' <- mapM (tableRowToLaTeX widths) rows let colDescriptors = concat $ zipWith toColDescriptor widths aligns let tableBody = text ("\\begin{tabular}{" ++ colDescriptors ++ "}") $$ - headers $$ vcat rows' $$ text "\\end{tabular}" - let centered txt = text "\\begin{center}" $$ txt $$ text "\\end{center}" + headers $$ vcat rows' $$ "\\end{tabular}" + let centered txt = "\\begin{center}" $$ txt $$ "\\end{center}" modify $ \s -> s{ stTable = True } return $ if isEmpty captionText - then centered tableBody <> char '\n' - else text "\\begin{table}[h]" $$ centered tableBody $$ - inCmd "caption" captionText $$ text "\\end{table}\n" + then centered tableBody $$ blankline + else "\\begin{table}[h]" $$ centered tableBody $$ + inCmd "caption" captionText $$ "\\end{table}" $$ blankline toColDescriptor :: Double -> Alignment -> String toColDescriptor 0 align = @@ -240,16 +297,19 @@ toColDescriptor width align = ">{\\PBS" ++ AlignRight -> "\\raggedleft" AlignCenter -> "\\centering" AlignDefault -> "\\raggedright") ++ - "\\hspace{0pt}}p{" ++ printf "%.2f" width ++ - "\\columnwidth}" + "\\hspace{0pt}}p{" ++ printf "%.2f" width ++ "\\columnwidth}" blockListToLaTeX :: [Block] -> State WriterState Doc blockListToLaTeX lst = mapM blockToLaTeX lst >>= return . 
vcat -tableRowToLaTeX :: [[Block]] -> State WriterState Doc -tableRowToLaTeX cols = mapM blockListToLaTeX cols >>= - return . ($$ text "\\\\") . foldl (\row item -> row $$ - (if isEmpty row then text "" else text " & ") <> item) empty +tableRowToLaTeX :: [Double] -> [[Block]] -> State WriterState Doc +tableRowToLaTeX widths cols = do + renderedCells <- mapM blockListToLaTeX cols + let toCell 0 c = c + toCell w c = "\\parbox{" <> text (printf "%.2f" w) <> + "\\columnwidth}{" <> c <> cr <> "}" + let cells = zipWith toCell widths renderedCells + return $ (hcat $ intersperse (" & ") cells) <> "\\\\" listItemToLaTeX :: [Block] -> State WriterState Doc listItemToLaTeX lst = blockListToLaTeX lst >>= return . (text "\\item" $$) . @@ -258,8 +318,8 @@ listItemToLaTeX lst = blockListToLaTeX lst >>= return . (text "\\item" $$) . defListItemToLaTeX :: ([Inline], [[Block]]) -> State WriterState Doc defListItemToLaTeX (term, defs) = do term' <- inlineListToLaTeX $ deVerb term - def' <- liftM (vcat . intersperse (text "")) $ mapM blockListToLaTeX defs - return $ text "\\item[" <> term' <> text "]" $$ def' + def' <- liftM vsep $ mapM blockListToLaTeX defs + return $ "\\item" <> brackets term' $$ def' -- | Convert list of inline elements to LaTeX. inlineListToLaTeX :: [Inline] -- ^ Inlines to convert @@ -292,60 +352,161 @@ inlineToLaTeX (Subscript lst) = do return $ inCmd "textsubscr" contents inlineToLaTeX (SmallCaps lst) = inlineListToLaTeX (deVerb lst) >>= return . inCmd "textsc" -inlineToLaTeX (Cite _ lst) = - inlineListToLaTeX lst -inlineToLaTeX (Code str) = do +inlineToLaTeX (Cite cits lst) = do + st <- get + let opts = stOptions st + case writerCiteMethod opts of + Natbib -> citationsToNatbib cits + Biblatex -> citationsToBiblatex cits + _ -> inlineListToLaTeX lst + +inlineToLaTeX (Code _ str) = do st <- get when (stInNote st) $ modify $ \s -> s{ stVerbInNote = True } let chr = ((enumFromTo '!' '~') \\ str) !! 
0 - return $ text $ "\\verb" ++ [chr] ++ str ++ [chr] + if writerListings (stOptions st) + then return $ text $ "\\lstinline" ++ [chr] ++ str ++ [chr] + else return $ text $ "\\verb" ++ [chr] ++ str ++ [chr] inlineToLaTeX (Quoted SingleQuote lst) = do contents <- inlineListToLaTeX lst let s1 = if (not (null lst)) && (isQuoted (head lst)) - then text "\\," - else empty + then "\\," + else empty let s2 = if (not (null lst)) && (isQuoted (last lst)) - then text "\\," + then "\\," else empty return $ char '`' <> s1 <> contents <> s2 <> char '\'' inlineToLaTeX (Quoted DoubleQuote lst) = do contents <- inlineListToLaTeX lst let s1 = if (not (null lst)) && (isQuoted (head lst)) - then text "\\," - else empty + then "\\," + else empty let s2 = if (not (null lst)) && (isQuoted (last lst)) - then text "\\," + then "\\," else empty - return $ text "``" <> s1 <> contents <> s2 <> text "''" + return $ "``" <> s1 <> contents <> s2 <> "''" inlineToLaTeX Apostrophe = return $ char '\'' -inlineToLaTeX EmDash = return $ text "---" -inlineToLaTeX EnDash = return $ text "--" -inlineToLaTeX Ellipses = return $ text "\\ldots{}" +inlineToLaTeX EmDash = return "---" +inlineToLaTeX EnDash = return "--" +inlineToLaTeX Ellipses = return "\\ldots{}" inlineToLaTeX (Str str) = return $ text $ stringToLaTeX str inlineToLaTeX (Math InlineMath str) = return $ char '$' <> text str <> char '$' -inlineToLaTeX (Math DisplayMath str) = return $ text "\\[" <> text str <> text "\\]" -inlineToLaTeX (TeX str) = return $ text str -inlineToLaTeX (HtmlInline _) = return empty -inlineToLaTeX (LineBreak) = return $ text "\\\\" -inlineToLaTeX Space = return $ char ' ' +inlineToLaTeX (Math DisplayMath str) = return $ "\\[" <> text str <> "\\]" +inlineToLaTeX (RawInline "latex" str) = return $ text str +inlineToLaTeX (RawInline "tex" str) = return $ text str +inlineToLaTeX (RawInline _ _) = return empty +inlineToLaTeX (LineBreak) = return "\\\\" +inlineToLaTeX Space = return space inlineToLaTeX (Link txt (src, _)) = case txt of - [Code x] | x == src -> -- autolink + [Code _ x] | x == src -> -- autolink do modify $ \s -> s{ stUrl = True } return $ text $ "\\url{" ++ x ++ "}" _ -> do contents <- inlineListToLaTeX $ deVerb txt - return $ text ("\\href{" ++ src ++ "}{") <> contents <> - char '}' + return $ text ("\\href{" ++ stringToLaTeX src ++ "}{") <> + contents <> char '}' inlineToLaTeX (Image _ (source, _)) = do modify $ \s -> s{ stGraphics = True } - return $ text $ "\\includegraphics{" ++ source ++ "}" + return $ "\\includegraphics" <> braces (text source) inlineToLaTeX (Note contents) = do - st <- get - put (st {stInNote = True}) + modify (\s -> s{stInNote = True}) contents' <- blockListToLaTeX contents modify (\s -> s {stInNote = False}) - let rawnote = stripTrailingNewlines $ render contents' -- note: a \n before } is needed when note ends with a Verbatim environment - let optNewline = "\\end{Verbatim}" `isSuffixOf` rawnote - return $ text "\\footnote{" <> - text rawnote <> (if optNewline then char '\n' else empty) <> char '}' + return $ "\\footnote" <> braces (nest 2 contents') + + +citationsToNatbib :: [Citation] -> State WriterState Doc +citationsToNatbib (one:[]) + = citeCommand c p s k + where + Citation { citationId = k + , citationPrefix = p + , citationSuffix = s + , citationMode = m + } + = one + c = case m of + AuthorInText -> "citet" + SuppressAuthor -> "citeyearpar" + NormalCitation -> "citep" + +citationsToNatbib cits + | noPrefix (tail cits) && noSuffix (init cits) && ismode NormalCitation cits + = citeCommand "citep" 
p s ks + where + noPrefix = and . map (null . citationPrefix) + noSuffix = and . map (null . citationSuffix) + ismode m = and . map (((==) m) . citationMode) + p = citationPrefix $ head $ cits + s = citationSuffix $ last $ cits + ks = intercalate ", " $ map citationId cits + +citationsToNatbib (c:cs) | citationMode c == AuthorInText = do + author <- citeCommand "citeauthor" [] [] (citationId c) + cits <- citationsToNatbib (c { citationMode = SuppressAuthor } : cs) + return $ author <+> cits + +citationsToNatbib cits = do + cits' <- mapM convertOne cits + return $ text "\\citetext{" <> foldl combineTwo empty cits' <> text "}" + where + combineTwo a b | isEmpty a = b + | otherwise = a <> text "; " <> b + convertOne Citation { citationId = k + , citationPrefix = p + , citationSuffix = s + , citationMode = m + } + = case m of + AuthorInText -> citeCommand "citealt" p s k + SuppressAuthor -> citeCommand "citeyear" p s k + NormalCitation -> citeCommand "citealp" p s k + +citeCommand :: String -> [Inline] -> [Inline] -> String -> State WriterState Doc +citeCommand c p s k = do + args <- citeArguments p s k + return $ text ("\\" ++ c) <> args + +citeArguments :: [Inline] -> [Inline] -> String -> State WriterState Doc +citeArguments p s k = do + let s' = case s of + (Str (x:[]) : r) | isPunctuation x -> dropWhile (== Space) r + (Str (x:xs) : r) | isPunctuation x -> Str xs : r + _ -> s + pdoc <- inlineListToLaTeX p + sdoc <- inlineListToLaTeX s' + let optargs = case (isEmpty pdoc, isEmpty sdoc) of + (True, True ) -> empty + (True, False) -> brackets sdoc + (_ , _ ) -> brackets pdoc <> brackets sdoc + return $ optargs <> braces (text k) + +citationsToBiblatex :: [Citation] -> State WriterState Doc +citationsToBiblatex (one:[]) + = citeCommand cmd p s k + where + Citation { citationId = k + , citationPrefix = p + , citationSuffix = s + , citationMode = m + } = one + cmd = case m of + SuppressAuthor -> "autocite*" + AuthorInText -> "textcite" + NormalCitation -> "autocite" + +citationsToBiblatex (c:cs) = do + args <- mapM convertOne (c:cs) + return $ text cmd <> foldl (<>) empty args + where + cmd = case citationMode c of + AuthorInText -> "\\textcites" + _ -> "\\autocites" + convertOne Citation { citationId = k + , citationPrefix = p + , citationSuffix = s + } + = citeArguments p s k + +citationsToBiblatex _ = return empty diff --git a/src/Text/Pandoc/Writers/Man.hs b/src/Text/Pandoc/Writers/Man.hs index a46a18893..78b9274d6 100644 --- a/src/Text/Pandoc/Writers/Man.hs +++ b/src/Text/Pandoc/Writers/Man.hs @@ -35,7 +35,7 @@ import Text.Pandoc.Shared import Text.Pandoc.Readers.TeXMath import Text.Printf ( printf ) import Data.List ( isPrefixOf, intersperse, intercalate ) -import Text.PrettyPrint.HughesPJ hiding ( Str ) +import Text.Pandoc.Pretty import Control.Monad.State type Notes = [[Block]] @@ -52,27 +52,31 @@ pandocToMan opts (Pandoc (Meta title authors date) blocks) = do titleText <- inlineListToMan opts title authors' <- mapM (inlineListToMan opts) authors date' <- inlineListToMan opts date - let (cmdName, rest) = break (== ' ') $ render titleText + let colwidth = if writerWrapText opts + then Just $ writerColumns opts + else Nothing + let render' = render colwidth + let (cmdName, rest) = break (== ' ') $ render' titleText let (title', section) = case reverse cmdName of (')':d:'(':xs) | d `elem` ['0'..'9'] -> (text (reverse xs), char d) - xs -> (text (reverse xs), doubleQuotes empty) + xs -> (text (reverse xs), doubleQuotes empty) let description = hsep $ map (doubleQuotes . text . 
removeLeadingTrailingSpace) $ - splitBy '|' rest + splitBy (== '|') rest body <- blockListToMan opts blocks notes <- liftM stNotes get notes' <- notesToMan opts (reverse notes) - let main = render $ body $$ notes' $$ text "" + let main = render' $ body $$ notes' $$ text "" hasTables <- liftM stHasTables get let context = writerVariables opts ++ [ ("body", main) - , ("title", render title') - , ("section", render section) - , ("date", render date') - , ("description", render description) ] ++ + , ("title", render' title') + , ("section", render' section) + , ("date", render' date') + , ("description", render' description) ] ++ [ ("has-tables", "yes") | hasTables ] ++ - [ ("author", render a) | a <- authors' ] + [ ("author", render' a) | a <- authors' ] if writerStandalone opts then return $ renderTemplate context $ writerTemplate opts else return main @@ -89,7 +93,7 @@ notesToMan opts notes = noteToMan :: WriterOptions -> Int -> [Block] -> State WriterState Doc noteToMan opts num note = do contents <- blockListToMan opts note - let marker = text "\n.SS [" <> text (show num) <> char ']' + let marker = cr <> text ".SS " <> brackets (text (show num)) return $ marker $$ contents -- | Association list of characters to escape. @@ -136,14 +140,14 @@ blockToMan :: WriterOptions -- ^ Options -> State WriterState Doc blockToMan _ Null = return empty blockToMan opts (Plain inlines) = - liftM vcat $ mapM (wrapIfNeeded opts (inlineListToMan opts)) $ - splitSentences inlines + liftM vcat $ mapM (inlineListToMan opts) $ splitSentences inlines blockToMan opts (Para inlines) = do - contents <- liftM vcat $ mapM (wrapIfNeeded opts (inlineListToMan opts)) $ + contents <- liftM vcat $ mapM (inlineListToMan opts) $ splitSentences inlines return $ text ".PP" $$ contents -blockToMan _ (RawHtml _) = return empty -blockToMan _ HorizontalRule = return $ text $ ".PP\n * * * * *" +blockToMan _ (RawBlock "man" str) = return $ text str +blockToMan _ (RawBlock _ _) = return empty +blockToMan _ HorizontalRule = return $ text ".PP" $$ text " * * * * *" blockToMan opts (Header level inlines) = do contents <- inlineListToMan opts inlines let heading = case level of @@ -256,7 +260,7 @@ definitionListItemToMan opts (label, defs) = do mapM (\item -> blockToMan opts item) rest first' <- blockToMan opts first return $ first' $$ text ".RS" $$ rest' $$ text ".RE" - return $ text ".TP\n.B " <> labelText $+$ contents + return $ text ".TP" $$ text ".B " <> labelText $$ contents -- | Convert list of Pandoc block elements to man. blockListToMan :: WriterOptions -- ^ Options @@ -303,23 +307,25 @@ inlineToMan _ EmDash = return $ text "\\[em]" inlineToMan _ EnDash = return $ text "\\[en]" inlineToMan _ Apostrophe = return $ char '\'' inlineToMan _ Ellipses = return $ text "\\&..." 
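The same dispatch on the new format tag recurs in the writers touched by this patch: raw content is passed through only when its tag names the writer's own format, and is silently dropped otherwise. A minimal self-contained sketch of the idea, using hypothetical names rather than pandoc's own types:

-- Sketch only; 'Raw' and 'renderRaw' are illustrative, not part of the patch.
data Raw = Raw String String          -- format tag, raw content

renderRaw :: String -> Raw -> String
renderRaw fmt (Raw f str)
  | f == fmt  = str                   -- e.g. RawBlock "man" in the man writer
  | otherwise = ""                    -- raw content meant for other writers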
-inlineToMan _ (Code str) = +inlineToMan _ (Code _ str) = return $ text $ "\\f[C]" ++ escapeCode str ++ "\\f[]" inlineToMan _ (Str str) = return $ text $ escapeString str inlineToMan opts (Math InlineMath str) = inlineListToMan opts $ readTeXMath str inlineToMan opts (Math DisplayMath str) = do contents <- inlineListToMan opts $ readTeXMath str - return $ text ".RS" $$ contents $$ text ".RE" -inlineToMan _ (TeX _) = return empty -inlineToMan _ (HtmlInline _) = return empty -inlineToMan _ (LineBreak) = return $ text "\n.PD 0\n.P\n.PD\n" -inlineToMan _ Space = return $ char ' ' + return $ cr <> text ".RS" $$ contents $$ text ".RE" +inlineToMan _ (RawInline "man" str) = return $ text str +inlineToMan _ (RawInline _ _) = return empty +inlineToMan _ (LineBreak) = return $ + cr <> text ".PD 0" $$ text ".P" $$ text ".PD" <> cr +inlineToMan _ Space = return space inlineToMan opts (Link txt (src, _)) = do linktext <- inlineListToMan opts txt let srcSuffix = if isPrefixOf "mailto:" src then drop 7 src else src - return $ if txt == [Code srcSuffix] - then char '<' <> text srcSuffix <> char '>' - else linktext <> text " (" <> text src <> char ')' + return $ case txt of + [Code _ s] + | s == srcSuffix -> char '<' <> text srcSuffix <> char '>' + _ -> linktext <> text " (" <> text src <> char ')' inlineToMan opts (Image alternate (source, tit)) = do let txt = if (null alternate) || (alternate == [Str ""]) || (alternate == [Str source]) -- to prevent autolinks diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs index 1b612006b..5e12c4aca 100644 --- a/src/Text/Pandoc/Writers/Markdown.hs +++ b/src/Text/Pandoc/Writers/Markdown.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} {- Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> @@ -31,13 +32,13 @@ Markdown: <http://daringfireball.net/projects/markdown/> -} module Text.Pandoc.Writers.Markdown (writeMarkdown, writePlain) where import Text.Pandoc.Definition +import Text.Pandoc.Generic import Text.Pandoc.Templates (renderTemplate) import Text.Pandoc.Shared -import Text.Pandoc.Parsing -import Text.Pandoc.Blocks +import Text.Pandoc.Parsing hiding (blankline) import Text.ParserCombinators.Parsec ( runParser, GenParser ) import Data.List ( group, isPrefixOf, find, intersperse, transpose ) -import Text.PrettyPrint.HughesPJ hiding ( Str ) +import Text.Pandoc.Pretty import Control.Monad.State type Notes = [[Block]] @@ -57,28 +58,28 @@ writeMarkdown opts document = -- pictures, or inline formatting). 
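The autolink test in the man writer's Link case above now matches on the code span's contents and ignores its attributes. A rough sketch of the two outcomes (illustrative values, not from the patch):

-- Link [Code ("",[],[]) "http://example.org"] ("http://example.org","")
--   ==> <http://example.org>                    -- recognised as an autolink
-- Link [Str "pandoc"] ("http://example.org","")
--   ==> pandoc (http://example.org)             -- ordinary link with target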
writePlain :: WriterOptions -> Pandoc -> String writePlain opts document = - evalState (pandocToMarkdown opts document') WriterState{ stNotes = [] - , stRefs = [] - , stPlain = True } + evalState (pandocToMarkdown opts{writerStrictMarkdown = True} + document') WriterState{ stNotes = [] + , stRefs = [] + , stPlain = True } where document' = plainify document plainify :: Pandoc -> Pandoc -plainify = processWith go - where go :: [Inline] -> [Inline] - go (Emph xs : ys) = go xs ++ go ys - go (Strong xs : ys) = go xs ++ go ys - go (Strikeout xs : ys) = go xs ++ go ys - go (Superscript xs : ys) = go xs ++ go ys - go (Subscript xs : ys) = go xs ++ go ys - go (SmallCaps xs : ys) = go xs ++ go ys - go (Code s : ys) = Str s : go ys - go (Math _ s : ys) = Str s : go ys - go (TeX _ : ys) = Str "" : go ys - go (HtmlInline _ : ys) = Str "" : go ys - go (Link xs _ : ys) = go xs ++ go ys - go (Image _ _ : ys) = go ys - go (x : ys) = x : go ys - go [] = [] +plainify = bottomUp go + where go :: Inline -> Inline + go (Emph xs) = SmallCaps xs + go (Strong xs) = SmallCaps xs + go (Strikeout xs) = SmallCaps xs + go (Superscript xs) = SmallCaps xs + go (Subscript xs) = SmallCaps xs + go (SmallCaps xs) = SmallCaps xs + go (Code _ s) = Str s + go (Math _ s) = Str s + go (RawInline _ _) = Str "" + go (Link xs _) = SmallCaps xs + go (Image xs _) = SmallCaps $ [Str "["] ++ xs ++ [Str "]"] + go (Cite _ cits) = SmallCaps cits + go x = x -- | Return markdown representation of document. pandocToMarkdown :: WriterOptions -> Pandoc -> State WriterState String @@ -96,15 +97,20 @@ pandocToMarkdown opts (Pandoc (Meta title authors date) blocks) = do notes' <- notesToMarkdown opts (reverse $ stNotes st) st' <- get -- note that the notes may contain refs refs' <- refsToMarkdown opts (reverse $ stRefs st') - let main = render $ foldl ($+$) empty $ [body, notes', refs'] + let colwidth = if writerWrapText opts + then Just $ writerColumns opts + else Nothing + let main = render colwidth $ body <> + (if isEmpty notes' then empty else blankline <> notes') <> + (if isEmpty refs' then empty else blankline <> refs') let context = writerVariables opts ++ - [ ("toc", render toc) + [ ("toc", render colwidth toc) , ("body", main) - , ("title", render title') - , ("date", render date') + , ("title", render colwidth title') + , ("date", render colwidth date') ] ++ [ ("titleblock", "yes") | titleblock ] ++ - [ ("author", render a) | a <- authors' ] + [ ("author", render colwidth a) | a <- authors' ] if writerStandalone opts then return $ renderTemplate context $ writerTemplate opts else return main @@ -112,29 +118,36 @@ pandocToMarkdown opts (Pandoc (Meta title authors date) blocks) = do -- | Return markdown representation of reference key table. refsToMarkdown :: WriterOptions -> Refs -> State WriterState Doc refsToMarkdown opts refs = mapM (keyToMarkdown opts) refs >>= return . vcat - + -- | Return markdown representation of a reference key. keyToMarkdown :: WriterOptions -> ([Inline], (String, String)) -> State WriterState Doc keyToMarkdown opts (label, (src, tit)) = do label' <- inlineListToMarkdown opts label - let tit' = if null tit then empty else text $ " \"" ++ tit ++ "\"" - return $ text " " <> char '[' <> label' <> char ']' <> text ": " <> - text src <> tit' + let tit' = if null tit + then empty + else space <> "\"" <> text tit <> "\"" + return $ nest 2 $ hang 2 + ("[" <> label' <> "]:" <> space) (text src <> tit') -- | Return markdown representation of notes. 
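plainify above now rewrites one Inline at a time with bottomUp from Text.Pandoc.Generic (a generic bottom-up traversal) instead of pattern-matching whole lists; formatted inlines are wrapped in SmallCaps, which the plain writer renders without markup. A sketch of the effect on a small input, derived from the cases shown above:

-- bottomUp go [Emph [Str "hi"], Code ("",[],[]) "x", Math InlineMath "y+1"]
--   == [SmallCaps [Str "hi"], Str "x", Str "y+1"]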
notesToMarkdown :: WriterOptions -> [[Block]] -> State WriterState Doc notesToMarkdown opts notes = - mapM (\(num, note) -> noteToMarkdown opts num note) (zip [1..] notes) >>= - return . vcat + mapM (\(num, note) -> noteToMarkdown opts num note) (zip [1..] notes) >>= + return . vsep -- | Return markdown representation of a note. noteToMarkdown :: WriterOptions -> Int -> [Block] -> State WriterState Doc noteToMarkdown opts num blocks = do contents <- blockListToMarkdown opts blocks - let marker = text "[^" <> text (show num) <> text "]:" - return $ hang' marker (writerTabStop opts) contents + let num' = text $ show num + let marker = text "[^" <> num' <> text "]:" + let markerSize = 4 + offset num' + let spacer = case writerTabStop opts - markerSize of + n | n > 0 -> text $ replicate n ' ' + _ -> text " " + return $ hang (writerTabStop opts) (marker <> spacer) contents -- | Escape special characters for Markdown. escapeString :: String -> String @@ -158,6 +171,22 @@ elementToListItem (Sec _ _ _ headerText subsecs) = [Plain headerText] ++ then [] else [BulletList $ map elementToListItem subsecs] +attrsToMarkdown :: Attr -> Doc +attrsToMarkdown attribs = braces $ hsep [attribId, attribClasses, attribKeys] + where attribId = case attribs of + ([],_,_) -> empty + (i,_,_) -> "#" <> text i + attribClasses = case attribs of + (_,[],_) -> empty + (_,cs,_) -> hsep $ + map (text . ('.':)) + cs + attribKeys = case attribs of + (_,_,[]) -> empty + (_,_,ks) -> hsep $ + map (\(k,v) -> text k + <> "=\"" <> text v <> "\"") ks + -- | Ordered list start parser for use in Para below. olMarker :: GenParser Char ParserState Char olMarker = do (start, style', delim) <- anyOrderedListMarker @@ -169,134 +198,139 @@ olMarker = do (start, style', delim) <- anyOrderedListMarker -- | True if string begins with an ordered list marker beginsWithOrderedListMarker :: String -> Bool -beginsWithOrderedListMarker str = - case runParser olMarker defaultParserState "para start" str of - Left _ -> False +beginsWithOrderedListMarker str = + case runParser olMarker defaultParserState "para start" (take 10 str) of + Left _ -> False Right _ -> True -wrappedMarkdown :: WriterOptions -> [Inline] -> State WriterState Doc -wrappedMarkdown opts inlines = do - let chunks = splitBy LineBreak inlines - let chunks' = if null chunks - then [] - else (map (++ [Str " "]) $ init chunks) ++ [last chunks] - lns <- mapM (wrapIfNeeded opts (inlineListToMarkdown opts)) chunks' - return $ vcat lns - -- | Convert Pandoc block element to markdown. 
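attrsToMarkdown above serialises a pandoc Attr in the attribute syntax reused by the delimited code blocks and attributed inline code elsewhere in this patch. A worked sketch (rendered with render Nothing; the identifier, class, and key are arbitrary example values):

-- attrsToMarkdown ("example", ["haskell"], [("lines","30")])
--   ==> {#example .haskell lines="30"}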
blockToMarkdown :: WriterOptions -- ^ Options -> Block -- ^ Block element -> State WriterState Doc blockToMarkdown _ Null = return empty -blockToMarkdown opts (Plain inlines) = - wrappedMarkdown opts inlines +blockToMarkdown opts (Plain inlines) = do + contents <- inlineListToMarkdown opts inlines + return $ contents <> cr blockToMarkdown opts (Para inlines) = do - contents <- wrappedMarkdown opts inlines + contents <- inlineListToMarkdown opts inlines -- escape if para starts with ordered list marker - let esc = if (not (writerStrictMarkdown opts)) && - beginsWithOrderedListMarker (render contents) - then char '\\' - else empty - return $ esc <> contents <> text "\n" -blockToMarkdown _ (RawHtml str) = do st <- get - if stPlain st - then return empty - else return $ text str -blockToMarkdown _ HorizontalRule = return $ text "\n* * * * *\n" + let esc = if (not (writerStrictMarkdown opts)) && + not (stPlain st) && + beginsWithOrderedListMarker (render Nothing contents) + then text "\\" + else empty + return $ esc <> contents <> blankline +blockToMarkdown _ (RawBlock f str) + | f == "html" || f == "latex" || f == "tex" || f == "markdown" = do + st <- get + if stPlain st + then return empty + else return $ text str <> text "\n" +blockToMarkdown _ (RawBlock _ _) = return empty +blockToMarkdown _ HorizontalRule = + return $ blankline <> text "* * * * *" <> blankline blockToMarkdown opts (Header level inlines) = do contents <- inlineListToMarkdown opts inlines st <- get -- use setext style headers if in literate haskell mode. -- ghc interprets '#' characters in column 1 as line number specifiers. if writerLiterateHaskell opts || stPlain st - then let len = length $ render contents - in return $ contents <> text "\n" <> - case level of - 1 -> text $ replicate len '=' ++ "\n" - 2 -> text $ replicate len '-' ++ "\n" - _ -> empty - else return $ text ((replicate level '#') ++ " ") <> contents <> text "\n" -blockToMarkdown opts (CodeBlock (_,classes,_) str) | "haskell" `elem` classes && - "literate" `elem` classes && - writerLiterateHaskell opts = - return $ (vcat $ map (text "> " <>) $ map text (lines str)) <> text "\n" -blockToMarkdown opts (CodeBlock _ str) = return $ - (nest (writerTabStop opts) $ vcat $ map text (lines str)) <> text "\n" + then let len = offset contents + in return $ contents <> cr <> + (case level of + 1 -> text $ replicate len '=' + 2 -> text $ replicate len '-' + _ -> empty) <> blankline + else return $ + text ((replicate level '#') ++ " ") <> contents <> blankline +blockToMarkdown opts (CodeBlock (_,classes,_) str) + | "haskell" `elem` classes && "literate" `elem` classes && + writerLiterateHaskell opts = + return $ prefixed "> " (text str) <> blankline +blockToMarkdown opts (CodeBlock attribs str) = return $ + if writerStrictMarkdown opts || attribs == nullAttr + then nest (writerTabStop opts) (text str) <> blankline + else -- use delimited code block + flush (tildes <> space <> attrs <> cr <> text str <> + cr <> tildes) <> blankline + where tildes = text "~~~~" + attrs = attrsToMarkdown attribs blockToMarkdown opts (BlockQuote blocks) = do st <- get -- if we're writing literate haskell, put a space before the bird tracks -- so they won't be interpreted as lhs... let leader = if writerLiterateHaskell opts - then text . (" > " ++) + then " > " else if stPlain st - then text . (" " ++) - else text . 
("> " ++) + then " " + else "> " contents <- blockListToMarkdown opts blocks - return $ (vcat $ map leader $ lines $ render contents) <> - text "\n" + return $ (prefixed leader contents) <> blankline blockToMarkdown opts (Table caption aligns widths headers rows) = do caption' <- inlineListToMarkdown opts caption let caption'' = if null caption then empty - else text "" $+$ (text ": " <> caption') + else blankline <> ": " <> caption' <> blankline headers' <- mapM (blockListToMarkdown opts) headers let alignHeader alignment = case alignment of - AlignLeft -> leftAlignBlock - AlignCenter -> centerAlignBlock - AlignRight -> rightAlignBlock - AlignDefault -> leftAlignBlock + AlignLeft -> lblock + AlignCenter -> cblock + AlignRight -> rblock + AlignDefault -> lblock rawRows <- mapM (mapM (blockListToMarkdown opts)) rows let isSimple = all (==0) widths - let numChars = maximum . map (length . render) + let numChars = maximum . map offset let widthsInChars = if isSimple then map ((+2) . numChars) $ transpose (headers' : rawRows) - else map (floor . (78 *)) widths - let makeRow = hsepBlocks . (zipWith alignHeader aligns) . - (zipWith docToBlock widthsInChars) + else map (floor . (fromIntegral (writerColumns opts) *)) widths + let makeRow = hcat . intersperse (lblock 1 (text " ")) . + (zipWith3 alignHeader aligns widthsInChars) let rows' = map makeRow rawRows let head' = makeRow headers' - let maxRowHeight = maximum $ map heightOfBlock (head':rows') - let underline = hsep $ - map (\width -> text $ replicate width '-') widthsInChars + let maxRowHeight = maximum $ map height (head':rows') + let underline = cat $ intersperse (text " ") $ + map (\width -> text (replicate width '-')) widthsInChars let border = if maxRowHeight > 1 - then text $ replicate (sum widthsInChars + (length widthsInChars - 1)) '-' + then text (replicate (sum widthsInChars + + length widthsInChars - 1) '-') else if all null headers then underline else empty let head'' = if all null headers then empty - else border $+$ blockToDoc head' - let spacer = if maxRowHeight > 1 - then text "" - else empty - let body = vcat $ intersperse spacer $ map blockToDoc rows' + else border <> cr <> head' + let body = if maxRowHeight > 1 + then vsep rows' + else vcat rows' let bottom = if all null headers then underline else border - return $ (nest 2 $ head'' $+$ underline $+$ body $+$ - bottom $+$ caption'') <> text "\n" + return $ nest 2 $ head'' $$ underline $$ body $$ + bottom $$ blankline $$ caption'' $$ blankline blockToMarkdown opts (BulletList items) = do contents <- mapM (bulletListItemToMarkdown opts) items - return $ (vcat contents) <> text "\n" + return $ cat contents <> blankline blockToMarkdown opts (OrderedList attribs items) = do let markers = orderedListMarkers attribs let markers' = map (\m -> if length m < 3 then m ++ replicate (3 - length m) ' ' - else m) markers + else m) markers contents <- mapM (\(item, num) -> orderedListItemToMarkdown opts item num) $ - zip markers' items - return $ (vcat contents) <> text "\n" + zip markers' items + return $ cat contents <> blankline blockToMarkdown opts (DefinitionList items) = do contents <- mapM (definitionListItemToMarkdown opts) items - return $ (vcat contents) <> text "\n" + return $ cat contents <> blankline -- | Convert bullet list item (list of blocks) to markdown. 
bulletListItemToMarkdown :: WriterOptions -> [Block] -> State WriterState Doc bulletListItemToMarkdown opts items = do contents <- blockListToMarkdown opts items - return $ hang' (text "- ") (writerTabStop opts) contents + let sps = replicate (writerTabStop opts - 2) ' ' + let start = text ('-' : ' ' : sps) + return $ hang (writerTabStop opts) start $ contents <> cr -- | Convert ordered list item (a list of blocks) to markdown. orderedListItemToMarkdown :: WriterOptions -- ^ options @@ -305,8 +339,11 @@ orderedListItemToMarkdown :: WriterOptions -- ^ options -> State WriterState Doc orderedListItemToMarkdown opts marker items = do contents <- blockListToMarkdown opts items - return $ hsep [nest (min (3 - length marker) 0) (text marker), - nest (writerTabStop opts) contents] + let sps = case length marker - writerTabStop opts of + n | n > 0 -> text $ replicate n ' ' + _ -> text " " + let start = text marker <> sps + return $ hang (writerTabStop opts) start $ contents <> cr -- | Convert definition list item (label, list of blocks) to markdown. definitionListItemToMarkdown :: WriterOptions @@ -316,17 +353,20 @@ definitionListItemToMarkdown opts (label, defs) = do labelText <- inlineListToMarkdown opts label let tabStop = writerTabStop opts st <- get - let leader = if stPlain st then empty else text " ~" - contents <- liftM vcat $ - mapM (liftM ((leader $$) . nest tabStop . vcat) . mapM (blockToMarkdown opts)) defs - return $ labelText $+$ contents + let leader = if stPlain st then " " else " ~" + let sps = case writerTabStop opts - 3 of + n | n > 0 -> text $ replicate n ' ' + _ -> text " " + defs' <- mapM (mapM (blockToMarkdown opts)) defs + let contents = vcat $ map (\d -> hang tabStop (leader <> sps) $ vcat d <> cr) defs' + return $ labelText <> cr <> contents <> cr -- | Convert list of Pandoc block elements to markdown. blockListToMarkdown :: WriterOptions -- ^ Options -> [Block] -- ^ List of block elements -> State WriterState Doc blockListToMarkdown opts blocks = - mapM (blockToMarkdown opts) blocks >>= return . vcat + mapM (blockToMarkdown opts) blocks >>= return . cat -- | Get reference for target; if none exists, create unique one and return. -- Prefer label if possible; otherwise, generate a unique key. @@ -349,86 +389,132 @@ getReference label (src, tit) = do -- | Convert list of Pandoc inline elements to markdown. inlineListToMarkdown :: WriterOptions -> [Inline] -> State WriterState Doc inlineListToMarkdown opts lst = - mapM (inlineToMarkdown opts) lst >>= return . hcat + mapM (inlineToMarkdown opts) lst >>= return . cat + +escapeSpaces :: Inline -> Inline +escapeSpaces (Str s) = Str $ substitute " " "\\ " s +escapeSpaces Space = Str "\\ " +escapeSpaces x = x -- | Convert Pandoc inline element to markdown. 
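escapeSpaces above backslash-escapes literal spaces so that superscript and subscript contents survive as a single token; the Superscript and Subscript cases below run it over their inlines with bottomUp before rendering. For example:

-- escapeSpaces (Str "a b")  ==  Str "a\\ b"
-- escapeSpaces Space        ==  Str "\\ "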
inlineToMarkdown :: WriterOptions -> Inline -> State WriterState Doc inlineToMarkdown opts (Emph lst) = do contents <- inlineListToMarkdown opts lst - return $ char '*' <> contents <> char '*' + return $ "*" <> contents <> "*" inlineToMarkdown opts (Strong lst) = do contents <- inlineListToMarkdown opts lst - return $ text "**" <> contents <> text "**" + return $ "**" <> contents <> "**" inlineToMarkdown opts (Strikeout lst) = do contents <- inlineListToMarkdown opts lst - return $ text "~~" <> contents <> text "~~" + return $ "~~" <> contents <> "~~" inlineToMarkdown opts (Superscript lst) = do - contents <- inlineListToMarkdown opts lst - let contents' = text $ substitute " " "\\ " $ render contents - return $ char '^' <> contents' <> char '^' + let lst' = bottomUp escapeSpaces lst + contents <- inlineListToMarkdown opts lst' + return $ "^" <> contents <> "^" inlineToMarkdown opts (Subscript lst) = do - contents <- inlineListToMarkdown opts lst - let contents' = text $ substitute " " "\\ " $ render contents - return $ char '~' <> contents' <> char '~' + let lst' = bottomUp escapeSpaces lst + contents <- inlineListToMarkdown opts lst' + return $ "~" <> contents <> "~" inlineToMarkdown opts (SmallCaps lst) = inlineListToMarkdown opts lst inlineToMarkdown opts (Quoted SingleQuote lst) = do contents <- inlineListToMarkdown opts lst - return $ char '‘' <> contents <> char '’' + return $ "‘" <> contents <> "’" inlineToMarkdown opts (Quoted DoubleQuote lst) = do contents <- inlineListToMarkdown opts lst - return $ char '“' <> contents <> char '”' -inlineToMarkdown _ EmDash = return $ char '\8212' -inlineToMarkdown _ EnDash = return $ char '\8211' -inlineToMarkdown _ Apostrophe = return $ char '\8217' -inlineToMarkdown _ Ellipses = return $ char '\8230' -inlineToMarkdown _ (Code str) = + return $ "“" <> contents <> "”" +inlineToMarkdown _ EmDash = return "\8212" +inlineToMarkdown _ EnDash = return "\8211" +inlineToMarkdown _ Apostrophe = return "\8217" +inlineToMarkdown _ Ellipses = return "\8230" +inlineToMarkdown opts (Code attr str) = let tickGroups = filter (\s -> '`' `elem` s) $ group str longest = if null tickGroups then 0 else maximum $ map length tickGroups marker = replicate (longest + 1) '`' - spacer = if (longest == 0) then "" else " " in - return $ text (marker ++ spacer ++ str ++ spacer ++ marker) + spacer = if (longest == 0) then "" else " " + attrs = if writerStrictMarkdown opts || attr == nullAttr + then empty + else attrsToMarkdown attr + in return $ text (marker ++ spacer ++ str ++ spacer ++ marker) <> attrs inlineToMarkdown _ (Str str) = do st <- get if stPlain st then return $ text str else return $ text $ escapeString str -inlineToMarkdown _ (Math InlineMath str) = return $ char '$' <> text str <> char '$' -inlineToMarkdown _ (Math DisplayMath str) = return $ text "$$" <> text str <> text "$$" -inlineToMarkdown _ (TeX str) = return $ text str -inlineToMarkdown _ (HtmlInline str) = return $ text str -inlineToMarkdown _ (LineBreak) = return $ text " \n" -inlineToMarkdown _ Space = return $ char ' ' -inlineToMarkdown opts (Cite _ cits) = inlineListToMarkdown opts cits +inlineToMarkdown _ (Math InlineMath str) = + return $ "$" <> text str <> "$" +inlineToMarkdown _ (Math DisplayMath str) = + return $ "$$" <> text str <> "$$" +inlineToMarkdown _ (RawInline f str) + | f == "html" || f == "latex" || f == "tex" || f == "markdown" = + return $ text str +inlineToMarkdown _ (RawInline _ _) = return empty +inlineToMarkdown opts (LineBreak) = return $ + if writerStrictMarkdown opts + 
then " " <> cr + else "\\" <> cr +inlineToMarkdown _ Space = return space +inlineToMarkdown opts (Cite (c:cs) lst) + | writerCiteMethod opts == Citeproc = inlineListToMarkdown opts lst + | citationMode c == AuthorInText = do + suffs <- inlineListToMarkdown opts $ citationSuffix c + rest <- mapM convertOne cs + let inbr = suffs <+> joincits rest + br = if isEmpty inbr then empty else char '[' <> inbr <> char ']' + return $ text ("@" ++ citationId c) <+> br + | otherwise = do + cits <- mapM convertOne (c:cs) + return $ text "[" <> joincits cits <> text "]" + where + joincits = hcat . intersperse (text "; ") . filter (not . isEmpty) + convertOne Citation { citationId = k + , citationPrefix = pinlines + , citationSuffix = sinlines + , citationMode = m } + = do + pdoc <- inlineListToMarkdown opts pinlines + sdoc <- inlineListToMarkdown opts sinlines + let k' = text (modekey m ++ "@" ++ k) + r = case sinlines of + Str (y:_):_ | y `elem` ",;]@" -> k' <> sdoc + _ -> k' <+> sdoc + return $ pdoc <+> r + modekey SuppressAuthor = "-" + modekey _ = "" +inlineToMarkdown _ (Cite _ _) = return $ text "" inlineToMarkdown opts (Link txt (src', tit)) = do linktext <- inlineListToMarkdown opts txt - let linktitle = if null tit then empty else text $ " \"" ++ tit ++ "\"" + let linktitle = if null tit + then empty + else text $ " \"" ++ tit ++ "\"" let src = unescapeURI src' let srcSuffix = if isPrefixOf "mailto:" src then drop 7 src else src let useRefLinks = writerReferenceLinks opts - let useAuto = null tit && txt == [Code srcSuffix] + let useAuto = case (tit,txt) of + ("", [Code _ s]) | s == srcSuffix -> True + _ -> False ref <- if useRefLinks then getReference txt (src, tit) else return [] reftext <- inlineListToMarkdown opts ref return $ if useAuto - then char '<' <> text srcSuffix <> char '>' + then "<" <> text srcSuffix <> ">" else if useRefLinks - then let first = char '[' <> linktext <> char ']' + then let first = "[" <> linktext <> "]" second = if txt == ref - then text "[]" - else char '[' <> reftext <> char ']' + then "[]" + else "[" <> reftext <> "]" in first <> second - else char '[' <> linktext <> char ']' <> - char '(' <> text src <> linktitle <> char ')' + else "[" <> linktext <> "](" <> + text src <> linktitle <> ")" inlineToMarkdown opts (Image alternate (source, tit)) = do let txt = if (null alternate) || (alternate == [Str ""]) || (alternate == [Str source]) -- to prevent autolinks then [Str "image"] else alternate - linkPart <- inlineToMarkdown opts (Link txt (unescapeURI source, tit)) - return $ char '!' <> linkPart + linkPart <- inlineToMarkdown opts (Link txt (source, tit)) + return $ "!" 
<> linkPart inlineToMarkdown _ (Note contents) = do modify (\st -> st{ stNotes = contents : stNotes st }) st <- get let ref = show $ (length $ stNotes st) - return $ text "[^" <> text ref <> char ']' + return $ "[^" <> text ref <> "]" diff --git a/src/Text/Pandoc/Writers/MediaWiki.hs b/src/Text/Pandoc/Writers/MediaWiki.hs index e8cb33caf..a7c7fc482 100644 --- a/src/Text/Pandoc/Writers/MediaWiki.hs +++ b/src/Text/Pandoc/Writers/MediaWiki.hs @@ -96,7 +96,9 @@ blockToMediaWiki opts (Para inlines) = do then "<p>" ++ contents ++ "</p>" else contents ++ if null listLevel then "\n" else "" -blockToMediaWiki _ (RawHtml str) = return str +blockToMediaWiki _ (RawBlock "mediawiki" str) = return str +blockToMediaWiki _ (RawBlock "html" str) = return str +blockToMediaWiki _ (RawBlock _ _) = return "" blockToMediaWiki _ HorizontalRule = return "\n-----\n" @@ -360,7 +362,7 @@ inlineToMediaWiki _ Apostrophe = return "’" inlineToMediaWiki _ Ellipses = return "…" -inlineToMediaWiki _ (Code str) = +inlineToMediaWiki _ (Code _ str) = return $ "<tt>" ++ (escapeString str) ++ "</tt>" inlineToMediaWiki _ (Str str) = return $ escapeString str @@ -368,9 +370,9 @@ inlineToMediaWiki _ (Str str) = return $ escapeString str inlineToMediaWiki _ (Math _ str) = return $ "<math>" ++ str ++ "</math>" -- note: str should NOT be escaped -inlineToMediaWiki _ (TeX _) = return "" - -inlineToMediaWiki _ (HtmlInline str) = return str +inlineToMediaWiki _ (RawInline "mediawiki" str) = return str +inlineToMediaWiki _ (RawInline "html" str) = return str +inlineToMediaWiki _ (RawInline _ _) = return "" inlineToMediaWiki _ (LineBreak) = return "<br />\n" @@ -378,12 +380,12 @@ inlineToMediaWiki _ Space = return " " inlineToMediaWiki opts (Link txt (src, _)) = do label <- inlineListToMediaWiki opts txt - if txt == [Code src] -- autolink - then return src - else if isURI src - then return $ "[" ++ src ++ " " ++ label ++ "]" - else return $ "[[" ++ src' ++ "|" ++ label ++ "]]" - where src' = case src of + case txt of + [Code _ s] | s == src -> return src + _ -> if isURI src + then return $ "[" ++ src ++ " " ++ label ++ "]" + else return $ "[[" ++ src' ++ "|" ++ label ++ "]]" + where src' = case src of '/':xs -> xs -- with leading / it's a _ -> src -- link to a help page inlineToMediaWiki opts (Image alt (source, tit)) = do diff --git a/src/Text/Pandoc/Writers/Native.hs b/src/Text/Pandoc/Writers/Native.hs index 3b5ea7481..d2b56cd17 100644 --- a/src/Text/Pandoc/Writers/Native.hs +++ b/src/Text/Pandoc/Writers/Native.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} {- Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> @@ -25,62 +26,53 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Stability : alpha Portability : portable -Utility functions and definitions used by the various Pandoc modules. +Conversion of a 'Pandoc' document to a string representation. + +Note: If @writerStandalone@ is @False@, only the document body +is represented; otherwise, the full 'Pandoc' document, including the +metadata. -} module Text.Pandoc.Writers.Native ( writeNative ) where -import Text.Pandoc.Shared ( WriterOptions ) -import Data.List ( intercalate ) +import Text.Pandoc.Shared ( WriterOptions(..) ) +import Data.List ( intersperse ) import Text.Pandoc.Definition +import Text.Pandoc.Pretty --- | Indent string as a block. 
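The Cite case in the Markdown writer above serialises citations in the new bracketed syntax (with a "-@" key when the author is suppressed), provided the writer is not doing citeproc processing itself. Roughly, for a single normal citation:

-- id "doe99", prefix "see", suffix "p. 30", mode NormalCitation
--   ==> [see @doe99 p. 30]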
-indentBy :: Int -- ^ Number of spaces to indent the block - -> Int -- ^ Number of spaces (rel to block) to indent first line - -> String -- ^ Contents of block to indent - -> String -indentBy _ _ [] = "" -indentBy num first str = - let (firstLine:restLines) = lines str - firstLineIndent = num + first - in (replicate firstLineIndent ' ') ++ firstLine ++ "\n" ++ - (intercalate "\n" $ map ((replicate num ' ') ++ ) restLines) - --- | Prettyprint list of Pandoc blocks elements. -prettyBlockList :: Int -- ^ Number of spaces to indent list of blocks - -> [Block] -- ^ List of blocks - -> String -prettyBlockList indent [] = indentBy indent 0 "[]" -prettyBlockList indent blocks = indentBy indent (-2) $ "[ " ++ - (intercalate "\n, " (map prettyBlock blocks)) ++ " ]" +prettyList :: [Doc] -> Doc +prettyList ds = + "[" <> (cat $ intersperse (cr <> ",") $ map (nest 1) ds) <> "]" -- | Prettyprint Pandoc block element. -prettyBlock :: Block -> String -prettyBlock (BlockQuote blocks) = "BlockQuote\n " ++ - (prettyBlockList 2 blocks) +prettyBlock :: Block -> Doc +prettyBlock (BlockQuote blocks) = + "BlockQuote" $$ prettyList (map prettyBlock blocks) prettyBlock (OrderedList attribs blockLists) = - "OrderedList " ++ show attribs ++ "\n" ++ indentBy 2 0 ("[ " ++ - (intercalate ", " $ map (\blocks -> prettyBlockList 2 blocks) - blockLists)) ++ " ]" -prettyBlock (BulletList blockLists) = "BulletList\n" ++ - indentBy 2 0 ("[ " ++ (intercalate ", " - (map (\blocks -> prettyBlockList 2 blocks) blockLists))) ++ " ]" -prettyBlock (DefinitionList items) = "DefinitionList\n" ++ - indentBy 2 0 ("[ " ++ (intercalate "\n, " - (map (\(term, defs) -> "(" ++ show term ++ ",\n" ++ - indentBy 3 0 ("[ " ++ (intercalate ", " - (map (\blocks -> prettyBlockList 2 blocks) defs)) ++ "]") ++ - ")") items))) ++ " ]" + "OrderedList" <> space <> text (show attribs) $$ + (prettyList $ map (prettyList . map prettyBlock) blockLists) +prettyBlock (BulletList blockLists) = + "BulletList" $$ + (prettyList $ map (prettyList . map prettyBlock) blockLists) +prettyBlock (DefinitionList items) = "DefinitionList" $$ + (prettyList $ map deflistitem items) + where deflistitem (term, defs) = "(" <> text (show term) <> "," <> cr <> + nest 1 (prettyList $ map (prettyList . map prettyBlock) defs) <> ")" prettyBlock (Table caption aligns widths header rows) = - "Table " ++ show caption ++ " " ++ show aligns ++ " " ++ - show widths ++ "\n" ++ prettyRow header ++ " [\n" ++ - (intercalate ",\n" (map prettyRow rows)) ++ " ]" - where prettyRow cols = indentBy 2 0 ("[ " ++ (intercalate ", " - (map (\blocks -> prettyBlockList 2 blocks) - cols))) ++ " ]" -prettyBlock block = show block + "Table " <> text (show caption) <> " " <> text (show aligns) <> " " <> + text (show widths) $$ + prettyRow header $$ + prettyList (map prettyRow rows) + where prettyRow cols = prettyList (map (prettyList . map prettyBlock) cols) +prettyBlock block = text $ show block -- | Prettyprint Pandoc document. 
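prettyList above lines list elements up under the opening bracket, with the comma leading each continuation line, which is what gives native output its familiar shape. A rough sketch of the layout:

-- render Nothing (prettyList [text "Para [Str \"a\"]", text "Para [Str \"b\"]"])
--   ==>
-- [Para [Str "a"]
-- ,Para [Str "b"]]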
writeNative :: WriterOptions -> Pandoc -> String -writeNative _ (Pandoc meta blocks) = "Pandoc " ++ "(" ++ show meta ++ - ")\n" ++ (prettyBlockList 0 blocks) ++ "\n" - +writeNative opts (Pandoc meta blocks) = + let colwidth = if writerWrapText opts + then Just $ writerColumns opts + else Nothing + withHead = if writerStandalone opts + then \bs -> text ("Pandoc " ++ "(" ++ show meta ++ ")") $$ + bs $$ cr + else id + in render colwidth $ withHead $ prettyList $ map prettyBlock blocks diff --git a/src/Text/Pandoc/Writers/ODT.hs b/src/Text/Pandoc/Writers/ODT.hs index 5aa0fd310..cf1be8755 100644 --- a/src/Text/Pandoc/Writers/ODT.hs +++ b/src/Text/Pandoc/Writers/ODT.hs @@ -37,6 +37,7 @@ import System.Time import Paths_pandoc ( getDataFileName ) import Text.Pandoc.Shared ( WriterOptions(..) ) import Text.Pandoc.Definition +import Text.Pandoc.Generic import Text.Pandoc.Writers.OpenDocument ( writeOpenDocument ) import System.Directory import Control.Monad (liftM) @@ -63,8 +64,8 @@ writeODT mbRefOdt opts doc = do -- handle pictures picEntriesRef <- newIORef ([] :: [Entry]) let sourceDir = writerSourceDirectory opts - doc' <- processWithM (transformPic sourceDir picEntriesRef) doc - let newContents = writeOpenDocument opts doc' + doc' <- bottomUpM (transformPic sourceDir picEntriesRef) doc + let newContents = writeOpenDocument opts{writerWrapText = False} doc' (TOD epochtime _) <- getClockTime let contentEntry = toEntry "content.xml" epochtime $ fromString newContents picEntries <- readIORef picEntriesRef diff --git a/src/Text/Pandoc/Writers/OpenDocument.hs b/src/Text/Pandoc/Writers/OpenDocument.hs index 4e3979c07..b9444aac7 100644 --- a/src/Text/Pandoc/Writers/OpenDocument.hs +++ b/src/Text/Pandoc/Writers/OpenDocument.hs @@ -35,7 +35,7 @@ import Text.Pandoc.Shared import Text.Pandoc.XML import Text.Pandoc.Templates (renderTemplate) import Text.Pandoc.Readers.TeXMath -import Text.PrettyPrint.HughesPJ hiding ( Str ) +import Text.Pandoc.Pretty import Text.Printf ( printf ) import Control.Applicative ( (<$>) ) import Control.Arrow ( (***), (>>>) ) @@ -112,7 +112,9 @@ setInDefinitionList :: Bool -> State WriterState () setInDefinitionList b = modify $ \s -> s { stInDefinition = b } inParagraphTags :: Doc -> Doc -inParagraphTags = inTags False "text:p" [("text:style-name", "Text_20_body")] +inParagraphTags d | isEmpty d = empty +inParagraphTags d = + inTags False "text:p" [("text:style-name", "Text_20_body")] d inParagraphTagsWithStyle :: String -> Doc -> Doc inParagraphTagsWithStyle sty = inTags False "text:p" [("text:style-name", sty)] @@ -167,7 +169,11 @@ writeOpenDocument opts (Pandoc (Meta title authors date) blocks) = date'' <- inlinesToOpenDocument opts date doc'' <- blocksToOpenDocument opts blocks return (doc'', title'', authors'', date'') - body' = render doc + colwidth = if writerWrapText opts + then Just $ writerColumns opts + else Nothing + render' = render colwidth + body' = render' doc styles = stTableStyles s ++ stParaStyles s ++ stTextStyles s listStyle (n,l) = inTags True "text:list-style" [("style:name", "L" ++ show n)] (vcat l) @@ -176,10 +182,10 @@ writeOpenDocument opts (Pandoc (Meta title authors date) blocks) = reverse $ styles ++ listStyles context = writerVariables opts ++ [ ("body", body') - , ("automatic-styles", render automaticStyles) - , ("title", render title') - , ("date", render date') ] ++ - [ ("author", render a) | a <- authors' ] + , ("automatic-styles", render' automaticStyles) + , ("title", render' title') + , ("date", render' date') ] ++ + [ ("author", render' a) | 
a <- authors' ] in if writerStandalone opts then renderTemplate context $ writerTemplate opts else body' @@ -273,7 +279,7 @@ blockToOpenDocument o bs | Header i b <- bs = inHeaderTags i <$> inlinesToOpenDocument o b | BlockQuote b <- bs = mkBlockQuote b | CodeBlock _ s <- bs = preformatted s - | RawHtml _ <- bs = return empty + | RawBlock _ _ <- bs = return empty | DefinitionList b <- bs = defList b | BulletList b <- bs = bulletListToOpenDocument o b | OrderedList a b <- bs = orderedList a b @@ -286,7 +292,7 @@ blockToOpenDocument o bs r <- vcat <$> mapM (deflistItemToOpenDocument o) b setInDefinitionList False return r - preformatted s = vcat <$> mapM (inPreformattedTags . escapeStringForXML) (lines s) + preformatted s = (flush . vcat) <$> mapM (inPreformattedTags . escapeStringForXML) (lines s) mkBlockQuote b = do increaseIndent i <- paraStyle "Quotations" [] inBlockQuote o i (map plainToPara b) @@ -346,7 +352,7 @@ inlineToOpenDocument o ils | EmDash <- ils = inTextStyle $ text "—" | EnDash <- ils = inTextStyle $ text "–" | Apostrophe <- ils = inTextStyle $ text "’" - | Space <- ils = inTextStyle $ char ' ' + | Space <- ils = inTextStyle space | LineBreak <- ils = return $ selfClosingTag "text:line-break" [] | Str s <- ils = inTextStyle $ handleSpaces $ escapeStringForXML s | Emph l <- ils = withTextStyle Italic $ inlinesToOpenDocument o l @@ -356,11 +362,12 @@ inlineToOpenDocument o ils | Subscript l <- ils = withTextStyle Sub $ inlinesToOpenDocument o l | SmallCaps l <- ils = withTextStyle SmallC $ inlinesToOpenDocument o l | Quoted t l <- ils = inQuotes t <$> inlinesToOpenDocument o l - | Code s <- ils = preformatted s + | Code _ s <- ils = preformatted s | Math _ s <- ils = inlinesToOpenDocument o (readTeXMath s) | Cite _ l <- ils = inlinesToOpenDocument o l - | TeX s <- ils = preformatted s - | HtmlInline s <- ils = preformatted s + | RawInline "opendocument" s <- ils = preformatted s + | RawInline "html" s <- ils = preformatted s -- for backwards compat. + | RawInline _ _ <- ils = return empty | Link l (s,t) <- ils = mkLink s t <$> inlinesToOpenDocument o l | Image _ (s,_) <- ils = return $ mkImg s | Note l <- ils = mkNote l diff --git a/src/Text/Pandoc/Writers/Org.hs b/src/Text/Pandoc/Writers/Org.hs new file mode 100644 index 000000000..f7f314428 --- /dev/null +++ b/src/Text/Pandoc/Writers/Org.hs @@ -0,0 +1,284 @@ +{-# LANGUAGE OverloadedStrings #-} +{- +Copyright (C) 2006-2010 Puneeth Chaganti <punchagan@gmail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Org + Copyright : Copyright (C) 2010 Puneeth Chaganti + License : GNU GPL, version 2 or above + + Maintainer : Puneeth Chaganti <punchagan@gmail.com> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to Emacs Org-Mode. 
+ +Org-Mode: <http://orgmode.org> +-} +module Text.Pandoc.Writers.Org ( writeOrg) where +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.Pandoc.Pretty +import Text.Pandoc.Templates (renderTemplate) +import Data.List ( intersect, intersperse, transpose ) +import Control.Monad.State +import Control.Applicative ( (<$>) ) + +data WriterState = + WriterState { stNotes :: [[Block]] + , stLinks :: Bool + , stImages :: Bool + , stHasMath :: Bool + , stOptions :: WriterOptions + } + +-- | Convert Pandoc to Org. +writeOrg :: WriterOptions -> Pandoc -> String +writeOrg opts document = + let st = WriterState { stNotes = [], stLinks = False, + stImages = False, stHasMath = False, + stOptions = opts } + in evalState (pandocToOrg document) st + +-- | Return Org representation of document. +pandocToOrg :: Pandoc -> State WriterState String +pandocToOrg (Pandoc (Meta tit auth dat) blocks) = do + opts <- liftM stOptions get + title <- titleToOrg tit + authors <- mapM inlineListToOrg auth + date <- inlineListToOrg dat + body <- blockListToOrg blocks + notes <- liftM (reverse . stNotes) get >>= notesToOrg + -- note that the notes may contain refs, so we do them first + hasMath <- liftM stHasMath get + let colwidth = if writerWrapText opts + then Just $ writerColumns opts + else Nothing + let main = render colwidth $ foldl ($+$) empty $ [body, notes] + let context = writerVariables opts ++ + [ ("body", main) + , ("title", render Nothing title) + , ("date", render Nothing date) ] ++ + [ ("math", "yes") | hasMath ] ++ + [ ("author", render Nothing a) | a <- authors ] + if writerStandalone opts + then return $ renderTemplate context $ writerTemplate opts + else return main + +-- | Return Org representation of notes. +notesToOrg :: [[Block]] -> State WriterState Doc +notesToOrg notes = + mapM (\(num, note) -> noteToOrg num note) (zip [1..] notes) >>= + return . vsep + +-- | Return Org representation of a note. +noteToOrg :: Int -> [Block] -> State WriterState Doc +noteToOrg num note = do + contents <- blockListToOrg note + let marker = "[" ++ show num ++ "] " + return $ hang (length marker) (text marker) contents + +-- | Escape special characters for Org. +escapeString :: String -> String +escapeString = escapeStringUsing (backslashEscapes "^_") + +titleToOrg :: [Inline] -> State WriterState Doc +titleToOrg [] = return empty +titleToOrg lst = do + contents <- inlineListToOrg lst + return $ "#+TITLE: " <> contents + +-- | Convert Pandoc block element to Org. 
+blockToOrg :: Block -- ^ Block element + -> State WriterState Doc +blockToOrg Null = return empty +blockToOrg (Plain inlines) = inlineListToOrg inlines +blockToOrg (Para [Image txt (src,tit)]) = do + capt <- inlineListToOrg txt + img <- inlineToOrg (Image txt (src,tit)) + return $ "#+CAPTION: " <> capt <> blankline <> img +blockToOrg (Para inlines) = do + contents <- inlineListToOrg inlines + return $ contents <> blankline +blockToOrg (RawBlock "html" str) = + return $ blankline $$ "#+BEGIN_HTML" $$ + nest 2 (text str) $$ "#+END_HTML" $$ blankline +blockToOrg (RawBlock f str) | f == "org" || f == "latex" || f == "tex" = + return $ text str +blockToOrg (RawBlock _ _) = return empty +blockToOrg HorizontalRule = return $ blankline $$ "--------------" $$ blankline +blockToOrg (Header level inlines) = do + contents <- inlineListToOrg inlines + let headerStr = text $ if level > 999 then " " else replicate level '*' + return $ headerStr <> " " <> contents <> blankline +blockToOrg (CodeBlock (_,classes,_) str) = do + opts <- stOptions <$> get + let tabstop = writerTabStop opts + let at = classes `intersect` ["asymptote", "C", "clojure", "css", "ditaa", + "dot", "emacs-lisp", "gnuplot", "haskell", "js", "latex", + "ledger", "lisp", "matlab", "mscgen", "ocaml", "octave", + "oz", "perl", "plantuml", "python", "R", "ruby", "sass", + "scheme", "screen", "sh", "sql", "sqlite"] + let (beg, end) = if null at + then ("#+BEGIN_EXAMPLE", "#+END_EXAMPLE") + else ("#+BEGIN_SRC" ++ head at, "#+END_SRC") + return $ text beg $$ nest tabstop (text str) $$ text end $$ blankline +blockToOrg (BlockQuote blocks) = do + contents <- blockListToOrg blocks + return $ blankline $$ "#+BEGIN_QUOTE" $$ + nest 2 contents $$ "#+END_QUOTE" $$ blankline +blockToOrg (Table caption' _ _ headers rows) = do + caption'' <- inlineListToOrg caption' + let caption = if null caption' + then empty + else ("#+CAPTION: " <> caption'') + headers' <- mapM blockListToOrg headers + rawRows <- mapM (mapM blockListToOrg) rows + let numChars = maximum . map offset + -- FIXME: width is not being used. + let widthsInChars = + map ((+2) . numChars) $ transpose (headers' : rawRows) + -- FIXME: Org doesn't allow blocks with height more than 1. + let hpipeBlocks blocks = hcat [beg, middle, end] + where h = maximum (map height blocks) + sep' = lblock 3 $ vcat (map text $ replicate h " | ") + beg = lblock 2 $ vcat (map text $ replicate h "| ") + end = lblock 2 $ vcat (map text $ replicate h " |") + middle = hcat $ intersperse sep' blocks + let makeRow = hpipeBlocks . zipWith lblock widthsInChars + let head' = makeRow headers' + rows' <- mapM (\row -> do cols <- mapM blockListToOrg row |
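Taken together, the hunks above add an Org-mode writer and rebuild the Native writer on top of Text.Pandoc.Pretty. As a rough, illustrative sketch (not part of the patch), a minimal driver for the two writers could look like the following; it assumes only the public Text.Pandoc API (readMarkdown, writeOrg, writeNative and the default option records), and the sample input string is made up for illustration.

import Text.Pandoc

-- Minimal sketch: run the new Org writer and the rewritten Native writer
-- over a small Markdown document.  Input and option values are illustrative.
main :: IO ()
main = do
  let doc = readMarkdown defaultParserState "Hello *world*\n\n    a code block\n"
  -- Org output; wrapping follows writerWrapText/writerColumns as in the
  -- hunks above, and the literal code block should come out as an
  -- #+BEGIN_EXAMPLE block (see blockToOrg above).
  putStrLn $ writeOrg defaultWriterOptions{ writerWrapText = True
                                          , writerColumns  = 72 } doc
  -- Native output; writerStandalone controls whether the metadata header
  -- ("Pandoc (Meta {...})") is printed before the block list.
  putStrLn $ writeNative defaultWriterOptions{ writerStandalone = True } doc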