From 622df7034c96d1d0ba10b1ad654840ac93baf7f7 Mon Sep 17 00:00:00 2001 From: mb21 Date: Thu, 20 Aug 2015 17:18:14 +0200 Subject: `lang` variable is now in BCP47 format strings are converted for LaTeX and ConTeXt output, closes #1614 --- README | 20 ++++- src/Text/Pandoc/Writers/ConTeXt.hs | 40 +++++++++- src/Text/Pandoc/Writers/LaTeX.hs | 148 +++++++++++++++++++++++++++++++++++-- 3 files changed, 192 insertions(+), 16 deletions(-) diff --git a/README b/README index 5a75c5a0b..da9bfea03 100644 --- a/README +++ b/README @@ -945,7 +945,19 @@ as `title`, `author`, and `date`) as well as the following: : body of document `lang` -: language code for HTML or LaTeX documents +: The `lang` variable should be set by the user to a language + code according to [BCP 47] (e.g. `en` or `en-GB`). + For some output formats, pandoc will convert it to an approriate + format stored in the additional variables `babel-lang`, + `polyglossia-lang`, `polyglossia-variant` (LaTeX) + and `context-lang` (ConTeXt). + +`otherlangs` +: Should be set to a list of other languages used in the document + in the YAML metadata, according to [BCP 47]. For example: + `otherlangs: [en-GB, fr]`. + Currently only used by XeTeX through the generated + `polyglossia-otherlangs` variable. `slidy-url` : base URL for Slidy documents (defaults to @@ -3264,8 +3276,8 @@ The following fields are recognized: ~ A string value in `YYYY-MM-DD` format. (Only the year is necessary.) Pandoc will attempt to convert other common date formats. -`language` - ~ A string value in [RFC5646] format. Pandoc will default to the local +`lang` (or legacy: `language`) + ~ A string value in [BCP 47] format. Pandoc will default to the local language if nothing is specified. `subject` @@ -3549,7 +3561,7 @@ Xavier Olive. [FictionBook2]: http://www.fictionbook.org/index.php/Eng:XML_Schema_Fictionbook_2.1 [lua]: http://www.lua.org [marc relators]: http://www.loc.gov/marc/relators/relaterm.html -[RFC5646]: http://tools.ietf.org/html/rfc5646 +[BCP 47]: https://tools.ietf.org/html/bcp47 [InDesign ICML]: https://www.adobe.com/content/dam/Adobe/en/devnet/indesign/cs55-docs/IDML/idml-specification.pdf [txt2tags]: http://txt2tags.org/ [EPUB]: http://idpf.org/epub diff --git a/src/Text/Pandoc/Writers/ConTeXt.hs b/src/Text/Pandoc/Writers/ConTeXt.hs index 1f8bbcdba..5e2d7cfee 100644 --- a/src/Text/Pandoc/Writers/ConTeXt.hs +++ b/src/Text/Pandoc/Writers/ConTeXt.hs @@ -80,12 +80,12 @@ pandocToConTeXt options (Pandoc meta blocks) = do "subsubsubsection","subsubsubsubsection"]) $ defField "body" main $ defField "number-sections" (writerNumberSections options) - $ defField "mainlang" (maybe "" - (reverse . takeWhile (/=',') . reverse) - (lookup "lang" $ writerVariables options)) $ metadata + let context' = defField "context-lang" (maybe "" (fromBcp47 . splitBy (=='-')) $ + getField "lang" context) + context return $ if writerStandalone options - then renderTemplate' (writerTemplate options) context + then renderTemplate' (writerTemplate options) context' else main -- escape things as needed for ConTeXt @@ -362,3 +362,35 @@ sectionHeader (ident,classes,_) hdrLevel lst = do then char '\\' <> chapter <> braces contents else contents <> blankline +-- Takes a list of the constituents of a BCP 47 language code +-- and irons out ConTeXt's exceptions +-- https://tools.ietf.org/html/bcp47#section-2.1 +-- http://wiki.contextgarden.net/Language_Codes +fromBcp47 :: [String] -> String +fromBcp47 [] = "" +fromBcp47 ("ar":"SY":_) = "ar-sy" +fromBcp47 ("ar":"IQ":_) = "ar-iq" +fromBcp47 ("ar":"JO":_) = "ar-jo" +fromBcp47 ("ar":"LB":_) = "ar-lb" +fromBcp47 ("ar":"DZ":_) = "ar-dz" +fromBcp47 ("ar":"MA":_) = "ar-ma" +fromBcp47 ("de":"1901":_) = "deo" +fromBcp47 ("de":"DE":_) = "de-de" +fromBcp47 ("de":"AT":_) = "de-at" +fromBcp47 ("de":"CH":_) = "de-ch" +fromBcp47 ("el":"poly":_) = "agr" +fromBcp47 ("en":"US":_) = "en-us" +fromBcp47 ("en":"GB":_) = "en-gb" +fromBcp47 ("grc":_) = "agr" +fromBcp47 x = fromIso $ head x + where + fromIso "cz" = "cs" + fromIso "el" = "gr" + fromIso "eu" = "ba" + fromIso "he" = "il" + fromIso "jp" = "ja" + fromIso "uk" = "ua" + fromIso "vi" = "vn" + fromIso "zh" = "cn" + fromIso l = l + diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs index 506edd182..6a30efbf5 100644 --- a/src/Text/Pandoc/Writers/LaTeX.hs +++ b/src/Text/Pandoc/Writers/LaTeX.hs @@ -144,11 +144,6 @@ pandocToLaTeX options (Pandoc meta blocks) = do st <- get titleMeta <- stringToLaTeX TextString $ stringify $ docTitle meta authorsMeta <- mapM (stringToLaTeX TextString . stringify) $ docAuthors meta - let (mainlang, otherlang) = - case (reverse . splitBy (==',') . filter (/=' ')) `fmap` - getField "lang" metadata of - Just (m:os) -> (m, reverse os) - _ -> ("", []) let context = defField "toc" (writerTableOfContents options) $ defField "toc-depth" (show (writerTOCDepth options - if stBook st @@ -173,8 +168,6 @@ pandocToLaTeX options (Pandoc meta blocks) = do defField "euro" (stUsesEuro st) $ defField "listings" (writerListings options || stLHS st) $ defField "beamer" (writerBeamer options) $ - defField "mainlang" mainlang $ - defField "otherlang" otherlang $ (if stHighlighting st then defField "highlighting-macros" (styleToLaTeX $ writerHighlightStyle options ) @@ -186,8 +179,18 @@ pandocToLaTeX options (Pandoc meta blocks) = do defField "biblatex" True _ -> id) $ metadata + let lang = maybe [] (splitBy (=='-')) $ getField "lang" context + (polyLang, polyVar) = toPolyglossia lang + let context' = + defField "babel-lang" (toBabel lang) + $ defField "polyglossia-lang" polyLang + $ defField "polyglossia-variant" polyVar + $ defField "polyglossia-otherlangs" + (maybe [] (map $ fst . toPolyglossia . splitBy (=='-')) $ + getField "otherlangs" context) + $ context return $ if writerStandalone options - then renderTemplate' template context + then renderTemplate' template context' else main -- | Convert Elements to LaTeX @@ -980,3 +983,132 @@ citationsToBiblatex _ = return empty getListingsLanguage :: [String] -> Maybe String getListingsLanguage [] = Nothing getListingsLanguage (x:xs) = toListingsLanguage x <|> getListingsLanguage xs + +-- Takes a list of the constituents of a BCP 47 language code and +-- converts it to a Polyglossia (language, variant) tuple +-- http://mirrors.concertpass.com/tex-archive/macros/latex/contrib/polyglossia/polyglossia.pdf +toPolyglossia :: [String] -> (String, String) +toPolyglossia ("de":"AT":_) = ("german", "austrian") +toPolyglossia ("de":"CH":_) = ("german", "swiss") +toPolyglossia ("de":_) = ("german", "") +toPolyglossia ("dsb":_) = ("lsorbian", "") +toPolyglossia ("el":"poly":_) = ("greek", "poly") +toPolyglossia ("en":"AU":_) = ("english", "australian") +toPolyglossia ("en":"CA":_) = ("english", "canadian") +toPolyglossia ("en":"GB":_) = ("english", "british") +toPolyglossia ("en":"NZ":_) = ("english", "newzealand") +toPolyglossia ("en":"UK":_) = ("english", "british") +toPolyglossia ("en":"US":_) = ("english", "american") +toPolyglossia ("grc":_) = ("greek", "ancient") +toPolyglossia ("hsb":_) = ("usorbian", "") +toPolyglossia ("sl":_) = ("slovenian", "") +toPolyglossia x = (commonFromBcp47 x, "") + +-- Takes a list of the constituents of a BCP 47 language code and +-- converts it to a Babel language string. +-- http://mirrors.concertpass.com/tex-archive/macros/latex/required/babel/base/babel.pdf +-- Note that the PDF unfortunately does not contain a complete list of supported languages. +toBabel :: [String] -> String +toBabel ("de":"1901":_) = "german" +toBabel ("de":"AT":"1901":_) = "austrian" +toBabel ("de":"AT":_) = "naustrian" +toBabel ("de":_) = "ngerman" +toBabel ("dsb":_) = "lowersorbian" +toBabel ("el":"poly":_) = "polutonikogreek" +toBabel ("en":"AU":_) = "australian" +toBabel ("en":"CA":_) = "canadian" +toBabel ("en":"GB":_) = "british" +toBabel ("en":"NZ":_) = "newzealand" +toBabel ("en":"UK":_) = "british" +toBabel ("en":"US":_) = "american" +toBabel ("fr":"CA":_) = "canadien" +toBabel ("fra":"aca":_) = "acadian" +toBabel ("grc":_) = "polutonikogreek" +toBabel ("hsb":_) = "uppersorbian" +toBabel ("sl":_) = "slovene" +toBabel x = commonFromBcp47 x + +-- Takes a list of the constituents of a BCP 47 language code +-- and converts it to a string shared by Babel and Polyglossia. +-- https://tools.ietf.org/html/bcp47#section-2.1 +commonFromBcp47 :: [String] -> String +commonFromBcp47 [] = "" +commonFromBcp47 ("pt":"BR":_) = "brazilian" +commonFromBcp47 x = fromIso $ head x + where + fromIso "af" = "afrikaans" + fromIso "am" = "amharic" + fromIso "ar" = "arabic" + fromIso "ast" = "asturian" + fromIso "bg" = "bulgarian" + fromIso "bn" = "bengali" + fromIso "bo" = "tibetan" + fromIso "br" = "breton" + fromIso "ca" = "catalan" + fromIso "cy" = "welsh" + fromIso "cz" = "czech" + fromIso "cop" = "coptic" + fromIso "da" = "danish" + fromIso "dv" = "divehi" + fromIso "el" = "greek" + fromIso "en" = "english" + fromIso "eo" = "esperanto" + fromIso "es" = "spanish" + fromIso "et" = "estonian" + fromIso "eu" = "basque" + fromIso "fa" = "farsi" + fromIso "fi" = "finnish" + fromIso "fr" = "french" + fromIso "fur" = "friulan" + fromIso "ga" = "irish" + fromIso "gd" = "scottish" + fromIso "gl" = "galician" + fromIso "he" = "hebrew" + fromIso "hi" = "hindi" + fromIso "hr" = "croatian" + fromIso "hy" = "armenian" + fromIso "hu" = "magyar" + fromIso "ia" = "interlingua" + fromIso "id" = "indonesian" + fromIso "ie" = "interlingua" + fromIso "is" = "icelandic" + fromIso "it" = "italian" + fromIso "jp" = "japanese" + fromIso "km" = "khmer" + fromIso "kn" = "kannada" + fromIso "ko" = "korean" + fromIso "la" = "latin" + fromIso "lo" = "lao" + fromIso "lt" = "lithuanian" + fromIso "lv" = "latvian" + fromIso "ml" = "malayalam" + fromIso "mn" = "mongolian" + fromIso "mr" = "marathi" + fromIso "nb" = "norsk" + fromIso "nl" = "dutch" + fromIso "nn" = "nynorsk" + fromIso "no" = "norsk" + fromIso "nqo" = "nko" + fromIso "oc" = "occitan" + fromIso "pl" = "polish" + fromIso "pms" = "piedmontese" + fromIso "pt" = "portuguese" + fromIso "rm" = "romansh" + fromIso "ro" = "romanian" + fromIso "ru" = "russian" + fromIso "sa" = "sanskrit" + fromIso "se" = "samin" + fromIso "sk" = "slovak" + fromIso "sq" = "albanian" + fromIso "sr" = "serbian" + fromIso "sv" = "swedish" + fromIso "syr" = "syriac" + fromIso "ta" = "tamil" + fromIso "te" = "telugu" + fromIso "th" = "thai" + fromIso "tk" = "turkmen" + fromIso "tr" = "turkish" + fromIso "uk" = "ukrainian" + fromIso "ur" = "urdu" + fromIso "vi" = "vietnamese" + fromIso _ = "" -- cgit v1.2.3