From 622df7034c96d1d0ba10b1ad654840ac93baf7f7 Mon Sep 17 00:00:00 2001
From: mb21 <mb21@users.noreply.github.com>
Date: Thu, 20 Aug 2015 17:18:14 +0200
Subject: `lang` variable is now in BCP47 format

strings are converted for LaTeX and ConTeXt output, closes #1614
---
 README                             |  20 ++++-
 src/Text/Pandoc/Writers/ConTeXt.hs |  40 +++++++++-
 src/Text/Pandoc/Writers/LaTeX.hs   | 148 +++++++++++++++++++++++++++++++++++--
 3 files changed, 192 insertions(+), 16 deletions(-)

diff --git a/README b/README
index 5a75c5a0b..da9bfea03 100644
--- a/README
+++ b/README
@@ -945,7 +945,19 @@ as `title`, `author`, and `date`) as well as the following:
 :   body of document
 
 `lang`
-:   language code for HTML or LaTeX documents
+:   The `lang` variable should be set by the user to a language
+    code according to [BCP 47] (e.g. `en` or `en-GB`).
+    For some output formats, pandoc will convert it to an appropriate
+    format stored in the additional variables `babel-lang`,
+    `polyglossia-lang`, `polyglossia-variant` (LaTeX)
+    and `context-lang` (ConTeXt).
+
+`otherlangs`
+:   Should be set to a list of other languages used in the document
+    in the YAML metadata, according to [BCP 47]. For example:
+    `otherlangs: [en-GB, fr]`.
+    Currently only used by XeTeX through the generated
+    `polyglossia-otherlangs` variable.
 
 `slidy-url`
 :   base URL for Slidy documents (defaults to
@@ -3264,8 +3276,8 @@ The following fields are recognized:
   ~ A string value in `YYYY-MM-DD` format.  (Only the year is necessary.)
     Pandoc will attempt to convert other common date formats.
 
-`language`
-  ~ A string value in [RFC5646] format.  Pandoc will default to the local
+`lang` (or legacy: `language`)
+  ~ A string value in [BCP 47] format.  Pandoc will default to the local
     language if nothing is specified.
 
 `subject`
@@ -3549,7 +3561,7 @@ Xavier Olive.
 [FictionBook2]: http://www.fictionbook.org/index.php/Eng:XML_Schema_Fictionbook_2.1
 [lua]: http://www.lua.org
 [marc relators]: http://www.loc.gov/marc/relators/relaterm.html
-[RFC5646]: http://tools.ietf.org/html/rfc5646
+[BCP 47]: https://tools.ietf.org/html/bcp47
 [InDesign ICML]: https://www.adobe.com/content/dam/Adobe/en/devnet/indesign/cs55-docs/IDML/idml-specification.pdf
 [txt2tags]: http://txt2tags.org/
 [EPUB]: http://idpf.org/epub
diff --git a/src/Text/Pandoc/Writers/ConTeXt.hs b/src/Text/Pandoc/Writers/ConTeXt.hs
index 1f8bbcdba..5e2d7cfee 100644
--- a/src/Text/Pandoc/Writers/ConTeXt.hs
+++ b/src/Text/Pandoc/Writers/ConTeXt.hs
@@ -80,12 +80,12 @@ pandocToConTeXt options (Pandoc meta blocks) = do
                         "subsubsubsection","subsubsubsubsection"])
                 $ defField "body" main
                 $ defField "number-sections" (writerNumberSections options)
-                $ defField "mainlang" (maybe ""
-                    (reverse . takeWhile (/=',') . reverse)
-                    (lookup "lang" $ writerVariables options))
                 $ metadata
+  let context' =  defField "context-lang" (maybe "" (fromBcp47 . splitBy (=='-')) $
+                    getField "lang" context)
+                  context
   return $ if writerStandalone options
-              then renderTemplate' (writerTemplate options) context
+              then renderTemplate' (writerTemplate options) context'
               else main
 
 -- escape things as needed for ConTeXt
@@ -362,3 +362,35 @@ sectionHeader (ident,classes,_) hdrLevel lst = do
                        then char '\\' <> chapter <> braces contents
                        else contents <> blankline
 
+-- | Map the subtags of a BCP 47 code (https://tools.ietf.org/html/bcp47#section-2.1)
+-- to ConTeXt's own language code (http://wiki.contextgarden.net/Language_Codes).
+fromBcp47 :: [String] -> String
+fromBcp47 subtags = case subtags of
+    []              -> ""
+    ("ar":"SY":_)   -> "ar-sy"
+    ("ar":"IQ":_)   -> "ar-iq"
+    ("ar":"JO":_)   -> "ar-jo"
+    ("ar":"LB":_)   -> "ar-lb"
+    ("ar":"DZ":_)   -> "ar-dz"
+    ("ar":"MA":_)   -> "ar-ma"
+    ("de":"1901":_) -> "deo"
+    ("de":"DE":_)   -> "de-de"
+    ("de":"AT":_)   -> "de-at"
+    ("de":"CH":_)   -> "de-ch"
+    ("el":"poly":_) -> "agr"
+    ("en":"US":_)   -> "en-us"
+    ("en":"GB":_)   -> "en-gb"
+    ("grc":_)       -> "agr"
+    (primary:_)     -> contextCode primary
+  where
+    contextCode code = case code of
+      "cz" -> "cs"
+      "el" -> "gr"
+      "eu" -> "ba"
+      "he" -> "il"
+      "jp" -> "ja"
+      "uk" -> "ua"
+      "vi" -> "vn"
+      "zh" -> "cn"
+      _    -> code
+
diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs
index 506edd182..6a30efbf5 100644
--- a/src/Text/Pandoc/Writers/LaTeX.hs
+++ b/src/Text/Pandoc/Writers/LaTeX.hs
@@ -144,11 +144,6 @@ pandocToLaTeX options (Pandoc meta blocks) = do
   st <- get
   titleMeta <- stringToLaTeX TextString $ stringify $ docTitle meta
   authorsMeta <- mapM (stringToLaTeX TextString . stringify) $ docAuthors meta
-  let (mainlang, otherlang) =
-       case (reverse . splitBy (==',') . filter (/=' ')) `fmap`
-            getField "lang" metadata of
-              Just (m:os) -> (m, reverse os)
-              _           -> ("", [])
   let context  =  defField "toc" (writerTableOfContents options) $
                   defField "toc-depth" (show (writerTOCDepth options -
                                               if stBook st
@@ -173,8 +168,6 @@ pandocToLaTeX options (Pandoc meta blocks) = do
                   defField "euro" (stUsesEuro st) $
                   defField "listings" (writerListings options || stLHS st) $
                   defField "beamer" (writerBeamer options) $
-                  defField "mainlang" mainlang $
-                  defField "otherlang" otherlang $
                   (if stHighlighting st
                       then defField "highlighting-macros" (styleToLaTeX
                                 $ writerHighlightStyle options )
@@ -186,8 +179,18 @@ pandocToLaTeX options (Pandoc meta blocks) = do
                                      defField "biblatex" True
                          _        -> id) $
                   metadata
+  let lang = maybe [] (splitBy (=='-')) $ getField "lang" context
+      (polyLang, polyVar) = toPolyglossia lang
+  let context' =
+          defField "babel-lang" (toBabel lang)
+        $ defField "polyglossia-lang" polyLang
+        $ defField "polyglossia-variant" polyVar
+        $ defField "polyglossia-otherlangs"
+            (maybe [] (map $ fst . toPolyglossia . splitBy (=='-')) $
+            getField "otherlangs" context)
+        $ context
   return $ if writerStandalone options
-              then renderTemplate' template context
+              then renderTemplate' template context'
               else main
 
 -- | Convert Elements to LaTeX
@@ -980,3 +983,132 @@ citationsToBiblatex _ = return empty
 getListingsLanguage :: [String] -> Maybe String
 getListingsLanguage [] = Nothing
 getListingsLanguage (x:xs) = toListingsLanguage x <|> getListingsLanguage xs
+
+-- | Turn the subtags of a BCP 47 language code into Polyglossia's (language, variant) pair.
+-- http://mirrors.concertpass.com/tex-archive/macros/latex/contrib/polyglossia/polyglossia.pdf
+toPolyglossia :: [String] -> (String, String)
+toPolyglossia subtags = case subtags of
+    ("de":"AT":_)   -> ("german", "austrian")
+    ("de":"CH":_)   -> ("german", "swiss")
+    ("de":_)        -> ("german", "")
+    ("dsb":_)       -> ("lsorbian", "")
+    ("el":"poly":_) -> ("greek", "poly")
+    ("en":"AU":_)   -> ("english", "australian")
+    ("en":"CA":_)   -> ("english", "canadian")
+    ("en":"GB":_)   -> ("english", "british")
+    ("en":"NZ":_)   -> ("english", "newzealand")
+    ("en":"UK":_)   -> ("english", "british")
+    ("en":"US":_)   -> ("english", "american")
+    ("grc":_)       -> ("greek", "ancient")
+    ("hsb":_)       -> ("usorbian", "")
+    ("sl":_)        -> ("slovenian", "")
+    _               -> (commonFromBcp47 subtags, "")
+
+-- | Turn the subtags of a BCP 47 language code into a Babel language name.
+-- http://mirrors.concertpass.com/tex-archive/macros/latex/required/babel/base/babel.pdf
+-- Note that the PDF unfortunately does not contain a complete list of supported languages.
+toBabel :: [String] -> String
+toBabel subtags = case subtags of
+    ("de":"1901":_)      -> "german"
+    ("de":"AT":"1901":_) -> "austrian"
+    ("de":"AT":_)        -> "naustrian"
+    ("de":_)             -> "ngerman"
+    ("dsb":_)            -> "lowersorbian"
+    ("el":"poly":_)      -> "polutonikogreek"
+    ("en":"AU":_)        -> "australian"
+    ("en":"CA":_)        -> "canadian"
+    ("en":"GB":_)        -> "british"
+    ("en":"NZ":_)        -> "newzealand"
+    ("en":"UK":_)        -> "british"
+    ("en":"US":_)        -> "american"
+    ("fr":"CA":_)        -> "canadien"
+    ("fra":"aca":_)      -> "acadian"
+    ("grc":_)            -> "polutonikogreek"
+    ("hsb":_)            -> "uppersorbian"
+    ("sl":_)             -> "slovene"
+    _                    -> commonFromBcp47 subtags
+
+-- | Takes the subtags of a BCP 47 language code and returns the language
+-- name shared by Babel and Polyglossia, or "" when the language is unknown.
+-- https://tools.ietf.org/html/bcp47#section-2.1
+commonFromBcp47 :: [String] -> String
+commonFromBcp47 [] = ""
+commonFromBcp47 ("pt":"BR":_) = "brazilian"
+commonFromBcp47 (l:_) = fromIso l  -- only the primary language subtag matters here
+  where
+    fromIso "af"  = "afrikaans"
+    fromIso "am"  = "amharic"
+    fromIso "ar"  = "arabic"
+    fromIso "ast" = "asturian"
+    fromIso "bg"  = "bulgarian"
+    fromIso "bn"  = "bengali"
+    fromIso "bo"  = "tibetan"
+    fromIso "br"  = "breton"
+    fromIso "ca"  = "catalan"
+    fromIso "cy"  = "welsh"
+    fromIso "cs"  = "czech"     -- language subtag is "cs"; "cz" is the country code
+    fromIso "cop" = "coptic"
+    fromIso "da"  = "danish"
+    fromIso "dv"  = "divehi"
+    fromIso "el"  = "greek"
+    fromIso "en"  = "english"
+    fromIso "eo"  = "esperanto"
+    fromIso "es"  = "spanish"
+    fromIso "et"  = "estonian"
+    fromIso "eu"  = "basque"
+    fromIso "fa"  = "farsi"
+    fromIso "fi"  = "finnish"
+    fromIso "fr"  = "french"
+    fromIso "fur" = "friulan"
+    fromIso "ga"  = "irish"
+    fromIso "gd"  = "scottish"
+    fromIso "gl"  = "galician"
+    fromIso "he"  = "hebrew"
+    fromIso "hi"  = "hindi"
+    fromIso "hr"  = "croatian"
+    fromIso "hy"  = "armenian"
+    fromIso "hu"  = "magyar"
+    fromIso "ia"  = "interlingua"
+    fromIso "id"  = "indonesian"
+    fromIso "ie"  = "interlingua"
+    fromIso "is"  = "icelandic"
+    fromIso "it"  = "italian"
+    fromIso "ja"  = "japanese"  -- language subtag is "ja"; "jp" is the country code
+    fromIso "km"  = "khmer"
+    fromIso "kn"  = "kannada"
+    fromIso "ko"  = "korean"
+    fromIso "la"  = "latin"
+    fromIso "lo"  = "lao"
+    fromIso "lt"  = "lithuanian"
+    fromIso "lv"  = "latvian"
+    fromIso "ml"  = "malayalam"
+    fromIso "mn"  = "mongolian"
+    fromIso "mr"  = "marathi"
+    fromIso "nb"  = "norsk"
+    fromIso "nl"  = "dutch"
+    fromIso "nn"  = "nynorsk"
+    fromIso "no"  = "norsk"
+    fromIso "nqo" = "nko"
+    fromIso "oc"  = "occitan"
+    fromIso "pl"  = "polish"
+    fromIso "pms" = "piedmontese"
+    fromIso "pt"  = "portuguese"
+    fromIso "rm"  = "romansh"
+    fromIso "ro"  = "romanian"
+    fromIso "ru"  = "russian"
+    fromIso "sa"  = "sanskrit"
+    fromIso "se"  = "samin"
+    fromIso "sk"  = "slovak"
+    fromIso "sq"  = "albanian"
+    fromIso "sr"  = "serbian"
+    fromIso "sv"  = "swedish"
+    fromIso "syr" = "syriac"
+    fromIso "ta"  = "tamil"
+    fromIso "te"  = "telugu"
+    fromIso "th"  = "thai"
+    fromIso "tk"  = "turkmen"
+    fromIso "tr"  = "turkish"
+    fromIso "uk"  = "ukrainian"
+    fromIso "ur"  = "urdu"
+    fromIso "vi"  = "vietnamese"
+    fromIso _     = ""
-- 
cgit v1.2.3