summary | refs | log | tree | commit | diff
path: root/lib/localcharset.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/localcharset.c')
-rw-r--r-- lib/localcharset.c 273
1 files changed, 218 insertions, 55 deletions
diff --git a/lib/localcharset.c b/lib/localcharset.c
index 58c5718..721c8a9 100644
--- a/lib/localcharset.c
+++ b/lib/localcharset.c
@@ -1,6 +1,6 @@
/* Determine a canonical name for the current locale's character encoding.
- Copyright (C) 2000-2006, 2008-2018 Free Software Foundation, Inc.
+ Copyright (C) 2000-2006, 2008-2020 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -58,6 +58,9 @@
#elif defined WINDOWS_NATIVE
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
+ /* For the use of setlocale() below, the Gnulib override in setlocale.c is
+ not needed; see the platform lists in setlocale_null.m4. */
+# undef setlocale
#endif
#if defined OS2
# define INCL_DOS
@@ -150,7 +153,8 @@ static const struct table_entry alias_table[] =
{ "ISO8859-2", "ISO-8859-2" },
{ "ISO8859-4", "ISO-8859-4" },
{ "ISO8859-5", "ISO-8859-5" },
- { "ISO8859-7", "ISO-8859-7" }
+ { "ISO8859-7", "ISO-8859-7" },
+ { "US-ASCII", "ASCII" }
# define alias_table_defined
# endif
# if defined __APPLE__ && defined __MACH__ /* Mac OS X */
@@ -377,27 +381,164 @@ static const struct table_entry alias_table[] =
/* The list of encodings is taken from "List of OS/2 Codepages"
by Alex Taylor:
<http://altsan.org/os2/toolkits/uls/index.html#codepages>.
- See also "IBM Globalization - Code page identifiers":
- <https://www-01.ibm.com/software/globalization/cp/cp_cpgid.html>. */
- { "CP1089", "ISO-8859-6" },
- { "CP1208", "UTF-8" },
- { "CP1381", "GB2312" },
- { "CP1386", "GBK" },
- { "CP3372", "EUC-JP" },
- { "CP813", "ISO-8859-7" },
- { "CP819", "ISO-8859-1" },
- { "CP878", "KOI8-R" },
- { "CP912", "ISO-8859-2" },
- { "CP913", "ISO-8859-3" },
- { "CP914", "ISO-8859-4" },
- { "CP915", "ISO-8859-5" },
- { "CP916", "ISO-8859-8" },
- { "CP920", "ISO-8859-9" },
- { "CP921", "ISO-8859-13" },
- { "CP923", "ISO-8859-15" },
- { "CP954", "EUC-JP" },
- { "CP964", "EUC-TW" },
- { "CP970", "EUC-KR" }
+ See also "__convcp() of kLIBC":
+ <https://github.com/bitwiseworks/libc/blob/master/src/emx/src/lib/locale/__convcp.c>. */
+ { "CP1004", "CP1252" },
+ /*{ "CP1041", "CP943" },*/
+ /*{ "CP1088", "CP949" },*/
+ { "CP1089", "ISO-8859-6" },
+ /*{ "CP1114", "CP950" },*/
+ /*{ "CP1115", "GB2312" },*/
+ { "CP1208", "UTF-8" },
+ /*{ "CP1380", "GB2312" },*/
+ { "CP1381", "GB2312" },
+ { "CP1383", "GB2312" },
+ { "CP1386", "GBK" },
+ /*{ "CP301", "CP943" },*/
+ { "CP3372", "EUC-JP" },
+ { "CP4946", "CP850" },
+ /*{ "CP5048", "JIS_X0208-1990" },*/
+ /*{ "CP5049", "JIS_X0212-1990" },*/
+ /*{ "CP5067", "KS_C_5601-1987" },*/
+ { "CP813", "ISO-8859-7" },
+ { "CP819", "ISO-8859-1" },
+ { "CP878", "KOI8-R" },
+ /*{ "CP897", "CP943" },*/
+ { "CP912", "ISO-8859-2" },
+ { "CP913", "ISO-8859-3" },
+ { "CP914", "ISO-8859-4" },
+ { "CP915", "ISO-8859-5" },
+ { "CP916", "ISO-8859-8" },
+ { "CP920", "ISO-8859-9" },
+ { "CP921", "ISO-8859-13" },
+ { "CP923", "ISO-8859-15" },
+ /*{ "CP941", "CP943" },*/
+ /*{ "CP947", "CP950" },*/
+ /*{ "CP951", "CP949" },*/
+ /*{ "CP952", "JIS_X0208-1990" },*/
+ /*{ "CP953", "JIS_X0212-1990" },*/
+ { "CP954", "EUC-JP" },
+ { "CP964", "EUC-TW" },
+ { "CP970", "EUC-KR" },
+ /*{ "CP971", "KS_C_5601-1987" },*/
+ { "IBM-1004", "CP1252" },
+ /*{ "IBM-1006", "?" },*/
+ /*{ "IBM-1008", "?" },*/
+ /*{ "IBM-1041", "CP943" },*/
+ /*{ "IBM-1051", "?" },*/
+ /*{ "IBM-1088", "CP949" },*/
+ { "IBM-1089", "ISO-8859-6" },
+ /*{ "IBM-1098", "?" },*/
+ /*{ "IBM-1114", "CP950" },*/
+ /*{ "IBM-1115", "GB2312" },*/
+ /*{ "IBM-1116", "?" },*/
+ /*{ "IBM-1117", "?" },*/
+ /*{ "IBM-1118", "?" },*/
+ /*{ "IBM-1119", "?" },*/
+ { "IBM-1124", "CP1124" },
+ { "IBM-1125", "CP1125" },
+ { "IBM-1131", "CP1131" },
+ { "IBM-1208", "UTF-8" },
+ { "IBM-1250", "CP1250" },
+ { "IBM-1251", "CP1251" },
+ { "IBM-1252", "CP1252" },
+ { "IBM-1253", "CP1253" },
+ { "IBM-1254", "CP1254" },
+ { "IBM-1255", "CP1255" },
+ { "IBM-1256", "CP1256" },
+ { "IBM-1257", "CP1257" },
+ /*{ "IBM-1275", "?" },*/
+ /*{ "IBM-1276", "?" },*/
+ /*{ "IBM-1277", "?" },*/
+ /*{ "IBM-1280", "?" },*/
+ /*{ "IBM-1281", "?" },*/
+ /*{ "IBM-1282", "?" },*/
+ /*{ "IBM-1283", "?" },*/
+ /*{ "IBM-1380", "GB2312" },*/
+ { "IBM-1381", "GB2312" },
+ { "IBM-1383", "GB2312" },
+ { "IBM-1386", "GBK" },
+ /*{ "IBM-301", "CP943" },*/
+ { "IBM-3372", "EUC-JP" },
+ { "IBM-367", "ASCII" },
+ { "IBM-437", "CP437" },
+ { "IBM-4946", "CP850" },
+ /*{ "IBM-5048", "JIS_X0208-1990" },*/
+ /*{ "IBM-5049", "JIS_X0212-1990" },*/
+ /*{ "IBM-5067", "KS_C_5601-1987" },*/
+ { "IBM-813", "ISO-8859-7" },
+ { "IBM-819", "ISO-8859-1" },
+ { "IBM-850", "CP850" },
+ /*{ "IBM-851", "?" },*/
+ { "IBM-852", "CP852" },
+ { "IBM-855", "CP855" },
+ { "IBM-856", "CP856" },
+ { "IBM-857", "CP857" },
+ /*{ "IBM-859", "?" },*/
+ { "IBM-860", "CP860" },
+ { "IBM-861", "CP861" },
+ { "IBM-862", "CP862" },
+ { "IBM-863", "CP863" },
+ { "IBM-864", "CP864" },
+ { "IBM-865", "CP865" },
+ { "IBM-866", "CP866" },
+ /*{ "IBM-868", "?" },*/
+ { "IBM-869", "CP869" },
+ { "IBM-874", "CP874" },
+ { "IBM-878", "KOI8-R" },
+ /*{ "IBM-895", "?" },*/
+ /*{ "IBM-897", "CP943" },*/
+ /*{ "IBM-907", "?" },*/
+ /*{ "IBM-909", "?" },*/
+ { "IBM-912", "ISO-8859-2" },
+ { "IBM-913", "ISO-8859-3" },
+ { "IBM-914", "ISO-8859-4" },
+ { "IBM-915", "ISO-8859-5" },
+ { "IBM-916", "ISO-8859-8" },
+ { "IBM-920", "ISO-8859-9" },
+ { "IBM-921", "ISO-8859-13" },
+ { "IBM-922", "CP922" },
+ { "IBM-923", "ISO-8859-15" },
+ { "IBM-932", "CP932" },
+ /*{ "IBM-941", "CP943" },*/
+ /*{ "IBM-942", "?" },*/
+ { "IBM-943", "CP943" },
+ /*{ "IBM-947", "CP950" },*/
+ { "IBM-949", "CP949" },
+ { "IBM-950", "CP950" },
+ /*{ "IBM-951", "CP949" },*/
+ /*{ "IBM-952", "JIS_X0208-1990" },*/
+ /*{ "IBM-953", "JIS_X0212-1990" },*/
+ { "IBM-954", "EUC-JP" },
+ /*{ "IBM-955", "?" },*/
+ { "IBM-964", "EUC-TW" },
+ { "IBM-970", "EUC-KR" },
+ /*{ "IBM-971", "KS_C_5601-1987" },*/
+ { "IBM-eucCN", "GB2312" },
+ { "IBM-eucJP", "EUC-JP" },
+ { "IBM-eucKR", "EUC-KR" },
+ { "IBM-eucTW", "EUC-TW" },
+ { "IBM33722", "EUC-JP" },
+ { "ISO8859-1", "ISO-8859-1" },
+ { "ISO8859-2", "ISO-8859-2" },
+ { "ISO8859-3", "ISO-8859-3" },
+ { "ISO8859-4", "ISO-8859-4" },
+ { "ISO8859-5", "ISO-8859-5" },
+ { "ISO8859-6", "ISO-8859-6" },
+ { "ISO8859-7", "ISO-8859-7" },
+ { "ISO8859-8", "ISO-8859-8" },
+ { "ISO8859-9", "ISO-8859-9" },
+ /*{ "JISX0201-1976", "JISX0201-1976" },*/
+ /*{ "JISX0208-1978", "?" },*/
+ /*{ "JISX0208-1983", "JIS_X0208-1983" },*/
+ /*{ "JISX0208-1990", "JIS_X0208-1990" },*/
+ /*{ "JISX0212-1990", "JIS_X0212-1990" },*/
+ /*{ "KSC5601-1987", "KS_C_5601-1987" },*/
+ { "SJIS-1", "CP943" },
+ { "SJIS-2", "CP943" },
+ { "eucJP", "EUC-JP" },
+ { "eucKR", "EUC-KR" },
+ { "eucTW-1993", "EUC-TW" }
# define alias_table_defined
# endif
# if defined VMS /* OpenVMS */
@@ -675,8 +816,11 @@ static const struct table_entry locale_table[] =
/* Determine the current locale's character encoding, and canonicalize it
- into one of the canonical names listed in localcharset.h.
- The result must not be freed; it is statically allocated.
+ into one of the canonical names listed below.
+ The result must not be freed; it is statically allocated. The result
+ becomes invalid when setlocale() is used to change the global locale, or
+ when the value of one of the environment variables LC_ALL, LC_CTYPE, LANG
+ is changed; threads in multithreaded programs should not do this.
If the canonical name cannot be determined, the result is a non-canonical
name. */
@@ -688,6 +832,13 @@ locale_charset (void)
{
const char *codeset;
+ /* This function must be multithread-safe. To achieve this without using
+ thread-local storage, we use a simple strcpy or memcpy to fill this static
+ buffer. Filling it through, for example, strcpy + strcat would not be
+ guaranteed to leave the buffer's contents intact if another thread is
+ currently accessing it. If necessary, the contents is first assembled in
+ a stack-allocated buffer. */
+
#if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
# if HAVE_LANGINFO_CODESET
@@ -702,7 +853,7 @@ locale_charset (void)
if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
{
const char *locale;
- static char buf[2 + 10 + 1];
+ static char resultbuf[2 + 10 + 1];
locale = getenv ("LC_ALL");
if (locale == NULL || locale[0] == '\0')
@@ -726,11 +877,12 @@ locale_charset (void)
modifier = strchr (dot, '@');
if (modifier == NULL)
return dot;
- if (modifier - dot < sizeof (buf))
+ if (modifier - dot < sizeof (resultbuf))
{
- memcpy (buf, dot, modifier - dot);
- buf [modifier - dot] = '\0';
- return buf;
+ /* This way of filling resultbuf is multithread-safe. */
+ memcpy (resultbuf, dot, modifier - dot);
+ resultbuf [modifier - dot] = '\0';
+ return resultbuf;
}
}
}
@@ -746,8 +898,13 @@ locale_charset (void)
converting to GetConsoleOutputCP(). This leads to correct results,
except when SetConsoleOutputCP has been called and a raster font is
in use. */
- sprintf (buf, "CP%u", GetACP ());
- codeset = buf;
+ {
+ char buf[2 + 10 + 1];
+
+ sprintf (buf, "CP%u", GetACP ());
+ strcpy (resultbuf, buf);
+ codeset = resultbuf;
+ }
}
# endif
@@ -757,42 +914,44 @@ locale_charset (void)
# elif defined WINDOWS_NATIVE
- static char buf[2 + 10 + 1];
+ char buf[2 + 10 + 1];
+ static char resultbuf[2 + 10 + 1];
/* The Windows API has a function returning the locale's codepage as
a number, but the value doesn't change according to what the
'setlocale' call specified. So we use it as a last resort, in
case the string returned by 'setlocale' doesn't specify the
codepage. */
- char *current_locale = setlocale (LC_ALL, NULL);
- char *pdot;
+ char *current_locale = setlocale (LC_CTYPE, NULL);
+ char *pdot = strrchr (current_locale, '.');
- /* If they set different locales for different categories,
- 'setlocale' will return a semi-colon separated list of locale
- values. To make sure we use the correct one, we choose LC_CTYPE. */
- if (strchr (current_locale, ';'))
- current_locale = setlocale (LC_CTYPE, NULL);
-
- pdot = strrchr (current_locale, '.');
if (pdot && 2 + strlen (pdot + 1) + 1 <= sizeof (buf))
sprintf (buf, "CP%s", pdot + 1);
else
{
/* The Windows API has a function returning the locale's codepage as a
- number: GetACP().
- When the output goes to a console window, it needs to be provided in
- GetOEMCP() encoding if the console is using a raster font, or in
- GetConsoleOutputCP() encoding if it is using a TrueType font.
- But in GUI programs and for output sent to files and pipes, GetACP()
- encoding is the best bet. */
+ number: GetACP().
+ When the output goes to a console window, it needs to be provided in
+ GetOEMCP() encoding if the console is using a raster font, or in
+ GetConsoleOutputCP() encoding if it is using a TrueType font.
+ But in GUI programs and for output sent to files and pipes, GetACP()
+ encoding is the best bet. */
sprintf (buf, "CP%u", GetACP ());
}
- codeset = buf;
+ /* For a locale name such as "French_France.65001", in Windows 10,
+ setlocale now returns "French_France.utf8" instead. */
+ if (strcmp (buf + 2, "65001") == 0 || strcmp (buf + 2, "utf8") == 0)
+ codeset = "UTF-8";
+ else
+ {
+ strcpy (resultbuf, buf);
+ codeset = resultbuf;
+ }
# elif defined OS2
const char *locale;
- static char buf[2 + 10 + 1];
+ static char resultbuf[2 + 10 + 1];
ULONG cp[3];
ULONG cplen;
@@ -821,11 +980,12 @@ locale_charset (void)
modifier = strchr (dot, '@');
if (modifier == NULL)
return dot;
- if (modifier - dot < sizeof (buf))
+ if (modifier - dot < sizeof (resultbuf))
{
- memcpy (buf, dot, modifier - dot);
- buf [modifier - dot] = '\0';
- return buf;
+ /* This way of filling resultbuf is multithread-safe. */
+ memcpy (resultbuf, dot, modifier - dot);
+ resultbuf [modifier - dot] = '\0';
+ return resultbuf;
}
}
@@ -841,8 +1001,11 @@ locale_charset (void)
codeset = "";
else
{
+ char buf[2 + 10 + 1];
+
sprintf (buf, "CP%u", cp[0]);
- codeset = buf;
+ strcpy (resultbuf, buf);
+ codeset = resultbuf;
}
}