summaryrefslogtreecommitdiff
path: root/testsuite/test-mbrtowc.c
diff options
context:
space:
mode:
Diffstat (limited to 'testsuite/test-mbrtowc.c')
-rw-r--r--testsuite/test-mbrtowc.c172
1 files changed, 172 insertions, 0 deletions
diff --git a/testsuite/test-mbrtowc.c b/testsuite/test-mbrtowc.c
new file mode 100644
index 0000000..04c20fc
--- /dev/null
+++ b/testsuite/test-mbrtowc.c
@@ -0,0 +1,172 @@
+/* Auxiliary program to test mbrtowc(3) behaviour.
+ Copyright 2016 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+/* Test the operating-system's native mbrtowc(3) function,
+ by feeding it multibyte seqeunces one byte at a time,
+ and reporting the result.
+
+ The program prints the following values after each mbrtowc invocation,
+ separated by commas:
+
+ -2 the octet is contributes to a valid yet incomplete multibyte sequence
+ in the current locale.
+
+ -1 the octet causes an encoding error.
+
+ 0 the octet represents a NUL byte
+
+ 1 the octet is a valid single-byte character, OR
+ completes a valid multibyte sequence.
+
+ Because the program invokes mbrtowc(3) byte-by-byte, the reported
+ result should never be larger than 1.
+
+ Example of typical output with UTF-8 encoding
+ ---------------------------------------------
+
+ The unicode character 'N-ARY SUMMATION' (U+2211), encoded in UTF-8 as:
+ hex: 0xE2 0x88 0x91
+ oct: 342 210 211
+
+ Decoding the valid sequence byte-by-byte gives:
+ $ printf '\342\210\221' | LC_ALL=en_US.UTF-8 test-mbrtowc
+ -2,-2,1
+
+ '\210' is not a valid leading byte in UTF-8,
+ thus the first byte gives -1, and the 'X' is treated
+ as a valid single-byte character:
+
+ $ printf '\210X' | LC_ALL=en_US.UTF-8 test-mbrtowc
+ -1,1
+
+ '\342' is a valid yet incomplete multibyte sequence.
+ Passing it to mbrtowc results in value '-2'.
+ The following value 'X' gives an encoding error '-1'
+ (as 'X' is not a valid trailing byte in a multibyte UTF-8 sequence):
+
+ $ printf '\342X' | LC_ALL=en_US.UTF-8 test-mbrtowc
+ -2,-1
+
+
+ Detecting implementation bugs in mbrtowc
+ ----------------------------------------
+
+ UTF-8 implementation is correct on most operating systems.
+ Other multibyte locales might present more difficulties.
+ An example is the Japanese SHIFT-JIS locale under Mac OS X.
+ NOTE: The locale is 'ja_JP.SJIS' under Mac OS X, 'ja_JP.shiftjis'
+ under Ubuntu. 'ja_JP.sjis' was also found on some systems.
+
+ Using unicode character 'KATAKANA LETTER ZE' (U+30BC)
+ UTF-8: hex: 0xE3 0x82 0xBC
+ Shift-jis hex: 0x83 0x5B
+ oct: 203 133
+
+ The following is a valid multibyte sequence in SHIFT-JIS,
+ the first byte should result in '-2' (valid yet incomplete),
+ and the second byte should result in '1' (a valid multibyte sequence
+ completed):
+
+ $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc
+ -2,1
+
+ The follwing is an INVALID multibyte sequence in SHIFT-JIS
+ (The byte ':' is not valid as a second octet).
+ Buggy implementations will accept this as a valid multibyte sequence:
+
+ # NOTE: this result indicates a buggy mbrtowc
+ $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
+ -2,1
+
+ A correct implementations should report '-1' for the second byte (i.e.
+ an encoding error):
+
+ $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
+ -2,-1
+
+
+ Expected results with correct implementations
+ ---------------------------------------------
+
+ In GNU Sed some tests purposely use invalid multibyte sequences
+ to test sed's behaviour. A buggy implemetation of mbrtowc
+ would result in false-alarm failures.
+
+ The following are expected results in correct implementations:
+ (locale names are from Mac OS X):
+
+ $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc
+ -2,1
+ $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
+ -2,-1
+ $ printf '\262C' | LC_ALL=ja_JP.eucJP test-mbrtowc
+ -2,-1
+*/
+
+#include <config.h>
+#include <locale.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <wchar.h>
+
+#include "closeout.h"
+#include "error.h"
+#include "progname.h"
+
+/* stub replacement for non-standard err(3) */
+static int
+die (const char *msg)
+{
+ error (0, 0, "%s: error: %s\n", program_name, msg);
+ exit (EXIT_FAILURE);
+}
+
+int
+main (int argc, char **argv)
+{
+ int c;
+ int first = 1;
+
+ set_program_name (argv[0]);
+ if (!setlocale (LC_ALL, ""))
+ die ("failed to set locale");
+
+ while ((c = getchar ()) != EOF)
+ {
+ wchar_t wc;
+ char ch = (unsigned char) c;
+ int i = (int) mbrtowc (&wc, &ch, 1, NULL);
+
+ if (!first)
+ putchar (',');
+ first = 0;
+
+ printf ("%d", i);
+ }
+
+ if (first)
+ die ("empty input");
+
+ putchar ('\n');
+
+ if (ferror (stdin))
+ die ("read error");
+ close_stdout ();
+
+ exit (EXIT_SUCCESS);
+}