diff options
Diffstat (limited to 'testsuite/test-mbrtowc.c')
-rw-r--r-- | testsuite/test-mbrtowc.c | 172 |
1 files changed, 172 insertions, 0 deletions
diff --git a/testsuite/test-mbrtowc.c b/testsuite/test-mbrtowc.c new file mode 100644 index 0000000..04c20fc --- /dev/null +++ b/testsuite/test-mbrtowc.c @@ -0,0 +1,172 @@ +/* Auxiliary program to test mbrtowc(3) behaviour. + Copyright 2016 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* Test the operating-system's native mbrtowc(3) function, + by feeding it multibyte seqeunces one byte at a time, + and reporting the result. + + The program prints the following values after each mbrtowc invocation, + separated by commas: + + -2 the octet is contributes to a valid yet incomplete multibyte sequence + in the current locale. + + -1 the octet causes an encoding error. + + 0 the octet represents a NUL byte + + 1 the octet is a valid single-byte character, OR + completes a valid multibyte sequence. + + Because the program invokes mbrtowc(3) byte-by-byte, the reported + result should never be larger than 1. + + Example of typical output with UTF-8 encoding + --------------------------------------------- + + The unicode character 'N-ARY SUMMATION' (U+2211), encoded in UTF-8 as: + hex: 0xE2 0x88 0x91 + oct: 342 210 211 + + Decoding the valid sequence byte-by-byte gives: + $ printf '\342\210\221' | LC_ALL=en_US.UTF-8 test-mbrtowc + -2,-2,1 + + '\210' is not a valid leading byte in UTF-8, + thus the first byte gives -1, and the 'X' is treated + as a valid single-byte character: + + $ printf '\210X' | LC_ALL=en_US.UTF-8 test-mbrtowc + -1,1 + + '\342' is a valid yet incomplete multibyte sequence. + Passing it to mbrtowc results in value '-2'. + The following value 'X' gives an encoding error '-1' + (as 'X' is not a valid trailing byte in a multibyte UTF-8 sequence): + + $ printf '\342X' | LC_ALL=en_US.UTF-8 test-mbrtowc + -2,-1 + + + Detecting implementation bugs in mbrtowc + ---------------------------------------- + + UTF-8 implementation is correct on most operating systems. + Other multibyte locales might present more difficulties. + An example is the Japanese SHIFT-JIS locale under Mac OS X. + NOTE: The locale is 'ja_JP.SJIS' under Mac OS X, 'ja_JP.shiftjis' + under Ubuntu. 'ja_JP.sjis' was also found on some systems. + + Using unicode character 'KATAKANA LETTER ZE' (U+30BC) + UTF-8: hex: 0xE3 0x82 0xBC + Shift-jis hex: 0x83 0x5B + oct: 203 133 + + The following is a valid multibyte sequence in SHIFT-JIS, + the first byte should result in '-2' (valid yet incomplete), + and the second byte should result in '1' (a valid multibyte sequence + completed): + + $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc + -2,1 + + The follwing is an INVALID multibyte sequence in SHIFT-JIS + (The byte ':' is not valid as a second octet). + Buggy implementations will accept this as a valid multibyte sequence: + + # NOTE: this result indicates a buggy mbrtowc + $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc + -2,1 + + A correct implementations should report '-1' for the second byte (i.e. + an encoding error): + + $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc + -2,-1 + + + Expected results with correct implementations + --------------------------------------------- + + In GNU Sed some tests purposely use invalid multibyte sequences + to test sed's behaviour. A buggy implemetation of mbrtowc + would result in false-alarm failures. + + The following are expected results in correct implementations: + (locale names are from Mac OS X): + + $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc + -2,1 + $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc + -2,-1 + $ printf '\262C' | LC_ALL=ja_JP.eucJP test-mbrtowc + -2,-1 +*/ + +#include <config.h> +#include <locale.h> +#include <stdio.h> +#include <stdlib.h> +#include <wchar.h> + +#include "closeout.h" +#include "error.h" +#include "progname.h" + +/* stub replacement for non-standard err(3) */ +static int +die (const char *msg) +{ + error (0, 0, "%s: error: %s\n", program_name, msg); + exit (EXIT_FAILURE); +} + +int +main (int argc, char **argv) +{ + int c; + int first = 1; + + set_program_name (argv[0]); + if (!setlocale (LC_ALL, "")) + die ("failed to set locale"); + + while ((c = getchar ()) != EOF) + { + wchar_t wc; + char ch = (unsigned char) c; + int i = (int) mbrtowc (&wc, &ch, 1, NULL); + + if (!first) + putchar (','); + first = 0; + + printf ("%d", i); + } + + if (first) + die ("empty input"); + + putchar ('\n'); + + if (ferror (stdin)) + die ("read error"); + close_stdout (); + + exit (EXIT_SUCCESS); +} |