summaryrefslogtreecommitdiff
path: root/sed/mbcs.c
blob: 3505ef61f49b3f0ddd36cc0a746a605dc0d71dd1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
/*  GNU SED, a batch stream editor.
    Copyright (C) 2003-2016 Free Software Foundation, Inc.

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3, or (at your option)
    any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */

#include "sed.h"
#include <stdlib.h>
#include <string.h>

#include "localcharset.h"

/* Copy of MB_CUR_MAX taken by initialize_mbcs ().  */
int mb_cur_max;
/* True when the codeset name returned by locale_charset () compared
   equal to "UTF-8"; assigned by initialize_mbcs ().  */
bool is_utf8;

/* Classify the single byte CH against the conversion state CUR_STAT.

   Return non-zero if CH is part of a valid multibyte sequence:
   either it begins or continues a sequence that is still incomplete,
   or it is the last byte of a valid multibyte sequence.

   Return zero in all other cases:
    CH is a valid single-byte character (e.g. 0x01-0x7F in UTF-8 locales);
    CH is an invalid byte in a multibyte sequence for the current locale;
    CH is the NUL byte.

   CUR_STAT is updated by mbrtowc(3) as a side effect, and is reset to
   the initial conversion state when CH is an invalid byte.
*/
int
is_mb_char (int ch, mbstate_t *cur_stat)
{
  const char c = ch;

  /* Sample the state *before* converting: mbrtowc updates CUR_STAT, and a
     return value of 1 alone cannot tell a plain single-byte character
     apart from the final byte of a multibyte sequence.  */
  const int mb_pending = !mbsinit (cur_stat);

  /* mbrtowc returns size_t; keep it unsigned and compare against the
     (size_t) -1 / (size_t) -2 error codes instead of narrowing to int.  */
  const size_t result = mbrtowc (NULL, &c, 1, cur_stat);

  switch (result)
    {
    case (size_t) -2: /* Beginning or middle of valid multibyte sequence */
      return 1;

    case (size_t) -1: /* Invalid sequence, byte treated like a single-byte
                         character */
      memset (cur_stat, 0, sizeof (mbstate_t));
      return 0;

    case 1: /* A valid byte, check if part of on-going multibyte sequence */
      return mb_pending;

    case 0: /* Special case of mbrtowc(3): the NUL character.  It is not
               part of a multibyte sequence, so report zero as documented
               in the header comment above.  */
      return 0;

    default: /* Should never happen, as per mbrtowc(3) documentation */
      panic ("is_mb_char: mbrtowc (0x%x) returned %d",
             (unsigned int) ch, (int) result);
    }
}

/* Capture the multibyte properties of the current locale in the
   globals MB_CUR_MAX and IS_UTF8.  */
void
initialize_mbcs (void)
{
  const char *const charset = locale_charset ();

  /* Snapshot of the MB_CUR_MAX macro.  */
  mb_cur_max = MB_CUR_MAX;

  /* UTF-8 is known to be a stateless encoding, so remember whether the
     codeset name is exactly "UTF-8".  */
  is_utf8 = !strcmp (charset, "UTF-8");
}