summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorgregor herrmann <gregoa@debian.org>2021-01-03 04:37:57 +0100
committergregor herrmann <gregoa@debian.org>2021-01-03 04:37:57 +0100
commit59d78d0353ecb859bd6300b1f0064d9fb2d5b096 (patch)
tree360edc8845b630f5192b2319ac693f9d29091426
parent2ec8478ad5c61a596ace4f72387dc3f6ba229d24 (diff)
parente65ed56efb5861bd232e87b506ac71c67c12c075 (diff)
Update upstream source from tag 'upstream/0.58'
Update to upstream version '0.58' with Debian dir 129bd643e808e36baf92f324d0be14415465cbb7
-rw-r--r--CONTRIBUTING.md8
-rw-r--r--Changes8
-rw-r--r--Json3.xs23
-rw-r--r--MANIFEST2
-rw-r--r--MANIFEST.SKIP8
-rw-r--r--META.json10
-rw-r--r--META.yml7
-rw-r--r--Makefile.PL4
-rw-r--r--README12
-rw-r--r--json-common.c48
-rw-r--r--json-entry-points.c14
-rw-r--r--json-perl.c26
-rw-r--r--lib/JSON/Parse.pm2
-rw-r--r--lib/JSON/Parse.pod733
-rw-r--r--lib/JSON/Tokenize.pm2
-rw-r--r--lib/JSON/Tokenize.pod6
-rw-r--r--t/max-depth.t27
-rw-r--r--unicode.c434
-rw-r--r--unicode.h69
-rw-r--r--utf8-byte-one.c3
20 files changed, 992 insertions, 454 deletions
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..4332a7c
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,8 @@
+If you want to contribute to this module, you can file a bug report on
+the GitHub issue tracker or email us with your suggestions.
+
+As of version 0.58, it is just about possible to install this module
+from the github repository, so you may be able to fork the module and
+install it successfully on your local computer. If so, please try
+sending a pull request.
+
diff --git a/Changes b/Changes
index bd06fa9..3d1f46d 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,11 @@
+0.58 2021-01-01
+
+* Protect against stack overflows by having a maximum parsing depth
+ -- set_max_depth, get_max_depth methods added
+* Documentation updated
+ -- JSON RFC changed to 8259
+ -- Discussion of Unicode tests in JSON Test Suite
+
0.57 2020-07-09
* Bug fix for long strings
diff --git a/Json3.xs b/Json3.xs
index e5db149..6bb8669 100644
--- a/Json3.xs
+++ b/Json3.xs
@@ -78,6 +78,7 @@ CODE:
croak ("no class");
}
Newxz (RETVAL, 1, json_parse_t);
+ json_parse_init (RETVAL);
OUTPUT:
RETVAL
@@ -195,6 +196,27 @@ detect_collisions (parser, onoff)
CODE:
parser->detect_collisions = SvTRUE (onoff) ? 1 : 0;
+void
+set_max_depth (json, max_depth)
+ JSON::Parse json;
+ int max_depth;
+CODE:
+ if (max_depth < 0) {
+ croak ("Invalid max depth %d", max_depth);
+ }
+ json->max_depth = max_depth;
+
+int
+get_max_depth (json)
+ JSON::Parse json;
+CODE:
+ RETVAL = json->max_depth;
+ if (json->max_depth == 0) {
+ RETVAL = JSON_PARSE_DEFAULT_MAX_DEPTH;
+ }
+OUTPUT:
+ RETVAL
+
#ifdef TESTRANDOM
int random_json ()
@@ -218,6 +240,7 @@ OUTPUT:
JSON::Tokenize tokenize_child (token)
JSON::Tokenize token
CODE:
+ RETVAL = token;
if (token->child) {
RETVAL = token->child;
RETVAL->blessed = 1;
diff --git a/MANIFEST b/MANIFEST
index 2d3f824..27a759d 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -1,4 +1,5 @@
Changes
+CONTRIBUTING.md
errors.c
examples/array.pl
examples/assert.pl
@@ -39,6 +40,7 @@ t/JSON-Parse.t
t/json-tokenize.t
t/Json3.t
t/kolmorogov42-1.t
+t/max-depth.t
t/numbers.t
t/object.t
t/perl-monks-1165399.t
diff --git a/MANIFEST.SKIP b/MANIFEST.SKIP
index 3b51251..215ee1b 100644
--- a/MANIFEST.SKIP
+++ b/MANIFEST.SKIP
@@ -48,6 +48,14 @@ benchmarks
^lib/JSON/Tokenize\.pod\.tmpl$
# Obsolete files
^obsolete/.*$
+# Files which exist so that the distribution can be built from the
+# github repository rather than the CPAN distribution file.
+^copy\.pl$
+^copied/.*$
+# Author version change script
+^versionup\.pl$
+# CI control files
+^\.travis\.yml$
# Local variables:
# comment-start: "#"
# End:
diff --git a/META.json b/META.json
index 7042d86..bb1b0fb 100644
--- a/META.json
+++ b/META.json
@@ -4,7 +4,7 @@
"Ben Bullock <bkb@cpan.org>"
],
"dynamic_config" : 1,
- "generated_by" : "ExtUtils::MakeMaker version 7.34, CPAN::Meta::Converter version 2.150010",
+ "generated_by" : "ExtUtils::MakeMaker version 7.44, CPAN::Meta::Converter version 2.150010",
"license" : [
"perl_5"
],
@@ -48,10 +48,6 @@
"web" : "https://github.com/benkasminbullock/JSON-Parse"
}
},
- "version" : "0.57",
- "x_contributors" : [
- "Shlomi Fish <shlomif@cpan.org>",
- "kolmogorov42"
- ],
- "x_serialization_backend" : "JSON::PP version 2.97001"
+ "version" : "0.58",
+ "x_serialization_backend" : "JSON::PP version 4.04"
}
diff --git a/META.yml b/META.yml
index ee7485d..ad31e52 100644
--- a/META.yml
+++ b/META.yml
@@ -7,7 +7,7 @@ build_requires:
configure_requires:
ExtUtils::MakeMaker: '0'
dynamic_config: 1
-generated_by: 'ExtUtils::MakeMaker version 7.34, CPAN::Meta::Converter version 2.150010'
+generated_by: 'ExtUtils::MakeMaker version 7.44, CPAN::Meta::Converter version 2.150010'
license: perl
meta-spec:
url: http://module-build.sourceforge.net/META-spec-v1.4.html
@@ -23,8 +23,5 @@ requires:
resources:
bugtracker: https://github.com/benkasminbullock/JSON-Parse/issues
repository: git://github.com/benkasminbullock/JSON-Parse.git
-version: '0.57'
-x_contributors:
- - 'Shlomi Fish <shlomif@cpan.org>'
- - kolmogorov42
+version: '0.58'
x_serialization_backend: 'CPAN::Meta::YAML version 0.018'
diff --git a/Makefile.PL b/Makefile.PL
index 96f109a..30d022d 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -30,10 +30,6 @@ WriteMakefile (
web => "$repo/issues",
},
},
- x_contributors => [
- 'Shlomi Fish <shlomif@cpan.org>',
- 'kolmogorov42',
- ],
},
# All the C files are actually #included into Json3.xs so there is
# only one object file.
diff --git a/README b/README
index 2294d71..c10aaaa 100644
--- a/README
+++ b/README
@@ -6,7 +6,7 @@
-This is the README for JSON::Parse version 0.57.
+This is the README for JSON::Parse version 0.58.
JSON::Parse is a "module" for the Perl computer programming language, a
library of computer code to install on a computer. This document contains
@@ -27,7 +27,7 @@ four sections:
JSON::Parse - Read JSON into a Perl variable
A module for parsing JSON. (JSON means "JavaScript Object Notation"
-and it is specified in "RFC 7159".)
+and it is specified in "RFC 8259".)
JSON::Parse offers the function "parse_json", which takes a string
containing JSON, and returns an equivalent Perl structure. It also
@@ -75,11 +75,11 @@ If you have the App::cpanminus installer, you may prefer
cpanm JSON::Parse
-To install the module from the source file, JSON-Parse-0.57.tar.gz, follow
+To install the module from the source file, JSON-Parse-0.58.tar.gz, follow
this sequence of commands:
- tar xfz JSON-Parse-0.57.tar.gz
- cd JSON-Parse-0.57
+ tar xfz JSON-Parse-0.58.tar.gz
+ cd JSON-Parse-0.58
perl Makefile.PL
make
make install
@@ -104,6 +104,6 @@ repository on github at
-----------------------------------------------------------------------------
-This README was written on Thu Jul 9 07:17:01 2020.
+This README was written on Fri Jan 1 10:00:07 2021.
-----------------------------------------------------------------------------
diff --git a/json-common.c b/json-common.c
index 86a0318..5921ba9 100644
--- a/json-common.c
+++ b/json-common.c
@@ -233,6 +233,14 @@ typedef struct parser {
int valid_bytes[JSON3MAXBYTE];
+ /* Current depth into arrays or objects. */
+
+ int depth;
+
+ /* Maximum depth we accept. */
+
+ int max_depth;
+
/* Perl SV * pointers to copy for our true, false, and null
values. */
void * user_true;
@@ -291,6 +299,42 @@ typedef struct parser {
}
json_parse_t;
+/* Maximum depth of parsing. */
+
+#define JSON_PARSE_DEFAULT_MAX_DEPTH 10000
+
+static void
+json_parse_init (json_parse_t * parser)
+{
+ parser->max_depth = JSON_PARSE_DEFAULT_MAX_DEPTH;
+}
+
+/* Check if the user has set something different from the default, and
+ don't croak if we are still OK. */
+
+/* Increment the parsing depth, with check. */
+
+//#define DEBUG_DEPTH
+
+#ifdef DEBUG_DEPTH
+#define PRINT_DEPTH \
+ printf ("%s:%d: %d\n", __FILE__, __LINE__, parser->depth);
+#else
+#define PRINT_DEPTH
+#endif
+
+#define INCDEPTH \
+ PRINT_DEPTH; \
+ parser->depth++; \
+ if (parser->depth > parser->max_depth) { \
+ croak ("error: too many [ or {, maximum is %d", \
+ parser->max_depth); \
+ }
+
+#define DECDEPTH \
+ parser->depth--; \
+ PRINT_DEPTH;
+
#ifndef NOPERL
static SV * error_to_hash (json_parse_t * parser, char * error_as_string);
#endif /* ndef NOPERL */
@@ -668,7 +712,7 @@ failbadinput (json_parse_t * parser)
else if (parser->bad_byte) {
croak ("JSON error at line %d, byte %d/%d: %s",
parser->line,
- parser->bad_byte - parser->input + 1,
+ (int) (parser->bad_byte - parser->input + 1),
parser->length, buffer);
}
else {
@@ -695,7 +739,7 @@ static INLINE void failresources (json_parse_t * parser, const char * format, ..
vsnprintf (buffer, ERRORMSGBUFFERSIZE, format, a);
va_end (a);
croak ("Parsing failed at line %d, byte %d/%d: %s", parser->line,
- parser->end - parser->input,
+ (int) (parser->end - parser->input),
parser->length, buffer);
}
diff --git a/json-entry-points.c b/json-entry-points.c
index 579633e..1d2b94b 100644
--- a/json-entry-points.c
+++ b/json-entry-points.c
@@ -35,7 +35,8 @@ static void check_end (json_parse_t * parser)
/* Our collection of bits and pieces. */ \
\
json_parse_t parser_o = {0}; \
- json_parse_t * parser = & parser_o
+ json_parse_t * parser = & parser_o; \
+ json_parse_init (parser)
#ifndef NOPERL
@@ -92,10 +93,12 @@ json_parse_run (json_parse_t * parser, SV * json)
switch (NEXTBYTE) {
case '{':
+ INCDEPTH;
r = object (parser);
break;
case '[':
+ INCDEPTH;
r = array (parser);
break;
@@ -196,10 +199,12 @@ c_validate (json_parse_t * parser)
switch (NEXTBYTE) {
case '{':
+ INCDEPTH;
valid_object (parser);
break;
case '[':
+ INCDEPTH;
valid_array (parser);
break;
@@ -287,10 +292,6 @@ c_tokenize (json_parse_t * parser)
}
check_end (parser);
-#if 0
- printf ("TOKENS:\n");
- print_tokens (r);
-#endif /* 0 */
return r;
}
@@ -298,7 +299,6 @@ static void
tokenize_free (json_token_t * token)
{
json_token_t * next;
- static int nfree;
next = token->child;
if (next) {
if (! next->blessed) {
@@ -314,8 +314,6 @@ tokenize_free (json_token_t * token)
token->next = 0;
}
if (! token->blessed) {
- //nfree++;
- //fprintf (stderr, "Free %d %p\n", nfree, token);
Safefree (token);
}
}
diff --git a/json-perl.c b/json-perl.c
index d1c713c..8753c14 100644
--- a/json-perl.c
+++ b/json-perl.c
@@ -14,7 +14,7 @@
#define PREFIX(x) x
#define SVPTR SV *
-#define SETVALUE value =
+#define SETVALUE value =
#elif defined(TOKENING)
@@ -30,7 +30,7 @@
#define PREFIX(x) valid_ ## x
#define SVPTR void
-#define SETVALUE
+#define SETVALUE
#endif /* def PERLING */
@@ -248,7 +248,11 @@ PREFIX (number) (json_parse_t * parser)
exp_number_end:
parser->end--;
+#ifdef PERLING
d = strtod (start, & end);
+#else
+ strtod (start, & end);
+#endif
if ((unsigned char *) end == parser->end) {
/* Success, strtod worked as planned. */
#ifdef PERLING
@@ -493,7 +497,7 @@ PREFIX (string) (json_parse_t * parser)
#define ADDBYTE len++
#include "utf8-byte-one.c"
-
+
/* Not a fall through. */
case BADBYTES:
ILLEGALBYTE;
@@ -697,10 +701,12 @@ static SVPTR PREFIX (object) (json_parse_t * parser);
break; \
\
case '{': \
+ INCDEPTH; \
SETVALUE PREFIX (object) (parser); \
break; \
\
case '[': \
+ INCDEPTH; \
SETVALUE PREFIX (array) (parser); \
break; \
\
@@ -817,6 +823,7 @@ PREFIX (array) (json_parse_t * parser)
goto array_middle;
array_end:
+ DECDEPTH;
#ifdef PERLING
return newRV_noinc ((SV *) av);
@@ -994,11 +1001,13 @@ PREFIX (object) (json_parse_t * parser)
copy the value but have to process it to remove the
escapes. */
+#ifdef PERLING
int klen;
klen = resolve_string (parser, & key);
-#ifdef PERLING
key.start = parser->buffer;
key.length = klen;
+#else
+ resolve_string (parser, & key);
#endif
}
#ifdef PERLING
@@ -1024,6 +1033,7 @@ PREFIX (object) (json_parse_t * parser)
goto hash_middle;
hash_end:
+ DECDEPTH;
#ifdef PERLING
return newRV_noinc ((SV *) hv);
@@ -1112,6 +1122,11 @@ json_parse_set_null (json_parse_t * parser, SV * user_null)
static void
json_parse_free (json_parse_t * parser)
{
+ /* We can get here with depth > 0 if the parser fails and then the
+ error is caught. */
+ if (parser->depth < 0) {
+ warn ("Parser depth underflow %d", parser->depth);
+ }
json_parse_delete_true (parser);
json_parse_delete_false (parser);
json_parse_delete_null (parser);
@@ -1121,7 +1136,7 @@ json_parse_free (json_parse_t * parser)
static void
json_parse_copy_literals (json_parse_t * parser, SV * onoff)
{
- if (! parser->no_warn_literals &&
+ if (! parser->no_warn_literals &&
(parser->user_true || parser->user_false || parser->user_null)) {
warn ("User-defined value overrules copy_literals");
}
@@ -1129,4 +1144,3 @@ json_parse_copy_literals (json_parse_t * parser, SV * onoff)
}
#endif /* def PERLING */
-
diff --git a/lib/JSON/Parse.pm b/lib/JSON/Parse.pm
index 8efe616..7e5269e 100644
--- a/lib/JSON/Parse.pm
+++ b/lib/JSON/Parse.pm
@@ -17,7 +17,7 @@ require Exporter;
use warnings;
use strict;
use Carp;
-our $VERSION = '0.57';
+our $VERSION = '0.58';
require XSLoader;
XSLoader::load (__PACKAGE__, $VERSION);
diff --git a/lib/JSON/Parse.pod b/lib/JSON/Parse.pod
index aadd2bb..f962dee 100644
--- a/lib/JSON/Parse.pod
+++ b/lib/JSON/Parse.pod
@@ -23,15 +23,15 @@ Convert JSON into Perl.
=head1 VERSION
-This documents version 0.57 of JSON::Parse corresponding to
-L<git commit 42e099ed38a3e85daed46f53bf367fa2b828e152|https://github.com/benkasminbullock/JSON-Parse/commit/42e099ed38a3e85daed46f53bf367fa2b828e152> released on Thu Jul 9 07:16:33 2020 +0900.
+This documents version 0.58 of JSON::Parse corresponding to
+L<git commit 6df355c947edf0bf190c0f3631ea46e65d4bb7ed|https://github.com/benkasminbullock/JSON-Parse/commit/6df355c947edf0bf190c0f3631ea46e65d4bb7ed> released on Fri Jan 1 09:59:25 2021 +0900.
=head1 DESCRIPTION
A module for parsing JSON. (JSON means "JavaScript Object Notation"
-and it is specified in L</RFC 7159>.)
+and it is specified in L</RFC 8259>.)
JSON::Parse offers the function L</parse_json>, which takes a string
containing JSON, and returns an equivalent Perl structure. It also
@@ -79,7 +79,7 @@ produces output
HASH
-(This example is included as L<F<hash.pl>|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.57/examples/hash.pl> in the distribution.)
+(This example is included as L<F<hash.pl>|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.58/examples/hash.pl> in the distribution.)
If the input JSON text is a serialized array, an array reference is
@@ -97,7 +97,7 @@ produces output
ARRAY
-(This example is included as L<F<array.pl>|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.57/examples/array.pl> in the distribution.)
+(This example is included as L<F<array.pl>|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.58/examples/array.pl> in the distribution.)
Otherwise a Perl scalar is returned.
@@ -162,7 +162,7 @@ produces output
-(This example is included as L<F<assert.pl>|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.57/examples/assert.pl> in the distribution.)
+(This example is included as L<F<assert.pl>|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.58/examples/assert.pl> in the distribution.)
This is the underlying function for L</valid_json>. It runs at the
@@ -219,26 +219,10 @@ refers to the caller's line.
=back
As the name implies, this is meant to be a "safety-first" version of
-L</parse_json>. This function does not pass all of the tests of the
-L</JSON Parsing Test Suite>, because it creates an error for duplicate
-keys in objects, which is legal JSON. See F<t/jpts.t> for details.
+L</parse_json>.
This function was added in version 0.38.
-=head1 OLD INTERFACE
-
-The following alternative function names are accepted. These are the
-names used for the functions in old versions of this module. These
-names are not deprecated and will never be removed from the module.
-
-=head2 json_to_perl
-
-This is exactly the same function as L</parse_json>.
-
-=head2 validate_json
-
-This is exactly the same function as L</assert_valid_json>.
-
=head1 Mapping from JSON to Perl
JSON elements are mapped to Perl as follows:
@@ -258,7 +242,7 @@ possible these are accepted by JSON::Parse.
=head2 JSON strings
JSON strings become Perl strings. The JSON escape characters such as
-C<\t> for the tab character (see section 2.5 of L</RFC 7159>) are
+C<\t> for the tab character (see section 2.5 of L</RFC 8259>) are
mapped to the equivalent ASCII character.
=head3 Handling of Unicode
@@ -294,7 +278,7 @@ but
-Escapes of the form \uXXXX (see page three of L</RFC 7159>) are mapped
+Escapes of the form \uXXXX (see page three of L</RFC 8259>) are mapped
to ASCII if XXXX is less than 0x80, or to UTF-8 if XXXX is greater
than or equal to 0x80.
@@ -370,7 +354,7 @@ produces output
Native Perl: かあ
-(This example is included as L<F<unicode-details.pl>|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.57/examples/unicode-details.pl> in the distribution.)
+(This example is included as L<F<unicode-details.pl>|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.58/examples/unicode-details.pl> in the distribution.)
Although in general the above would be an unsafe practice, JSON::Parse
@@ -379,7 +363,7 @@ format. To ensure that invalid inputs are never upgraded, JSON::Parse
checks each input byte to make sure that it forms UTF-8. See also
L</UTF-8 only>. Doing things this way, rather than the way that Perl
does it, was one of the original motivations for writing this
-module. See also L</HISTORY>.
+module.
Surrogate pairs in the form C<\uD834\uDD1E> are also handled. If the
second half of the surrogate pair is missing, an L</Unexpected
@@ -443,7 +427,7 @@ produces output
Ambiguous key 'a' is 2
-(This example is included as L<F<key-collision.pl>|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.57/examples/key-collision.pl> in the distribution.)
+(This example is included as L<F<key-collision.pl>|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.58/examples/key-collision.pl> in the distribution.)
Here the key "a" could be either 1 or 2. As seen in the example,
@@ -457,13 +441,24 @@ doesn't give information about collisions when storing into hash
values, and checking for collisions for every key will degrade
performance for the sake of an unlikely occurrence. The JSON
specification says "The names within an object SHOULD be unique." (see
-L</RFC 7159>, page 5), although it's not a requirement.
+L</RFC 8259>, page 5), although it's not a requirement.
For performance, L</valid_json> and L</assert_valid_json> do not store
hash keys, thus they cannot detect this variety of problem.
=head2 Literals
+=head3 false
+
+L</parse_json> maps the JSON false literal to a readonly scalar which
+evaluates to the empty string, or to zero in a numeric context. (This
+behaviour changed from version 0.36 to 0.37. In versions up to 0.36,
+the false literal was mapped to a readonly scalar which evaluated to 0
+only.) L</parse_json_safe> maps the JSON literal to a similar scalar
+without the readonly constraints. If you use a parser created with
+L</new>, you can choose either of these behaviours with
+L</copy_literals>, or you can tell JSON::Parse to put your own value
+in place of falses using the L</set_false> method.
=head3 null
@@ -483,18 +478,6 @@ either of these behaviours with L</copy_literals>, or you can tell
JSON::Parse to put your own value in place of trues using the
L</set_true> method.
-=head3 false
-
-L</parse_json> maps the JSON false literal to a readonly scalar which
-evaluates to the empty string, or to zero in a numeric context. (This
-behaviour changed from version 0.36 to 0.37. In versions up to 0.36,
-the false literal was mapped to a readonly scalar which evaluated to 0
-only.) L</parse_json_safe> maps the JSON literal to a similar scalar
-without the readonly constraints. If you use a parser created with
-L</new>, you can choose either of these behaviours with
-L</copy_literals>, or you can tell JSON::Parse to put your own value
-in place of falses using the L</set_false> method.
-
=head3 Round trips and compatibility
The Perl versions of literals produced by L</parse_json> will be
@@ -561,7 +544,7 @@ produces output
{"fripp":false,"bruce":true,"clapton":true,"hendrix":false}
-(This example is included as L<F<json-tiny-round-trip-demo.pl>|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.57/examples/json-tiny-round-trip-demo.pl> in the distribution.)
+(This example is included as L<F<json-tiny-round-trip-demo.pl>|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.58/examples/json-tiny-round-trip-demo.pl> in the distribution.)
Most of the other CPAN modules use similar methods to L<JSON::Tiny>,
@@ -624,23 +607,6 @@ literals>.
These methods only work on an object created with L</new>; they do not
affect the behaviour of L</parse_json> or L</parse_json_safe>.
-=head2 new
-
- my $jp = JSON::Parse->new ();
-
-Create a new JSON::Parse object.
-
-This method was added in version 0.38.
-
-=head2 run
-
- my $out = $jp->run ($json);
-
-This does the same thing as L</parse_json>, except its behaviour can
-be modified using the methods below.
-
-This method was added in version 0.38.
-
=head2 check
eval {
@@ -675,20 +641,6 @@ L</set_false> and L</set_null>, that takes precedence over this.
This method was added in version 0.38.
-=head2 warn_only
-
- $jp->warn_only (1);
-
-Warn, don't die, on error. Failed parsing returns the undefined value,
-C<undef>, and prints a warning.
-
-This can be switched off again using any false value:
-
- $jp->warn_only ('');
-
-This method was documented in version 0.38, but only implemented in
-version 0.41.
-
=head2 detect_collisions
$jp->detect_collisions (1);
@@ -714,7 +666,7 @@ produces output
-(This example is included as L<F<collide.pl>|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.57/examples/collide.pl> in the distribution.)
+(This example is included as L<F<collide.pl>|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.58/examples/collide.pl> in the distribution.)
The C<detect_collisions (1)> behaviour is the behaviour of
@@ -741,6 +693,52 @@ This requires Perl version 5.14 or later.
This method was added in version 0.46.
+=head2 get_max_depth
+
+ my $max_depth = $jp->get_max_depth ();
+
+This returns the maximum nesting depth of objects or arrays in the
+input JSON. The default value is 10,000.
+
+=head2 new
+
+ my $jp = JSON::Parse->new ();
+
+Create a new JSON::Parse object.
+
+This method was added in version 0.38.
+
+=head2 run
+
+ my $out = $jp->run ($json);
+
+This does the same thing as L</parse_json>, except its behaviour can
+be modified using the methods below.
+
+This method was added in version 0.38.
+
+
+=head2 set_max_depth
+
+ $jp->set_max_depth (42);
+
+Set the maximum nesting depth of objects or arrays in the input
+JSON. The default value is 10,000.
+
+=head2 warn_only
+
+ $jp->warn_only (1);
+
+Warn, don't die, on error. Failed parsing returns the undefined value,
+C<undef>, and prints a warning.
+
+This can be switched off again using any false value:
+
+ $jp->warn_only ('');
+
+This method was documented in version 0.38, but only implemented in
+version 0.41.
+
=head2 Methods for manipulating literals
These methods alter what is written into the Perl structure when the
@@ -903,6 +901,20 @@ copy_literals>.
This method was added in version 0.38.
+=head1 OLD INTERFACE
+
+The following alternative function names are accepted. These are the
+names used for the functions in old versions of this module. These
+names are not deprecated and will never be removed from the module.
+
+=head2 json_to_perl
+
+This is exactly the same function as L</parse_json>.
+
+=head2 validate_json
+
+This is exactly the same function as L</assert_valid_json>.
+
=head1 RESTRICTIONS
This module imposes the following restrictions on its input.
@@ -912,7 +924,7 @@ This module imposes the following restrictions on its input.
=item JSON only
JSON::Parse is a strict parser. It only accepts input which exactly
-meets the criteria of L</RFC 7159>. That means, for example,
+meets the criteria of L</RFC 8259>. That means, for example,
JSON::Parse does not accept single quotes (') instead of double quotes
("), or numbers with leading zeros, like 0123. JSON::Parse does not
accept control characters (0x00 - 0x1F) in strings, missing commas
@@ -949,12 +961,16 @@ L<perluniintro>):
open my $input, "<:encoding(UTF-16)", 'some-json-file';
JSON::Parse does not determine the nature of the octet stream, as
-described in part 3 of L</RFC 7159>.
+described in part 3 of L</RFC 8259>.
This restriction to UTF-8 applies regardless of whether Perl thinks
that the input string is a character string or a byte
string. Non-UTF-8 input will cause an L</Unexpected character> error.
+JSON::Parse does not accept non-characters or parts of surrogate pairs
+as UTF-8 bytes. See also the discussion under L</JSON Parsing Test
+Suite>.
+
=back
=head1 DIAGNOSTICS
@@ -983,9 +999,11 @@ block:
The following error messages are produced:
+=over
+
-=head2 Unexpected character
+=item Unexpected character
An unexpected character (byte) was encountered in the input. For
example, when looking at the beginning of a string supposedly
@@ -1111,7 +1129,7 @@ prints
-=head2 Unexpected end of input
+=item Unexpected end of input
The end of the string was encountered before the end of whatever was
being parsed was. For example, if a quote is missing from the end of
@@ -1129,7 +1147,7 @@ gives output
-=head2 Not surrogate pair
+=item Not surrogate pair
While parsing a string, a surrogate pair was encountered. While trying
to turn this into UTF-8, the second half of the surrogate pair turned
@@ -1147,7 +1165,7 @@ gives output
-=head2 Empty input
+=item Empty input
This error occurs for an input which is an empty (no length or
whitespace only) or an undefined value.
@@ -1163,12 +1181,12 @@ gives output
Prior to version 0.49, this error was produced by
L</assert_valid_json> only, but it is now also produced by
-L</parse_json>. See L</JSON Parsing Test Suite>.
+L</parse_json>.
-=head2 Name is not unique
+=item Name is not unique
This error occurs when parsing JSON when the user has chosen
L</detect_collisions>. For example an input like
@@ -1190,9 +1208,11 @@ terminology "name is not unique" is from the JSON specification.
-=head2 Contradictory values for "true" and "false"
+=item Contradictory values for "true" and "false"
+
+=over
-=head3 User-defined value for JSON false evaluates as true
+=item User-defined value for JSON false evaluates as true
This happens if you set JSON false to map to a true value:
@@ -1202,7 +1222,7 @@ To switch off this warning, use L</no_warn_literals>.
This warning was added in version 0.38.
-=head3 User-defined value for JSON true evaluates as false
+=item User-defined value for JSON true evaluates as false
This happens if you set JSON true to map to a false value:
@@ -1212,7 +1232,7 @@ To switch off this warning, use L</no_warn_literals>.
This warning was added in version 0.38.
-=head2 User-defined value overrules copy_literals
+=item User-defined value overrules copy_literals
This warning is given if you set up literals with L</copy_literals>
then you also set up your own true, false, or null values with
@@ -1220,6 +1240,10 @@ L</set_true>, L</set_false>, or L</set_null>.
This warning was added in version 0.38.
+=back
+
+=back
+
=head1 PERFORMANCE
On the author's computer, the module's speed of parsing is
@@ -1243,8 +1267,8 @@ able to fully work out the reason behind the better speed.
There is some benchmarking code in the github repository under the
directory "benchmarks" for those wishing to test these claims. The
-script L<F<benchmarks/bench>|https://github.com/benkasminbullock/JSON-Parse/blob/42e099ed38a3e85daed46f53bf367fa2b828e152/benchmarks/bench> is an adaptation of the similar
-script in the L<JSON::XS> distribution. The script L<F<benchmarks/pub-bench.pl>|https://github.com/benkasminbullock/JSON-Parse/blob/42e099ed38a3e85daed46f53bf367fa2b828e152/benchmarks/pub-bench.pl> runs the benchmarks and prints them
+script L<F<benchmarks/bench>|https://github.com/benkasminbullock/JSON-Parse/blob/6df355c947edf0bf190c0f3631ea46e65d4bb7ed/benchmarks/bench> is an adaptation of the similar
+script in the L<JSON::XS> distribution. The script L<F<benchmarks/pub-bench.pl>|https://github.com/benkasminbullock/JSON-Parse/blob/6df355c947edf0bf190c0f3631ea46e65d4bb7ed/benchmarks/pub-bench.pl> runs the benchmarks and prints them
out as POD.
The following benchmark tests used version 0.47 of JSON::Parse and
@@ -1335,30 +1359,34 @@ by JSON::XS.
=over
-=item RFC 7159
+=item RFC 8259
-JSON is specified in L<RFC 7159 "The application/json Media Type for
-JavaScript Object Notation
-(JSON)"|http://www.ietf.org/rfc/rfc7159.txt>.
+JSON is specified in L<RFC 8259 "The JavaScript Object Notation (JSON) Data Interchange Format"|http://www.ietf.org/rfc/rfc8259.txt>.
=item json.org
-L<http://json.org> is the website for JSON, authored by Douglas
+L<https://json.org> is the website for JSON, authored by Douglas
Crockford.
-=item JSON::Create
+=back
-L<JSON::Create> is a companion module to JSON::Parse by the same
-author. As of version 0.08, I'm using it everywhere, but it should
-still be considered to be in a testing stage. Please feel free to try
-it out.
+=head2 Other CPAN modules for parsing and producing JSON
-=item JSON::Tokenize
+Modules which we recommend are marked with 👍. Deprecated modules and
+modules which are definitely buggy (bug reports/pull requests ignored)
+and abandoned (no releases for several years) are marked with 👎
+and/or 🐛. Modules we can't work out are marked with 😕.
-L<JSON::Tokenize> is part of the JSON::Parse distribution, a tokenizer
-which reduces a JSON string to tokens. This makes the JSON::Parse
-tokenizer available to people who want to write their own JSON
-parsers.
+=over
+
+=item Modules by the same author
+
+=over
+
+=item JSON::Create
+
+👍 L<JSON::Create> is a companion module to JSON::Parse by the same
+author.
=item JSON::Repair
@@ -1366,20 +1394,50 @@ L<JSON::Repair> is an example module which demonstrates using
JSON::Parse to apply some kinds of heuristics to repair "relaxed JSON"
or otherwise broken JSON into compliant JSON.
-=back
+=item JSON::Tokenize
-=head2 Other CPAN modules for parsing and producing JSON
+L<JSON::Tokenize> is part of the JSON::Parse distribution, a tokenizer
+which reduces a JSON string to tokens. This makes the JSON::Parse
+tokenizer available to people who want to write their own JSON
+parsers.
-=over
+=back
=item Reading and writing JSON
=over
+=item L<Cpanel::JSON::XS>
+
+This is a fork of L<JSON::XS> related to a disagreement about how to
+report bugs. Please see the module for details.
+
+=item L<File::JSON::Slurper>
+
+Slurp a JSON file into a data structure, and the reverse. It relies on
+L</JSON::MaybeXS>.
+
+=item L<Glib::JSON>
+
+Uses the JSON library from Glib, a library of C functions for the
+Linux GNOME desktop project.
+
+=item L<Inline::JSON>
+
+Include a chunk of JSON text right into your Perl program. Relies on
+L</JSON>.
+
=item L<JSON>
This calls on either L<JSON::PP> or L<JSON::XS>.
+=item L<JSON::DWIW>
+
+👎🐛 This module "Does What I Want", where "I" refers to the module's
+author. Development seems to have ceased in 2010, there is a long list
+of unfixed bugs, and some of the module's features seem to predate
+Unicode support in Perl.
+
=item L<JSON::PP>
This is part of the Perl core, installed when you install Perl. "PP"
@@ -1387,46 +1445,45 @@ stands for "Pure Perl", which means it is in Perl-only without the XS
(C-based) parsing. This is slower but may be necessary if you cannot
install modules requiring a C compiler.
-=item L<JSON::XS>
+=item L<JSON::Slurper>
-This is an all-purpose JSON module in XS, which means it requires a C
-compiler to install.
-
-=item L<Cpanel::JSON::XS>
+Convenient file slurping and spurting of data using JSON. Uses
+L</JSON::PP> or L</Cpanel::JSON::XS> if available. The basic idea
+seems to be that it uses context to return arrays or hashes as
+required, and read and write files without extra stages of opening and
+closing the file.
-This is a fork of L<JSON::XS> related to a disagreement about how to
-report bugs. Please see the module for details.
+=item L<JSON::Syck>
-=item L<JSON::DWIW>
+👎🐛 Takes advantage of a similarity between YAML (yet another markup
+language) and JSON to provide a JSON parser/producer using
+L<YAML::Syck>.
-"Does what I want" module.
+We have never tried this module, but it seems to be semi-deprecated
+(the ABSTRACT says "consider using JSON::XS instead!") and L<there
+are a lot of bug reports|https://github.com/toddr/YAML-Syck/issues>
+about things like failing to process equals signs. However, the
+maintainer is fixing some of the bugs and making new releases, so
+we're not really sure.
-=item L<JSON::YAJL>
+=item L<JSON::Tiny>
-Wraps a C library called yajl.
+This is a fork of L</Mojo::JSON>.
=item L<JSON::Util>
Relies on L<JSON::MaybeXS>.
-=item L<Pegex::JSON>
-
-Based on L<Pegex>.
-
-=item L<JSON::Syck>
-
-Takes advantage of a similarity between YAML (yet another markup
-language) and JSON to provide a JSON parser/producer using
-L<YAML::Syck>.
-
-=item L<Inline::JSON>
+=item L<JSON::XS>
-Relies on L</JSON>.
+This is an all-purpose JSON module in XS, which means it requires a C
+compiler to install.
-=item L<Glib::JSON>
+=item L<JSON::YAJL>
-Uses the JSON library from Glib, a library of C functions for the
-Linux GNOME desktop project.
+👎🐛 Wraps a C library called yajl. The module has been abandoned for
+ten years, bug reports include serious errors, and pull requests
+have been ignored.
=item L<Mojo::JSON>
@@ -1434,14 +1491,10 @@ Part of the L<Mojolicious> standalone web framework, "pure Perl" JSON
reader/writer. As of version 6.25 of Mojolicious, this actually
depends on L</JSON::PP>.
-=item L<JSON::Tiny>
-
-This is a fork of L</Mojo::JSON>.
-
-=item L<File::JSON::Slurper>
+=item L<Pegex::JSON>
-Slurp a JSON file into a data structure, and the reverse. It relies on
-L</JSON::MaybeXS>.
+🐛 Based on L<Pegex>. See
+L<our bug report|https://github.com/pegex-parser/pegex-json-pm/issues/3>.
=back
@@ -1449,54 +1502,60 @@ L</JSON::MaybeXS>.
=over
-=item L<JSON::MultiValueOrdered> and L<JSON::Tiny::Subclassable>
-
-C<JSON::MultiValueOrdered> is a special-purpose module for parsing
-JSON objects which have key collisions (something like
-C<{"a":1,"a":2}>) within objects.
+=item L<App::JSON::to>
-(JSON::Parse's handling of key collisions is discussed in L</Key
-collisions> in this document.)
+Convert JSON data to other formats. It reads your JSON file or input
+and converts it into either YAML or Perl native format using
+L<Data::Dumper>.
=item L<boolean>
-This module offers C<true> and C<false> literals similar to JSON.
+👍 This module offers C<true> and C<false> literals similar to JSON.
+
+=item L<Config::JSON>
+
+Configuration files in JSON
=item L<Devel::JSON>
For one-liners.
-=item L<App::JSON::to>
+=over
-Convert JSON data to other formats.
+If you use this module from the command-line, the last value of your
+one-liner (-e) code will be serialized as JSON data.
-=item L<JSON::Color>
+=back
-This module generates JSON, colorized with ANSI escape sequences.
+=item L<JSON::Builder>
-=item L<Config::JSON>
+Create JSON under memory limitations.
-Configuration files in JSON
+=item L<JSON::Color>
-=item L<JSON::String>
+🌈 This module generates JSON colorized with ANSI escape sequences.
-Automatically change a JSON string when a data structure changes.
+=item L<JSON::MultiValueOrdered> and L<JSON::Tiny::Subclassable>
-=item L<JSON::Builder>
+C<JSON::MultiValueOrdered> is a special-purpose module for parsing
+JSON objects which have key collisions (something like
+C<{"a":1,"a":2}>) within objects.
-Create JSON under memory limitations.
+(JSON::Parse's handling of key collisions is discussed in L</Key
+collisions> in this document.)
-=item L<JSON::Pointer>
+=item L<JSON::Path>
-Extract parts of a JSON string.
+Search nested hashref/arrayref structures using JSONPath.
-=item L<Inline::JSON>
+=item L<JSON::Pointer>
-Include JSON in a Perl program.
+Extract parts of a JSON string.
-=item L<JSON::Path>
+=item L<JSON::String>
-Search nested hashref/arrayref structures using JSONPath.
+Automatically change a JSON string when a data structure changes using
+tied scalars.
=back
@@ -1504,20 +1563,27 @@ Search nested hashref/arrayref structures using JSONPath.
=over
+=item L<Test::Deep::JSON>
+
+Compare JSON with L<Test::Deep>. As of version 0.05, it relies on
+L</JSON::MaybeXS>.
+
=item L<Test::JSON>
-This offers a way to compare two different JSON strings to see if they
-refer to the same object. As of version 0.11, it relies on
-L</JSON::Any>.
+👎 This offers a way to compare two different JSON strings to see if
+they refer to the same object. The most recent version, 0.11, was
+released in 2009, and it relies on the deprecated L</JSON::Any>, which
+makes it essentially abandoned.
-=item L<Test::JSON::More>
+=item L<Test::JSON::Entails>
-JSON Test Utility. As of version 0.02, it relies on L</JSON>.
+👎 Test whether one JSON or Perl structure entails/subsumes
+another. The most recent version is from 2012, and it relies on
+L</JSON::Any>, so it is probably abandoned.
-=item L<Test::Deep::JSON>
+=item L<Test::JSON::More>
-Compare JSON with L<Test::Deep>. As of version 0.03, it relies on
-L</JSON>.
+JSON Test Utility. As of version 0.02, it relies on L</JSON>.
=back
@@ -1527,41 +1593,61 @@ These untangle numbers, strings, and booleans into JSON types.
=over
+=item L<JSON::TypeInference>
+
+😕 Virtually undocumented, it's not clear what this does.
+
=item L<JSON::Types>
-=item L<JSON::TypeInference>
+Change the type of a Perl variable so that it comes out as a number, a
+string, or a boolean in the output JSON.
+
+=item L<JSON::Types::Flexible>
+
+The module is barely documented, but from looking at L<the test
+file|https://metacpan.org/source/PINE/JSON-Types-Flexible-0.03/t%2Fjson%2Ftypes%2Fflexible%2Fclass.t>,
+this seems to enable you to change the output type of a number or a
+string so that you can, for example, make the number C<1> come out as
+either a number, C<1>, a string C<"1">, or a boolean, C<true>, in the
+output JSON.
=item L<JSON::Typist>
-=item L<JSON::Types::Flexible>
+"Replace mushy strings and numbers with rigidly typed replacements"
+
+Since Perl muddles strings and numbers, this enables you to work out
+whether your input JSON was C<"123"> (a string) or C<123> (a number).
=back
=item Combination modules
-These modules rely on more than one back-end module.
+These modules rely on more than one back-end module to process JSON
+for you.
=over
+=item L<JSON::Any>
+
+👎 This now-deprecated module combines L</JSON::DWIW>, L</JSON::XS>
+versions one and two, and L</JSON::Syck>.
+
=item L<JSON::MaybeXS>
A module which combines L</Cpanel::JSON::XS>, L</JSON::XS>, and
L</JSON::PP>. The original L</JSON> combines L</JSON::XS> and
-L</JSON::PP>, so this prioritizes L</Cpanel::JSON::XS>.
-
-=item L<JSON::Any>
-
-This module combines L</JSON::DWIW>, L</JSON::XS> versions one and
-two, and L</JSON::Syck>.
+L</JSON::PP>, but this prioritizes L</Cpanel::JSON::XS> over
+L</JSON::XS>.
=item L<JSON::XS::VersionOneAndTwo>
-A "combination module" which supports two different interfaces of
-L</JSON::XS>. However, JSON::XS is now onto version 3.
+👎 A "combination module" which supports two different interfaces of
+L</JSON::XS>. However, JSON::XS is now onto version 4.
=item L<Mojo::JSON::MaybeXS>
-This pulls in L</JSON::MaybeXS> instead of L</Mojo::JSON>.
+This pulls in L</JSON::MaybeXS> instead of L</Mojo::JSON> for
+L<Mojolicious> users.
=back
@@ -1571,6 +1657,10 @@ These modules extend JSON with comments and other things.
=over
+=item L<JSON::Diffable>
+
+"A relaxed and easy diffable JSON variant"
+
=item L<JSON::Relaxed>
"An extension of JSON that allows for better human-readability".
@@ -1579,44 +1669,152 @@ These modules extend JSON with comments and other things.
"Relaxed JSON with a little bit of YAML"
-=item L<JSON::Diffable>
+=back
-"A relaxed and easy diffable JSON variant"
+=item Web interactions via JSON
+
+=over
+
+=item L<JSON::API>
+
+Combines L<LWP::UserAgent> and L<JSON> to make a unified module to
+communicate with a web server via JSON.
+
+=item L<WWW::JSON>
+
+"Make working with JSON Web API's as painless as possible"
+
+=back
+
+=item Extension modules
+
+These modules extend the existing modules with some extra bits.
+
+=over
+
+=item L<JSON::XS::Sugar>
+
+Provides booleans and number/string forcing for L</JSON::XS>.
+
+=item L<Silki::JSON>
+
+Switches on formatting and strict utf8 in a L</JSON::XS> object.
=back
=item Other modules
+Modules which are parts of bigger releases have not been included here
+except by accident.
+
=over
=item L<App::JSON::Tools>
+Undocumented command-line tools for JSON.
+
=item L<App::JSONPretty>
-=item L<Eve::Json>
+👎🐛 JSON prettification script. For whatever reason the script
+encapsulates the entirety of an old version of the L</JSON> module
+dating from before L</JSON::PP> was included in the Perl core.
+
+If you need this kind of script, there is something called L<json_xs>
+which comes with L</JSON::XS>, or equivalently L<cpanel_json_xs> in
+the forked module L</Cpanel::JSON::XS>.
+
+=item L<ARGV::JSON>
=item L<Haineko::JSON>
-=item L<JBD::JSON>
+😕 It says "Wrapper class to load/dump JSON" but we're not sure what
+the author means, for example what does it mean to say "loading JSON
+from scalar value"? Every module which deals with JSON deals with
+parsing JSON from a scalar, so why is a wrapper class necessary?
+
+=item L<JS::JSON>
+
+👎 This is JavaScript code which was uploaded to CPAN. The original
+JavaScript is now obsolete since the thing it codes is included in all
+modern web browsers.
+
+=item L<JSON::Assert>
+
+"Asserts JSONPaths into a JSON data structure for correct
+values/matches"
+
+=item L<JSON::Eval>
+
+Eval Perl code found in JSON. This module enables one to encode and
+decode Perl scalar references and code references to JSON.
+
+=item L<JSON::ize>
-=item L<JSON::JS>
+Something about one-liners.
+
+=item L<JSON::JSend>
+
+=item L<JSON::Lines>
+
+"JSON Lines is a convenient format for storing structured data that
+may be processed one record at a time."
=item L<JSON::Meth>
+😕 Claims to be "no nonsense JSON encoding/decoding as method calls on
+data". From the documentation:
+
+=over
+
+Don't make me think and give me what I want! This module automatically
+figures out whether you want to encode a Perl data structure to JSON
+or decode a JSON string to a Perl data structure.
+
+=back
+
=item L<JSON::ON>
+JavaScript object notation object notator.
+
+=item L<JSON::Patch>
+
+😕 We don't know what this does, or how it relates to JSON. The example
+in the synopsis section of the document doesn't show any JSON, it
+shows an example of altering nested hashes in Perl.
+
=item L<JSON::SL>
+😕
+
=item L<JSON::Streaming::Reader> and L<JSON::Streaming::Writer>
-=item L<JSON::XS::ByteString>
+=item L<JSON::T>
-=item L<JSON::XS::Sugar>
+Transform JSON using JsonT
-=item L<Silki::JSON>
+=item L<JSON::XS::ByteString>
+
+😕 L<The
+README|https://metacpan.org/source/CINDY/JSON-XS-ByteString-1.004/README>
+claims it is a "thin wrapper around JSON::XS", but L<it contains a
+complete implementation of
+JSON|https://metacpan.org/source/CINDY/JSON-XS-ByteString-1.004/ByteString.xs>,
+which seems to have partly been copy-pasted from the JSON::XS source
+code, but internally it doesn't make any reference to JSON::XS. The
+licence and copyright statement don't mention JSON::XS's original
+author at all so we're not sure if this is a fork, a wrapper, or a
+reimplementation.
+
+We haven't tried downloading this or installing it, but according to
+the documentation, this module encodes numbers with quotes around
+them, so C<< {this => 2} >> turns into C<{"this":"2"}>.
=item L<Text::JSON::Nibble>
+Nibble complete JSON objects from buffers.
+
+This seems to be for extracting JSON from the midst of noise.
+
=back
=back
@@ -1680,7 +1878,7 @@ supplied with the module in the F</t/> subdirectory of the
distribution.
More extensive testing code is in the git repository. This is not
-supplied in the CPAN distribution. A script, L<F<randomjson.pl>|https://github.com/benkasminbullock/JSON-Parse/blob/42e099ed38a3e85daed46f53bf367fa2b828e152/randomjson.pl>,
+supplied in the CPAN distribution. A script, L<F<randomjson.pl>|https://github.com/benkasminbullock/JSON-Parse/blob/6df355c947edf0bf190c0f3631ea46e65d4bb7ed/randomjson.pl>,
generates a set number of bytes of random JSON and checks that the
module's bytewise validation of input is correct. It does this by
taking a valid fragment, then adding each possible byte from 0 to 255
@@ -1690,119 +1888,62 @@ it to the fragment and continuing the process until a complete valid
JSON input is formed. The module has undergone about a billion
repetitions of this test.
-This setup relies on a C file, L<F<json-random-test.c>|https://github.com/benkasminbullock/JSON-Parse/blob/42e099ed38a3e85daed46f53bf367fa2b828e152/json-random-test.c>, which isn't in
-the CPAN distribution, and it also requires L<F<Json3.xs>|https://github.com/benkasminbullock/JSON-Parse/blob/42e099ed38a3e85daed46f53bf367fa2b828e152/Json3.xs> to be edited
-to make the macro C<TESTRANDOM> true (uncomment line 7 of the
-file). The testing code uses C setjmp/longjmp, so it's not guaranteed
-to work on all operating systems and is commented out for CPAN
-releases.
+This setup relies on a C file, L<F<json-random-test.c>|https://github.com/benkasminbullock/JSON-Parse/blob/6df355c947edf0bf190c0f3631ea46e65d4bb7ed/json-random-test.c>, which
+isn't in the CPAN distribution, and it also requires L<F<Json3.xs>|https://github.com/benkasminbullock/JSON-Parse/blob/6df355c947edf0bf190c0f3631ea46e65d4bb7ed/Json3.xs> to be edited to make the macro C<TESTRANDOM> true
+(uncomment line 7 of the file). The testing code uses C
+setjmp/longjmp, so it's not guaranteed to work on all operating
+systems and is commented out for CPAN releases.
-A pure C version called L<F<random-test.c>|https://github.com/benkasminbullock/JSON-Parse/blob/42e099ed38a3e85daed46f53bf367fa2b828e152/random-test.c> also exists. This applies
+A pure C version called L<F<random-test.c>|https://github.com/benkasminbullock/JSON-Parse/blob/6df355c947edf0bf190c0f3631ea46e65d4bb7ed/random-test.c> also exists. This applies
exactly the same tests, and requires no Perl at all.
If you're interested in testing your own JSON parser, the outputs
-generated by L<F<randomjson.pl>|https://github.com/benkasminbullock/JSON-Parse/blob/42e099ed38a3e85daed46f53bf367fa2b828e152/randomjson.pl> are quite a good place to start. The
-default is to produce UTF-8 output, which looks pretty horrible since
-it tends to produce long strings of UTF-8 garbage. (This is because it
-chooses randomly from 256 bytes and the end-of-string marker C<"> has
-only a 1/256 chance of being chosen, so the strings tend to get long
-and messy). You can mess with the internals of JSON::Parse by setting
-MAXBYTE in F<json-common.c> to 0x80, recompiling (you can ignore the
-compiler warnings), and running F<randomjson.pl> again to get just
-ASCII random JSON things. This breaks the UTF-8 functionality of
-JSON::Parse, so please don't install that version.
+generated by L<F<randomjson.pl>|https://github.com/benkasminbullock/JSON-Parse/blob/6df355c947edf0bf190c0f3631ea46e65d4bb7ed/randomjson.pl> are quite a good place to
+start. The default is to produce UTF-8 output, which looks pretty
+horrible since it tends to produce long strings of UTF-8
+garbage. (This is because it chooses randomly from 256 bytes and the
+end-of-string marker C<"> has only a 1/256 chance of being chosen, so
+the strings tend to get long and messy). You can mess with the
+internals of JSON::Parse by setting MAXBYTE in F<json-common.c> to
+0x80, recompiling (you can ignore the compiler warnings), and running
+F<randomjson.pl> again to get just ASCII random JSON things. This
+breaks the UTF-8 functionality of JSON::Parse, so please don't install
+that version.
=head2 JSON Parsing Test Suite
-Version 0.48 passed all but two of the yes/no tests of the L<JSON
-Parsing Test Suite|https://github.com/nst/JSONTestSuite>. The first
-failure was that L</assert_valid_json> did not mark L<a completely
-empty
-file|https://github.com/nst/JSONTestSuite/blob/master/test_parsing/n_structure_no_data.json>
-as invalid JSON, and the second was that L</parse_json> did not mark
-L<a file containing a single space
-character|https://github.com/nst/JSONTestSuite/blob/master/test_parsing/n_single_space.json>
-as invalid json. The tests also revealed an inconsistency between
-L</assert_valid_json> and L</valid_json>, which was reporting the
-completely empty file as invalid. Running these tests also revealed
-several bugs in the script L<validjson|/SCRIPT>. All of these errors
-were amended in version 0.49.
-
-I attempted to include the JSON Parsing Test Suite tests in the
-module's tests, but some of the files (like 100,000 open arrays)
-actually L<cause crashes on some versions of Perl on some
-machines|http://fast-matrix.cpantesters.org/?dist=JSON-Parse%200.48_01>,
-so they're not really suitable for distribution. The tests are found,
-however, in the repository under L<F<xt/jpts.t>|https://github.com/benkasminbullock/JSON-Parse/blob/42e099ed38a3e85daed46f53bf367fa2b828e152/xt/jpts.t> and the
-subdirectory L<F<xt/jpts>|https://github.com/benkasminbullock/JSON-Parse/blob/42e099ed38a3e85daed46f53bf367fa2b828e152/xt/jpts>, so if you are interested in the
-results, please copy that and try it. There is also a test for the
-L<validjson|/SCRIPT> script as L<F<xt/validjson.t>|https://github.com/benkasminbullock/JSON-Parse/blob/42e099ed38a3e85daed46f53bf367fa2b828e152/xt/validjson.t> in the
-repository. These are author tests, so you may need to install extra
-modules to run them. These author tests are run automatically before
-any code is uploaded to CPAN.
-
-=head1 HISTORY
-
-See L<F<Changes>|https://github.com/benkasminbullock/JSON-Parse/blob/42e099ed38a3e85daed46f53bf367fa2b828e152/Changes> in the distribution for a full list of changes.
-
-This module started out under the name C<JSON::Argo>. It was
-originally a way to escape from having to use the other JSON modules
-on CPAN. The biggest issue that I had with the other modules was the
-way that Unicode was handled. Insisting on the pure Perl method of
-dealing with JSON strings, which are required to be in Unicode anyway,
-seems to me little more than superstition, something like telling
-programmers not to step on cracks in the pavement. This module
-completely bypasses that. See L</Handling of Unicode> for the details
-of how this module differs from the other modules.
-
-The reason it only parsed JSON was that when I started this I didn't
-know the Perl extension language XS very well (I still don't know it
-very well), and I was not confident about making a JSON producer, so
-it only parsed JSON, which was the main job I needed to do. It
-originally used lex and yacc in the form of flex and bison, since
-discarded. I also found out that someone else had a JSON parser called
-Argo in Java, so to save confusion I dropped the name JSON::Argo and
-renamed this JSON::Parse, keeping the version numbers continuous.
-
-The module has since been completely rewritten, twice, mostly in an
-attempt to improve performance, after I found that JSON::XS was much
-faster than the original JSON::Parse. (The first rewrite of the module
-was not released to CPAN, this is the second one, which explains why
-some files have names like F<Json3.xs>). I also hoped to make something
-useful which wasn't in any existing CPAN module by offering the
-high-speed validator, L</valid_json>.
-
-I also rewrote the module due to some bugs I found, for example up to
-version 0.09 it was failing to accept whitespace after an object key
-string, so a JSON input of the form C<{ "x" : "y" }>, with whitespace
-between the C<"x"> and the colon, C<:>, would cause it to fail. That
-was one big reason I created the random testing regime described in
-L</TESTING> above. I believe that the module is now compliant with the
-JSON specification.
-
-After starting JSON::Create, I realised that some edge case handling
-in JSON::Parse needed to be improved. This resulted in the addition of
-the hash collision and literal-overriding methods introduced
-in versions 0.37 and 0.38 of this module.
-
-Version 0.42 fixed a very serious bug where long strings could
-overflow an internal buffer, and could cause a segmentation fault.
-
-Version 0.48 removed an experimental feature called
-C<$json_diagnostics> which made the module's errors be produced in
-JSON format, and replaced it with the current L</diagnostics_hash>
-method, for the benefit of L</JSON::Repair>.
-
-Version 0.49 brought the module into conformance with the L</JSON
-Parsing Test Suite>.
-
-Version 0.54 removed support for the Solaris operating system.
+JSON::Parse version 0.58 passes most of the JSON Parsing Test Suite,
+with the exception that JSON::Parse rejects various erroneous UTF-8
+inputs, for example JSON::Parse will throw an error for non-character
+code points like Unicode U+FFFF and U+10FFFF. This parser only accepts
+valid UTF-8 as input. See L</UTF-8 only>.
+
+In our opinion it would be a disservice to users of this module to
+allow bytes containing useless fragments such as incomplete parts of
+surrogate pairs, or invalid characters, just because the JSON
+specification doesn't actually explicitly rule out rejecting these
+kinds of garbage inputs. Please see the function C<daft_test> in the
+file F<xt/JPXT.pm> for exactly which of these elements of the test
+suite we do not comply with. See also the L<JSON
+parser|https://github.com/douglascrockford/JSON-c/blob/master/utf8_decode.c#L38-L43>
+of Douglas Crockford, the inventor of JSON,
+dated 2005.
+
+JSON::Parse version 0.58 also introduced L</get_max_depth> and
+L</set_max_depth> to prevent the stack overflow errors caused by some
+very deeply nested inputs such as those of the JSON Parsing Test
+Suite.
=head1 ACKNOWLEDGEMENTS
-Shlomi Fish (SHLOMIF) fixed some memory leaks in version
-0.40. kolmogorov42 (https://github.com/kolmogorov42) reported a very
-serious bug which led to version 0.42.
+Toby Inkster (TOBYINK) suggested the new function names which replaced
+the L</OLD INTERFACE> names. Nicolas Immelman and Shlomi Fish
+(SHLOMIF) reported memory leaks which were fixed in 0.32 and
+0.40. Github user kolmogorov42 reported a bug which led to
+0.42. Github user SteveGlassman found an error in string copying for
+long strings, fixed in 0.57. Lars Dɪᴇᴄᴋᴏᴡ (DAXIM) pointed out problems
+with the JSON Parsing Test Suite which led to the addition of stack
+protection and L</set_max_depth> and L</get_max_depth> in 0.58.
@@ -1814,7 +1955,7 @@ Ben Bullock, <bkb@cpan.org>
=head1 COPYRIGHT & LICENCE
This package and associated files are copyright (C)
-2013-2020
+2013-2021
Ben Bullock.
You can use, copy, modify and redistribute this package and associated
diff --git a/lib/JSON/Tokenize.pm b/lib/JSON/Tokenize.pm
index 69ea3c9..1bb386e 100644
--- a/lib/JSON/Tokenize.pm
+++ b/lib/JSON/Tokenize.pm
@@ -7,7 +7,7 @@ use JSON::Parse;
our @EXPORT_OK = qw/tokenize_json tokenize_start tokenize_next tokenize_start tokenize_end tokenize_type tokenize_child tokenize_text/;
our %EXPORT_TAGS = ('all' => \@EXPORT_OK);
use Carp;
-our $VERSION = '0.57';
+our $VERSION = '0.58';
sub tokenize_text
{
diff --git a/lib/JSON/Tokenize.pod b/lib/JSON/Tokenize.pod
index 2906f35..6129003 100644
--- a/lib/JSON/Tokenize.pod
+++ b/lib/JSON/Tokenize.pod
@@ -48,8 +48,8 @@ This outputs
=head1 VERSION
-This documents version 0.57 of JSON::Tokenize corresponding to
-L<git commit 42e099ed38a3e85daed46f53bf367fa2b828e152|https://github.com/benkasminbullock/JSON-Parse/commit/42e099ed38a3e85daed46f53bf367fa2b828e152> released on Thu Jul 9 07:16:33 2020 +0900.
+This documents version 0.58 of JSON::Tokenize corresponding to
+L<git commit 6df355c947edf0bf190c0f3631ea46e65d4bb7ed|https://github.com/benkasminbullock/JSON-Parse/commit/6df355c947edf0bf190c0f3631ea46e65d4bb7ed> released on Fri Jan 1 09:59:25 2021 +0900.
@@ -130,7 +130,7 @@ Ben Bullock, <bkb@cpan.org>
=head1 COPYRIGHT & LICENCE
This package and associated files are copyright (C)
-2016-2020
+2016-2021
Ben Bullock.
You can use, copy, modify and redistribute this package and associated
diff --git a/t/max-depth.t b/t/max-depth.t
new file mode 100644
index 0000000..c2c837c
--- /dev/null
+++ b/t/max-depth.t
@@ -0,0 +1,27 @@
+use warnings;
+use strict;
+use utf8;
+use FindBin '$Bin';
+use Test::More;
+my $builder = Test::More->builder;
+binmode $builder->output, ":utf8";
+binmode $builder->failure_output, ":utf8";
+binmode $builder->todo_output, ":utf8";
+binmode STDOUT, ":encoding(utf8)";
+binmode STDERR, ":encoding(utf8)";
+use JSON::Parse;
+
+my $jp = JSON::Parse->new ();
+$jp->set_max_depth (1);
+my $ok = eval {
+ $jp->run ('[[[["should fail due to depth"]]]]');
+ 1;
+};
+ok (! $ok, "fails to parse array when max depth is set to 1");
+my $md = $jp->get_max_depth ();
+cmp_ok ($md, '==', 1, "got back the max depth");
+$jp->set_max_depth (0);
+my $mdd = $jp->get_max_depth ();
+cmp_ok ($mdd, '==', 10000, "got back the default max depth");
+
+done_testing ();
diff --git a/unicode.c b/unicode.c
index 81e2988..7b5896d 100644
--- a/unicode.c
+++ b/unicode.c
@@ -1,5 +1,5 @@
-/* This file is a Unicode library in the programming language C which
- deals with conversions to and from the UTF-8 format. */
+/* This is a Unicode library in the programming language C which deals
+ with conversions to and from the UTF-8 format. */
/*
Author:
@@ -63,7 +63,7 @@
#define UTF8_BAD_LEADING_BYTE -1
/* This return value means the caller attempted to turn a code point
- for a surrogate pair into UTF-8. */
+ for a surrogate pair to or from UTF-8. */
#define UNICODE_SURROGATE_PAIR -2
@@ -88,7 +88,10 @@
/* This return value indicates that UTF-8 bytes were not in the
shortest possible form. See
- http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8. */
+ http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8.
+
+ This return value is currently unused. If a character is not in the
+ shortest form, the error UTF8_BAD_CONTINUATION_BYTE is returned. */
#define UTF8_NON_SHORTEST -6
@@ -100,15 +103,17 @@
/* This return value indicates that the Unicode code-point ended with
either 0xFFFF or 0xFFFE, meaning it cannot be used as a character
- code point. */
+ code point, or it was in the disallowed range FDD0 to FDEF. */
#define UNICODE_NOT_CHARACTER -8
-/* This return value indicates that the UTF-8 is valid. */
+/* This return value indicates that the UTF-8 is valid. It is only
+ used by "valid_utf8". */
#define UTF8_VALID 1
-/* This return value indicates that the UTF-8 is not valid. */
+/* This return value indicates that the UTF-8 is not valid. It is only
+ used by "valid_utf8". */
#define UTF8_INVALID 0
@@ -165,6 +170,28 @@ int32_t utf8_bytes (uint8_t c)
| (((int32_t) (x[2] & 0x3F)) << 6) \
| (((int32_t) (x[3] & 0x3F)))
+/* Reject code points which end in either FFFE or FFFF. */
+
+#define REJECT_FFFF(x) \
+ if ((x & 0xFFFF) >= 0xFFFE) { \
+ return UNICODE_NOT_CHARACTER; \
+ }
+
+/* Reject code points in a certain range. */
+
+#define REJECT_NOT_CHAR(r) \
+ if (r >= UNI_NOT_CHAR_MIN && r <= UNI_NOT_CHAR_MAX) { \
+ return UNICODE_NOT_CHARACTER; \
+ }
+
+/* Reject surrogates. */
+
+#define REJECT_SURROGATE(ucs2) \
+ if (ucs2 >= UNI_SUR_HIGH_START && ucs2 <= UNI_SUR_LOW_END) { \
+ /* Ill-formed. */ \
+ return UNICODE_SURROGATE_PAIR; \
+ }
+
/* Try to convert "input" from UTF-8 to UCS-2, and return a value even
if the input is partly broken. This checks the first byte of the
input, but it doesn't check the subsequent bytes. */
@@ -203,29 +230,59 @@ utf8_no_checks (const uint8_t * input, const uint8_t ** end_ptr)
}
}
+/* Surrogate pair zone. */
+
+#define UNI_SUR_HIGH_START 0xD800
+#define UNI_SUR_HIGH_END 0xDBFF
+#define UNI_SUR_LOW_START 0xDC00
+#define UNI_SUR_LOW_END 0xDFFF
+
+/* Start of the "not character" range. */
+
+#define UNI_NOT_CHAR_MIN 0xFDD0
+
+/* End of the "not character" range. */
+
+#define UNI_NOT_CHAR_MAX 0xFDEF
+
/* This function converts UTF-8 encoded bytes in "input" into the
- equivalent Unicode code point. The return value is the Unicode code
- point corresponding to the UTF-8 character in "input" if
- successful, and a negative number if not successful. "*end_ptr" is
- set to the next character after the read character on
- success. "*end_ptr" is set to the start of input on
- failure. "end_ptr" may not be null.
-
- If the first byte of "input" is zero, UNICODE_EMPTY_INPUT is
- returned. If the first byte of "input" is not valid UTF-8,
- UTF8_BAD_LEADING_BYTE is returned. If the second or later bytes of
- "input" are not valid UTF-8, UTF8_BAD_CONTINUATION_BYTE is returned. If the
- UTF-8 is not in the shortest possible form, the error
- UTF8_NON_SHORTEST is returned. If the value extrapolated from
- "input" is greater than UNICODE_MAXIMUM, UNICODE_TOO_BIG is
- returned. If the value extrapolated from "input" ends in 0xFFFF or
- 0xFFFE, UNICODE_NOT_CHARACTER is returned. */
+ equivalent Unicode code point. The return value is the Unicode
+ code point corresponding to the UTF-8 character in "input" if
+ successful, and a negative number if not successful. Nul bytes are
+ rejected.
+
+ "*end_ptr" is set to the next character after the read character on
+ success. "*end_ptr" is set to the start of input on all failures.
+ "end_ptr" may not be NULL.
+
+ If the first byte of "input" is zero, in other words a NUL or '\0',
+ UNICODE_EMPTY_INPUT is returned.
+
+ If the first byte of "input" is not valid UTF-8,
+ UTF8_BAD_LEADING_BYTE is returned.
+
+ If the second or later bytes of "input" are not valid UTF-8,
+ including NUL, UTF8_BAD_CONTINUATION_BYTE is returned.
+
+ If the value extrapolated from "input" is greater than
+ UNICODE_MAXIMUM, UNICODE_TOO_BIG is returned.
+
+ If the value extrapolated from "input" ends in 0xFFFF or 0xFFFE,
+ UNICODE_NOT_CHARACTER is returned.
+
+ If the value extrapolated from "input" is between 0xFDD0 and 0xFDEF,
+ UNICODE_NOT_CHARACTER is returned.
+
+ If the value is within the range of surrogate pairs, the error
+ UNICODE_SURROGATE_PAIR is returned.
+*/
int32_t
utf8_to_ucs2 (const uint8_t * input, const uint8_t ** end_ptr)
{
uint8_t c;
uint8_t l;
+
*end_ptr = input;
c = input[0];
if (c == 0) {
@@ -234,45 +291,57 @@ utf8_to_ucs2 (const uint8_t * input, const uint8_t ** end_ptr)
l = utf8_sequence_len[c];
if (l == 1) {
* end_ptr = input + 1;
- return c;
+ return (int32_t) c;
}
if (l == 2) {
+ uint8_t d;
+ d = input[1];
/* Two byte case. */
- if (input[1] < 0x80 || input[1] > 0xBF) {
+ if (d < 0x80 || d > 0xBF) {
return UTF8_BAD_CONTINUATION_BYTE;
}
if (c <= 0xC1) {
- return UTF8_NON_SHORTEST;
+ return UTF8_BAD_CONTINUATION_BYTE;
}
* end_ptr = input + 2;
return
((int32_t) (c & 0x1F) << 6) |
- ((int32_t) (input[1] & 0x3F));
+ ((int32_t) (d & 0x3F));
}
if (l == 3) {
+ uint8_t d;
+ uint8_t e;
+ int32_t r;
+
+ d = input[1];
+ e = input[2];
/* Three byte case. */
- if (input[1] < 0x80 || input[1] > 0xBF ||
- input[2] < 0x80 || input[2] > 0xBF) {
+ if (d < 0x80 || d > 0xBF ||
+ e < 0x80 || e > 0xBF) {
return UTF8_BAD_CONTINUATION_BYTE;
}
- if (c == 0xe0 && input[1] < 0xa0) {
+ if (c == 0xe0 && d < 0xa0) {
/* We don't need to check the value of input[2], because
the if statement above this one already guarantees that
it is 10xxxxxx. */
- return UTF8_NON_SHORTEST;
+ return UTF8_BAD_CONTINUATION_BYTE;
}
+ r = ((int32_t) (c & 0x0F)) << 12 |
+ ((int32_t) (d & 0x3F)) << 6 |
+ ((int32_t) (e & 0x3F));
+ REJECT_SURROGATE(r);
+ REJECT_FFFF(r);
+ REJECT_NOT_CHAR(r);
* end_ptr = input + 3;
- return
- ((int32_t) (c & 0x0F)) << 12 |
- ((int32_t) (input[1] & 0x3F)) << 6 |
- ((int32_t) (input[2] & 0x3F));
+ return r;
}
- if (l == 4) {
+ else if (l == 4) {
/* Four byte case. */
uint8_t d;
uint8_t e;
uint8_t f;
int32_t v;
+
d = input[1];
e = input[2];
f = input[3];
@@ -287,10 +356,10 @@ utf8_to_ucs2 (const uint8_t * input, const uint8_t ** end_ptr)
}
if (c == 0xf0 && d < 0x90) {
- /* We don't need to check the values of e and d, because
+ /* We don't need to check the values of e and f, because
the if statement above this one already guarantees that
- e and d are 10xxxxxx. */
- return UTF8_NON_SHORTEST;
+ e and f are 10xxxxxx. */
+ return UTF8_BAD_CONTINUATION_BYTE;
}
/* Calculate the code point. */
v = FOUR (input);
@@ -299,33 +368,40 @@ utf8_to_ucs2 (const uint8_t * input, const uint8_t ** end_ptr)
return UNICODE_TOO_BIG;
}
/* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */
- if ((v & 0xffff) >= 0xfffe) {
- return UNICODE_NOT_CHARACTER;
- }
+ REJECT_FFFF(v);
+ /* We don't need to check for surrogate pairs here, since the
+ minimum value of UCS2 if there are four bytes of UTF-8 is
+ 0x10000. */
* end_ptr = input + 4;
return v;
}
return UTF8_BAD_LEADING_BYTE;
}
-#define UNI_SUR_HIGH_START 0xD800
-#define UNI_SUR_HIGH_END 0xDBFF
-#define UNI_SUR_LOW_START 0xDC00
-#define UNI_SUR_LOW_END 0xDFFF
/* Input: a Unicode code point, "ucs2".
Output: UTF-8 characters in buffer "utf8".
Return value: the number of bytes written into "utf8", or a
- negative number if there was an error. If the value of "ucs2" is
- invalid because of being in the surrogate pair range from 0xD800 to
- 0xDFFF, the return value is UNICODE_SURROGATE_PAIR, else if the
- value is too big to fit into four bytes of UTF-8, UNICODE_UTF8_4,
- the return value is UNICODE_TOO_BIG. However, it does not insist on
- ucs2 being less than UNICODE_MAXIMUM, so the user needs to check
- that "ucs2" is a valid code point. It also does not check for
- invalid characters, such as 0xFFFF.
+ negative number if there was an error.
+
+ If the value of "ucs2" is invalid because of being in the surrogate
+ pair range from 0xD800 to 0xDFFF, the return value is
+ UNICODE_SURROGATE_PAIR.
+
+ If the value of "ucs2" is in the range 0xFDD0 to 0xFDEF inclusive,
+ the return value is UNICODE_NOT_CHARACTER.
+
+ If the lower two bytes of "ucs2" are either 0xFFFE or 0xFFFF, the
+ return value is UNICODE_NOT_CHARACTER.
+
+   If the value is too big to fit into four bytes of UTF-8 (greater
+   than UNICODE_UTF8_4), the return value is UNICODE_TOO_BIG.
+
+ However, it does not insist on ucs2 being less than
+ UNICODE_MAXIMUM, so the user needs to check that "ucs2" is a valid
+ code point.
This adds a zero byte to the end of the string. It assumes that the
buffer "utf8" has at least UNICODE_MAX_LENGTH (5) bytes of space to
@@ -334,6 +410,7 @@ utf8_to_ucs2 (const uint8_t * input, const uint8_t ** end_ptr)
int32_t
ucs2_to_utf8 (int32_t ucs2, uint8_t * utf8)
{
+ REJECT_FFFF(ucs2);
if (ucs2 < 0x80) {
utf8[0] = ucs2;
utf8[1] = '\0';
@@ -350,10 +427,8 @@ ucs2_to_utf8 (int32_t ucs2, uint8_t * utf8)
utf8[1] = ((ucs2 >> 6 ) & 0x3F) | 0x80;
utf8[2] = ((ucs2 ) & 0x3F) | 0x80;
utf8[3] = '\0';
- if (ucs2 >= UNI_SUR_HIGH_START && ucs2 <= UNI_SUR_LOW_END) {
- /* Ill-formed. */
- return UNICODE_SURROGATE_PAIR;
- }
+ REJECT_SURROGATE(ucs2);
+ REJECT_NOT_CHAR(ucs2);
return 3;
}
if (ucs2 <= UNICODE_UTF8_4) {
@@ -472,9 +547,9 @@ unicode_chars_to_bytes (const uint8_t * utf8, int32_t n_chars)
}
/* Like unicode_count_chars, but without error checks or validation of
- the input. This only checks the first byte of each UTF-8
- sequence. It may return UTF8_BAD_LEADING_BYTE if the first byte is
- invalid. */
+ the input. This only checks the first byte of each UTF-8 sequence,
+ then jumps over the succeeding bytes. It may return
+ UTF8_BAD_LEADING_BYTE if the first byte is invalid. */
int32_t
unicode_count_chars_fast (const uint8_t * utf8)
@@ -562,6 +637,34 @@ unicode_count_chars (const uint8_t * utf8)
case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \
case 0xBF
+#define BYTE_80_8F_B0_BF \
+ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
+ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
+ case 0x8E: case 0x8F: case 0xB0: \
+ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
+ case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \
+ case 0xBF
+#define BYTE_80_B6_B8_BF \
+ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
+ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
+ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \
+ case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \
+ case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \
+ case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \
+ case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \
+ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: \
+ case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \
+ case 0xBF
+#define BYTE_80_BD \
+ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
+ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
+ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \
+ case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \
+ case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \
+ case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \
+ case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \
+ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
+ case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD
#define BYTE_90_BF \
0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: \
case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: \
@@ -585,8 +688,6 @@ unicode_count_chars (const uint8_t * utf8)
#define BYTE_E1_EC \
0xE1: case 0xE2: case 0xE3: case 0xE4: case 0xE5: case 0xE6: case 0xE7: \
case 0xE8: case 0xE9: case 0xEA: case 0xEB: case 0xEC
-#define BYTE_EE_EF \
- 0xEE: case 0xEF
#define BYTE_F1_F3 \
0xF1: case 0xF2: case 0xF3
#endif /* def HEADER */
@@ -604,26 +705,75 @@ unicode_count_chars (const uint8_t * utf8)
int32_t
valid_utf8 (const uint8_t * input, int32_t input_length)
{
+ int32_t error;
+ utf8_info_t info;
+ error = validate_utf8 (input, input_length, & info);
+ if (error < 0) {
+ return UTF8_INVALID;
+ }
+ return UTF8_VALID;
+}
+
+#define FAIL(x) \
+ info->len_read = i; \
+ return x
+
+#ifdef HEADER
+
+typedef struct utf8_info
+{
+ int32_t len_read;
+ int32_t runes_read;
+}
+utf8_info_t;
+
+#endif /* def HEADER */
+
+/* Given "input" and "len", validate "input" byte by byte up to
+ "len". The return value is "UNICODE_OK" (zero) on success or the
+ error found (a negative number) on failure.
+
+ utf8_info_t is defined in "unicode.h".
+
+   The value of "info.len_read" is the number of bytes processed. The
+ value of "info.runes_read" is the number of Unicode code points in
+ the input. */
+
+int32_t
+validate_utf8 (const uint8_t * input, int32_t len, utf8_info_t * info)
+{
int32_t i;
uint8_t c;
+ info->len_read = 0;
+ /* We want to increment the runes after "string_start", but that
+ would give us one too many. */
+ info->runes_read = -1;
i = 0;
string_start:
- i++;
- if (i >= input_length) {
- return UTF8_VALID;
+ /* We get here after successfully reading a "rune". */
+
+ info->runes_read++;
+ if (i >= len) {
+ info->len_read = len;
+ return UNICODE_OK; /* 0 */
}
+
/* Set c separately here since we use a range comparison before
the switch statement. */
+
c = input[i];
- /* Admit all bytes <= 0x80. */
- if (c <= 0x80) {
+ if (c == 0) {
+ FAIL (UNICODE_EMPTY_INPUT);
+ }
+ /* Admit all bytes < 0x80. */
+ if (c < 0x80) {
+ i++;
goto string_start;
}
-
switch (c) {
case BYTE_C2_DF:
UNICODEADDBYTE;
@@ -641,10 +791,14 @@ valid_utf8 (const uint8_t * input, int32_t input_length)
UNICODEADDBYTE;
goto byte23_80_9f;
- case BYTE_EE_EF:
+ case 0xEE:
UNICODEADDBYTE;
goto byte_penultimate_80_bf;
+ case 0xEF:
+ UNICODEADDBYTE;
+ goto byte_ef_80_bf;
+
case 0xF0:
UNICODEADDBYTE;
goto byte24_90_bf;
@@ -657,86 +811,136 @@ valid_utf8 (const uint8_t * input, int32_t input_length)
UNICODEADDBYTE;
goto byte24_80_8f;
+ default:
+ FAIL (UTF8_BAD_LEADING_BYTE);
}
byte_last_80_bf:
switch (UNICODENEXTBYTE) {
-
case BYTE_80_BF:
UNICODEADDBYTE;
goto string_start;
default:
- UNICODEFAILUTF8 (XBYTES_80_BF);
+ FAIL (UTF8_BAD_CONTINUATION_BYTE);
}
- byte_penultimate_80_bf:
+ byte_ef_b7:
+ switch (UNICODENEXTBYTE) {
+ case BYTE_80_8F_B0_BF:
+ UNICODEADDBYTE;
+ goto string_start;
+ default:
+ if (c >= 0x90 && c <= 0xAF) {
+ FAIL (UNICODE_NOT_CHARACTER);
+ }
+ else {
+ FAIL (UTF8_BAD_CONTINUATION_BYTE);
+ }
+ }
+
+ byte_last_80_bd:
switch (UNICODENEXTBYTE) {
+ case BYTE_80_BD:
+ UNICODEADDBYTE;
+ goto string_start;
+ case 0xBE:
+ case 0xBF:
+ FAIL (UNICODE_NOT_CHARACTER);
+ default:
+ FAIL (UTF8_BAD_CONTINUATION_BYTE);
+ }
+
+ byte_penultimate_80_bf:
+ switch (UNICODENEXTBYTE) {
case BYTE_80_BF:
UNICODEADDBYTE;
goto byte_last_80_bf;
default:
- UNICODEFAILUTF8 (XBYTES_80_BF);
+ FAIL (UTF8_BAD_CONTINUATION_BYTE);
+ }
+
+ byte_ef_80_bf:
+ switch (UNICODENEXTBYTE) {
+ case BYTE_80_B6_B8_BF:
+ UNICODEADDBYTE;
+ goto byte_last_80_bd;
+ case 0xB7:
+ UNICODEADDBYTE;
+	/* FDD0 - FDEF */
+ goto byte_ef_b7;
+ default:
+ FAIL (UTF8_BAD_CONTINUATION_BYTE);
}
byte24_90_bf:
switch (UNICODENEXTBYTE) {
-
case BYTE_90_BF:
UNICODEADDBYTE;
goto byte_penultimate_80_bf;
default:
- UNICODEFAILUTF8 (XBYTES_90_BF);
+ FAIL (UTF8_BAD_CONTINUATION_BYTE);
}
byte23_80_9f:
switch (UNICODENEXTBYTE) {
-
case BYTE_80_9F:
UNICODEADDBYTE;
goto byte_last_80_bf;
default:
- UNICODEFAILUTF8 (XBYTES_80_9F);
+ if (c >= 0xA0 && c <= 0xBF) {
+ FAIL (UNICODE_SURROGATE_PAIR);
+ }
+ else {
+ FAIL (UTF8_BAD_CONTINUATION_BYTE);
+ }
}
byte23_a0_bf:
switch (UNICODENEXTBYTE) {
-
case BYTE_A0_BF:
UNICODEADDBYTE;
goto byte_last_80_bf;
default:
- UNICODEFAILUTF8 (XBYTES_A0_BF);
+ FAIL (UTF8_BAD_CONTINUATION_BYTE);
}
byte24_80_bf:
switch (UNICODENEXTBYTE) {
-
case BYTE_80_BF:
UNICODEADDBYTE;
- goto byte_penultimate_80_bf;
+ goto byte_ef_80_bf;
default:
- UNICODEFAILUTF8 (XBYTES_80_BF);
+ FAIL (UTF8_BAD_CONTINUATION_BYTE);
}
byte24_80_8f:
switch (UNICODENEXTBYTE) {
-
case BYTE_80_8F:
UNICODEADDBYTE;
- goto byte_penultimate_80_bf;
+ goto byte_ef_80_bf;
default:
- UNICODEFAILUTF8 (XBYTES_80_8F);
+ if (c >= 0x90) {
+ FAIL (UNICODE_TOO_BIG);
+ }
+ else {
+ FAIL (UTF8_BAD_CONTINUATION_BYTE);
+ }
}
}
+#define REJECT_FE_FF(c) \
+ if (c == 0xFF || c == 0xFE) { \
+ return UNICODE_NOT_CHARACTER; \
+ }
+
/* Make "* ptr" point to the start of the first UTF-8 character after
its initial value. This assumes that there are at least four bytes
which can be read, and that "* ptr" points to valid UTF-8.
@@ -749,26 +953,35 @@ valid_utf8 (const uint8_t * input, int32_t input_length)
second, third, or fourth byte of a multibyte sequence, "* ptr" is
incremented until either "** ptr" is a valid first byte of a UTF-8
sequence, or too many bytes have passed for it to be valid
- UTF-8. If too many bytes have passed, UTF8_BAD_CONTINUATION_BYTE is returned
- and "*ptr" is left unchanged. If a valid UTF-8 first byte was
- found, either 11xx_xxxx or 00xx_xxxx, UNICODE_OK is returned, and
- "*ptr" is set to the address of the valid byte. Nul bytes (bytes
- containing zero) are considered valid. This does not check for
- invalid UTF-8 bytes such as 0xFE and 0xFF. */
+ UTF-8. If too many bytes have passed, UTF8_BAD_CONTINUATION_BYTE is
+ returned and "*ptr" is left unchanged.
+
+ If a valid UTF-8 first byte was found, either 11xx_xxxx or
+ 00xx_xxxx, UNICODE_OK is returned, and "*ptr" is set to the address
+ of the valid byte. Nul bytes (bytes containing zero) are considered
+ valid.
+
+   If any of the bytes read is the invalid UTF-8 byte 0xFE or
+ 0xFF, the error code UNICODE_NOT_CHARACTER is returned and "*ptr"
+ is left unchanged. */
int32_t
-trim_to_utf8_start (uint8_t ** ptr)
+trim_to_utf8_start (const uint8_t ** ptr)
{
- uint8_t * p = *ptr;
+ const uint8_t * p = *ptr;
uint8_t c;
int32_t i;
+
+ c = * p;
+ REJECT_FE_FF (c);
/* 0xC0 = 1100_0000. */
- c = *p & 0xC0;
+ c &= 0xC0;
if (c == 0xC0 || c == 0x00) {
return UNICODE_OK;
}
for (i = 0; i < UTF8_MAX_LENGTH - 1; i++) {
c = p[i];
+ REJECT_FE_FF (c);
if ((c & 0x80) != 0x80 || (c & 0x40) != 0) {
* ptr = p + i;
return UNICODE_OK;
@@ -826,6 +1039,7 @@ unicode_code_to_error (int32_t code)
#include "c-tap-test.h"
static const uint8_t * utf8 = (uint8_t *) "漢数字ÔÕÖX";
+static const uint8_t bad[] = {0x99, 0x99, 0x99, 0x99, 0x99, 0x99, 0x0};
#define BUFFSIZE 0x100
@@ -979,7 +1193,8 @@ test_utf8_to_ucs2 ()
const uint8_t * end;
unicode = utf8_to_ucs2 (start, & end);
TAP_TEST_MSG (unicode > 0, "no bad value at %s", start);
- printf ("# %s is %04X, length is %d\n", start, unicode, end - start);
+ printf ("# %s is %04X, length is %d\n",
+ start, unicode, (int) (end - start));
start = end;
}
}
@@ -1008,9 +1223,8 @@ static void
test_trim_to_utf8_start ()
{
int32_t status;
- uint8_t * p;
+ const uint8_t * p;
/* Invalid UTF-8. */
- uint8_t bad[] = {0x99, 0x99, 0x99, 0x99, 0x99, 0x99};
/* Valid UTF-8. */
uint8_t good[] = "化苦";
uint8_t good2[] = "化abc";
@@ -1037,6 +1251,29 @@ test_constants ()
TAP_TEST (UNICODE_UTF8_4 > UNICODE_MAXIMUM);
}
+static void
+test_utf8_validate ()
+{
+ int r;
+ int l;
+ utf8_info_t info;
+
+ r = validate_utf8 ((const uint8_t *) "", 0, & info);
+ TAP_TEST_EQUAL (r, UNICODE_OK);
+ TAP_TEST_EQUAL (info.len_read, 0);
+ TAP_TEST_EQUAL (info.runes_read, 0);
+
+ l = strlen ((const char *) utf8);
+ r = validate_utf8 (utf8, l, & info);
+ TAP_TEST_EQUAL (r, UNICODE_OK);
+ TAP_TEST_EQUAL (info.len_read, l);
+ TAP_TEST_EQUAL (info.runes_read, 7);
+
+ l = strlen ((const char *) bad);
+ r = validate_utf8 (bad, l, & info);
+ TAP_TEST (r != UNICODE_OK);
+}
+
int main ()
{
test_utf8_to_ucs2 ();
@@ -1048,6 +1285,7 @@ int main ()
test_valid_utf8 ();
test_trim_to_utf8_start ();
test_constants ();
+ test_utf8_validate ();
TAP_PLAN;
}
diff --git a/unicode.h b/unicode.h
index 0f1efcc..75103b2 100644
--- a/unicode.h
+++ b/unicode.h
@@ -29,34 +29,34 @@ extern const uint8_t utf8_sequence_len[];
#line 103 "unicode.c"
int32_t utf8_bytes (uint8_t c);
-#line 124 "unicode.c"
+#line 146 "unicode.c"
int32_t utf8_no_checks (const uint8_t* input, const uint8_t** end_ptr);
-#line 160 "unicode.c"
+#line 197 "unicode.c"
int32_t utf8_to_ucs2 (const uint8_t* input, const uint8_t** end_ptr);
-#line 250 "unicode.c"
+#line 295 "unicode.c"
int32_t ucs2_to_utf8 (int32_t ucs2, uint8_t* utf8);
-#line 295 "unicode.c"
+#line 339 "unicode.c"
int32_t unicode_to_surrogates (int32_t unicode, int32_t* hi_ptr, int32_t* lo_ptr);
-#line 314 "unicode.c"
+#line 358 "unicode.c"
int32_t surrogates_to_unicode (int32_t hi, int32_t lo);
-#line 337 "unicode.c"
+#line 381 "unicode.c"
int32_t surrogate_to_utf8 (int32_t hi, int32_t lo, uint8_t* utf8);
-#line 350 "unicode.c"
+#line 394 "unicode.c"
int32_t unicode_chars_to_bytes (const uint8_t* utf8, int32_t n_chars);
-#line 370 "unicode.c"
+#line 414 "unicode.c"
int32_t unicode_count_chars_fast (const uint8_t* utf8);
-#line 392 "unicode.c"
+#line 436 "unicode.c"
int32_t unicode_count_chars (const uint8_t* utf8);
-#line 415 "unicode.c"
+#line 459 "unicode.c"
#define BYTE_80_8F \
0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
@@ -78,6 +78,34 @@ int32_t unicode_count_chars (const uint8_t* utf8);
case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \
case 0xBF
+#define BYTE_80_8F_B0_BF \
+ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
+ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
+ case 0x8E: case 0x8F: case 0xB0: \
+ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
+ case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \
+ case 0xBF
+#define BYTE_80_B6_B8_BF \
+ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
+ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
+ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \
+ case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \
+ case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \
+ case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \
+ case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \
+ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: \
+ case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \
+ case 0xBF
+#define BYTE_80_BD \
+ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \
+ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \
+ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \
+ case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \
+ case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \
+ case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \
+ case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \
+ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \
+ case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD
#define BYTE_90_BF \
0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: \
case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: \
@@ -101,18 +129,27 @@ int32_t unicode_count_chars (const uint8_t* utf8);
#define BYTE_E1_EC \
0xE1: case 0xE2: case 0xE3: case 0xE4: case 0xE5: case 0xE6: case 0xE7: \
case 0xE8: case 0xE9: case 0xEA: case 0xEB: case 0xEC
-#define BYTE_EE_EF \
- 0xEE: case 0xEF
#define BYTE_F1_F3 \
0xF1: case 0xF2: case 0xF3
-#line 479 "unicode.c"
+#line 549 "unicode.c"
int32_t valid_utf8 (const uint8_t* input, int32_t input_length);
-#line 616 "unicode.c"
-int32_t trim_to_utf8_start (uint8_t** ptr);
+#line 563 "unicode.c"
+typedef struct utf8_info
+{
+ int32_t len_read;
+ int32_t runes_read;
+}
+utf8_info_t;
+
+#line 573 "unicode.c"
+int32_t validate_utf8 (const uint8_t* input, int32_t len, utf8_info_t* info);
+
+#line 775 "unicode.c"
+int32_t trim_to_utf8_start (const uint8_t** ptr);
-#line 639 "unicode.c"
+#line 802 "unicode.c"
const char* unicode_code_to_error (int32_t code);
#endif /* CFH_UNICODE_H */
diff --git a/utf8-byte-one.c b/utf8-byte-one.c
index e7561e8..a8623b3 100644
--- a/utf8-byte-one.c
+++ b/utf8-byte-one.c
@@ -18,7 +18,8 @@
ADDBYTE;
goto byte23_80_9f;
- case BYTE_EE_EF:
+ case 0xEE:
+ case 0xEF:
ADDBYTE;
goto byte_penultimate_80_bf;