summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Steve Bennett <steveb@workware.net.au> 2019-10-28 07:38:22 +1000
committer: Steve Bennett <steveb@workware.net.au> 2019-10-28 07:43:47 +1000
commit: 338f5b5681181c771646b6f225f103be4b14ea03 (patch)
tree: f5f22e66725e52e6d281ed525f8f6643b22eddc8
parent: 27ed764fa513ea55f92ea1e6be500fb22b30007b (diff)
utf8_strlen: Improve performance
Use utf8_charlen() rather than utf8_tounicode().

Note that utf8_charlen() now returns 1 instead of -1 for an invalid utf-8 start byte.

Reported-by: dbohdan <dbohdan@dbohdan.com>
Signed-off-by: Steve Bennett <steveb@workware.net.au>
-rw-r--r-- utf8.c | 7
-rw-r--r-- utf8.h | 5
2 files changed, 5 insertions, 7 deletions
diff --git a/utf8.c b/utf8.c
index ffee89d..405c20d 100644
--- a/utf8.c
+++ b/utf8.c
@@ -56,8 +56,8 @@ int utf8_charlen(int c)
if ((c & 0xf8) == 0xf0) {
return 4;
}
- /* Invalid sequence */
- return -1;
+ /* Invalid sequence, so treat it as a single byte */
+ return 1;
}
int utf8_strlen(const char *str, int bytelen)
@@ -93,8 +93,7 @@ int utf8_index(const char *str, int index)
{
const char *s = str;
while (index--) {
- int c;
- s += utf8_tounicode(s, &c);
+ s += utf8_charlen(*s);
}
return s - str;
}
diff --git a/utf8.h b/utf8.h
index 40fc95f..9970683 100644
--- a/utf8.h
+++ b/utf8.h
@@ -49,9 +49,8 @@ int utf8_fromunicode(char *p, unsigned uc);
/**
* Returns the length of the utf-8 sequence starting with 'c'.
*
- * Returns 1-4, or -1 if this is not a valid start byte.
- *
- * Note that charlen=4 is not supported by the rest of the API.
+ * Returns 1-4.
+ * If 'c' is not a valid start byte, returns 1.
*/
int utf8_charlen(int c);