utf8_strlen: Improve performance

Use utf8_charlen() rather than utf8_tounicode(). Note that utf8_charlen() now returns 1 instead of -1 for an invalid utf-8 start byte. Reported-by: dbohdan <dbohdan@dbohdan.com> Signed-off-by: Steve Bennett <steveb@workware.net.au>
author: Steve Bennett <steveb@workware.net.au> 2019-10-28 07:38:22 +1000
committer: Steve Bennett <steveb@workware.net.au> 2019-10-28 07:43:47 +1000
commit: 338f5b5681181c771646b6f225f103be4b14ea03 (patch)
tree: f5f22e66725e52e6d281ed525f8f6643b22eddc8
parent: 27ed764fa513ea55f92ea1e6be500fb22b30007b (diff)
2 files changed, 5 insertions, 7 deletions
diff --git a/utf8.c b/utf8.c
index ffee89d..405c20d 100644
--- a/utf8.c
+++ b/utf8.c
@@ -56,8 +56,8 @@ int utf8_charlen(int c)
     if ((c & 0xf8) == 0xf0) {
         return 4;
     }
-    /* Invalid sequence */
-    return -1;
+    /* Invalid sequence, so treat it as a single byte */
+    return 1;
 }
 
 int utf8_strlen(const char *str, int bytelen)
@@ -93,8 +93,7 @@ int utf8_index(const char *str, int index)
 {
     const char *s = str;
     while (index--) {
-        int c;
-        s += utf8_tounicode(s, &c);
+        s += utf8_charlen(*s);
     }
     return s - str;
 }
diff --git a/utf8.h b/utf8.h
index 40fc95f..9970683 100644
--- a/utf8.h
+++ b/utf8.h
@@ -49,9 +49,8 @@ int utf8_fromunicode(char *p, unsigned uc);
 /**
  * Returns the length of the utf-8 sequence starting with 'c'.
  *
- * Returns 1-4, or -1 if this is not a valid start byte.
- *
- * Note that charlen=4 is not supported by the rest of the API.
+ * Returns 1-4.
+ * If 'c' is not a valid start byte, returns 1.
  */
 int utf8_charlen(int c);
author	Steve Bennett <steveb@workware.net.au>	2019-10-28 07:38:22 +1000
committer	Steve Bennett <steveb@workware.net.au>	2019-10-28 07:43:47 +1000
commit	338f5b5681181c771646b6f225f103be4b14ea03 (patch)
tree	f5f22e66725e52e6d281ed525f8f6643b22eddc8
parent	27ed764fa513ea55f92ea1e6be500fb22b30007b (diff)