diff options
Diffstat (limited to 'tools/utf8-tests.scm')
-rw-r--r-- | tools/utf8-tests.scm | 135 |
1 files changed, 135 insertions, 0 deletions
diff --git a/tools/utf8-tests.scm b/tools/utf8-tests.scm new file mode 100644 index 0000000..30f1a97 --- /dev/null +++ b/tools/utf8-tests.scm @@ -0,0 +1,135 @@ +;;; utf8proc->s7 tests + +(load "libutf8proc.scm") + +(when (defined? '*libutf8proc*) + + (with-let *libutf8proc* + + ;; -------------------------------- + ;; these are from the libutf8proc test directory + + (define (print-property c) + (format *stderr* " category = ~S~% charwidth = ~D~%~A~%" + (utf8proc_category_string c) + (utf8proc_charwidth c) + (utf8proc_get_property c))) + + (do ((c 1 (+ c 1))) + ((= c #x110000)) + (let ((l (utf8proc_tolower c)) + (u (utf8proc_toupper c))) + (unless (or (= l c) + (utf8proc_codepoint_valid l)) + (format *stderr* "~X: invalid tolower~%" c)) + (unless (or (= u c) + (utf8proc_codepoint_valid u)) + (format *stderr* "~X: invalid toupper~%" c)) + )) + + (do ((c 0 (+ c 1))) + ((or (= c #xd800) + (and (not (utf8proc_codepoint_valid c)) + (not (format *stderr* "~X: codepoint invalid~%" c)))))) + + (do ((c #xd800 (+ c 1))) + ((or (= c #xe000) + (and (utf8proc_codepoint_valid c) + (not (format *stderr* "~X: codepoint valid?~%" c)))))) + + (do ((c #xe000 (+ c 1))) + ((or (= c #x110000) + (and (not (utf8proc_codepoint_valid c)) + (not (format *stderr* "~X: codepoint invalid~%" c)))))) + + (do ((c #x110000 (+ c 1))) + ((or (= c #x110010) + (and (utf8proc_codepoint_valid c) + (not (format *stderr* "~X: codepoint valid?~%" c)))))) + + ;; (print-property #xbb) + + (do ((c 1 (+ c 1))) + ((= c #x110000)) + (let ((cat ((utf8proc_get_property c) 'category)) + (w (utf8proc_charwidth c))) + (if (and (or (= cat UTF8PROC_CATEGORY_MN) (= cat UTF8PROC_CATEGORY_ME)) + (positive? w)) + (format *stderr* "nonzero width ~D for combining char ~X~%" w c)) + (if (and (zero? w) + (or (and (>= cat UTF8PROC_CATEGORY_LU) (<= cat UTF8PROC_CATEGORY_LO)) + (and (>= cat UTF8PROC_CATEGORY_ND) (<= cat UTF8PROC_CATEGORY_SC)) + (and (>= cat UTF8PROC_CATEGORY_SO) (<= cat UTF8PROC_CATEGORY_ZS)))) + (format *stderr* "zero width for symbol-like char ~X~%" c)))) + ;; -------------------------------- + + (define s '("élan ‘quote’")) ; example from Norman Gray + (display s) ; ("élan â\x80;\x98;quoteâ\x80;\x99;") -- this is due to write's slashify_table choices: now displays ("élan ‘quote’") + (newline) + (display (car s)) ; élan ‘quote’ + (newline) + + (define b (string->byte-vector (car s))) + (format #t "~{~X ~}" b) ;c3 a9 6c 61 6e 20 e2 80 98 71 75 6f 74 65 e2 80 99 + (newline) + + + (define p (utf8proc_map (car s) UTF8PROC_NULLTERM)) ; is this doing anything useful (besides error checking)? + (display (car p)) ; élan ‘quote’ + (newline) + + (define p1 (utf8proc_map "(\"élan ‘quote’\")" UTF8PROC_NULLTERM)) + (display (car p1)) ; ("élan ‘quote’") + (newline) + + (define b1 (string->byte-vector (car p1))) + (format #t "~{~X ~}" b1) ;28 22 c3 a9 6c 61 6e 20 e2 80 98 71 75 6f 74 65 e2 80 99 22 29 + (newline) + + (define s1 (with-output-to-string (lambda () (display s)))) + (display s1) (newline) ; ("élan ‘quote’") + (define p2 (utf8proc_map s1 UTF8PROC_NULLTERM)) + (if (integer? (cdr p2)) + (display (utf8proc_errmsg (cdr p2))) ; "Invalid UTF-8 string" or "unknown error" -- what is the problem here? + (display (car p2))) + (newline) + + (let ((len (cdr p1)) + (p1c (copy (car p1)))) + (do ((n (utf8proc_iterate p1c len) (utf8proc_iterate p1c len))) + ((<= (car n) 0)) ; (cdr n) is the codepoint as an integer + (display (substring p1c 0 (car n))) (display #\space) ; ( " é l a n ‘ q u o t e ’ " ) + (set! p1c (substring p1c (car n))) + (set! len (- len (car n)))) + (newline)) + + (let ((e1 (utf8proc_encode_char #x00E9))) ; unicode code-point to utf-8 -> (cons utf-8-string length-thereof) + (format #t "#x~{~X~}" (string->byte-vector (car e1))) ; #xc3a9 + (newline) + (display (car e1)) ; é + (newline)) + + (let ((e1 (utf8proc_encode_char #x018b))) + (format #t "#x~{~X~}" (string->byte-vector (car e1))) + (newline) + (display (car e1)) ; latin cap D with top bar + (newline)) + + (let ((e1 (utf8proc_encode_char #x0238))) + (format #t "#x~{~X~}" (string->byte-vector (car e1))) + (newline) + (display (car e1)) ; latin small db digraph + (newline)) + + (let ((e1 (utf8proc_encode_char #x1e00))) + (format #t "#x~{~X~}" (string->byte-vector (car e1))) + (newline) + (display (car e1)) ; latin cap A ring below + (newline)) + + (display (string->symbol "élan ‘quote’")) + (newline) + (display (symbol->string (symbol "élan ‘quote’"))) + (newline) + + )) |