summaryrefslogtreecommitdiff
path: root/tools/utf8-tests.scm
diff options
context:
space:
mode:
Diffstat (limited to 'tools/utf8-tests.scm')
-rw-r--r--tools/utf8-tests.scm135
1 files changed, 135 insertions, 0 deletions
diff --git a/tools/utf8-tests.scm b/tools/utf8-tests.scm
new file mode 100644
index 0000000..30f1a97
--- /dev/null
+++ b/tools/utf8-tests.scm
@@ -0,0 +1,135 @@
+;;; utf8proc->s7 tests
+
+(load "libutf8proc.scm")
+
+(when (defined? '*libutf8proc*)
+
+ (with-let *libutf8proc*
+
+ ;; --------------------------------
+ ;; these are from the libutf8proc test directory
+
+ (define (print-property c)
+ (format *stderr* " category = ~S~% charwidth = ~D~%~A~%"
+ (utf8proc_category_string c)
+ (utf8proc_charwidth c)
+ (utf8proc_get_property c)))
+
+ (do ((c 1 (+ c 1)))
+ ((= c #x110000))
+ (let ((l (utf8proc_tolower c))
+ (u (utf8proc_toupper c)))
+ (unless (or (= l c)
+ (utf8proc_codepoint_valid l))
+ (format *stderr* "~X: invalid tolower~%" c))
+ (unless (or (= u c)
+ (utf8proc_codepoint_valid u))
+ (format *stderr* "~X: invalid toupper~%" c))
+ ))
+
+ (do ((c 0 (+ c 1)))
+ ((or (= c #xd800)
+ (and (not (utf8proc_codepoint_valid c))
+ (not (format *stderr* "~X: codepoint invalid~%" c))))))
+
+ (do ((c #xd800 (+ c 1)))
+ ((or (= c #xe000)
+ (and (utf8proc_codepoint_valid c)
+ (not (format *stderr* "~X: codepoint valid?~%" c))))))
+
+ (do ((c #xe000 (+ c 1)))
+ ((or (= c #x110000)
+ (and (not (utf8proc_codepoint_valid c))
+ (not (format *stderr* "~X: codepoint invalid~%" c))))))
+
+ (do ((c #x110000 (+ c 1)))
+ ((or (= c #x110010)
+ (and (utf8proc_codepoint_valid c)
+ (not (format *stderr* "~X: codepoint valid?~%" c))))))
+
+ ;; (print-property #xbb)
+
+ (do ((c 1 (+ c 1)))
+ ((= c #x110000))
+ (let ((cat ((utf8proc_get_property c) 'category))
+ (w (utf8proc_charwidth c)))
+ (if (and (or (= cat UTF8PROC_CATEGORY_MN) (= cat UTF8PROC_CATEGORY_ME))
+ (positive? w))
+ (format *stderr* "nonzero width ~D for combining char ~X~%" w c))
+ (if (and (zero? w)
+ (or (and (>= cat UTF8PROC_CATEGORY_LU) (<= cat UTF8PROC_CATEGORY_LO))
+ (and (>= cat UTF8PROC_CATEGORY_ND) (<= cat UTF8PROC_CATEGORY_SC))
+ (and (>= cat UTF8PROC_CATEGORY_SO) (<= cat UTF8PROC_CATEGORY_ZS))))
+ (format *stderr* "zero width for symbol-like char ~X~%" c))))
+ ;; --------------------------------
+
+ (define s '("élan ‘quote’")) ; example from Norman Gray
+ (display s) ; ("élan â\x80;\x98;quoteâ\x80;\x99;") -- this is due to write's slashify_table choices: now displays ("élan ‘quote’")
+ (newline)
+ (display (car s)) ; élan ‘quote’
+ (newline)
+
+ (define b (string->byte-vector (car s)))
+ (format #t "~{~X ~}" b) ;c3 a9 6c 61 6e 20 e2 80 98 71 75 6f 74 65 e2 80 99
+ (newline)
+
+
+ (define p (utf8proc_map (car s) UTF8PROC_NULLTERM)) ; is this doing anything useful (besides error checking)?
+ (display (car p)) ; élan ‘quote’
+ (newline)
+
+ (define p1 (utf8proc_map "(\"élan ‘quote’\")" UTF8PROC_NULLTERM))
+ (display (car p1)) ; ("élan ‘quote’")
+ (newline)
+
+ (define b1 (string->byte-vector (car p1)))
+ (format #t "~{~X ~}" b1) ;28 22 c3 a9 6c 61 6e 20 e2 80 98 71 75 6f 74 65 e2 80 99 22 29
+ (newline)
+
+ (define s1 (with-output-to-string (lambda () (display s))))
+ (display s1) (newline) ; ("élan ‘quote’")
+ (define p2 (utf8proc_map s1 UTF8PROC_NULLTERM))
+ (if (integer? (cdr p2))
+ (display (utf8proc_errmsg (cdr p2))) ; "Invalid UTF-8 string" or "unknown error" -- what is the problem here?
+ (display (car p2)))
+ (newline)
+
+ (let ((len (cdr p1))
+ (p1c (copy (car p1))))
+ (do ((n (utf8proc_iterate p1c len) (utf8proc_iterate p1c len)))
+ ((<= (car n) 0)) ; (cdr n) is the codepoint as an integer
+ (display (substring p1c 0 (car n))) (display #\space) ; ( " é l a n ‘ q u o t e ’ " )
+ (set! p1c (substring p1c (car n)))
+ (set! len (- len (car n))))
+ (newline))
+
+ (let ((e1 (utf8proc_encode_char #x00E9))) ; unicode code-point to utf-8 -> (cons utf-8-string length-thereof)
+ (format #t "#x~{~X~}" (string->byte-vector (car e1))) ; #xc3a9
+ (newline)
+ (display (car e1)) ; é
+ (newline))
+
+ (let ((e1 (utf8proc_encode_char #x018b)))
+ (format #t "#x~{~X~}" (string->byte-vector (car e1)))
+ (newline)
+ (display (car e1)) ; latin cap D with top bar
+ (newline))
+
+ (let ((e1 (utf8proc_encode_char #x0238)))
+ (format #t "#x~{~X~}" (string->byte-vector (car e1)))
+ (newline)
+ (display (car e1)) ; latin small db digraph
+ (newline))
+
+ (let ((e1 (utf8proc_encode_char #x1e00)))
+ (format #t "#x~{~X~}" (string->byte-vector (car e1)))
+ (newline)
+ (display (car e1)) ; latin cap A ring below
+ (newline))
+
+ (display (string->symbol "élan ‘quote’"))
+ (newline)
+ (display (symbol->string (symbol "élan ‘quote’")))
+ (newline)
+
+ ))