Raymond Toy pushed to branch issue-511-update-unicode-tests at cmucl / cmucl Commits: eee157dc by Raymond Toy at 2026-06-02T07:34:04-07:00 Widen word-break table to ntrie8 for Unicode 17.0.0 The word-break table was stored as ntrie4 (4-bit, so at most 16 class codes, 0-15). The 6.2-era class list already had 14 entries, leaving room for only two of the five classes added since: Hebrew_Letter, Single_Quote, Double_Quote (6.3), ZWJ (9.0), and WSegSpace (11.0). Extend the class list in pack-word-break and the matching decoder array in unicode-word-break (same order, appended so existing indices are preserved), and widen the table from ntrie4 to ntrie8 in the writer, loader, and accessor. unidata.bin was also regenerated. This updates the stored data only; string-next-word-break does not yet implement the new WB rules. - - - - - 3 changed files: - src/code/unidata.lisp - src/i18n/unidata.bin - src/tools/build-unidata.lisp Changes: ===================================== src/code/unidata.lisp ===================================== @@ -715,9 +715,9 @@ (defloader load-word-break (stm 18) (multiple-value-bind (split hvec mvec lvec) - (read-ntrie 4 stm) + (read-ntrie 8 stm) (setf (unidata-word-break *unicode-data*) - (make-ntrie4 :split split :hvec hvec :mvec mvec :lvec lvec)))) + (make-ntrie8 :split split :hvec hvec :mvec mvec :lvec lvec)))) ;;; Accessor functions. @@ -1193,7 +1193,7 @@ (unless (unidata-word-break *unicode-data*) (load-word-break)) (let* ((data (unidata-word-break *unicode-data*)) - (n (qref4 data code))) + (n (qref8 data code))) n)) (defun unicode-word-break (code) @@ -1201,7 +1201,9 @@ ;; pack-word-break in tools/build-unidata.lisp! (aref #(:other :cr :lf :newline :extend :format :katakana :aletter :midnumlet :midletter :midnum - :numeric :extendnumlet :regional_indicator) + :numeric :extendnumlet :regional_indicator + :hebrew_letter :single_quote :double_quote + :zwj :wsegspace) (unicode-word-break-code code))) ;; Support for character name completion for slime.
===================================== src/i18n/unidata.bin ===================================== Binary files a/src/i18n/unidata.bin and b/src/i18n/unidata.bin differ ===================================== src/tools/build-unidata.lisp ===================================== @@ -637,7 +637,7 @@ ;; 18. Word-break (let ((data (unidata-word-break *unicode-data*))) (update-index (file-position stm) index) - (write-ntrie4 data stm)) + (write-ntrie8 data stm)) ;; All components saved. Patch up index table now. (file-position stm 8) (dotimes (i (length index)) @@ -1011,7 +1011,13 @@ (or (position (ucdent-word-break ucdent) '(:other :cr :lf :newline :extend :format :katakana :aletter :midnumlet :midletter :midnum - :numeric :extendnumlet :regional_indicator)) + :numeric :extendnumlet :regional_indicator + ;; Classes added since Unicode 6.2 (6.3: hebrew_letter, + ;; single_quote, double_quote; 9.0: zwj; 11.0: wsegspace). + ;; Appended so existing indices are preserved; the array in + ;; unicode-word-break MUST match this order. + :hebrew_letter :single_quote :double_quote + :zwj :wsegspace)) 0)) ;; ucd-directory should be the directory where UnicodeData.txt is @@ -1213,7 +1219,7 @@ (let ((split #x66)) (multiple-value-bind (hvec mvec lvec) (pack ucd range (lambda (x) (pack-word-break x)) - 0 4 split) + 0 8 split) (setf (unidata-word-break *unicode-data*) - (make-ntrie4 :split split :hvec hvec :mvec mvec :lvec lvec)))) + (make-ntrie8 :split split :hvec hvec :mvec mvec :lvec lvec)))) nil)) View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/eee157dc55f993a2e72feaf0... -- View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/eee157dc55f993a2e72feaf0... You're receiving this email because of your account on gitlab.common-lisp.net. Manage all notifications: https://gitlab.common-lisp.net/-/profile/notifications | Help: https://gitlab.common-lisp.net/help