Raymond Toy pushed to branch issue-511-update-unicode-tests at cmucl / cmucl
Commits:
-
eee157dc
by Raymond Toy at 2026-06-02T07:34:04-07:00
3 changed files:
Changes:
| ... | ... | @@ -715,9 +715,9 @@ |
| 715 | 715 | |
| 716 | 716 | (defloader load-word-break (stm 18)
|
| 717 | 717 | (multiple-value-bind (split hvec mvec lvec)
|
| 718 | - (read-ntrie 4 stm)
|
|
| 718 | + (read-ntrie 8 stm)
|
|
| 719 | 719 | (setf (unidata-word-break *unicode-data*)
|
| 720 | - (make-ntrie4 :split split :hvec hvec :mvec mvec :lvec lvec))))
|
|
| 720 | + (make-ntrie8 :split split :hvec hvec :mvec mvec :lvec lvec))))
|
|
| 721 | 721 | |
| 722 | 722 | ;;; Accessor functions.
|
| 723 | 723 | |
| ... | ... | @@ -1193,7 +1193,7 @@ |
| 1193 | 1193 | (unless (unidata-word-break *unicode-data*)
|
| 1194 | 1194 | (load-word-break))
|
| 1195 | 1195 | (let* ((data (unidata-word-break *unicode-data*))
|
| 1196 | - (n (qref4 data code)))
|
|
| 1196 | + (n (qref8 data code)))
|
|
| 1197 | 1197 | n))
|
| 1198 | 1198 | |
| 1199 | 1199 | (defun unicode-word-break (code)
|
| ... | ... | @@ -1201,7 +1201,9 @@ |
| 1201 | 1201 | ;; pack-word-break in tools/build-unidata.lisp!
|
| 1202 | 1202 | (aref #(:other :cr :lf :newline :extend :format
|
| 1203 | 1203 | :katakana :aletter :midnumlet :midletter :midnum
|
| 1204 | - :numeric :extendnumlet :regional_indicator)
|
|
| 1204 | + :numeric :extendnumlet :regional_indicator
|
|
| 1205 | + :hebrew_letter :single_quote :double_quote
|
|
| 1206 | + :zwj :wsegspace)
|
|
| 1205 | 1207 | (unicode-word-break-code code)))
|
| 1206 | 1208 | |
| 1207 | 1209 | ;; Support for character name completion for slime.
|
| ... | ... | @@ -637,7 +637,7 @@ |
| 637 | 637 | ;; 18. Word-break
|
| 638 | 638 | (let ((data (unidata-word-break *unicode-data*)))
|
| 639 | 639 | (update-index (file-position stm) index)
|
| 640 | - (write-ntrie4 data stm))
|
|
| 640 | + (write-ntrie8 data stm))
|
|
| 641 | 641 | ;; All components saved. Patch up index table now.
|
| 642 | 642 | (file-position stm 8)
|
| 643 | 643 | (dotimes (i (length index))
|
| ... | ... | @@ -1011,7 +1011,13 @@ |
| 1011 | 1011 | (or (position (ucdent-word-break ucdent)
|
| 1012 | 1012 | '(:other :cr :lf :newline :extend :format
|
| 1013 | 1013 | :katakana :aletter :midnumlet :midletter :midnum
|
| 1014 | - :numeric :extendnumlet :regional_indicator))
|
|
| 1014 | + :numeric :extendnumlet :regional_indicator
|
|
| 1015 | + ;; Classes added since Unicode 6.2 (6.3: hebrew_letter,
|
|
| 1016 | + ;; single_quote, double_quote; 9.0: zwj; 11.0: wsegspace).
|
|
| 1017 | + ;; Appended so existing indices are preserved; the array in
|
|
| 1018 | + ;; unicode-word-break MUST match this order.
|
|
| 1019 | + :hebrew_letter :single_quote :double_quote
|
|
| 1020 | + :zwj :wsegspace))
|
|
| 1015 | 1021 | 0))
|
| 1016 | 1022 | |
| 1017 | 1023 | ;; ucd-directory should be the directory where UnicodeData.txt is
|
| ... | ... | @@ -1213,7 +1219,7 @@ |
| 1213 | 1219 | (let ((split #x66))
|
| 1214 | 1220 | (multiple-value-bind (hvec mvec lvec)
|
| 1215 | 1221 | (pack ucd range (lambda (x) (pack-word-break x))
|
| 1216 | - 0 4 split)
|
|
| 1222 | + 0 8 split)
|
|
| 1217 | 1223 | (setf (unidata-word-break *unicode-data*)
|
| 1218 | - (make-ntrie4 :split split :hvec hvec :mvec mvec :lvec lvec))))
|
|
| 1224 | + (make-ntrie8 :split split :hvec hvec :mvec mvec :lvec lvec))))
|
|
| 1219 | 1225 | nil)) |