Raymond Toy pushed to branch issue-511-update-unicode-tests at cmucl / cmucl Commits: c6ff297a by Raymond Toy at 2026-06-02T08:06:39-07:00 Add Extended_Pictographic property for word-break rule WB3c Word-break rule WB3c (do not break within emoji ZWJ sequences) needs the Extended_Pictographic property, which comes from emoji-data.txt -- a file we did not previously read. Read emoji-data.txt in build-unidata (only the Extended_Pictographic property; the other emoji properties are ignored) and pack the flag into bit 5 of the word-break table value, alongside the class index in the low 5 bits. On the runtime side, mask the class to 5 bits in unicode-word-break-code and add unicode-extended-pictographic-p to test the flag. unidata.bin regenerated. This adds the data only; string-next-word-break does not yet implement WB3c. - - - - - 3 changed files: - src/code/unidata.lisp - src/i18n/unidata.bin - src/tools/build-unidata.lisp Changes: ===================================== src/code/unidata.lisp ===================================== @@ -1190,11 +1190,21 @@ (gethash (composition-table-key c1 c2) *composition-pair-table* nil)))) (defun unicode-word-break-code (code) + ;; Low 5 bits hold the word-break class index; bit 5 is the + ;; Extended_Pictographic flag (see unicode-extended-pictographic-p). (unless (unidata-word-break *unicode-data*) (load-word-break)) (let* ((data (unidata-word-break *unicode-data*)) (n (qref8 data code))) - n)) + (logand n #x1F))) + +(defun unicode-extended-pictographic-p (code) + ;; The Extended_Pictographic property (from emoji-data.txt) is packed + ;; into bit 5 of the word-break table value. Needed for word-break + ;; rule WB3c. + (unless (unidata-word-break *unicode-data*) + (load-word-break)) + (logbitp 5 (qref8 (unidata-word-break *unicode-data*) code))) (defun unicode-word-break (code) ;; The order of the array here MUST match the order used in ===================================== src/i18n/unidata.bin ===================================== Binary files a/src/i18n/unidata.bin and b/src/i18n/unidata.bin differ ===================================== src/tools/build-unidata.lisp ===================================== @@ -663,6 +663,7 @@ case-fold-full case-fold-simple word-break + ext-pictographic ;; ... ) @@ -874,6 +875,18 @@ as ent = (find-ucd i) do (when ent (setf (ucdent-word-break ent) code)))))) + + ;; Extended_Pictographic (from emoji-data.txt) is needed for + ;; word-break rule WB3c. It is the only emoji property we use; + ;; ignore the others in the file. + (foreach-ucd "emoji-data" + ucd-directory + (lambda (min max prop) + (when (string= prop "Extended_Pictographic") + (loop for i from min to max + as ent = (find-ucd i) do + (when ent + (setf (ucdent-ext-pictographic ent) t)))))) (values vec (make-range :codes range))))) @@ -1008,17 +1021,20 @@ (defun pack-word-break (ucdent) ;; The code is the index in the list. :OTHER is a dummy value and ;; used to represent the default case. - (or (position (ucdent-word-break ucdent) - '(:other :cr :lf :newline :extend :format - :katakana :aletter :midnumlet :midletter :midnum - :numeric :extendnumlet :regional_indicator - ;; Classes added since Unicode 6.2 (6.3: hebrew_letter, - ;; single_quote, double_quote; 9.0: zwj; 11.0: wsegspace). - ;; Appended so existing indices are preserved; the array in - ;; unicode-word-break MUST match this order. - :hebrew_letter :single_quote :double_quote - :zwj :wsegspace)) - 0)) + ;; Low 5 bits: word-break class index (the array in unicode-word-break + ;; MUST match this order). Bit 5 (#x20): Extended_Pictographic, for + ;; word-break rule WB3c. + (logior + (or (position (ucdent-word-break ucdent) + '(:other :cr :lf :newline :extend :format + :katakana :aletter :midnumlet :midletter :midnum + :numeric :extendnumlet :regional_indicator + ;; Classes added since Unicode 6.2 (6.3: hebrew_letter, + ;; single_quote, double_quote; 9.0: zwj; 11.0: wsegspace). + :hebrew_letter :single_quote :double_quote + :zwj :wsegspace)) + 0) + (if (ucdent-ext-pictographic ucdent) #x20 0))) ;; ucd-directory should be the directory where UnicodeData.txt is ;; located. View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/c6ff297a87dd6e3e23e594e7... -- View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/c6ff297a87dd6e3e23e594e7... You're receiving this email because of your account on gitlab.common-lisp.net. Manage all notifications: https://gitlab.common-lisp.net/-/profile/notifications | Help: https://gitlab.common-lisp.net/help