GitLab

at 2026-06-02T08:06:39-07:00

Add Extended_Pictographic property for word-break rule WB3c

Word-break rule WB3c (do not break within emoji ZWJ sequences) needs
the Extended_Pictographic property, which comes from emoji-data.txt --
a file we did not previously read.

Read emoji-data.txt in build-unidata (only the Extended_Pictographic
property; the other emoji properties are ignored) and pack the flag
into bit 5 of the word-break table value, alongside the class index in
the low 5 bits.  On the runtime side, mask the class to 5 bits in
unicode-word-break-code and add unicode-extended-pictographic-p to test
the flag.

unidata.bin has been regenerated.  This commit adds the data only;
string-next-word-break does not yet implement WB3c.

@@ -1190,11 +1190,21 @@
  	 (gethash (composition-table-key c1 c2) *composition-pair-table* nil))))
  (defun unicode-word-break-code (code)
 +  ;; Low 5 bits hold the word-break class index; bit 5 is the
 +  ;; Extended_Pictographic flag (see unicode-extended-pictographic-p).
    (unless (unidata-word-break *unicode-data*)
      (load-word-break))
    (let* ((data (unidata-word-break *unicode-data*))
  	 (n (qref8 data code)))
 -    n))
 +    (logand n #x1F)))
 +
 +(defun unicode-extended-pictographic-p (code)
 +  ;; The Extended_Pictographic property (from emoji-data.txt) is packed
 +  ;; into bit 5 of the word-break table value.  Needed for word-break
 +  ;; rule WB3c.
 +  (unless (unidata-word-break *unicode-data*)
 +    (load-word-break))
 +  (logbitp 5 (qref8 (unidata-word-break *unicode-data*) code)))
  (defun unicode-word-break (code)
    ;; The order of the array here MUST match the order used in

@@ -663,6 +663,7 @@
    case-fold-full
    case-fold-simple
    word-break
 +  ext-pictographic
    ;; ...
+   )
@@ -874,6 +875,18 @@
  		  as ent = (find-ucd i) do
  		  (when ent
  		    (setf (ucdent-word-break ent) code))))))
 +
 +      ;; Extended_Pictographic (from emoji-data.txt) is needed for
 +      ;; word-break rule WB3c.  It is the only emoji property we use;
 +      ;; ignore the others in the file.
 +      (foreach-ucd "emoji-data"
 +	  ucd-directory
 +	(lambda (min max prop)
 +	  (when (string= prop "Extended_Pictographic")
 +	    (loop for i from min to max
 +		  as ent = (find-ucd i) do
 +		  (when ent
 +		    (setf (ucdent-ext-pictographic ent) t))))))
        (values vec (make-range :codes range)))))
@@ -1008,17 +1021,20 @@
  (defun pack-word-break (ucdent)
    ;; The code is the index in the list.  :OTHER is a dummy value and
    ;; used to represent the default case.
 -  (or (position (ucdent-word-break ucdent)
 -		'(:other :cr :lf :newline :extend :format
 -		  :katakana :aletter :midnumlet :midletter :midnum
 -		  :numeric :extendnumlet :regional_indicator
 -		  ;; Classes added since Unicode 6.2 (6.3: hebrew_letter,
 -		  ;; single_quote, double_quote; 9.0: zwj; 11.0: wsegspace).
 -		  ;; Appended so existing indices are preserved; the array in
 -		  ;; unicode-word-break MUST match this order.
 -		  :hebrew_letter :single_quote :double_quote
 -		  :zwj :wsegspace))
 -      0))
 +  ;; Low 5 bits: word-break class index (the array in unicode-word-break
 +  ;; MUST match this order).  Bit 5 (#x20): Extended_Pictographic, for
 +  ;; word-break rule WB3c.
 +  (logior
 +   (or (position (ucdent-word-break ucdent)
 +		 '(:other :cr :lf :newline :extend :format
 +		   :katakana :aletter :midnumlet :midletter :midnum
 +		   :numeric :extendnumlet :regional_indicator
 +		   ;; Classes added since Unicode 6.2 (6.3: hebrew_letter,
 +		   ;; single_quote, double_quote; 9.0: zwj; 11.0: wsegspace).
 +		   :hebrew_letter :single_quote :double_quote
 +		   :zwj :wsegspace))
 +       0)
 +   (if (ucdent-ext-pictographic ucdent) #x20 0)))
  ;; ucd-directory should be the directory where UnicodeData.txt is
  ;; located.

Raymond Toy pushed to branch issue-511-update-unicode-tests at cmucl / cmucl

Commits:

3 changed files:

Changes: