Raymond Toy pushed to branch issue-511-update-unicode-tests at cmucl / cmucl

Commits:

3 changed files:

Changes:

  • src/code/unidata.lisp
    ... ... @@ -1190,11 +1190,21 @@
    1190 1190
     	 (gethash (composition-table-key c1 c2) *composition-pair-table* nil))))
    
    1191 1191
     
    
    1192 1192
     (defun unicode-word-break-code (code)
    
    1193
    +  ;; Low 5 bits hold the word-break class index; bit 5 is the
    
    1194
    +  ;; Extended_Pictographic flag (see unicode-extended-pictographic-p).
    
    1193 1195
       (unless (unidata-word-break *unicode-data*)
    
    1194 1196
         (load-word-break))
    
    1195 1197
       (let* ((data (unidata-word-break *unicode-data*))
    
    1196 1198
     	 (n (qref8 data code)))
    
    1197
    -    n))
    
    1199
    +    (logand n #x1F)))
    
    1200
    +
    
    1201
    +(defun unicode-extended-pictographic-p (code)
    
    1202
    +  ;; The Extended_Pictographic property (from emoji-data.txt) is packed
    
    1203
    +  ;; into bit 5 of the word-break table value.  Needed for word-break
    
    1204
    +  ;; rule WB3c.
    
    1205
    +  (unless (unidata-word-break *unicode-data*)
    
    1206
    +    (load-word-break))
    
    1207
    +  (logbitp 5 (qref8 (unidata-word-break *unicode-data*) code)))
    
    1198 1208
     
    
    1199 1209
     (defun unicode-word-break (code)
    
    1200 1210
       ;; The order of the array here MUST match the order used in
    

  • src/i18n/unidata.bin
    No preview for this file type
  • src/tools/build-unidata.lisp
    ... ... @@ -663,6 +663,7 @@
    663 663
       case-fold-full
    
    664 664
       case-fold-simple
    
    665 665
       word-break
    
    666
    +  ext-pictographic
    
    666 667
       ;; ...
    
    667 668
       )
    
    668 669
     
    
    ... ... @@ -874,6 +875,18 @@
    874 875
     		  as ent = (find-ucd i) do
    
    875 876
     		  (when ent
    
    876 877
     		    (setf (ucdent-word-break ent) code))))))
    
    878
    +
    
    879
    +      ;; Extended_Pictographic (from emoji-data.txt) is needed for
    
    880
    +      ;; word-break rule WB3c.  It is the only emoji property we use;
    
    881
    +      ;; ignore the others in the file.
    
    882
    +      (foreach-ucd "emoji-data"
    
    883
    +	  ucd-directory
    
    884
    +	(lambda (min max prop)
    
    885
    +	  (when (string= prop "Extended_Pictographic")
    
    886
    +	    (loop for i from min to max
    
    887
    +		  as ent = (find-ucd i) do
    
    888
    +		  (when ent
    
    889
    +		    (setf (ucdent-ext-pictographic ent) t))))))
    
    877 890
           (values vec (make-range :codes range)))))
    
    878 891
     
    
    879 892
     
    
    ... ... @@ -1008,17 +1021,20 @@
    1008 1021
     (defun pack-word-break (ucdent)
    
    1009 1022
       ;; The code is the index in the list.  :OTHER is a dummy value and
    
    1010 1023
       ;; used to represent the default case.
    
    1011
    -  (or (position (ucdent-word-break ucdent)
    
    1012
    -		'(:other :cr :lf :newline :extend :format
    
    1013
    -		  :katakana :aletter :midnumlet :midletter :midnum
    
    1014
    -		  :numeric :extendnumlet :regional_indicator
    
    1015
    -		  ;; Classes added since Unicode 6.2 (6.3: hebrew_letter,
    
    1016
    -		  ;; single_quote, double_quote; 9.0: zwj; 11.0: wsegspace).
    
    1017
    -		  ;; Appended so existing indices are preserved; the array in
    
    1018
    -		  ;; unicode-word-break MUST match this order.
    
    1019
    -		  :hebrew_letter :single_quote :double_quote
    
    1020
    -		  :zwj :wsegspace))
    
    1021
    -      0))
    
    1024
    +  ;; Low 5 bits: word-break class index (the array in unicode-word-break
    
    1025
    +  ;; MUST match this order).  Bit 5 (#x20): Extended_Pictographic, for
    
    1026
    +  ;; word-break rule WB3c.
    
    1027
    +  (logior
    
    1028
    +   (or (position (ucdent-word-break ucdent)
    
    1029
    +		 '(:other :cr :lf :newline :extend :format
    
    1030
    +		   :katakana :aletter :midnumlet :midletter :midnum
    
    1031
    +		   :numeric :extendnumlet :regional_indicator
    
    1032
    +		   ;; Classes added since Unicode 6.2 (6.3: hebrew_letter,
    
    1033
    +		   ;; single_quote, double_quote; 9.0: zwj; 11.0: wsegspace).
    
    1034
    +		   :hebrew_letter :single_quote :double_quote
    
    1035
    +		   :zwj :wsegspace))
    
    1036
    +       0)
    
    1037
    +   (if (ucdent-ext-pictographic ucdent) #x20 0)))
    
    1022 1038
     
    
    1023 1039
     ;; ucd-directory should be the directory where UnicodeData.txt is
    
    1024 1040
     ;; located.