| ... |
... |
@@ -663,6 +663,7 @@ |
|
663
|
663
|
case-fold-full
|
|
664
|
664
|
case-fold-simple
|
|
665
|
665
|
word-break
|
|
|
666
|
+ ext-pictographic
|
|
666
|
667
|
;; ...
|
|
667
|
668
|
)
|
|
668
|
669
|
|
| ... |
... |
@@ -874,6 +875,18 @@ |
|
874
|
875
|
as ent = (find-ucd i) do
|
|
875
|
876
|
(when ent
|
|
876
|
877
|
(setf (ucdent-word-break ent) code))))))
|
|
|
878
|
+
|
|
|
879
|
+ ;; Extended_Pictographic (from emoji-data.txt) is needed for
|
|
|
880
|
+ ;; word-break rule WB3c. It is the only emoji property we use;
|
|
|
881
|
+ ;; ignore the others in the file.
|
|
|
882
|
+ (foreach-ucd "emoji-data"
|
|
|
883
|
+ ucd-directory
|
|
|
884
|
+ (lambda (min max prop)
|
|
|
885
|
+ (when (string= prop "Extended_Pictographic")
|
|
|
886
|
+ (loop for i from min to max
|
|
|
887
|
+ as ent = (find-ucd i) do
|
|
|
888
|
+ (when ent
|
|
|
889
|
+ (setf (ucdent-ext-pictographic ent) t))))))
|
|
877
|
890
|
(values vec (make-range :codes range)))))
|
|
878
|
891
|
|
|
879
|
892
|
|
| ... |
... |
@@ -1008,17 +1021,20 @@ |
|
1008
|
1021
|
(defun pack-word-break (ucdent)
|
|
1009
|
1022
|
;; The code is the index in the list. :OTHER is a dummy value and
|
|
1010
|
1023
|
;; used to represent the default case.
|
|
1011
|
|
- (or (position (ucdent-word-break ucdent)
|
|
1012
|
|
- '(:other :cr :lf :newline :extend :format
|
|
1013
|
|
- :katakana :aletter :midnumlet :midletter :midnum
|
|
1014
|
|
- :numeric :extendnumlet :regional_indicator
|
|
1015
|
|
- ;; Classes added since Unicode 6.2 (6.3: hebrew_letter,
|
|
1016
|
|
- ;; single_quote, double_quote; 9.0: zwj; 11.0: wsegspace).
|
|
1017
|
|
- ;; Appended so existing indices are preserved; the array in
|
|
1018
|
|
- ;; unicode-word-break MUST match this order.
|
|
1019
|
|
- :hebrew_letter :single_quote :double_quote
|
|
1020
|
|
- :zwj :wsegspace))
|
|
1021
|
|
- 0))
|
|
|
1024
|
+ ;; Low 5 bits: word-break class index (the array in unicode-word-break
|
|
|
1025
|
+ ;; MUST match this order). Bit 5 (#x20): Extended_Pictographic, for
|
|
|
1026
|
+ ;; word-break rule WB3c.
|
|
|
1027
|
+ (logior
|
|
|
1028
|
+ (or (position (ucdent-word-break ucdent)
|
|
|
1029
|
+ '(:other :cr :lf :newline :extend :format
|
|
|
1030
|
+ :katakana :aletter :midnumlet :midletter :midnum
|
|
|
1031
|
+ :numeric :extendnumlet :regional_indicator
|
|
|
1032
|
+ ;; Classes added since Unicode 6.2 (6.3: hebrew_letter,
|
|
|
1033
|
+ ;; single_quote, double_quote; 9.0: zwj; 11.0: wsegspace).
|
|
|
1034
|
+ :hebrew_letter :single_quote :double_quote
|
|
|
1035
|
+ :zwj :wsegspace))
|
|
|
1036
|
+ 0)
|
|
|
1037
|
+ (if (ucdent-ext-pictographic ucdent) #x20 0)))
|
|
1022
|
1038
|
|
|
1023
|
1039
|
;; ucd-directory should be the directory where UnicodeData.txt is
|
|
1024
|
1040
|
;; located.
|