| ... |
... |
@@ -44,6 +44,7 @@ |
|
44
|
44
|
case-fold-full
|
|
45
|
45
|
case-fold-simple
|
|
46
|
46
|
word-break
|
|
|
47
|
+ collation
|
|
47
|
48
|
)
|
|
48
|
49
|
|
|
49
|
50
|
(defvar *unicode-data* (make-unidata))
|
| ... |
... |
@@ -146,6 +147,27 @@ |
|
146
|
147
|
(tabl (ext:required-argument) :read-only t
|
|
147
|
148
|
:type (simple-array (unsigned-byte 16) (*))))
|
|
148
|
149
|
|
|
|
150
|
+(defstruct (collation (:include ntrie32))
|
|
|
151
|
+ ;; Parallel collation-element arrays shared by the single-codepoint
|
|
|
152
|
+ ;; index (whose LVEC packs (offset << 6) | count into these) and the
|
|
|
153
|
+ ;; contraction table. TERV holds the tertiary weight in its low 7
|
|
|
154
|
+ ;; bits and the variable flag in bit 7.
|
|
|
155
|
+ (primv (ext:required-argument) :read-only t
|
|
|
156
|
+ :type (simple-array (unsigned-byte 16) (*)))
|
|
|
157
|
+ (secv (ext:required-argument) :read-only t
|
|
|
158
|
+ :type (simple-array (unsigned-byte 16) (*)))
|
|
|
159
|
+ (terv (ext:required-argument) :read-only t
|
|
|
160
|
+ :type (simple-array (unsigned-byte 8) (*)))
|
|
|
161
|
+ ;; Contraction table: four 32-bit words per entry -- cp1, cp2, cp3
|
|
|
162
|
+ ;; (or #xFFFFFFFF when the key has only two codepoints), and the
|
|
|
163
|
+ ;; packed (offset << 6) | count into the collation-element arrays.
|
|
|
164
|
+ (contractions (ext:required-argument) :read-only t
|
|
|
165
|
+ :type (simple-array (unsigned-byte 32) (*)))
|
|
|
166
|
+ ;; @implicitweights ranges: four 32-bit words per entry -- start,
|
|
|
167
|
+ ;; end, base, and base-origin (smallest start sharing the base).
|
|
|
168
|
+ (ranges (ext:required-argument) :read-only t
|
|
|
169
|
+ :type (simple-array (unsigned-byte 32) (*))))
|
|
|
170
|
+
|
|
149
|
171
|
(defstruct (bidi (:include ntrie16))
|
|
150
|
172
|
(tabl (ext:required-argument) :read-only t
|
|
151
|
173
|
:type (simple-array (unsigned-byte 16) (*))))
|
| ... |
... |
@@ -535,11 +557,11 @@ |
|
535
|
557
|
:element-type '(unsigned-byte 8))
|
|
536
|
558
|
;; The length of the index array is the number of sections to be
|
|
537
|
559
|
;; written. See below for each section.
|
|
538
|
|
- (let ((index (make-array 19 :fill-pointer 0)))
|
|
|
560
|
+ (let ((index (make-array 20 :fill-pointer 0)))
|
|
539
|
561
|
;; File header
|
|
540
|
562
|
(write32 +unicode-magic-number+ stm) ; identification "magic"
|
|
541
|
|
- ;; File format version (1: dictionary nextv de-packed, keypv added)
|
|
542
|
|
- (write-byte 1 stm)
|
|
|
563
|
+ ;; File format version (2: collation/DUCET section added)
|
|
|
564
|
+ (write-byte 2 stm)
|
|
543
|
565
|
;; Unicode version
|
|
544
|
566
|
(write-byte +unicode-major-version+ stm)
|
|
545
|
567
|
(write-byte +unicode-minor-version+ stm)
|
| ... |
... |
@@ -638,6 +660,18 @@ |
|
638
|
660
|
(let ((data (unidata-word-break *unicode-data*)))
|
|
639
|
661
|
(update-index (file-position stm) index)
|
|
640
|
662
|
(write-ntrie4 data stm))
|
|
|
663
|
+ ;; 19. Collation (DUCET)
|
|
|
664
|
+ (let ((data (unidata-collation *unicode-data*)))
|
|
|
665
|
+ (update-index (file-position stm) index)
|
|
|
666
|
+ (write-ntrie32 data stm)
|
|
|
667
|
+ (write32 (length (collation-primv data)) stm)
|
|
|
668
|
+ (write-vector (collation-primv data) stm :endian-swap :network-order)
|
|
|
669
|
+ (write-vector (collation-secv data) stm :endian-swap :network-order)
|
|
|
670
|
+ (write-vector (collation-terv data) stm :endian-swap :network-order)
|
|
|
671
|
+ (write32 (truncate (length (collation-contractions data)) 4) stm)
|
|
|
672
|
+ (write-vector (collation-contractions data) stm :endian-swap :network-order)
|
|
|
673
|
+ (write-byte (truncate (length (collation-ranges data)) 4) stm)
|
|
|
674
|
+ (write-vector (collation-ranges data) stm :endian-swap :network-order))
|
|
641
|
675
|
;; All components saved. Patch up index table now.
|
|
642
|
676
|
(file-position stm 8)
|
|
643
|
677
|
(dotimes (i (length index))
|
| ... |
... |
@@ -1016,6 +1050,140 @@ |
|
1016
|
1050
|
|
|
1017
|
1051
|
;; ucd-directory should be the directory where UnicodeData.txt is
|
|
1018
|
1052
|
;; located.
|
|
|
1053
|
+(defun parse-collation-key (string)
|
|
|
1054
|
+ "Parse the space-separated hexadecimal codepoints in STRING (the part
|
|
|
1055
|
+of an allkeys.txt line before the semicolon) into a list of integers."
|
|
|
1056
|
+ (let ((result nil) (i 0) (n (length string)))
|
|
|
1057
|
+ (loop
|
|
|
1058
|
+ (loop while (and (< i n) (not (digit-char-p (char string i) 16)))
|
|
|
1059
|
+ do (incf i))
|
|
|
1060
|
+ (when (>= i n) (return))
|
|
|
1061
|
+ (let ((j i))
|
|
|
1062
|
+ (loop while (and (< j n) (digit-char-p (char string j) 16))
|
|
|
1063
|
+ do (incf j))
|
|
|
1064
|
+ (push (parse-integer string :start i :end j :radix 16) result)
|
|
|
1065
|
+ (setf i j)))
|
|
|
1066
|
+ (nreverse result)))
|
|
|
1067
|
+
|
|
|
1068
|
+(defun parse-collation-elements (string)
|
|
|
1069
|
+ "Parse the collation elements [.pppp.ssss.tttt] (or [*pppp...] for a
|
|
|
1070
|
+variable element) from STRING into a list of (primary secondary tertiary
|
|
|
1071
|
+variablep) lists."
|
|
|
1072
|
+ (let ((result nil) (i 0))
|
|
|
1073
|
+ (loop
|
|
|
1074
|
+ (let ((open (position #\[ string :start i)))
|
|
|
1075
|
+ (unless open (return))
|
|
|
1076
|
+ (let* ((var (char= (char string (1+ open)) #\*))
|
|
|
1077
|
+ (close (position #\] string :start open))
|
|
|
1078
|
+ (body (subseq string (+ open 2) close))
|
|
|
1079
|
+ (d1 (position #\. body))
|
|
|
1080
|
+ (d2 (position #\. body :start (1+ d1))))
|
|
|
1081
|
+ (push (list (parse-integer body :end d1 :radix 16)
|
|
|
1082
|
+ (parse-integer body :start (1+ d1) :end d2 :radix 16)
|
|
|
1083
|
+ (parse-integer body :start (1+ d2) :radix 16)
|
|
|
1084
|
+ var)
|
|
|
1085
|
+ result)
|
|
|
1086
|
+ (setf i (1+ close)))))
|
|
|
1087
|
+ (nreverse result)))
|
|
|
1088
|
+
|
|
|
1089
|
+(defun build-collation (ucd range ucd-directory)
|
|
|
1090
|
+ "Read allkeys.txt (the DUCET) from UCD-DIRECTORY and build the
|
|
|
1091
|
+collation section: the parallel collation-element arrays, the
|
|
|
1092
|
+single-codepoint index (an ntrie32 mapping a codepoint to a packed
|
|
|
1093
|
+(offset << 6) | count into those arrays), the contraction table, and the
|
|
|
1094
|
+@implicitweights ranges."
|
|
|
1095
|
+ (let ((path (make-pathname :name "allkeys" :type "txt" :defaults ucd-directory))
|
|
|
1096
|
+ (primv (make-array 65536 :element-type '(unsigned-byte 16)
|
|
|
1097
|
+ :adjustable t :fill-pointer 0))
|
|
|
1098
|
+ (secv (make-array 65536 :element-type '(unsigned-byte 16)
|
|
|
1099
|
+ :adjustable t :fill-pointer 0))
|
|
|
1100
|
+ (terv (make-array 65536 :element-type '(unsigned-byte 8)
|
|
|
1101
|
+ :adjustable t :fill-pointer 0))
|
|
|
1102
|
+ (single (make-hash-table))
|
|
|
1103
|
+ (contractions nil)
|
|
|
1104
|
+ (raw-ranges nil))
|
|
|
1105
|
+ (flet ((emit (ces)
|
|
|
1106
|
+ ;; Append CES to the parallel arrays; return the packed
|
|
|
1107
|
+ ;; (offset << 6) | count referring to them.
|
|
|
1108
|
+ (let ((offset (fill-pointer primv))
|
|
|
1109
|
+ (count (length ces)))
|
|
|
1110
|
+ (dolist (ce ces)
|
|
|
1111
|
+ (destructuring-bind (p s te var) ce
|
|
|
1112
|
+ (vector-push-extend p primv)
|
|
|
1113
|
+ (vector-push-extend s secv)
|
|
|
1114
|
+ (vector-push-extend (logior te (if var #x80 0)) terv)))
|
|
|
1115
|
+ (logior (ash offset 6) count))))
|
|
|
1116
|
+ (with-open-file (s path :direction :input :external-format :utf-8)
|
|
|
1117
|
+ (loop for line = (read-line s nil) while line do
|
|
|
1118
|
+ (cond
|
|
|
1119
|
+ ((zerop (length line)))
|
|
|
1120
|
+ ((char= (char line 0) #\#))
|
|
|
1121
|
+ ((eql 0 (search "@implicitweights" line))
|
|
|
1122
|
+ (let* ((semi (position #\; line))
|
|
|
1123
|
+ (dd (search ".." line))
|
|
|
1124
|
+ (start (parse-integer line :start (length "@implicitweights")
|
|
|
1125
|
+ :end dd :radix 16 :junk-allowed t))
|
|
|
1126
|
+ (end (parse-integer line :start (+ dd 2) :end semi
|
|
|
1127
|
+ :radix 16 :junk-allowed t))
|
|
|
1128
|
+ (base (parse-integer line :start (1+ semi)
|
|
|
1129
|
+ :radix 16 :junk-allowed t)))
|
|
|
1130
|
+ (push (list start end base) raw-ranges)))
|
|
|
1131
|
+ ((char= (char line 0) #\@))
|
|
|
1132
|
+ (t
|
|
|
1133
|
+ (let ((semi (position #\; line)))
|
|
|
1134
|
+ (when semi
|
|
|
1135
|
+ (let* ((hash (position #\# line))
|
|
|
1136
|
+ (key (parse-collation-key (subseq line 0 semi)))
|
|
|
1137
|
+ (ces (parse-collation-elements
|
|
|
1138
|
+ (subseq line (1+ semi) hash)))
|
|
|
1139
|
+ (packed (emit ces)))
|
|
|
1140
|
+ (if (= (length key) 1)
|
|
|
1141
|
+ (setf (gethash (first key) single) packed)
|
|
|
1142
|
+ (push (list (first key) (second key) (third key) packed)
|
|
|
1143
|
+ contractions))))))))))
|
|
|
1144
|
+ ;; base-origin: smallest start among ranges sharing a base.
|
|
|
1145
|
+ (let ((origin (make-hash-table)))
|
|
|
1146
|
+ (dolist (r raw-ranges)
|
|
|
1147
|
+ (destructuring-bind (start end base) r
|
|
|
1148
|
+ (declare (ignore end))
|
|
|
1149
|
+ (when (or (null (gethash base origin))
|
|
|
1150
|
+ (< start (gethash base origin)))
|
|
|
1151
|
+ (setf (gethash base origin) start))))
|
|
|
1152
|
+ (let* ((rl (nreverse raw-ranges))
|
|
|
1153
|
+ (rvec (make-array (* 4 (length rl)) :element-type '(unsigned-byte 32)))
|
|
|
1154
|
+ (cl (nreverse contractions))
|
|
|
1155
|
+ (cvec (make-array (* 4 (length cl)) :element-type '(unsigned-byte 32)))
|
|
|
1156
|
+ (i 0))
|
|
|
1157
|
+ (dolist (r rl)
|
|
|
1158
|
+ (destructuring-bind (start end base) r
|
|
|
1159
|
+ (setf (aref rvec i) start
|
|
|
1160
|
+ (aref rvec (+ i 1)) end
|
|
|
1161
|
+ (aref rvec (+ i 2)) base
|
|
|
1162
|
+ (aref rvec (+ i 3)) (gethash base origin))
|
|
|
1163
|
+ (incf i 4)))
|
|
|
1164
|
+ (setf i 0)
|
|
|
1165
|
+ (dolist (c cl)
|
|
|
1166
|
+ (destructuring-bind (cp1 cp2 cp3 packed) c
|
|
|
1167
|
+ (setf (aref cvec i) cp1
|
|
|
1168
|
+ (aref cvec (+ i 1)) cp2
|
|
|
1169
|
+ (aref cvec (+ i 2)) (or cp3 #xFFFFFFFF)
|
|
|
1170
|
+ (aref cvec (+ i 3)) packed)
|
|
|
1171
|
+ (incf i 4)))
|
|
|
1172
|
+ (multiple-value-bind (hvec mvec lvec)
|
|
|
1173
|
+ (pack ucd range
|
|
|
1174
|
+ (lambda (ent) (gethash (ucdent-code ent) single 0))
|
|
|
1175
|
+ 0 32 #x54)
|
|
|
1176
|
+ (make-collation
|
|
|
1177
|
+ :split #x54 :hvec hvec :mvec mvec :lvec lvec
|
|
|
1178
|
+ :primv (make-array (length primv) :element-type '(unsigned-byte 16)
|
|
|
1179
|
+ :initial-contents primv)
|
|
|
1180
|
+ :secv (make-array (length secv) :element-type '(unsigned-byte 16)
|
|
|
1181
|
+ :initial-contents secv)
|
|
|
1182
|
+ :terv (make-array (length terv) :element-type '(unsigned-byte 8)
|
|
|
1183
|
+ :initial-contents terv)
|
|
|
1184
|
+ :contractions cvec
|
|
|
1185
|
+ :ranges rvec))))))
|
|
|
1186
|
+
|
|
1019
|
1187
|
(defun build-unidata (&optional (ucd-directory "target:i18n/"))
|
|
1020
|
1188
|
(format t "~&Reading data from ~S~%" (probe-file ucd-directory))
|
|
1021
|
1189
|
(force-output)
|
| ... |
... |
@@ -1216,4 +1384,9 @@ |
|
1216
|
1384
|
0 4 split)
|
|
1217
|
1385
|
(setf (unidata-word-break *unicode-data*)
|
|
1218
|
1386
|
(make-ntrie4 :split split :hvec hvec :mvec mvec :lvec lvec))))
|
|
|
1387
|
+
|
|
|
1388
|
+ (format t "~&Building collation table~%")
|
|
|
1389
|
+ (force-output)
|
|
|
1390
|
+ (setf (unidata-collation *unicode-data*)
|
|
|
1391
|
+ (build-collation ucd range ucd-directory))
|
|
1219
|
1392
|
nil)) |