Raymond Toy pushed to branch rtoy-unicode-collation-ducet at cmucl / cmucl

Commits:

1 changed file:

Changes:

  • src/tools/build-unidata.lisp
    ... ... @@ -44,6 +44,7 @@
    44 44
       case-fold-full
    
    45 45
       case-fold-simple
    
    46 46
       word-break
    
    47
    +  collation
    
    47 48
       )
    
    48 49
     
    
    49 50
     (defvar *unicode-data* (make-unidata))
    
    ... ... @@ -146,6 +147,27 @@
    146 147
       (tabl (ext:required-argument) :read-only t
    
    147 148
     	:type (simple-array (unsigned-byte 16) (*))))
    
    148 149
     
    
    150
    +(defstruct (collation (:include ntrie32))
    
    151
    +  ;; Parallel collation-element arrays shared by the single-codepoint
    
    152
    +  ;; index (whose LVEC packs (offset << 6) | count into these) and the
    
    153
    +  ;; contraction table.  TERV holds the tertiary weight in its low 7
    
    154
    +  ;; bits and the variable flag in bit 7.
    
    155
    +  (primv (ext:required-argument) :read-only t
    
    156
    +	 :type (simple-array (unsigned-byte 16) (*)))
    
    157
    +  (secv (ext:required-argument) :read-only t
    
    158
    +	:type (simple-array (unsigned-byte 16) (*)))
    
    159
    +  (terv (ext:required-argument) :read-only t
    
    160
    +	:type (simple-array (unsigned-byte 8) (*)))
    
    161
    +  ;; Contraction table: four 32-bit words per entry -- cp1, cp2, cp3
    
    162
    +  ;; (or #xFFFFFFFF when the key has only two codepoints), and the
    
    163
    +  ;; packed (offset << 6) | count into the collation-element arrays.
    
    164
    +  (contractions (ext:required-argument) :read-only t
    
    165
    +		:type (simple-array (unsigned-byte 32) (*)))
    
    166
    +  ;; @implicitweights ranges: four 32-bit words per entry -- start,
    
    167
    +  ;; end, base, and base-origin (smallest start sharing the base).
    
    168
    +  (ranges (ext:required-argument) :read-only t
    
    169
    +	  :type (simple-array (unsigned-byte 32) (*))))
    
    170
    +
    
    149 171
     (defstruct (bidi (:include ntrie16))
    
    150 172
       (tabl (ext:required-argument) :read-only t
    
    151 173
     	:type (simple-array (unsigned-byte 16) (*))))
    
    ... ... @@ -535,11 +557,11 @@
    535 557
     			 :element-type '(unsigned-byte 8))
    
    536 558
           ;; The length of the index array is the number of sections to be
    
    537 559
           ;; written.  See below for each section.
    
    538
    -      (let ((index (make-array 19 :fill-pointer 0)))
    
    560
    +      (let ((index (make-array 20 :fill-pointer 0)))
    
    539 561
     	;; File header
    
    540 562
     	(write32 +unicode-magic-number+ stm)	; identification "magic"
    
    541
    -	;; File format version (1: dictionary nextv de-packed, keypv added)
    
    542
    -	(write-byte 1 stm)
    
    563
    +	;; File format version (2: collation/DUCET section added)
    
    564
    +	(write-byte 2 stm)
    
    543 565
     	;; Unicode version
    
    544 566
     	(write-byte +unicode-major-version+ stm)
    
    545 567
     	(write-byte +unicode-minor-version+ stm)
    
    ... ... @@ -638,6 +660,18 @@
    638 660
     	(let ((data (unidata-word-break *unicode-data*)))
    
    639 661
     	  (update-index (file-position stm) index)
    
    640 662
     	  (write-ntrie4 data stm))
    
    663
    +	;; 19. Collation (DUCET)
    
    664
    +	(let ((data (unidata-collation *unicode-data*)))
    
    665
    +	  (update-index (file-position stm) index)
    
    666
    +	  (write-ntrie32 data stm)
    
    667
    +	  (write32 (length (collation-primv data)) stm)
    
    668
    +	  (write-vector (collation-primv data) stm :endian-swap :network-order)
    
    669
    +	  (write-vector (collation-secv data) stm :endian-swap :network-order)
    
    670
    +	  (write-vector (collation-terv data) stm :endian-swap :network-order)
    
    671
    +	  (write32 (truncate (length (collation-contractions data)) 4) stm)
    
    672
    +	  (write-vector (collation-contractions data) stm :endian-swap :network-order)
    
    673
    +	  (write-byte (truncate (length (collation-ranges data)) 4) stm)
    
    674
    +	  (write-vector (collation-ranges data) stm :endian-swap :network-order))
    
    641 675
     	;; All components saved. Patch up index table now.
    
    642 676
     	(file-position stm 8)
    
    643 677
     	(dotimes (i (length index))
    
    ... ... @@ -1016,6 +1050,140 @@
    1016 1050
     
    
    1017 1051
     ;; ucd-directory should be the directory where UnicodeData.txt is
    
    1018 1052
     ;; located.
    
    1053
    +(defun parse-collation-key (string)
    
    1054
    +  "Parse the space-separated hexadecimal codepoints in STRING (the part
    
    1055
    +of an allkeys.txt line before the semicolon) into a list of integers."
    
    1056
    +  (let ((result nil) (i 0) (n (length string)))
    
    1057
    +    (loop
    
    1058
    +      (loop while (and (< i n) (not (digit-char-p (char string i) 16)))
    
    1059
    +	    do (incf i))
    
    1060
    +      (when (>= i n) (return))
    
    1061
    +      (let ((j i))
    
    1062
    +	(loop while (and (< j n) (digit-char-p (char string j) 16))
    
    1063
    +	      do (incf j))
    
    1064
    +	(push (parse-integer string :start i :end j :radix 16) result)
    
    1065
    +	(setf i j)))
    
    1066
    +    (nreverse result)))
    
    1067
    +
    
    1068
    +(defun parse-collation-elements (string)
    
    1069
    +  "Parse the collation elements [.pppp.ssss.tttt] (or [*pppp...] for a
    
    1070
    +variable element) from STRING into a list of (primary secondary tertiary
    
    1071
    +variablep) lists."
    
    1072
    +  (let ((result nil) (i 0))
    
    1073
    +    (loop
    
    1074
    +      (let ((open (position #\[ string :start i)))
    
    1075
    +	(unless open (return))
    
    1076
    +	(let* ((var (char= (char string (1+ open)) #\*))
    
    1077
    +	       (close (position #\] string :start open))
    
    1078
    +	       (body (subseq string (+ open 2) close))
    
    1079
    +	       (d1 (position #\. body))
    
    1080
    +	       (d2 (position #\. body :start (1+ d1))))
    
    1081
    +	  (push (list (parse-integer body :end d1 :radix 16)
    
    1082
    +		      (parse-integer body :start (1+ d1) :end d2 :radix 16)
    
    1083
    +		      (parse-integer body :start (1+ d2) :radix 16)
    
    1084
    +		      var)
    
    1085
    +		result)
    
    1086
    +	  (setf i (1+ close)))))
    
    1087
    +    (nreverse result)))
    
    1088
    +
    
    1089
    +(defun build-collation (ucd range ucd-directory)
    
    1090
    +  "Read allkeys.txt (the DUCET) from UCD-DIRECTORY and build the
    
    1091
    +collation section: the parallel collation-element arrays, the
    
    1092
    +single-codepoint index (an ntrie32 mapping a codepoint to a packed
    
    1093
    +(offset << 6) | count into those arrays), the contraction table, and the
    
    1094
    +@implicitweights ranges."
    
    1095
    +  (let ((path (make-pathname :name "allkeys" :type "txt" :defaults ucd-directory))
    
    1096
    +	(primv (make-array 65536 :element-type '(unsigned-byte 16)
    
    1097
    +			   :adjustable t :fill-pointer 0))
    
    1098
    +	(secv (make-array 65536 :element-type '(unsigned-byte 16)
    
    1099
    +			  :adjustable t :fill-pointer 0))
    
    1100
    +	(terv (make-array 65536 :element-type '(unsigned-byte 8)
    
    1101
    +			  :adjustable t :fill-pointer 0))
    
    1102
    +	(single (make-hash-table))
    
    1103
    +	(contractions nil)
    
    1104
    +	(raw-ranges nil))
    
    1105
    +    (flet ((emit (ces)
    
    1106
    +	     ;; Append CES to the parallel arrays; return the packed
    
    1107
    +	     ;; (offset << 6) | count referring to them.
    
    1108
    +	     (let ((offset (fill-pointer primv))
    
    1109
    +		   (count (length ces)))
    
    1110
    +	       (dolist (ce ces)
    
    1111
    +		 (destructuring-bind (p s te var) ce
    
    1112
    +		   (vector-push-extend p primv)
    
    1113
    +		   (vector-push-extend s secv)
    
    1114
    +		   (vector-push-extend (logior te (if var #x80 0)) terv)))
    
    1115
    +	       (logior (ash offset 6) count))))
    
    1116
    +      (with-open-file (s path :direction :input :external-format :utf-8)
    
    1117
    +	(loop for line = (read-line s nil) while line do
    
    1118
    +	  (cond
    
    1119
    +	    ((zerop (length line)))
    
    1120
    +	    ((char= (char line 0) #\#))
    
    1121
    +	    ((eql 0 (search "@implicitweights" line))
    
    1122
    +	     (let* ((semi (position #\; line))
    
    1123
    +		    (dd (search ".." line))
    
    1124
    +		    (start (parse-integer line :start (length "@implicitweights")
    
    1125
    +					  :end dd :radix 16 :junk-allowed t))
    
    1126
    +		    (end (parse-integer line :start (+ dd 2) :end semi
    
    1127
    +					 :radix 16 :junk-allowed t))
    
    1128
    +		    (base (parse-integer line :start (1+ semi)
    
    1129
    +					 :radix 16 :junk-allowed t)))
    
    1130
    +	       (push (list start end base) raw-ranges)))
    
    1131
    +	    ((char= (char line 0) #\@))
    
    1132
    +	    (t
    
    1133
    +	     (let ((semi (position #\; line)))
    
    1134
    +	       (when semi
    
    1135
    +		 (let* ((hash (position #\# line))
    
    1136
    +			(key (parse-collation-key (subseq line 0 semi)))
    
    1137
    +			(ces (parse-collation-elements
    
    1138
    +			      (subseq line (1+ semi) hash)))
    
    1139
    +			(packed (emit ces)))
    
    1140
    +		   (if (= (length key) 1)
    
    1141
    +		       (setf (gethash (first key) single) packed)
    
    1142
    +		       (push (list (first key) (second key) (third key) packed)
    
    1143
    +			     contractions))))))))))
    
    1144
    +    ;; base-origin: smallest start among ranges sharing a base.
    
    1145
    +    (let ((origin (make-hash-table)))
    
    1146
    +      (dolist (r raw-ranges)
    
    1147
    +	(destructuring-bind (start end base) r
    
    1148
    +	  (declare (ignore end))
    
    1149
    +	  (when (or (null (gethash base origin))
    
    1150
    +		    (< start (gethash base origin)))
    
    1151
    +	    (setf (gethash base origin) start))))
    
    1152
    +      (let* ((rl (nreverse raw-ranges))
    
    1153
    +	     (rvec (make-array (* 4 (length rl)) :element-type '(unsigned-byte 32)))
    
    1154
    +	     (cl (nreverse contractions))
    
    1155
    +	     (cvec (make-array (* 4 (length cl)) :element-type '(unsigned-byte 32)))
    
    1156
    +	     (i 0))
    
    1157
    +	(dolist (r rl)
    
    1158
    +	  (destructuring-bind (start end base) r
    
    1159
    +	    (setf (aref rvec i) start
    
    1160
    +		  (aref rvec (+ i 1)) end
    
    1161
    +		  (aref rvec (+ i 2)) base
    
    1162
    +		  (aref rvec (+ i 3)) (gethash base origin))
    
    1163
    +	    (incf i 4)))
    
    1164
    +	(setf i 0)
    
    1165
    +	(dolist (c cl)
    
    1166
    +	  (destructuring-bind (cp1 cp2 cp3 packed) c
    
    1167
    +	    (setf (aref cvec i) cp1
    
    1168
    +		  (aref cvec (+ i 1)) cp2
    
    1169
    +		  (aref cvec (+ i 2)) (or cp3 #xFFFFFFFF)
    
    1170
    +		  (aref cvec (+ i 3)) packed)
    
    1171
    +	    (incf i 4)))
    
    1172
    +	(multiple-value-bind (hvec mvec lvec)
    
    1173
    +	    (pack ucd range
    
    1174
    +		  (lambda (ent) (gethash (ucdent-code ent) single 0))
    
    1175
    +		  0 32 #x54)
    
    1176
    +	  (make-collation
    
    1177
    +	   :split #x54 :hvec hvec :mvec mvec :lvec lvec
    
    1178
    +	   :primv (make-array (length primv) :element-type '(unsigned-byte 16)
    
    1179
    +			      :initial-contents primv)
    
    1180
    +	   :secv (make-array (length secv) :element-type '(unsigned-byte 16)
    
    1181
    +			     :initial-contents secv)
    
    1182
    +	   :terv (make-array (length terv) :element-type '(unsigned-byte 8)
    
    1183
    +			     :initial-contents terv)
    
    1184
    +	   :contractions cvec
    
    1185
    +	   :ranges rvec))))))
    
    1186
    +
    
    1019 1187
     (defun build-unidata (&optional (ucd-directory "target:i18n/"))
    
    1020 1188
       (format t "~&Reading data from ~S~%" (probe-file ucd-directory))
    
    1021 1189
       (force-output)
    
    ... ... @@ -1216,4 +1384,9 @@
    1216 1384
     		0 4 split)
    
    1217 1385
     	(setf (unidata-word-break *unicode-data*)
    
    1218 1386
     	      (make-ntrie4 :split split :hvec hvec :mvec mvec :lvec lvec))))
    
    1387
    +
    
    1388
    +    (format t "~&Building collation table~%")
    
    1389
    +    (force-output)
    
    1390
    +    (setf (unidata-collation *unicode-data*)
    
    1391
    +	  (build-collation ucd range ucd-directory))
    
    1219 1392
         nil))