Raymond Toy pushed to branch rtoy-unicode-collation-ducet at cmucl / cmucl Commits: 6a425efa by Raymond Toy at 2026-06-17T13:30:01-07:00 Serialize the DUCET into unidata.bin The collation table was loaded at runtime by parsing allkeys.txt. Store it in unidata.bin instead, as a new section (index 19), and bump the file format version to 2. The section holds the collation elements in three parallel arrays (primary u16, secondary u16, tertiary u8 with the variable flag in bit 7), a single-codepoint index as an ntrie32 mapping a codepoint to a packed (offset << 6) | count into those arrays, a contraction table (four 32-bit words per entry), and the @implicitweights ranges. build-unidata.lisp reads allkeys.txt and builds the section; unidata.lisp reads it back (loader for section 19, plus the COLLATION struct and slot). UNIDATA-DUCET in unicode-collation.lisp builds the runtime DUCET from the loaded section -- structurally identical to one from LOAD-DUCET, so the sort-key construction code is unchanged -- and COLLATION-TABLE now uses it. LOAD-DUCET is kept for regenerating data and cross-checking. The collation conformance tests build the DUCET from unidata.bin and still pass all 437930 assertions. unidata.bin updated with the new collation data. - - - - - 4 changed files: - src/code/unicode-collation.lisp - src/code/unidata.lisp - src/i18n/unidata.bin - tests/unicode-collation.lisp Changes: ===================================== src/code/unicode-collation.lisp ===================================== @@ -518,6 +518,85 @@ if after. VARIABLE-WEIGHTING is as in COLLATION-WEIGHTS." ;;; no meaningful character index of the first difference to return. ;;; ------------------------------------------------------------------- + +;;; ------------------------------------------------------------------- +;;; Building the runtime DUCET from the collation section of +;;; unidata.bin. The resulting table is structurally identical to one +;;; built by LOAD-DUCET from allkeys.txt -- the same MAP / SINGLE / +;;; STARTERS hashes and implicit ranges -- so the sort-key construction +;;; code uses it unchanged. This replaces the runtime use of LOAD-DUCET +;;; (which is kept for regenerating data and for cross-checking). +;;; ------------------------------------------------------------------- + +(defun unidata-ducet () + "Build a DUCET from the collation section of unidata.bin, loading the +section first if necessary." + (unless (unidata-collation *unicode-data*) + (load-collation)) + (let* ((c (unidata-collation *unicode-data*)) + (primv (collation-primv c)) + (secv (collation-secv c)) + (terv (collation-terv c)) + (contractions (collation-contractions c)) + (ranges (collation-ranges c)) + (d (make-ducet :version (format nil "~D.~D.~D" + +unicode-major-version+ + +unicode-minor-version+ + +unicode-update-version+))) + (maxvar 0) + (maxkey 1)) + (flet ((ces-at (packed) + ;; Slice the parallel arrays into a simple-vector of + ;; collation-elements for the packed (offset << 6) | count. + (let* ((off (ash packed -6)) + (n (logand packed #x3f)) + (v (make-array n))) + (dotimes (i n) + (let* ((j (+ off i)) + (te (aref terv j)) + (var (logbitp 7 te)) + (p (aref primv j))) + (when (and var (> p maxvar)) + (setf maxvar p)) + (setf (aref v i) + (make-ce p (aref secv j) (logand te #x7f) var)))) + v))) + ;; Single-codepoint entries: walk the codepoint space and pull the + ;; non-zero values out of the index trie. (Many keys are astral, + ;; so the walk must cover the full range, not just the BMP.) + (dotimes (cp #x110000) + (let ((packed (qref32 c cp))) + (unless (zerop packed) + (let ((ces (ces-at packed))) + (setf (gethash cp (ducet-single d)) ces) + (setf (gethash (make-array 1 :initial-element cp) (ducet-map d)) + ces))))) + ;; Contractions: four 32-bit words each. + (loop for i from 0 below (length contractions) by 4 do + (let* ((cp1 (aref contractions i)) + (cp2 (aref contractions (+ i 1))) + (cp3 (aref contractions (+ i 2))) + (packed (aref contractions (+ i 3))) + (key (if (= cp3 #xFFFFFFFF) + (make-array 2 :initial-contents (list cp1 cp2)) + (make-array 3 :initial-contents (list cp1 cp2 cp3))))) + (setf (gethash key (ducet-map d)) (ces-at packed)) + (setf (gethash cp1 (ducet-starters d)) t) + (setf maxkey (max maxkey (length key))))) + ;; Implicit-weight ranges: four 32-bit words each (start, end, + ;; base, base-origin). + (let ((rl nil)) + (loop for i from 0 below (length ranges) by 4 do + (let ((r (make-implicit-range (aref ranges i) + (aref ranges (+ i 1)) + (aref ranges (+ i 2))))) + (setf (implicit-range-base-origin r) (aref ranges (+ i 3))) + (push r rl))) + (setf (ducet-implicit-ranges d) (nreverse rl))) + (setf (ducet-max-key-length d) maxkey + (ducet-max-variable-primary d) maxvar) + d))) + (in-package "UNICODE") (defvar *collation-table-path* "ext-formats:allkeys.txt" @@ -530,10 +609,10 @@ loaded. Loaded lazily from *COLLATION-TABLE-PATH* the first time a collation function needs it. Set to NIL to force a reload.") (defun collation-table () - "Return the default Unicode collation table, loading it from -*COLLATION-TABLE-PATH* on first use." + "Return the default Unicode collation table, building it from the +collation section of unidata.bin on first use." (or *collation-table* - (setf *collation-table* (lisp::load-ducet *collation-table-path*)))) + (setf *collation-table* (lisp::unidata-ducet)))) (defun %collation-compare (string1 string2 start1 end1 start2 end2 variable-weighting) ===================================== src/code/unidata.lisp ===================================== @@ -56,6 +56,7 @@ case-fold-simple case-fold-full word-break + collation ) (defvar *unicode-data* (make-unidata)) @@ -65,7 +66,7 @@ (defconstant +unicode-magic-number+ #x2A554344) ;; The format version for the unidata.bin file. -(defconstant +unicode-format-version+ 1) +(defconstant +unicode-format-version+ 2) ;; The expected Unicode version. This needs to be synced with ;; build-unidata.lisp. @@ -292,6 +293,27 @@ (defstruct (case-fold-full (:include decomp))) +(defstruct (collation (:include ntrie32)) + ;; Parallel collation-element arrays shared by the single-codepoint + ;; index (whose LVEC packs (offset << 6) | count into these) and the + ;; contraction table. TERV holds the tertiary weight in its low 7 + ;; bits and the variable flag in bit 7. + (primv (ext:required-argument) :read-only t + :type (simple-array (unsigned-byte 16) (*))) + (secv (ext:required-argument) :read-only t + :type (simple-array (unsigned-byte 16) (*))) + (terv (ext:required-argument) :read-only t + :type (simple-array (unsigned-byte 8) (*))) + ;; Contraction table: four 32-bit words per entry -- cp1, cp2, cp3 + ;; (or #xFFFFFFFF when the key has only two codepoints), and the + ;; packed (offset << 6) | count into the collation-element arrays. + (contractions (ext:required-argument) :read-only t + :type (simple-array (unsigned-byte 32) (*))) + ;; @implicitweights ranges: four 32-bit words per entry -- start, + ;; end, base, and base-origin (smallest start sharing the base). + (ranges (ext:required-argument) :read-only t + :type (simple-array (unsigned-byte 32) (*)))) + (defstruct (bidi (:include ntrie16)) (tabl (ext:required-argument) :read-only t :type (simple-array (unsigned-byte 16) (*)))) @@ -718,6 +740,29 @@ (read-ntrie 4 stm) (setf (unidata-word-break *unicode-data*) (make-ntrie4 :split split :hvec hvec :mvec mvec :lvec lvec)))) +(defloader load-collation (stm 19) + (multiple-value-bind (split hvec mvec lvec) + (read-ntrie 32 stm) + (let* ((nce (read32 stm)) + (primv (make-array nce :element-type '(unsigned-byte 16))) + (secv (make-array nce :element-type '(unsigned-byte 16))) + (terv (make-array nce :element-type '(unsigned-byte 8)))) + (read-vector primv stm :endian-swap :network-order) + (read-vector secv stm :endian-swap :network-order) + (read-vector terv stm :endian-swap :network-order) + (let* ((ncontr (read32 stm)) + (contractions (make-array (* 4 ncontr) + :element-type '(unsigned-byte 32)))) + (read-vector contractions stm :endian-swap :network-order) + (let* ((nrange (read-byte stm)) + (ranges (make-array (* 4 nrange) + :element-type '(unsigned-byte 32)))) + (read-vector ranges stm :endian-swap :network-order) + (setf (unidata-collation *unicode-data*) + (make-collation :split split :hvec hvec :mvec mvec :lvec lvec + :primv primv :secv secv :terv terv + :contractions contractions + :ranges ranges))))))) ;;; Accessor functions. @@ -1657,4 +1702,5 @@ unidata.bin." (unidata-case-fold-simple *unicode-data*) (unidata-case-fold-full *unicode-data*) (unidata-word-break *unicode-data*) + (unidata-collation *unicode-data*) t)) ===================================== src/i18n/unidata.bin ===================================== Binary files a/src/i18n/unidata.bin and b/src/i18n/unidata.bin differ ===================================== tests/unicode-collation.lisp ===================================== @@ -19,9 +19,10 @@ "The Default Unicode Collation Element Table, loaded on first use.") (defun ducet () - "Return the DUCET, loading it from *COLLATION-ALLKEYS* the first time." + "Return the DUCET, built from the collation section of unidata.bin on +first use." (or *ducet* - (setf *ducet* (lisp::load-ducet *collation-allkeys*)))) + (setf *ducet* (lisp::unidata-ducet)))) (defun collation-hex-list (string) "Parse all space-separated hexadecimal numbers in STRING into a list of View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/6a425efa239f6911e1b1fc07... -- View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/6a425efa239f6911e1b1fc07... You're receiving this email because of your account on gitlab.common-lisp.net. Manage all notifications: https://gitlab.common-lisp.net/-/profile/notifications | Help: https://gitlab.common-lisp.net/help