Raymond Toy pushed to branch rtoy-unicode-collation-ducet at cmucl / cmucl
Commits:
-
6a425efa
by Raymond Toy at 2026-06-17T13:30:01-07:00
4 changed files:
- src/code/unicode-collation.lisp
- src/code/unidata.lisp
- src/i18n/unidata.bin
- tests/unicode-collation.lisp
Changes:
| ... | ... | @@ -518,6 +518,85 @@ if after. VARIABLE-WEIGHTING is as in COLLATION-WEIGHTS." |
| 518 | 518 | ;;; no meaningful character index of the first difference to return.
|
| 519 | 519 | ;;; -------------------------------------------------------------------
|
| 520 | 520 | |
| 521 | + |
|
| 522 | +;;; -------------------------------------------------------------------
|
|
| 523 | +;;; Building the runtime DUCET from the collation section of
|
|
| 524 | +;;; unidata.bin. The resulting table is structurally identical to one
|
|
| 525 | +;;; built by LOAD-DUCET from allkeys.txt -- the same MAP / SINGLE /
|
|
| 526 | +;;; STARTERS hashes and implicit ranges -- so the sort-key construction
|
|
| 527 | +;;; code uses it unchanged. This replaces the runtime use of LOAD-DUCET
|
|
| 528 | +;;; (which is kept for regenerating data and for cross-checking).
|
|
| 529 | +;;; -------------------------------------------------------------------
|
|
| 530 | + |
|
| 531 | +(defun unidata-ducet ()
|
|
| 532 | + "Build a DUCET from the collation section of unidata.bin, loading the
|
|
| 533 | +section first if necessary."
|
|
| 534 | + (unless (unidata-collation *unicode-data*)
|
|
| 535 | + (load-collation))
|
|
| 536 | + (let* ((c (unidata-collation *unicode-data*))
|
|
| 537 | + (primv (collation-primv c))
|
|
| 538 | + (secv (collation-secv c))
|
|
| 539 | + (terv (collation-terv c))
|
|
| 540 | + (contractions (collation-contractions c))
|
|
| 541 | + (ranges (collation-ranges c))
|
|
| 542 | + (d (make-ducet :version (format nil "~D.~D.~D"
|
|
| 543 | + +unicode-major-version+
|
|
| 544 | + +unicode-minor-version+
|
|
| 545 | + +unicode-update-version+)))
|
|
| 546 | + (maxvar 0)
|
|
| 547 | + (maxkey 1))
|
|
| 548 | + (flet ((ces-at (packed)
|
|
| 549 | + ;; Slice the parallel arrays into a simple-vector of
|
|
| 550 | + ;; collation-elements for the packed (offset << 6) | count.
|
|
| 551 | + (let* ((off (ash packed -6))
|
|
| 552 | + (n (logand packed #x3f))
|
|
| 553 | + (v (make-array n)))
|
|
| 554 | + (dotimes (i n)
|
|
| 555 | + (let* ((j (+ off i))
|
|
| 556 | + (te (aref terv j))
|
|
| 557 | + (var (logbitp 7 te))
|
|
| 558 | + (p (aref primv j)))
|
|
| 559 | + (when (and var (> p maxvar))
|
|
| 560 | + (setf maxvar p))
|
|
| 561 | + (setf (aref v i)
|
|
| 562 | + (make-ce p (aref secv j) (logand te #x7f) var))))
|
|
| 563 | + v)))
|
|
| 564 | + ;; Single-codepoint entries: walk the codepoint space and pull the
|
|
| 565 | + ;; non-zero values out of the index trie. (Many keys are astral,
|
|
| 566 | + ;; so the walk must cover the full range, not just the BMP.)
|
|
| 567 | + (dotimes (cp #x110000)
|
|
| 568 | + (let ((packed (qref32 c cp)))
|
|
| 569 | + (unless (zerop packed)
|
|
| 570 | + (let ((ces (ces-at packed)))
|
|
| 571 | + (setf (gethash cp (ducet-single d)) ces)
|
|
| 572 | + (setf (gethash (make-array 1 :initial-element cp) (ducet-map d))
|
|
| 573 | + ces)))))
|
|
| 574 | + ;; Contractions: four 32-bit words each.
|
|
| 575 | + (loop for i from 0 below (length contractions) by 4 do
|
|
| 576 | + (let* ((cp1 (aref contractions i))
|
|
| 577 | + (cp2 (aref contractions (+ i 1)))
|
|
| 578 | + (cp3 (aref contractions (+ i 2)))
|
|
| 579 | + (packed (aref contractions (+ i 3)))
|
|
| 580 | + (key (if (= cp3 #xFFFFFFFF)
|
|
| 581 | + (make-array 2 :initial-contents (list cp1 cp2))
|
|
| 582 | + (make-array 3 :initial-contents (list cp1 cp2 cp3)))))
|
|
| 583 | + (setf (gethash key (ducet-map d)) (ces-at packed))
|
|
| 584 | + (setf (gethash cp1 (ducet-starters d)) t)
|
|
| 585 | + (setf maxkey (max maxkey (length key)))))
|
|
| 586 | + ;; Implicit-weight ranges: four 32-bit words each (start, end,
|
|
| 587 | + ;; base, base-origin).
|
|
| 588 | + (let ((rl nil))
|
|
| 589 | + (loop for i from 0 below (length ranges) by 4 do
|
|
| 590 | + (let ((r (make-implicit-range (aref ranges i)
|
|
| 591 | + (aref ranges (+ i 1))
|
|
| 592 | + (aref ranges (+ i 2)))))
|
|
| 593 | + (setf (implicit-range-base-origin r) (aref ranges (+ i 3)))
|
|
| 594 | + (push r rl)))
|
|
| 595 | + (setf (ducet-implicit-ranges d) (nreverse rl)))
|
|
| 596 | + (setf (ducet-max-key-length d) maxkey
|
|
| 597 | + (ducet-max-variable-primary d) maxvar)
|
|
| 598 | + d)))
|
|
| 599 | + |
|
| 521 | 600 | (in-package "UNICODE")
|
| 522 | 601 | |
| 523 | 602 | (defvar *collation-table-path* "ext-formats:allkeys.txt"
|
| ... | ... | @@ -530,10 +609,10 @@ loaded. Loaded lazily from *COLLATION-TABLE-PATH* the first time a |
| 530 | 609 | collation function needs it. Set to NIL to force a reload.")
|
| 531 | 610 | |
| 532 | 611 | (defun collation-table ()
|
| 533 | - "Return the default Unicode collation table, loading it from
|
|
| 534 | -*COLLATION-TABLE-PATH* on first use."
|
|
| 612 | + "Return the default Unicode collation table, building it from the
|
|
| 613 | +collation section of unidata.bin on first use."
|
|
| 535 | 614 | (or *collation-table*
|
| 536 | - (setf *collation-table* (lisp::load-ducet *collation-table-path*))))
|
|
| 615 | + (setf *collation-table* (lisp::unidata-ducet))))
|
|
| 537 | 616 | |
| 538 | 617 | (defun %collation-compare (string1 string2 start1 end1 start2 end2
|
| 539 | 618 | variable-weighting)
|
| ... | ... | @@ -56,6 +56,7 @@ |
| 56 | 56 | case-fold-simple
|
| 57 | 57 | case-fold-full
|
| 58 | 58 | word-break
|
| 59 | + collation
|
|
| 59 | 60 | )
|
| 60 | 61 | |
| 61 | 62 | (defvar *unicode-data* (make-unidata))
|
| ... | ... | @@ -65,7 +66,7 @@ |
| 65 | 66 | (defconstant +unicode-magic-number+ #x2A554344)
|
| 66 | 67 | |
| 67 | 68 | ;; The format version for the unidata.bin file.
|
| 68 | -(defconstant +unicode-format-version+ 1)
|
|
| 69 | +(defconstant +unicode-format-version+ 2)
|
|
| 69 | 70 | |
| 70 | 71 | ;; The expected Unicode version. This needs to be synced with
|
| 71 | 72 | ;; build-unidata.lisp.
|
| ... | ... | @@ -292,6 +293,27 @@ |
| 292 | 293 | |
| 293 | 294 | (defstruct (case-fold-full (:include decomp)))
|
| 294 | 295 | |
| 296 | +(defstruct (collation (:include ntrie32))
|
|
| 297 | + ;; Parallel collation-element arrays shared by the single-codepoint
|
|
| 298 | + ;; index (whose LVEC packs (offset << 6) | count into these) and the
|
|
| 299 | + ;; contraction table. TERV holds the tertiary weight in its low 7
|
|
| 300 | + ;; bits and the variable flag in bit 7.
|
|
| 301 | + (primv (ext:required-argument) :read-only t
|
|
| 302 | + :type (simple-array (unsigned-byte 16) (*)))
|
|
| 303 | + (secv (ext:required-argument) :read-only t
|
|
| 304 | + :type (simple-array (unsigned-byte 16) (*)))
|
|
| 305 | + (terv (ext:required-argument) :read-only t
|
|
| 306 | + :type (simple-array (unsigned-byte 8) (*)))
|
|
| 307 | + ;; Contraction table: four 32-bit words per entry -- cp1, cp2, cp3
|
|
| 308 | + ;; (or #xFFFFFFFF when the key has only two codepoints), and the
|
|
| 309 | + ;; packed (offset << 6) | count into the collation-element arrays.
|
|
| 310 | + (contractions (ext:required-argument) :read-only t
|
|
| 311 | + :type (simple-array (unsigned-byte 32) (*)))
|
|
| 312 | + ;; @implicitweights ranges: four 32-bit words per entry -- start,
|
|
| 313 | + ;; end, base, and base-origin (smallest start sharing the base).
|
|
| 314 | + (ranges (ext:required-argument) :read-only t
|
|
| 315 | + :type (simple-array (unsigned-byte 32) (*))))
|
|
| 316 | + |
|
| 295 | 317 | (defstruct (bidi (:include ntrie16))
|
| 296 | 318 | (tabl (ext:required-argument) :read-only t
|
| 297 | 319 | :type (simple-array (unsigned-byte 16) (*))))
|
| ... | ... | @@ -718,6 +740,29 @@ |
| 718 | 740 | (read-ntrie 4 stm)
|
| 719 | 741 | (setf (unidata-word-break *unicode-data*)
|
| 720 | 742 | (make-ntrie4 :split split :hvec hvec :mvec mvec :lvec lvec))))
|
| 743 | +(defloader load-collation (stm 19)
|
|
| 744 | + (multiple-value-bind (split hvec mvec lvec)
|
|
| 745 | + (read-ntrie 32 stm)
|
|
| 746 | + (let* ((nce (read32 stm))
|
|
| 747 | + (primv (make-array nce :element-type '(unsigned-byte 16)))
|
|
| 748 | + (secv (make-array nce :element-type '(unsigned-byte 16)))
|
|
| 749 | + (terv (make-array nce :element-type '(unsigned-byte 8))))
|
|
| 750 | + (read-vector primv stm :endian-swap :network-order)
|
|
| 751 | + (read-vector secv stm :endian-swap :network-order)
|
|
| 752 | + (read-vector terv stm :endian-swap :network-order)
|
|
| 753 | + (let* ((ncontr (read32 stm))
|
|
| 754 | + (contractions (make-array (* 4 ncontr)
|
|
| 755 | + :element-type '(unsigned-byte 32))))
|
|
| 756 | + (read-vector contractions stm :endian-swap :network-order)
|
|
| 757 | + (let* ((nrange (read-byte stm))
|
|
| 758 | + (ranges (make-array (* 4 nrange)
|
|
| 759 | + :element-type '(unsigned-byte 32))))
|
|
| 760 | + (read-vector ranges stm :endian-swap :network-order)
|
|
| 761 | + (setf (unidata-collation *unicode-data*)
|
|
| 762 | + (make-collation :split split :hvec hvec :mvec mvec :lvec lvec
|
|
| 763 | + :primv primv :secv secv :terv terv
|
|
| 764 | + :contractions contractions
|
|
| 765 | + :ranges ranges)))))))
|
|
| 721 | 766 | |
| 722 | 767 | ;;; Accessor functions.
|
| 723 | 768 | |
| ... | ... | @@ -1657,4 +1702,5 @@ unidata.bin." |
| 1657 | 1702 | (unidata-case-fold-simple *unicode-data*)
|
| 1658 | 1703 | (unidata-case-fold-full *unicode-data*)
|
| 1659 | 1704 | (unidata-word-break *unicode-data*)
|
| 1705 | + (unidata-collation *unicode-data*)
|
|
| 1660 | 1706 | t)) |
| ... | ... | @@ -19,9 +19,10 @@ |
| 19 | 19 | "The Default Unicode Collation Element Table, loaded on first use.")
|
| 20 | 20 | |
| 21 | 21 | (defun ducet ()
|
| 22 | - "Return the DUCET, loading it from *COLLATION-ALLKEYS* the first time."
|
|
| 22 | + "Return the DUCET, built from the collation section of unidata.bin on
|
|
| 23 | +first use."
|
|
| 23 | 24 | (or *ducet*
|
| 24 | - (setf *ducet* (lisp::load-ducet *collation-allkeys*))))
|
|
| 25 | + (setf *ducet* (lisp::unidata-ducet))))
|
|
| 25 | 26 | |
| 26 | 27 | (defun collation-hex-list (string)
|
| 27 | 28 | "Parse all space-separated hexadecimal numbers in STRING into a list of
|