Raymond Toy pushed to branch rtoy-unicode-collation-ducet at cmucl / cmucl

Commits:

4 changed files:

Changes:

  • src/code/unicode-collation.lisp
    ... ... @@ -518,6 +518,85 @@ if after. VARIABLE-WEIGHTING is as in COLLATION-WEIGHTS."
    518 518
     ;;; no meaningful character index of the first difference to return.
    
    519 519
     ;;; -------------------------------------------------------------------
    
    520 520
     
    
    521
    +
    
    522
    +;;; -------------------------------------------------------------------
    
    523
    +;;; Building the runtime DUCET from the collation section of
    
    524
    +;;; unidata.bin.  The resulting table is structurally identical to one
    
    525
    +;;; built by LOAD-DUCET from allkeys.txt -- the same MAP / SINGLE /
    
    526
    +;;; STARTERS hashes and implicit ranges -- so the sort-key construction
    
    527
    +;;; code uses it unchanged.  This replaces the runtime use of LOAD-DUCET
    
    528
    +;;; (which is kept for regenerating data and for cross-checking).
    
    529
    +;;; -------------------------------------------------------------------
    
    530
    +
    
    531
    +(defun unidata-ducet ()
    
    532
    +  "Build a DUCET from the collation section of unidata.bin, loading the
    
    533
    +section first if necessary."
    
    534
    +  (unless (unidata-collation *unicode-data*)
    
    535
    +    (load-collation))
    
    536
    +  (let* ((c (unidata-collation *unicode-data*))
    
    537
    +	 (primv (collation-primv c))
    
    538
    +	 (secv (collation-secv c))
    
    539
    +	 (terv (collation-terv c))
    
    540
    +	 (contractions (collation-contractions c))
    
    541
    +	 (ranges (collation-ranges c))
    
    542
    +	 (d (make-ducet :version (format nil "~D.~D.~D"
    
    543
    +					 +unicode-major-version+
    
    544
    +					 +unicode-minor-version+
    
    545
    +					 +unicode-update-version+)))
    
    546
    +	 (maxvar 0)
    
    547
    +	 (maxkey 1))
    
    548
    +    (flet ((ces-at (packed)
    
    549
    +	     ;; Slice the parallel arrays into a simple-vector of
    
    550
    +	     ;; collation-elements for the packed (offset << 6) | count.
    
    551
    +	     (let* ((off (ash packed -6))
    
    552
    +		    (n (logand packed #x3f))
    
    553
    +		    (v (make-array n)))
    
    554
    +	       (dotimes (i n)
    
    555
    +		 (let* ((j (+ off i))
    
    556
    +			(te (aref terv j))
    
    557
    +			(var (logbitp 7 te))
    
    558
    +			(p (aref primv j)))
    
    559
    +		   (when (and var (> p maxvar))
    
    560
    +		     (setf maxvar p))
    
    561
    +		   (setf (aref v i)
    
    562
    +			 (make-ce p (aref secv j) (logand te #x7f) var))))
    
    563
    +	       v)))
    
    564
    +      ;; Single-codepoint entries: walk the codepoint space and pull the
    
    565
    +      ;; non-zero values out of the index trie.  (Many keys are astral,
    
    566
    +      ;; so the walk must cover the full range, not just the BMP.)
    
    567
    +      (dotimes (cp #x110000)
    
    568
    +	(let ((packed (qref32 c cp)))
    
    569
    +	  (unless (zerop packed)
    
    570
    +	    (let ((ces (ces-at packed)))
    
    571
    +	      (setf (gethash cp (ducet-single d)) ces)
    
    572
    +	      (setf (gethash (make-array 1 :initial-element cp) (ducet-map d))
    
    573
    +		    ces)))))
    
    574
    +      ;; Contractions: four 32-bit words each.
    
    575
    +      (loop for i from 0 below (length contractions) by 4 do
    
    576
    +	(let* ((cp1 (aref contractions i))
    
    577
    +	       (cp2 (aref contractions (+ i 1)))
    
    578
    +	       (cp3 (aref contractions (+ i 2)))
    
    579
    +	       (packed (aref contractions (+ i 3)))
    
    580
    +	       (key (if (= cp3 #xFFFFFFFF)
    
    581
    +			(make-array 2 :initial-contents (list cp1 cp2))
    
    582
    +			(make-array 3 :initial-contents (list cp1 cp2 cp3)))))
    
    583
    +	  (setf (gethash key (ducet-map d)) (ces-at packed))
    
    584
    +	  (setf (gethash cp1 (ducet-starters d)) t)
    
    585
    +	  (setf maxkey (max maxkey (length key)))))
    
    586
    +      ;; Implicit-weight ranges: four 32-bit words each (start, end,
    
    587
    +      ;; base, base-origin).
    
    588
    +      (let ((rl nil))
    
    589
    +	(loop for i from 0 below (length ranges) by 4 do
    
    590
    +	  (let ((r (make-implicit-range (aref ranges i)
    
    591
    +					(aref ranges (+ i 1))
    
    592
    +					(aref ranges (+ i 2)))))
    
    593
    +	    (setf (implicit-range-base-origin r) (aref ranges (+ i 3)))
    
    594
    +	    (push r rl)))
    
    595
    +	(setf (ducet-implicit-ranges d) (nreverse rl)))
    
    596
    +      (setf (ducet-max-key-length d) maxkey
    
    597
    +	    (ducet-max-variable-primary d) maxvar)
    
    598
    +      d)))
    
    599
    +
    
    521 600
     (in-package "UNICODE")
    
    522 601
     
    
    523 602
     (defvar *collation-table-path* "ext-formats:allkeys.txt"
    
    ... ... @@ -530,10 +609,10 @@ loaded. Loaded lazily from *COLLATION-TABLE-PATH* the first time a
    530 609
     collation function needs it.  Set to NIL to force a reload.")
    
    531 610
     
    
    532 611
     (defun collation-table ()
    
    533
    -  "Return the default Unicode collation table, loading it from
    
    534
    -*COLLATION-TABLE-PATH* on first use."
    
    612
    +  "Return the default Unicode collation table, building it from the
    
    613
    +collation section of unidata.bin on first use."
    
    535 614
       (or *collation-table*
    
    536
    -      (setf *collation-table* (lisp::load-ducet *collation-table-path*))))
    
    615
    +      (setf *collation-table* (lisp::unidata-ducet))))
    
    537 616
     
    
    538 617
     (defun %collation-compare (string1 string2 start1 end1 start2 end2
    
    539 618
                                variable-weighting)
    

  • src/code/unidata.lisp
    ... ... @@ -56,6 +56,7 @@
    56 56
       case-fold-simple
    
    57 57
       case-fold-full
    
    58 58
       word-break
    
    59
    +  collation
    
    59 60
       )
    
    60 61
     
    
    61 62
     (defvar *unicode-data* (make-unidata))
    
    ... ... @@ -65,7 +66,7 @@
    65 66
     (defconstant +unicode-magic-number+ #x2A554344)
    
    66 67
     
    
    67 68
     ;; The format version for the unidata.bin file.
    
    68
    -(defconstant +unicode-format-version+ 1)
    
    69
    +(defconstant +unicode-format-version+ 2)
    
    69 70
     
    
    70 71
     ;; The expected Unicode version.  This needs to be synced with
    
    71 72
     ;; build-unidata.lisp.
    
    ... ... @@ -292,6 +293,27 @@
    292 293
     
    
    293 294
     (defstruct (case-fold-full (:include decomp)))
    
    294 295
     
    
    296
    +(defstruct (collation (:include ntrie32))
    
    297
    +  ;; Parallel collation-element arrays shared by the single-codepoint
    
    298
    +  ;; index (whose LVEC packs (offset << 6) | count into these) and the
    
    299
    +  ;; contraction table.  TERV holds the tertiary weight in its low 7
    
    300
    +  ;; bits and the variable flag in bit 7.
    
    301
    +  (primv (ext:required-argument) :read-only t
    
    302
    +	 :type (simple-array (unsigned-byte 16) (*)))
    
    303
    +  (secv (ext:required-argument) :read-only t
    
    304
    +	:type (simple-array (unsigned-byte 16) (*)))
    
    305
    +  (terv (ext:required-argument) :read-only t
    
    306
    +	:type (simple-array (unsigned-byte 8) (*)))
    
    307
    +  ;; Contraction table: four 32-bit words per entry -- cp1, cp2, cp3
    
    308
    +  ;; (or #xFFFFFFFF when the key has only two codepoints), and the
    
    309
    +  ;; packed (offset << 6) | count into the collation-element arrays.
    
    310
    +  (contractions (ext:required-argument) :read-only t
    
    311
    +		:type (simple-array (unsigned-byte 32) (*)))
    
    312
    +  ;; @implicitweights ranges: four 32-bit words per entry -- start,
    
    313
    +  ;; end, base, and base-origin (smallest start sharing the base).
    
    314
    +  (ranges (ext:required-argument) :read-only t
    
    315
    +	  :type (simple-array (unsigned-byte 32) (*))))
    
    316
    +
    
    295 317
     (defstruct (bidi (:include ntrie16))
    
    296 318
       (tabl (ext:required-argument) :read-only t
    
    297 319
     	:type (simple-array (unsigned-byte 16) (*))))
    
    ... ... @@ -718,6 +740,29 @@
    718 740
           (read-ntrie 4 stm)
    
    719 741
         (setf (unidata-word-break *unicode-data*)
    
    720 742
     	  (make-ntrie4 :split split :hvec hvec :mvec mvec :lvec lvec))))
    
    743
    +(defloader load-collation (stm 19)
    
    744
    +  (multiple-value-bind (split hvec mvec lvec)
    
    745
    +      (read-ntrie 32 stm)
    
    746
    +    (let* ((nce (read32 stm))
    
    747
    +	   (primv (make-array nce :element-type '(unsigned-byte 16)))
    
    748
    +	   (secv (make-array nce :element-type '(unsigned-byte 16)))
    
    749
    +	   (terv (make-array nce :element-type '(unsigned-byte 8))))
    
    750
    +      (read-vector primv stm :endian-swap :network-order)
    
    751
    +      (read-vector secv stm :endian-swap :network-order)
    
    752
    +      (read-vector terv stm :endian-swap :network-order)
    
    753
    +      (let* ((ncontr (read32 stm))
    
    754
    +	     (contractions (make-array (* 4 ncontr)
    
    755
    +				       :element-type '(unsigned-byte 32))))
    
    756
    +	(read-vector contractions stm :endian-swap :network-order)
    
    757
    +	(let* ((nrange (read-byte stm))
    
    758
    +	       (ranges (make-array (* 4 nrange)
    
    759
    +				   :element-type '(unsigned-byte 32))))
    
    760
    +	  (read-vector ranges stm :endian-swap :network-order)
    
    761
    +	  (setf (unidata-collation *unicode-data*)
    
    762
    +		(make-collation :split split :hvec hvec :mvec mvec :lvec lvec
    
    763
    +				:primv primv :secv secv :terv terv
    
    764
    +				:contractions contractions
    
    765
    +				:ranges ranges)))))))
    
    721 766
     
    
    722 767
     ;;; Accessor functions.
    
    723 768
     
    
    ... ... @@ -1657,4 +1702,5 @@ unidata.bin."
    1657 1702
            (unidata-case-fold-simple *unicode-data*)
    
    1658 1703
            (unidata-case-fold-full *unicode-data*)
    
    1659 1704
            (unidata-word-break *unicode-data*)
    
    1705
    +       (unidata-collation *unicode-data*)
    
    1660 1706
            t))

  • src/i18n/unidata.bin
    No preview for this file type
  • tests/unicode-collation.lisp
    ... ... @@ -19,9 +19,10 @@
    19 19
       "The Default Unicode Collation Element Table, loaded on first use.")
    
    20 20
     
    
    21 21
     (defun ducet ()
    
    22
    -  "Return the DUCET, loading it from *COLLATION-ALLKEYS* the first time."
    
    22
    +  "Return the DUCET, built from the collation section of unidata.bin on
    
    23
    +first use."
    
    23 24
       (or *ducet*
    
    24
    -      (setf *ducet* (lisp::load-ducet *collation-allkeys*))))
    
    25
    +      (setf *ducet* (lisp::unidata-ducet))))
    
    25 26
     
    
    26 27
     (defun collation-hex-list (string)
    
    27 28
       "Parse all space-separated hexadecimal numbers in STRING into a list of