Raymond Toy pushed to branch rtoy-unicode-collation-ducet at cmucl / cmucl

Commits:

2 changed files:

Changes:

  • src/code/unicode-collation.lisp
    ... ... @@ -55,6 +55,15 @@
    55 55
       ;; of collation-elements.  Single characters and contractions share
    
    56 56
       ;; this table; the key length distinguishes them.
    
    57 57
       (map (make-hash-table :test 'equalp) :type hash-table)
    
    58
    +  ;; Fast paths derived from MAP at load time.  SINGLE maps a single
    
    59
    +  ;; codepoint (a fixnum) directly to its collation elements, avoiding an
    
    60
    +  ;; EQUALP probe with a one-element key for the common case.  STARTERS
    
    61
    +  ;; holds every codepoint that begins some multi-codepoint key; a
    
    62
    +  ;; codepoint not in STARTERS can neither begin a contraction nor be the
    
    63
    +  ;; base of a discontiguous match, so the contraction machinery is
    
    64
    +  ;; skipped for it entirely.
    
    65
    +  (single (make-hash-table :test 'eql) :type hash-table)
    
    66
    +  (starters (make-hash-table :test 'eql) :type hash-table)
    
    58 67
       ;; Implicit-weight ranges, in file order.
    
    59 68
       (implicit-ranges nil :type list)
    
    60 69
       ;; Longest key (in codepoints) present in MAP; bounds the contraction
    
    ... ... @@ -180,6 +189,13 @@ returning an IMPLICIT-RANGE, or NIL if it does not parse."
    180 189
                                do (setf (ducet-max-variable-primary d)
    
    181 190
                                         (ce-primary ce))))))))))))
    
    182 191
         (setf (ducet-implicit-ranges d) (nreverse (ducet-implicit-ranges d)))
    
    192
    +    ;; Build the fast-path tables: SINGLE for one-codepoint keys, and
    
    193
    +    ;; STARTERS for the first codepoint of every multi-codepoint key.
    
    194
    +    (maphash (lambda (key ces)
    
    195
    +               (if (= (length key) 1)
    
    196
    +                   (setf (gethash (aref key 0) (ducet-single d)) ces)
    
    197
    +                   (setf (gethash (aref key 0) (ducet-starters d)) t)))
    
    198
    +             (ducet-map d))
    
    183 199
         ;; Compute, for each base, the smallest range start sharing it, and
    
    184 200
         ;; record it as the BBBB offset origin on every range with that base.
    
    185 201
         (let ((origin (make-hash-table)))
    
    ... ... @@ -298,74 +314,106 @@ the blocking threshold for later non-starters."
    298 314
             (consumed (make-array (max 1 (length cps)) :initial-element nil))
    
    299 315
             (i 0)
    
    300 316
             (n (length cps))
    
    301
    -        (maxlen (ducet-max-key-length d)))
    
    317
    +        (maxlen (ducet-max-key-length d))
    
    318
    +        (map (ducet-map d))
    
    319
    +        (single (ducet-single d))
    
    320
    +        (starters (ducet-starters d)))
    
    302 321
         (loop while (< i n) do
    
    303 322
           (cond
    
    304 323
             ((aref consumed i) (incf i))    ; folded in by a discontiguous match
    
    305 324
             (t
    
    306
    -         (let ((best-key nil) (best-len 0) (best-ces nil))
    
    307
    -           ;; S2.1: longest contiguous match starting at I, over positions
    
    308
    -           ;; not already consumed by an earlier discontiguous fold.  A
    
    309
    -           ;; multi-codepoint key may only match an unbroken run of
    
    310
    -           ;; still-present codepoints, so a span containing a consumed
    
    311
    -           ;; position (a "hole" left by a fold) is rejected.
    
    312
    -           (loop for len from (min maxlen (- n i)) downto 1 do
    
    313
    -             (when (loop for j from i below (+ i len)
    
    314
    -                         never (aref consumed j))
    
    315
    -               (let* ((key (subseq cps i (+ i len)))
    
    316
    -                      (ces (gethash key (ducet-map d))))
    
    317
    -                 (when ces
    
    318
    -                   (setf best-key key best-len len best-ces ces)
    
    319
    -                   (return)))))
    
    325
    +         (let* ((cp (aref cps i))
    
    326
    +                (starter (gethash cp starters))
    
    327
    +                (best-key nil) (best-len 0) (best-ces nil))
    
    328
    +           ;; S2.1: longest contiguous match starting at I.  Only a
    
    329
    +           ;; codepoint that begins some multi-codepoint key (a STARTER)
    
    330
    +           ;; can match a contraction, so the multi-length scan is done
    
    331
    +           ;; only for those; every other codepoint takes the SINGLE
    
    332
    +           ;; fast path.  The scan rejects any span containing a position
    
    333
    +           ;; already consumed by an earlier discontiguous fold.
    
    334
    +           (when starter
    
    335
    +             (loop for len from (min maxlen (- n i)) downto 2 do
    
    336
    +               (when (loop for j from i below (+ i len)
    
    337
    +                           never (aref consumed j))
    
    338
    +                 (let* ((key (subseq cps i (+ i len)))
    
    339
    +                        (ces (gethash key map)))
    
    340
    +                   (when ces
    
    341
    +                     (setf best-key key best-len len best-ces ces)
    
    342
    +                     (return))))))
    
    343
    +           (unless best-ces
    
    344
    +             (let ((ces (gethash cp single)))
    
    345
    +               (when ces
    
    346
    +                 (setf best-ces ces best-len 1)
    
    347
    +                 ;; BEST-KEY is needed only to extend by discontiguous
    
    348
    +                 ;; folds, which apply only to starters.
    
    349
    +                 (when starter
    
    350
    +                   (setf best-key (make-array 1 :initial-element cp))))))
    
    320 351
                (cond
    
    321
    -             (best-key
    
    352
    +             (best-ces
    
    322 353
                   ;; S2.1.1-S2.1.3: extend by unblocked following non-starters.
    
    354
    +              ;; A codepoint that is not a STARTER can begin no multi-key,
    
    355
    +              ;; so no fold is possible and the scan is skipped entirely.
    
    323 356
                   ;; MAXCCC is the highest combining class among still-present
    
    324 357
                   ;; codepoints passed since the end of the contiguous match.
    
    325
    -              (let ((maxccc 0))
    
    326
    -                (loop for k from (+ i best-len) below n do
    
    327
    -                  (unless (aref consumed k)
    
    328
    -                    (let ((ccc (lisp::unicode-combining-class (aref cps k))))
    
    329
    -                      (cond
    
    330
    -                        ((zerop ccc)
    
    331
    -                         (return))       ; starter blocks all further
    
    332
    -                        ((> ccc maxccc)  ; unblocked candidate
    
    333
    -                         (let* ((cand (collation-extend-key
    
    334
    -                                       best-key (aref cps k)))
    
    335
    -                                (ces (gethash cand (ducet-map d))))
    
    336
    -                           (if ces
    
    337
    -                               ;; Fold C in and remove it; MAXCCC unchanged
    
    338
    -                               ;; since C is no longer present.
    
    339
    -                               (setf best-key cand
    
    340
    -                                     best-ces ces
    
    341
    -                                     (aref consumed k) t)
    
    342
    -                               ;; No match: C stays and becomes a blocker.
    
    343
    -                               (setf maxccc ccc))))
    
    344
    -                        (t
    
    345
    -                         ;; Blocked (ccc <= maxccc): C stays; MAXCCC already
    
    346
    -                         ;; covers it.
    
    347
    -                         nil))))))
    
    358
    +              (when (and starter best-key)
    
    359
    +                (let ((maxccc 0))
    
    360
    +                  (loop for k from (+ i best-len) below n do
    
    361
    +                    (unless (aref consumed k)
    
    362
    +                      (let ((ccc (lisp::unicode-combining-class (aref cps k))))
    
    363
    +                        (cond
    
    364
    +                          ((zerop ccc)
    
    365
    +                           (return))      ; starter blocks all further
    
    366
    +                          ((> ccc maxccc) ; unblocked candidate
    
    367
    +                           (let* ((cand (collation-extend-key
    
    368
    +                                         best-key (aref cps k)))
    
    369
    +                                  (ces (gethash cand map)))
    
    370
    +                             (if ces
    
    371
    +                                 ;; Fold C in and remove it; MAXCCC
    
    372
    +                                 ;; unchanged since C is no longer present.
    
    373
    +                                 (setf best-key cand
    
    374
    +                                       best-ces ces
    
    375
    +                                       (aref consumed k) t)
    
    376
    +                                 ;; No match: C stays and becomes a blocker.
    
    377
    +                                 (setf maxccc ccc))))
    
    378
    +                          (t
    
    379
    +                           ;; Blocked (ccc <= maxccc): C stays; MAXCCC
    
    380
    +                           ;; already covers it.
    
    381
    +                           nil)))))))
    
    348 382
                   (loop for ce across best-ces do (push ce result))
    
    349 383
                   (incf i best-len))
    
    350 384
                  (t
    
    351 385
                   ;; No entry: derive implicit weights (Section 10.1).
    
    352
    -              (dolist (ce (derive-implicit-elements d (aref cps i)))
    
    386
    +              (dolist (ce (derive-implicit-elements d cp))
    
    353 387
                     (push ce result))
    
    354 388
                   (incf i)))))))
    
    355 389
         (nreverse result)))
    
    356 390
     
    
    357
    -(defun collation-weights (d string)
    
    391
    +(defun collation-weights (d string &optional (variable-weighting :shifted))
    
    358 392
       "Return four values -- the level-1, level-2, level-3 and level-4 weight
    
    359
    -lists for STRING under DUCET D, using the Shifted variable-weighting
    
    360
    -option (UTS #10 Section 4).  Under Shifted: a variable element con-
    
    361
    -tributes nothing at levels 1-3 and its primary at level 4; a non-
    
    362
    -variable element takes a level-4 weight of FFFF when it carries level-2
    
    363
    -or level-3 content, but none when it is a primary-only continuation
    
    364
    -element ([.XXXX.0000.0000], e.g. the second half of an implicit weight
    
    365
    -pair or an expansion tail); a completely ignorable element
    
    366
    -([.0000.0000.0000]) contributes nothing anywhere; and a primary-
    
    367
    -ignorable element that follows a variable element is shifted away
    
    368
    -entirely."
    
    393
    +lists for STRING under DUCET D.  VARIABLE-WEIGHTING selects the UTS #10
    
    394
    +Section 4 option:
    
    395
    +
    
    396
    +  :SHIFTED (the default) -- a variable element contributes nothing at
    
    397
    +    levels 1-3 and its primary at level 4; a non-variable element takes
    
    398
    +    a level-4 weight of FFFF when it carries level-2 or level-3 content,
    
    399
    +    but none when it is a primary-only continuation element
    
    400
    +    ([.XXXX.0000.0000], e.g. the second half of an implicit weight pair
    
    401
    +    or an expansion tail); a completely ignorable element
    
    402
    +    ([.0000.0000.0000]) contributes nothing anywhere; and a primary-
    
    403
    +    ignorable element that follows a variable element is shifted away
    
    404
    +    entirely.
    
    405
    +
    
    406
    +  :NON-IGNORABLE -- variable elements are not treated specially: every
    
    407
    +    element contributes its non-zero weights at levels 1-3 just like any
    
    408
    +    other element, and there is no fourth level (the returned L4 is
    
    409
    +    always NIL)."
    
    410
    +  (ecase variable-weighting
    
    411
    +    (:shifted (collation-weights-shifted d string))
    
    412
    +    (:non-ignorable (collation-weights-non-ignorable d string))))
    
    413
    +
    
    414
    +(defun collation-weights-shifted (d string)
    
    415
    +  "Compute the four weight levels for STRING under the Shifted option.
    
    416
    +See COLLATION-WEIGHTS."
    
    369 417
       (let ((ces (ducet-element-array d (string-to-collation-codepoints string)))
    
    370 418
             (l1 nil) (l2 nil) (l3 nil) (l4 nil)
    
    371 419
             (after-variable nil))
    
    ... ... @@ -402,14 +450,35 @@ entirely."
    402 450
                (setf after-variable nil)))))
    
    403 451
         (values (nreverse l1) (nreverse l2) (nreverse l3) (nreverse l4))))
    
    404 452
     
    
    405
    -(defun collation-sort-key (d string)
    
    406
    -  "Compute the UTS #10 sort key for STRING under DUCET D (Shifted
    
    407
    -option).  Returns a (simple-array (unsigned-byte 16) (*)) holding the
    
    408
    -level-1 weights, a 0000 separator, the level-2 weights, 0000, the
    
    409
    -level-3 weights, 0000, and the level-4 weights.  Binary comparison of
    
    410
    -two such keys yields the collation order of their strings."
    
    411
    -  (multiple-value-bind (l1 l2 l3 l4) (collation-weights d string)
    
    412
    -    (let* ((weights (append l1 (list 0) l2 (list 0) l3 (list 0) l4))
    
    453
    +(defun collation-weights-non-ignorable (d string)
    
    454
    +  "Compute the weight levels for STRING under the Non-ignorable option.
    
    455
    +Variable elements are treated exactly like ordinary elements, and there
    
    456
    +is no fourth level.  See COLLATION-WEIGHTS."
    
    457
    +  (let ((ces (ducet-element-array d (string-to-collation-codepoints string)))
    
    458
    +        (l1 nil) (l2 nil) (l3 nil))
    
    459
    +    (dolist (ce ces)
    
    460
    +      (let ((p (ce-primary ce))
    
    461
    +            (s (ce-secondary ce))
    
    462
    +            (te (ce-tertiary ce)))
    
    463
    +        ;; Every element -- variable or not -- contributes its non-zero
    
    464
    +        ;; weights at each level; zero weights are passed over.
    
    465
    +        (unless (zerop p) (push p l1))
    
    466
    +        (unless (zerop s) (push s l2))
    
    467
    +        (unless (zerop te) (push te l3))))
    
    468
    +    (values (nreverse l1) (nreverse l2) (nreverse l3) nil)))
    
    469
    +
    
    470
    +(defun collation-sort-key (d string &optional (variable-weighting :shifted))
    
    471
    +  "Compute the UTS #10 sort key for STRING under DUCET D.  Returns a
    
    472
    +(simple-array (unsigned-byte 16) (*)) holding the level-1 weights, a 0000
    
    473
    +separator, the level-2 weights, 0000, the level-3 weights, and -- for the
    
    474
    +:SHIFTED option -- a further 0000 separator and the level-4 weights.
    
    475
    +Binary comparison of two such keys yields the collation order of their
    
    476
    +strings.  VARIABLE-WEIGHTING is as in COLLATION-WEIGHTS."
    
    477
    +  (multiple-value-bind (l1 l2 l3 l4)
    
    478
    +      (collation-weights d string variable-weighting)
    
    479
    +    (let* ((weights (if (eq variable-weighting :non-ignorable)
    
    480
    +                        (append l1 (list 0) l2 (list 0) l3)
    
    481
    +                        (append l1 (list 0) l2 (list 0) l3 (list 0) l4)))
    
    413 482
                (key (make-array (length weights)
    
    414 483
                                 :element-type '(unsigned-byte 16))))
    
    415 484
           (loop for w in weights
    
    ... ... @@ -417,12 +486,12 @@ two such keys yields the collation order of their strings."
    417 486
                 do (setf (aref key k) w))
    
    418 487
           key)))
    
    419 488
     
    
    420
    -(defun collation-compare (d s1 s2)
    
    489
    +(defun collation-compare (d s1 s2 &optional (variable-weighting :shifted))
    
    421 490
       "Compare strings S1 and S2 under DUCET D.  Returns -1, 0, or 1 like a
    
    422 491
     three-way comparison: negative if S1 sorts before S2, zero if equal, 1
    
    423
    -if after."
    
    424
    -  (let ((k1 (collation-sort-key d s1))
    
    425
    -        (k2 (collation-sort-key d s2)))
    
    492
    +if after.  VARIABLE-WEIGHTING is as in COLLATION-WEIGHTS."
    
    493
    +  (let ((k1 (collation-sort-key d s1 variable-weighting))
    
    494
    +        (k2 (collation-sort-key d s2 variable-weighting)))
    
    426 495
         (let ((n (min (length k1) (length k2))))
    
    427 496
           (dotimes (i n)
    
    428 497
             (let ((a (aref k1 i)) (b (aref k2 i)))
    

  • tests/unicode-collation.lisp
    ... ... @@ -12,6 +12,8 @@
    12 12
     (defvar *collation-allkeys* "target:i18n/allkeys.txt")
    
    13 13
     (defvar *collation-shifted-test*
    
    14 14
       "target:i18n/CollationTest/CollationTest_SHIFTED.txt")
    
    15
    +(defvar *collation-non-ignorable-test*
    
    16
    +  "target:i18n/CollationTest/CollationTest_NON_IGNORABLE.txt")
    
    15 17
     
    
    16 18
     (defvar *ducet* nil
    
    17 19
       "The Default Unicode Collation Element Table, loaded on first use.")
    
    ... ... @@ -107,3 +109,26 @@ comment."
    107 109
                          (assert-equalp (list e1 e2 e3 e4)
    
    108 110
                                         (list g1 g2 g3 g4)
    
    109 111
                                         cps))))))))
    
    112
    +
    
    113
    +(define-test unicode.collation-non-ignorable
    
    114
    +  "Test UTS #10 collation sort keys against the UCA NON_IGNORABLE
    
    115
    +conformance data.  Under the Non-ignorable option variable elements keep
    
    116
    +their weights and there is no fourth level, so for each line the three
    
    117
    +weight levels produced by LISP::COLLATION-WEIGHTS with :NON-IGNORABLE
    
    118
    +must match the expected key in the line's comment."
    
    119
    +  (:tag :unicode)
    
    120
    +  (let ((ducet (ducet)))
    
    121
    +    (with-open-file (s *collation-non-ignorable-test* :direction :input
    
    122
    +                       :external-format :utf-8)
    
    123
    +      (loop for line = (read-line s nil nil)
    
    124
    +            while line
    
    125
    +            do
    
    126
    +               (multiple-value-bind (cps e1 e2 e3)
    
    127
    +                   (collation-parse-test-line line)
    
    128
    +                 (when cps
    
    129
    +                   (multiple-value-bind (g1 g2 g3)
    
    130
    +                       (lisp::collation-weights ducet (collation-test-string cps)
    
    131
    +                                                :non-ignorable)
    
    132
    +                     (assert-equalp (list e1 e2 e3)
    
    133
    +                                    (list g1 g2 g3)
    
    134
    +                                    cps))))))))