Raymond Toy pushed to branch rtoy-unicode-collation-ducet at cmucl / cmucl

Commits:

3 changed files:

Changes:

  • src/code/exports.lisp
    ... ... @@ -2313,7 +2313,13 @@
    2313 2313
     	   "STRING="
    
    2314 2314
     	   "STRING/="
    
    2315 2315
     	   "STRING>"
    
    2316
    -	   "STRING>=")
    
    2316
    +	   "STRING>="
    
    2317
    +	   "STRING-EQUAL"
    
    2318
    +	   "STRING-NOT-EQUAL"
    
    2319
    +	   "STRING-LESSP"
    
    2320
    +	   "STRING-GREATERP"
    
    2321
    +	   "STRING-NOT-GREATERP"
    
    2322
    +	   "STRING-NOT-LESSP")
    
    2317 2323
       (:import-from "LISP"
    
    2318 2324
     		"CODEPOINT"
    
    2319 2325
     		"SURROGATES"
    
    ... ... @@ -2337,6 +2343,12 @@
    2337 2343
     	   "STRING="
    
    2338 2344
     	   "STRING/="
    
    2339 2345
     	   "STRING>"
    
    2340
    -	   "STRING>="))
    
    2346
    +	   "STRING>="
    
    2347
    +	   "STRING-EQUAL"
    
    2348
    +	   "STRING-NOT-EQUAL"
    
    2349
    +	   "STRING-LESSP"
    
    2350
    +	   "STRING-GREATERP"
    
    2351
    +	   "STRING-NOT-GREATERP"
    
    2352
    +	   "STRING-NOT-LESSP"))
    
    2341 2353
     
    
    2342 2354
     

  • src/code/unicode-collation.lisp
    ... ... @@ -467,18 +467,28 @@ is no fourth level. See COLLATION-WEIGHTS."
    467 467
             (unless (zerop te) (push te l3))))
    
    468 468
         (values (nreverse l1) (nreverse l2) (nreverse l3) nil)))
    
    469 469
     
    
    470
    -(defun collation-sort-key (d string &optional (variable-weighting :shifted))
    
    470
    +(defun collation-sort-key (d string &optional (variable-weighting :shifted)
    
    471
    +                                              (strength :tertiary))
    
    471 472
       "Compute the UTS #10 sort key for STRING under DUCET D.  Returns a
    
    472
    -(simple-array (unsigned-byte 16) (*)) holding the level-1 weights, a 0000
    
    473
    -separator, the level-2 weights, 0000, the level-3 weights, and -- for the
    
    474
    -:SHIFTED option -- a further 0000 separator and the level-4 weights.
    
    475
    -Binary comparison of two such keys yields the collation order of their
    
    476
    -strings.  VARIABLE-WEIGHTING is as in COLLATION-WEIGHTS."
    
    473
    +(simple-array (unsigned-byte 16) (*)) holding the weight levels separated
    
    474
    +by 0000: level 1, level 2, level 3, and -- under the :SHIFTED option --
    
    475
    +level 4.  Binary comparison of two such keys yields the collation order
    
    476
    +of their strings.  VARIABLE-WEIGHTING is as in COLLATION-WEIGHTS.
    
    477
    +
    
    478
    +STRENGTH bounds the levels included in the key, and hence the
    
    479
    +distinctions the comparison makes: :PRIMARY (base letters only),
    
    480
    +:SECONDARY (also accents), :TERTIARY (also case; the default), or
    
    481
    +:QUATERNARY (also the level-4 weights, which exist only under the
    
    482
    +:SHIFTED option and otherwise add nothing).  A lower strength makes
    
    483
    +more strings compare equal; for example :SECONDARY ignores case."
    
    477 484
       (multiple-value-bind (l1 l2 l3 l4)
    
    478 485
           (collation-weights d string variable-weighting)
    
    479
    -    (let* ((weights (if (eq variable-weighting :non-ignorable)
    
    480
    -                        (append l1 (list 0) l2 (list 0) l3)
    
    481
    -                        (append l1 (list 0) l2 (list 0) l3 (list 0) l4)))
    
    486
    +    (let* ((weights (ecase strength
    
    487
    +                      (:primary l1)
    
    488
    +                      (:secondary (append l1 (list 0) l2))
    
    489
    +                      (:tertiary (append l1 (list 0) l2 (list 0) l3))
    
    490
    +                      (:quaternary
    
    491
    +                       (append l1 (list 0) l2 (list 0) l3 (list 0) l4))))
    
    482 492
                (key (make-array (length weights)
    
    483 493
                                 :element-type '(unsigned-byte 16))))
    
    484 494
           (loop for w in weights
    
    ... ... @@ -486,12 +496,13 @@ strings. VARIABLE-WEIGHTING is as in COLLATION-WEIGHTS."
    486 496
                 do (setf (aref key k) w))
    
    487 497
           key)))
    
    488 498
     
    
    489
    -(defun collation-compare (d s1 s2 &optional (variable-weighting :shifted))
    
    499
    +(defun collation-compare (d s1 s2 &optional (variable-weighting :shifted)
    
    500
    +                                            (strength :tertiary))
    
    490 501
       "Compare strings S1 and S2 under DUCET D.  Returns -1, 0, or 1 like a
    
    491 502
     three-way comparison: negative if S1 sorts before S2, zero if equal, 1
    
    492
    -if after.  VARIABLE-WEIGHTING is as in COLLATION-WEIGHTS."
    
    493
    -  (let ((k1 (collation-sort-key d s1 variable-weighting))
    
    494
    -        (k2 (collation-sort-key d s2 variable-weighting)))
    
    503
    +if after.  VARIABLE-WEIGHTING and STRENGTH are as in COLLATION-SORT-KEY."
    
    504
    +  (let ((k1 (collation-sort-key d s1 variable-weighting strength))
    
    505
    +        (k2 (collation-sort-key d s2 variable-weighting strength)))
    
    495 506
         (let ((n (min (length k1) (length k2))))
    
    496 507
           (dotimes (i n)
    
    497 508
             (let ((a (aref k1 i)) (b (aref k2 i)))
    
    ... ... @@ -615,7 +626,7 @@ collation section of unidata.bin on first use."
    615 626
           (setf *collation-table* (lisp::unidata-ducet))))
    
    616 627
     
    
    617 628
     (defun %collation-compare (string1 string2 start1 end1 start2 end2
    
    618
    -                           variable-weighting)
    
    629
    +                           variable-weighting strength)
    
    619 630
       "Three-way collation comparison of the designated substrings of
    
    620 631
     STRING1 and STRING2: returns a negative integer, zero, or a positive
    
    621 632
     integer as the first sorts before, equal to, or after the second."
    
    ... ... @@ -625,47 +636,87 @@ integer as the first sorts before, equal to, or after the second."
    625 636
           (setf s1 (subseq s1 start1 end1)))
    
    626 637
         (when (or (/= start2 0) end2)
    
    627 638
           (setf s2 (subseq s2 start2 end2)))
    
    628
    -    (lisp::collation-compare (collation-table) s1 s2 variable-weighting)))
    
    639
    +    (lisp::collation-compare (collation-table) s1 s2
    
    640
    +                             variable-weighting strength)))
    
    629 641
     
    
    630
    -(defmacro %def-collation-predicate (name test docstring)
    
    642
    +(defmacro %def-collation-predicate (name test default-strength docstring)
    
    631 643
       "Define a collation comparison predicate NAME whose result is (TEST c)
    
    632
    -where c is the three-way comparison of the two string arguments."
    
    644
    +where c is the three-way comparison of the two string arguments.
    
    645
    +DEFAULT-STRENGTH is the default value of the STRENGTH keyword."
    
    633 646
       `(defun ,name (string1 string2 &key (start1 0) end1 (start2 0) end2
    
    634
    -                                      (variable-weighting :shifted))
    
    647
    +                                      (variable-weighting :shifted)
    
    648
    +                                      (strength ,default-strength))
    
    635 649
          ,docstring
    
    636 650
          (let ((c (%collation-compare string1 string2
    
    637 651
                                       start1 end1 start2 end2
    
    638
    -                                  variable-weighting)))
    
    652
    +                                  variable-weighting strength)))
    
    639 653
            (,test c))))
    
    640 654
     
    
    641
    -(%def-collation-predicate string= zerop
    
    655
    +(%def-collation-predicate string= zerop :tertiary
    
    642 656
       "Return true if STRING1 and STRING2 collate as equal under the Unicode
    
    643 657
     Collation Algorithm.  Note that this is collation equality, not
    
    644 658
     code-point identity: canonically equivalent strings, and strings that
    
    645 659
     differ only in collation-ignorable ways, compare equal.  START1, END1,
    
    646
    -START2 and END2 bound the substrings compared; VARIABLE-WEIGHTING is
    
    647
    -:SHIFTED (the default) or :NON-IGNORABLE.")
    
    660
    +START2 and END2 bound the substrings compared.  VARIABLE-WEIGHTING is
    
    661
    +:SHIFTED (the default) or :NON-IGNORABLE.  STRENGTH is :PRIMARY,
    
    662
    +:SECONDARY, :TERTIARY (the default), or :QUATERNARY, as in
    
    663
    +LISP::COLLATION-SORT-KEY; a lower strength makes more strings compare
    
    664
    +equal -- :SECONDARY, for instance, ignores case.")
    
    648 665
     
    
    649
    -(%def-collation-predicate string/= (lambda (c) (not (zerop c)))
    
    666
    +(%def-collation-predicate string/= (lambda (c) (not (zerop c))) :tertiary
    
    650 667
       "Return true if STRING1 and STRING2 do not collate as equal.  See
    
    651 668
     UNICODE:STRING= for the meaning of the keyword arguments.")
    
    652 669
     
    
    653
    -(%def-collation-predicate string< minusp
    
    670
    +(%def-collation-predicate string< minusp :tertiary
    
    654 671
       "Return true if STRING1 collates before STRING2 under the Unicode
    
    655 672
     Collation Algorithm.  See UNICODE:STRING= for the meaning of the keyword
    
    656 673
     arguments.")
    
    657 674
     
    
    658
    -(%def-collation-predicate string> plusp
    
    675
    +(%def-collation-predicate string> plusp :tertiary
    
    659 676
       "Return true if STRING1 collates after STRING2 under the Unicode
    
    660 677
     Collation Algorithm.  See UNICODE:STRING= for the meaning of the keyword
    
    661 678
     arguments.")
    
    662 679
     
    
    663
    -(%def-collation-predicate string<= (lambda (c) (not (plusp c)))
    
    680
    +(%def-collation-predicate string<= (lambda (c) (not (plusp c))) :tertiary
    
    664 681
       "Return true if STRING1 collates before or equal to STRING2 under the
    
    665 682
     Unicode Collation Algorithm.  See UNICODE:STRING= for the meaning of the
    
    666 683
     keyword arguments.")
    
    667 684
     
    
    668
    -(%def-collation-predicate string>= (lambda (c) (not (minusp c)))
    
    685
    +(%def-collation-predicate string>= (lambda (c) (not (minusp c))) :tertiary
    
    669 686
       "Return true if STRING1 collates after or equal to STRING2 under the
    
    670 687
     Unicode Collation Algorithm.  See UNICODE:STRING= for the meaning of the
    
    671 688
     keyword arguments.")
    
    689
    +
    
    690
    +;;; The case-insensitive comparison functions, the Unicode analogs of
    
    691
    +;;; the COMMON-LISP -EQUAL/-LESSP/... family.  They default to :SECONDARY
    
    692
    +;;; strength, which drops the tertiary level where case is encoded, so
    
    693
    +;;; they ignore case (and other tertiary distinctions, such as width)
    
    694
    +;;; while remaining sensitive to base letters and accents.  This is the
    
    695
    +;;; closest collation analog of case-folded comparison; the Unicode
    
    696
    +;;; Collation Algorithm has no operation that folds case alone.
    
    697
    +
    
    698
    +(%def-collation-predicate string-equal zerop :secondary
    
    699
    +  "Return true if STRING1 and STRING2 collate as equal ignoring case,
    
    700
    +under the Unicode Collation Algorithm.  Like UNICODE:STRING= but
    
    701
    +defaulting to :SECONDARY strength; see it for the keyword arguments.")
    
    702
    +
    
    703
    +(%def-collation-predicate string-not-equal (lambda (c) (not (zerop c))) :secondary
    
    704
    +  "Return true if STRING1 and STRING2 do not collate as equal ignoring
    
    705
    +case.  See UNICODE:STRING-EQUAL for the keyword arguments.")
    
    706
    +
    
    707
    +(%def-collation-predicate string-lessp minusp :secondary
    
    708
    +  "Return true if STRING1 collates before STRING2 ignoring case, under
    
    709
    +the Unicode Collation Algorithm.  See UNICODE:STRING-EQUAL for the
    
    710
    +keyword arguments.")
    
    711
    +
    
    712
    +(%def-collation-predicate string-greaterp plusp :secondary
    
    713
    +  "Return true if STRING1 collates after STRING2 ignoring case.  See
    
    714
    +UNICODE:STRING-EQUAL for the keyword arguments.")
    
    715
    +
    
    716
    +(%def-collation-predicate string-not-greaterp (lambda (c) (not (plusp c))) :secondary
    
    717
    +  "Return true if STRING1 collates before or equal to STRING2 ignoring
    
    718
    +case.  See UNICODE:STRING-EQUAL for the keyword arguments.")
    
    719
    +
    
    720
    +(%def-collation-predicate string-not-lessp (lambda (c) (not (minusp c))) :secondary
    
    721
    +  "Return true if STRING1 collates after or equal to STRING2 ignoring
    
    722
    +case.  See UNICODE:STRING-EQUAL for the keyword arguments.")

  • tests/unicode-collation.lisp
    ... ... @@ -145,6 +145,83 @@ must match the expected key in the line's comment."
    145 145
       (run-collation-conformance (ducet) *collation-non-ignorable-test*
    
    146 146
                                  :non-ignorable))
    
    147 147
     
    
    148
    +;;; Tests for the public UNICODE comparison functions.  The conformance
    
    149
    +;;; tests above already validate the collation weights themselves; these
    
    150
    +;;; check the thin wrappers -- that each predicate maps the comparison to
    
    151
    +;;; the right boolean, and that the STRENGTH and VARIABLE-WEIGHTING
    
    152
    +;;; options and the case-insensitive variants behave as documented.
    
    153
    +
    
    154
    +(define-test unicode.string-predicates
    
    155
    +  "The six case-sensitive comparison predicates map the collation order
    
    156
    +of a known pair to the correct boolean, including the reflexive case."
    
    157
    +  (:tag :unicode)
    
    158
    +  (assert-true (unicode:string< "a" "b"))
    
    159
    +  (assert-false (unicode:string< "b" "a"))
    
    160
    +  (assert-false (unicode:string< "a" "a"))
    
    161
    +  (assert-true (unicode:string> "b" "a"))
    
    162
    +  (assert-false (unicode:string> "a" "b"))
    
    163
    +  (assert-true (unicode:string<= "a" "b"))
    
    164
    +  (assert-true (unicode:string<= "a" "a"))
    
    165
    +  (assert-false (unicode:string<= "b" "a"))
    
    166
    +  (assert-true (unicode:string>= "b" "a"))
    
    167
    +  (assert-true (unicode:string>= "a" "a"))
    
    168
    +  (assert-false (unicode:string>= "a" "b"))
    
    169
    +  (assert-true (unicode:string= "a" "a"))
    
    170
    +  (assert-false (unicode:string= "a" "b"))
    
    171
    +  (assert-true (unicode:string/= "a" "b"))
    
    172
    +  (assert-false (unicode:string/= "a" "a")))
    
    173
    +
    
    174
    +(define-test unicode.string-designators-and-bounds
    
    175
    +  "Comparison accepts string designators (characters, symbols) like the
    
    176
    +COMMON-LISP functions, and honors START/END substring bounds."
    
    177
    +  (:tag :unicode)
    
    178
    +  (assert-true (unicode:string= #\a #\a))
    
    179
    +  (assert-true (unicode:string= 'abc "ABC"))
    
    180
    +  (assert-true (unicode:string= "xab" "yab" :start1 1 :start2 1))
    
    181
    +  (assert-false (unicode:string= "xab" "yzb" :start1 1 :start2 1)))
    
    182
    +
    
    183
    +(define-test unicode.string-strength
    
    184
    +  "STRENGTH bounds the distinctions made: case is a tertiary difference,
    
    185
    +an accent a secondary one."
    
    186
    +  (:tag :unicode)
    
    187
    +  ;; Default (tertiary): case matters.  Lowercase sorts before uppercase.
    
    188
    +  (assert-true (unicode:string< "abc" "ABC"))
    
    189
    +  (assert-false (unicode:string< "ABC" "abc"))
    
    190
    +  (assert-false (unicode:string= "ABC" "abc"))
    
    191
    +  ;; Secondary: case ignored, but accents still distinguish.
    
    192
    +  (assert-true (unicode:string= "ABC" "abc" :strength :secondary))
    
    193
    +  (assert-false (unicode:string= "café" "cafe" :strength :secondary))
    
    194
    +  ;; Primary: accents ignored too.
    
    195
    +  (assert-true (unicode:string= "café" "cafe" :strength :primary))
    
    196
    +  ;; Quaternary: under the default :SHIFTED option the level-4 weights
    
    197
    +  ;; make variable elements (here the hyphen) break an otherwise equal
    
    198
    +  ;; comparison, where :TERTIARY ignores them.
    
    199
    +  (assert-true (unicode:string= "co-op" "coop"))
    
    200
    +  (assert-false (unicode:string= "co-op" "coop" :strength :quaternary)))
    
    201
    +
    
    202
    +(define-test unicode.string-case-insensitive
    
    203
    +  "The case-insensitive family ignores case (secondary strength) while
    
    204
    +remaining sensitive to accents."
    
    205
    +  (:tag :unicode)
    
    206
    +  (assert-true (unicode:string-equal "ABC" "abc"))
    
    207
    +  (assert-false (unicode:string-not-equal "ABC" "abc"))
    
    208
    +  (assert-false (unicode:string-lessp "ABC" "abc"))
    
    209
    +  (assert-false (unicode:string-greaterp "ABC" "abc"))
    
    210
    +  (assert-true (unicode:string-not-greaterp "ABC" "abc"))
    
    211
    +  (assert-true (unicode:string-not-lessp "ABC" "abc"))
    
    212
    +  ;; An accent is a secondary difference, so it still distinguishes.
    
    213
    +  (assert-false (unicode:string-equal "café" "cafe")))
    
    214
    +
    
    215
    +(define-test unicode.string-variable-weighting
    
    216
    +  "Under the default :SHIFTED option variable elements (punctuation) are
    
    217
    +ignored; under :NON-IGNORABLE they are significant."
    
    218
    +  (:tag :unicode)
    
    219
    +  ;; Shifted (default): the hyphen is ignored, so the strings collate equal.
    
    220
    +  (assert-true (unicode:string= "co-op" "coop"))
    
    221
    +  ;; Non-ignorable: the hyphen counts, so they are not equal.
    
    222
    +  (assert-false (unicode:string= "co-op" "coop"
    
    223
    +                                     :variable-weighting :non-ignorable)))
    
    224
    +
    
    148 225
     ;; A DEFINE-TEST body is stored as source and run interpreted, and the
    
    149 226
     ;; test runner (tests/run-tests.lisp) loads this file as source, so its
    
    150 227
     ;; functions would otherwise run interpreted.  The per-line parsing and