Raymond Toy pushed to branch rtoy-unicode-collation-ducet at cmucl / cmucl

Commits:

2 changed files:

Changes:

  • src/code/exports.lisp
    ... ... @@ -2307,7 +2307,13 @@
    2307 2307
       (:use "COMMON-LISP")
    
    2308 2308
       (:shadow "STRING-CAPITALIZE"
    
    2309 2309
     	   "STRING-DOWNCASE"
    
    2310
    -	   "STRING-UPCASE")
    
    2310
    +	   "STRING-UPCASE"
    
    2311
    +	   "STRING<"
    
    2312
    +	   "STRING<="
    
    2313
    +	   "STRING="
    
    2314
    +	   "STRING/="
    
    2315
    +	   "STRING>"
    
    2316
    +	   "STRING>=")
    
    2311 2317
       (:import-from "LISP"
    
    2312 2318
     		"CODEPOINT"
    
    2313 2319
     		"SURROGATES"
    
    ... ... @@ -2325,6 +2331,12 @@
    2325 2331
       (:export "STRING-CAPITALIZE"
    
    2326 2332
     	   "STRING-DOWNCASE"
    
    2327 2333
     	   "STRING-UPCASE"
    
    2328
    -	   "STRING-NEXT-WORD-BREAK"))
    
    2334
    +	   "STRING-NEXT-WORD-BREAK"
    
    2335
    +	   "STRING<"
    
    2336
    +	   "STRING<="
    
    2337
    +	   "STRING="
    
    2338
    +	   "STRING/="
    
    2339
    +	   "STRING>"
    
    2340
    +	   "STRING>="))
    
    2329 2341
     
    
    2330 2342
     

  • src/code/unicode-collation.lisp
    ... ... @@ -500,3 +500,93 @@ if after. VARIABLE-WEIGHTING is as in COLLATION-WEIGHTS."
    500 500
           (cond ((< (length k1) (length k2)) -1)
    
    501 501
                 ((> (length k1) (length k2)) 1)
    
    502 502
                 (t 0)))))
    
    503
    +
    
    504
    +
    
    505
    +;;; -------------------------------------------------------------------
    
    506
    +;;; Public collation API (UNICODE package): the Unicode-aware
    
    507
    +;;; equivalents of the COMMON-LISP string comparison functions.
    
    508
    +;;;
    
    509
    +;;; These compare strings by the Unicode Collation Algorithm rather than
    
    510
    +;;; by code-point order, so the result reflects linguistic sort order
    
    511
    +;;; (after NFD normalization, with contractions, expansions and the
    
    512
    +;;; chosen variable-weighting option).  The Default Unicode Collation
    
    513
    +;;; Element Table is loaded lazily on first use.
    
    514
    +;;;
    
    515
    +;;; Unlike the COMMON-LISP functions, these return a generalized boolean
    
    516
    +;;; (T or NIL) rather than a mismatch index: the comparison is performed
    
    517
    +;;; on sort keys derived from the whole normalized string, so there is
    
    518
    +;;; no meaningful character index of the first difference to return.
    
    519
    +;;; -------------------------------------------------------------------
    
    520
    +
    
    521
    +(in-package "UNICODE")
    
    522
    +
    
    523
    +(defvar *collation-table-path* "ext-formats:allkeys.txt"
    
    524
    +  "Pathname of the DUCET data file (allkeys.txt) from which the default
    
    525
    +collation table is loaded on first use.")
    
    526
    +
    
    527
    +(defvar *collation-table* nil
    
    528
    +  "The default Unicode collation table, or NIL if it has not yet been
    
    529
    +loaded.  Loaded lazily from *COLLATION-TABLE-PATH* the first time a
    
    530
    +collation function needs it.  Set to NIL to force a reload.")
    
    531
    +
    
    532
    +(defun collation-table ()
    
    533
    +  "Return the default Unicode collation table, loading it from
    
    534
    +*COLLATION-TABLE-PATH* on first use."
    
    535
    +  (or *collation-table*
    
    536
    +      (setf *collation-table* (lisp::load-ducet *collation-table-path*))))
    
    537
    +
    
    538
    +(defun %collation-compare (string1 string2 start1 end1 start2 end2
    
    539
    +                           variable-weighting)
    
    540
    +  "Three-way collation comparison of the designated substrings of
    
    541
    +STRING1 and STRING2: returns a negative integer, zero, or a positive
    
    542
    +integer as the first sorts before, equal to, or after the second."
    
    543
    +  (let ((s1 (string string1))
    
    544
    +        (s2 (string string2)))
    
    545
    +    (when (or (/= start1 0) end1)
    
    546
    +      (setf s1 (subseq s1 start1 end1)))
    
    547
    +    (when (or (/= start2 0) end2)
    
    548
    +      (setf s2 (subseq s2 start2 end2)))
    
    549
    +    (lisp::collation-compare (collation-table) s1 s2 variable-weighting)))
    
    550
    +
    
    551
    +(defmacro %def-collation-predicate (name test docstring)
    
    552
    +  "Define a collation comparison predicate NAME whose result is (TEST c)
    
    553
    +where c is the three-way comparison of the two string arguments."
    
    554
    +  `(defun ,name (string1 string2 &key (start1 0) end1 (start2 0) end2
    
    555
    +                                      (variable-weighting :shifted))
    
    556
    +     ,docstring
    
    557
    +     (let ((c (%collation-compare string1 string2
    
    558
    +                                  start1 end1 start2 end2
    
    559
    +                                  variable-weighting)))
    
    560
    +       (,test c))))
    
    561
    +
    
    562
    +(%def-collation-predicate string= zerop
    
    563
    +  "Return true if STRING1 and STRING2 collate as equal under the Unicode
    
    564
    +Collation Algorithm.  Note that this is collation equality, not
    
    565
    +code-point identity: canonically equivalent strings, and strings that
    
    566
    +differ only in collation-ignorable ways, compare equal.  START1, END1,
    
    567
    +START2 and END2 bound the substrings compared; VARIABLE-WEIGHTING is
    
    568
    +:SHIFTED (the default) or :NON-IGNORABLE.")
    
    569
    +
    
    570
    +(%def-collation-predicate string/= (lambda (c) (not (zerop c)))
    
    571
    +  "Return true if STRING1 and STRING2 do not collate as equal.  See
    
    572
    +UNICODE:STRING= for the meaning of the keyword arguments.")
    
    573
    +
    
    574
    +(%def-collation-predicate string< minusp
    
    575
    +  "Return true if STRING1 collates before STRING2 under the Unicode
    
    576
    +Collation Algorithm.  See UNICODE:STRING= for the meaning of the keyword
    
    577
    +arguments.")
    
    578
    +
    
    579
    +(%def-collation-predicate string> plusp
    
    580
    +  "Return true if STRING1 collates after STRING2 under the Unicode
    
    581
    +Collation Algorithm.  See UNICODE:STRING= for the meaning of the keyword
    
    582
    +arguments.")
    
    583
    +
    
    584
    +(%def-collation-predicate string<= (lambda (c) (not (plusp c)))
    
    585
    +  "Return true if STRING1 collates before or equal to STRING2 under the
    
    586
    +Unicode Collation Algorithm.  See UNICODE:STRING= for the meaning of the
    
    587
    +keyword arguments.")
    
    588
    +
    
    589
    +(%def-collation-predicate string>= (lambda (c) (not (minusp c)))
    
    590
    +  "Return true if STRING1 collates after or equal to STRING2 under the
    
    591
    +Unicode Collation Algorithm.  See UNICODE:STRING= for the meaning of the
    
    592
    +keyword arguments.")