Raymond Toy pushed to branch issue-511-update-unicode-tests at cmucl / cmucl

Commits:

6 changed files:

Changes:

  • .gitlab-ci.yml
    ... ... @@ -7,7 +7,7 @@ variables:
    7 7
       download_url: "https://common-lisp.net/project/cmucl/downloads/release/$release"
    
    8 8
       version: "$release-x86"
    
    9 9
       tar_ext: "xz"
    
    10
    -  bootstrap: ""
    
    10
    +  bootstrap: "-B boot-2026-06-01"
    
    11 11
     
    
    12 12
     workflow:
    
    13 13
       rules:
    

  • bin/build.sh
    ... ... @@ -38,7 +38,7 @@ ENABLE2="yes"
    38 38
     ENABLE3="yes"
    
    39 39
     ENABLE4="yes"
    
    40 40
     
    
    41
    -version=21e
    
    41
    +version=21f
    
    42 42
     SRCDIR=src
    
    43 43
     BINDIR=bin
    
    44 44
     TOOLDIR=$BINDIR
    

  • src/bootfiles/21f/boot-2026-06-1.lisp
    1
    +;; Bootstrap file for adding the Extended_Pictographic word-break
    
    2
    +;; support (Unicode 17.0 word-break rule WB3c).
    
    3
    +;;
    
    4
    +;; The new function LISP::UNICODE-EXTENDED-PICTOGRAPHIC-P is imported
    
    5
    +;; into the UNICODE package by code/exports.lisp.  When the new
    
    6
    +;; exports.lisp is compiled by the bootstrapping lisp, that symbol does
    
    7
    +;; not yet exist in the LISP package, so the (:import-from "LISP" ...)
    
    8
    +;; clause would fail.  Intern it here first so the package definition
    
    9
    +;; can be read.
    
    10
    +
    
    11
    +(in-package :lisp)
    
    12
    +
    
    13
    +(intern "UNICODE-EXTENDED-PICTOGRAPHIC-P" "LISP")

  • src/code/exports.lisp
    ... ... @@ -2321,7 +2321,8 @@
    2321 2321
     		"+UNICODE-CATEGORY-UPPER+"
    
    2322 2322
     		"+UNICODE-CATEGORY-TITLE+"
    
    2323 2323
     		"UNICODE-UPPER"
    
    2324
    -		"UNICODE-WORD-BREAK")
    
    2324
    +		"UNICODE-WORD-BREAK"
    
    2325
    +		"UNICODE-EXTENDED-PICTOGRAPHIC-P")
    
    2325 2326
       (:export "STRING-CAPITALIZE"
    
    2326 2327
     	   "STRING-DOWNCASE"
    
    2327 2328
     	   "STRING-UPCASE"
    

  • src/code/unicode.lisp
    ... ... @@ -315,178 +315,155 @@
    315 315
       character extends to the end of S.  If the index is negative or
    
    316 316
       valid index into S, the returned value will be strictly greater than
    
    317 317
       the index."
    
    318
    +  ;; Implements the word-boundary rules of UAX #29 (Unicode 17.0).
    
    319
    +  ;;
    
    320
    +  ;; Decode S into codepoints (collapsing UTF-16 surrogate pairs),
    
    321
    +  ;; classify each by its word-break property, then scan boundaries
    
    322
    +  ;; left to right applying the WB rules, returning the first boundary
    
    323
    +  ;; whose string index is strictly greater than I.
    
    324
    +  ;;
    
    325
    +  ;; WB4 makes Extend, Format and ZWJ "ignorable": they attach to the
    
    326
    +  ;; preceding context, so most rules compare a candidate against the
    
    327
    +  ;; nearest non-ignorable element on each side rather than the literal
    
    328
    +  ;; neighbour.  ZWJ is also significant for WB3c, and Extend can sit
    
    329
    +  ;; between paired Regional_Indicators, so those rules look past
    
    330
    +  ;; ignorables explicitly.
    
    331
    +  (declare (type simple-string s))
    
    318 332
       (let ((n (length s)))
    
    319
    -    (labels
    
    320
    -	((char-word-break-category (c)
    
    321
    -	   ;; Map our unicode word break property into what this
    
    322
    -	   ;; algorithm wants.
    
    323
    -	   (let ((cat (unicode-word-break c)))
    
    324
    -	     (case cat
    
    325
    -	       ((:lf :cr :newline)
    
    326
    -		:sep)
    
    327
    -	       ((:extend :format)
    
    328
    -		:extend-or-format)
    
    329
    -	       (otherwise cat))))
    
    330
    -	 (left-context (i)
    
    331
    -	   ;; Given a valid index i into s, returns the left context
    
    332
    -	   ;; at i.
    
    333
    -	   (multiple-value-bind (c widep)
    
    334
    -	       (codepoint s i n)
    
    335
    -	     (let* ((back
    
    336
    -		     ;; If we're at a regular character or a leading
    
    337
    -		     ;; surrogate, decrementing by 1 gets us the to
    
    338
    -		     ;; previous character.  But for a trailing
    
    339
    -		     ;; surrogate, we need to decrement by 2!
    
    340
    -		     (if (eql widep -1)
    
    341
    -			 2
    
    342
    -			 1))
    
    343
    -		    (cat (char-word-break-category c)))
    
    344
    -	       (case cat
    
    345
    -		 ((:sep)
    
    346
    -		  (if (= c (char-code #\return)) :cr cat))
    
    347
    -		 ((:midletter :midnumlet)
    
    348
    -		  (let ((i-1 (- i back)))
    
    349
    -		    (if (and (<= 0 i-1)
    
    350
    -			     (eq (left-context i-1) :aletter))
    
    351
    -			:aletter-midletter
    
    352
    -			cat)))
    
    353
    -		 ((:midnum :midnumlet)
    
    354
    -		  (let ((i-1 (- i back)))
    
    355
    -		    (if (and (<= 0 i-1)
    
    356
    -			     (eq (left-context i-1) :numeric))
    
    357
    -			:numeric-midnum
    
    358
    -			cat)))
    
    359
    -		 ((:extendorformat)
    
    360
    -		  (if (< 0 i)
    
    361
    -		      (left-context (- i back))
    
    362
    -		      :other))
    
    363
    -		 (otherwise cat)))))
    
    364
    -
    
    365
    -	 (index-of-previous-non-ignored (j)
    
    366
    -	   ;; Returns the index of the last non-Extend, non-Format
    
    367
    -	   ;; character within (substring s 0 j).  Should not be
    
    368
    -	   ;; called unless such a character exists.
    
    369
    -
    
    370
    -	   (let* ((j1 (- j 1)))
    
    371
    -	     (multiple-value-bind (c widep)
    
    372
    -		 (codepoint s j1)
    
    373
    -	       (when (eql widep -1)
    
    374
    -		 ;; Back up one more if we're at the trailing
    
    375
    -		 ;; surrogate.
    
    376
    -		 (decf j1))
    
    377
    -	       (let ((cat (char-word-break-category c)))
    
    378
    -		 (case cat
    
    379
    -		   ((:extend-or-format)
    
    380
    -		    (index-of-previous-non-ignored j1))
    
    381
    -		   (otherwise j1))))))
    
    382
    -
    
    383
    -	 (lookup (j context)
    
    384
    -	   ;; Given j and the context to the left of (not including) j,
    
    385
    -	   ;; returns the index at the start of the next word
    
    386
    -	   ;; (or before which a word break is permitted).
    
    387
    -
    
    388
    -	   (if (>= j n)
    
    389
    -	       (case context
    
    390
    -		 ((:aletter-midletter :numeric-midnum)
    
    391
    -		  (let ((j (index-of-previous-non-ignored n)))
    
    392
    -		    (if (< i j) j n)))
    
    393
    -		 (otherwise n))
    
    394
    -	       (multiple-value-bind (c widep)
    
    395
    -		   (codepoint s j)
    
    396
    -		 (let* ((next-j
    
    397
    -			 ;; The next character is either 1 or 2 code
    
    398
    -			 ;; units away.  For a leading surrogate, it's
    
    399
    -			 ;; 2; Otherwise just 1.
    
    400
    -			 (if (eql widep 1)
    
    401
    -			     2
    
    402
    -			     1))
    
    403
    -			(cat (char-word-break-category c)))
    
    404
    -		   (case cat
    
    405
    -		     ((:extend-or-format)
    
    406
    -		      (case context
    
    407
    -			((:cr :sep) j)
    
    408
    -			(otherwise (lookup (+ j next-j) context))))
    
    409
    -		     (otherwise
    
    410
    -		      (case context
    
    411
    -			((:cr)
    
    412
    -			 (if (= c (char-code #\linefeed))
    
    413
    -			     ;; Rule WB3:  Don't break CRLF, continue looking
    
    414
    -			     (lookup (+ j next-j) cat)
    
    415
    -			     j))
    
    416
    -			((:aletter)
    
    417
    -			 (case cat
    
    418
    -			   ((:aletter :numeric :extendnumlet)
    
    419
    -			    ;; Rules WB5, WB9, ?
    
    420
    -			    (lookup (+ j next-j) cat))
    
    421
    -			   ((:midletter :midnumlet)
    
    422
    -			    ;; Rule WB6, need to keep looking
    
    423
    -			    (lookup (+ j next-j) :aletter-midletter))
    
    424
    -			   (otherwise j)))
    
    425
    -			((:aletter-midletter)
    
    426
    -			 (case cat
    
    427
    -			   ((:aletter)
    
    428
    -			    ;; Rule WB7
    
    429
    -			    (lookup (+ j next-j) cat))
    
    430
    -			   (otherwise
    
    431
    -			    ;; Rule WB6 and WB7 were extended, but the
    
    432
    -			    ;; region didn't end with :aletter.  So
    
    433
    -			    ;; backup and break at that point.
    
    434
    -			    (let ((j2 (index-of-previous-non-ignored j)))
    
    435
    -			      (if (< i j2) j2 j)))))
    
    436
    -			((:numeric)
    
    437
    -			 (case cat
    
    438
    -			   ((:numeric :aletter :extendnumlet)
    
    439
    -			    ;; Rules WB8, WB10, ?
    
    440
    -			    (lookup (+ j next-j) cat))
    
    441
    -			   ((:midnum :midnumlet)
    
    442
    -			    ;; Rules WB11, need to keep looking
    
    443
    -			    (lookup (+ j next-j) :numeric-midnum))
    
    444
    -			   (otherwise j)))
    
    445
    -			((:numeric-midnum)
    
    446
    -			 (case cat
    
    447
    -			   ((:numeric)
    
    448
    -			    ;; Rule WB11, keep looking
    
    449
    -			    (lookup (+ j next-j) cat))
    
    450
    -			   (otherwise
    
    451
    -			    ;; Rule WB11, WB12 were extended, but the
    
    452
    -			    ;; region didn't end with :numeric, so
    
    453
    -			    ;; backup and break at that point.
    
    454
    -			    (let ((j2 (index-of-previous-non-ignored j)))
    
    455
    -			      (if (< i j2) j2 j)))))
    
    456
    -			((:midletter :midnum :midnumlet)
    
    457
    -			 ;; Rule WB14
    
    458
    -			 j)
    
    459
    -			((:katakana)
    
    460
    -			 (case cat
    
    461
    -			   ((:katakana :extendnumlet)
    
    462
    -			    ;; Rule WB13, WB13a
    
    463
    -			    (lookup (+ j next-j) cat))
    
    464
    -			   (otherwise j)))
    
    465
    -			((:extendnumlet)
    
    466
    -			 (case cat
    
    467
    -			   ((:extendnumlet :aletter :numeric :katakana)
    
    468
    -			    ;; Rule WB13a, WB13b
    
    469
    -			    (lookup (+ j next-j) cat))
    
    470
    -			   (otherwise j)))
    
    471
    -			((:regional_indicator)
    
    472
    -			 (case cat
    
    473
    -			   ((:regional_indicator)
    
    474
    -			    ;; Rule WB13c
    
    475
    -			    (lookup (+ j next-j) cat))
    
    476
    -			   (otherwise j)))
    
    477
    -			(otherwise j)))))))))
    
    478
    -      (declare (notinline lookup left-context))
    
    479
    -      (cond ((< i 0)
    
    480
    -	     ;; Rule WB1
    
    481
    -	     0)
    
    482
    -	    ((<= n i)
    
    483
    -	     ;; Rule WB2
    
    484
    -	     n)
    
    485
    -	    (t
    
    486
    -	     (multiple-value-bind (c widep)
    
    487
    -		 (codepoint s i)
    
    488
    -	       (declare (ignore c))
    
    489
    -	       (lookup (+ i (if (eql widep 1) 2 1)) (left-context i))))))))
    
    333
    +    (cond
    
    334
    +      ((< i 0) 0)                       ; Rule WB1 (start of text)
    
    335
    +      ((>= i n) n)                      ; Rule WB2 (end of text)
    
    336
    +      (t
    
    337
    +       (let ((cls (make-array 16 :fill-pointer 0 :adjustable t))
    
    338
    +             (idx (make-array 16 :fill-pointer 0 :adjustable t)))
    
    339
    +         ;; Decode codepoints and record the string index of each.
    
    340
    +         (let ((k 0))
    
    341
    +           (loop while (< k n) do
    
    342
    +             (multiple-value-bind (cp widep) (codepoint s k)
    
    343
    +               (vector-push-extend (unicode-word-break cp) cls)
    
    344
    +               (vector-push-extend k idx)
    
    345
    +               (incf k (if (eql widep 1) 2 1)))))
    
    346
    +         (let ((m (fill-pointer cls)))
    
    347
    +           (labels
    
    348
    +               ((class (j) (aref cls j))
    
    349
    +                (ah-letter-p (c) (or (eq c :aletter) (eq c :hebrew_letter)))
    
    350
    +                (mid-letter-q-p (c)     ; (MidLetter | MidNumLetQ)
    
    351
    +                  (or (eq c :midletter) (eq c :midnumlet) (eq c :single_quote)))
    
    352
    +                (mid-num-q-p (c)        ; (MidNum | MidNumLetQ)
    
    353
    +                  (or (eq c :midnum) (eq c :midnumlet) (eq c :single_quote)))
    
    354
    +                (ignorable (c)          ; WB4: Extend | Format | ZWJ
    
    355
    +                  (or (eq c :extend) (eq c :format) (eq c :zwj)))
    
    356
    +                (ext-pict-at (j)
    
    357
    +                  (unicode-extended-pictographic-p (codepoint s (aref idx j))))
    
    358
    +                (prev-significant (j)   ; last non-ignorable element < J, or -1
    
    359
    +                  (loop for p from (1- j) downto 0
    
    360
    +                        unless (ignorable (class p)) return p
    
    361
    +                        finally (return -1)))
    
    362
    +                (next-significant (j)   ; first non-ignorable element > J, or -1
    
    363
    +                  (loop for q from (1+ j) below m
    
    364
    +                        unless (ignorable (class q)) return q
    
    365
    +                        finally (return -1)))
    
    366
    +                (ri-count-left (j)
    
    367
    +                  ;; Number of significant Regional_Indicator elements
    
    368
    +                  ;; immediately to the left of J (stopping at the first
    
    369
    +                  ;; non-RI significant element; ignorables are skipped).
    
    370
    +                  (let ((count 0))
    
    371
    +                    (loop for p = (prev-significant j) then (prev-significant p)
    
    372
    +                          while (and (>= p 0) (eq (class p) :regional_indicator))
    
    373
    +                          do (incf count))
    
    374
    +                    count))
    
    375
    +                (break-before-p (j)
    
    376
    +                  ;; Does the algorithm allow a break immediately before
    
    377
    +                  ;; element J (1 <= J < M)?
    
    378
    +                  (let* ((c (class j))
    
    379
    +                         (lit (class (1- j)))   ; literal previous element
    
    380
    +                         (pj (prev-significant j)))
    
    381
    +                    (cond
    
    382
    +                      ;; WB3: CR x LF
    
    383
    +                      ((and (eq lit :cr) (eq c :lf)) nil)
    
    384
    +                      ;; WB3a: (Newline | CR | LF) div
    
    385
    +                      ((member lit '(:newline :cr :lf)) t)
    
    386
    +                      ;; WB3b: div (Newline | CR | LF)
    
    387
    +                      ((member c '(:newline :cr :lf)) t)
    
    388
    +                      ;; WB3c: ZWJ x \p{Extended_Pictographic}
    
    389
    +                      ((and (eq lit :zwj) (ext-pict-at j)) nil)
    
    390
    +                      ;; WB3d: WSegSpace x WSegSpace
    
    391
    +                      ((and (eq lit :wsegspace) (eq c :wsegspace)) nil)
    
    392
    +                      ;; WB4: x (Extend | Format | ZWJ): never break before
    
    393
    +                      ;; an ignorable (covers ignorables after sot or after
    
    394
    +                      ;; another ignorable too).
    
    395
    +                      ((ignorable c) nil)
    
    396
    +                      ;; Nothing significant to the left: break.
    
    397
    +                      ((< pj 0) t)
    
    398
    +                      (t
    
    399
    +                       (let ((p (class pj)))
    
    400
    +                         (cond
    
    401
    +                           ;; WB5: AHLetter x AHLetter
    
    402
    +                           ((and (ah-letter-p p) (ah-letter-p c)) nil)
    
    403
    +                           ;; WB6: AHLetter x (MidLetter|MidNumLetQ) AHLetter
    
    404
    +                           ((and (ah-letter-p p) (mid-letter-q-p c)
    
    405
    +                                 (let ((nx (next-significant j)))
    
    406
    +                                   (and (>= nx 0) (ah-letter-p (class nx)))))
    
    407
    +                            nil)
    
    408
    +                           ;; WB7: AHLetter (MidLetter|MidNumLetQ) x AHLetter
    
    409
    +                           ((and (ah-letter-p c) (mid-letter-q-p p)
    
    410
    +                                 (let ((pp (prev-significant pj)))
    
    411
    +                                   (and (>= pp 0) (ah-letter-p (class pp)))))
    
    412
    +                            nil)
    
    413
    +                           ;; WB7a: Hebrew_Letter x Single_Quote
    
    414
    +                           ((and (eq p :hebrew_letter) (eq c :single_quote)) nil)
    
    415
    +                           ;; WB7b: Hebrew_Letter x Double_Quote Hebrew_Letter
    
    416
    +                           ((and (eq p :hebrew_letter) (eq c :double_quote)
    
    417
    +                                 (let ((nx (next-significant j)))
    
    418
    +                                   (and (>= nx 0) (eq (class nx) :hebrew_letter))))
    
    419
    +                            nil)
    
    420
    +                           ;; WB7c: Hebrew_Letter Double_Quote x Hebrew_Letter
    
    421
    +                           ((and (eq c :hebrew_letter) (eq p :double_quote)
    
    422
    +                                 (let ((pp (prev-significant pj)))
    
    423
    +                                   (and (>= pp 0) (eq (class pp) :hebrew_letter))))
    
    424
    +                            nil)
    
    425
    +                           ;; WB8: Numeric x Numeric
    
    426
    +                           ((and (eq p :numeric) (eq c :numeric)) nil)
    
    427
    +                           ;; WB9: AHLetter x Numeric
    
    428
    +                           ((and (ah-letter-p p) (eq c :numeric)) nil)
    
    429
    +                           ;; WB10: Numeric x AHLetter
    
    430
    +                           ((and (eq p :numeric) (ah-letter-p c)) nil)
    
    431
    +                           ;; WB11: Numeric (MidNum|MidNumLetQ) x Numeric
    
    432
    +                           ((and (eq c :numeric) (mid-num-q-p p)
    
    433
    +                                 (let ((pp (prev-significant pj)))
    
    434
    +                                   (and (>= pp 0) (eq (class pp) :numeric))))
    
    435
    +                            nil)
    
    436
    +                           ;; WB12: Numeric x (MidNum|MidNumLetQ) Numeric
    
    437
    +                           ((and (eq p :numeric) (mid-num-q-p c)
    
    438
    +                                 (let ((nx (next-significant j)))
    
    439
    +                                   (and (>= nx 0) (eq (class nx) :numeric))))
    
    440
    +                            nil)
    
    441
    +                           ;; WB13: Katakana x Katakana
    
    442
    +                           ((and (eq p :katakana) (eq c :katakana)) nil)
    
    443
    +                           ;; WB13a: (AHLetter|Numeric|Katakana|ExtendNumLet) x ExtendNumLet
    
    444
    +                           ((and (member p '(:aletter :hebrew_letter :numeric
    
    445
    +                                             :katakana :extendnumlet))
    
    446
    +                                 (eq c :extendnumlet))
    
    447
    +                            nil)
    
    448
    +                           ;; WB13b: ExtendNumLet x (AHLetter|Numeric|Katakana)
    
    449
    +                           ((and (eq p :extendnumlet)
    
    450
    +                                 (or (ah-letter-p c)
    
    451
    +                                     (member c '(:numeric :katakana))))
    
    452
    +                            nil)
    
    453
    +                           ;; WB15/WB16: in a Regional_Indicator run, break
    
    454
    +                           ;; only between pairs: break before J iff an even
    
    455
    +                           ;; number of RIs precede it.
    
    456
    +                           ((and (eq p :regional_indicator)
    
    457
    +                                 (eq c :regional_indicator))
    
    458
    +                            (evenp (ri-count-left j)))
    
    459
    +                           ;; WB999: otherwise break.
    
    460
    +                           (t t))))))))
    
    461
    +             ;; Find the first allowed boundary whose string index is > I.
    
    462
    +             (loop for j from 1 below m
    
    463
    +                   when (and (> (aref idx j) i) (break-before-p j))
    
    464
    +                     do (return-from string-next-word-break (aref idx j)))
    
    465
    +             ;; WB2: otherwise the word extends to end of text.
    
    466
    +             n)))))))
    
    490 467
     
    
    491 468
     (defun char-titlecase (char)
    
    492 469
       "Returns CHAR converted to title-case if that is possible."
    

  • src/i18n/locale/cmucl.pot
    No preview for this file type