Raymond Toy pushed to branch issue-316-support-roundtrip-char-casing at cmucl / cmucl
Commits:
-
1652ad4b
by Raymond Toy at 2024-05-13T20:19:02-07:00
-
6633e24c
by Raymond Toy at 2024-05-13T20:21:17-07:00
3 changed files:
Changes:
| ... | ... | @@ -215,7 +215,8 @@ |
| 215 | 215 | (let ((m (char-code (the base-char char))))
|
| 216 | 216 | (or (< 31 m 127)
|
| 217 | 217 | #+(and unicode (not unicode-bootstrap))
|
| 218 | - (and (> m +unicode-lower-limit+)
|
|
| 218 | + (and (/= m 181)
|
|
| 219 | + (> m +unicode-lower-limit+)
|
|
| 219 | 220 | (>= (unicode-category m) +unicode-category-graphic+))))))
|
| 220 | 221 | |
| 221 | 222 | |
| ... | ... | @@ -272,6 +273,11 @@ |
| 272 | 273 | (or (< 64 m 91) (< 96 m 123)
|
| 273 | 274 | #+(and unicode (not unicode-bootstrap))
|
| 274 | 275 | (and (> m +unicode-lower-limit+)
|
| 276 | + ;; Unicode says Micro_sign is a lower case letter, but
|
|
| 277 | + ;; for CL, we don't want it to be a lower case letter.
|
|
| 278 | + ;; This is for compatibility with other Lisp
|
|
| 279 | + ;; implementations.
|
|
| 280 | + (/= m 181)
|
|
| 275 | 281 | (<= +unicode-category-upper+
|
| 276 | 282 | (unicode-category m)
|
| 277 | 283 | +unicode-category-lower+)))))
|
| ... | ... | @@ -464,8 +470,12 @@ |
| 464 | 470 | char)
|
| 465 | 471 | #+(and unicode (not unicode-bootstrap))
|
| 466 | 472 | (let ((m (char-code char)))
|
| 467 | - (cond ((> m +unicode-lower-limit+) (code-char (unicode-upper m)))
|
|
| 468 | - ((< 96 m 123) (code-char (- m 32)))
|
|
| 473 | + (cond ((< 96 m 123) (code-char (- m 32)))
|
|
| 474 | + ((= m 181) char)
|
|
| 475 | + ((> m +unicode-lower-limit+)
|
|
| 476 | + (if (member (unicode-category m) '(92 32 75 109))
|
|
| 477 | + char
|
|
| 478 | + (code-char (unicode-upper m))))
|
|
| 469 | 479 | (t char))))
|
| 470 | 480 | |
| 471 | 481 | (defun char-titlecase (char)
|
| ... | ... | @@ -490,7 +500,10 @@ |
| 490 | 500 | char)
|
| 491 | 501 | #+(and unicode (not unicode-bootstrap))
|
| 492 | 502 | (let ((m (char-code char)))
|
| 493 | - (cond ((> m +unicode-lower-limit+) (code-char (unicode-lower m)))
|
|
| 503 | + (cond ((> m +unicode-lower-limit+)
|
|
| 504 | + (if (member (unicode-category m) '(92 75 109))
|
|
| 505 | + char
|
|
| 506 | + (code-char (unicode-lower m))))
|
|
| 494 | 507 | ((< 64 m 91) (code-char (+ m 32)))
|
| 495 | 508 | (t char))))
|
| 496 | 509 |
| ... | ... | @@ -883,9 +883,7 @@ |
| 883 | 883 | (unless (unidata-scase *unicode-data*) (load-scase))
|
| 884 | 884 | (let* ((scase (unidata-scase *unicode-data*))
|
| 885 | 885 | (n (logand (qref32 scase code) #xFF)))
|
| 886 | - (if (or (zerop n)
|
|
| 887 | - ;; Ignore category Lt, Mn, Nl, So
|
|
| 888 | - (member (unicode-category code) '(92 32 75 109)))
|
|
| 886 | + (if (zerop n)
|
|
| 889 | 887 | code
|
| 890 | 888 | (let* ((m (aref (scase-svec scase) (logand n #x7F))))
|
| 891 | 889 | (if (logbitp 7 n) (+ code m) (- code m))))))
|
| ... | ... | @@ -896,9 +894,7 @@ |
| 896 | 894 | (unless (unidata-scase *unicode-data*) (load-scase))
|
| 897 | 895 | (let* ((scase (unidata-scase *unicode-data*))
|
| 898 | 896 | (n (logand (ash (qref32 scase code) -8) #xFF)))
|
| 899 | - (if (or (zerop n)
|
|
| 900 | - ;; Ignore category Lt, Nl, So
|
|
| 901 | - (member (unicode-category code) '(92 75 109)))
|
|
| 897 | + (if (zerop n)
|
|
| 902 | 898 | code
|
| 903 | 899 | (let ((m (aref (scase-svec scase) (logand n #x7F))))
|
| 904 | 900 | (if (logbitp 7 n) (+ code m) (- code m))))))
|
| ... | ... | @@ -3337,27 +3337,36 @@ |
| 3337 | 3337 | |
| 3338 | 3338 | (deftransform char-upcase ((x) (base-char))
|
| 3339 | 3339 | "open code"
|
| 3340 | + ;; NOTE: This MUST match what the function char-upcase does.
|
|
| 3340 | 3341 | #-(and unicode (not unicode-bootstrap))
|
| 3341 | 3342 | '(if (lower-case-p x)
|
| 3342 | 3343 | (code-char (- (char-code x) 32))
|
| 3343 | 3344 | x)
|
| 3344 | 3345 | #+(and unicode (not unicode-bootstrap))
|
| 3345 | 3346 | '(let ((m (char-code x)))
|
| 3346 | - (cond ((> m 127) (code-char (lisp::unicode-upper m)))
|
|
| 3347 | - ((< 96 m 123) (code-char (- m 32)))
|
|
| 3347 | + (cond ((< 96 m 123) (code-char (- m 32)))
|
|
| 3348 | + ((= m 181) x)
|
|
| 3349 | + ((> m lisp::+unicode-lower-limit+)
|
|
| 3350 | + (if (member (unicode-category m) '(92 32 75 109))
|
|
| 3351 | + x
|
|
| 3352 | + (code-char (lisp::unicode-upper m))))
|
|
| 3348 | 3353 | (t x))))
|
| 3349 | 3354 | |
| 3350 | 3355 | (deftransform char-downcase ((x) (base-char))
|
| 3351 | 3356 | "open code"
|
| 3357 | + ;; NOTE: This MUST match what the function char-downcase does.
|
|
| 3352 | 3358 | #-(and unicode (not unicode-bootstrap))
|
| 3353 | 3359 | '(if (upper-case-p x)
|
| 3354 | 3360 | (code-char (+ (char-code x) 32))
|
| 3355 | 3361 | x)
|
| 3356 | 3362 | #+(and unicode (not unicode-bootstrap))
|
| 3357 | 3363 | '(let ((m (char-code x)))
|
| 3358 | - (cond ((> m 127) (code-char (lisp::unicode-lower m)))
|
|
| 3359 | - ((< 64 m 91) (code-char (+ m 32)))
|
|
| 3360 | - (t x))))
|
|
| 3364 | + (cond ((> m lisp::+unicode-lower-limit+)
|
|
| 3365 | + (if (member (unicode-category m) '(92 75 109))
|
|
| 3366 | + x
|
|
| 3367 | + (code-char (lisp::unicode-lower m))))
|
|
| 3368 | + ((< 64 m 91) (code-char (+ m 32)))
|
|
| 3369 | + (t x))))
|
|
| 3361 | 3370 | |
| 3362 | 3371 | |
| 3363 | 3372 | ;;;; Equality predicate transforms:
|