Raymond Toy pushed to branch issue-316-support-roundtrip-char-casing at cmucl / cmucl
Commits: 1652ad4b by Raymond Toy at 2024-05-13T20:19:02-07:00 Implement the category cases in the CL functions, not Unicode
Previously, we had unicode-upper/lower functions had the code to exclude the characters from the categories we didn't want. However, this should really be done in char-upcase/downcase because that's what we want from CL, not Unicode.
- - - - - 6633e24c by Raymond Toy at 2024-05-13T20:21:17-07:00 Update the deftransforms for char-upcase/downcase to match
The deftransforms for char-upcase/downcase better match the code for the functions char-upcase/downcase, otherwise, we get totally confusing results.
I just copied the code from the functions into the deftransforms.
We really need a better way to do this to guarantee this automatically!
- - - - -
3 changed files:
- src/code/char.lisp - src/code/unidata.lisp - src/compiler/srctran.lisp
Changes:
===================================== src/code/char.lisp ===================================== @@ -215,7 +215,8 @@ (let ((m (char-code (the base-char char)))) (or (< 31 m 127) #+(and unicode (not unicode-bootstrap)) - (and (> m +unicode-lower-limit+) + (and (/= m 181) + (> m +unicode-lower-limit+) (>= (unicode-category m) +unicode-category-graphic+))))))
@@ -272,6 +273,11 @@ (or (< 64 m 91) (< 96 m 123) #+(and unicode (not unicode-bootstrap)) (and (> m +unicode-lower-limit+) + ;; Unicode says Micro_sign is a lower case letter, but + ;; for CL, we don't want it to be a lower case letter. + ;; This is for compatibility with other Lisp + ;; implementations. + (/= m 181) (<= +unicode-category-upper+ (unicode-category m) +unicode-category-lower+))))) @@ -464,8 +470,12 @@ char) #+(and unicode (not unicode-bootstrap)) (let ((m (char-code char))) - (cond ((> m +unicode-lower-limit+) (code-char (unicode-upper m))) - ((< 96 m 123) (code-char (- m 32))) + (cond ((< 96 m 123) (code-char (- m 32))) + ((= m 181) char) + ((> m +unicode-lower-limit+) + (if (member (unicode-category m) '(92 32 75 109)) + char + (code-char (unicode-upper m)))) (t char))))
(defun char-titlecase (char) @@ -490,7 +500,10 @@ char) #+(and unicode (not unicode-bootstrap)) (let ((m (char-code char))) - (cond ((> m +unicode-lower-limit+) (code-char (unicode-lower m))) + (cond ((> m +unicode-lower-limit+) + (if (member (unicode-category m) '(92 75 109)) + char + (code-char (unicode-lower m)))) ((< 64 m 91) (code-char (+ m 32))) (t char))))
===================================== src/code/unidata.lisp ===================================== @@ -883,9 +883,7 @@ (unless (unidata-scase *unicode-data*) (load-scase)) (let* ((scase (unidata-scase *unicode-data*)) (n (logand (qref32 scase code) #xFF))) - (if (or (zerop n) - ;; Ignore category Lt, Mn, Nl, So - (member (unicode-category code) '(92 32 75 109))) + (if (zerop n) code (let* ((m (aref (scase-svec scase) (logand n #x7F)))) (if (logbitp 7 n) (+ code m) (- code m)))))) @@ -896,9 +894,7 @@ (unless (unidata-scase *unicode-data*) (load-scase)) (let* ((scase (unidata-scase *unicode-data*)) (n (logand (ash (qref32 scase code) -8) #xFF))) - (if (or (zerop n) - ;; Ignore category Lt, Nl, So - (member (unicode-category code) '(92 75 109))) + (if (zerop n) code (let ((m (aref (scase-svec scase) (logand n #x7F)))) (if (logbitp 7 n) (+ code m) (- code m))))))
===================================== src/compiler/srctran.lisp ===================================== @@ -3337,27 +3337,36 @@
(deftransform char-upcase ((x) (base-char)) "open code" + ;; NOTE: This MUST match what the function char-upcase does. #-(and unicode (not unicode-bootstrap)) '(if (lower-case-p x) (code-char (- (char-code x) 32)) x) #+(and unicode (not unicode-bootstrap)) '(let ((m (char-code x))) - (cond ((> m 127) (code-char (lisp::unicode-upper m))) - ((< 96 m 123) (code-char (- m 32))) + (cond ((< 96 m 123) (code-char (- m 32))) + ((= m 181) x) + ((> m lisp::+unicode-lower-limit+) + (if (member (unicode-category m) '(92 32 75 109)) + x + (code-char (lisp::unicode-upper m)))) (t x))))
(deftransform char-downcase ((x) (base-char)) "open code" + ;; NOTE: This MUST match what the function char-downcase does. #-(and unicode (not unicode-bootstrap)) '(if (upper-case-p x) (code-char (+ (char-code x) 32)) x) #+(and unicode (not unicode-bootstrap)) '(let ((m (char-code x))) - (cond ((> m 127) (code-char (lisp::unicode-lower m))) - ((< 64 m 91) (code-char (+ m 32))) - (t x)))) + (cond ((> m lisp::+unicode-lower-limit+) + (if (member (unicode-category m) '(92 75 109)) + x + (code-char (lisp::unicode-lower m)))) + ((< 64 m 91) (code-char (+ m 32))) + (t x))))
;;;; Equality predicate transforms:
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/7f4f1e7590ba3a2326a91d6...