Raymond Toy pushed to branch master at cmucl / cmucl
Commits: db07d64f by Raymond Toy at 2024-06-04T03:52:11+00:00 Fix #326: Cleanups for roundtrip casing
- - - - - 6b5da582 by Raymond Toy at 2024-06-04T03:52:16+00:00 Merge branch 'issue-326-char-casing-cleanup' into 'master'
Fix #326: Cleanups for roundtrip casing
Closes #326
See merge request cmucl/cmucl!225 - - - - -
4 changed files:
- src/code/char.lisp - src/code/unicode.lisp - src/compiler/srctran.lisp - + tests/char.lisp
Changes:
===================================== src/code/char.lisp ===================================== @@ -31,7 +31,6 @@ alphanumericp char= char/= char< char> char<= char>= char-equal char-not-equal char-lessp char-greaterp char-not-greaterp char-not-lessp character char-code code-char char-upcase - char-titlecase title-case-p char-downcase digit-char char-int char-name name-char codepoint-limit codepoint))
@@ -259,7 +258,7 @@ (declare (character char)) (and (typep char 'base-char) (let ((m (char-code (the base-char char)))) - (or (< 31 m 127) + (or (<= (char-code #\space ) m (char-code #~)) #+(and unicode (not unicode-bootstrap)) (and (> m +ascii-limit+) (>= (unicode-category m) +unicode-category-graphic+)))))) @@ -270,7 +269,8 @@ argument is an alphabetic character; otherwise NIL." (declare (character char)) (let ((m (char-code char))) - (or (< 64 m 91) (< 96 m 123) + (or (<= (char-code #\A) m (char-code #\Z)) + (<= (char-code #\a) m (char-code #\z)) #+(and unicode (not unicode-bootstrap)) (and (> m +ascii-limit+) (<= +unicode-category-letter+ (unicode-category m) @@ -282,7 +282,7 @@ argument is an upper-case character, NIL otherwise." (declare (character char)) (let ((m (char-code char))) - (or (< 64 m 91) + (or (<= (char-code #\A) m (char-code #\Z)) #+(and unicode (not unicode-bootstrap)) (and (> m +ascii-limit+) (not (zerop (ldb +lower-case-entry+ (case-mapping-entry m)))))))) @@ -293,29 +293,19 @@ argument is a lower-case character, NIL otherwise." (declare (character char)) (let ((m (char-code char))) - (or (< 96 m 123) + (or (<= (char-code #\a) m (char-code #\z)) #+(and unicode (not unicode-bootstrap)) (and (> m +ascii-limit+) (not (zerop (ldb +upper-case-entry+ (case-mapping-entry m))))))))
-(defun title-case-p (char) - "The argument must be a character object; title-case-p returns T if the - argument is a title-case character, NIL otherwise." - (declare (character char)) - (let ((m (char-code char))) - (or (< 64 m 91) - #+(and unicode (not unicode-bootstrap)) - (and (> m +ascii-limit+) - (= (unicode-category m) +unicode-category-title+))))) - - (defun both-case-p (char) "The argument must be a character object. Both-case-p returns T if the argument is an alphabetic character and if the character exists in both upper and lower case. For ASCII, this is the same as Alpha-char-p." (declare (character char)) (let ((m (char-code char))) - (or (< 64 m 91) (< 96 m 123) + (or (<= (char-code #\A) m (char-code #\Z)) + (<= (char-code #\a) m (char-code #\z)) #+(and unicode (not unicode-bootstrap)) (and (> m +ascii-limit+) (not (zerop (case-mapping-entry m))))))) @@ -347,7 +337,9 @@ (declare (character char)) (let ((m (char-code char))) ;; Shortcut for ASCII digits and upper and lower case ASCII letters - (or (< 47 m 58) (< 64 m 91) (< 96 m 123) + (or (<= (char-code #\0) m (char-code #\9)) + (<= (char-code #\A) m (char-code #\Z)) + (<= (char-code #\a) m (char-code #\z)) #+(and unicode (not unicode-bootstrap)) (and (> m +ascii-limit+) (<= +unicode-category-letter+ (unicode-category m) @@ -418,14 +410,14 @@
(defmacro equal-char-code (character) `(let ((ch (char-code ,character))) - ;; Handle ASCII separately for bootstrapping and for unidata missing. - (if (< 64 ch 91) - (+ ch 32) - #-(and unicode (not unicode-bootstrap)) - ch - #+(and unicode (not unicode-bootstrap)) - (if (> ch +ascii-limit+) (unicode-lower ch) ch)))) - + ;; Handle ASCII separately for bootstrapping. + (cond ((<= (char-code #\A) ch (char-code #\Z)) + (logxor ch #x20)) + #+(and unicode (not unicode-bootstrap)) + ((> ch +ascii-limit+) + (case-mapping-lower-case ch)) + (t + ch))))
(defun char-equal (character &rest more-characters) "Returns T if all of its arguments are the same character. @@ -504,19 +496,6 @@ (declare (character char)) (char-upcase char))
-(defun char-titlecase (char) - "Returns CHAR converted to title-case if that is possible." - (declare (character char)) - #-(and unicode (not unicode-bootstrap)) - (if (lower-case-p char) - (code-char (- (char-code char) 32)) - char) - #+(and unicode (not unicode-bootstrap)) - (let ((m (char-code char))) - (cond ((> m +ascii-limit+) (code-char (unicode-title m))) - ((< 96 m 123) (code-char (- m 32))) - (t char)))) - (defun char-downcase (char) "Returns CHAR converted to lower-case if that is possible." (declare (character char))
===================================== src/code/unicode.lisp ===================================== @@ -488,6 +488,26 @@ (declare (ignore c)) (lookup (+ i (if (eql widep 1) 2 1)) (left-context i))))))))
+(defun char-titlecase (char) + "Returns CHAR converted to title-case if that is possible." + (declare (character char)) + (let ((m (char-code char))) + (cond ((<= (char-code #\a) m (char-code #\z)) + (code-char (logxor m #x20))) + #+(and unicode (not unicode-bootstrap)) + ((> m +ascii-limit+) (code-char (unicode-title m))) + (t char)))) + +(defun title-case-p (char) + "The argument must be a character object; title-case-p returns T if the + argument is a title-case character, NIL otherwise." + (declare (character char)) + (let ((m (char-code char))) + (or (<= (code-char #\A) m (code-char #\Z)) + #+(and unicode (not unicode-bootstrap)) + (and (> m +ascii-limit+) + (= (unicode-category m) +unicode-category-title+))))) + (defun string-capitalize-unicode (string &key (start 0) end (casing :simple)) "Capitalize String using the Unicode word-break algorithm to find the words in String. The beginning is capitalized depending on the
===================================== src/compiler/srctran.lisp ===================================== @@ -3343,7 +3343,8 @@ x) #+(and unicode (not unicode-bootstrap)) '(let ((m (char-code x))) - (cond ((< 96 m 123) (code-char (- m 32))) + (cond ((<= (char-code #\a) m (char-code #\z)) + (code-char (logxor m #x20))) ((> m lisp::+ascii-limit+) (code-char (lisp::case-mapping-upper-case m))) (t x)))) @@ -3356,7 +3357,8 @@ x) #+(and unicode (not unicode-bootstrap)) '(let ((m (char-code x))) - (cond ((< 64 m 91) (code-char (+ m 32))) + (cond ((<= (char-code #\A) m (char-code #\Z)) + (code-char (logxor m #x20))) ((> m lisp::+ascii-limit+) (code-char (lisp::case-mapping-lower-case m))) (t x))))
===================================== tests/char.lisp ===================================== @@ -0,0 +1,24 @@ +;; Tests of char functions + +(defpackage :char-tests + (:use :cl :lisp-unit)) + +(in-package "CHAR-TESTS") + +(define-test char-equal + (:tag :issues) + (let ((test-codes + ;; Find all the codes where the CL lower case character + ;; doesn't match the Unicode lower case character. + (loop for code from 128 below char-code-limit + for ch = (code-char code) + when (/= (char-code (char-downcase ch)) (or (lisp::unicode-lower code) code)) + collect code))) + (dolist (code test-codes) + ;; Verify that we convert to the CL lower case character instead + ;; of the Unicode lower case character for the cases where these + ;; are different. + (assert-false (char-equal (code-char (lisp::unicode-lower code)) + (code-char code)) + code + (lisp::unicode-lower code)))))
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/57d5a21c20064990ff47170...