Raymond Toy pushed to branch master at cmucl / cmucl
Commits: 535f7e5d by Raymond Toy at 2024-06-03T23:07:07+00:00 Fix #323: CL string casing
- - - - - 57d5a21c by Raymond Toy at 2024-06-03T23:07:12+00:00 Merge branch 'issue-323-cl-string-casing' into 'master'
Fix #323: CL string casing
Closes #323
See merge request cmucl/cmucl!224 - - - - -
5 changed files:
- src/code/string.lisp - src/code/unicode.lisp - src/i18n/locale/cmucl.pot - tests/string.lisp - tests/unicode.lisp
Changes:
===================================== src/code/string.lisp ===================================== @@ -638,27 +638,14 @@ (new-index (- start offset) (1+ new-index))) ((= index (the fixnum end))) (declare (fixnum index new-index)) - (multiple-value-bind (code wide) (codepoint string index) - (when wide (incf index)) - ;; Use char-upcase if it's not a surrogate pair so that - ;; we're always consist. - (if wide - (setq code (unicode-upper code)) - (setf code (char-code (char-upcase (code-char code))))) - ;;@@ WARNING: this may, in theory, need to extend newstring - ;; but that never actually occurs as of Unicode 5.1.0, - ;; so I'm just going to ignore it for now... - (multiple-value-bind (hi lo) (surrogates code) - (setf (schar newstring new-index) hi) - (when lo - (setf (schar newstring (incf new-index)) lo))))) - ;;@@ WARNING: see above - (do ((index end (1+ index)) + (setf (schar newstring new-index) + (char-upcase (schar string index)))) + (do ((index end (1+ index)) (new-index (- (the fixnum end) offset) (1+ new-index))) ((= index offset-slen)) (declare (fixnum index new-index)) (setf (schar newstring new-index) (schar string index))) - newstring)))) + newstring))))
(defun string-downcase (string &key (start 0) end) _N"Given a string, returns a new string that is a copy of it with all @@ -680,64 +667,8 @@ (new-index (- start offset) (1+ new-index))) ((= index (the fixnum end))) (declare (fixnum index new-index)) - (multiple-value-bind (code wide) (codepoint string index) - (when wide (incf index)) - ;; Use char-downcase if it's not a surrogate pair so that - ;; we're always consist. - (if wide - (setq code (unicode-lower code)) - (setq code (char-code (char-downcase (code-char code))))) - ;;@@ WARNING: this may, in theory, need to extend newstring - ;; but that never actually occurs as of Unicode 5.1.0, - ;; so I'm just going to ignore it for now... - (multiple-value-bind (hi lo) (surrogates code) - (setf (schar newstring new-index) hi) - (when lo - (setf (schar newstring (incf new-index)) lo))))) - ;;@@ WARNING: see above - (do ((index end (1+ index)) - (new-index (- (the fixnum end) offset) (1+ new-index))) - ((= index offset-slen)) - (declare (fixnum index new-index)) - (setf (schar newstring new-index) (schar string index))) - newstring)))) - -#+nil -(defun string-capitalize (string &key (start 0) end) - _N"Given a string, returns a copy of the string with the first - character of each ``word'' converted to upper-case, and remaining - chars in the word converted to lower case. A ``word'' is defined - to be a string of case-modifiable characters delimited by - non-case-modifiable chars." - (declare (fixnum start)) - (let* ((string (if (stringp string) string (string string))) - (slen (length string))) - (declare (fixnum slen)) - (with-one-string string start end offset - (let ((offset-slen (+ slen offset)) - (newstring (make-string slen))) - (declare (fixnum offset-slen)) - (do ((index offset (1+ index)) - (new-index 0 (1+ new-index))) - ((= index start)) - (declare (fixnum index new-index)) - (setf (schar newstring new-index) (schar string index))) - (do ((index start (1+ index)) - (new-index (- start offset) (1+ new-index)) - (newword t) - (char ())) - ((= index (the fixnum end))) - (declare (fixnum index new-index)) - (setq char (schar string index)) - (cond ((not (alphanumericp char)) - (setq newword t)) - (newword - ;;char is first case-modifiable after non-case-modifiable - (setq char (char-titlecase char)) - (setq newword ())) - ;;char is case-modifiable, but not first - (t (setq char (char-downcase char)))) - (setf (schar newstring new-index) char)) + (setf (schar newstring new-index) + (char-downcase (schar string index)))) (do ((index end (1+ index)) (new-index (- (the fixnum end) offset) (1+ new-index))) ((= index offset-slen)) @@ -796,20 +727,9 @@ (do ((index start (1+ index))) ((= index (the fixnum end))) (declare (fixnum index)) - (multiple-value-bind (code wide) (codepoint string index) - (if wide - (setq code (unicode-upper code)) - (setf code (char-code (char-upcase (code-char code))))) - ;;@@ WARNING: this may, in theory, need to extend string - ;; (which, obviously, we can't do here. Unless - ;; STRING is adjustable, maybe) - ;; but that never actually occurs as of Unicode 5.1.0, - ;; so I'm just going to ignore it for now... - (multiple-value-bind (hi lo) (surrogates code) - (setf (schar string index) hi) - (when lo - (setf (schar string (incf index)) lo)))))) - save-header)) + (setf (schar string index) + (char-upcase (schar string index)))) + save-header)))
(defun nstring-downcase (string &key (start 0) end) _N"Given a string, returns that string with all upper case alphabetic @@ -820,45 +740,8 @@ (do ((index start (1+ index))) ((= index (the fixnum end))) (declare (fixnum index)) - (multiple-value-bind (code wide) (codepoint string index) - (if wide - (setq code (unicode-lower code)) - (setf code (char-code (char-downcase (code-char code))))) - ;;@@ WARNING: this may, in theory, need to extend string - ;; (which, obviously, we can't do here. Unless - ;; STRING is adjustable, maybe) - ;; but that never actually occurs as of Unicode 5.1.0, - ;; so I'm just going to ignore it for now... - (multiple-value-bind (hi lo) (surrogates code) - (setf (schar string index) hi) - (when lo - (setf (schar string (incf index)) lo)))))) - save-header)) - -#+nil -(defun nstring-capitalize (string &key (start 0) end) - "Given a string, returns that string with the first - character of each ``word'' converted to upper-case, and remaining - chars in the word converted to lower case. A ``word'' is defined - to be a string of case-modifiable characters delimited by - non-case-modifiable chars." - (declare (fixnum start)) - (let ((save-header string)) - (with-one-string string start end offset - (do ((index start (1+ index)) - (newword t) - (char ())) - ((= index (the fixnum end))) - (declare (fixnum index)) - (setq char (schar string index)) - (cond ((not (alphanumericp char)) - (setq newword t)) - (newword - ;;char is first case-modifiable after non-case-modifiable - (setf (schar string index) (char-titlecase char)) - (setq newword ())) - (t - (setf (schar string index) (char-downcase char)))))) + (setf (schar string index) + (char-downcase (schar string index))))) save-header))
(defun nstring-capitalize (string &key (start 0) end)
===================================== src/code/unicode.lisp ===================================== @@ -17,14 +17,57 @@ (in-package "UNICODE") (intl:textdomain "cmucl")
+(defun string-upcase-simple (string &key (start 0) end) + _N"Given a string, returns a new string that is a copy of it with all + lower case alphabetic characters converted to uppercase." + (declare (fixnum start)) + (let* ((string (string string)) + (slen (length string))) + (declare (fixnum slen)) + (lisp::with-one-string string start end offset + (let ((offset-slen (+ slen offset)) + (newstring (make-string slen))) + (declare (fixnum offset-slen)) + (do ((index offset (1+ index)) + (new-index 0 (1+ new-index))) + ((= index start)) + (declare (fixnum index new-index)) + (setf (schar newstring new-index) (schar string index))) + (do ((index start (1+ index)) + (new-index (- start offset) (1+ new-index))) + ((= index (the fixnum end))) + (declare (fixnum index new-index)) + (multiple-value-bind (code wide) (codepoint string index) + (when wide (incf index)) + ;; Use char-upcase if it's not a surrogate pair so that + ;; we're always consist. + (if wide + (setq code (unicode-upper code)) + (setf code (char-code (char-upcase (code-char code))))) + ;;@@ WARNING: this may, in theory, need to extend newstring + ;; but that never actually occurs as of Unicode 5.1.0, + ;; so I'm just going to ignore it for now... + (multiple-value-bind (hi lo) (surrogates code) + (setf (schar newstring new-index) hi) + (when lo + (setf (schar newstring (incf new-index)) lo))))) + ;;@@ WARNING: see above + (do ((index end (1+ index)) + (new-index (- (the fixnum end) offset) (1+ new-index))) + ((= index offset-slen)) + (declare (fixnum index new-index)) + (setf (schar newstring new-index) (schar string index))) + newstring)))) + + ;; An example where this differs from cl:string-upcase differ: ;; #\Latin_Small_Letter_Sharp_S (defun string-upcase-full (string &key (start 0) end) _N"Given a string, returns a new string that is a copy of it with all lower case alphabetic characters converted to uppercase using full case conversion." - (declare (fixnum start)) (let* ((string (if - (stringp string) string (string string))) + (declare (fixnum start)) + (let* ((string (string string)) (slen (length string))) (declare (fixnum slen)) (with-output-to-string (s) @@ -57,11 +100,55 @@ all lower case alphabetic characters converted to uppercase. Casing is :simple or :full for simple or full case conversion, respectively." - (declare (fixnum start)) - (if (eq casing :simple) - (cl:string-upcase string :start start :end end) - (string-upcase-full string :start start :end end))) + (declare (fixnum start) + (type (member :simple :full) casing)) + (ecase casing + (:simple + (string-upcase-simple string :start start :end end)) + (:full + (string-upcase-full string :start start :end end))))
+(defun string-downcase-simple (string &key (start 0) end) + _N"Given a string, returns a new string that is a copy of it with all + upper case alphabetic characters converted to lowercase." + (declare (fixnum start)) + (let* ((string (if (stringp string) string (string string))) + (slen (length string))) + (declare (fixnum slen)) + (lisp::with-one-string string start end offset + (let ((offset-slen (+ slen offset)) + (newstring (make-string slen))) + (declare (fixnum offset-slen)) + (do ((index offset (1+ index)) + (new-index 0 (1+ new-index))) + ((= index start)) + (declare (fixnum index new-index)) + (setf (schar newstring new-index) (schar string index))) + (do ((index start (1+ index)) + (new-index (- start offset) (1+ new-index))) + ((= index (the fixnum end))) + (declare (fixnum index new-index)) + (multiple-value-bind (code wide) (codepoint string index) + (when wide (incf index)) + ;; Use char-downcase if it's not a surrogate pair so that + ;; we're always consist. + (if wide + (setq code (unicode-lower code)) + (setq code (char-code (char-downcase (code-char code))))) + ;;@@ WARNING: this may, in theory, need to extend newstring + ;; but that never actually occurs as of Unicode 5.1.0, + ;; so I'm just going to ignore it for now... + (multiple-value-bind (hi lo) (surrogates code) + (setf (schar newstring new-index) hi) + (when lo + (setf (schar newstring (incf new-index)) lo))))) + ;;@@ WARNING: see above + (do ((index end (1+ index)) + (new-index (- (the fixnum end) offset) (1+ new-index))) + ((= index offset-slen)) + (declare (fixnum index new-index)) + (setf (schar newstring new-index) (schar string index))) + newstring))))
;; An example this differs from cl:string-downcase: ;; #\Latin_Capital_Letter_I_With_Dot_Above. @@ -104,10 +191,13 @@ uppercase alphabetic characters converted to lowercase. Casing is :simple or :full for simple or full case conversion, respectively."
- (declare (fixnum start)) - (if (eq casing :simple) - (cl:string-downcase string :start start :end end) - (string-downcase-full string :start start :end end))) + (declare (fixnum start) + (type (member :simple :full) casing)) + (ecase casing + (:simple + (string-downcase-simple string :start start :end end)) + (:full + (string-downcase-full string :start start :end end))))
;;; @@ -553,12 +643,14 @@ delimited by non-case-modifiable chars. "
(declare (fixnum start) - (type (member :simple :full :title) casing)) + (type (member :simple-title :simple :full :title) casing)) (if unicode-word-break (string-capitalize-unicode string :start start :end end :casing casing) - (if (eq casing :simple) - (string-capitalize-simple string :start start :end end) - (string-capitalize-full string :start start :end end :casing casing)))) + (ecase casing + (:simple-title + (string-capitalize-simple string :start start :end end)) + ((:simple :full :title) + (string-capitalize-full string :start start :end end :casing casing)))))
(defun decompose-hangul-syllable (cp stream)
===================================== src/i18n/locale/cmucl.pot ===================================== @@ -3880,13 +3880,13 @@ msgid "" " a new string Count long filled with the fill character." msgstr ""
-#: src/code/string.lisp +#: src/code/unicode.lisp src/code/string.lisp msgid "" "Given a string, returns a new string that is a copy of it with all\n" " lower case alphabetic characters converted to uppercase." msgstr ""
-#: src/code/string.lisp +#: src/code/unicode.lisp src/code/string.lisp msgid "" "Given a string, returns a new string that is a copy of it with all\n" " upper case alphabetic characters converted to lowercase."
===================================== tests/string.lisp ===================================== @@ -8,11 +8,10 @@ (defun make-test-string () ;; Create a string consisting of all the code units EXCEPT for the ;; surrogates because string casing handles that differently. - (coerce - (loop for code from 0 to #xffff - unless (lisp::surrogatep code) - collect (code-char code)) - 'string)) + (let ((s (make-string char-code-limit))) + (dotimes (k char-code-limit) + (setf (aref s k) (code-char k))) + s))
(define-test string-upcase (:tag :issues) @@ -62,32 +61,33 @@
(define-test string-capitalize (:tag :issues) - (let* ((s - ;; This string contains a couple of characters where - ;; Unicode has a titlecase version of the character. We - ;; want to make sure we use char-upcase to capitalize the - ;; string instead of lisp::char-titlecse - (coerce - '(#\Latin_Small_Letter_Dz - #\a #\b - #\space - #\Latin_Small_Letter_Lj - #\A #\B) - 'string)) + (let* ((s (string-capitalize + ;; #\Latin_Small_Letter_Dz and #\Latin_Small_Letter_Lj + ;; #have Unicode titlecase characters that differ from + ;; CHAR-UPCASE. This tests that STRING-CAPITALIZE use + ;; CHAR-UPCASE to produce the capitalization. + (coerce + '(#\Latin_Small_Letter_Dz + #\a #\b + #\space + #\Latin_Small_Letter_Lj + #\A #\B) + 'string))) (expected - ;; Manually convert S to a capitalized string using - ;; char-upcase/downcase. - (map 'string - #'(lambda (ch f) - (funcall f ch)) + ;; Manually convert the test string by calling CHAR-UPCASE + ;; or CHAR-DOWNCASE (or IDENTITY) to get the desired + ;; capitalized string. + (map 'list + #'(lambda (c f) + (funcall f c)) s (list #'char-upcase #'char-downcase #'char-downcase - #'char-downcase + #'identity #'char-upcase #'char-downcase #'char-downcase)))) (assert-equal (map 'list #'char-name expected) - (map 'list #'char-name (string-capitalize s))))) + (map 'list #'char-name s))))
===================================== tests/unicode.lisp ===================================== @@ -192,3 +192,19 @@ computed-breaks))) (assert-equalp b (do-test s))))))))) + +(define-test unicode.case-extend + "Test that Unicode never produces an upper or lower case character + outside the BMP for a character in the BMP" + (:tag :unicode) + ;; For each character code (that isn't a surrogate), find the + ;; corresponding Unicode upper and lowe case character. Verify that + ;; this character is in the BMP. + (loop for code from 0 below char-code-limit + unless (lisp::surrogatep code) + do + (assert-true (< (lisp::unicode-upper code) char-code-limit) + code (lisp::unicode-upper code)) + (assert-true (< (lisp::unicode-lower code) char-code-limit) + code (lisp::unicode-upper code)))) +
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/b71c090704bec10890c7df1...