[Git][cmucl/cmucl][master] 2 commits: Fix #323: CL string casing

3 Jun 2024


      Raymond Toy pushed to branch master at cmucl / cmucl


Commits:
535f7e5d by Raymond Toy at 2024-06-03T23:07:07+00:00
Fix #323:  CL string casing

- - - - -
57d5a21c by Raymond Toy at 2024-06-03T23:07:12+00:00
Merge branch 'issue-323-cl-string-casing' into 'master'

Fix #323:  CL string casing

Closes #323

See merge request cmucl/cmucl!224
- - - - -


5 changed files:

- src/code/string.lisp
- src/code/unicode.lisp
- src/i18n/locale/cmucl.pot
- tests/string.lisp
- tests/unicode.lisp


Changes:

=====================================
src/code/string.lisp
=====================================
@@ -638,27 +638,14 @@
 	     (new-index (- start offset) (1+ new-index)))
 	    ((= index (the fixnum end)))
 	  (declare (fixnum index new-index))
-	  (multiple-value-bind (code wide) (codepoint string index)
-	    (when wide (incf index))
-            ;; Use char-upcase if it's not a surrogate pair so that
-            ;; we're always consist.
-            (if wide
-                (setq code (unicode-upper code))
-                (setf code (char-code (char-upcase (code-char code)))))
-	    ;;@@ WARNING: this may, in theory, need to extend newstring
-	    ;;  but that never actually occurs as of Unicode 5.1.0,
-	    ;;  so I'm just going to ignore it for now...
-	    (multiple-value-bind (hi lo) (surrogates code)
-	      (setf (schar newstring new-index) hi)
-	      (when lo
-		(setf (schar newstring (incf new-index)) lo)))))
-	;;@@ WARNING: see above
-	(do ((index end (1+ index))
+	  (setf (schar newstring new-index)
+                (char-upcase (schar string index))))
+        (do ((index end (1+ index))
 	     (new-index (- (the fixnum end) offset) (1+ new-index)))
 	    ((= index offset-slen))
 	  (declare (fixnum index new-index))
 	  (setf (schar newstring new-index) (schar string index)))
-	newstring))))
+        newstring))))
 
 (defun string-downcase (string &key (start 0) end)
   _N"Given a string, returns a new string that is a copy of it with all
@@ -680,64 +667,8 @@
 	     (new-index (- start offset) (1+ new-index)))
 	    ((= index (the fixnum end)))
 	  (declare (fixnum index new-index))
-	  (multiple-value-bind (code wide) (codepoint string index)
-	    (when wide (incf index))
-            ;; Use char-downcase if it's not a surrogate pair so that
-            ;; we're always consist.
-            (if wide
-                (setq code (unicode-lower code))
-                (setq code (char-code (char-downcase (code-char code)))))
-	    ;;@@ WARNING: this may, in theory, need to extend newstring
-	    ;;  but that never actually occurs as of Unicode 5.1.0,
-	    ;;  so I'm just going to ignore it for now...
-	    (multiple-value-bind (hi lo) (surrogates code)
-	      (setf (schar newstring new-index) hi)
-	      (when lo
-		(setf (schar newstring (incf new-index)) lo)))))
-	;;@@ WARNING: see above
-	(do ((index end (1+ index))
-	     (new-index (- (the fixnum end) offset) (1+ new-index)))
-	    ((= index offset-slen))
-	  (declare (fixnum index new-index))
-	  (setf (schar newstring new-index) (schar string index)))
-	newstring))))
-
-#+nil
-(defun string-capitalize (string &key (start 0) end)
-  _N"Given a string, returns a copy of the string with the first
-  character of each ``word'' converted to upper-case, and remaining
-  chars in the word converted to lower case. A ``word'' is defined
-  to be a string of case-modifiable characters delimited by
-  non-case-modifiable chars."
-  (declare (fixnum start))
-  (let* ((string (if (stringp string) string (string string)))
-	 (slen (length string)))
-    (declare (fixnum slen))
-    (with-one-string string start end offset
-      (let ((offset-slen (+ slen offset))
-	    (newstring (make-string slen)))
-	(declare (fixnum offset-slen))
-	(do ((index offset (1+ index))
-	     (new-index 0 (1+ new-index)))
-	    ((= index start))
-	  (declare (fixnum index new-index))
-	  (setf (schar newstring new-index) (schar string index)))
-	(do ((index start (1+ index))
-	     (new-index (- start offset) (1+ new-index))
-	     (newword t)
-	     (char ()))
-	    ((= index (the fixnum end)))
-	  (declare (fixnum index new-index))
-	  (setq char (schar string index))
-	  (cond ((not (alphanumericp char))
-		 (setq newword t))
-		(newword
-		 ;;char is first case-modifiable after non-case-modifiable
-		 (setq char (char-titlecase char))
-		 (setq newword ()))
-		;;char is case-modifiable, but not first
-		(t (setq char (char-downcase char))))
-	  (setf (schar newstring new-index) char))
+	  (setf (schar newstring new-index)
+                (char-downcase (schar string index))))
 	(do ((index end (1+ index))
 	     (new-index (- (the fixnum end) offset) (1+ new-index)))
 	    ((= index offset-slen))
@@ -796,20 +727,9 @@
       (do ((index start (1+ index)))
 	  ((= index (the fixnum end)))
 	(declare (fixnum index))
-	(multiple-value-bind (code wide) (codepoint string index)
-          (if wide
-              (setq code (unicode-upper code))
-              (setf code (char-code (char-upcase (code-char code)))))
-	  ;;@@ WARNING: this may, in theory, need to extend string
-	  ;;      (which, obviously, we can't do here.  Unless
-	  ;;       STRING is adjustable, maybe)
-	  ;;  but that never actually occurs as of Unicode 5.1.0,
-	  ;;  so I'm just going to ignore it for now...
-	  (multiple-value-bind (hi lo) (surrogates code)
-	    (setf (schar string index) hi)
-	    (when lo
-	      (setf (schar string (incf index)) lo))))))
-    save-header))
+        (setf (schar string index)
+              (char-upcase (schar string index))))
+    save-header)))
 
 (defun nstring-downcase (string &key (start 0) end)
   _N"Given a string, returns that string with all upper case alphabetic
@@ -820,45 +740,8 @@
       (do ((index start (1+ index)))
 	  ((= index (the fixnum end)))
 	(declare (fixnum index))
-	(multiple-value-bind (code wide) (codepoint string index)
-          (if wide
-              (setq code (unicode-lower code))
-              (setf code (char-code (char-downcase (code-char code)))))
-	  ;;@@ WARNING: this may, in theory, need to extend string
-	  ;;      (which, obviously, we can't do here.  Unless
-	  ;;       STRING is adjustable, maybe)
-	  ;;  but that never actually occurs as of Unicode 5.1.0,
-	  ;;  so I'm just going to ignore it for now...
-	  (multiple-value-bind (hi lo) (surrogates code)
-	    (setf (schar string index) hi)
-	    (when lo
-	      (setf (schar string (incf index)) lo))))))
-    save-header))
-
-#+nil
-(defun nstring-capitalize (string &key (start 0) end)
-  "Given a string, returns that string with the first
-  character of each ``word'' converted to upper-case, and remaining
-  chars in the word converted to lower case. A ``word'' is defined
-  to be a string of case-modifiable characters delimited by
-  non-case-modifiable chars."
-  (declare (fixnum start))
-  (let ((save-header string))
-    (with-one-string string start end offset
-      (do ((index start (1+ index))
-	   (newword t)
-	   (char ()))
-	  ((= index (the fixnum end)))
-	(declare (fixnum index))
-	(setq char (schar string index))
-	(cond ((not (alphanumericp char))
-	       (setq newword t))
-	      (newword
-	       ;;char is first case-modifiable after non-case-modifiable
-	       (setf (schar string index) (char-titlecase char))
-	       (setq newword ()))
-	      (t
-	       (setf (schar string index) (char-downcase char))))))
+        (setf (schar string index)
+              (char-downcase (schar string index)))))
     save-header))
 
 (defun nstring-capitalize (string &key (start 0) end)


=====================================
src/code/unicode.lisp
=====================================
@@ -17,14 +17,57 @@
 (in-package "UNICODE")
 (intl:textdomain "cmucl")
 
+(defun string-upcase-simple (string &key (start 0) end)
+  _N"Given a string, returns a new string that is a copy of it with all
+  lower case alphabetic characters converted to uppercase."
+  (declare (fixnum start))
+  (let* ((string (string string))
+	 (slen (length string)))
+    (declare (fixnum slen))
+    (lisp::with-one-string string start end offset
+      (let ((offset-slen (+ slen offset))
+	    (newstring (make-string slen)))
+	(declare (fixnum offset-slen))
+	(do ((index offset (1+ index))
+	     (new-index 0 (1+ new-index)))
+	    ((= index start))
+	  (declare (fixnum index new-index))
+	  (setf (schar newstring new-index) (schar string index)))
+	(do ((index start (1+ index))
+	     (new-index (- start offset) (1+ new-index)))
+	    ((= index (the fixnum end)))
+	  (declare (fixnum index new-index))
+	  (multiple-value-bind (code wide) (codepoint string index)
+	    (when wide (incf index))
+            ;; Use char-upcase if it's not a surrogate pair so that
+            ;; we're always consist.
+            (if wide
+                (setq code (unicode-upper code))
+                (setf code (char-code (char-upcase (code-char code)))))
+	    ;;@@ WARNING: this may, in theory, need to extend newstring
+	    ;;  but that never actually occurs as of Unicode 5.1.0,
+	    ;;  so I'm just going to ignore it for now...
+	    (multiple-value-bind (hi lo) (surrogates code)
+	      (setf (schar newstring new-index) hi)
+	      (when lo
+		(setf (schar newstring (incf new-index)) lo)))))
+	;;@@ WARNING: see above
+	(do ((index end (1+ index))
+	     (new-index (- (the fixnum end) offset) (1+ new-index)))
+	    ((= index offset-slen))
+	  (declare (fixnum index new-index))
+	  (setf (schar newstring new-index) (schar string index)))
+	newstring))))
+
+
 ;; An example where this differs from cl:string-upcase differ:
 ;; #\Latin_Small_Letter_Sharp_S
 (defun string-upcase-full (string &key (start 0) end)
   _N"Given a string, returns a new string that is a copy of it with
   all lower case alphabetic characters converted to uppercase using
   full case conversion."
-  (declare (fixnum start)) (let* ((string (if
-  (stringp string) string (string string)))
+  (declare (fixnum start))
+  (let* ((string (string string))
 	 (slen (length string)))
     (declare (fixnum slen))
     (with-output-to-string (s)
@@ -57,11 +100,55 @@
   all lower case alphabetic characters converted to uppercase.  Casing
   is :simple or :full for simple or full case conversion,
   respectively."
-  (declare (fixnum start))
-  (if (eq casing :simple)
-      (cl:string-upcase string :start start :end end)
-      (string-upcase-full string :start start :end end)))
+  (declare (fixnum start)
+           (type (member :simple :full) casing))
+  (ecase casing
+    (:simple
+     (string-upcase-simple string :start start :end end))
+    (:full
+     (string-upcase-full string :start start :end end))))
 
+(defun string-downcase-simple (string &key (start 0) end)
+  _N"Given a string, returns a new string that is a copy of it with all
+  upper case alphabetic characters converted to lowercase."
+  (declare (fixnum start))
+  (let* ((string (if (stringp string) string (string string)))
+	 (slen (length string)))
+    (declare (fixnum slen))
+    (lisp::with-one-string string start end offset
+      (let ((offset-slen (+ slen offset))
+	    (newstring (make-string slen)))
+	(declare (fixnum offset-slen))
+	(do ((index offset (1+ index))
+	     (new-index 0 (1+ new-index)))
+	    ((= index start))
+	  (declare (fixnum index new-index))
+	  (setf (schar newstring new-index) (schar string index)))
+	(do ((index start (1+ index))
+	     (new-index (- start offset) (1+ new-index)))
+	    ((= index (the fixnum end)))
+	  (declare (fixnum index new-index))
+	  (multiple-value-bind (code wide) (codepoint string index)
+	    (when wide (incf index))
+            ;; Use char-downcase if it's not a surrogate pair so that
+            ;; we're always consist.
+            (if wide
+                (setq code (unicode-lower code))
+                (setq code (char-code (char-downcase (code-char code)))))
+	    ;;@@ WARNING: this may, in theory, need to extend newstring
+	    ;;  but that never actually occurs as of Unicode 5.1.0,
+	    ;;  so I'm just going to ignore it for now...
+	    (multiple-value-bind (hi lo) (surrogates code)
+	      (setf (schar newstring new-index) hi)
+	      (when lo
+		(setf (schar newstring (incf new-index)) lo)))))
+	;;@@ WARNING: see above
+	(do ((index end (1+ index))
+	     (new-index (- (the fixnum end) offset) (1+ new-index)))
+	    ((= index offset-slen))
+	  (declare (fixnum index new-index))
+	  (setf (schar newstring new-index) (schar string index)))
+	newstring))))
 
 ;; An example this differs from cl:string-downcase:
 ;; #\Latin_Capital_Letter_I_With_Dot_Above.
@@ -104,10 +191,13 @@
   uppercase alphabetic characters converted to lowercase.  Casing is
   :simple or :full for simple or full case conversion, respectively."
 
-  (declare (fixnum start))
-  (if (eq casing :simple)
-      (cl:string-downcase string :start start :end end)
-      (string-downcase-full string :start start :end end)))
+  (declare (fixnum start)
+           (type (member :simple :full) casing))
+  (ecase casing
+    (:simple
+     (string-downcase-simple string :start start :end end))
+    (:full
+     (string-downcase-full string :start start :end end))))
 
 
 ;;;
@@ -553,12 +643,14 @@
   delimited by non-case-modifiable chars.  "
 
   (declare (fixnum start)
-	   (type (member :simple :full :title) casing))
+	   (type (member :simple-title :simple :full :title) casing))
   (if unicode-word-break
       (string-capitalize-unicode string :start start :end end :casing casing)
-      (if (eq casing :simple)
-	  (string-capitalize-simple string :start start :end end)
-	  (string-capitalize-full string :start start :end end :casing casing))))
+      (ecase casing
+        (:simple-title
+	 (string-capitalize-simple string :start start :end end))
+        ((:simple :full :title)
+	 (string-capitalize-full string :start start :end end :casing casing)))))
 
 
 (defun decompose-hangul-syllable (cp stream)


=====================================
src/i18n/locale/cmucl.pot
=====================================
@@ -3880,13 +3880,13 @@ msgid ""
 "  a new string Count long filled with the fill character."
 msgstr ""
 
-#: src/code/string.lisp
+#: src/code/unicode.lisp src/code/string.lisp
 msgid ""
 "Given a string, returns a new string that is a copy of it with all\n"
 "  lower case alphabetic characters converted to uppercase."
 msgstr ""
 
-#: src/code/string.lisp
+#: src/code/unicode.lisp src/code/string.lisp
 msgid ""
 "Given a string, returns a new string that is a copy of it with all\n"
 "  upper case alphabetic characters converted to lowercase."


=====================================
tests/string.lisp
=====================================
@@ -8,11 +8,10 @@
 (defun make-test-string ()
   ;; Create a string consisting of all the code units EXCEPT for the
   ;; surrogates because string casing handles that differently.
-  (coerce
-   (loop for code from 0 to #xffff
-         unless (lisp::surrogatep code)
-         collect (code-char code))
-   'string))
+  (let ((s (make-string char-code-limit)))
+    (dotimes (k char-code-limit)
+      (setf (aref s k) (code-char k)))
+    s))
 
 (define-test string-upcase
     (:tag :issues)
@@ -62,32 +61,33 @@
 
 (define-test string-capitalize
     (:tag :issues)
-  (let* ((s
-           ;; This string contains a couple of characters where
-           ;; Unicode has a titlecase version of the character.  We
-           ;; want to make sure we use char-upcase to capitalize the
-           ;; string instead of lisp::char-titlecse
-           (coerce
-             '(#\Latin_Small_Letter_Dz
-               #\a #\b
-               #\space
-               #\Latin_Small_Letter_Lj
-               #\A #\B)
-             'string))
+  (let* ((s (string-capitalize
+             ;; #\Latin_Small_Letter_Dz and #\Latin_Small_Letter_Lj
+             ;; #have Unicode titlecase characters that differ from
+             ;; CHAR-UPCASE.  This tests that STRING-CAPITALIZE use
+             ;; CHAR-UPCASE to produce the capitalization.
+             (coerce
+              '(#\Latin_Small_Letter_Dz
+                #\a #\b
+                #\space
+                #\Latin_Small_Letter_Lj
+                #\A #\B)
+              'string)))
          (expected
-           ;; Manually convert S to a capitalized string using
-           ;; char-upcase/downcase.
-           (map 'string
-                #'(lambda (ch f)
-                    (funcall f ch))
+           ;; Manually convert the test string by calling CHAR-UPCASE
+           ;; or CHAR-DOWNCASE (or IDENTITY) to get the desired
+           ;; capitalized string.
+           (map 'list
+                #'(lambda (c f)
+                    (funcall f c))
                 s
                 (list #'char-upcase
                       #'char-downcase
                       #'char-downcase
-                      #'char-downcase
+                      #'identity
                       #'char-upcase
                       #'char-downcase
                       #'char-downcase))))
     (assert-equal
      (map 'list #'char-name expected)
-     (map 'list #'char-name (string-capitalize s)))))
+     (map 'list #'char-name s))))


=====================================
tests/unicode.lisp
=====================================
@@ -192,3 +192,19 @@
 			      computed-breaks)))
 		     (assert-equalp b
 				    (do-test s)))))))))
+
+(define-test unicode.case-extend
+  "Test that Unicode never produces an upper or lower case character
+  outside the BMP for a character in the BMP"
+  (:tag :unicode)
+  ;; For each character code (that isn't a surrogate), find the
+  ;; corresponding Unicode upper and lowe case character.  Verify that
+  ;; this character is in the BMP.
+  (loop for code from 0 below char-code-limit
+        unless (lisp::surrogatep code)
+          do
+             (assert-true (< (lisp::unicode-upper code) char-code-limit)
+                          code (lisp::unicode-upper code))
+             (assert-true (< (lisp::unicode-lower code) char-code-limit)
+                          code (lisp::unicode-upper code))))
+             



View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/b71c090704bec10890c7df1...

-- 
This project does not include diff previews in email notifications.
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/b71c090704bec10890c7df1...
You're receiving this email because of your account on gitlab.common-lisp.net.

    

Raymond Toy (＠rtoy)

tags

participants (1)