GitLab

at 2024-06-03T23:07:07+00:00

Fix #323:  CL string casing

Merge branch 'issue-323-cl-string-casing' into 'master'

Fix #323:  CL string casing

Closes #323

See merge request cmucl/cmucl!224
 	     (new-index (- start offset) (1+ new-index)))

 	    ((= index (the fixnum end)))

 	  (declare (fixnum index new-index))

-	  (multiple-value-bind (code wide) (codepoint string index)

-	    (when wide (incf index))

-            ;; Use char-upcase if it's not a surrogate pair so that

-            ;; we're always consist.

-            (if wide

-                (setq code (unicode-upper code))

-                (setf code (char-code (char-upcase (code-char code)))))

-	    ;;@@ WARNING: this may, in theory, need to extend newstring

-	    ;;  but that never actually occurs as of Unicode 5.1.0,

-	    ;;  so I'm just going to ignore it for now...

-	    (multiple-value-bind (hi lo) (surrogates code)

-	      (setf (schar newstring new-index) hi)

-	      (when lo

-		(setf (schar newstring (incf new-index)) lo)))))

-	;;@@ WARNING: see above

-	(do ((index end (1+ index))

+	  (setf (schar newstring new-index)

+                (char-upcase (schar string index))))

+        (do ((index end (1+ index))

 	     (new-index (- (the fixnum end) offset) (1+ new-index)))

 	    ((= index offset-slen))

 	  (declare (fixnum index new-index))

 	  (setf (schar newstring new-index) (schar string index)))

-	newstring))))

+        newstring))))

 (defun string-downcase (string &key (start 0) end)

   _N"Given a string, returns a new string that is a copy of it with all

 	     (new-index (- start offset) (1+ new-index)))

 	    ((= index (the fixnum end)))

 	  (declare (fixnum index new-index))

-	  (multiple-value-bind (code wide) (codepoint string index)

-	    (when wide (incf index))

-            ;; Use char-downcase if it's not a surrogate pair so that

-            ;; we're always consist.

-            (if wide

-                (setq code (unicode-lower code))

-                (setq code (char-code (char-downcase (code-char code)))))

-	    ;;@@ WARNING: this may, in theory, need to extend newstring

-	    ;;  but that never actually occurs as of Unicode 5.1.0,

-	    ;;  so I'm just going to ignore it for now...

-	    (multiple-value-bind (hi lo) (surrogates code)

-	      (setf (schar newstring new-index) hi)

-	      (when lo

-		(setf (schar newstring (incf new-index)) lo)))))

-	;;@@ WARNING: see above

-	(do ((index end (1+ index))

-	     (new-index (- (the fixnum end) offset) (1+ new-index)))

-	    ((= index offset-slen))

-	  (declare (fixnum index new-index))

-	  (setf (schar newstring new-index) (schar string index)))

-	newstring))))

-

-#+nil

-(defun string-capitalize (string &key (start 0) end)

-  _N"Given a string, returns a copy of the string with the first

-  character of each ``word'' converted to upper-case, and remaining

-  chars in the word converted to lower case. A ``word'' is defined

-  to be a string of case-modifiable characters delimited by

-  non-case-modifiable chars."

-  (declare (fixnum start))

-  (let* ((string (if (stringp string) string (string string)))

-	 (slen (length string)))

-    (declare (fixnum slen))

-    (with-one-string string start end offset

-      (let ((offset-slen (+ slen offset))

-	    (newstring (make-string slen)))

-	(declare (fixnum offset-slen))

-	(do ((index offset (1+ index))

-	     (new-index 0 (1+ new-index)))

-	    ((= index start))

-	  (declare (fixnum index new-index))

-	  (setf (schar newstring new-index) (schar string index)))

-	(do ((index start (1+ index))

-	     (new-index (- start offset) (1+ new-index))

-	     (newword t)

-	     (char ()))

-	    ((= index (the fixnum end)))

-	  (declare (fixnum index new-index))

-	  (setq char (schar string index))

-	  (cond ((not (alphanumericp char))

-		 (setq newword t))

-		(newword

-		 ;;char is first case-modifiable after non-case-modifiable

-		 (setq char (char-titlecase char))

-		 (setq newword ()))

-		;;char is case-modifiable, but not first

-		(t (setq char (char-downcase char))))

-	  (setf (schar newstring new-index) char))

+	  (setf (schar newstring new-index)

+                (char-downcase (schar string index))))

 	(do ((index end (1+ index))

 	     (new-index (- (the fixnum end) offset) (1+ new-index)))

 	    ((= index offset-slen))

       (do ((index start (1+ index)))

 	  ((= index (the fixnum end)))

 	(declare (fixnum index))

-	(multiple-value-bind (code wide) (codepoint string index)

-          (if wide

-              (setq code (unicode-upper code))

-              (setf code (char-code (char-upcase (code-char code)))))

-	  ;;@@ WARNING: this may, in theory, need to extend string

-	  ;;      (which, obviously, we can't do here.  Unless

-	  ;;       STRING is adjustable, maybe)

-	  ;;  but that never actually occurs as of Unicode 5.1.0,

-	  ;;  so I'm just going to ignore it for now...

-	  (multiple-value-bind (hi lo) (surrogates code)

-	    (setf (schar string index) hi)

-	    (when lo

-	      (setf (schar string (incf index)) lo))))))

-    save-header))

+        (setf (schar string index)

+              (char-upcase (schar string index))))

+    save-header)))

 (defun nstring-downcase (string &key (start 0) end)

   _N"Given a string, returns that string with all upper case alphabetic

       (do ((index start (1+ index)))

 	  ((= index (the fixnum end)))

 	(declare (fixnum index))

-	(multiple-value-bind (code wide) (codepoint string index)

-          (if wide

-              (setq code (unicode-lower code))

-              (setf code (char-code (char-downcase (code-char code)))))

-	  ;;@@ WARNING: this may, in theory, need to extend string

-	  ;;      (which, obviously, we can't do here.  Unless

-	  ;;       STRING is adjustable, maybe)

-	  ;;  but that never actually occurs as of Unicode 5.1.0,

-	  ;;  so I'm just going to ignore it for now...

-	  (multiple-value-bind (hi lo) (surrogates code)

-	    (setf (schar string index) hi)

-	    (when lo

-	      (setf (schar string (incf index)) lo))))))

-    save-header))

-

-#+nil

-(defun nstring-capitalize (string &key (start 0) end)

-  "Given a string, returns that string with the first

-  character of each ``word'' converted to upper-case, and remaining

-  chars in the word converted to lower case. A ``word'' is defined

-  to be a string of case-modifiable characters delimited by

-  non-case-modifiable chars."

-  (declare (fixnum start))

-  (let ((save-header string))

-    (with-one-string string start end offset

-      (do ((index start (1+ index))

-	   (newword t)

-	   (char ()))

-	  ((= index (the fixnum end)))

-	(declare (fixnum index))

-	(setq char (schar string index))

-	(cond ((not (alphanumericp char))

-	       (setq newword t))

-	      (newword

-	       ;;char is first case-modifiable after non-case-modifiable

-	       (setf (schar string index) (char-titlecase char))

-	       (setq newword ()))

-	      (t

-	       (setf (schar string index) (char-downcase char))))))

+        (setf (schar string index)

+              (char-downcase (schar string index)))))

     save-header))

 (defun nstring-capitalize (string &key (start 0) end)

 (in-package "UNICODE")

 (intl:textdomain "cmucl")

+(defun string-upcase-simple (string &key (start 0) end)

+  _N"Given a string, returns a new string that is a copy of it with all

+  lower case alphabetic characters converted to uppercase."

+  (declare (fixnum start))

+  (let* ((string (string string))

+	 (slen (length string)))

+    (declare (fixnum slen))

+    (lisp::with-one-string string start end offset

+      (let ((offset-slen (+ slen offset))

+	    (newstring (make-string slen)))

+	(declare (fixnum offset-slen))

+	(do ((index offset (1+ index))

+	     (new-index 0 (1+ new-index)))

+	    ((= index start))

+	  (declare (fixnum index new-index))

+	  (setf (schar newstring new-index) (schar string index)))

+	(do ((index start (1+ index))

+	     (new-index (- start offset) (1+ new-index)))

+	    ((= index (the fixnum end)))

+	  (declare (fixnum index new-index))

+	  (multiple-value-bind (code wide) (codepoint string index)

+	    (when wide (incf index))

+            ;; Use char-upcase if it's not a surrogate pair so that

+            ;; we're always consist.

+            (if wide

+                (setq code (unicode-upper code))

+                (setf code (char-code (char-upcase (code-char code)))))

+	    ;;@@ WARNING: this may, in theory, need to extend newstring

+	    ;;  but that never actually occurs as of Unicode 5.1.0,

+	    ;;  so I'm just going to ignore it for now...

+	    (multiple-value-bind (hi lo) (surrogates code)

+	      (setf (schar newstring new-index) hi)

+	      (when lo

+		(setf (schar newstring (incf new-index)) lo)))))

+	;;@@ WARNING: see above

+	(do ((index end (1+ index))

+	     (new-index (- (the fixnum end) offset) (1+ new-index)))

+	    ((= index offset-slen))

+	  (declare (fixnum index new-index))

+	  (setf (schar newstring new-index) (schar string index)))

+	newstring))))

+

+

 ;; An example where this differs from cl:string-upcase differ:

 ;; #\Latin_Small_Letter_Sharp_S

 (defun string-upcase-full (string &key (start 0) end)

   _N"Given a string, returns a new string that is a copy of it with

   all lower case alphabetic characters converted to uppercase using

   full case conversion."

-  (declare (fixnum start)) (let* ((string (if

-  (stringp string) string (string string)))

+  (declare (fixnum start))

+  (let* ((string (string string))

 	 (slen (length string)))

     (declare (fixnum slen))

     (with-output-to-string (s)

   all lower case alphabetic characters converted to uppercase.  Casing

   is :simple or :full for simple or full case conversion,

   respectively."

-  (declare (fixnum start))

-  (if (eq casing :simple)

-      (cl:string-upcase string :start start :end end)

-      (string-upcase-full string :start start :end end)))

+  (declare (fixnum start)

+           (type (member :simple :full) casing))

+  (ecase casing

+    (:simple

+     (string-upcase-simple string :start start :end end))

+    (:full

+     (string-upcase-full string :start start :end end))))

+(defun string-downcase-simple (string &key (start 0) end)

+  _N"Given a string, returns a new string that is a copy of it with all

+  upper case alphabetic characters converted to lowercase."

+  (declare (fixnum start))

+  (let* ((string (if (stringp string) string (string string)))

+	 (slen (length string)))

+    (declare (fixnum slen))

+    (lisp::with-one-string string start end offset

+      (let ((offset-slen (+ slen offset))

+	    (newstring (make-string slen)))

+	(declare (fixnum offset-slen))

+	(do ((index offset (1+ index))

+	     (new-index 0 (1+ new-index)))

+	    ((= index start))

+	  (declare (fixnum index new-index))

+	  (setf (schar newstring new-index) (schar string index)))

+	(do ((index start (1+ index))

+	     (new-index (- start offset) (1+ new-index)))

+	    ((= index (the fixnum end)))

+	  (declare (fixnum index new-index))

+	  (multiple-value-bind (code wide) (codepoint string index)

+	    (when wide (incf index))

+            ;; Use char-downcase if it's not a surrogate pair so that

+            ;; we're always consist.

+            (if wide

+                (setq code (unicode-lower code))

+                (setq code (char-code (char-downcase (code-char code)))))

+	    ;;@@ WARNING: this may, in theory, need to extend newstring

+	    ;;  but that never actually occurs as of Unicode 5.1.0,

+	    ;;  so I'm just going to ignore it for now...

+	    (multiple-value-bind (hi lo) (surrogates code)

+	      (setf (schar newstring new-index) hi)

+	      (when lo

+		(setf (schar newstring (incf new-index)) lo)))))

+	;;@@ WARNING: see above

+	(do ((index end (1+ index))

+	     (new-index (- (the fixnum end) offset) (1+ new-index)))

+	    ((= index offset-slen))

+	  (declare (fixnum index new-index))

+	  (setf (schar newstring new-index) (schar string index)))

+	newstring))))

 ;; An example this differs from cl:string-downcase:

 ;; #\Latin_Capital_Letter_I_With_Dot_Above.

   uppercase alphabetic characters converted to lowercase.  Casing is

   :simple or :full for simple or full case conversion, respectively."

-  (declare (fixnum start))

-  (if (eq casing :simple)

-      (cl:string-downcase string :start start :end end)

-      (string-downcase-full string :start start :end end)))

+  (declare (fixnum start)

+           (type (member :simple :full) casing))

+  (ecase casing

+    (:simple

+     (string-downcase-simple string :start start :end end))

+    (:full

+     (string-downcase-full string :start start :end end))))

 ;;;

   delimited by non-case-modifiable chars.  "

   (declare (fixnum start)

-	   (type (member :simple :full :title) casing))

+	   (type (member :simple-title :simple :full :title) casing))

   (if unicode-word-break

       (string-capitalize-unicode string :start start :end end :casing casing)

-      (if (eq casing :simple)

-	  (string-capitalize-simple string :start start :end end)

-	  (string-capitalize-full string :start start :end end :casing casing))))

+      (ecase casing

+        (:simple-title

+	 (string-capitalize-simple string :start start :end end))

+        ((:simple :full :title)

+	 (string-capitalize-full string :start start :end end :casing casing)))))

 (defun decompose-hangul-syllable (cp stream)

 "  a new string Count long filled with the fill character."

 msgstr ""

-#: src/code/string.lisp

+#: src/code/unicode.lisp src/code/string.lisp

 msgid ""

 "Given a string, returns a new string that is a copy of it with all\n"

 "  lower case alphabetic characters converted to uppercase."

 msgstr ""

-#: src/code/string.lisp

+#: src/code/unicode.lisp src/code/string.lisp

 msgid ""

 "Given a string, returns a new string that is a copy of it with all\n"

 "  upper case alphabetic characters converted to lowercase."

 (defun make-test-string ()

   ;; Create a string consisting of all the code units EXCEPT for the

   ;; surrogates because string casing handles that differently.

-  (coerce

-   (loop for code from 0 to #xffff

-         unless (lisp::surrogatep code)

-         collect (code-char code))

-   'string))

+  (let ((s (make-string char-code-limit)))

+    (dotimes (k char-code-limit)

+      (setf (aref s k) (code-char k)))

+    s))

 (define-test string-upcase

     (:tag :issues)

 (define-test string-capitalize

     (:tag :issues)

-  (let* ((s

-           ;; This string contains a couple of characters where

-           ;; Unicode has a titlecase version of the character.  We

-           ;; want to make sure we use char-upcase to capitalize the

-           ;; string instead of lisp::char-titlecse

-           (coerce

-             '(#\Latin_Small_Letter_Dz

-               #\a #\b

-               #\space

-               #\Latin_Small_Letter_Lj

-               #\A #\B)

-             'string))

+  (let* ((s (string-capitalize

+             ;; #\Latin_Small_Letter_Dz and #\Latin_Small_Letter_Lj

+             ;; #have Unicode titlecase characters that differ from

+             ;; CHAR-UPCASE.  This tests that STRING-CAPITALIZE use

+             ;; CHAR-UPCASE to produce the capitalization.

+             (coerce

+              '(#\Latin_Small_Letter_Dz

+                #\a #\b

+                #\space

+                #\Latin_Small_Letter_Lj

+                #\A #\B)

+              'string)))

          (expected

-           ;; Manually convert S to a capitalized string using

-           ;; char-upcase/downcase.

-           (map 'string

-                #'(lambda (ch f)

-                    (funcall f ch))

+           ;; Manually convert the test string by calling CHAR-UPCASE

+           ;; or CHAR-DOWNCASE (or IDENTITY) to get the desired

+           ;; capitalized string.

+           (map 'list

+                #'(lambda (c f)

+                    (funcall f c))

                 s

                 (list #'char-upcase

                       #'char-downcase

                       #'char-downcase

-                      #'char-downcase

+                      #'identity

                       #'char-upcase

                       #'char-downcase

                       #'char-downcase))))

     (assert-equal

      (map 'list #'char-name expected)

-     (map 'list #'char-name (string-capitalize s)))))
+     (map 'list #'char-name s))))
 			      computed-breaks)))

 		     (assert-equalp b

 				    (do-test s)))))))))

+

+(define-test unicode.case-extend

+  "Test that Unicode never produces an upper or lower case character

+  outside the BMP for a character in the BMP"

+  (:tag :unicode)

+  ;; For each character code (that isn't a surrogate), find the

+  ;; corresponding Unicode upper and lowe case character.  Verify that

+  ;; this character is in the BMP.

+  (loop for code from 0 below char-code-limit

+        unless (lisp::surrogatep code)

+          do

+             (assert-true (< (lisp::unicode-upper code) char-code-limit)

+                          code (lisp::unicode-upper code))

+             (assert-true (< (lisp::unicode-lower code) char-code-limit)

+                          code (lisp::unicode-upper code))))

+             

Raymond Toy pushed to branch master at cmucl / cmucl

Commits:

5 changed files:

Changes: