Raymond Toy pushed to branch issue-326-char-casing-cleanup at cmucl / cmucl
Commits:
b8887cd6 by Raymond Toy at 2024-06-03T14:38:08-07:00
Fix typo in the lower limit for char-upcase.
We use a lower limit of `#\a`, but we really should have used `#\``.
- - - - -
e4a3d3b5 by Raymond Toy at 2024-06-03T15:59:09-07:00
Fix equal-char-code to use CL lowercase
Previously, `equal-char-code` used the Unicode function
`unicode-lower` to convert the character to lower case. That's not
what we want; we want to use the CL lower case character. Thus, use
`case-mapping-lower-case` to get the desired character code.
Add a test for `char-equal` that shows we've fixed this.
- - - - -
c3e7e0b4 by Raymond Toy at 2024-06-03T16:11:47-07:00
Simplify equal-char-code as suggested.
Replace the nested `if`'s with a `cond`. Change the bounds to be
inclusive to make it easier to reason about the range of characters.
- - - - -
7c711f4f by Raymond Toy at 2024-06-03T16:12:48-07:00
Fix a typo and add a comment.
- - - - -
3 changed files:
- src/code/char.lisp
- src/compiler/srctran.lisp
- + tests/char.lisp
Changes:
=====================================
src/code/char.lisp
=====================================
@@ -418,14 +418,14 @@
(defmacro equal-char-code (character)
`(let ((ch (char-code ,character)))
- ;; Handle ASCII separately for bootstrapping and for unidata missing.
- (if (< (char-code #\@) ch (char-code #\[))
- (+ ch 32)
- #-(and unicode (not unicode-bootstrap))
- ch
- #+(and unicode (not unicode-bootstrap))
- (if (> ch +ascii-limit+) (unicode-lower ch) ch))))
-
+ ;; Handle ASCII separately for bootstrapping.
+ (cond ((<= (char-code #\A) ch (char-code #\Z))
+ (logxor ch #x20))
+ #+(and unicode (not unicode-bootstrap))
+ ((> ch +ascii-limit+)
+ (case-mapping-lower-case ch))
+ (t
+ ch))))
(defun char-equal (character &rest more-characters)
"Returns T if all of its arguments are the same character.
=====================================
src/compiler/srctran.lisp
=====================================
@@ -3343,7 +3343,7 @@
x)
#+(and unicode (not unicode-bootstrap))
'(let ((m (char-code x)))
- (cond ((< (char-code #\a) m (char-code #\{))
+ (cond ((< (char-code #\`) m (char-code #\{))
(code-char (logxor m #x20)))
((> m lisp::+ascii-limit+)
(code-char (lisp::case-mapping-upper-case m)))
=====================================
tests/char.lisp
=====================================
@@ -0,0 +1,24 @@
+;; Tests of char functions
+
+(defpackage :char-tests
+ (:use :cl :lisp-unit))
+
+(in-package "CHAR-TESTS")
+
+(define-test char-equal
+ (:tag :issues)
+ (let ((test-codes
+ ;; Find all the codes where the CL lower case character
+ ;; doesn't match the Unicode lower case character.
+ (loop for code from 128 below char-code-limit
+ for ch = (code-char code)
+ when (/= (char-code (char-downcase ch)) (or (lisp::unicode-lower code) code))
+ collect code)))
+ (dolist (code test-codes)
+ ;; Verify that we convert to the CL lower case character instead
+ ;; of the Unicode lower case character for the cases where these
+ ;; are different.
+ (assert-false (char-equal (code-char (lisp::unicode-lower code))
+ (code-char code))
+ code
+ (lisp::unicode-lower code)))))
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/ca36c16321c2028784aaa3…
--
This project does not include diff previews in email notifications.
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/ca36c16321c2028784aaa3…
You're receiving this email because of your account on gitlab.common-lisp.net.
Raymond Toy pushed to branch issue-323-cl-string-casing at cmucl / cmucl
Commits:
6d72fb36 by Raymond Toy at 2024-06-03T14:39:51-07:00
Remove the old versions of string-capitalize and nstring-capitalize
Forgot to remove these when they were moved to unicode.lisp.
- - - - -
1 changed file:
- src/code/string.lisp
Changes:
=====================================
src/code/string.lisp
=====================================
@@ -676,49 +676,6 @@
(setf (schar newstring new-index) (schar string index)))
newstring))))
-#+nil
-(defun string-capitalize (string &key (start 0) end)
- _N"Given a string, returns a copy of the string with the first
- character of each ``word'' converted to upper-case, and remaining
- chars in the word converted to lower case. A ``word'' is defined
- to be a string of case-modifiable characters delimited by
- non-case-modifiable chars."
- (declare (fixnum start))
- (let* ((string (if (stringp string) string (string string)))
- (slen (length string)))
- (declare (fixnum slen))
- (with-one-string string start end offset
- (let ((offset-slen (+ slen offset))
- (newstring (make-string slen)))
- (declare (fixnum offset-slen))
- (do ((index offset (1+ index))
- (new-index 0 (1+ new-index)))
- ((= index start))
- (declare (fixnum index new-index))
- (setf (schar newstring new-index) (schar string index)))
- (do ((index start (1+ index))
- (new-index (- start offset) (1+ new-index))
- (newword t)
- (char ()))
- ((= index (the fixnum end)))
- (declare (fixnum index new-index))
- (setq char (schar string index))
- (cond ((not (alphanumericp char))
- (setq newword t))
- (newword
- ;;char is first case-modifiable after non-case-modifiable
- (setq char (char-upcase char))
- (setq newword ()))
- ;;char is case-modifiable, but not first
- (t (setq char (char-downcase char))))
- (setf (schar newstring new-index) char))
- (do ((index end (1+ index))
- (new-index (- (the fixnum end) offset) (1+ new-index)))
- ((= index offset-slen))
- (declare (fixnum index new-index))
- (setf (schar newstring new-index) (schar string index)))
- newstring))))
-
(defun string-capitalize (string &key (start 0) end)
_N"Given a string, returns a copy of the string with the first
character of each ``word'' converted to upper-case, and remaining
@@ -787,32 +744,6 @@
(char-downcase (schar string index)))))
save-header))
-#+nil
-(defun nstring-capitalize (string &key (start 0) end)
- "Given a string, returns that string with the first
- character of each ``word'' converted to upper-case, and remaining
- chars in the word converted to lower case. A ``word'' is defined
- to be a string of case-modifiable characters delimited by
- non-case-modifiable chars."
- (declare (fixnum start))
- (let ((save-header string))
- (with-one-string string start end offset
- (do ((index start (1+ index))
- (newword t)
- (char ()))
- ((= index (the fixnum end)))
- (declare (fixnum index))
- (setq char (schar string index))
- (cond ((not (alphanumericp char))
- (setq newword t))
- (newword
- ;;char is first case-modifiable after non-case-modifiable
- (setf (schar string index) (char-titlecase char))
- (setq newword ()))
- (t
- (setf (schar string index) (char-downcase char))))))
- save-header))
-
(defun nstring-capitalize (string &key (start 0) end)
_N"Given a string, returns that string with the first
character of each ``word'' converted to upper-case, and remaining
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/6d72fb364f0f2f84aaf97d2…
--
This project does not include diff previews in email notifications.
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/6d72fb364f0f2f84aaf97d2…
You're receiving this email because of your account on gitlab.common-lisp.net.
Raymond Toy pushed to branch issue-323-cl-string-casing at cmucl / cmucl
Commits:
e3862c75 by Raymond Toy at 2024-06-03T09:58:23-07:00
More cleanups
Just use simple `(string string)` instead of doing a test for
`stringp`.
Fix bad formatting in `string-upcase-full`.
Use `ecase` as needed and also add declarations to it clearer to the
compiler what the acceptable arguments are.
Also, we were confused on what `string-capitalize-simple` does compared
to `string-capitalize-full`. We call `string-capitalize-simple` if
the `casing` arg is `:simple-title` instead of `:simple`, which should
call `string-capitalize-full`.
- - - - -
1 changed file:
- src/code/unicode.lisp
Changes:
=====================================
src/code/unicode.lisp
=====================================
@@ -21,7 +21,7 @@
_N"Given a string, returns a new string that is a copy of it with all
lower case alphabetic characters converted to uppercase."
(declare (fixnum start))
- (let* ((string (if (stringp string) string (string string)))
+ (let* ((string (string string))
(slen (length string)))
(declare (fixnum slen))
(lisp::with-one-string string start end offset
@@ -66,8 +66,8 @@
_N"Given a string, returns a new string that is a copy of it with
all lower case alphabetic characters converted to uppercase using
full case conversion."
- (declare (fixnum start)) (let* ((string (if
- (stringp string) string (string string)))
+ (declare (fixnum start))
+ (let* ((string (string string))
(slen (length string)))
(declare (fixnum slen))
(with-output-to-string (s)
@@ -100,10 +100,13 @@
all lower case alphabetic characters converted to uppercase. Casing
is :simple or :full for simple or full case conversion,
respectively."
- (declare (fixnum start))
- (if (eq casing :simple)
- (string-upcase-simple string :start start :end end)
- (string-upcase-full string :start start :end end)))
+ (declare (fixnum start)
+ (type (member :simple :full) casing))
+ (ecase casing
+ (:simple
+ (string-upcase-simple string :start start :end end))
+ (:full
+ (string-upcase-full string :start start :end end))))
(defun string-downcase-simple (string &key (start 0) end)
_N"Given a string, returns a new string that is a copy of it with all
@@ -188,10 +191,13 @@
uppercase alphabetic characters converted to lowercase. Casing is
:simple or :full for simple or full case conversion, respectively."
- (declare (fixnum start))
- (if (eq casing :simple)
- (string-downcase-simple string :start start :end end)
- (string-downcase-full string :start start :end end)))
+ (declare (fixnum start)
+ (type (member :simple :full) casing))
+ (ecase casing
+ (:simple
+ (string-downcase-simple string :start start :end end))
+ (:full
+ (string-downcase-full string :start start :end end))))
;;;
@@ -637,12 +643,14 @@
delimited by non-case-modifiable chars. "
(declare (fixnum start)
- (type (member :simple :full :title) casing))
+ (type (member :simple-title :simple :full :title) casing))
(if unicode-word-break
(string-capitalize-unicode string :start start :end end :casing casing)
- (if (eq casing :simple)
- (string-capitalize-simple string :start start :end end)
- (string-capitalize-full string :start start :end end :casing casing))))
+ (ecase casing
+ (:simple-title
+ (string-capitalize-simple string :start start :end end))
+ ((:simple :full :title)
+ (string-capitalize-full string :start start :end end :casing casing)))))
(defun decompose-hangul-syllable (cp stream)
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/e3862c755851d5df523f922…
--
This project does not include diff previews in email notifications.
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/e3862c755851d5df523f922…
You're receiving this email because of your account on gitlab.common-lisp.net.
Raymond Toy pushed to branch issue-323-cl-string-casing at cmucl / cmucl
Commits:
c9c599f7 by Raymond Toy at 2024-06-03T08:44:43-07:00
Add test that a BMP character has an upper/lower case in the BMP.
- - - - -
e2e20bc2 by Raymond Toy at 2024-06-03T08:45:16-07:00
Fix typo in name of string-downcase-simpl
It should be `string-downcase-simple`.
- - - - -
2 changed files:
- src/code/unicode.lisp
- tests/unicode.lisp
Changes:
=====================================
src/code/unicode.lisp
=====================================
@@ -105,7 +105,7 @@
(string-upcase-simple string :start start :end end)
(string-upcase-full string :start start :end end)))
-(defun string-downcase-simpl (string &key (start 0) end)
+(defun string-downcase-simple (string &key (start 0) end)
_N"Given a string, returns a new string that is a copy of it with all
upper case alphabetic characters converted to lowercase."
(declare (fixnum start))
=====================================
tests/unicode.lisp
=====================================
@@ -192,3 +192,19 @@
computed-breaks)))
(assert-equalp b
(do-test s)))))))))
+
+(define-test unicode.case-extend
+ "Test that Unicode never produces an upper or lower case character
+ outside the BMP for a character in the BMP"
+ (:tag :unicode)
+ ;; For each character code (that isn't a surrogate), find the
+ ;; corresponding Unicode upper and lowe case character. Verify that
+ ;; this character is in the BMP.
+ (loop for code from 0 below char-code-limit
+ unless (lisp::surrogatep code)
+ do
+ (assert-true (< (lisp::unicode-upper code) char-code-limit)
+ code (lisp::unicode-upper code))
+ (assert-true (< (lisp::unicode-lower code) char-code-limit)
+ code (lisp::unicode-upper code))))
+
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/e6ef0eacc569fbad7f87a5…
--
This project does not include diff previews in email notifications.
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/e6ef0eacc569fbad7f87a5…
You're receiving this email because of your account on gitlab.common-lisp.net.
Raymond Toy pushed to branch issue-323-cl-string-casing at cmucl / cmucl
Commits:
50395287 by Raymond Toy at 2024-05-31T08:28:43-07:00
Update release notes for newly closed issues
Forgot to add these in the MRs that closed the issues.
- - - - -
5a037b19 by Raymond Toy at 2024-05-31T08:56:12-07:00
String-capitalize should use char-upcase, not lisp:char-titlecase
We were using lisp:;char-titlecase to capitalize the string, but we
really should use char-upcase. However, to preserve this
functionality, we moved the functions to the `unicode` package.
Added a test for this.
- - - - -
e6ef0eac by Raymond Toy at 2024-06-01T18:27:19-07:00
Merge branch 'master' into issue-323-cl-string-casing
- - - - -
3 changed files:
- src/code/string.lisp
- src/code/unicode.lisp
- src/general-info/release-21f.md
Changes:
=====================================
src/code/string.lisp
=====================================
@@ -676,6 +676,49 @@
(setf (schar newstring new-index) (schar string index)))
newstring))))
+#+nil
+(defun string-capitalize (string &key (start 0) end)
+ _N"Given a string, returns a copy of the string with the first
+ character of each ``word'' converted to upper-case, and remaining
+ chars in the word converted to lower case. A ``word'' is defined
+ to be a string of case-modifiable characters delimited by
+ non-case-modifiable chars."
+ (declare (fixnum start))
+ (let* ((string (if (stringp string) string (string string)))
+ (slen (length string)))
+ (declare (fixnum slen))
+ (with-one-string string start end offset
+ (let ((offset-slen (+ slen offset))
+ (newstring (make-string slen)))
+ (declare (fixnum offset-slen))
+ (do ((index offset (1+ index))
+ (new-index 0 (1+ new-index)))
+ ((= index start))
+ (declare (fixnum index new-index))
+ (setf (schar newstring new-index) (schar string index)))
+ (do ((index start (1+ index))
+ (new-index (- start offset) (1+ new-index))
+ (newword t)
+ (char ()))
+ ((= index (the fixnum end)))
+ (declare (fixnum index new-index))
+ (setq char (schar string index))
+ (cond ((not (alphanumericp char))
+ (setq newword t))
+ (newword
+ ;;char is first case-modifiable after non-case-modifiable
+ (setq char (char-upcase char))
+ (setq newword ()))
+ ;;char is case-modifiable, but not first
+ (t (setq char (char-downcase char))))
+ (setf (schar newstring new-index) char))
+ (do ((index end (1+ index))
+ (new-index (- (the fixnum end) offset) (1+ new-index)))
+ ((= index offset-slen))
+ (declare (fixnum index new-index))
+ (setf (schar newstring new-index) (schar string index)))
+ newstring))))
+
(defun string-capitalize (string &key (start 0) end)
_N"Given a string, returns a copy of the string with the first
character of each ``word'' converted to upper-case, and remaining
@@ -719,7 +762,7 @@
newstring))))
(defun nstring-upcase (string &key (start 0) end)
- "Given a string, returns that string with all lower case alphabetic
+ _N"Given a string, returns that string with all lower case alphabetic
characters converted to uppercase."
(declare (fixnum start))
(let ((save-header string))
@@ -732,7 +775,7 @@
save-header)))
(defun nstring-downcase (string &key (start 0) end)
- "Given a string, returns that string with all upper case alphabetic
+ _N"Given a string, returns that string with all upper case alphabetic
characters converted to lowercase."
(declare (fixnum start))
(let ((save-header string))
@@ -744,6 +787,7 @@
(char-downcase (schar string index)))))
save-header))
+#+nil
(defun nstring-capitalize (string &key (start 0) end)
"Given a string, returns that string with the first
character of each ``word'' converted to upper-case, and remaining
@@ -769,6 +813,31 @@
(setf (schar string index) (char-downcase char))))))
save-header))
+(defun nstring-capitalize (string &key (start 0) end)
+ _N"Given a string, returns that string with the first
+ character of each ``word'' converted to upper-case, and remaining
+ chars in the word converted to lower case. A ``word'' is defined
+ to be a string of case-modifiable characters delimited by
+ non-case-modifiable chars."
+ (declare (fixnum start))
+ (let ((save-header string))
+ (with-one-string string start end offset
+ (do ((index start (1+ index))
+ (newword t)
+ (char ()))
+ ((= index (the fixnum end)))
+ (declare (fixnum index))
+ (setq char (schar string index))
+ (cond ((not (alphanumericp char))
+ (setq newword t))
+ (newword
+ ;;char is first case-modifiable after non-case-modifiable
+ (setf (schar string index) (char-upcase char))
+ (setq newword ()))
+ (t
+ (setf (schar string index) (char-downcase char))))))
+ save-header))
+
#+unicode
(progn
=====================================
src/code/unicode.lisp
=====================================
@@ -641,7 +641,7 @@
(if unicode-word-break
(string-capitalize-unicode string :start start :end end :casing casing)
(if (eq casing :simple)
- (cl:string-capitalize-simple string :start start :end end)
+ (string-capitalize-simple string :start start :end end)
(string-capitalize-full string :start start :end end :casing casing))))
=====================================
src/general-info/release-21f.md
=====================================
@@ -77,6 +77,9 @@ public domain.
* ~~#298~~ Add `with-float-rounding-mode` macro
* ~~#299~~ Enable xoroshiro assembly routine
* ~~#314~~ tanh incorrect for large args
+ * ~~#316~~ Support roundtrip character casing
+ * ~~#320~~ Motif variant not defaulted for `x86_linux_clang` config
+ * ~~#321~~ Rename Motif Config.x86 to Config.linux
* Other changes:
* Improvements to the PCL implementation of CLOS:
* Changes to building procedure:
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/163cafa40fb099dc30b798…
--
This project does not include diff previews in email notifications.
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/163cafa40fb099dc30b798…
You're receiving this email because of your account on gitlab.common-lisp.net.
Raymond Toy pushed to branch issue-323-cl-string-casing at cmucl / cmucl
Commits:
3557fc31 by Raymond Toy at 2024-05-31T16:20:11-07:00
CL-compliant string-capitalize
Previously, when we capitalized a word, we used the titlecase.
However it should be upper case. Make it so, but move the original
implementation to unicode.lisp.
Add a test that we do use uppercase instead of titlecase.
- - - - -
163cafa4 by Raymond Toy at 2024-05-31T16:23:13-07:00
Rename string-capitalize to string-capitalize-simple.
Forgot to rename the old version so it's clear it's not CL
string-capitalize. Also call it in unicode:string-capitalize when the
casing is simple.
- - - - -
4 changed files:
- src/code/string.lisp
- src/code/unicode.lisp
- src/i18n/locale/cmucl.pot
- tests/string.lisp
Changes:
=====================================
src/code/string.lisp
=====================================
@@ -706,7 +706,7 @@
(setq newword t))
(newword
;;char is first case-modifiable after non-case-modifiable
- (setq char (char-titlecase char))
+ (setq char (char-upcase char))
(setq newword ()))
;;char is case-modifiable, but not first
(t (setq char (char-downcase char))))
=====================================
src/code/unicode.lisp
=====================================
@@ -525,6 +525,48 @@
:end next)))
(write-string string result :start end :end offset-slen))))))
+(defun string-capitalize-simple (string &key (start 0) end)
+ _N"Given a string, returns a copy of the string with the first
+ character of each ``word'' converted to upper-case, and remaining
+ chars in the word converted to lower case. A ``word'' is defined
+ to be a string of case-modifiable characters delimited by
+ non-case-modifiable chars."
+ (declare (fixnum start))
+ (let* ((string (if (stringp string) string (string string)))
+ (slen (length string)))
+ (declare (fixnum slen))
+ (with-one-string string start end offset
+ (let ((offset-slen (+ slen offset))
+ (newstring (make-string slen)))
+ (declare (fixnum offset-slen))
+ (do ((index offset (1+ index))
+ (new-index 0 (1+ new-index)))
+ ((= index start))
+ (declare (fixnum index new-index))
+ (setf (schar newstring new-index) (schar string index)))
+ (do ((index start (1+ index))
+ (new-index (- start offset) (1+ new-index))
+ (newword t)
+ (char ()))
+ ((= index (the fixnum end)))
+ (declare (fixnum index new-index))
+ (setq char (schar string index))
+ (cond ((not (alphanumericp char))
+ (setq newword t))
+ (newword
+ ;;char is first case-modifiable after non-case-modifiable
+ (setq char (char-titlecase char))
+ (setq newword ()))
+ ;;char is case-modifiable, but not first
+ (t (setq char (char-downcase char))))
+ (setf (schar newstring new-index) char))
+ (do ((index end (1+ index))
+ (new-index (- (the fixnum end) offset) (1+ new-index)))
+ ((= index offset-slen))
+ (declare (fixnum index new-index))
+ (setf (schar newstring new-index) (schar string index)))
+ newstring))))
+
(defun string-capitalize-full (string &key (start 0) end (casing :full))
"Capitalize String using the Common Lisp word-break algorithm to find
the words in String. The beginning is capitalized depending on the
@@ -599,7 +641,7 @@
(if unicode-word-break
(string-capitalize-unicode string :start start :end end :casing casing)
(if (eq casing :simple)
- (cl:string-capitalize string :start start :end end)
+ (cl:string-capitalize-simple string :start start :end end)
(string-capitalize-full string :start start :end end :casing casing))))
=====================================
src/i18n/locale/cmucl.pot
=====================================
@@ -3892,7 +3892,7 @@ msgid ""
" upper case alphabetic characters converted to lowercase."
msgstr ""
-#: src/code/string.lisp
+#: src/code/unicode.lisp src/code/string.lisp
msgid ""
"Given a string, returns a copy of the string with the first\n"
" character of each ``word'' converted to upper-case, and remaining\n"
=====================================
tests/string.lisp
=====================================
@@ -58,3 +58,36 @@
when (char/= actual (char-downcase expected))
collect (list (char-downcase expected)
(char-code actual))))))
+
+(define-test string-capitalize
+ (:tag :issues)
+ (let* ((s (string-capitalize
+ ;; #\Latin_Small_Letter_Dz and #\Latin_Small_Letter_Lj
+ ;; #have Unicode titlecase characters that differ from
+ ;; CHAR-UPCASE. This tests that STRING-CAPITALIZE use
+ ;; CHAR-UPCASE to produce the capitalization.
+ (coerce
+ '(#\Latin_Small_Letter_Dz
+ #\a #\b
+ #\space
+ #\Latin_Small_Letter_Lj
+ #\A #\B)
+ 'string)))
+ (expected
+ ;; Manually convert the test string by calling CHAR-UPCASE
+ ;; or CHAR-DOWNCASE (or IDENTITY) to get the desired
+ ;; capitalized string.
+ (map 'list
+ #'(lambda (c f)
+ (funcall f c))
+ s
+ (list #'char-upcase
+ #'char-downcase
+ #'char-downcase
+ #'identity
+ #'char-upcase
+ #'char-downcase
+ #'char-downcase))))
+ (assert-equal
+ (map 'list #'char-name expected)
+ (map 'list #'char-name s))))
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/483dfe3999bd7db0cee36e…
--
This project does not include diff previews in email notifications.
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/483dfe3999bd7db0cee36e…
You're receiving this email because of your account on gitlab.common-lisp.net.
Raymond Toy pushed to branch issue-323-cl-string-casing at cmucl / cmucl
Commits:
483dfe39 by Raymond Toy at 2024-05-31T11:18:35-07:00
Remove old versions of {n}string-{upcase,downcase}
Don't need these anymore.
- - - - -
1 changed file:
- src/code/string.lisp
Changes:
=====================================
src/code/string.lisp
=====================================
@@ -618,49 +618,6 @@
(setf (schar string i) fill-char))
(make-string count)))
-#+nil
-(defun string-upcase (string &key (start 0) end)
- _N"Given a string, returns a new string that is a copy of it with all
- lower case alphabetic characters converted to uppercase."
- (declare (fixnum start))
- (let* ((string (if (stringp string) string (string string)))
- (slen (length string)))
- (declare (fixnum slen))
- (with-one-string string start end offset
- (let ((offset-slen (+ slen offset))
- (newstring (make-string slen)))
- (declare (fixnum offset-slen))
- (do ((index offset (1+ index))
- (new-index 0 (1+ new-index)))
- ((= index start))
- (declare (fixnum index new-index))
- (setf (schar newstring new-index) (schar string index)))
- (do ((index start (1+ index))
- (new-index (- start offset) (1+ new-index)))
- ((= index (the fixnum end)))
- (declare (fixnum index new-index))
- (multiple-value-bind (code wide) (codepoint string index)
- (when wide (incf index))
- ;; Use char-upcase if it's not a surrogate pair so that
- ;; we're always consist.
- (if wide
- (setq code (unicode-upper code))
- (setf code (char-code (char-upcase (code-char code)))))
- ;;@@ WARNING: this may, in theory, need to extend newstring
- ;; but that never actually occurs as of Unicode 5.1.0,
- ;; so I'm just going to ignore it for now...
- (multiple-value-bind (hi lo) (surrogates code)
- (setf (schar newstring new-index) hi)
- (when lo
- (setf (schar newstring (incf new-index)) lo)))))
- ;;@@ WARNING: see above
- (do ((index end (1+ index))
- (new-index (- (the fixnum end) offset) (1+ new-index)))
- ((= index offset-slen))
- (declare (fixnum index new-index))
- (setf (schar newstring new-index) (schar string index)))
- newstring))))
-
(defun string-upcase (string &key (start 0) end)
_N"Given a string, returns a new string that is a copy of it with all
lower case alphabetic characters converted to uppercase."
@@ -690,49 +647,6 @@
(setf (schar newstring new-index) (schar string index)))
newstring))))
-#+nil
-(defun string-downcase (string &key (start 0) end)
- _N"Given a string, returns a new string that is a copy of it with all
- upper case alphabetic characters converted to lowercase."
- (declare (fixnum start))
- (let* ((string (if (stringp string) string (string string)))
- (slen (length string)))
- (declare (fixnum slen))
- (with-one-string string start end offset
- (let ((offset-slen (+ slen offset))
- (newstring (make-string slen)))
- (declare (fixnum offset-slen))
- (do ((index offset (1+ index))
- (new-index 0 (1+ new-index)))
- ((= index start))
- (declare (fixnum index new-index))
- (setf (schar newstring new-index) (schar string index)))
- (do ((index start (1+ index))
- (new-index (- start offset) (1+ new-index)))
- ((= index (the fixnum end)))
- (declare (fixnum index new-index))
- (multiple-value-bind (code wide) (codepoint string index)
- (when wide (incf index))
- ;; Use char-downcase if it's not a surrogate pair so that
- ;; we're always consist.
- (if wide
- (setq code (unicode-lower code))
- (setq code (char-code (char-downcase (code-char code)))))
- ;;@@ WARNING: this may, in theory, need to extend newstring
- ;; but that never actually occurs as of Unicode 5.1.0,
- ;; so I'm just going to ignore it for now...
- (multiple-value-bind (hi lo) (surrogates code)
- (setf (schar newstring new-index) hi)
- (when lo
- (setf (schar newstring (incf new-index)) lo)))))
- ;;@@ WARNING: see above
- (do ((index end (1+ index))
- (new-index (- (the fixnum end) offset) (1+ new-index)))
- ((= index offset-slen))
- (declare (fixnum index new-index))
- (setf (schar newstring new-index) (schar string index)))
- newstring))))
-
(defun string-downcase (string &key (start 0) end)
_N"Given a string, returns a new string that is a copy of it with all
upper case alphabetic characters converted to lowercase."
@@ -804,31 +718,6 @@
(setf (schar newstring new-index) (schar string index)))
newstring))))
-#+nil
-(defun nstring-upcase (string &key (start 0) end)
- "Given a string, returns that string with all lower case alphabetic
- characters converted to uppercase."
- (declare (fixnum start))
- (let ((save-header string))
- (with-one-string string start end offset
- (do ((index start (1+ index)))
- ((= index (the fixnum end)))
- (declare (fixnum index))
- (multiple-value-bind (code wide) (codepoint string index)
- (if wide
- (setq code (unicode-upper code))
- (setf code (char-code (char-upcase (code-char code)))))
- ;;@@ WARNING: this may, in theory, need to extend string
- ;; (which, obviously, we can't do here. Unless
- ;; STRING is adjustable, maybe)
- ;; but that never actually occurs as of Unicode 5.1.0,
- ;; so I'm just going to ignore it for now...
- (multiple-value-bind (hi lo) (surrogates code)
- (setf (schar string index) hi)
- (when lo
- (setf (schar string (incf index)) lo))))))
- save-header))
-
(defun nstring-upcase (string &key (start 0) end)
"Given a string, returns that string with all lower case alphabetic
characters converted to uppercase."
@@ -842,31 +731,6 @@
(char-upcase (schar string index))))
save-header)))
-#+nil
-(defun nstring-downcase (string &key (start 0) end)
- "Given a string, returns that string with all upper case alphabetic
- characters converted to lowercase."
- (declare (fixnum start))
- (let ((save-header string))
- (with-one-string string start end offset
- (do ((index start (1+ index)))
- ((= index (the fixnum end)))
- (declare (fixnum index))
- (multiple-value-bind (code wide) (codepoint string index)
- (if wide
- (setq code (unicode-lower code))
- (setf code (char-code (char-downcase (code-char code)))))
- ;;@@ WARNING: this may, in theory, need to extend string
- ;; (which, obviously, we can't do here. Unless
- ;; STRING is adjustable, maybe)
- ;; but that never actually occurs as of Unicode 5.1.0,
- ;; so I'm just going to ignore it for now...
- (multiple-value-bind (hi lo) (surrogates code)
- (setf (schar string index) hi)
- (when lo
- (setf (schar string (incf index)) lo))))))
- save-header))
-
(defun nstring-downcase (string &key (start 0) end)
"Given a string, returns that string with all upper case alphabetic
characters converted to lowercase."
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/483dfe3999bd7db0cee36ef…
--
This project does not include diff previews in email notifications.
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/483dfe3999bd7db0cee36ef…
You're receiving this email because of your account on gitlab.common-lisp.net.
Raymond Toy pushed to branch master at cmucl / cmucl
Commits:
5a037b19 by Raymond Toy at 2024-05-31T08:56:12-07:00
String-capitalize should use char-upcase, not lisp:char-titlecase
We were using lisp:;char-titlecase to capitalize the string, but we
really should use char-upcase. However, to preserve this
functionality, we moved the functions to the `unicode` package.
Added a test for this.
- - - - -
4 changed files:
- src/code/string.lisp
- src/code/unicode.lisp
- src/i18n/locale/cmucl.pot
- tests/string.lisp
Changes:
=====================================
src/code/string.lisp
=====================================
@@ -702,6 +702,7 @@
(setf (schar newstring new-index) (schar string index)))
newstring))))
+#+nil
(defun string-capitalize (string &key (start 0) end)
_N"Given a string, returns a copy of the string with the first
character of each ``word'' converted to upper-case, and remaining
@@ -744,8 +745,50 @@
(setf (schar newstring new-index) (schar string index)))
newstring))))
+(defun string-capitalize (string &key (start 0) end)
+ _N"Given a string, returns a copy of the string with the first
+ character of each ``word'' converted to upper-case, and remaining
+ chars in the word converted to lower case. A ``word'' is defined
+ to be a string of case-modifiable characters delimited by
+ non-case-modifiable chars."
+ (declare (fixnum start))
+ (let* ((string (if (stringp string) string (string string)))
+ (slen (length string)))
+ (declare (fixnum slen))
+ (with-one-string string start end offset
+ (let ((offset-slen (+ slen offset))
+ (newstring (make-string slen)))
+ (declare (fixnum offset-slen))
+ (do ((index offset (1+ index))
+ (new-index 0 (1+ new-index)))
+ ((= index start))
+ (declare (fixnum index new-index))
+ (setf (schar newstring new-index) (schar string index)))
+ (do ((index start (1+ index))
+ (new-index (- start offset) (1+ new-index))
+ (newword t)
+ (char ()))
+ ((= index (the fixnum end)))
+ (declare (fixnum index new-index))
+ (setq char (schar string index))
+ (cond ((not (alphanumericp char))
+ (setq newword t))
+ (newword
+ ;;char is first case-modifiable after non-case-modifiable
+ (setq char (char-upcase char))
+ (setq newword ()))
+ ;;char is case-modifiable, but not first
+ (t (setq char (char-downcase char))))
+ (setf (schar newstring new-index) char))
+ (do ((index end (1+ index))
+ (new-index (- (the fixnum end) offset) (1+ new-index)))
+ ((= index offset-slen))
+ (declare (fixnum index new-index))
+ (setf (schar newstring new-index) (schar string index)))
+ newstring))))
+
(defun nstring-upcase (string &key (start 0) end)
- "Given a string, returns that string with all lower case alphabetic
+ _N"Given a string, returns that string with all lower case alphabetic
characters converted to uppercase."
(declare (fixnum start))
(let ((save-header string))
@@ -769,7 +812,7 @@
save-header))
(defun nstring-downcase (string &key (start 0) end)
- "Given a string, returns that string with all upper case alphabetic
+ _N"Given a string, returns that string with all upper case alphabetic
characters converted to lowercase."
(declare (fixnum start))
(let ((save-header string))
@@ -792,6 +835,7 @@
(setf (schar string (incf index)) lo))))))
save-header))
+#+nil
(defun nstring-capitalize (string &key (start 0) end)
"Given a string, returns that string with the first
character of each ``word'' converted to upper-case, and remaining
@@ -817,6 +861,31 @@
(setf (schar string index) (char-downcase char))))))
save-header))
+(defun nstring-capitalize (string &key (start 0) end)
+ _N"Given a string, returns that string with the first
+ character of each ``word'' converted to upper-case, and remaining
+ chars in the word converted to lower case. A ``word'' is defined
+ to be a string of case-modifiable characters delimited by
+ non-case-modifiable chars."
+ (declare (fixnum start))
+ (let ((save-header string))
+ (with-one-string string start end offset
+ (do ((index start (1+ index))
+ (newword t)
+ (char ()))
+ ((= index (the fixnum end)))
+ (declare (fixnum index))
+ (setq char (schar string index))
+ (cond ((not (alphanumericp char))
+ (setq newword t))
+ (newword
+ ;;char is first case-modifiable after non-case-modifiable
+ (setf (schar string index) (char-upcase char))
+ (setq newword ()))
+ (t
+ (setf (schar string index) (char-downcase char))))))
+ save-header))
+
#+unicode
(progn
=====================================
src/code/unicode.lisp
=====================================
@@ -441,6 +441,48 @@
:end next)))
(write-string string result :start end :end offset-slen))))))
+(defun string-capitalize-simple (string &key (start 0) end)
+ _N"Given a string, returns a copy of the string with the first
+ character of each ``word'' converted to upper-case, and remaining
+ chars in the word converted to lower case. A ``word'' is defined
+ to be a string of case-modifiable characters delimited by
+ non-case-modifiable chars."
+ (declare (fixnum start))
+ (let* ((string (if (stringp string) string (string string)))
+ (slen (length string)))
+ (declare (fixnum slen))
+ (with-one-string string start end offset
+ (let ((offset-slen (+ slen offset))
+ (newstring (make-string slen)))
+ (declare (fixnum offset-slen))
+ (do ((index offset (1+ index))
+ (new-index 0 (1+ new-index)))
+ ((= index start))
+ (declare (fixnum index new-index))
+ (setf (schar newstring new-index) (schar string index)))
+ (do ((index start (1+ index))
+ (new-index (- start offset) (1+ new-index))
+ (newword t)
+ (char ()))
+ ((= index (the fixnum end)))
+ (declare (fixnum index new-index))
+ (setq char (schar string index))
+ (cond ((not (alphanumericp char))
+ (setq newword t))
+ (newword
+ ;;char is first case-modifiable after non-case-modifiable
+ (setq char (char-titlecase char))
+ (setq newword ()))
+ ;;char is case-modifiable, but not first
+ (t (setq char (char-downcase char))))
+ (setf (schar newstring new-index) char))
+ (do ((index end (1+ index))
+ (new-index (- (the fixnum end) offset) (1+ new-index)))
+ ((= index offset-slen))
+ (declare (fixnum index new-index))
+ (setf (schar newstring new-index) (schar string index)))
+ newstring))))
+
(defun string-capitalize-full (string &key (start 0) end (casing :full))
"Capitalize String using the Common Lisp word-break algorithm to find
the words in String. The beginning is capitalized depending on the
@@ -515,7 +557,7 @@
(if unicode-word-break
(string-capitalize-unicode string :start start :end end :casing casing)
(if (eq casing :simple)
- (cl:string-capitalize string :start start :end end)
+ (string-capitalize-simple string :start start :end end)
(string-capitalize-full string :start start :end end :casing casing))))
=====================================
src/i18n/locale/cmucl.pot
=====================================
@@ -3892,7 +3892,7 @@ msgid ""
" upper case alphabetic characters converted to lowercase."
msgstr ""
-#: src/code/string.lisp
+#: src/code/unicode.lisp src/code/string.lisp
msgid ""
"Given a string, returns a copy of the string with the first\n"
" character of each ``word'' converted to upper-case, and remaining\n"
=====================================
tests/string.lisp
=====================================
@@ -59,3 +59,35 @@
when (char/= actual (char-downcase expected))
collect (list (char-downcase expected)
(char-code actual))))))
+
+(define-test string-capitalize
+ (:tag :issues)
+ (let* ((s
+ ;; This string contains a couple of characters where
+ ;; Unicode has a titlecase version of the character. We
+ ;; want to make sure we use char-upcase to capitalize the
+ ;; string instead of lisp::char-titlecse
+ (coerce
+ '(#\Latin_Small_Letter_Dz
+ #\a #\b
+ #\space
+ #\Latin_Small_Letter_Lj
+ #\A #\B)
+ 'string))
+ (expected
+ ;; Manually convert S to a capitalized string using
+ ;; char-upcase/downcase.
+ (map 'string
+ #'(lambda (ch f)
+ (funcall f ch))
+ s
+ (list #'char-upcase
+ #'char-downcase
+ #'char-downcase
+ #'char-downcase
+ #'char-upcase
+ #'char-downcase
+ #'char-downcase))))
+ (assert-equal
+ (map 'list #'char-name expected)
+ (map 'list #'char-name (string-capitalize s)))))
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/5a037b19ff7be80cca1fdd1…
--
This project does not include diff previews in email notifications.
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/5a037b19ff7be80cca1fdd1…
You're receiving this email because of your account on gitlab.common-lisp.net.