Raymond Toy pushed to branch master at cmucl / cmucl
Commits:
-
535f7e5d
by Raymond Toy at 2024-06-03T23:07:07+00:00
-
57d5a21c
by Raymond Toy at 2024-06-03T23:07:12+00:00
5 changed files:
- src/code/string.lisp
- src/code/unicode.lisp
- src/i18n/locale/cmucl.pot
- tests/string.lisp
- tests/unicode.lisp
Changes:
... | ... | @@ -638,27 +638,14 @@ |
638 | 638 | (new-index (- start offset) (1+ new-index)))
|
639 | 639 | ((= index (the fixnum end)))
|
640 | 640 | (declare (fixnum index new-index))
|
641 | - (multiple-value-bind (code wide) (codepoint string index)
|
|
642 | - (when wide (incf index))
|
|
643 | - ;; Use char-upcase if it's not a surrogate pair so that
|
|
644 | - ;; we're always consist.
|
|
645 | - (if wide
|
|
646 | - (setq code (unicode-upper code))
|
|
647 | - (setf code (char-code (char-upcase (code-char code)))))
|
|
648 | - ;;@@ WARNING: this may, in theory, need to extend newstring
|
|
649 | - ;; but that never actually occurs as of Unicode 5.1.0,
|
|
650 | - ;; so I'm just going to ignore it for now...
|
|
651 | - (multiple-value-bind (hi lo) (surrogates code)
|
|
652 | - (setf (schar newstring new-index) hi)
|
|
653 | - (when lo
|
|
654 | - (setf (schar newstring (incf new-index)) lo)))))
|
|
655 | - ;;@@ WARNING: see above
|
|
656 | - (do ((index end (1+ index))
|
|
641 | + (setf (schar newstring new-index)
|
|
642 | + (char-upcase (schar string index))))
|
|
643 | + (do ((index end (1+ index))
|
|
657 | 644 | (new-index (- (the fixnum end) offset) (1+ new-index)))
|
658 | 645 | ((= index offset-slen))
|
659 | 646 | (declare (fixnum index new-index))
|
660 | 647 | (setf (schar newstring new-index) (schar string index)))
|
661 | - newstring))))
|
|
648 | + newstring))))
|
|
662 | 649 | |
663 | 650 | (defun string-downcase (string &key (start 0) end)
|
664 | 651 | _N"Given a string, returns a new string that is a copy of it with all
|
... | ... | @@ -680,64 +667,8 @@ |
680 | 667 | (new-index (- start offset) (1+ new-index)))
|
681 | 668 | ((= index (the fixnum end)))
|
682 | 669 | (declare (fixnum index new-index))
|
683 | - (multiple-value-bind (code wide) (codepoint string index)
|
|
684 | - (when wide (incf index))
|
|
685 | - ;; Use char-downcase if it's not a surrogate pair so that
|
|
686 | - ;; we're always consist.
|
|
687 | - (if wide
|
|
688 | - (setq code (unicode-lower code))
|
|
689 | - (setq code (char-code (char-downcase (code-char code)))))
|
|
690 | - ;;@@ WARNING: this may, in theory, need to extend newstring
|
|
691 | - ;; but that never actually occurs as of Unicode 5.1.0,
|
|
692 | - ;; so I'm just going to ignore it for now...
|
|
693 | - (multiple-value-bind (hi lo) (surrogates code)
|
|
694 | - (setf (schar newstring new-index) hi)
|
|
695 | - (when lo
|
|
696 | - (setf (schar newstring (incf new-index)) lo)))))
|
|
697 | - ;;@@ WARNING: see above
|
|
698 | - (do ((index end (1+ index))
|
|
699 | - (new-index (- (the fixnum end) offset) (1+ new-index)))
|
|
700 | - ((= index offset-slen))
|
|
701 | - (declare (fixnum index new-index))
|
|
702 | - (setf (schar newstring new-index) (schar string index)))
|
|
703 | - newstring))))
|
|
704 | - |
|
705 | -#+nil
|
|
706 | -(defun string-capitalize (string &key (start 0) end)
|
|
707 | - _N"Given a string, returns a copy of the string with the first
|
|
708 | - character of each ``word'' converted to upper-case, and remaining
|
|
709 | - chars in the word converted to lower case. A ``word'' is defined
|
|
710 | - to be a string of case-modifiable characters delimited by
|
|
711 | - non-case-modifiable chars."
|
|
712 | - (declare (fixnum start))
|
|
713 | - (let* ((string (if (stringp string) string (string string)))
|
|
714 | - (slen (length string)))
|
|
715 | - (declare (fixnum slen))
|
|
716 | - (with-one-string string start end offset
|
|
717 | - (let ((offset-slen (+ slen offset))
|
|
718 | - (newstring (make-string slen)))
|
|
719 | - (declare (fixnum offset-slen))
|
|
720 | - (do ((index offset (1+ index))
|
|
721 | - (new-index 0 (1+ new-index)))
|
|
722 | - ((= index start))
|
|
723 | - (declare (fixnum index new-index))
|
|
724 | - (setf (schar newstring new-index) (schar string index)))
|
|
725 | - (do ((index start (1+ index))
|
|
726 | - (new-index (- start offset) (1+ new-index))
|
|
727 | - (newword t)
|
|
728 | - (char ()))
|
|
729 | - ((= index (the fixnum end)))
|
|
730 | - (declare (fixnum index new-index))
|
|
731 | - (setq char (schar string index))
|
|
732 | - (cond ((not (alphanumericp char))
|
|
733 | - (setq newword t))
|
|
734 | - (newword
|
|
735 | - ;;char is first case-modifiable after non-case-modifiable
|
|
736 | - (setq char (char-titlecase char))
|
|
737 | - (setq newword ()))
|
|
738 | - ;;char is case-modifiable, but not first
|
|
739 | - (t (setq char (char-downcase char))))
|
|
740 | - (setf (schar newstring new-index) char))
|
|
670 | + (setf (schar newstring new-index)
|
|
671 | + (char-downcase (schar string index))))
|
|
741 | 672 | (do ((index end (1+ index))
|
742 | 673 | (new-index (- (the fixnum end) offset) (1+ new-index)))
|
743 | 674 | ((= index offset-slen))
|
... | ... | @@ -796,20 +727,9 @@ |
796 | 727 | (do ((index start (1+ index)))
|
797 | 728 | ((= index (the fixnum end)))
|
798 | 729 | (declare (fixnum index))
|
799 | - (multiple-value-bind (code wide) (codepoint string index)
|
|
800 | - (if wide
|
|
801 | - (setq code (unicode-upper code))
|
|
802 | - (setf code (char-code (char-upcase (code-char code)))))
|
|
803 | - ;;@@ WARNING: this may, in theory, need to extend string
|
|
804 | - ;; (which, obviously, we can't do here. Unless
|
|
805 | - ;; STRING is adjustable, maybe)
|
|
806 | - ;; but that never actually occurs as of Unicode 5.1.0,
|
|
807 | - ;; so I'm just going to ignore it for now...
|
|
808 | - (multiple-value-bind (hi lo) (surrogates code)
|
|
809 | - (setf (schar string index) hi)
|
|
810 | - (when lo
|
|
811 | - (setf (schar string (incf index)) lo))))))
|
|
812 | - save-header))
|
|
730 | + (setf (schar string index)
|
|
731 | + (char-upcase (schar string index))))
|
|
732 | + save-header)))
|
|
813 | 733 | |
814 | 734 | (defun nstring-downcase (string &key (start 0) end)
|
815 | 735 | _N"Given a string, returns that string with all upper case alphabetic
|
... | ... | @@ -820,45 +740,8 @@ |
820 | 740 | (do ((index start (1+ index)))
|
821 | 741 | ((= index (the fixnum end)))
|
822 | 742 | (declare (fixnum index))
|
823 | - (multiple-value-bind (code wide) (codepoint string index)
|
|
824 | - (if wide
|
|
825 | - (setq code (unicode-lower code))
|
|
826 | - (setf code (char-code (char-downcase (code-char code)))))
|
|
827 | - ;;@@ WARNING: this may, in theory, need to extend string
|
|
828 | - ;; (which, obviously, we can't do here. Unless
|
|
829 | - ;; STRING is adjustable, maybe)
|
|
830 | - ;; but that never actually occurs as of Unicode 5.1.0,
|
|
831 | - ;; so I'm just going to ignore it for now...
|
|
832 | - (multiple-value-bind (hi lo) (surrogates code)
|
|
833 | - (setf (schar string index) hi)
|
|
834 | - (when lo
|
|
835 | - (setf (schar string (incf index)) lo))))))
|
|
836 | - save-header))
|
|
837 | - |
|
838 | -#+nil
|
|
839 | -(defun nstring-capitalize (string &key (start 0) end)
|
|
840 | - "Given a string, returns that string with the first
|
|
841 | - character of each ``word'' converted to upper-case, and remaining
|
|
842 | - chars in the word converted to lower case. A ``word'' is defined
|
|
843 | - to be a string of case-modifiable characters delimited by
|
|
844 | - non-case-modifiable chars."
|
|
845 | - (declare (fixnum start))
|
|
846 | - (let ((save-header string))
|
|
847 | - (with-one-string string start end offset
|
|
848 | - (do ((index start (1+ index))
|
|
849 | - (newword t)
|
|
850 | - (char ()))
|
|
851 | - ((= index (the fixnum end)))
|
|
852 | - (declare (fixnum index))
|
|
853 | - (setq char (schar string index))
|
|
854 | - (cond ((not (alphanumericp char))
|
|
855 | - (setq newword t))
|
|
856 | - (newword
|
|
857 | - ;;char is first case-modifiable after non-case-modifiable
|
|
858 | - (setf (schar string index) (char-titlecase char))
|
|
859 | - (setq newword ()))
|
|
860 | - (t
|
|
861 | - (setf (schar string index) (char-downcase char))))))
|
|
743 | + (setf (schar string index)
|
|
744 | + (char-downcase (schar string index)))))
|
|
862 | 745 | save-header))
|
863 | 746 | |
864 | 747 | (defun nstring-capitalize (string &key (start 0) end)
|
... | ... | @@ -17,14 +17,57 @@ |
17 | 17 | (in-package "UNICODE")
|
18 | 18 | (intl:textdomain "cmucl")
|
19 | 19 | |
20 | +(defun string-upcase-simple (string &key (start 0) end)
|
|
21 | + _N"Given a string, returns a new string that is a copy of it with all
|
|
22 | + lower case alphabetic characters converted to uppercase."
|
|
23 | + (declare (fixnum start))
|
|
24 | + (let* ((string (string string))
|
|
25 | + (slen (length string)))
|
|
26 | + (declare (fixnum slen))
|
|
27 | + (lisp::with-one-string string start end offset
|
|
28 | + (let ((offset-slen (+ slen offset))
|
|
29 | + (newstring (make-string slen)))
|
|
30 | + (declare (fixnum offset-slen))
|
|
31 | + (do ((index offset (1+ index))
|
|
32 | + (new-index 0 (1+ new-index)))
|
|
33 | + ((= index start))
|
|
34 | + (declare (fixnum index new-index))
|
|
35 | + (setf (schar newstring new-index) (schar string index)))
|
|
36 | + (do ((index start (1+ index))
|
|
37 | + (new-index (- start offset) (1+ new-index)))
|
|
38 | + ((= index (the fixnum end)))
|
|
39 | + (declare (fixnum index new-index))
|
|
40 | + (multiple-value-bind (code wide) (codepoint string index)
|
|
41 | + (when wide (incf index))
|
|
42 | + ;; Use char-upcase if it's not a surrogate pair so that
|
|
43 | + ;; we're always consist.
|
|
44 | + (if wide
|
|
45 | + (setq code (unicode-upper code))
|
|
46 | + (setf code (char-code (char-upcase (code-char code)))))
|
|
47 | + ;;@@ WARNING: this may, in theory, need to extend newstring
|
|
48 | + ;; but that never actually occurs as of Unicode 5.1.0,
|
|
49 | + ;; so I'm just going to ignore it for now...
|
|
50 | + (multiple-value-bind (hi lo) (surrogates code)
|
|
51 | + (setf (schar newstring new-index) hi)
|
|
52 | + (when lo
|
|
53 | + (setf (schar newstring (incf new-index)) lo)))))
|
|
54 | + ;;@@ WARNING: see above
|
|
55 | + (do ((index end (1+ index))
|
|
56 | + (new-index (- (the fixnum end) offset) (1+ new-index)))
|
|
57 | + ((= index offset-slen))
|
|
58 | + (declare (fixnum index new-index))
|
|
59 | + (setf (schar newstring new-index) (schar string index)))
|
|
60 | + newstring))))
|
|
61 | + |
|
62 | + |
|
20 | 63 | ;; An example where this differs from cl:string-upcase differ:
|
21 | 64 | ;; #\Latin_Small_Letter_Sharp_S
|
22 | 65 | (defun string-upcase-full (string &key (start 0) end)
|
23 | 66 | _N"Given a string, returns a new string that is a copy of it with
|
24 | 67 | all lower case alphabetic characters converted to uppercase using
|
25 | 68 | full case conversion."
|
26 | - (declare (fixnum start)) (let* ((string (if
|
|
27 | - (stringp string) string (string string)))
|
|
69 | + (declare (fixnum start))
|
|
70 | + (let* ((string (string string))
|
|
28 | 71 | (slen (length string)))
|
29 | 72 | (declare (fixnum slen))
|
30 | 73 | (with-output-to-string (s)
|
... | ... | @@ -57,11 +100,55 @@ |
57 | 100 | all lower case alphabetic characters converted to uppercase. Casing
|
58 | 101 | is :simple or :full for simple or full case conversion,
|
59 | 102 | respectively."
|
60 | - (declare (fixnum start))
|
|
61 | - (if (eq casing :simple)
|
|
62 | - (cl:string-upcase string :start start :end end)
|
|
63 | - (string-upcase-full string :start start :end end)))
|
|
103 | + (declare (fixnum start)
|
|
104 | + (type (member :simple :full) casing))
|
|
105 | + (ecase casing
|
|
106 | + (:simple
|
|
107 | + (string-upcase-simple string :start start :end end))
|
|
108 | + (:full
|
|
109 | + (string-upcase-full string :start start :end end))))
|
|
64 | 110 | |
111 | +(defun string-downcase-simple (string &key (start 0) end)
|
|
112 | + _N"Given a string, returns a new string that is a copy of it with all
|
|
113 | + upper case alphabetic characters converted to lowercase."
|
|
114 | + (declare (fixnum start))
|
|
115 | + (let* ((string (if (stringp string) string (string string)))
|
|
116 | + (slen (length string)))
|
|
117 | + (declare (fixnum slen))
|
|
118 | + (lisp::with-one-string string start end offset
|
|
119 | + (let ((offset-slen (+ slen offset))
|
|
120 | + (newstring (make-string slen)))
|
|
121 | + (declare (fixnum offset-slen))
|
|
122 | + (do ((index offset (1+ index))
|
|
123 | + (new-index 0 (1+ new-index)))
|
|
124 | + ((= index start))
|
|
125 | + (declare (fixnum index new-index))
|
|
126 | + (setf (schar newstring new-index) (schar string index)))
|
|
127 | + (do ((index start (1+ index))
|
|
128 | + (new-index (- start offset) (1+ new-index)))
|
|
129 | + ((= index (the fixnum end)))
|
|
130 | + (declare (fixnum index new-index))
|
|
131 | + (multiple-value-bind (code wide) (codepoint string index)
|
|
132 | + (when wide (incf index))
|
|
133 | + ;; Use char-downcase if it's not a surrogate pair so that
|
|
134 | + ;; we're always consist.
|
|
135 | + (if wide
|
|
136 | + (setq code (unicode-lower code))
|
|
137 | + (setq code (char-code (char-downcase (code-char code)))))
|
|
138 | + ;;@@ WARNING: this may, in theory, need to extend newstring
|
|
139 | + ;; but that never actually occurs as of Unicode 5.1.0,
|
|
140 | + ;; so I'm just going to ignore it for now...
|
|
141 | + (multiple-value-bind (hi lo) (surrogates code)
|
|
142 | + (setf (schar newstring new-index) hi)
|
|
143 | + (when lo
|
|
144 | + (setf (schar newstring (incf new-index)) lo)))))
|
|
145 | + ;;@@ WARNING: see above
|
|
146 | + (do ((index end (1+ index))
|
|
147 | + (new-index (- (the fixnum end) offset) (1+ new-index)))
|
|
148 | + ((= index offset-slen))
|
|
149 | + (declare (fixnum index new-index))
|
|
150 | + (setf (schar newstring new-index) (schar string index)))
|
|
151 | + newstring))))
|
|
65 | 152 | |
66 | 153 | ;; An example this differs from cl:string-downcase:
|
67 | 154 | ;; #\Latin_Capital_Letter_I_With_Dot_Above.
|
... | ... | @@ -104,10 +191,13 @@ |
104 | 191 | uppercase alphabetic characters converted to lowercase. Casing is
|
105 | 192 | :simple or :full for simple or full case conversion, respectively."
|
106 | 193 | |
107 | - (declare (fixnum start))
|
|
108 | - (if (eq casing :simple)
|
|
109 | - (cl:string-downcase string :start start :end end)
|
|
110 | - (string-downcase-full string :start start :end end)))
|
|
194 | + (declare (fixnum start)
|
|
195 | + (type (member :simple :full) casing))
|
|
196 | + (ecase casing
|
|
197 | + (:simple
|
|
198 | + (string-downcase-simple string :start start :end end))
|
|
199 | + (:full
|
|
200 | + (string-downcase-full string :start start :end end))))
|
|
111 | 201 | |
112 | 202 | |
113 | 203 | ;;;
|
... | ... | @@ -553,12 +643,14 @@ |
553 | 643 | delimited by non-case-modifiable chars. "
|
554 | 644 | |
555 | 645 | (declare (fixnum start)
|
556 | - (type (member :simple :full :title) casing))
|
|
646 | + (type (member :simple-title :simple :full :title) casing))
|
|
557 | 647 | (if unicode-word-break
|
558 | 648 | (string-capitalize-unicode string :start start :end end :casing casing)
|
559 | - (if (eq casing :simple)
|
|
560 | - (string-capitalize-simple string :start start :end end)
|
|
561 | - (string-capitalize-full string :start start :end end :casing casing))))
|
|
649 | + (ecase casing
|
|
650 | + (:simple-title
|
|
651 | + (string-capitalize-simple string :start start :end end))
|
|
652 | + ((:simple :full :title)
|
|
653 | + (string-capitalize-full string :start start :end end :casing casing)))))
|
|
562 | 654 | |
563 | 655 | |
564 | 656 | (defun decompose-hangul-syllable (cp stream)
|
... | ... | @@ -3880,13 +3880,13 @@ msgid "" |
3880 | 3880 | " a new string Count long filled with the fill character."
|
3881 | 3881 | msgstr ""
|
3882 | 3882 | |
3883 | -#: src/code/string.lisp
|
|
3883 | +#: src/code/unicode.lisp src/code/string.lisp
|
|
3884 | 3884 | msgid ""
|
3885 | 3885 | "Given a string, returns a new string that is a copy of it with all\n"
|
3886 | 3886 | " lower case alphabetic characters converted to uppercase."
|
3887 | 3887 | msgstr ""
|
3888 | 3888 | |
3889 | -#: src/code/string.lisp
|
|
3889 | +#: src/code/unicode.lisp src/code/string.lisp
|
|
3890 | 3890 | msgid ""
|
3891 | 3891 | "Given a string, returns a new string that is a copy of it with all\n"
|
3892 | 3892 | " upper case alphabetic characters converted to lowercase."
|
... | ... | @@ -8,11 +8,10 @@ |
8 | 8 | (defun make-test-string ()
|
9 | 9 | ;; Create a string consisting of all the code units EXCEPT for the
|
10 | 10 | ;; surrogates because string casing handles that differently.
|
11 | - (coerce
|
|
12 | - (loop for code from 0 to #xffff
|
|
13 | - unless (lisp::surrogatep code)
|
|
14 | - collect (code-char code))
|
|
15 | - 'string))
|
|
11 | + (let ((s (make-string char-code-limit)))
|
|
12 | + (dotimes (k char-code-limit)
|
|
13 | + (setf (aref s k) (code-char k)))
|
|
14 | + s))
|
|
16 | 15 | |
17 | 16 | (define-test string-upcase
|
18 | 17 | (:tag :issues)
|
... | ... | @@ -62,32 +61,33 @@ |
62 | 61 | |
63 | 62 | (define-test string-capitalize
|
64 | 63 | (:tag :issues)
|
65 | - (let* ((s
|
|
66 | - ;; This string contains a couple of characters where
|
|
67 | - ;; Unicode has a titlecase version of the character. We
|
|
68 | - ;; want to make sure we use char-upcase to capitalize the
|
|
69 | - ;; string instead of lisp::char-titlecse
|
|
70 | - (coerce
|
|
71 | - '(#\Latin_Small_Letter_Dz
|
|
72 | - #\a #\b
|
|
73 | - #\space
|
|
74 | - #\Latin_Small_Letter_Lj
|
|
75 | - #\A #\B)
|
|
76 | - 'string))
|
|
64 | + (let* ((s (string-capitalize
|
|
65 | + ;; #\Latin_Small_Letter_Dz and #\Latin_Small_Letter_Lj
|
|
66 | + ;; #have Unicode titlecase characters that differ from
|
|
67 | + ;; CHAR-UPCASE. This tests that STRING-CAPITALIZE use
|
|
68 | + ;; CHAR-UPCASE to produce the capitalization.
|
|
69 | + (coerce
|
|
70 | + '(#\Latin_Small_Letter_Dz
|
|
71 | + #\a #\b
|
|
72 | + #\space
|
|
73 | + #\Latin_Small_Letter_Lj
|
|
74 | + #\A #\B)
|
|
75 | + 'string)))
|
|
77 | 76 | (expected
|
78 | - ;; Manually convert S to a capitalized string using
|
|
79 | - ;; char-upcase/downcase.
|
|
80 | - (map 'string
|
|
81 | - #'(lambda (ch f)
|
|
82 | - (funcall f ch))
|
|
77 | + ;; Manually convert the test string by calling CHAR-UPCASE
|
|
78 | + ;; or CHAR-DOWNCASE (or IDENTITY) to get the desired
|
|
79 | + ;; capitalized string.
|
|
80 | + (map 'list
|
|
81 | + #'(lambda (c f)
|
|
82 | + (funcall f c))
|
|
83 | 83 | s
|
84 | 84 | (list #'char-upcase
|
85 | 85 | #'char-downcase
|
86 | 86 | #'char-downcase
|
87 | - #'char-downcase
|
|
87 | + #'identity
|
|
88 | 88 | #'char-upcase
|
89 | 89 | #'char-downcase
|
90 | 90 | #'char-downcase))))
|
91 | 91 | (assert-equal
|
92 | 92 | (map 'list #'char-name expected)
|
93 | - (map 'list #'char-name (string-capitalize s))))) |
|
93 | + (map 'list #'char-name s)))) |
... | ... | @@ -192,3 +192,19 @@ |
192 | 192 | computed-breaks)))
|
193 | 193 | (assert-equalp b
|
194 | 194 | (do-test s)))))))))
|
195 | + |
|
196 | +(define-test unicode.case-extend
|
|
197 | + "Test that Unicode never produces an upper or lower case character
|
|
198 | + outside the BMP for a character in the BMP"
|
|
199 | + (:tag :unicode)
|
|
200 | + ;; For each character code (that isn't a surrogate), find the
|
|
201 | + ;; corresponding Unicode upper and lowe case character. Verify that
|
|
202 | + ;; this character is in the BMP.
|
|
203 | + (loop for code from 0 below char-code-limit
|
|
204 | + unless (lisp::surrogatep code)
|
|
205 | + do
|
|
206 | + (assert-true (< (lisp::unicode-upper code) char-code-limit)
|
|
207 | + code (lisp::unicode-upper code))
|
|
208 | + (assert-true (< (lisp::unicode-lower code) char-code-limit)
|
|
209 | + code (lisp::unicode-upper code))))
|
|
210 | + |