Raymond Toy pushed to branch issue-316-support-roundtrip-char-casing at cmucl / cmucl
Commits:
-
832e1d7e
by Raymond Toy at 2024-05-17T07:54:35-07:00
-
48806331
by Raymond Toy at 2024-05-17T08:42:41-07:00
-
7431f3a8
by Raymond Toy at 2024-05-17T08:43:27-07:00
4 changed files:
Changes:
... | ... | @@ -60,10 +60,12 @@ |
60 | 60 | (deftype codepoint ()
|
61 | 61 | `(integer 0 (,codepoint-limit)))
|
62 | 62 | |
63 | -;; This MUST be greater than or equal to 127!
|
|
63 | +;; This MUST be greater than or equal to 127! Be very careful about
|
|
64 | +;; changing this. It affects all kinds of casing issues.
|
|
64 | 65 | (defconstant +unicode-lower-limit+
|
65 | 66 | 191
|
66 | - "A character code strictly larger than this is handled using Unicode rules.")
|
|
67 | + "A character code strictly larger than this is handled using Unicode
|
|
68 | + rules.")
|
|
67 | 69 | |
68 | 70 | |
69 | 71 | (macrolet ((frob (char-names-list)
|
... | ... | @@ -640,12 +640,8 @@ |
640 | 640 | (declare (fixnum index new-index))
|
641 | 641 | (multiple-value-bind (code wide) (codepoint string index)
|
642 | 642 | (when wide (incf index))
|
643 | - ;; Handle ASCII specially because this is called early in
|
|
644 | - ;; initialization, before unidata is available.
|
|
645 | - #+nil
|
|
646 | - (cond ((< 96 code 123) (decf code 32))
|
|
647 | - #+unicode
|
|
648 | - ((> code 127) (setq code (unicode-upper code))))
|
|
643 | + ;; Use char-upcase if it's not a surrogate pair so that
|
|
644 | + ;; we're always consist.
|
|
649 | 645 | (if wide
|
650 | 646 | (setq code (unicode-upper code))
|
651 | 647 | (setf code (char-code (char-upcase (code-char code)))))
|
... | ... | @@ -686,11 +682,8 @@ |
686 | 682 | (declare (fixnum index new-index))
|
687 | 683 | (multiple-value-bind (code wide) (codepoint string index)
|
688 | 684 | (when wide (incf index))
|
689 | - ;; Handle ASCII specially because this is called early in
|
|
690 | - ;; initialization, before unidata is available.
|
|
691 | - #+nil
|
|
692 | - (cond ((< 64 code 91) (incf code 32))
|
|
693 | - ((> code 127) (setq code (unicode-lower code))))
|
|
685 | + ;; Use char-downcase if it's not a surrogate pair so that
|
|
686 | + ;; we're always consist.
|
|
694 | 687 | (if wide
|
695 | 688 | (setq code (unicode-lower code))
|
696 | 689 | (setq code (char-code (char-downcase (code-char code)))))
|
... | ... | @@ -761,12 +754,9 @@ |
761 | 754 | ((= index (the fixnum end)))
|
762 | 755 | (declare (fixnum index))
|
763 | 756 | (multiple-value-bind (code wide) (codepoint string index)
|
764 | - (declare (ignore wide))
|
|
765 | - ;; Handle ASCII specially because this is called early in
|
|
766 | - ;; initialization, before unidata is available.
|
|
767 | - (cond ((< 96 code 123) (decf code 32))
|
|
768 | - #+unicode
|
|
769 | - ((> code 127) (setq code (unicode-upper code))))
|
|
757 | + (if wide
|
|
758 | + (setq code (unicode-upper code))
|
|
759 | + (setf code (char-code (char-upcase (code-char code)))))
|
|
770 | 760 | ;;@@ WARNING: this may, in theory, need to extend string
|
771 | 761 | ;; (which, obviously, we can't do here. Unless
|
772 | 762 | ;; STRING is adjustable, maybe)
|
... | ... | @@ -788,10 +778,9 @@ |
788 | 778 | ((= index (the fixnum end)))
|
789 | 779 | (declare (fixnum index))
|
790 | 780 | (multiple-value-bind (code wide) (codepoint string index)
|
791 | - (declare (ignore wide))
|
|
792 | - (cond ((< 64 code 91) (incf code 32))
|
|
793 | - #+unicode
|
|
794 | - ((> code 127) (setq code (unicode-lower code))))
|
|
781 | + (if wide
|
|
782 | + (setq code (unicode-lower code))
|
|
783 | + (setf code (char-code (char-downcase (code-char code)))))
|
|
795 | 784 | ;;@@ WARNING: this may, in theory, need to extend string
|
796 | 785 | ;; (which, obviously, we can't do here. Unless
|
797 | 786 | ;; STRING is adjustable, maybe)
|
... | ... | @@ -5445,7 +5445,8 @@ msgstr "" |
5445 | 5445 | |
5446 | 5446 | #: src/code/char.lisp
|
5447 | 5447 | msgid ""
|
5448 | -"A character code strictly larger than this is handled using Unicode rules."
|
|
5448 | +"A character code strictly larger than this is handled using Unicode\n"
|
|
5449 | +" rules."
|
|
5449 | 5450 | msgstr ""
|
5450 | 5451 | |
5451 | 5452 | #: src/code/char.lisp
|
... | ... | @@ -22,7 +22,7 @@ |
22 | 22 | (loop for expected across s
|
23 | 23 | and actual across s-upcase
|
24 | 24 | when (char/= actual (char-upcase expected))
|
25 | - collect (list (char-upcase (char-code expected))
|
|
25 | + collect (list (char-upcase expected)
|
|
26 | 26 | (char-code actual))))))
|
27 | 27 | |
28 | 28 | (define-test string-downcase
|
... | ... | @@ -33,5 +33,29 @@ |
33 | 33 | (loop for expected across s
|
34 | 34 | and actual across s-downcase
|
35 | 35 | when (char/= actual (char-downcase expected))
|
36 | - collect (list (char-downcase (char-code expected))
|
|
36 | + collect (list (char-downcase expected)
|
|
37 | + (char-code actual))))))
|
|
38 | + |
|
39 | +(define-test nstring-upcase
|
|
40 | + (:tag :issues)
|
|
41 | + (let* ((s (make-test-string))
|
|
42 | + (ns (make-test-string))
|
|
43 | + (ns-upcase (nstring-upcase ns)))
|
|
44 | + (assert-false
|
|
45 | + (loop for expected across s
|
|
46 | + and actual across ns
|
|
47 | + when (char/= actual (char-upcase expected))
|
|
48 | + collect (list (char-upcase expected)
|
|
49 | + (char-code actual))))))
|
|
50 | + |
|
51 | +(define-test nstring-downcase
|
|
52 | + (:tag :issues)
|
|
53 | + (let* ((s (make-test-string))
|
|
54 | + (ns (make-test-string))
|
|
55 | + (ns-downcase (nstring-downcase ns)))
|
|
56 | + (assert-false
|
|
57 | + (loop for expected across s
|
|
58 | + and actual across ns
|
|
59 | + when (char/= actual (char-downcase expected))
|
|
60 | + collect (list (char-downcase expected)
|
|
37 | 61 | (char-code actual)))))) |