Raymond Toy pushed to branch master at cmucl / cmucl

Commits:

5 changed files:

Changes:

  • src/code/string.lisp
    ... ... @@ -638,27 +638,14 @@
    638 638
     	     (new-index (- start offset) (1+ new-index)))
    
    639 639
     	    ((= index (the fixnum end)))
    
    640 640
     	  (declare (fixnum index new-index))
    
    641
    -	  (multiple-value-bind (code wide) (codepoint string index)
    
    642
    -	    (when wide (incf index))
    
    643
    -            ;; Use char-upcase if it's not a surrogate pair so that
    
    644
    -            ;; we're always consist.
    
    645
    -            (if wide
    
    646
    -                (setq code (unicode-upper code))
    
    647
    -                (setf code (char-code (char-upcase (code-char code)))))
    
    648
    -	    ;;@@ WARNING: this may, in theory, need to extend newstring
    
    649
    -	    ;;  but that never actually occurs as of Unicode 5.1.0,
    
    650
    -	    ;;  so I'm just going to ignore it for now...
    
    651
    -	    (multiple-value-bind (hi lo) (surrogates code)
    
    652
    -	      (setf (schar newstring new-index) hi)
    
    653
    -	      (when lo
    
    654
    -		(setf (schar newstring (incf new-index)) lo)))))
    
    655
    -	;;@@ WARNING: see above
    
    656
    -	(do ((index end (1+ index))
    
    641
    +	  (setf (schar newstring new-index)
    
    642
    +                (char-upcase (schar string index))))
    
    643
    +        (do ((index end (1+ index))
    
    657 644
     	     (new-index (- (the fixnum end) offset) (1+ new-index)))
    
    658 645
     	    ((= index offset-slen))
    
    659 646
     	  (declare (fixnum index new-index))
    
    660 647
     	  (setf (schar newstring new-index) (schar string index)))
    
    661
    -	newstring))))
    
    648
    +        newstring))))
    
    662 649
     
    
    663 650
     (defun string-downcase (string &key (start 0) end)
    
    664 651
       _N"Given a string, returns a new string that is a copy of it with all
    
    ... ... @@ -680,64 +667,8 @@
    680 667
     	     (new-index (- start offset) (1+ new-index)))
    
    681 668
     	    ((= index (the fixnum end)))
    
    682 669
     	  (declare (fixnum index new-index))
    
    683
    -	  (multiple-value-bind (code wide) (codepoint string index)
    
    684
    -	    (when wide (incf index))
    
    685
    -            ;; Use char-downcase if it's not a surrogate pair so that
    
    686
    -            ;; we're always consist.
    
    687
    -            (if wide
    
    688
    -                (setq code (unicode-lower code))
    
    689
    -                (setq code (char-code (char-downcase (code-char code)))))
    
    690
    -	    ;;@@ WARNING: this may, in theory, need to extend newstring
    
    691
    -	    ;;  but that never actually occurs as of Unicode 5.1.0,
    
    692
    -	    ;;  so I'm just going to ignore it for now...
    
    693
    -	    (multiple-value-bind (hi lo) (surrogates code)
    
    694
    -	      (setf (schar newstring new-index) hi)
    
    695
    -	      (when lo
    
    696
    -		(setf (schar newstring (incf new-index)) lo)))))
    
    697
    -	;;@@ WARNING: see above
    
    698
    -	(do ((index end (1+ index))
    
    699
    -	     (new-index (- (the fixnum end) offset) (1+ new-index)))
    
    700
    -	    ((= index offset-slen))
    
    701
    -	  (declare (fixnum index new-index))
    
    702
    -	  (setf (schar newstring new-index) (schar string index)))
    
    703
    -	newstring))))
    
    704
    -
    
    705
    -#+nil
    
    706
    -(defun string-capitalize (string &key (start 0) end)
    
    707
    -  _N"Given a string, returns a copy of the string with the first
    
    708
    -  character of each ``word'' converted to upper-case, and remaining
    
    709
    -  chars in the word converted to lower case. A ``word'' is defined
    
    710
    -  to be a string of case-modifiable characters delimited by
    
    711
    -  non-case-modifiable chars."
    
    712
    -  (declare (fixnum start))
    
    713
    -  (let* ((string (if (stringp string) string (string string)))
    
    714
    -	 (slen (length string)))
    
    715
    -    (declare (fixnum slen))
    
    716
    -    (with-one-string string start end offset
    
    717
    -      (let ((offset-slen (+ slen offset))
    
    718
    -	    (newstring (make-string slen)))
    
    719
    -	(declare (fixnum offset-slen))
    
    720
    -	(do ((index offset (1+ index))
    
    721
    -	     (new-index 0 (1+ new-index)))
    
    722
    -	    ((= index start))
    
    723
    -	  (declare (fixnum index new-index))
    
    724
    -	  (setf (schar newstring new-index) (schar string index)))
    
    725
    -	(do ((index start (1+ index))
    
    726
    -	     (new-index (- start offset) (1+ new-index))
    
    727
    -	     (newword t)
    
    728
    -	     (char ()))
    
    729
    -	    ((= index (the fixnum end)))
    
    730
    -	  (declare (fixnum index new-index))
    
    731
    -	  (setq char (schar string index))
    
    732
    -	  (cond ((not (alphanumericp char))
    
    733
    -		 (setq newword t))
    
    734
    -		(newword
    
    735
    -		 ;;char is first case-modifiable after non-case-modifiable
    
    736
    -		 (setq char (char-titlecase char))
    
    737
    -		 (setq newword ()))
    
    738
    -		;;char is case-modifiable, but not first
    
    739
    -		(t (setq char (char-downcase char))))
    
    740
    -	  (setf (schar newstring new-index) char))
    
    670
    +	  (setf (schar newstring new-index)
    
    671
    +                (char-downcase (schar string index))))
    
    741 672
     	(do ((index end (1+ index))
    
    742 673
     	     (new-index (- (the fixnum end) offset) (1+ new-index)))
    
    743 674
     	    ((= index offset-slen))
    
    ... ... @@ -796,20 +727,9 @@
    796 727
           (do ((index start (1+ index)))
    
    797 728
     	  ((= index (the fixnum end)))
    
    798 729
     	(declare (fixnum index))
    
    799
    -	(multiple-value-bind (code wide) (codepoint string index)
    
    800
    -          (if wide
    
    801
    -              (setq code (unicode-upper code))
    
    802
    -              (setf code (char-code (char-upcase (code-char code)))))
    
    803
    -	  ;;@@ WARNING: this may, in theory, need to extend string
    
    804
    -	  ;;      (which, obviously, we can't do here.  Unless
    
    805
    -	  ;;       STRING is adjustable, maybe)
    
    806
    -	  ;;  but that never actually occurs as of Unicode 5.1.0,
    
    807
    -	  ;;  so I'm just going to ignore it for now...
    
    808
    -	  (multiple-value-bind (hi lo) (surrogates code)
    
    809
    -	    (setf (schar string index) hi)
    
    810
    -	    (when lo
    
    811
    -	      (setf (schar string (incf index)) lo))))))
    
    812
    -    save-header))
    
    730
    +        (setf (schar string index)
    
    731
    +              (char-upcase (schar string index))))
    
    732
    +    save-header)))
    
    813 733
     
    
    814 734
     (defun nstring-downcase (string &key (start 0) end)
    
    815 735
       _N"Given a string, returns that string with all upper case alphabetic
    
    ... ... @@ -820,45 +740,8 @@
    820 740
           (do ((index start (1+ index)))
    
    821 741
     	  ((= index (the fixnum end)))
    
    822 742
     	(declare (fixnum index))
    
    823
    -	(multiple-value-bind (code wide) (codepoint string index)
    
    824
    -          (if wide
    
    825
    -              (setq code (unicode-lower code))
    
    826
    -              (setf code (char-code (char-downcase (code-char code)))))
    
    827
    -	  ;;@@ WARNING: this may, in theory, need to extend string
    
    828
    -	  ;;      (which, obviously, we can't do here.  Unless
    
    829
    -	  ;;       STRING is adjustable, maybe)
    
    830
    -	  ;;  but that never actually occurs as of Unicode 5.1.0,
    
    831
    -	  ;;  so I'm just going to ignore it for now...
    
    832
    -	  (multiple-value-bind (hi lo) (surrogates code)
    
    833
    -	    (setf (schar string index) hi)
    
    834
    -	    (when lo
    
    835
    -	      (setf (schar string (incf index)) lo))))))
    
    836
    -    save-header))
    
    837
    -
    
    838
    -#+nil
    
    839
    -(defun nstring-capitalize (string &key (start 0) end)
    
    840
    -  "Given a string, returns that string with the first
    
    841
    -  character of each ``word'' converted to upper-case, and remaining
    
    842
    -  chars in the word converted to lower case. A ``word'' is defined
    
    843
    -  to be a string of case-modifiable characters delimited by
    
    844
    -  non-case-modifiable chars."
    
    845
    -  (declare (fixnum start))
    
    846
    -  (let ((save-header string))
    
    847
    -    (with-one-string string start end offset
    
    848
    -      (do ((index start (1+ index))
    
    849
    -	   (newword t)
    
    850
    -	   (char ()))
    
    851
    -	  ((= index (the fixnum end)))
    
    852
    -	(declare (fixnum index))
    
    853
    -	(setq char (schar string index))
    
    854
    -	(cond ((not (alphanumericp char))
    
    855
    -	       (setq newword t))
    
    856
    -	      (newword
    
    857
    -	       ;;char is first case-modifiable after non-case-modifiable
    
    858
    -	       (setf (schar string index) (char-titlecase char))
    
    859
    -	       (setq newword ()))
    
    860
    -	      (t
    
    861
    -	       (setf (schar string index) (char-downcase char))))))
    
    743
    +        (setf (schar string index)
    
    744
    +              (char-downcase (schar string index)))))
    
    862 745
         save-header))
    
    863 746
     
    
    864 747
     (defun nstring-capitalize (string &key (start 0) end)
    

  • src/code/unicode.lisp
    ... ... @@ -17,14 +17,57 @@
    17 17
     (in-package "UNICODE")
    
    18 18
     (intl:textdomain "cmucl")
    
    19 19
     
    
    20
    +(defun string-upcase-simple (string &key (start 0) end)
    
    21
    +  _N"Given a string, returns a new string that is a copy of it with all
    
    22
    +  lower case alphabetic characters converted to uppercase."
    
    23
    +  (declare (fixnum start))
    
    24
    +  (let* ((string (string string))
    
    25
    +	 (slen (length string)))
    
    26
    +    (declare (fixnum slen))
    
    27
    +    (lisp::with-one-string string start end offset
    
    28
    +      (let ((offset-slen (+ slen offset))
    
    29
    +	    (newstring (make-string slen)))
    
    30
    +	(declare (fixnum offset-slen))
    
    31
    +	(do ((index offset (1+ index))
    
    32
    +	     (new-index 0 (1+ new-index)))
    
    33
    +	    ((= index start))
    
    34
    +	  (declare (fixnum index new-index))
    
    35
    +	  (setf (schar newstring new-index) (schar string index)))
    
    36
    +	(do ((index start (1+ index))
    
    37
    +	     (new-index (- start offset) (1+ new-index)))
    
    38
    +	    ((= index (the fixnum end)))
    
    39
    +	  (declare (fixnum index new-index))
    
    40
    +	  (multiple-value-bind (code wide) (codepoint string index)
    
    41
    +	    (when wide (incf index))
    
    42
    +            ;; Use char-upcase if it's not a surrogate pair so that
    
    43
    +            ;; we're always consist.
    
    44
    +            (if wide
    
    45
    +                (setq code (unicode-upper code))
    
    46
    +                (setf code (char-code (char-upcase (code-char code)))))
    
    47
    +	    ;;@@ WARNING: this may, in theory, need to extend newstring
    
    48
    +	    ;;  but that never actually occurs as of Unicode 5.1.0,
    
    49
    +	    ;;  so I'm just going to ignore it for now...
    
    50
    +	    (multiple-value-bind (hi lo) (surrogates code)
    
    51
    +	      (setf (schar newstring new-index) hi)
    
    52
    +	      (when lo
    
    53
    +		(setf (schar newstring (incf new-index)) lo)))))
    
    54
    +	;;@@ WARNING: see above
    
    55
    +	(do ((index end (1+ index))
    
    56
    +	     (new-index (- (the fixnum end) offset) (1+ new-index)))
    
    57
    +	    ((= index offset-slen))
    
    58
    +	  (declare (fixnum index new-index))
    
    59
    +	  (setf (schar newstring new-index) (schar string index)))
    
    60
    +	newstring))))
    
    61
    +
    
    62
    +
    
    20 63
     ;; An example where this differs from cl:string-upcase differ:
    
    21 64
     ;; #\Latin_Small_Letter_Sharp_S
    
    22 65
     (defun string-upcase-full (string &key (start 0) end)
    
    23 66
       _N"Given a string, returns a new string that is a copy of it with
    
    24 67
       all lower case alphabetic characters converted to uppercase using
    
    25 68
       full case conversion."
    
    26
    -  (declare (fixnum start)) (let* ((string (if
    
    27
    -  (stringp string) string (string string)))
    
    69
    +  (declare (fixnum start))
    
    70
    +  (let* ((string (string string))
    
    28 71
     	 (slen (length string)))
    
    29 72
         (declare (fixnum slen))
    
    30 73
         (with-output-to-string (s)
    
    ... ... @@ -57,11 +100,55 @@
    57 100
       all lower case alphabetic characters converted to uppercase.  Casing
    
    58 101
       is :simple or :full for simple or full case conversion,
    
    59 102
       respectively."
    
    60
    -  (declare (fixnum start))
    
    61
    -  (if (eq casing :simple)
    
    62
    -      (cl:string-upcase string :start start :end end)
    
    63
    -      (string-upcase-full string :start start :end end)))
    
    103
    +  (declare (fixnum start)
    
    104
    +           (type (member :simple :full) casing))
    
    105
    +  (ecase casing
    
    106
    +    (:simple
    
    107
    +     (string-upcase-simple string :start start :end end))
    
    108
    +    (:full
    
    109
    +     (string-upcase-full string :start start :end end))))
    
    64 110
     
    
    111
    +(defun string-downcase-simple (string &key (start 0) end)
    
    112
    +  _N"Given a string, returns a new string that is a copy of it with all
    
    113
    +  upper case alphabetic characters converted to lowercase."
    
    114
    +  (declare (fixnum start))
    
    115
    +  (let* ((string (if (stringp string) string (string string)))
    
    116
    +	 (slen (length string)))
    
    117
    +    (declare (fixnum slen))
    
    118
    +    (lisp::with-one-string string start end offset
    
    119
    +      (let ((offset-slen (+ slen offset))
    
    120
    +	    (newstring (make-string slen)))
    
    121
    +	(declare (fixnum offset-slen))
    
    122
    +	(do ((index offset (1+ index))
    
    123
    +	     (new-index 0 (1+ new-index)))
    
    124
    +	    ((= index start))
    
    125
    +	  (declare (fixnum index new-index))
    
    126
    +	  (setf (schar newstring new-index) (schar string index)))
    
    127
    +	(do ((index start (1+ index))
    
    128
    +	     (new-index (- start offset) (1+ new-index)))
    
    129
    +	    ((= index (the fixnum end)))
    
    130
    +	  (declare (fixnum index new-index))
    
    131
    +	  (multiple-value-bind (code wide) (codepoint string index)
    
    132
    +	    (when wide (incf index))
    
    133
    +            ;; Use char-downcase if it's not a surrogate pair so that
    
    134
    +            ;; we're always consist.
    
    135
    +            (if wide
    
    136
    +                (setq code (unicode-lower code))
    
    137
    +                (setq code (char-code (char-downcase (code-char code)))))
    
    138
    +	    ;;@@ WARNING: this may, in theory, need to extend newstring
    
    139
    +	    ;;  but that never actually occurs as of Unicode 5.1.0,
    
    140
    +	    ;;  so I'm just going to ignore it for now...
    
    141
    +	    (multiple-value-bind (hi lo) (surrogates code)
    
    142
    +	      (setf (schar newstring new-index) hi)
    
    143
    +	      (when lo
    
    144
    +		(setf (schar newstring (incf new-index)) lo)))))
    
    145
    +	;;@@ WARNING: see above
    
    146
    +	(do ((index end (1+ index))
    
    147
    +	     (new-index (- (the fixnum end) offset) (1+ new-index)))
    
    148
    +	    ((= index offset-slen))
    
    149
    +	  (declare (fixnum index new-index))
    
    150
    +	  (setf (schar newstring new-index) (schar string index)))
    
    151
    +	newstring))))
    
    65 152
     
    
    66 153
     ;; An example this differs from cl:string-downcase:
    
    67 154
     ;; #\Latin_Capital_Letter_I_With_Dot_Above.
    
    ... ... @@ -104,10 +191,13 @@
    104 191
       uppercase alphabetic characters converted to lowercase.  Casing is
    
    105 192
       :simple or :full for simple or full case conversion, respectively."
    
    106 193
     
    
    107
    -  (declare (fixnum start))
    
    108
    -  (if (eq casing :simple)
    
    109
    -      (cl:string-downcase string :start start :end end)
    
    110
    -      (string-downcase-full string :start start :end end)))
    
    194
    +  (declare (fixnum start)
    
    195
    +           (type (member :simple :full) casing))
    
    196
    +  (ecase casing
    
    197
    +    (:simple
    
    198
    +     (string-downcase-simple string :start start :end end))
    
    199
    +    (:full
    
    200
    +     (string-downcase-full string :start start :end end))))
    
    111 201
     
    
    112 202
     
    
    113 203
     ;;;
    
    ... ... @@ -553,12 +643,14 @@
    553 643
       delimited by non-case-modifiable chars.  "
    
    554 644
     
    
    555 645
       (declare (fixnum start)
    
    556
    -	   (type (member :simple :full :title) casing))
    
    646
    +	   (type (member :simple-title :simple :full :title) casing))
    
    557 647
       (if unicode-word-break
    
    558 648
           (string-capitalize-unicode string :start start :end end :casing casing)
    
    559
    -      (if (eq casing :simple)
    
    560
    -	  (string-capitalize-simple string :start start :end end)
    
    561
    -	  (string-capitalize-full string :start start :end end :casing casing))))
    
    649
    +      (ecase casing
    
    650
    +        (:simple-title
    
    651
    +	 (string-capitalize-simple string :start start :end end))
    
    652
    +        ((:simple :full :title)
    
    653
    +	 (string-capitalize-full string :start start :end end :casing casing)))))
    
    562 654
     
    
    563 655
     
    
    564 656
     (defun decompose-hangul-syllable (cp stream)
    

  • src/i18n/locale/cmucl.pot
    ... ... @@ -3880,13 +3880,13 @@ msgid ""
    3880 3880
     "  a new string Count long filled with the fill character."
    
    3881 3881
     msgstr ""
    
    3882 3882
     
    
    3883
    -#: src/code/string.lisp
    
    3883
    +#: src/code/unicode.lisp src/code/string.lisp
    
    3884 3884
     msgid ""
    
    3885 3885
     "Given a string, returns a new string that is a copy of it with all\n"
    
    3886 3886
     "  lower case alphabetic characters converted to uppercase."
    
    3887 3887
     msgstr ""
    
    3888 3888
     
    
    3889
    -#: src/code/string.lisp
    
    3889
    +#: src/code/unicode.lisp src/code/string.lisp
    
    3890 3890
     msgid ""
    
    3891 3891
     "Given a string, returns a new string that is a copy of it with all\n"
    
    3892 3892
     "  upper case alphabetic characters converted to lowercase."
    

  • tests/string.lisp
    ... ... @@ -8,11 +8,10 @@
    8 8
     (defun make-test-string ()
    
    9 9
       ;; Create a string consisting of all the code units EXCEPT for the
    
    10 10
       ;; surrogates because string casing handles that differently.
    
    11
    -  (coerce
    
    12
    -   (loop for code from 0 to #xffff
    
    13
    -         unless (lisp::surrogatep code)
    
    14
    -         collect (code-char code))
    
    15
    -   'string))
    
    11
    +  (let ((s (make-string char-code-limit)))
    
    12
    +    (dotimes (k char-code-limit)
    
    13
    +      (setf (aref s k) (code-char k)))
    
    14
    +    s))
    
    16 15
     
    
    17 16
     (define-test string-upcase
    
    18 17
         (:tag :issues)
    
    ... ... @@ -62,32 +61,33 @@
    62 61
     
    
    63 62
     (define-test string-capitalize
    
    64 63
         (:tag :issues)
    
    65
    -  (let* ((s
    
    66
    -           ;; This string contains a couple of characters where
    
    67
    -           ;; Unicode has a titlecase version of the character.  We
    
    68
    -           ;; want to make sure we use char-upcase to capitalize the
    
    69
    -           ;; string instead of lisp::char-titlecse
    
    70
    -           (coerce
    
    71
    -             '(#\Latin_Small_Letter_Dz
    
    72
    -               #\a #\b
    
    73
    -               #\space
    
    74
    -               #\Latin_Small_Letter_Lj
    
    75
    -               #\A #\B)
    
    76
    -             'string))
    
    64
    +  (let* ((s (string-capitalize
    
    65
    +             ;; #\Latin_Small_Letter_Dz and #\Latin_Small_Letter_Lj
    
    66
    +             ;; #have Unicode titlecase characters that differ from
    
    67
    +             ;; CHAR-UPCASE.  This tests that STRING-CAPITALIZE use
    
    68
    +             ;; CHAR-UPCASE to produce the capitalization.
    
    69
    +             (coerce
    
    70
    +              '(#\Latin_Small_Letter_Dz
    
    71
    +                #\a #\b
    
    72
    +                #\space
    
    73
    +                #\Latin_Small_Letter_Lj
    
    74
    +                #\A #\B)
    
    75
    +              'string)))
    
    77 76
              (expected
    
    78
    -           ;; Manually convert S to a capitalized string using
    
    79
    -           ;; char-upcase/downcase.
    
    80
    -           (map 'string
    
    81
    -                #'(lambda (ch f)
    
    82
    -                    (funcall f ch))
    
    77
    +           ;; Manually convert the test string by calling CHAR-UPCASE
    
    78
    +           ;; or CHAR-DOWNCASE (or IDENTITY) to get the desired
    
    79
    +           ;; capitalized string.
    
    80
    +           (map 'list
    
    81
    +                #'(lambda (c f)
    
    82
    +                    (funcall f c))
    
    83 83
                     s
    
    84 84
                     (list #'char-upcase
    
    85 85
                           #'char-downcase
    
    86 86
                           #'char-downcase
    
    87
    -                      #'char-downcase
    
    87
    +                      #'identity
    
    88 88
                           #'char-upcase
    
    89 89
                           #'char-downcase
    
    90 90
                           #'char-downcase))))
    
    91 91
         (assert-equal
    
    92 92
          (map 'list #'char-name expected)
    
    93
    -     (map 'list #'char-name (string-capitalize s)))))
    93
    +     (map 'list #'char-name s))))

  • tests/unicode.lisp
    ... ... @@ -192,3 +192,19 @@
    192 192
     			      computed-breaks)))
    
    193 193
     		     (assert-equalp b
    
    194 194
     				    (do-test s)))))))))
    
    195
    +
    
    196
    +(define-test unicode.case-extend
    
    197
    +  "Test that Unicode never produces an upper or lower case character
    
    198
    +  outside the BMP for a character in the BMP"
    
    199
    +  (:tag :unicode)
    
    200
    +  ;; For each character code (that isn't a surrogate), find the
    
    201
    +  ;; corresponding Unicode upper and lowe case character.  Verify that
    
    202
    +  ;; this character is in the BMP.
    
    203
    +  (loop for code from 0 below char-code-limit
    
    204
    +        unless (lisp::surrogatep code)
    
    205
    +          do
    
    206
    +             (assert-true (< (lisp::unicode-upper code) char-code-limit)
    
    207
    +                          code (lisp::unicode-upper code))
    
    208
    +             (assert-true (< (lisp::unicode-lower code) char-code-limit)
    
    209
    +                          code (lisp::unicode-upper code))))
    
    210
    +