Raymond Toy pushed to branch master at cmucl / cmucl

Commits:

18 changed files:

Changes:

  • .gitlab-ci.yml
    1 1
     variables:
    
    2 2
       download_url: "https://common-lisp.net/project/cmucl/downloads/snapshots/2024/08"
    
    3 3
       version: "2024-08-x86"
    
    4
    -  bootstrap: ""
    
    4
    +  bootstrap: "-B boot-2024-08"
    
    5 5
     
    
    6 6
     
    
    7 7
     stages:
    

  • src/bootfiles/21e/boot-2024-08.lisp
    1
    +;; Bootstrap file for the ef-octet-count changes.  Just need to change
    
    2
    +;; the value of +ef-max+
    
    3
    +
    
    4
    +(in-package "STREAM")
    
    5
    +
    
    6
    +(handler-bind
    
    7
    +    ((error (lambda (c)
    
    8
    +	      (declare (ignore c))
    
    9
    +	      (invoke-restart 'continue))))
    
    10
    +  (defconstant +ef-max+ 14))

  • src/code/exports.lisp
    ... ... @@ -1456,7 +1456,8 @@
    1456 1456
         (name
    
    1457 1457
           '("STRING-TO-OCTETS" "OCTETS-TO-STRING" "*DEFAULT-EXTERNAL-FORMAT*"
    
    1458 1458
     	"STRING-ENCODE" "STRING-DECODE" "SET-SYSTEM-EXTERNAL-FORMAT"
    
    1459
    -	"LIST-ALL-EXTERNAL-FORMATS" "DESCRIBE-EXTERNAL-FORMAT"))
    
    1459
    +	"LIST-ALL-EXTERNAL-FORMATS" "DESCRIBE-EXTERNAL-FORMAT"
    
    1460
    +	"STRING-OCTET-COUNT"))
    
    1460 1461
       (intern name "STREAM"))
    
    1461 1462
     
    
    1462 1463
     (defpackage "EXTENSIONS"
    

  • src/code/extfmts.lisp
    ... ... @@ -19,7 +19,8 @@
    19 19
     	  string-encode string-decode set-system-external-format
    
    20 20
     	  +replacement-character-code+
    
    21 21
     	  list-all-external-formats
    
    22
    -	  describe-external-format))
    
    22
    +	  describe-external-format
    
    23
    +	  string-octet-count))
    
    23 24
     
    
    24 25
     (defvar *default-external-format*
    
    25 26
       :utf-8
    
    ... ... @@ -52,6 +53,7 @@
    52 53
       flush					; flush state
    
    53 54
       copy-state				; copy state
    
    54 55
       osc					; octets to string, counted
    
    56
    +  oc					; number of octets to encode string
    
    55 57
       max)
    
    56 58
     
    
    57 59
     ;; Unicode replacement character U+FFFD
    
    ... ... @@ -96,6 +98,11 @@
    96 98
       (copy-state nil :type (or null function) :read-only t)
    
    97 99
       (cache nil :type (or null simple-vector))
    
    98 100
       ;;
    
    101
    +  ;; Function to count the number of octets needed to encode a
    
    102
    +  ;; codepoint.  Basically like code-to-octets, except we return the
    
    103
    +  ;; number of octets needed instead of the octets themselves.
    
    104
    +  (octet-count #'%efni :type (or null function) :read-only t)
    
    105
    +  ;;
    
    99 106
       ;; Minimum number of octets needed to form a codepoint
    
    100 107
       (min 1 :type kernel:index :read-only t)
    
    101 108
       ;;
    
    ... ... @@ -126,7 +133,8 @@
    126 133
       (setf (gethash (ef-name ef) *external-formats*) ef))
    
    127 134
     
    
    128 135
     (declaim (inline ef-octets-to-code ef-code-to-octets ef-flush-state ef-copy-state
    
    129
    -		 ef-cache ef-min-octets ef-max-octets))
    
    136
    +		 ef-cache ef-min-octets ef-max-octets
    
    137
    +		 ef-octet-count))
    
    130 138
     
    
    131 139
     (defun ef-octets-to-code (ef)
    
    132 140
       (efx-octets-to-code (ef-efx ef)))
    
    ... ... @@ -143,6 +151,9 @@
    143 151
     (defun ef-cache (ef)
    
    144 152
       (efx-cache (ef-efx ef)))
    
    145 153
     
    
    154
    +(defun ef-octet-count (ef)
    
    155
    +  (efx-octet-count (ef-efx ef)))
    
    156
    +
    
    146 157
     (defun ef-min-octets (ef)
    
    147 158
       (efx-min (ef-efx ef)))
    
    148 159
     
    
    ... ... @@ -166,7 +177,7 @@
    166 177
     ;;; DEFINE-EXTERNAL-FORMAT  -- Public
    
    167 178
     ;;;
    
    168 179
     ;;; name (&key base min max size documentation) (&rest slots) octets-to-code
    
    169
    -;;;       code-to-octets flush-state copy-state
    
    180
    +;;;       code-to-octets flush-state copy-state octet-count
    
    170 181
     ;;;
    
    171 182
     ;;;   Define a new external format.  If base is specified, then an
    
    172 183
     ;;;   external format is defined that is based on a previously defined
    
    ... ... @@ -228,6 +239,15 @@
    228 239
     ;;;   This should probably be a deep copy so that if the original
    
    229 240
     ;;;   state is modified, the copy is not.
    
    230 241
     ;;;
    
    242
    +;;; octet-count (code state error &rest vars)
    
    243
    +;;;   Defines a form to determine the number of octets needed to
    
    244
    +;;;   encode the given CODE using the external format.  This is
    
    245
    +;;;   essentially the same as CODE-TO-OCTETS, except the encoding is
    
    246
    +;;;   not saved anywhere.  ERROR is the same as in CODE-TO-OCTETS.
    
    247
    +;;;
    
    248
    +;;;   This should return one value: the number of octets needed to
    
    249
    +;;;   encode the given code.
    
    250
    +;;;
    
    231 251
     ;;; Note: external-formats work on code-points, not
    
    232 252
     ;;;   characters, so that the entire 31 bit ISO-10646 range can be
    
    233 253
     ;;;   used internally regardless of the size of a character recognized
    
    ... ... @@ -238,7 +258,7 @@
    238 258
     (defmacro define-external-format (name (&key base min max size (documentation ""))
    
    239 259
     				       (&rest slots)
    
    240 260
     				       &optional octets-to-code code-to-octets
    
    241
    -				       flush-state copy-state)
    
    261
    +				       flush-state copy-state octet-count)
    
    242 262
       (let* ((tmp (gensym))
    
    243 263
     	 (min (or min size 1))
    
    244 264
     	 (max (or max size 6))
    
    ... ... @@ -282,7 +302,17 @@
    282 302
     		     (declare (ignorable ,state))
    
    283 303
     		     (let (,@',slotb
    
    284 304
     			   ,@(loop for var in vars collect `(,var (gensym))))
    
    285
    -		       ,body))))
    
    305
    +		       ,body)))
    
    306
    +		(octet-count ((code state error &rest vars) body)
    
    307
    +		  `(lambda (,',tmp ,state ,error)
    
    308
    +		     (declare (ignorable ,state ,error)
    
    309
    +			      (optimize (ext:inhibit-warnings 3)))
    
    310
    +		     (let (,@',slotb
    
    311
    +			   (,code ',code)
    
    312
    +			   ,@(loop for var in vars collect `(,var (gensym))))
    
    313
    +		       `(let ((,',code (the lisp:codepoint ,,',tmp)))
    
    314
    +			  (declare (ignorable ,',code))
    
    315
    +			  ,,body)))))
    
    286 316
            (%intern-ef (make-external-format ,name
    
    287 317
     		    ,(if base
    
    288 318
     			 `(ef-efx (find-external-format ,(ef-name base)))
    
    ... ... @@ -291,7 +321,8 @@
    291 321
     				    :flush-state ,flush-state
    
    292 322
     			            :copy-state ,copy-state
    
    293 323
     				    :cache (make-array +ef-max+
    
    294
    -							  :initial-element nil)
    
    324
    +						       :initial-element nil)
    
    325
    +				    :octet-count ,octet-count
    
    295 326
     				    :min ,(min min max)
    
    296 327
     				    :max ,(max min max)))
    
    297 328
     		    nil
    
    ... ... @@ -688,7 +719,20 @@ character and illegal outputs are replaced by a question mark.")
    688 719
     				 (intl:gettext "Cannot output codepoint #x~X to ISO8859-1 stream")
    
    689 720
     				 ,code 1))
    
    690 721
     		      #x3F)
    
    691
    -		  ,code))))
    
    722
    +		  ,code)))
    
    723
    +  ()
    
    724
    +  ()
    
    725
    +  (octet-count (code state error)
    
    726
    +    `(if (> ,code 255)
    
    727
    +	 (if ,error
    
    728
    +	     (locally
    
    729
    +		 ;; No warnings about fdefinition
    
    730
    +		 (declare (optimize (ext:inhibit-warnings 3)))
    
    731
    +	       (funcall ,error
    
    732
    +			(intl:gettext "Cannot output codepoint #x~X to ISO8859-1 stream")
    
    733
    +			,code 1))
    
    734
    +	     1)
    
    735
    +	 1)))
    
    692 736
     
    
    693 737
     ;;; OCTETS-TO-CODEPOINT, CODEPOINT-TO-OCTETS  -- Semi-Public
    
    694 738
     ;;;
    
    ... ... @@ -709,6 +753,10 @@ character and illegal outputs are replaced by a question mark.")
    709 753
       (let ((ef (find-external-format external-format)))
    
    710 754
         (funcall (ef-code-to-octets ef) code state output error)))
    
    711 755
     
    
    756
    +(defmacro count-codepoint-octets (external-format code state &optional error)
    
    757
    +  (let ((ef (find-external-format external-format)))
    
    758
    +    (funcall (ef-octet-count ef) code state error)))
    
    759
    +
    
    712 760
     
    
    713 761
     
    
    714 762
     (defvar *ef-base* +ef-max+)
    
    ... ... @@ -878,6 +926,42 @@ character and illegal outputs are replaced by a question mark.")
    878 926
         (when f
    
    879 927
           (funcall f state))))
    
    880 928
     
    
    929
    +(defmacro octet-count (external-format char state &optional error)
    
    930
    +  (let ((nchar (gensym))
    
    931
    +	(nstate (gensym))
    
    932
    +	(count-it (gensym))
    
    933
    +	(ch (gensym)))
    
    934
    +    `(let ((,nchar ,char)
    
    935
    +	   (,nstate ,state))
    
    936
    +       (when (null ,nstate) (setq ,nstate (setf ,state (cons nil nil))))
    
    937
    +       (if (lisp::surrogatep (char-code ,nchar) :high)
    
    938
    +	   (setf (car ,nstate) ,nchar)
    
    939
    +	   (flet ((,count-it (,ch)
    
    940
    +		    (count-codepoint-octets ,external-format ,ch (cdr ,nstate) ,error)))
    
    941
    +	     (if (car ,nstate)
    
    942
    +		 (prog1
    
    943
    +		     (,count-it (if (lisp::surrogatep (char-code ,nchar) :low)
    
    944
    +				    (surrogates-to-codepoint (car ,nstate) ,nchar)
    
    945
    +				    (if ,error
    
    946
    +					(locally
    
    947
    +					    (declare (optimize (ext:inhibit-warnings 3)))
    
    948
    +					  (funcall ,error
    
    949
    +						   (intl:gettext "Cannot convert invalide surrogate #~x~X to character")
    
    950
    +						   ,nchar))
    
    951
    +					+replacement-character-code+)))
    
    952
    +		   (setf (car ,nstate) nil))
    
    953
    +		 ;; A lone trailing (low surrogate gets replaced with
    
    954
    +		 ;; the replacement character.
    
    955
    +		 (,count-it (if (lisp::surrogatep (char-code ,nchar) :low)
    
    956
    +				(if ,error
    
    957
    +				    (locally
    
    958
    +					(declare (optimize (ext:inhibit-warnings 3)))
    
    959
    +				      (funcall ,error
    
    960
    +					       (intl:gettext "Cannot convert lone trailing surrogate #x~X to character")
    
    961
    +					       ,nchar))
    
    962
    +				    +replacement-character-code+)
    
    963
    +				(char-code ,nchar)))))))))
    
    964
    +
    
    881 965
     (def-ef-macro ef-string-to-octets (extfmt lisp::lisp +ef-max+ +ef-so+)
    
    882 966
       `(lambda (string start end buffer buffer-start buffer-end error bufferp
    
    883 967
     	    &aux (ptr buffer-start) (state nil) (last-octet buffer-start))
    
    ... ... @@ -1071,6 +1155,31 @@ character and illegal outputs are replaced by a question mark.")
    1071 1155
           (values (if stringp string (lisp::shrink-vector string pos)) (- pos s-start) last-octet new-state))))
    
    1072 1156
     
    
    1073 1157
     
    
    1158
    +(def-ef-macro ef-string-octet-count (extfmt lisp::lisp +ef-max+ +ef-oc+)
    
    1159
    +  `(lambda (string start end error &aux (total 0) (state nil))
    
    1160
    +     (dotimes (i (- end start) total)
    
    1161
    +       (incf total
    
    1162
    +	     (octet-count ,extfmt (schar string (+ start i)) state error)))))
    
    1163
    +
    
    1164
    +(defun string-octet-count (string &key (start 0) end  (external-format :default) error)
    
    1165
    +  "Compute the number of octets needed to convert String using the
    
    1166
    +  specified External-format.  The string is bound by Start (defaulting
    
    1167
    +  to 0) and End (defaulting to the end of the string)."
    
    1168
    +  (let ((composing-format-p
    
    1169
    +	  ;; Determine is the external format is a composing format
    
    1170
    +	  ;; which we determine by seeing that the name of the format
    
    1171
    +	  ;; is a cons.  Probably not the best way.
    
    1172
    +	  (consp (ef-name (find-external-format external-format)))))
    
    1173
    +    ;; We currently don't know how to get just the number of octets
    
    1174
    +    ;; when a composing external format is used.  As a workaround, use
    
    1175
    +    ;; STRING-TO-OCTETS to find the number of octets.
    
    1176
    +    (if composing-format-p
    
    1177
    +	 (nth-value 1
    
    1178
    +		    (string-to-octets string :start start :end end
    
    1179
    +					     :external-format external-format))
    
    1180
    +	 (lisp::with-array-data ((string string) (start start) (end end))
    
    1181
    +	   (funcall (ef-string-octet-count external-format)
    
    1182
    +		    string start end error)))))
    
    1074 1183
     
    
    1075 1184
     (def-ef-macro ef-encode (extfmt lisp::lisp +ef-max+ +ef-en+)
    
    1076 1185
       `(lambda (string start end result error  &aux (ptr 0) (state nil))
    
    ... ... @@ -1186,10 +1295,11 @@ character and illegal outputs are replaced by a question mark.")
    1186 1295
     	(#.+ef-so+ (%ef-string-to-octets ef))
    
    1187 1296
     	(#.+ef-en+ (%ef-encode ef))
    
    1188 1297
     	(#.+ef-de+ (%ef-decode ef))
    
    1189
    -	(#.+ef-osc+ (%ef-octets-to-string-counted ef))))
    
    1298
    +	(#.+ef-osc+ (%ef-octets-to-string-counted ef))
    
    1299
    +	(#.+ef-oc+ (%ef-octet-count ef))))
    
    1190 1300
         `(setf (aref (ef-cache (find-external-format ,(ef-name ef))) ,slot)
    
    1191 1301
     	 ,(subst (ef-name ef) ef
    
    1192
    -		 (function-lambda-expression (aref (ef-cache ef) slot))))))
    
    1302
    +		 (function-lambda-expression (aref (ef-cache ef) slot)))))))
    
    1193 1303
     
    
    1194 1304
     ;;; Builtin external formats.
    
    1195 1305
     
    
    ... ... @@ -1307,7 +1417,17 @@ replacement character.")
    1307 1417
              ((< ,code #x800) (utf8 ,code 1))
    
    1308 1418
              ((< ,code #x10000) (utf8 ,code 2))
    
    1309 1419
              ((< ,code #x110000) (utf8 ,code 3))
    
    1310
    -         (t (error "How did this happen?  Codepoint U+~X is illegal" ,code))))))
    
    1420
    +         (t (error "How did this happen?  Codepoint U+~X is illegal" ,code)))))
    
    1421
    +  ()
    
    1422
    +  ()
    
    1423
    +  (octet-count (code state error)
    
    1424
    +    `(locally
    
    1425
    +	 (declare (optimize (ext:inhibit-warnings 3)))
    
    1426
    +       (cond ((< ,code #x80) 1)
    
    1427
    +             ((< ,code #x800) 2)
    
    1428
    +             ((< ,code #x10000) 3)
    
    1429
    +             ((< ,code #x110000) 4)
    
    1430
    +             (t (error "How did this happen?  Codepoint U+~X is illegal" ,code))))))
    
    1311 1431
     
    
    1312 1432
     (define-external-format :ascii (:size 1 :documentation
    
    1313 1433
     "US ASCII 7-bit encoding.  Illegal input sequences are replaced with
    
    ... ... @@ -1333,4 +1453,14 @@ replaced with a question mark.")
    1333 1453
     			  (declare (optimize (ext:inhibit-warnings 3)))
    
    1334 1454
     			(funcall ,error "Cannot output codepoint #x~X to ASCII stream" ,code))
    
    1335 1455
     		      #x3F)
    
    1336
    -		  ,code))))
    1456
    +		  ,code)))
    
    1457
    +  ()
    
    1458
    +  ()
    
    1459
    +  (octet-count (code state error)
    
    1460
    +    `(if (> ,code #x7f)
    
    1461
    +	 (if ,error
    
    1462
    +	     (locally
    
    1463
    +		 (declare (optimize (ext:inhibit-warnings 3)))
    
    1464
    +	       (funcall ,error "Cannot output codepoint #x~X to ASCII stream" ,code))
    
    1465
    +	     1)
    
    1466
    +	 1)))

  • src/i18n/locale/cmucl.pot
    ... ... @@ -9300,6 +9300,13 @@ msgid ""
    9300 9300
     "  external format."
    
    9301 9301
     msgstr ""
    
    9302 9302
     
    
    9303
    +#: src/code/extfmts.lisp
    
    9304
    +msgid ""
    
    9305
    +"Compute the number of octets needed to convert String using the\n"
    
    9306
    +"  specified External-format.  The string is bound by Start (defaulting\n"
    
    9307
    +"  to 0) and End (defaulting to the end of the string)."
    
    9308
    +msgstr ""
    
    9309
    +
    
    9303 9310
     #: src/code/extfmts.lisp
    
    9304 9311
     msgid ""
    
    9305 9312
     "Encode the given String using External-Format and return a new\n"
    

  • src/pcl/simple-streams/external-formats/ascii.lisp
    ... ... @@ -33,5 +33,15 @@ replaced with a question mark.")
    33 33
     			  (declare (optimize (ext:inhibit-warnings 3)))
    
    34 34
     			(funcall ,error "Cannot output codepoint #x~X to ASCII stream" ,code))
    
    35 35
     		      #x3F)
    
    36
    -		  ,code))))
    
    36
    +		  ,code)))
    
    37
    +  ()
    
    38
    +  ()
    
    39
    +  (octet-count (code state error)
    
    40
    +    `(if (> ,code #x7f)
    
    41
    +	 (if ,error
    
    42
    +	     (locally
    
    43
    +		 (declare (optimize (ext:inhibit-warnings 3)))
    
    44
    +	       (funcall ,error "Cannot output codepoint #x~X to ASCII stream" ,code))
    
    45
    +	     1)
    
    46
    +	 1)))
    
    37 47
     

  • src/pcl/simple-streams/external-formats/euc-kr.lisp
    ... ... @@ -1007,4 +1007,16 @@ character and illegal outputs are replaced by a question mark.")
    1007 1007
     		 (t
    
    1008 1008
     		  (if ,error
    
    1009 1009
     		      (funcall ,error "Cannot output codepoint #x~X to EUC-KR format." ,code)
    
    1010
    -		      (,output #X3f))))))))
    1010
    +		      (,output #X3f)))))))
    
    1011
    +  ()
    
    1012
    +  ()
    
    1013
    +  (octet-count (code state error present)
    
    1014
    +    `(if (<= ,code #x7f)
    
    1015
    +	 1
    
    1016
    +	 (let ((,present (get-inverse ,itable ,code)))
    
    1017
    +	   (cond (,present
    
    1018
    +		  2)
    
    1019
    +		 (t
    
    1020
    +		  (if ,error
    
    1021
    +		      (funcall ,error "Cannot output codepoint #x~X to EUC-KR format." ,code)
    
    1022
    +		      1)))))))

  • src/pcl/simple-streams/external-formats/iso8859-1.lisp
    ... ... @@ -31,4 +31,17 @@ character and illegal outputs are replaced by a question mark.")
    31 31
     		     (funcall ,error "Cannot output codepoint #x~X to ISO8859-1 stream"
    
    32 32
     			      ,code 1))
    
    33 33
     		      #x3F)
    
    34
    -		  ,code))))
    34
    +		  ,code)))
    
    35
    +  ()
    
    36
    +  ()
    
    37
    +  (octet-count (code state error)
    
    38
    +    `(if (> ,code 255)
    
    39
    +	 (if ,error
    
    40
    +	     (locally
    
    41
    +		 ;; No warnings about fdefinition
    
    42
    +		 (declare (optimize (ext:inhibit-warnings 3)))
    
    43
    +	       (funcall ,error
    
    44
    +			(intl:gettext "Cannot output codepoint #x~X to ISO8859-1 stream")
    
    45
    +			,code 1))
    
    46
    +	     1)
    
    47
    +	 1)))

  • src/pcl/simple-streams/external-formats/iso8859-2.lisp
    ... ... @@ -47,4 +47,19 @@ character and illegal outputs are replaced by a question mark.")
    47 47
     				(declare (optimize (ext:inhibit-warnings 3)))
    
    48 48
     			      (funcall ,error "Cannot output codepoint #x~X to ISO8859-2 stream"
    
    49 49
     				       ,code))
    
    50
    -			    #x3F)))))))
    50
    +			    #x3F))))))
    
    51
    +  ()
    
    52
    +  ()
    
    53
    +  (octet-count (code state error present)
    
    54
    +    `(if (< ,code 160)
    
    55
    +	 1
    
    56
    +	 (let ((,present (get-inverse ,itable ,code)))
    
    57
    +	   (if ,present
    
    58
    +	       1
    
    59
    +	       (if ,error
    
    60
    +		   (locally
    
    61
    +		       ;; No warnings about fdefinition
    
    62
    +		       (declare (optimize (ext:inhibit-warnings 3)))
    
    63
    +		     (funcall ,error "Cannot output codepoint #x~X to ISO8859-2 stream"
    
    64
    +			      ,code))
    
    65
    +		   1))))))

  • src/pcl/simple-streams/external-formats/mac-roman.lisp
    ... ... @@ -49,4 +49,19 @@ character and illegal outputs are replaced by a question mark.")
    49 49
     				(declare (optimize (ext:inhibit-warnings 3)))
    
    50 50
     			      (funcall ,error "Cannot output codepoint #x~X to MAC-ROMAN stream"
    
    51 51
     				       ,code))
    
    52
    -			    #x3F)))))))
    52
    +			    #x3F))))))
    
    53
    +  ()
    
    54
    +  ()
    
    55
    +  (octet-count (code state error present)
    
    56
    +    `(if (< ,code 128)
    
    57
    +	 1
    
    58
    +	 (let ((,present (get-inverse ,itable ,code)))
    
    59
    +	   (if ,present
    
    60
    +	       1
    
    61
    +	       (if ,error
    
    62
    +		   (locally
    
    63
    +		       ;; No warnings about fdefinition
    
    64
    +		       (declare (optimize (ext:inhibit-warnings 3)))
    
    65
    +		     (funcall ,error "Cannot output codepoint #x~X to MAC-ROMAN stream"
    
    66
    +			      ,code))
    
    67
    +		   1))))))

  • src/pcl/simple-streams/external-formats/utf-16-be.lisp
    ... ... @@ -110,4 +110,12 @@ Unicode replacement character.")
    110 110
       (copy-state (state)
    
    111 111
         ;; The state is either NIL or a codepoint, so nothing really
    
    112 112
         ;; special is needed to copy it.
    
    113
    -    `(progn ,state)))
    113
    +    `(progn ,state))
    
    114
    +  (octet-count (code state error)
    
    115
    +    `(cond ((< ,code #x10000)
    
    116
    +	    2)
    
    117
    +	   ((< ,code #x110000)
    
    118
    +	    4)
    
    119
    +	   (t
    
    120
    +	    ;; Replacement character is 2 octets
    
    121
    +	    2))))

  • src/pcl/simple-streams/external-formats/utf-16-le.lisp
    ... ... @@ -111,4 +111,12 @@ Unicode replacement character.")
    111 111
       (copy-state (state)
    
    112 112
         ;; The state is either NIL or a codepoint, so nothing really
    
    113 113
         ;; special is needed.
    
    114
    -    `(progn ,state)))
    114
    +    `(progn ,state))
    
    115
    +  (octet-count (code state error)
    
    116
    +    `(cond ((< ,code #x10000)
    
    117
    +	    2)
    
    118
    +	   ((< ,code #x110000)
    
    119
    +	    4)
    
    120
    +	   (t
    
    121
    +	    ;; Replacement character is 2 octets
    
    122
    +	    2))))

  • src/pcl/simple-streams/external-formats/utf-16.lisp
    ... ... @@ -156,4 +156,18 @@ Unicode replacement character.")
    156 156
     		    ,c))))))
    
    157 157
       (copy-state (state)
    
    158 158
         ;; The state is list. Copy it
    
    159
    -    `(copy-list ,state)))
    159
    +	      `(copy-list ,state))
    
    160
    +  (octet-count (code state error)
    
    161
    +    `(let ((bom-count 0))
    
    162
    +       (unless ,state
    
    163
    +	 ;; Output BOM
    
    164
    +	 (setf bom-count 2)
    
    165
    +	 (setf ,state t))
    
    166
    +       (+ bom-count
    
    167
    +	  (cond ((< ,code #x10000)
    
    168
    +		 2)
    
    169
    +		((< ,code #x110000)
    
    170
    +		 4)
    
    171
    +		(t
    
    172
    +		 ;; Replacement character is 2 octets
    
    173
    +		 2))))))

  • src/pcl/simple-streams/external-formats/utf-32-be.lisp
    ... ... @@ -61,4 +61,18 @@ Unicode replacement character.")
    61 61
     				  ,code))
    
    62 62
     		       +replacement-character-code+)))
    
    63 63
     	     (t
    
    64
    -	      (out ,code))))))
    64
    +	      (out ,code)))))
    
    65
    +  ()
    
    66
    +  ()
    
    67
    +  (octet-count (code state error)
    
    68
    +    `(cond ((lisp::surrogatep ,code)
    
    69
    +	    (if ,error
    
    70
    +		(locally
    
    71
    +		    ;; No warnings about fdefinition
    
    72
    +		    (declare (optimize (ext:inhibit-warnings 3)))
    
    73
    +		  (funcall ,error "Surrogate code #x~4,'0X is illegal for UTF32 output"
    
    74
    +			   ,code))
    
    75
    +		;; Replacement character is 2 octets
    
    76
    +		2))
    
    77
    +	   (t
    
    78
    +	    4))))

  • src/pcl/simple-streams/external-formats/utf-32-le.lisp
    ... ... @@ -62,4 +62,18 @@ Unicode replacement character.")
    62 62
     				  ,code))
    
    63 63
     		       +replacement-character-code+)))
    
    64 64
     	     (t
    
    65
    -	      (out ,code))))))
    65
    +	      (out ,code)))))
    
    66
    +  ()
    
    67
    +  ()
    
    68
    +  (octet-count (code state error)
    
    69
    +    `(cond ((lisp::surrogatep ,code)
    
    70
    +	    (if ,error
    
    71
    +		(locally
    
    72
    +		    ;; No warnings about fdefinition
    
    73
    +		    (declare (optimize (ext:inhibit-warnings 3)))
    
    74
    +		  (funcall ,error "Surrogate code #x~4,'0X is illegal for UTF32 output"
    
    75
    +			   ,code))
    
    76
    +		;; Replacement character is 2 octets
    
    77
    +		2))
    
    78
    +	   (t
    
    79
    +	    4))))

  • src/pcl/simple-streams/external-formats/utf-32.lisp
    ... ... @@ -114,4 +114,20 @@ Unicode replacement character.")
    114 114
       nil
    
    115 115
       (copy-state (state)
    
    116 116
         ;; The state is either NIL or T, so we can just return that.
    
    117
    -    `(progn ,state)))
    117
    +    `(progn ,state))
    
    118
    +  (octet-count (code state error)
    
    119
    +    `(let ((bom-count 0))
    
    120
    +       (unless ,state
    
    121
    +	 (setf bom-count 4)
    
    122
    +	 (setf ,state t))
    
    123
    +       (cond ((lisp::surrogatep ,code)
    
    124
    +	      (if ,error
    
    125
    +		  (locally
    
    126
    +		      ;; No warnings about fdefinition
    
    127
    +		      (declare (optimize (ext:inhibit-warnings 3)))
    
    128
    +		    (funcall ,error "Surrogate code #x~4,'0X is illegal for UTF32 output"
    
    129
    +			     ,code))
    
    130
    +		  ;; Replacement character is 2 octets
    
    131
    +		  (+ 2 bom-count)))
    
    132
    +	     (t
    
    133
    +	      (+ 4 bom-count))))))

  • src/pcl/simple-streams/external-formats/utf-8.lisp
    ... ... @@ -127,4 +127,14 @@ replacement character.")
    127 127
              ((< ,code #x800) (utf8 ,code 1))
    
    128 128
              ((< ,code #x10000) (utf8 ,code 2))
    
    129 129
              ((< ,code #x110000) (utf8 ,code 3))
    
    130
    -         (t (error "How did this happen?  Codepoint U+~X is illegal" ,code))))))
    130
    +         (t (error "How did this happen?  Codepoint U+~X is illegal" ,code)))))
    
    131
    +  ()
    
    132
    +  ()
    
    133
    +  (octet-count (code state error)
    
    134
    +    `(locally
    
    135
    +	 (declare (optimize (ext:inhibit-warnings 3)))
    
    136
    +       (cond ((< ,code #x80) 1)
    
    137
    +             ((< ,code #x800) 2)
    
    138
    +             ((< ,code #x10000) 3)
    
    139
    +             ((< ,code #x110000) 4)
    
    140
    +             (t (error "How did this happen?  Codepoint U+~X is illegal" ,code))))))

  • tests/external-formats.lisp
    1
    +;;; Tests for external formats
    
    2
    +
    
    3
    +(defpackage :external-formats-tests
    
    4
    +  (:use :cl :lisp-unit))
    
    5
    +
    
    6
    +(in-package "EXTERNAL-FORMATS-TESTS")
    
    7
    +
    
    8
    +(defparameter *test-iso8859-1*
    
    9
    +  (let ((rs (kernel::make-random-object :state (kernel::init-random-state 27182828))))
    
    10
    +    (lisp::codepoints-string
    
    11
    +     (loop for k from 0 below 1000
    
    12
    +	   collect (random 256 rs))))
    
    13
    +  "Random test string with ISO8859-1 characters")
    
    14
    +
    
    15
    +(defparameter *test-unicode*
    
    16
    +  (let ((rs (kernel::make-random-object :state (kernel::init-random-state 27182828))))
    
    17
    +    (lisp::codepoints-string
    
    18
    +     (loop for k from 0 below 1000
    
    19
    +	   collect (random 20000 rs))))
    
    20
    +  "Random test string with codepoints below 20000")
    
    21
    +
    
    22
    +
    
    23
    +
    
    24
    +(defmacro test-octet-count (string format)
    
    25
    +  "Test that STRING-OCTET-COUNT returns the correct number of octets"
    
    26
    +  ;; We expect STRING-OCTET-COUNT returns the same number of octets
    
    27
    +  ;; that are produced by STRING-TO-OCTETS.
    
    28
    +  `(multiple-value-bind (octets count converted)
    
    29
    +       (stream:string-to-octets ,string :external-format ,format)
    
    30
    +     ;; While we're at it, make sure that the length of the octet
    
    31
    +     ;; buffer matches returned count.  And make sure we converted all
    
    32
    +     ;; the characters in the string.
    
    33
    +     (assert-equal (length octets) count)
    
    34
    +     (assert-equal (length ,string) converted)
    
    35
    +     ;; Finally, make sure that STRING-OCTET-COUNT returns the same
    
    36
    +     ;; number of octets from STRING-TO-OCTETS.
    
    37
    +     (assert-equal (length octets)
    
    38
    +		   (stream::string-octet-count ,string :external-format ,format))))
    
    39
    +
    
    40
    +(define-test octet-count.iso8859-1
    
    41
    +    (:tag :octet-count)
    
    42
    +  (test-octet-count *test-iso8859-1* :iso8859-1))
    
    43
    +
    
    44
    +(define-test octet-count.ascii
    
    45
    +    (:tag :octet-count)
    
    46
    +  (test-octet-count *test-iso8859-1* :ascii))
    
    47
    +
    
    48
    +(define-test octet-count.ascii.error
    
    49
    +    (:tag :octet-count)
    
    50
    +  (assert-error 'simple-error
    
    51
    +		(stream::string-octet-count *test-iso8859-1*
    
    52
    +					    :external-format :ascii
    
    53
    +					    :error 'error)))
    
    54
    +
    
    55
    +(define-test octet-count.utf-8
    
    56
    +    (:tag :octet-count)
    
    57
    +  (test-octet-count *test-unicode* :utf-8))
    
    58
    +
    
    59
    +(define-test octet-count.utf-16
    
    60
    +    (:tag :octet-count)
    
    61
    +  (test-octet-count *test-unicode* :utf-16))
    
    62
    +
    
    63
    +(define-test octet-count.utf-16-be
    
    64
    +    (:tag :octet-count)
    
    65
    +  (test-octet-count *test-unicode* :utf-16-be))
    
    66
    +
    
    67
    +(define-test octet-count.utf-16-le
    
    68
    +    (:tag :octet-count)
    
    69
    +  (test-octet-count *test-unicode* :utf-16-le))
    
    70
    +
    
    71
    +(define-test octet-count.utf-32
    
    72
    +    (:tag :octet-count)
    
    73
    +  (test-octet-count *test-unicode* :utf-32))
    
    74
    +
    
    75
    +(define-test octet-count.utf-32-le
    
    76
    +    (:tag :octet-count)
    
    77
    +  (test-octet-count *test-unicode* :utf-32-le))
    
    78
    +
    
    79
    +(define-test octet-count.utf-32-le
    
    80
    +    (:tag :octet-count)
    
    81
    +  (test-octet-count *test-unicode* :utf-32-le))
    
    82
    +
    
    83
    +(define-test octet-count.euc-kr
    
    84
    +    (:tag :octet-count)
    
    85
    +  (test-octet-count *test-unicode* :euc-kr))
    
    86
    +
    
    87
    +(define-test octet-count.iso8859-2
    
    88
    +    (:tag :octet-count)
    
    89
    +  (test-octet-count *test-iso8859-1* :iso8859-2))
    
    90
    +
    
    91
    +(define-test octet-count.iso8859-3
    
    92
    +    (:tag :octet-count)
    
    93
    +  (test-octet-count *test-iso8859-1* :iso8859-3))
    
    94
    +
    
    95
    +(define-test octet-count.iso8859-4
    
    96
    +    (:tag :octet-count)
    
    97
    +  (test-octet-count *test-iso8859-1* :iso8859-4))
    
    98
    +
    
    99
    +(define-test octet-count.iso8859-5
    
    100
    +    (:tag :octet-count)
    
    101
    +  (test-octet-count *test-iso8859-1* :iso8859-5))
    
    102
    +
    
    103
    +(define-test octet-count.iso8859-6
    
    104
    +    (:tag :octet-count)
    
    105
    +  (test-octet-count *test-iso8859-1* :iso8859-6))
    
    106
    +
    
    107
    +(define-test octet-count.iso8859-7
    
    108
    +    (:tag :octet-count)
    
    109
    +  (test-octet-count *test-iso8859-1* :iso8859-7))
    
    110
    +
    
    111
    +(define-test octet-count.iso8859-8
    
    112
    +    (:tag :octet-count)
    
    113
    +  (test-octet-count *test-iso8859-1* :iso8859-8))
    
    114
    +
    
    115
    +(define-test octet-count.iso8859-10
    
    116
    +    (:tag :octet-count)
    
    117
    +  (test-octet-count *test-iso8859-1* :iso8859-10))
    
    118
    +
    
    119
    +(define-test octet-count.iso8859-13
    
    120
    +    (:tag :octet-count)
    
    121
    +  (test-octet-count *test-iso8859-1* :iso8859-13))
    
    122
    +
    
    123
    +(define-test octet-count.iso8859-14
    
    124
    +    (:tag :octet-count)
    
    125
    +  (test-octet-count *test-iso8859-1* :iso8859-14))
    
    126
    +
    
    127
    +(define-test octet-count.iso8859-15
    
    128
    +    (:tag :octet-count)
    
    129
    +  (test-octet-count *test-iso8859-1* :iso8859-15))
    
    130
    +
    
    131
    +(define-test octet-count.mac-roman
    
    132
    +    (:tag :octet-count)
    
    133
    +  (test-octet-count *test-iso8859-1* :mac-roman))
    
    134
    +  
    
    135
    +