Raymond Toy pushed to branch master at cmucl / cmucl
Commits:
-
fd391099
by Raymond Toy at 2025-01-31T14:39:40+00:00
-
12e15d8b
by Raymond Toy at 2025-01-31T14:39:40+00:00
18 changed files:
- .gitlab-ci.yml
- + src/bootfiles/21e/boot-2024-08.lisp
- src/code/exports.lisp
- src/code/extfmts.lisp
- src/i18n/locale/cmucl.pot
- src/pcl/simple-streams/external-formats/ascii.lisp
- src/pcl/simple-streams/external-formats/euc-kr.lisp
- src/pcl/simple-streams/external-formats/iso8859-1.lisp
- src/pcl/simple-streams/external-formats/iso8859-2.lisp
- src/pcl/simple-streams/external-formats/mac-roman.lisp
- src/pcl/simple-streams/external-formats/utf-16-be.lisp
- src/pcl/simple-streams/external-formats/utf-16-le.lisp
- src/pcl/simple-streams/external-formats/utf-16.lisp
- src/pcl/simple-streams/external-formats/utf-32-be.lisp
- src/pcl/simple-streams/external-formats/utf-32-le.lisp
- src/pcl/simple-streams/external-formats/utf-32.lisp
- src/pcl/simple-streams/external-formats/utf-8.lisp
- + tests/external-formats.lisp
Changes:
1 | 1 | variables:
|
2 | 2 | download_url: "https://common-lisp.net/project/cmucl/downloads/snapshots/2024/08"
|
3 | 3 | version: "2024-08-x86"
|
4 | - bootstrap: ""
|
|
4 | + bootstrap: "-B boot-2024-08"
|
|
5 | 5 | |
6 | 6 | |
7 | 7 | stages:
|
1 | +;; Bootstrap file for the ef-octet-count changes. Just need to change
|
|
2 | +;; the value of +ef-max+
|
|
3 | + |
|
4 | +(in-package "STREAM")
|
|
5 | + |
|
6 | +(handler-bind
|
|
7 | + ((error (lambda (c)
|
|
8 | + (declare (ignore c))
|
|
9 | + (invoke-restart 'continue))))
|
|
10 | + (defconstant +ef-max+ 14)) |
... | ... | @@ -1456,7 +1456,8 @@ |
1456 | 1456 | (name
|
1457 | 1457 | '("STRING-TO-OCTETS" "OCTETS-TO-STRING" "*DEFAULT-EXTERNAL-FORMAT*"
|
1458 | 1458 | "STRING-ENCODE" "STRING-DECODE" "SET-SYSTEM-EXTERNAL-FORMAT"
|
1459 | - "LIST-ALL-EXTERNAL-FORMATS" "DESCRIBE-EXTERNAL-FORMAT"))
|
|
1459 | + "LIST-ALL-EXTERNAL-FORMATS" "DESCRIBE-EXTERNAL-FORMAT"
|
|
1460 | + "STRING-OCTET-COUNT"))
|
|
1460 | 1461 | (intern name "STREAM"))
|
1461 | 1462 | |
1462 | 1463 | (defpackage "EXTENSIONS"
|
... | ... | @@ -19,7 +19,8 @@ |
19 | 19 | string-encode string-decode set-system-external-format
|
20 | 20 | +replacement-character-code+
|
21 | 21 | list-all-external-formats
|
22 | - describe-external-format))
|
|
22 | + describe-external-format
|
|
23 | + string-octet-count))
|
|
23 | 24 | |
24 | 25 | (defvar *default-external-format*
|
25 | 26 | :utf-8
|
... | ... | @@ -52,6 +53,7 @@ |
52 | 53 | flush ; flush state
|
53 | 54 | copy-state ; copy state
|
54 | 55 | osc ; octets to string, counted
|
56 | + oc ; number of octets to encode string
|
|
55 | 57 | max)
|
56 | 58 | |
57 | 59 | ;; Unicode replacement character U+FFFD
|
... | ... | @@ -96,6 +98,11 @@ |
96 | 98 | (copy-state nil :type (or null function) :read-only t)
|
97 | 99 | (cache nil :type (or null simple-vector))
|
98 | 100 | ;;
|
101 | + ;; Function to count the number of octets needed to encode a
|
|
102 | + ;; codepoint. Basically like code-to-octets, except we return the
|
|
103 | + ;; number of octets needed instead of the octets themselves.
|
|
104 | + (octet-count #'%efni :type (or null function) :read-only t)
|
|
105 | + ;;
|
|
99 | 106 | ;; Minimum number of octets needed to form a codepoint
|
100 | 107 | (min 1 :type kernel:index :read-only t)
|
101 | 108 | ;;
|
... | ... | @@ -126,7 +133,8 @@ |
126 | 133 | (setf (gethash (ef-name ef) *external-formats*) ef))
|
127 | 134 | |
128 | 135 | (declaim (inline ef-octets-to-code ef-code-to-octets ef-flush-state ef-copy-state
|
129 | - ef-cache ef-min-octets ef-max-octets))
|
|
136 | + ef-cache ef-min-octets ef-max-octets
|
|
137 | + ef-octet-count))
|
|
130 | 138 | |
131 | 139 | (defun ef-octets-to-code (ef)
|
132 | 140 | (efx-octets-to-code (ef-efx ef)))
|
... | ... | @@ -143,6 +151,9 @@ |
143 | 151 | (defun ef-cache (ef)
|
144 | 152 | (efx-cache (ef-efx ef)))
|
145 | 153 | |
154 | +(defun ef-octet-count (ef)
|
|
155 | + (efx-octet-count (ef-efx ef)))
|
|
156 | + |
|
146 | 157 | (defun ef-min-octets (ef)
|
147 | 158 | (efx-min (ef-efx ef)))
|
148 | 159 | |
... | ... | @@ -166,7 +177,7 @@ |
166 | 177 | ;;; DEFINE-EXTERNAL-FORMAT -- Public
|
167 | 178 | ;;;
|
168 | 179 | ;;; name (&key base min max size documentation) (&rest slots) octets-to-code
|
169 | -;;; code-to-octets flush-state copy-state
|
|
180 | +;;; code-to-octets flush-state copy-state octet-count
|
|
170 | 181 | ;;;
|
171 | 182 | ;;; Define a new external format. If base is specified, then an
|
172 | 183 | ;;; external format is defined that is based on a previously defined
|
... | ... | @@ -228,6 +239,15 @@ |
228 | 239 | ;;; This should probably be a deep copy so that if the original
|
229 | 240 | ;;; state is modified, the copy is not.
|
230 | 241 | ;;;
|
242 | +;;; octet-count (code state error &rest vars)
|
|
243 | +;;; Defines a form to determine the number of octets needed to
|
|
244 | +;;; encode the given CODE using the external format. This is
|
|
245 | +;;; essentially the same as CODE-TO-OCTETS, except the encoding is
|
|
246 | +;;; not saved anywhere. ERROR is the same as in CODE-TO-OCTETS.
|
|
247 | +;;;
|
|
248 | +;;; This should return one value: the number of octets needed to
|
|
249 | +;;; encode the given code.
|
|
250 | +;;;
|
|
231 | 251 | ;;; Note: external-formats work on code-points, not
|
232 | 252 | ;;; characters, so that the entire 31 bit ISO-10646 range can be
|
233 | 253 | ;;; used internally regardless of the size of a character recognized
|
... | ... | @@ -238,7 +258,7 @@ |
238 | 258 | (defmacro define-external-format (name (&key base min max size (documentation ""))
|
239 | 259 | (&rest slots)
|
240 | 260 | &optional octets-to-code code-to-octets
|
241 | - flush-state copy-state)
|
|
261 | + flush-state copy-state octet-count)
|
|
242 | 262 | (let* ((tmp (gensym))
|
243 | 263 | (min (or min size 1))
|
244 | 264 | (max (or max size 6))
|
... | ... | @@ -282,7 +302,17 @@ |
282 | 302 | (declare (ignorable ,state))
|
283 | 303 | (let (,@',slotb
|
284 | 304 | ,@(loop for var in vars collect `(,var (gensym))))
|
285 | - ,body))))
|
|
305 | + ,body)))
|
|
306 | + (octet-count ((code state error &rest vars) body)
|
|
307 | + `(lambda (,',tmp ,state ,error)
|
|
308 | + (declare (ignorable ,state ,error)
|
|
309 | + (optimize (ext:inhibit-warnings 3)))
|
|
310 | + (let (,@',slotb
|
|
311 | + (,code ',code)
|
|
312 | + ,@(loop for var in vars collect `(,var (gensym))))
|
|
313 | + `(let ((,',code (the lisp:codepoint ,,',tmp)))
|
|
314 | + (declare (ignorable ,',code))
|
|
315 | + ,,body)))))
|
|
286 | 316 | (%intern-ef (make-external-format ,name
|
287 | 317 | ,(if base
|
288 | 318 | `(ef-efx (find-external-format ,(ef-name base)))
|
... | ... | @@ -291,7 +321,8 @@ |
291 | 321 | :flush-state ,flush-state
|
292 | 322 | :copy-state ,copy-state
|
293 | 323 | :cache (make-array +ef-max+
|
294 | - :initial-element nil)
|
|
324 | + :initial-element nil)
|
|
325 | + :octet-count ,octet-count
|
|
295 | 326 | :min ,(min min max)
|
296 | 327 | :max ,(max min max)))
|
297 | 328 | nil
|
... | ... | @@ -688,7 +719,20 @@ character and illegal outputs are replaced by a question mark.") |
688 | 719 | (intl:gettext "Cannot output codepoint #x~X to ISO8859-1 stream")
|
689 | 720 | ,code 1))
|
690 | 721 | #x3F)
|
691 | - ,code))))
|
|
722 | + ,code)))
|
|
723 | + ()
|
|
724 | + ()
|
|
725 | + (octet-count (code state error)
|
|
726 | + `(if (> ,code 255)
|
|
727 | + (if ,error
|
|
728 | + (locally
|
|
729 | + ;; No warnings about fdefinition
|
|
730 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
731 | + (funcall ,error
|
|
732 | + (intl:gettext "Cannot output codepoint #x~X to ISO8859-1 stream")
|
|
733 | + ,code 1))
|
|
734 | + 1)
|
|
735 | + 1)))
|
|
692 | 736 | |
693 | 737 | ;;; OCTETS-TO-CODEPOINT, CODEPOINT-TO-OCTETS -- Semi-Public
|
694 | 738 | ;;;
|
... | ... | @@ -709,6 +753,10 @@ character and illegal outputs are replaced by a question mark.") |
709 | 753 | (let ((ef (find-external-format external-format)))
|
710 | 754 | (funcall (ef-code-to-octets ef) code state output error)))
|
711 | 755 | |
756 | +(defmacro count-codepoint-octets (external-format code state &optional error)
|
|
757 | + (let ((ef (find-external-format external-format)))
|
|
758 | + (funcall (ef-octet-count ef) code state error)))
|
|
759 | + |
|
712 | 760 | |
713 | 761 | |
714 | 762 | (defvar *ef-base* +ef-max+)
|
... | ... | @@ -878,6 +926,42 @@ character and illegal outputs are replaced by a question mark.") |
878 | 926 | (when f
|
879 | 927 | (funcall f state))))
|
880 | 928 | |
929 | +(defmacro octet-count (external-format char state &optional error)
|
|
930 | + (let ((nchar (gensym))
|
|
931 | + (nstate (gensym))
|
|
932 | + (count-it (gensym))
|
|
933 | + (ch (gensym)))
|
|
934 | + `(let ((,nchar ,char)
|
|
935 | + (,nstate ,state))
|
|
936 | + (when (null ,nstate) (setq ,nstate (setf ,state (cons nil nil))))
|
|
937 | + (if (lisp::surrogatep (char-code ,nchar) :high)
|
|
938 | + (setf (car ,nstate) ,nchar)
|
|
939 | + (flet ((,count-it (,ch)
|
|
940 | + (count-codepoint-octets ,external-format ,ch (cdr ,nstate) ,error)))
|
|
941 | + (if (car ,nstate)
|
|
942 | + (prog1
|
|
943 | + (,count-it (if (lisp::surrogatep (char-code ,nchar) :low)
|
|
944 | + (surrogates-to-codepoint (car ,nstate) ,nchar)
|
|
945 | + (if ,error
|
|
946 | + (locally
|
|
947 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
948 | + (funcall ,error
|
|
949 | + (intl:gettext "Cannot convert invalide surrogate #~x~X to character")
|
|
950 | + ,nchar))
|
|
951 | + +replacement-character-code+)))
|
|
952 | + (setf (car ,nstate) nil))
|
|
953 | + ;; A lone trailing (low surrogate gets replaced with
|
|
954 | + ;; the replacement character.
|
|
955 | + (,count-it (if (lisp::surrogatep (char-code ,nchar) :low)
|
|
956 | + (if ,error
|
|
957 | + (locally
|
|
958 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
959 | + (funcall ,error
|
|
960 | + (intl:gettext "Cannot convert lone trailing surrogate #x~X to character")
|
|
961 | + ,nchar))
|
|
962 | + +replacement-character-code+)
|
|
963 | + (char-code ,nchar)))))))))
|
|
964 | + |
|
881 | 965 | (def-ef-macro ef-string-to-octets (extfmt lisp::lisp +ef-max+ +ef-so+)
|
882 | 966 | `(lambda (string start end buffer buffer-start buffer-end error bufferp
|
883 | 967 | &aux (ptr buffer-start) (state nil) (last-octet buffer-start))
|
... | ... | @@ -1071,6 +1155,31 @@ character and illegal outputs are replaced by a question mark.") |
1071 | 1155 | (values (if stringp string (lisp::shrink-vector string pos)) (- pos s-start) last-octet new-state))))
|
1072 | 1156 | |
1073 | 1157 | |
1158 | +(def-ef-macro ef-string-octet-count (extfmt lisp::lisp +ef-max+ +ef-oc+)
|
|
1159 | + `(lambda (string start end error &aux (total 0) (state nil))
|
|
1160 | + (dotimes (i (- end start) total)
|
|
1161 | + (incf total
|
|
1162 | + (octet-count ,extfmt (schar string (+ start i)) state error)))))
|
|
1163 | + |
|
1164 | +(defun string-octet-count (string &key (start 0) end (external-format :default) error)
|
|
1165 | + "Compute the number of octets needed to convert String using the
|
|
1166 | + specified External-format. The string is bound by Start (defaulting
|
|
1167 | + to 0) and End (defaulting to the end of the string)."
|
|
1168 | + (let ((composing-format-p
|
|
1169 | + ;; Determine is the external format is a composing format
|
|
1170 | + ;; which we determine by seeing that the name of the format
|
|
1171 | + ;; is a cons. Probably not the best way.
|
|
1172 | + (consp (ef-name (find-external-format external-format)))))
|
|
1173 | + ;; We currently don't know how to get just the number of octets
|
|
1174 | + ;; when a composing external format is used. As a workaround, use
|
|
1175 | + ;; STRING-TO-OCTETS to find the number of octets.
|
|
1176 | + (if composing-format-p
|
|
1177 | + (nth-value 1
|
|
1178 | + (string-to-octets string :start start :end end
|
|
1179 | + :external-format external-format))
|
|
1180 | + (lisp::with-array-data ((string string) (start start) (end end))
|
|
1181 | + (funcall (ef-string-octet-count external-format)
|
|
1182 | + string start end error)))))
|
|
1074 | 1183 | |
1075 | 1184 | (def-ef-macro ef-encode (extfmt lisp::lisp +ef-max+ +ef-en+)
|
1076 | 1185 | `(lambda (string start end result error &aux (ptr 0) (state nil))
|
... | ... | @@ -1186,10 +1295,11 @@ character and illegal outputs are replaced by a question mark.") |
1186 | 1295 | (#.+ef-so+ (%ef-string-to-octets ef))
|
1187 | 1296 | (#.+ef-en+ (%ef-encode ef))
|
1188 | 1297 | (#.+ef-de+ (%ef-decode ef))
|
1189 | - (#.+ef-osc+ (%ef-octets-to-string-counted ef))))
|
|
1298 | + (#.+ef-osc+ (%ef-octets-to-string-counted ef))
|
|
1299 | + (#.+ef-oc+ (%ef-octet-count ef))))
|
|
1190 | 1300 | `(setf (aref (ef-cache (find-external-format ,(ef-name ef))) ,slot)
|
1191 | 1301 | ,(subst (ef-name ef) ef
|
1192 | - (function-lambda-expression (aref (ef-cache ef) slot))))))
|
|
1302 | + (function-lambda-expression (aref (ef-cache ef) slot)))))))
|
|
1193 | 1303 | |
1194 | 1304 | ;;; Builtin external formats.
|
1195 | 1305 | |
... | ... | @@ -1307,7 +1417,17 @@ replacement character.") |
1307 | 1417 | ((< ,code #x800) (utf8 ,code 1))
|
1308 | 1418 | ((< ,code #x10000) (utf8 ,code 2))
|
1309 | 1419 | ((< ,code #x110000) (utf8 ,code 3))
|
1310 | - (t (error "How did this happen? Codepoint U+~X is illegal" ,code))))))
|
|
1420 | + (t (error "How did this happen? Codepoint U+~X is illegal" ,code)))))
|
|
1421 | + ()
|
|
1422 | + ()
|
|
1423 | + (octet-count (code state error)
|
|
1424 | + `(locally
|
|
1425 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
1426 | + (cond ((< ,code #x80) 1)
|
|
1427 | + ((< ,code #x800) 2)
|
|
1428 | + ((< ,code #x10000) 3)
|
|
1429 | + ((< ,code #x110000) 4)
|
|
1430 | + (t (error "How did this happen? Codepoint U+~X is illegal" ,code))))))
|
|
1311 | 1431 | |
1312 | 1432 | (define-external-format :ascii (:size 1 :documentation
|
1313 | 1433 | "US ASCII 7-bit encoding. Illegal input sequences are replaced with
|
... | ... | @@ -1333,4 +1453,14 @@ replaced with a question mark.") |
1333 | 1453 | (declare (optimize (ext:inhibit-warnings 3)))
|
1334 | 1454 | (funcall ,error "Cannot output codepoint #x~X to ASCII stream" ,code))
|
1335 | 1455 | #x3F)
|
1336 | - ,code)))) |
|
1456 | + ,code)))
|
|
1457 | + ()
|
|
1458 | + ()
|
|
1459 | + (octet-count (code state error)
|
|
1460 | + `(if (> ,code #x7f)
|
|
1461 | + (if ,error
|
|
1462 | + (locally
|
|
1463 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
1464 | + (funcall ,error "Cannot output codepoint #x~X to ASCII stream" ,code))
|
|
1465 | + 1)
|
|
1466 | + 1))) |
... | ... | @@ -9300,6 +9300,13 @@ msgid "" |
9300 | 9300 | " external format."
|
9301 | 9301 | msgstr ""
|
9302 | 9302 | |
9303 | +#: src/code/extfmts.lisp
|
|
9304 | +msgid ""
|
|
9305 | +"Compute the number of octets needed to convert String using the\n"
|
|
9306 | +" specified External-format. The string is bound by Start (defaulting\n"
|
|
9307 | +" to 0) and End (defaulting to the end of the string)."
|
|
9308 | +msgstr ""
|
|
9309 | + |
|
9303 | 9310 | #: src/code/extfmts.lisp
|
9304 | 9311 | msgid ""
|
9305 | 9312 | "Encode the given String using External-Format and return a new\n"
|
... | ... | @@ -33,5 +33,15 @@ replaced with a question mark.") |
33 | 33 | (declare (optimize (ext:inhibit-warnings 3)))
|
34 | 34 | (funcall ,error "Cannot output codepoint #x~X to ASCII stream" ,code))
|
35 | 35 | #x3F)
|
36 | - ,code))))
|
|
36 | + ,code)))
|
|
37 | + ()
|
|
38 | + ()
|
|
39 | + (octet-count (code state error)
|
|
40 | + `(if (> ,code #x7f)
|
|
41 | + (if ,error
|
|
42 | + (locally
|
|
43 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
44 | + (funcall ,error "Cannot output codepoint #x~X to ASCII stream" ,code))
|
|
45 | + 1)
|
|
46 | + 1)))
|
|
37 | 47 |
... | ... | @@ -1007,4 +1007,16 @@ character and illegal outputs are replaced by a question mark.") |
1007 | 1007 | (t
|
1008 | 1008 | (if ,error
|
1009 | 1009 | (funcall ,error "Cannot output codepoint #x~X to EUC-KR format." ,code)
|
1010 | - (,output #X3f)))))))) |
|
1010 | + (,output #X3f)))))))
|
|
1011 | + ()
|
|
1012 | + ()
|
|
1013 | + (octet-count (code state error present)
|
|
1014 | + `(if (<= ,code #x7f)
|
|
1015 | + 1
|
|
1016 | + (let ((,present (get-inverse ,itable ,code)))
|
|
1017 | + (cond (,present
|
|
1018 | + 2)
|
|
1019 | + (t
|
|
1020 | + (if ,error
|
|
1021 | + (funcall ,error "Cannot output codepoint #x~X to EUC-KR format." ,code)
|
|
1022 | + 1))))))) |
... | ... | @@ -31,4 +31,17 @@ character and illegal outputs are replaced by a question mark.") |
31 | 31 | (funcall ,error "Cannot output codepoint #x~X to ISO8859-1 stream"
|
32 | 32 | ,code 1))
|
33 | 33 | #x3F)
|
34 | - ,code)))) |
|
34 | + ,code)))
|
|
35 | + ()
|
|
36 | + ()
|
|
37 | + (octet-count (code state error)
|
|
38 | + `(if (> ,code 255)
|
|
39 | + (if ,error
|
|
40 | + (locally
|
|
41 | + ;; No warnings about fdefinition
|
|
42 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
43 | + (funcall ,error
|
|
44 | + (intl:gettext "Cannot output codepoint #x~X to ISO8859-1 stream")
|
|
45 | + ,code 1))
|
|
46 | + 1)
|
|
47 | + 1))) |
... | ... | @@ -47,4 +47,19 @@ character and illegal outputs are replaced by a question mark.") |
47 | 47 | (declare (optimize (ext:inhibit-warnings 3)))
|
48 | 48 | (funcall ,error "Cannot output codepoint #x~X to ISO8859-2 stream"
|
49 | 49 | ,code))
|
50 | - #x3F))))))) |
|
50 | + #x3F))))))
|
|
51 | + ()
|
|
52 | + ()
|
|
53 | + (octet-count (code state error present)
|
|
54 | + `(if (< ,code 160)
|
|
55 | + 1
|
|
56 | + (let ((,present (get-inverse ,itable ,code)))
|
|
57 | + (if ,present
|
|
58 | + 1
|
|
59 | + (if ,error
|
|
60 | + (locally
|
|
61 | + ;; No warnings about fdefinition
|
|
62 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
63 | + (funcall ,error "Cannot output codepoint #x~X to ISO8859-2 stream"
|
|
64 | + ,code))
|
|
65 | + 1)))))) |
... | ... | @@ -49,4 +49,19 @@ character and illegal outputs are replaced by a question mark.") |
49 | 49 | (declare (optimize (ext:inhibit-warnings 3)))
|
50 | 50 | (funcall ,error "Cannot output codepoint #x~X to MAC-ROMAN stream"
|
51 | 51 | ,code))
|
52 | - #x3F))))))) |
|
52 | + #x3F))))))
|
|
53 | + ()
|
|
54 | + ()
|
|
55 | + (octet-count (code state error present)
|
|
56 | + `(if (< ,code 128)
|
|
57 | + 1
|
|
58 | + (let ((,present (get-inverse ,itable ,code)))
|
|
59 | + (if ,present
|
|
60 | + 1
|
|
61 | + (if ,error
|
|
62 | + (locally
|
|
63 | + ;; No warnings about fdefinition
|
|
64 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
65 | + (funcall ,error "Cannot output codepoint #x~X to MAC-ROMAN stream"
|
|
66 | + ,code))
|
|
67 | + 1)))))) |
... | ... | @@ -110,4 +110,12 @@ Unicode replacement character.") |
110 | 110 | (copy-state (state)
|
111 | 111 | ;; The state is either NIL or a codepoint, so nothing really
|
112 | 112 | ;; special is needed to copy it.
|
113 | - `(progn ,state))) |
|
113 | + `(progn ,state))
|
|
114 | + (octet-count (code state error)
|
|
115 | + `(cond ((< ,code #x10000)
|
|
116 | + 2)
|
|
117 | + ((< ,code #x110000)
|
|
118 | + 4)
|
|
119 | + (t
|
|
120 | + ;; Replacement character is 2 octets
|
|
121 | + 2)))) |
... | ... | @@ -111,4 +111,12 @@ Unicode replacement character.") |
111 | 111 | (copy-state (state)
|
112 | 112 | ;; The state is either NIL or a codepoint, so nothing really
|
113 | 113 | ;; special is needed.
|
114 | - `(progn ,state))) |
|
114 | + `(progn ,state))
|
|
115 | + (octet-count (code state error)
|
|
116 | + `(cond ((< ,code #x10000)
|
|
117 | + 2)
|
|
118 | + ((< ,code #x110000)
|
|
119 | + 4)
|
|
120 | + (t
|
|
121 | + ;; Replacement character is 2 octets
|
|
122 | + 2)))) |
... | ... | @@ -156,4 +156,18 @@ Unicode replacement character.") |
156 | 156 | ,c))))))
|
157 | 157 | (copy-state (state)
|
158 | 158 | ;; The state is list. Copy it
|
159 | - `(copy-list ,state))) |
|
159 | + `(copy-list ,state))
|
|
160 | + (octet-count (code state error)
|
|
161 | + `(let ((bom-count 0))
|
|
162 | + (unless ,state
|
|
163 | + ;; Output BOM
|
|
164 | + (setf bom-count 2)
|
|
165 | + (setf ,state t))
|
|
166 | + (+ bom-count
|
|
167 | + (cond ((< ,code #x10000)
|
|
168 | + 2)
|
|
169 | + ((< ,code #x110000)
|
|
170 | + 4)
|
|
171 | + (t
|
|
172 | + ;; Replacement character is 2 octets
|
|
173 | + 2)))))) |
... | ... | @@ -61,4 +61,18 @@ Unicode replacement character.") |
61 | 61 | ,code))
|
62 | 62 | +replacement-character-code+)))
|
63 | 63 | (t
|
64 | - (out ,code)))))) |
|
64 | + (out ,code)))))
|
|
65 | + ()
|
|
66 | + ()
|
|
67 | + (octet-count (code state error)
|
|
68 | + `(cond ((lisp::surrogatep ,code)
|
|
69 | + (if ,error
|
|
70 | + (locally
|
|
71 | + ;; No warnings about fdefinition
|
|
72 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
73 | + (funcall ,error "Surrogate code #x~4,'0X is illegal for UTF32 output"
|
|
74 | + ,code))
|
|
75 | + ;; Replacement character is 2 octets
|
|
76 | + 2))
|
|
77 | + (t
|
|
78 | + 4)))) |
... | ... | @@ -62,4 +62,18 @@ Unicode replacement character.") |
62 | 62 | ,code))
|
63 | 63 | +replacement-character-code+)))
|
64 | 64 | (t
|
65 | - (out ,code)))))) |
|
65 | + (out ,code)))))
|
|
66 | + ()
|
|
67 | + ()
|
|
68 | + (octet-count (code state error)
|
|
69 | + `(cond ((lisp::surrogatep ,code)
|
|
70 | + (if ,error
|
|
71 | + (locally
|
|
72 | + ;; No warnings about fdefinition
|
|
73 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
74 | + (funcall ,error "Surrogate code #x~4,'0X is illegal for UTF32 output"
|
|
75 | + ,code))
|
|
76 | + ;; Replacement character is 2 octets
|
|
77 | + 2))
|
|
78 | + (t
|
|
79 | + 4)))) |
... | ... | @@ -114,4 +114,20 @@ Unicode replacement character.") |
114 | 114 | nil
|
115 | 115 | (copy-state (state)
|
116 | 116 | ;; The state is either NIL or T, so we can just return that.
|
117 | - `(progn ,state))) |
|
117 | + `(progn ,state))
|
|
118 | + (octet-count (code state error)
|
|
119 | + `(let ((bom-count 0))
|
|
120 | + (unless ,state
|
|
121 | + (setf bom-count 4)
|
|
122 | + (setf ,state t))
|
|
123 | + (cond ((lisp::surrogatep ,code)
|
|
124 | + (if ,error
|
|
125 | + (locally
|
|
126 | + ;; No warnings about fdefinition
|
|
127 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
128 | + (funcall ,error "Surrogate code #x~4,'0X is illegal for UTF32 output"
|
|
129 | + ,code))
|
|
130 | + ;; Replacement character is 2 octets
|
|
131 | + (+ 2 bom-count)))
|
|
132 | + (t
|
|
133 | + (+ 4 bom-count)))))) |
... | ... | @@ -127,4 +127,14 @@ replacement character.") |
127 | 127 | ((< ,code #x800) (utf8 ,code 1))
|
128 | 128 | ((< ,code #x10000) (utf8 ,code 2))
|
129 | 129 | ((< ,code #x110000) (utf8 ,code 3))
|
130 | - (t (error "How did this happen? Codepoint U+~X is illegal" ,code)))))) |
|
130 | + (t (error "How did this happen? Codepoint U+~X is illegal" ,code)))))
|
|
131 | + ()
|
|
132 | + ()
|
|
133 | + (octet-count (code state error)
|
|
134 | + `(locally
|
|
135 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
136 | + (cond ((< ,code #x80) 1)
|
|
137 | + ((< ,code #x800) 2)
|
|
138 | + ((< ,code #x10000) 3)
|
|
139 | + ((< ,code #x110000) 4)
|
|
140 | + (t (error "How did this happen? Codepoint U+~X is illegal" ,code)))))) |
1 | +;;; Tests for external formats
|
|
2 | + |
|
3 | +(defpackage :external-formats-tests
|
|
4 | + (:use :cl :lisp-unit))
|
|
5 | + |
|
6 | +(in-package "EXTERNAL-FORMATS-TESTS")
|
|
7 | + |
|
8 | +(defparameter *test-iso8859-1*
|
|
9 | + (let ((rs (kernel::make-random-object :state (kernel::init-random-state 27182828))))
|
|
10 | + (lisp::codepoints-string
|
|
11 | + (loop for k from 0 below 1000
|
|
12 | + collect (random 256 rs))))
|
|
13 | + "Random test string with ISO8859-1 characters")
|
|
14 | + |
|
15 | +(defparameter *test-unicode*
|
|
16 | + (let ((rs (kernel::make-random-object :state (kernel::init-random-state 27182828))))
|
|
17 | + (lisp::codepoints-string
|
|
18 | + (loop for k from 0 below 1000
|
|
19 | + collect (random 20000 rs))))
|
|
20 | + "Random test string with codepoints below 20000")
|
|
21 | + |
|
22 | + |
|
23 | + |
|
24 | +(defmacro test-octet-count (string format)
|
|
25 | + "Test that STRING-OCTET-COUNT returns the correct number of octets"
|
|
26 | + ;; We expect STRING-OCTET-COUNT returns the same number of octets
|
|
27 | + ;; that are produced by STRING-TO-OCTETS.
|
|
28 | + `(multiple-value-bind (octets count converted)
|
|
29 | + (stream:string-to-octets ,string :external-format ,format)
|
|
30 | + ;; While we're at it, make sure that the length of the octet
|
|
31 | + ;; buffer matches returned count. And make sure we converted all
|
|
32 | + ;; the characters in the string.
|
|
33 | + (assert-equal (length octets) count)
|
|
34 | + (assert-equal (length ,string) converted)
|
|
35 | + ;; Finally, make sure that STRING-OCTET-COUNT returns the same
|
|
36 | + ;; number of octets from STRING-TO-OCTETS.
|
|
37 | + (assert-equal (length octets)
|
|
38 | + (stream::string-octet-count ,string :external-format ,format))))
|
|
39 | + |
|
40 | +(define-test octet-count.iso8859-1
|
|
41 | + (:tag :octet-count)
|
|
42 | + (test-octet-count *test-iso8859-1* :iso8859-1))
|
|
43 | + |
|
44 | +(define-test octet-count.ascii
|
|
45 | + (:tag :octet-count)
|
|
46 | + (test-octet-count *test-iso8859-1* :ascii))
|
|
47 | + |
|
48 | +(define-test octet-count.ascii.error
|
|
49 | + (:tag :octet-count)
|
|
50 | + (assert-error 'simple-error
|
|
51 | + (stream::string-octet-count *test-iso8859-1*
|
|
52 | + :external-format :ascii
|
|
53 | + :error 'error)))
|
|
54 | + |
|
55 | +(define-test octet-count.utf-8
|
|
56 | + (:tag :octet-count)
|
|
57 | + (test-octet-count *test-unicode* :utf-8))
|
|
58 | + |
|
59 | +(define-test octet-count.utf-16
|
|
60 | + (:tag :octet-count)
|
|
61 | + (test-octet-count *test-unicode* :utf-16))
|
|
62 | + |
|
63 | +(define-test octet-count.utf-16-be
|
|
64 | + (:tag :octet-count)
|
|
65 | + (test-octet-count *test-unicode* :utf-16-be))
|
|
66 | + |
|
67 | +(define-test octet-count.utf-16-le
|
|
68 | + (:tag :octet-count)
|
|
69 | + (test-octet-count *test-unicode* :utf-16-le))
|
|
70 | + |
|
71 | +(define-test octet-count.utf-32
|
|
72 | + (:tag :octet-count)
|
|
73 | + (test-octet-count *test-unicode* :utf-32))
|
|
74 | + |
|
75 | +(define-test octet-count.utf-32-le
|
|
76 | + (:tag :octet-count)
|
|
77 | + (test-octet-count *test-unicode* :utf-32-le))
|
|
78 | + |
|
79 | +(define-test octet-count.utf-32-le
|
|
80 | + (:tag :octet-count)
|
|
81 | + (test-octet-count *test-unicode* :utf-32-le))
|
|
82 | + |
|
83 | +(define-test octet-count.euc-kr
|
|
84 | + (:tag :octet-count)
|
|
85 | + (test-octet-count *test-unicode* :euc-kr))
|
|
86 | + |
|
87 | +(define-test octet-count.iso8859-2
|
|
88 | + (:tag :octet-count)
|
|
89 | + (test-octet-count *test-iso8859-1* :iso8859-2))
|
|
90 | + |
|
91 | +(define-test octet-count.iso8859-3
|
|
92 | + (:tag :octet-count)
|
|
93 | + (test-octet-count *test-iso8859-1* :iso8859-3))
|
|
94 | + |
|
95 | +(define-test octet-count.iso8859-4
|
|
96 | + (:tag :octet-count)
|
|
97 | + (test-octet-count *test-iso8859-1* :iso8859-4))
|
|
98 | + |
|
99 | +(define-test octet-count.iso8859-5
|
|
100 | + (:tag :octet-count)
|
|
101 | + (test-octet-count *test-iso8859-1* :iso8859-5))
|
|
102 | + |
|
103 | +(define-test octet-count.iso8859-6
|
|
104 | + (:tag :octet-count)
|
|
105 | + (test-octet-count *test-iso8859-1* :iso8859-6))
|
|
106 | + |
|
107 | +(define-test octet-count.iso8859-7
|
|
108 | + (:tag :octet-count)
|
|
109 | + (test-octet-count *test-iso8859-1* :iso8859-7))
|
|
110 | + |
|
111 | +(define-test octet-count.iso8859-8
|
|
112 | + (:tag :octet-count)
|
|
113 | + (test-octet-count *test-iso8859-1* :iso8859-8))
|
|
114 | + |
|
115 | +(define-test octet-count.iso8859-10
|
|
116 | + (:tag :octet-count)
|
|
117 | + (test-octet-count *test-iso8859-1* :iso8859-10))
|
|
118 | + |
|
119 | +(define-test octet-count.iso8859-13
|
|
120 | + (:tag :octet-count)
|
|
121 | + (test-octet-count *test-iso8859-1* :iso8859-13))
|
|
122 | + |
|
123 | +(define-test octet-count.iso8859-14
|
|
124 | + (:tag :octet-count)
|
|
125 | + (test-octet-count *test-iso8859-1* :iso8859-14))
|
|
126 | + |
|
127 | +(define-test octet-count.iso8859-15
|
|
128 | + (:tag :octet-count)
|
|
129 | + (test-octet-count *test-iso8859-1* :iso8859-15))
|
|
130 | + |
|
131 | +(define-test octet-count.mac-roman
|
|
132 | + (:tag :octet-count)
|
|
133 | + (test-octet-count *test-iso8859-1* :mac-roman))
|
|
134 | +
|
|
135 | + |