Raymond Toy pushed to branch master at cmucl / cmucl
Commits:
-
fd391099
by Raymond Toy at 2025-01-31T14:39:40+00:00
-
12e15d8b
by Raymond Toy at 2025-01-31T14:39:40+00:00
18 changed files:
- .gitlab-ci.yml
- + src/bootfiles/21e/boot-2024-08.lisp
- src/code/exports.lisp
- src/code/extfmts.lisp
- src/i18n/locale/cmucl.pot
- src/pcl/simple-streams/external-formats/ascii.lisp
- src/pcl/simple-streams/external-formats/euc-kr.lisp
- src/pcl/simple-streams/external-formats/iso8859-1.lisp
- src/pcl/simple-streams/external-formats/iso8859-2.lisp
- src/pcl/simple-streams/external-formats/mac-roman.lisp
- src/pcl/simple-streams/external-formats/utf-16-be.lisp
- src/pcl/simple-streams/external-formats/utf-16-le.lisp
- src/pcl/simple-streams/external-formats/utf-16.lisp
- src/pcl/simple-streams/external-formats/utf-32-be.lisp
- src/pcl/simple-streams/external-formats/utf-32-le.lisp
- src/pcl/simple-streams/external-formats/utf-32.lisp
- src/pcl/simple-streams/external-formats/utf-8.lisp
- + tests/external-formats.lisp
Changes:
| 1 | 1 | variables:
|
| 2 | 2 | download_url: "https://common-lisp.net/project/cmucl/downloads/snapshots/2024/08"
|
| 3 | 3 | version: "2024-08-x86"
|
| 4 | - bootstrap: ""
|
|
| 4 | + bootstrap: "-B boot-2024-08"
|
|
| 5 | 5 | |
| 6 | 6 | |
| 7 | 7 | stages:
|
| 1 | +;; Bootstrap file for the ef-octet-count changes. Just need to change
|
|
| 2 | +;; the value of +ef-max+
|
|
| 3 | + |
|
| 4 | +(in-package "STREAM")
|
|
| 5 | + |
|
| 6 | +(handler-bind
|
|
| 7 | + ((error (lambda (c)
|
|
| 8 | + (declare (ignore c))
|
|
| 9 | + (invoke-restart 'continue))))
|
|
| 10 | + (defconstant +ef-max+ 14)) |
| ... | ... | @@ -1456,7 +1456,8 @@ |
| 1456 | 1456 | (name
|
| 1457 | 1457 | '("STRING-TO-OCTETS" "OCTETS-TO-STRING" "*DEFAULT-EXTERNAL-FORMAT*"
|
| 1458 | 1458 | "STRING-ENCODE" "STRING-DECODE" "SET-SYSTEM-EXTERNAL-FORMAT"
|
| 1459 | - "LIST-ALL-EXTERNAL-FORMATS" "DESCRIBE-EXTERNAL-FORMAT"))
|
|
| 1459 | + "LIST-ALL-EXTERNAL-FORMATS" "DESCRIBE-EXTERNAL-FORMAT"
|
|
| 1460 | + "STRING-OCTET-COUNT"))
|
|
| 1460 | 1461 | (intern name "STREAM"))
|
| 1461 | 1462 | |
| 1462 | 1463 | (defpackage "EXTENSIONS"
|
| ... | ... | @@ -19,7 +19,8 @@ |
| 19 | 19 | string-encode string-decode set-system-external-format
|
| 20 | 20 | +replacement-character-code+
|
| 21 | 21 | list-all-external-formats
|
| 22 | - describe-external-format))
|
|
| 22 | + describe-external-format
|
|
| 23 | + string-octet-count))
|
|
| 23 | 24 | |
| 24 | 25 | (defvar *default-external-format*
|
| 25 | 26 | :utf-8
|
| ... | ... | @@ -52,6 +53,7 @@ |
| 52 | 53 | flush ; flush state
|
| 53 | 54 | copy-state ; copy state
|
| 54 | 55 | osc ; octets to string, counted
|
| 56 | + oc ; number of octets to encode string
|
|
| 55 | 57 | max)
|
| 56 | 58 | |
| 57 | 59 | ;; Unicode replacement character U+FFFD
|
| ... | ... | @@ -96,6 +98,11 @@ |
| 96 | 98 | (copy-state nil :type (or null function) :read-only t)
|
| 97 | 99 | (cache nil :type (or null simple-vector))
|
| 98 | 100 | ;;
|
| 101 | + ;; Function to count the number of octets needed to encode a
|
|
| 102 | + ;; codepoint. Basically like code-to-octets, except we return the
|
|
| 103 | + ;; number of octets needed instead of the octets themselves.
|
|
| 104 | + (octet-count #'%efni :type (or null function) :read-only t)
|
|
| 105 | + ;;
|
|
| 99 | 106 | ;; Minimum number of octets needed to form a codepoint
|
| 100 | 107 | (min 1 :type kernel:index :read-only t)
|
| 101 | 108 | ;;
|
| ... | ... | @@ -126,7 +133,8 @@ |
| 126 | 133 | (setf (gethash (ef-name ef) *external-formats*) ef))
|
| 127 | 134 | |
| 128 | 135 | (declaim (inline ef-octets-to-code ef-code-to-octets ef-flush-state ef-copy-state
|
| 129 | - ef-cache ef-min-octets ef-max-octets))
|
|
| 136 | + ef-cache ef-min-octets ef-max-octets
|
|
| 137 | + ef-octet-count))
|
|
| 130 | 138 | |
| 131 | 139 | (defun ef-octets-to-code (ef)
|
| 132 | 140 | (efx-octets-to-code (ef-efx ef)))
|
| ... | ... | @@ -143,6 +151,9 @@ |
| 143 | 151 | (defun ef-cache (ef)
|
| 144 | 152 | (efx-cache (ef-efx ef)))
|
| 145 | 153 | |
| 154 | +(defun ef-octet-count (ef)
|
|
| 155 | + (efx-octet-count (ef-efx ef)))
|
|
| 156 | + |
|
| 146 | 157 | (defun ef-min-octets (ef)
|
| 147 | 158 | (efx-min (ef-efx ef)))
|
| 148 | 159 | |
| ... | ... | @@ -166,7 +177,7 @@ |
| 166 | 177 | ;;; DEFINE-EXTERNAL-FORMAT -- Public
|
| 167 | 178 | ;;;
|
| 168 | 179 | ;;; name (&key base min max size documentation) (&rest slots) octets-to-code
|
| 169 | -;;; code-to-octets flush-state copy-state
|
|
| 180 | +;;; code-to-octets flush-state copy-state octet-count
|
|
| 170 | 181 | ;;;
|
| 171 | 182 | ;;; Define a new external format. If base is specified, then an
|
| 172 | 183 | ;;; external format is defined that is based on a previously defined
|
| ... | ... | @@ -228,6 +239,15 @@ |
| 228 | 239 | ;;; This should probably be a deep copy so that if the original
|
| 229 | 240 | ;;; state is modified, the copy is not.
|
| 230 | 241 | ;;;
|
| 242 | +;;; octet-count (code state error &rest vars)
|
|
| 243 | +;;; Defines a form to determine the number of octets needed to
|
|
| 244 | +;;; encode the given CODE using the external format. This is
|
|
| 245 | +;;; essentially the same as CODE-TO-OCTETS, except the encoding is
|
|
| 246 | +;;; not saved anywhere. ERROR is the same as in CODE-TO-OCTETS.
|
|
| 247 | +;;;
|
|
| 248 | +;;; This should return one value: the number of octets needed to
|
|
| 249 | +;;; encode the given code.
|
|
| 250 | +;;;
|
|
| 231 | 251 | ;;; Note: external-formats work on code-points, not
|
| 232 | 252 | ;;; characters, so that the entire 31 bit ISO-10646 range can be
|
| 233 | 253 | ;;; used internally regardless of the size of a character recognized
|
| ... | ... | @@ -238,7 +258,7 @@ |
| 238 | 258 | (defmacro define-external-format (name (&key base min max size (documentation ""))
|
| 239 | 259 | (&rest slots)
|
| 240 | 260 | &optional octets-to-code code-to-octets
|
| 241 | - flush-state copy-state)
|
|
| 261 | + flush-state copy-state octet-count)
|
|
| 242 | 262 | (let* ((tmp (gensym))
|
| 243 | 263 | (min (or min size 1))
|
| 244 | 264 | (max (or max size 6))
|
| ... | ... | @@ -282,7 +302,17 @@ |
| 282 | 302 | (declare (ignorable ,state))
|
| 283 | 303 | (let (,@',slotb
|
| 284 | 304 | ,@(loop for var in vars collect `(,var (gensym))))
|
| 285 | - ,body))))
|
|
| 305 | + ,body)))
|
|
| 306 | + (octet-count ((code state error &rest vars) body)
|
|
| 307 | + `(lambda (,',tmp ,state ,error)
|
|
| 308 | + (declare (ignorable ,state ,error)
|
|
| 309 | + (optimize (ext:inhibit-warnings 3)))
|
|
| 310 | + (let (,@',slotb
|
|
| 311 | + (,code ',code)
|
|
| 312 | + ,@(loop for var in vars collect `(,var (gensym))))
|
|
| 313 | + `(let ((,',code (the lisp:codepoint ,,',tmp)))
|
|
| 314 | + (declare (ignorable ,',code))
|
|
| 315 | + ,,body)))))
|
|
| 286 | 316 | (%intern-ef (make-external-format ,name
|
| 287 | 317 | ,(if base
|
| 288 | 318 | `(ef-efx (find-external-format ,(ef-name base)))
|
| ... | ... | @@ -291,7 +321,8 @@ |
| 291 | 321 | :flush-state ,flush-state
|
| 292 | 322 | :copy-state ,copy-state
|
| 293 | 323 | :cache (make-array +ef-max+
|
| 294 | - :initial-element nil)
|
|
| 324 | + :initial-element nil)
|
|
| 325 | + :octet-count ,octet-count
|
|
| 295 | 326 | :min ,(min min max)
|
| 296 | 327 | :max ,(max min max)))
|
| 297 | 328 | nil
|
| ... | ... | @@ -688,7 +719,20 @@ character and illegal outputs are replaced by a question mark.") |
| 688 | 719 | (intl:gettext "Cannot output codepoint #x~X to ISO8859-1 stream")
|
| 689 | 720 | ,code 1))
|
| 690 | 721 | #x3F)
|
| 691 | - ,code))))
|
|
| 722 | + ,code)))
|
|
| 723 | + ()
|
|
| 724 | + ()
|
|
| 725 | + (octet-count (code state error)
|
|
| 726 | + `(if (> ,code 255)
|
|
| 727 | + (if ,error
|
|
| 728 | + (locally
|
|
| 729 | + ;; No warnings about fdefinition
|
|
| 730 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
| 731 | + (funcall ,error
|
|
| 732 | + (intl:gettext "Cannot output codepoint #x~X to ISO8859-1 stream")
|
|
| 733 | + ,code 1))
|
|
| 734 | + 1)
|
|
| 735 | + 1)))
|
|
| 692 | 736 | |
| 693 | 737 | ;;; OCTETS-TO-CODEPOINT, CODEPOINT-TO-OCTETS -- Semi-Public
|
| 694 | 738 | ;;;
|
| ... | ... | @@ -709,6 +753,10 @@ character and illegal outputs are replaced by a question mark.") |
| 709 | 753 | (let ((ef (find-external-format external-format)))
|
| 710 | 754 | (funcall (ef-code-to-octets ef) code state output error)))
|
| 711 | 755 | |
| 756 | +(defmacro count-codepoint-octets (external-format code state &optional error)
|
|
| 757 | + (let ((ef (find-external-format external-format)))
|
|
| 758 | + (funcall (ef-octet-count ef) code state error)))
|
|
| 759 | + |
|
| 712 | 760 | |
| 713 | 761 | |
| 714 | 762 | (defvar *ef-base* +ef-max+)
|
| ... | ... | @@ -878,6 +926,42 @@ character and illegal outputs are replaced by a question mark.") |
| 878 | 926 | (when f
|
| 879 | 927 | (funcall f state))))
|
| 880 | 928 | |
| 929 | +(defmacro octet-count (external-format char state &optional error)
|
|
| 930 | + (let ((nchar (gensym))
|
|
| 931 | + (nstate (gensym))
|
|
| 932 | + (count-it (gensym))
|
|
| 933 | + (ch (gensym)))
|
|
| 934 | + `(let ((,nchar ,char)
|
|
| 935 | + (,nstate ,state))
|
|
| 936 | + (when (null ,nstate) (setq ,nstate (setf ,state (cons nil nil))))
|
|
| 937 | + (if (lisp::surrogatep (char-code ,nchar) :high)
|
|
| 938 | + (setf (car ,nstate) ,nchar)
|
|
| 939 | + (flet ((,count-it (,ch)
|
|
| 940 | + (count-codepoint-octets ,external-format ,ch (cdr ,nstate) ,error)))
|
|
| 941 | + (if (car ,nstate)
|
|
| 942 | + (prog1
|
|
| 943 | + (,count-it (if (lisp::surrogatep (char-code ,nchar) :low)
|
|
| 944 | + (surrogates-to-codepoint (car ,nstate) ,nchar)
|
|
| 945 | + (if ,error
|
|
| 946 | + (locally
|
|
| 947 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
| 948 | + (funcall ,error
|
|
| 949 | + (intl:gettext "Cannot convert invalide surrogate #~x~X to character")
|
|
| 950 | + ,nchar))
|
|
| 951 | + +replacement-character-code+)))
|
|
| 952 | + (setf (car ,nstate) nil))
|
|
| 953 | + ;; A lone trailing (low surrogate gets replaced with
|
|
| 954 | + ;; the replacement character.
|
|
| 955 | + (,count-it (if (lisp::surrogatep (char-code ,nchar) :low)
|
|
| 956 | + (if ,error
|
|
| 957 | + (locally
|
|
| 958 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
| 959 | + (funcall ,error
|
|
| 960 | + (intl:gettext "Cannot convert lone trailing surrogate #x~X to character")
|
|
| 961 | + ,nchar))
|
|
| 962 | + +replacement-character-code+)
|
|
| 963 | + (char-code ,nchar)))))))))
|
|
| 964 | + |
|
| 881 | 965 | (def-ef-macro ef-string-to-octets (extfmt lisp::lisp +ef-max+ +ef-so+)
|
| 882 | 966 | `(lambda (string start end buffer buffer-start buffer-end error bufferp
|
| 883 | 967 | &aux (ptr buffer-start) (state nil) (last-octet buffer-start))
|
| ... | ... | @@ -1071,6 +1155,31 @@ character and illegal outputs are replaced by a question mark.") |
| 1071 | 1155 | (values (if stringp string (lisp::shrink-vector string pos)) (- pos s-start) last-octet new-state))))
|
| 1072 | 1156 | |
| 1073 | 1157 | |
| 1158 | +(def-ef-macro ef-string-octet-count (extfmt lisp::lisp +ef-max+ +ef-oc+)
|
|
| 1159 | + `(lambda (string start end error &aux (total 0) (state nil))
|
|
| 1160 | + (dotimes (i (- end start) total)
|
|
| 1161 | + (incf total
|
|
| 1162 | + (octet-count ,extfmt (schar string (+ start i)) state error)))))
|
|
| 1163 | + |
|
| 1164 | +(defun string-octet-count (string &key (start 0) end (external-format :default) error)
|
|
| 1165 | + "Compute the number of octets needed to convert String using the
|
|
| 1166 | + specified External-format. The string is bound by Start (defaulting
|
|
| 1167 | + to 0) and End (defaulting to the end of the string)."
|
|
| 1168 | + (let ((composing-format-p
|
|
| 1169 | + ;; Determine is the external format is a composing format
|
|
| 1170 | + ;; which we determine by seeing that the name of the format
|
|
| 1171 | + ;; is a cons. Probably not the best way.
|
|
| 1172 | + (consp (ef-name (find-external-format external-format)))))
|
|
| 1173 | + ;; We currently don't know how to get just the number of octets
|
|
| 1174 | + ;; when a composing external format is used. As a workaround, use
|
|
| 1175 | + ;; STRING-TO-OCTETS to find the number of octets.
|
|
| 1176 | + (if composing-format-p
|
|
| 1177 | + (nth-value 1
|
|
| 1178 | + (string-to-octets string :start start :end end
|
|
| 1179 | + :external-format external-format))
|
|
| 1180 | + (lisp::with-array-data ((string string) (start start) (end end))
|
|
| 1181 | + (funcall (ef-string-octet-count external-format)
|
|
| 1182 | + string start end error)))))
|
|
| 1074 | 1183 | |
| 1075 | 1184 | (def-ef-macro ef-encode (extfmt lisp::lisp +ef-max+ +ef-en+)
|
| 1076 | 1185 | `(lambda (string start end result error &aux (ptr 0) (state nil))
|
| ... | ... | @@ -1186,10 +1295,11 @@ character and illegal outputs are replaced by a question mark.") |
| 1186 | 1295 | (#.+ef-so+ (%ef-string-to-octets ef))
|
| 1187 | 1296 | (#.+ef-en+ (%ef-encode ef))
|
| 1188 | 1297 | (#.+ef-de+ (%ef-decode ef))
|
| 1189 | - (#.+ef-osc+ (%ef-octets-to-string-counted ef))))
|
|
| 1298 | + (#.+ef-osc+ (%ef-octets-to-string-counted ef))
|
|
| 1299 | + (#.+ef-oc+ (%ef-octet-count ef))))
|
|
| 1190 | 1300 | `(setf (aref (ef-cache (find-external-format ,(ef-name ef))) ,slot)
|
| 1191 | 1301 | ,(subst (ef-name ef) ef
|
| 1192 | - (function-lambda-expression (aref (ef-cache ef) slot))))))
|
|
| 1302 | + (function-lambda-expression (aref (ef-cache ef) slot)))))))
|
|
| 1193 | 1303 | |
| 1194 | 1304 | ;;; Builtin external formats.
|
| 1195 | 1305 | |
| ... | ... | @@ -1307,7 +1417,17 @@ replacement character.") |
| 1307 | 1417 | ((< ,code #x800) (utf8 ,code 1))
|
| 1308 | 1418 | ((< ,code #x10000) (utf8 ,code 2))
|
| 1309 | 1419 | ((< ,code #x110000) (utf8 ,code 3))
|
| 1310 | - (t (error "How did this happen? Codepoint U+~X is illegal" ,code))))))
|
|
| 1420 | + (t (error "How did this happen? Codepoint U+~X is illegal" ,code)))))
|
|
| 1421 | + ()
|
|
| 1422 | + ()
|
|
| 1423 | + (octet-count (code state error)
|
|
| 1424 | + `(locally
|
|
| 1425 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
| 1426 | + (cond ((< ,code #x80) 1)
|
|
| 1427 | + ((< ,code #x800) 2)
|
|
| 1428 | + ((< ,code #x10000) 3)
|
|
| 1429 | + ((< ,code #x110000) 4)
|
|
| 1430 | + (t (error "How did this happen? Codepoint U+~X is illegal" ,code))))))
|
|
| 1311 | 1431 | |
| 1312 | 1432 | (define-external-format :ascii (:size 1 :documentation
|
| 1313 | 1433 | "US ASCII 7-bit encoding. Illegal input sequences are replaced with
|
| ... | ... | @@ -1333,4 +1453,14 @@ replaced with a question mark.") |
| 1333 | 1453 | (declare (optimize (ext:inhibit-warnings 3)))
|
| 1334 | 1454 | (funcall ,error "Cannot output codepoint #x~X to ASCII stream" ,code))
|
| 1335 | 1455 | #x3F)
|
| 1336 | - ,code)))) |
|
| 1456 | + ,code)))
|
|
| 1457 | + ()
|
|
| 1458 | + ()
|
|
| 1459 | + (octet-count (code state error)
|
|
| 1460 | + `(if (> ,code #x7f)
|
|
| 1461 | + (if ,error
|
|
| 1462 | + (locally
|
|
| 1463 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
| 1464 | + (funcall ,error "Cannot output codepoint #x~X to ASCII stream" ,code))
|
|
| 1465 | + 1)
|
|
| 1466 | + 1))) |
| ... | ... | @@ -9300,6 +9300,13 @@ msgid "" |
| 9300 | 9300 | " external format."
|
| 9301 | 9301 | msgstr ""
|
| 9302 | 9302 | |
| 9303 | +#: src/code/extfmts.lisp
|
|
| 9304 | +msgid ""
|
|
| 9305 | +"Compute the number of octets needed to convert String using the\n"
|
|
| 9306 | +" specified External-format. The string is bound by Start (defaulting\n"
|
|
| 9307 | +" to 0) and End (defaulting to the end of the string)."
|
|
| 9308 | +msgstr ""
|
|
| 9309 | + |
|
| 9303 | 9310 | #: src/code/extfmts.lisp
|
| 9304 | 9311 | msgid ""
|
| 9305 | 9312 | "Encode the given String using External-Format and return a new\n"
|
| ... | ... | @@ -33,5 +33,15 @@ replaced with a question mark.") |
| 33 | 33 | (declare (optimize (ext:inhibit-warnings 3)))
|
| 34 | 34 | (funcall ,error "Cannot output codepoint #x~X to ASCII stream" ,code))
|
| 35 | 35 | #x3F)
|
| 36 | - ,code))))
|
|
| 36 | + ,code)))
|
|
| 37 | + ()
|
|
| 38 | + ()
|
|
| 39 | + (octet-count (code state error)
|
|
| 40 | + `(if (> ,code #x7f)
|
|
| 41 | + (if ,error
|
|
| 42 | + (locally
|
|
| 43 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
| 44 | + (funcall ,error "Cannot output codepoint #x~X to ASCII stream" ,code))
|
|
| 45 | + 1)
|
|
| 46 | + 1)))
|
|
| 37 | 47 |
| ... | ... | @@ -1007,4 +1007,16 @@ character and illegal outputs are replaced by a question mark.") |
| 1007 | 1007 | (t
|
| 1008 | 1008 | (if ,error
|
| 1009 | 1009 | (funcall ,error "Cannot output codepoint #x~X to EUC-KR format." ,code)
|
| 1010 | - (,output #X3f)))))))) |
|
| 1010 | + (,output #X3f)))))))
|
|
| 1011 | + ()
|
|
| 1012 | + ()
|
|
| 1013 | + (octet-count (code state error present)
|
|
| 1014 | + `(if (<= ,code #x7f)
|
|
| 1015 | + 1
|
|
| 1016 | + (let ((,present (get-inverse ,itable ,code)))
|
|
| 1017 | + (cond (,present
|
|
| 1018 | + 2)
|
|
| 1019 | + (t
|
|
| 1020 | + (if ,error
|
|
| 1021 | + (funcall ,error "Cannot output codepoint #x~X to EUC-KR format." ,code)
|
|
| 1022 | + 1))))))) |
| ... | ... | @@ -31,4 +31,17 @@ character and illegal outputs are replaced by a question mark.") |
| 31 | 31 | (funcall ,error "Cannot output codepoint #x~X to ISO8859-1 stream"
|
| 32 | 32 | ,code 1))
|
| 33 | 33 | #x3F)
|
| 34 | - ,code)))) |
|
| 34 | + ,code)))
|
|
| 35 | + ()
|
|
| 36 | + ()
|
|
| 37 | + (octet-count (code state error)
|
|
| 38 | + `(if (> ,code 255)
|
|
| 39 | + (if ,error
|
|
| 40 | + (locally
|
|
| 41 | + ;; No warnings about fdefinition
|
|
| 42 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
| 43 | + (funcall ,error
|
|
| 44 | + (intl:gettext "Cannot output codepoint #x~X to ISO8859-1 stream")
|
|
| 45 | + ,code 1))
|
|
| 46 | + 1)
|
|
| 47 | + 1))) |
| ... | ... | @@ -47,4 +47,19 @@ character and illegal outputs are replaced by a question mark.") |
| 47 | 47 | (declare (optimize (ext:inhibit-warnings 3)))
|
| 48 | 48 | (funcall ,error "Cannot output codepoint #x~X to ISO8859-2 stream"
|
| 49 | 49 | ,code))
|
| 50 | - #x3F))))))) |
|
| 50 | + #x3F))))))
|
|
| 51 | + ()
|
|
| 52 | + ()
|
|
| 53 | + (octet-count (code state error present)
|
|
| 54 | + `(if (< ,code 160)
|
|
| 55 | + 1
|
|
| 56 | + (let ((,present (get-inverse ,itable ,code)))
|
|
| 57 | + (if ,present
|
|
| 58 | + 1
|
|
| 59 | + (if ,error
|
|
| 60 | + (locally
|
|
| 61 | + ;; No warnings about fdefinition
|
|
| 62 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
| 63 | + (funcall ,error "Cannot output codepoint #x~X to ISO8859-2 stream"
|
|
| 64 | + ,code))
|
|
| 65 | + 1)))))) |
| ... | ... | @@ -49,4 +49,19 @@ character and illegal outputs are replaced by a question mark.") |
| 49 | 49 | (declare (optimize (ext:inhibit-warnings 3)))
|
| 50 | 50 | (funcall ,error "Cannot output codepoint #x~X to MAC-ROMAN stream"
|
| 51 | 51 | ,code))
|
| 52 | - #x3F))))))) |
|
| 52 | + #x3F))))))
|
|
| 53 | + ()
|
|
| 54 | + ()
|
|
| 55 | + (octet-count (code state error present)
|
|
| 56 | + `(if (< ,code 128)
|
|
| 57 | + 1
|
|
| 58 | + (let ((,present (get-inverse ,itable ,code)))
|
|
| 59 | + (if ,present
|
|
| 60 | + 1
|
|
| 61 | + (if ,error
|
|
| 62 | + (locally
|
|
| 63 | + ;; No warnings about fdefinition
|
|
| 64 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
| 65 | + (funcall ,error "Cannot output codepoint #x~X to MAC-ROMAN stream"
|
|
| 66 | + ,code))
|
|
| 67 | + 1)))))) |
| ... | ... | @@ -110,4 +110,12 @@ Unicode replacement character.") |
| 110 | 110 | (copy-state (state)
|
| 111 | 111 | ;; The state is either NIL or a codepoint, so nothing really
|
| 112 | 112 | ;; special is needed to copy it.
|
| 113 | - `(progn ,state))) |
|
| 113 | + `(progn ,state))
|
|
| 114 | + (octet-count (code state error)
|
|
| 115 | + `(cond ((< ,code #x10000)
|
|
| 116 | + 2)
|
|
| 117 | + ((< ,code #x110000)
|
|
| 118 | + 4)
|
|
| 119 | + (t
|
|
| 120 | + ;; Replacement character is 2 octets
|
|
| 121 | + 2)))) |
| ... | ... | @@ -111,4 +111,12 @@ Unicode replacement character.") |
| 111 | 111 | (copy-state (state)
|
| 112 | 112 | ;; The state is either NIL or a codepoint, so nothing really
|
| 113 | 113 | ;; special is needed.
|
| 114 | - `(progn ,state))) |
|
| 114 | + `(progn ,state))
|
|
| 115 | + (octet-count (code state error)
|
|
| 116 | + `(cond ((< ,code #x10000)
|
|
| 117 | + 2)
|
|
| 118 | + ((< ,code #x110000)
|
|
| 119 | + 4)
|
|
| 120 | + (t
|
|
| 121 | + ;; Replacement character is 2 octets
|
|
| 122 | + 2)))) |
| ... | ... | @@ -156,4 +156,18 @@ Unicode replacement character.") |
| 156 | 156 | ,c))))))
|
| 157 | 157 | (copy-state (state)
|
| 158 | 158 | ;; The state is list. Copy it
|
| 159 | - `(copy-list ,state))) |
|
| 159 | + `(copy-list ,state))
|
|
| 160 | + (octet-count (code state error)
|
|
| 161 | + `(let ((bom-count 0))
|
|
| 162 | + (unless ,state
|
|
| 163 | + ;; Output BOM
|
|
| 164 | + (setf bom-count 2)
|
|
| 165 | + (setf ,state t))
|
|
| 166 | + (+ bom-count
|
|
| 167 | + (cond ((< ,code #x10000)
|
|
| 168 | + 2)
|
|
| 169 | + ((< ,code #x110000)
|
|
| 170 | + 4)
|
|
| 171 | + (t
|
|
| 172 | + ;; Replacement character is 2 octets
|
|
| 173 | + 2)))))) |
| ... | ... | @@ -61,4 +61,18 @@ Unicode replacement character.") |
| 61 | 61 | ,code))
|
| 62 | 62 | +replacement-character-code+)))
|
| 63 | 63 | (t
|
| 64 | - (out ,code)))))) |
|
| 64 | + (out ,code)))))
|
|
| 65 | + ()
|
|
| 66 | + ()
|
|
| 67 | + (octet-count (code state error)
|
|
| 68 | + `(cond ((lisp::surrogatep ,code)
|
|
| 69 | + (if ,error
|
|
| 70 | + (locally
|
|
| 71 | + ;; No warnings about fdefinition
|
|
| 72 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
| 73 | + (funcall ,error "Surrogate code #x~4,'0X is illegal for UTF32 output"
|
|
| 74 | + ,code))
|
|
| 75 | + ;; Replacement character is 2 octets
|
|
| 76 | + 2))
|
|
| 77 | + (t
|
|
| 78 | + 4)))) |
| ... | ... | @@ -62,4 +62,18 @@ Unicode replacement character.") |
| 62 | 62 | ,code))
|
| 63 | 63 | +replacement-character-code+)))
|
| 64 | 64 | (t
|
| 65 | - (out ,code)))))) |
|
| 65 | + (out ,code)))))
|
|
| 66 | + ()
|
|
| 67 | + ()
|
|
| 68 | + (octet-count (code state error)
|
|
| 69 | + `(cond ((lisp::surrogatep ,code)
|
|
| 70 | + (if ,error
|
|
| 71 | + (locally
|
|
| 72 | + ;; No warnings about fdefinition
|
|
| 73 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
| 74 | + (funcall ,error "Surrogate code #x~4,'0X is illegal for UTF32 output"
|
|
| 75 | + ,code))
|
|
| 76 | + ;; Replacement character is 2 octets
|
|
| 77 | + 2))
|
|
| 78 | + (t
|
|
| 79 | + 4)))) |
| ... | ... | @@ -114,4 +114,20 @@ Unicode replacement character.") |
| 114 | 114 | nil
|
| 115 | 115 | (copy-state (state)
|
| 116 | 116 | ;; The state is either NIL or T, so we can just return that.
|
| 117 | - `(progn ,state))) |
|
| 117 | + `(progn ,state))
|
|
| 118 | + (octet-count (code state error)
|
|
| 119 | + `(let ((bom-count 0))
|
|
| 120 | + (unless ,state
|
|
| 121 | + (setf bom-count 4)
|
|
| 122 | + (setf ,state t))
|
|
| 123 | + (cond ((lisp::surrogatep ,code)
|
|
| 124 | + (if ,error
|
|
| 125 | + (locally
|
|
| 126 | + ;; No warnings about fdefinition
|
|
| 127 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
| 128 | + (funcall ,error "Surrogate code #x~4,'0X is illegal for UTF32 output"
|
|
| 129 | + ,code))
|
|
| 130 | + ;; Replacement character is 2 octets
|
|
| 131 | + (+ 2 bom-count)))
|
|
| 132 | + (t
|
|
| 133 | + (+ 4 bom-count)))))) |
| ... | ... | @@ -127,4 +127,14 @@ replacement character.") |
| 127 | 127 | ((< ,code #x800) (utf8 ,code 1))
|
| 128 | 128 | ((< ,code #x10000) (utf8 ,code 2))
|
| 129 | 129 | ((< ,code #x110000) (utf8 ,code 3))
|
| 130 | - (t (error "How did this happen? Codepoint U+~X is illegal" ,code)))))) |
|
| 130 | + (t (error "How did this happen? Codepoint U+~X is illegal" ,code)))))
|
|
| 131 | + ()
|
|
| 132 | + ()
|
|
| 133 | + (octet-count (code state error)
|
|
| 134 | + `(locally
|
|
| 135 | + (declare (optimize (ext:inhibit-warnings 3)))
|
|
| 136 | + (cond ((< ,code #x80) 1)
|
|
| 137 | + ((< ,code #x800) 2)
|
|
| 138 | + ((< ,code #x10000) 3)
|
|
| 139 | + ((< ,code #x110000) 4)
|
|
| 140 | + (t (error "How did this happen? Codepoint U+~X is illegal" ,code)))))) |
| 1 | +;;; Tests for external formats
|
|
| 2 | + |
|
| 3 | +(defpackage :external-formats-tests
|
|
| 4 | + (:use :cl :lisp-unit))
|
|
| 5 | + |
|
| 6 | +(in-package "EXTERNAL-FORMATS-TESTS")
|
|
| 7 | + |
|
| 8 | +(defparameter *test-iso8859-1*
|
|
| 9 | + (let ((rs (kernel::make-random-object :state (kernel::init-random-state 27182828))))
|
|
| 10 | + (lisp::codepoints-string
|
|
| 11 | + (loop for k from 0 below 1000
|
|
| 12 | + collect (random 256 rs))))
|
|
| 13 | + "Random test string with ISO8859-1 characters")
|
|
| 14 | + |
|
| 15 | +(defparameter *test-unicode*
|
|
| 16 | + (let ((rs (kernel::make-random-object :state (kernel::init-random-state 27182828))))
|
|
| 17 | + (lisp::codepoints-string
|
|
| 18 | + (loop for k from 0 below 1000
|
|
| 19 | + collect (random 20000 rs))))
|
|
| 20 | + "Random test string with codepoints below 20000")
|
|
| 21 | + |
|
| 22 | + |
|
| 23 | + |
|
| 24 | +(defmacro test-octet-count (string format)
|
|
| 25 | + "Test that STRING-OCTET-COUNT returns the correct number of octets"
|
|
| 26 | + ;; We expect STRING-OCTET-COUNT returns the same number of octets
|
|
| 27 | + ;; that are produced by STRING-TO-OCTETS.
|
|
| 28 | + `(multiple-value-bind (octets count converted)
|
|
| 29 | + (stream:string-to-octets ,string :external-format ,format)
|
|
| 30 | + ;; While we're at it, make sure that the length of the octet
|
|
| 31 | + ;; buffer matches returned count. And make sure we converted all
|
|
| 32 | + ;; the characters in the string.
|
|
| 33 | + (assert-equal (length octets) count)
|
|
| 34 | + (assert-equal (length ,string) converted)
|
|
| 35 | + ;; Finally, make sure that STRING-OCTET-COUNT returns the same
|
|
| 36 | + ;; number of octets from STRING-TO-OCTETS.
|
|
| 37 | + (assert-equal (length octets)
|
|
| 38 | + (stream::string-octet-count ,string :external-format ,format))))
|
|
| 39 | + |
|
| 40 | +(define-test octet-count.iso8859-1
|
|
| 41 | + (:tag :octet-count)
|
|
| 42 | + (test-octet-count *test-iso8859-1* :iso8859-1))
|
|
| 43 | + |
|
| 44 | +(define-test octet-count.ascii
|
|
| 45 | + (:tag :octet-count)
|
|
| 46 | + (test-octet-count *test-iso8859-1* :ascii))
|
|
| 47 | + |
|
| 48 | +(define-test octet-count.ascii.error
|
|
| 49 | + (:tag :octet-count)
|
|
| 50 | + (assert-error 'simple-error
|
|
| 51 | + (stream::string-octet-count *test-iso8859-1*
|
|
| 52 | + :external-format :ascii
|
|
| 53 | + :error 'error)))
|
|
| 54 | + |
|
| 55 | +(define-test octet-count.utf-8
|
|
| 56 | + (:tag :octet-count)
|
|
| 57 | + (test-octet-count *test-unicode* :utf-8))
|
|
| 58 | + |
|
| 59 | +(define-test octet-count.utf-16
|
|
| 60 | + (:tag :octet-count)
|
|
| 61 | + (test-octet-count *test-unicode* :utf-16))
|
|
| 62 | + |
|
| 63 | +(define-test octet-count.utf-16-be
|
|
| 64 | + (:tag :octet-count)
|
|
| 65 | + (test-octet-count *test-unicode* :utf-16-be))
|
|
| 66 | + |
|
| 67 | +(define-test octet-count.utf-16-le
|
|
| 68 | + (:tag :octet-count)
|
|
| 69 | + (test-octet-count *test-unicode* :utf-16-le))
|
|
| 70 | + |
|
| 71 | +(define-test octet-count.utf-32
|
|
| 72 | + (:tag :octet-count)
|
|
| 73 | + (test-octet-count *test-unicode* :utf-32))
|
|
| 74 | + |
|
| 75 | +(define-test octet-count.utf-32-le
|
|
| 76 | + (:tag :octet-count)
|
|
| 77 | + (test-octet-count *test-unicode* :utf-32-le))
|
|
| 78 | + |
|
| 79 | +(define-test octet-count.utf-32-le
|
|
| 80 | + (:tag :octet-count)
|
|
| 81 | + (test-octet-count *test-unicode* :utf-32-le))
|
|
| 82 | + |
|
| 83 | +(define-test octet-count.euc-kr
|
|
| 84 | + (:tag :octet-count)
|
|
| 85 | + (test-octet-count *test-unicode* :euc-kr))
|
|
| 86 | + |
|
| 87 | +(define-test octet-count.iso8859-2
|
|
| 88 | + (:tag :octet-count)
|
|
| 89 | + (test-octet-count *test-iso8859-1* :iso8859-2))
|
|
| 90 | + |
|
| 91 | +(define-test octet-count.iso8859-3
|
|
| 92 | + (:tag :octet-count)
|
|
| 93 | + (test-octet-count *test-iso8859-1* :iso8859-3))
|
|
| 94 | + |
|
| 95 | +(define-test octet-count.iso8859-4
|
|
| 96 | + (:tag :octet-count)
|
|
| 97 | + (test-octet-count *test-iso8859-1* :iso8859-4))
|
|
| 98 | + |
|
| 99 | +(define-test octet-count.iso8859-5
|
|
| 100 | + (:tag :octet-count)
|
|
| 101 | + (test-octet-count *test-iso8859-1* :iso8859-5))
|
|
| 102 | + |
|
| 103 | +(define-test octet-count.iso8859-6
|
|
| 104 | + (:tag :octet-count)
|
|
| 105 | + (test-octet-count *test-iso8859-1* :iso8859-6))
|
|
| 106 | + |
|
| 107 | +(define-test octet-count.iso8859-7
|
|
| 108 | + (:tag :octet-count)
|
|
| 109 | + (test-octet-count *test-iso8859-1* :iso8859-7))
|
|
| 110 | + |
|
| 111 | +(define-test octet-count.iso8859-8
|
|
| 112 | + (:tag :octet-count)
|
|
| 113 | + (test-octet-count *test-iso8859-1* :iso8859-8))
|
|
| 114 | + |
|
| 115 | +(define-test octet-count.iso8859-10
|
|
| 116 | + (:tag :octet-count)
|
|
| 117 | + (test-octet-count *test-iso8859-1* :iso8859-10))
|
|
| 118 | + |
|
| 119 | +(define-test octet-count.iso8859-13
|
|
| 120 | + (:tag :octet-count)
|
|
| 121 | + (test-octet-count *test-iso8859-1* :iso8859-13))
|
|
| 122 | + |
|
| 123 | +(define-test octet-count.iso8859-14
|
|
| 124 | + (:tag :octet-count)
|
|
| 125 | + (test-octet-count *test-iso8859-1* :iso8859-14))
|
|
| 126 | + |
|
| 127 | +(define-test octet-count.iso8859-15
|
|
| 128 | + (:tag :octet-count)
|
|
| 129 | + (test-octet-count *test-iso8859-1* :iso8859-15))
|
|
| 130 | + |
|
| 131 | +(define-test octet-count.mac-roman
|
|
| 132 | + (:tag :octet-count)
|
|
| 133 | + (test-octet-count *test-iso8859-1* :mac-roman))
|
|
| 134 | +
|
|
| 135 | + |