Raymond Toy pushed to branch issue-367-count-octets-for-encoding at cmucl / cmucl
Commits:
3f19d402 by Raymond Toy at 2025-01-16T19:36:41-08:00
Update pot file for new docstrings
- - - - -
1 changed file:
- src/i18n/locale/cmucl.pot
Changes:
=====================================
src/i18n/locale/cmucl.pot
=====================================
@@ -9300,6 +9300,13 @@ msgid ""
" external format."
msgstr ""
+#: src/code/extfmts.lisp
+msgid ""
+"Compute the number of octets needed to convert String using the\n"
+" specified External-format. The string is bound by Start (defaulting\n"
+" to 0) and End (defaulting to the end of the string)."
+msgstr ""
+
#: src/code/extfmts.lisp
msgid ""
"Encode the given String using External-Format and return a new\n"
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/3f19d40275a95d5f6d69fce…
--
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/3f19d40275a95d5f6d69fce…
You're receiving this email because of your account on gitlab.common-lisp.net.
Raymond Toy pushed to branch issue-367-count-octets-for-encoding at cmucl / cmucl
Commits:
f859892f by Raymond Toy at 2025-01-15T11:52:51-08:00
Actually use random state in the generator
We created a random state to use for generating the test strings, but
we didn't actually use the state with `random`. Do so now.
- - - - -
1 changed file:
- tests/external-formats.lisp
Changes:
=====================================
tests/external-formats.lisp
=====================================
@@ -9,14 +9,14 @@
(let ((rs (kernel::make-random-object :state (kernel::init-random-state 27182828))))
(lisp::codepoints-string
(loop for k from 0 below 1000
- collect (random 256))))
+ collect (random 256 rs))))
"Random test string with ISO8859-1 characters")
(defparameter *test-unicode*
(let ((rs (kernel::make-random-object :state (kernel::init-random-state 27182828))))
(lisp::codepoints-string
(loop for k from 0 below 1000
- collect (random 20000))))
+ collect (random 20000 rs))))
"Random test string with codepoints below 20000")
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/f859892fe82927e2f8ce5a2…
--
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/f859892fe82927e2f8ce5a2…
You're receiving this email because of your account on gitlab.common-lisp.net.
Raymond Toy pushed to branch issue-367-count-octets-for-encoding at cmucl / cmucl
Commits:
afc9afb0 by Raymond Toy at 2025-01-15T09:04:00-08:00
Hack to handle composing formats in string-octet-count
I can't figure out how to make `string-octet-count` work when using
composing formats (like (:latin1 :crlf) to output CR/LF for every LF).
As a workaround, if the external format is a composing format, use
`string-to-octets` to get the number of octets. This is not great,
but allows `string-octet-count` to work with composing formats.
- - - - -
1 changed file:
- src/code/extfmts.lisp
Changes:
=====================================
src/code/extfmts.lisp
=====================================
@@ -1165,9 +1165,21 @@ character and illegal outputs are replaced by a question mark.")
"Compute the number of octets needed to convert String using the
specified External-format. The string is bound by Start (defaulting
to 0) and End (defaulting to the end of the string)."
- (lisp::with-array-data ((string string) (start start) (end end))
- (funcall (ef-string-octet-count external-format)
- string start end error)))
+ (let ((composing-format-p
+ ;; Determine if the external format is a composing format
+ ;; which we determine by seeing that the name of the format
+ ;; is a cons. Probably not the best way.
+ (consp (ef-name (find-external-format external-format)))))
+ ;; We currently don't know how to get just the number of octets
+ ;; when a composing external format is used. As a workaround, use
+ ;; STRING-TO-OCTETS to find the number of octets.
+ (if composing-format-p
+ (nth-value 1
+ (string-to-octets string :start start :end end
+ :external-format external-format))
+ (lisp::with-array-data ((string string) (start start) (end end))
+ (funcall (ef-string-octet-count external-format)
+ string start end error)))))
(def-ef-macro ef-encode (extfmt lisp::lisp +ef-max+ +ef-en+)
`(lambda (string start end result error &aux (ptr 0) (state nil))
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/afc9afb0e69ec877fafa02b…
--
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/afc9afb0e69ec877fafa02b…
You're receiving this email because of your account on gitlab.common-lisp.net.
Raymond Toy pushed to branch issue-367-count-octets-for-encoding at cmucl / cmucl
Commits:
1a5889eb by Raymond Toy at 2025-01-14T21:18:44-08:00
Rename def-ef-macro ef-octet-count to ef-string-octet-count
It's a bit confusing to use `ef-octet-count` defined by `def-ef-macro`
and also have a function named `ef-octet-count`. Thus, rename it.
While we're at it, add a docstring for `string-octet-count`.
- - - - -
1 changed file:
- src/code/extfmts.lisp
Changes:
=====================================
src/code/extfmts.lisp
=====================================
@@ -1155,15 +1155,18 @@ character and illegal outputs are replaced by a question mark.")
(values (if stringp string (lisp::shrink-vector string pos)) (- pos s-start) last-octet new-state))))
-(def-ef-macro ef-octet-count (extfmt lisp::lisp +ef-max+ +ef-oc+)
+(def-ef-macro ef-string-octet-count (extfmt lisp::lisp +ef-max+ +ef-oc+)
`(lambda (string start end error &aux (total 0) (state nil))
(dotimes (i (- end start) total)
(incf total
(octet-count ,extfmt (schar string (+ start i)) state error)))))
(defun string-octet-count (string &key (start 0) end (external-format :default) error)
+ "Compute the number of octets needed to convert String using the
+ specified External-format. The string is bound by Start (defaulting
+ to 0) and End (defaulting to the end of the string)."
(lisp::with-array-data ((string string) (start start) (end end))
- (funcall (ef-octet-count external-format)
+ (funcall (ef-string-octet-count external-format)
string start end error)))
(def-ef-macro ef-encode (extfmt lisp::lisp +ef-max+ +ef-en+)
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/1a5889ebc36c14498e1cffa…
--
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/1a5889ebc36c14498e1cffa…
You're receiving this email because of your account on gitlab.common-lisp.net.
Raymond Toy pushed to branch issue-367-count-octets-for-encoding at cmucl / cmucl
Commits:
02be9b8b by Raymond Toy at 2025-01-13T13:19:18-08:00
Add test that errors are signaled in string-octet-count
Just test that we signal an error when converting a string to :ascii.
- - - - -
06488aeb by Raymond Toy at 2025-01-13T13:25:19-08:00
Add some sanity checks for STRING-TO-OCTETS
Since `string-to-octets` can return a buffer of octets whose length is
not the same as the returned number of octets, add a check that they
are the same. If not, then `string-octet-count` may not be
consistent. Also add a check that the converted number of characters
matches the length of the string.
- - - - -
1 changed file:
- tests/external-formats.lisp
Changes:
=====================================
tests/external-formats.lisp
=====================================
@@ -20,9 +20,22 @@
"Random test string with codepoints below 20000")
+
(defmacro test-octet-count (string format)
- `(assert-equal (length (stream:string-to-octets ,string :external-format ,format))
- (stream::string-octet-count ,string :external-format ,format)))
+ "Test that STRING-OCTET-COUNT returns the correct number of octets"
+ ;; We expect STRING-OCTET-COUNT returns the same number of octets
+ ;; that are produced by STRING-TO-OCTETS.
+ `(multiple-value-bind (octets count converted)
+ (stream:string-to-octets ,string :external-format ,format)
+ ;; While we're at it, make sure that the length of the octet
+ ;; buffer matches returned count. And make sure we converted all
+ ;; the characters in the string.
+ (assert-equal (length octets) count)
+ (assert-equal (length ,string) converted)
+ ;; Finally, make sure that STRING-OCTET-COUNT returns the same
+ ;; number of octets from STRING-TO-OCTETS.
+ (assert-equal (length octets)
+ (stream::string-octet-count ,string :external-format ,format))))
(define-test octet-count.iso8859-1
(:tag :octet-count)
@@ -32,6 +45,13 @@
(:tag :octet-count)
(test-octet-count *test-iso8859-1* :ascii))
+(define-test octet-count.ascii.error
+ (:tag :octet-count)
+ (assert-error 'simple-error
+ (stream::string-octet-count *test-iso8859-1*
+ :external-format :ascii
+ :error 'error)))
+
(define-test octet-count.utf-8
(:tag :octet-count)
(test-octet-count *test-unicode* :utf-8))
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/ccd15fcecd9a55e61b22a7…
--
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/ccd15fcecd9a55e61b22a7…
You're receiving this email because of your account on gitlab.common-lisp.net.
Raymond Toy pushed to branch issue-367-count-octets-for-encoding at cmucl / cmucl
Commits:
ccd15fce by Raymond Toy at 2025-01-13T12:40:36-08:00
Add octet-count for mac-roman format with corresponding test
This also means that all the cp12??, koi8-5, mac-greek, mac-icelandic,
mac-latin2, and mac-turkish formats now have octet-count too, since
mac-roman is the base for these other formats.
- - - - -
2 changed files:
- src/pcl/simple-streams/external-formats/mac-roman.lisp
- tests/external-formats.lisp
Changes:
=====================================
src/pcl/simple-streams/external-formats/mac-roman.lisp
=====================================
@@ -49,4 +49,19 @@ character and illegal outputs are replaced by a question mark.")
(declare (optimize (ext:inhibit-warnings 3)))
(funcall ,error "Cannot output codepoint #x~X to MAC-ROMAN stream"
,code))
- #x3F)))))))
+ #x3F))))))
+ ()
+ ()
+ (octet-count (code state error present)
+ `(if (< ,code 128)
+ 1
+ (let ((,present (get-inverse ,itable ,code)))
+ (if ,present
+ 1
+ (if ,error
+ (locally
+ ;; No warnings about fdefinition
+ (declare (optimize (ext:inhibit-warnings 3)))
+ (funcall ,error "Cannot output codepoint #x~X to MAC-ROMAN stream"
+ ,code))
+ 1))))))
=====================================
tests/external-formats.lisp
=====================================
@@ -108,5 +108,8 @@
(:tag :octet-count)
(test-octet-count *test-iso8859-1* :iso8859-15))
-
+(define-test octet-count.mac-roman
+ (:tag :octet-count)
+ (test-octet-count *test-iso8859-1* :mac-roman))
+
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/ccd15fcecd9a55e61b22a7f…
--
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/ccd15fcecd9a55e61b22a7f…
You're receiving this email because of your account on gitlab.common-lisp.net.
Raymond Toy pushed to branch issue-367-count-octets-for-encoding at cmucl / cmucl
Commits:
865a06b0 by Raymond Toy at 2025-01-12T17:45:18-08:00
Make EF-OCTET-COUNT an inline function and add documentation
Like the other EF- functions, make EF-OCTET-COUNT an inline function.
Add comments in define-external-format about the OCTET-COUNT slot.
- - - - -
1 changed file:
- src/code/extfmts.lisp
Changes:
=====================================
src/code/extfmts.lisp
=====================================
@@ -133,7 +133,8 @@
(setf (gethash (ef-name ef) *external-formats*) ef))
(declaim (inline ef-octets-to-code ef-code-to-octets ef-flush-state ef-copy-state
- ef-cache ef-min-octets ef-max-octets))
+ ef-cache ef-min-octets ef-max-octets
+ ef-octet-count))
(defun ef-octets-to-code (ef)
(efx-octets-to-code (ef-efx ef)))
@@ -176,7 +177,7 @@
;;; DEFINE-EXTERNAL-FORMAT -- Public
;;;
;;; name (&key base min max size documentation) (&rest slots) octets-to-code
-;;; code-to-octets flush-state copy-state
+;;; code-to-octets flush-state copy-state octet-count
;;;
;;; Define a new external format. If base is specified, then an
;;; external format is defined that is based on a previously defined
@@ -238,6 +239,15 @@
;;; This should probably be a deep copy so that if the original
;;; state is modified, the copy is not.
;;;
+;;; octet-count (code state error &rest vars)
+;;; Defines a form to determine the number of octets needed to
+;;; encode the given CODE using the external format. This is
+;;; essentially the same as CODE-TO-OCTETS, except the encoding is
+;;; not saved anywhere. ERROR is the same as in CODE-TO-OCTETS.
+;;;
+;;; This should return one value: the number of octets needed to
+;;; encode the given code.
+;;;
;;; Note: external-formats work on code-points, not
;;; characters, so that the entire 31 bit ISO-10646 range can be
;;; used internally regardless of the size of a character recognized
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/865a06b03710020d17e7f9d…
--
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/865a06b03710020d17e7f9d…
You're receiving this email because of your account on gitlab.common-lisp.net.
Raymond Toy pushed to branch issue-367-count-octets-for-encoding at cmucl / cmucl
Commits:
31a76ff7 by Raymond Toy at 2025-01-12T14:08:01-08:00
Include BOM in octet count for UTF-16 and UTF-32
`string-to-octets` includes the BOM when encoding strings. To be
consistent, update `string-octet-count` to include the BOM when
computing the number of octets.
This is only needed for :UTF-16 and :UTF-32 formats. The other utf-16
and utf-32 formats don't include the BOM.
Enable tests for these two formats too.
- - - - -
3 changed files:
- src/pcl/simple-streams/external-formats/utf-16.lisp
- src/pcl/simple-streams/external-formats/utf-32.lisp
- tests/external-formats.lisp
Changes:
=====================================
src/pcl/simple-streams/external-formats/utf-16.lisp
=====================================
@@ -158,16 +158,16 @@ Unicode replacement character.")
;; The state is list. Copy it
`(copy-list ,state))
(octet-count (code state error)
- `(progn
- #+nil
+ `(let ((bom-count 0))
(unless ,state
;; Output BOM
- (output #xFEFF)
+ (setf bom-count 2)
(setf ,state t))
- (cond ((< ,code #x10000)
- 2)
- ((< ,code #x110000)
- 4)
- (t
- ;; Replacement character is 2 octets
- 2)))))
+ (+ bom-count
+ (cond ((< ,code #x10000)
+ 2)
+ ((< ,code #x110000)
+ 4)
+ (t
+ ;; Replacement character is 2 octets
+ 2))))))
=====================================
src/pcl/simple-streams/external-formats/utf-32.lisp
=====================================
@@ -116,11 +116,9 @@ Unicode replacement character.")
;; The state is either NIL or T, so we can just return that.
`(progn ,state))
(octet-count (code state error)
- `(progn
- ;; Should we count the BOM?
- #+nil
+ `(let ((bom-count 0))
(unless ,state
- (out #xFEFF)
+ (setf bom-count 4)
(setf ,state t))
(cond ((lisp::surrogatep ,code)
(if ,error
@@ -130,6 +128,6 @@ Unicode replacement character.")
(funcall ,error "Surrogate code #x~4,'0X is illegal for UTF32 output"
,code))
;; Replacement character is 2 octets
- 2))
+ (+ 2 bom-count)))
(t
- 4)))))
+ (+ 4 bom-count))))))
=====================================
tests/external-formats.lisp
=====================================
@@ -36,7 +36,6 @@
(:tag :octet-count)
(test-octet-count *test-unicode* :utf-8))
-#+nil
(define-test octet-count.utf-16
(:tag :octet-count)
(test-octet-count *test-unicode* :utf-16))
@@ -49,7 +48,6 @@
(:tag :octet-count)
(test-octet-count *test-unicode* :utf-16-le))
-#+nil
(define-test octet-count.utf-32
(:tag :octet-count)
(test-octet-count *test-unicode* :utf-32))
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/31a76ff7e674b0dac9f89d9…
--
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/commit/31a76ff7e674b0dac9f89d9…
You're receiving this email because of your account on gitlab.common-lisp.net.