Raymond Toy pushed to branch issue-367-count-octets-for-encoding at cmucl / cmucl Commits: f5b4ae8a by Raymond Toy at 2025-01-12T11:39:37-08:00 Copy the utf-8, iso8859-1, ascii impl These external formats are defined in extfmts. For completeness, copy these implementations to the corresponding files in src/pcl/simple-streams/external-formats. - - - - - 83db856a by Raymond Toy at 2025-01-12T11:49:48-08:00 Implement octet-count utf-16 and utf-32. Pretty basic implementation, taking care of handling the replacement character if needed. - - - - - 9 changed files: - src/pcl/simple-streams/external-formats/ascii.lisp - src/pcl/simple-streams/external-formats/iso8859-1.lisp - src/pcl/simple-streams/external-formats/utf-16-be.lisp - src/pcl/simple-streams/external-formats/utf-16-le.lisp - src/pcl/simple-streams/external-formats/utf-16.lisp - src/pcl/simple-streams/external-formats/utf-32-be.lisp - src/pcl/simple-streams/external-formats/utf-32-le.lisp - src/pcl/simple-streams/external-formats/utf-32.lisp - src/pcl/simple-streams/external-formats/utf-8.lisp Changes: ===================================== src/pcl/simple-streams/external-formats/ascii.lisp ===================================== @@ -33,5 +33,15 @@ replaced with a question mark.") (declare (optimize (ext:inhibit-warnings 3))) (funcall ,error "Cannot output codepoint #x~X to ASCII stream" ,code)) #x3F) - ,code)))) + ,code))) + () + () + (octet-count (code state error) + `(if (> ,code #x7f) + (if ,error + (locally + (declare (optimize (ext:inhibit-warnings 3))) + (funcall ,error "Cannot output codepoint #x~X to ASCII stream" ,code)) + 1) + 1))) ===================================== src/pcl/simple-streams/external-formats/iso8859-1.lisp ===================================== @@ -31,4 +31,17 @@ character and illegal outputs are replaced by a question mark.") (funcall ,error "Cannot output codepoint #x~X to ISO8859-1 stream" ,code 1)) #x3F) - ,code)))) + ,code))) + () + () + (octet-count (code state error) + `(if (> ,code 255) + (if ,error + (locally + ;; No warnings about fdefinition + (declare (optimize (ext:inhibit-warnings 3))) + (funcall ,error + (intl:gettext "Cannot output codepoint #x~X to ISO8859-1 stream") + ,code 1)) + 1) + 1))) ===================================== src/pcl/simple-streams/external-formats/utf-16-be.lisp ===================================== @@ -110,4 +110,12 @@ Unicode replacement character.") (copy-state (state) ;; The state is either NIL or a codepoint, so nothing really ;; special is needed to copy it. - `(progn ,state))) + `(progn ,state)) + (code-to-octets (code state error) + `(cond ((< ,code #x10000) + 2) + ((< ,code #x110000) + 4) + (t + ;; Replacement character is 2 octets + 2)))) ===================================== src/pcl/simple-streams/external-formats/utf-16-le.lisp ===================================== @@ -111,4 +111,12 @@ Unicode replacement character.") (copy-state (state) ;; The state is either NIL or a codepoint, so nothing really ;; special is needed. - `(progn ,state))) + `(progn ,state)) + (code-to-octets (code state error) + `(cond ((< ,code #x10000) + 2) + ((< ,code #x110000) + 4) + (t + ;; Replacement character is 2 octets + 2)))) ===================================== src/pcl/simple-streams/external-formats/utf-16.lisp ===================================== @@ -156,4 +156,18 @@ Unicode replacement character.") ,c)))))) (copy-state (state) ;; The state is list. Copy it - `(copy-list ,state))) + `(copy-list ,state)) + (code-to-octets (code state error) + `(progn + #+nil + (unless ,state + ;; Output BOM + (output #xFEFF) + (setf ,state t)) + (cond ((< ,code #x10000) + 2) + ((< ,code #x110000) + 4) + (t + ;; Replacement character is 2 octets + 2))))) ===================================== src/pcl/simple-streams/external-formats/utf-32-be.lisp ===================================== @@ -61,4 +61,18 @@ Unicode replacement character.") ,code)) +replacement-character-code+))) (t - (out ,code)))))) + (out ,code))))) + () + () + (code-to-octets (code state error) + `(cond ((lisp::surrogatep ,code) + (if ,error + (locally + ;; No warnings about fdefinition + (declare (optimize (ext:inhibit-warnings 3))) + (funcall ,error "Surrogate code #x~4,'0X is illegal for UTF32 output" + ,code)) + ;; Replacement character is 2 octets + 2)) + (t + 4))))) ===================================== src/pcl/simple-streams/external-formats/utf-32-le.lisp ===================================== @@ -62,4 +62,18 @@ Unicode replacement character.") ,code)) +replacement-character-code+))) (t - (out ,code)))))) + (out ,code))))) + () + () + (code-to-octets (code state error) + `(cond ((lisp::surrogatep ,code) + (if ,error + (locally + ;; No warnings about fdefinition + (declare (optimize (ext:inhibit-warnings 3))) + (funcall ,error "Surrogate code #x~4,'0X is illegal for UTF32 output" + ,code)) + ;; Replacement character is 2 octets + 2)) + (t + 4)))) ===================================== src/pcl/simple-streams/external-formats/utf-32.lisp ===================================== @@ -114,4 +114,22 @@ Unicode replacement character.") nil (copy-state (state) ;; The state is either NIL or T, so we can just return that. - `(progn ,state))) + `(progn ,state)) + (code-to-octets (code state output error i c) + `(progn + ;; Should we count the BOM? + #+nil + (unless ,state + (out #xFEFF) + (setf ,state t)) + (cond ((lisp::surrogatep ,code) + (if ,error + (locally + ;; No warnings about fdefinition + (declare (optimize (ext:inhibit-warnings 3))) + (funcall ,error "Surrogate code #x~4,'0X is illegal for UTF32 output" + ,code)) + ;; Replacement character is 2 octets + 2)) + (t + 4))))) ===================================== src/pcl/simple-streams/external-formats/utf-8.lisp ===================================== @@ -127,4 +127,14 @@ replacement character.") ((< ,code #x800) (utf8 ,code 1)) ((< ,code #x10000) (utf8 ,code 2)) ((< ,code #x110000) (utf8 ,code 3)) - (t (error "How did this happen? Codepoint U+~X is illegal" ,code)))))) + (t (error "How did this happen? Codepoint U+~X is illegal" ,code))))) + () + () + (octet-count (code state error) + `(locally + (declare (optimize (ext:inhibit-warnings 3))) + (cond ((< ,code #x80) 1) + ((< ,code #x800) 2) + ((< ,code #x10000) 3) + ((< ,code #x110000) 4) + (t (error "How did this happen? Codepoint U+~X is illegal" ,code)))))) View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/582a9f51db3e6df7b653df9... -- View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/582a9f51db3e6df7b653df9... You're receiving this email because of your account on gitlab.common-lisp.net.
participants (1)
-
Raymond Toy (@rtoy)