This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "CMU Common Lisp".
The branch, master has been updated
via b040afc230f3b1647620756c64f462d2570c2ae7 (commit)
from a60274829b62c938292920f8f19b1c269240e8c2 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
commit b040afc230f3b1647620756c64f462d2570c2ae7
Author: Raymond Toy <toy.raymond(a)gmail.com>
Date: Fri Apr 20 19:44:04 2012 -0700
Fix bug in handling the state BOM marker and also extend to work
composing external formats. The state BOM marker also needs to
indicate how long the BOM is.
* src/code/stream.lisp
* Handle the BOM marker correctly for composed format.
* Handle the new values for the state BOM marker.
* src/pcl/simple-streams/external-formats/utf-16.lisp
* Change BOM state marker to +2 and -2 instead of 1 and 2 to
indicate the length of the BOM.
* src/pcl/simple-streams/external-formats/utf-32.lisp
* Change BOM state marker to +4 and -4 instead of 1 and 2 to
indicate the length of the BOM.
diff --git a/src/code/stream.lisp b/src/code/stream.lisp
index b5d34ad..0e27cba 100644
--- a/src/code/stream.lisp
+++ b/src/code/stream.lisp
@@ -850,39 +850,34 @@
(format t "octet-count = ~A~%" octet-count)
(format t "in-index = ~A~%" (lisp-stream-in-index stream))
(format t "new state = ~S~%" new-state))
- ;; FIXME: We need to know if a BOM
- ;; character was read so that we can
- ;; adjust the octet count correctly
- ;; because OCTETS-TO-CHAR does not include
- ;; the BOM in the number of octets
- ;; processed. To do that, we look into
- ;; the state, and thus is very fragile.
- ;; OCTETS-TO-CHAR and thus
- ;; OCTETS-TO-STRING-COUNTED should
- ;; indicate that instead of doing it here.
+ ;; FIXME: We need to know if a BOM character was read so that
+ ;; we can adjust the octet count correctly because
+ ;; OCTETS-TO-CHAR does not include the BOM in the number of
+ ;; octets processed. To do that, we look into the state, and
+ ;; thus is very fragile. OCTETS-TO-CHAR and thus
+ ;; OCTETS-TO-STRING-COUNTED should indicate that instead of
+ ;; doing it here.
;;
- ;; So far, only utf-16 and utf-32 needs to
- ;; handle BOM specially. In both of these
- ;; cases, (cadr state) contains
- ;; information about whether a BOM
- ;; character was read or not. If a BOM
- ;; was read, then we need to increment the
- ;; octet-count by 2 for the BOM because
- ;; OCTETS-TO-STRING doesn't include that
- ;; in its count.
+ ;; So far, only utf-16 and utf-32 needs to handle BOM
+ ;; specially. In both of these cases, (cadr state) contains
+ ;; information about whether a BOM character was read or not.
+ ;; If a BOM was read, then we need to increment the
+ ;; octet-count by 2 for the BOM because OCTETS-TO-STRING
+ ;; doesn't include that in its count.
+ ;;
+ ;; But we could have a composing external format too, like
+ ;; :crlf, so what we really want to look at is the last
+ ;; element of the state.
(when (and (consp (cdr new-state))
- (not (eql (cadr old-state)
- (cadr new-state))))
+ (not (eq (car (last old-state))
+ (car (last new-state)))))
#+debug-frc-sr
(format t "state changed from ~S to ~S~%" old-state new-state)
- ;; See utf-16.lisp and utf-32.lisp to
- ;; see where the 1 and 2 come from.
- ;; They indicate that the BOM was read,
- ;; and whether we're reading big-endian
- ;; or little-endian data.
- (when (member (cadr new-state) '(1 2))
- ;; We read a BOM.
- (incf octet-count 2)))
+ ;; See utf-16.lisp and utf-32.lisp. The part of the state
+ ;; we're interested in encodes the endianness and the size
+ ;; of the BOM in octets.
+ (incf octet-count (abs (the (integer -4 4)
+ (car (last new-state))))))
(when (> char-count 0)
(setf (fd-stream-oc-state stream) new-state)
(setf (lisp-stream-string-buffer-len stream) (1+ char-count))
diff --git a/src/pcl/simple-streams/external-formats/utf-16.lisp b/src/pcl/simple-streams/external-formats/utf-16.lisp
index d25fdea..d197aea 100644
--- a/src/pcl/simple-streams/external-formats/utf-16.lisp
+++ b/src/pcl/simple-streams/external-formats/utf-16.lisp
@@ -13,15 +13,19 @@
;; one here, anyway. This should be compatible with the Unicode spec.
;; The state is a cons. The car is an integer:
-;; 0 = initial state, nothing has been read yet
-;; 1 = BOM has been read, little-endian
-;; 2 = BOM has been read, big-endian, or non-BOM char has been read
+;; 0 = initial state, nothing has been read yet
+;; -2 = BOM has been read, little-endian
+;; 2 = BOM has been read, big-endian, or non-BOM char has been read
+;;
+;; The absolute value of car specifies the size of the BOM in octets.
+;; This is used in stream.lisp to account for the BOM.
;;
;; The cdr is either NIL or a codepoint which is used for converting
;; surrogate pairs into codepoints. If the cdr is non-NIL, then it is
;; the leading (high) surrogate of a surrogate pair.
;;
-;; When writing, never output a BOM.
+;;
+;; When writing, always output a BOM.
(define-external-format :utf-16 (:size 2 :documentation
"UTF-16 is a variable length character encoding for Unicode. On
@@ -42,8 +46,10 @@ Unicode replacement character.")
(let* ((,st (car ,state))
(,c1 ,input)
(,c2 ,input)
- (,code (if (oddp ,st)
+ (,code (if (minusp ,st)
+ ;; Little endian
(+ (* 256 ,c2) ,c1)
+ ;; Big endian (including BOM, if any)
(+ (* 256 ,c1) ,c2)))
(,wd 2))
(declare (type (integer 0 2) ,st)
@@ -79,9 +85,9 @@ Unicode replacement character.")
(setf (cdr ,state) ,code)
(let* ((,c1 ,input)
(,c2 ,input)
- (,next (if (oddp ,st)
- (+ (* 256 ,c2) ,c1)
- (+ (* 256 ,c1) ,c2))))
+ (,next (if (plusp ,st)
+ (+ (* 256 ,c1) ,c2)
+ (+ (* 256 ,c2) ,c1))))
;; We read the trailing surrogate, so clear the state.
(setf (cdr ,state) nil)
;; If we don't have a high and low surrogate,
@@ -99,8 +105,10 @@ Unicode replacement character.")
(funcall ,error "High surrogate followed by #x~4,'0X instead of low surrogate" ,next ,wd))
+replacement-character-code+)))))
((and (= ,code #xFFFE) (zerop ,st))
- (setf (car ,state) 1) (go :again))
+ ;; BOM for little-endian order
+ (setf (car ,state) -2) (go :again))
((and (= ,code #xFEFF) (zerop ,st))
+ ;; BOM for big-endian order
(setf (car ,state) 2) (go :again))
((= ,code #xFFFE)
;; Replace with REPLACEMENT CHARACTER.
diff --git a/src/pcl/simple-streams/external-formats/utf-32.lisp b/src/pcl/simple-streams/external-formats/utf-32.lisp
index 1fd3cf1..c0fa820 100644
--- a/src/pcl/simple-streams/external-formats/utf-32.lisp
+++ b/src/pcl/simple-streams/external-formats/utf-32.lisp
@@ -17,16 +17,18 @@
;;
;; This is modeled after the utf-16 format.
-;; make state an integer:
-;; or (or state 0) to cope with NIL case
-;; 0 = initial state, nothing has been read yet
-;; 1 = BOM has been read, little-endian
-;; 2 = BOM has been read, big-endian, or non-BOM char has been read
+;; The state is a cons. The car is an integer:
+;; 0 = initial state, nothing has been read yet
+;; -4 = BOM has been read, little-endian
+;; 4 = BOM has been read, big-endian, or non-BOM char has been read
;;
-;; (oddp state) = little-endian
-;; (evenp state) = big-endian
+;; (minusp state) = little-endian
+;; (plusp state) = big-endian
;; (zerop state) = #xFEFF/#xFFFE is BOM (to be skipped)
;;
+;; The absolute value of the car specifies the size of the BOM in
+;; octets. This is used in stream.lisp to account for the BOM.
+;;
(define-external-format :utf-32 (:size 4 :documentation
"UTF-32 is a fixed-length character encoding of 4 octets for Unicode.
On input, a byte-order mark is recognized. If no byte-order mark is
@@ -40,21 +42,21 @@ Unicode replacement character.")
(octets-to-code (state input unput error code c1 c2 c3 c4 st wd)
`(block nil
- (when (null ,state) (setf ,state 0))
+ (when (null ,state) (setf ,state (cons 0 nil)))
(tagbody
:again
- (let* ((,st ,state)
+ (let* ((,st (car ,state))
(,c1 ,input)
(,c2 ,input)
(,c3 ,input)
(,c4 ,input)
- (,code (if (oddp ,st)
+ (,code (if (minusp ,st)
;; Little-endian
(+ ,c1
(ash ,c2 8)
(ash ,c3 16)
(ash ,c4 24))
- ;; Big-endian
+ ;; Big-endian (including BOM, if any)
(+ (ash ,c1 24)
(ash ,c2 16)
(ash ,c3 8)
@@ -80,11 +82,11 @@ Unicode replacement character.")
+replacement-character-code+)))
((and (zerop ,st) (= ,code #xFFFE0000))
;; BOM for little-endian
- (setf ,state 1)
+ (setf (car ,state) -4)
(go :again))
((and (zerop ,st) (= ,code #xFEFF))
;; BOM for big-endian
- (setf ,state 2)
+ (setf (car ,state) 4)
(go :again)))
(return (values ,code ,wd))))))
-----------------------------------------------------------------------
Summary of changes:
src/code/stream.lisp | 53 +++++++++-----------
.../simple-streams/external-formats/utf-16.lisp | 26 ++++++---
.../simple-streams/external-formats/utf-32.lisp | 28 ++++++-----
3 files changed, 56 insertions(+), 51 deletions(-)
hooks/post-receive
--
CMU Common Lisp