[cmucl-cvs] [git] CMU Common Lisp branch master updated. snapshot-2012-04-3-gf3db74d

19 Apr 2012

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "CMU Common Lisp".

The branch, master, has been updated
       via  f3db74d49bf24c108053873f06905dbb2ed3cebd (commit)
      from  4c7f35da68188371f72ad068860df3f86a69f6b2 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit f3db74d49bf24c108053873f06905dbb2ed3cebd
Author: Raymond Toy <toy.raymond@gmail.com>
Date:   Wed Apr 18 23:53:31 2012 -0700

    Fix ticket:58.  Handle the BOM character for utf-16 and utf-32.  This
    is a bit of a hack.
    
     * src/code/stream.lisp:
      * Check the state to see if a BOM was read.  This critically depends
        on knowing the format of the state variable for utf16 and utf32
        formats, but the stream code shouldn't have to know the state
        internals.
    
     * src/general-info/release-20d.txt:
       * Update.

diff --git a/src/code/stream.lisp b/src/code/stream.lisp
index 77a90f7..92987b7 100644
--- a/src/code/stream.lisp
+++ b/src/code/stream.lisp
@@ -828,35 +828,70 @@
 		 ;; haven't processed yet and the ones we just read in.
 		 (flet
 		     ((convert-buffer ()
-			(multiple-value-bind (s char-count octet-count new-state)
-			    (stream::octets-to-string-counted
-			     ibuf
-			     (fd-stream-octet-count stream)
-			     :start 0
-			     :end (fd-stream-in-length stream)
-			     :state (fd-stream-oc-state stream)
-			     :string sbuf
-			     :s-start 1
-			     :external-format (fd-stream-external-format stream)
-			     :error (fd-stream-octets-to-char-error stream))
-			  (declare (ignore s)
-				   (type (integer 0 #.in-buffer-length) char-count octet-count))
-			  #+(or debug-frc-sr)
-			  (progn
-			    (format t "char-count = ~A~%" char-count)
-			    (format t "octet-count = ~A~%" octet-count)
-			    (format t "in-index = ~A~%" (lisp-stream-in-index stream)))
-			  (when (> char-count 0)
-			    (setf (fd-stream-oc-state stream) new-state)
-			    (setf (lisp-stream-string-buffer-len stream) (1+ char-count))
-			    (setf (lisp-stream-string-index stream) 2)
-			    (setf (lisp-stream-in-index stream) octet-count)
+			(let ((old-state (fd-stream-oc-state stream)))
+			  (multiple-value-bind (s char-count octet-count new-state)
+			      (stream::octets-to-string-counted
+			       ibuf
+			       (fd-stream-octet-count stream)
+			       :start 0
+			       :end (fd-stream-in-length stream)
+			       :state (fd-stream-oc-state stream)
+			       :string sbuf
+			       :s-start 1
+			       :external-format (fd-stream-external-format stream)
+			       :error (fd-stream-octets-to-char-error stream))
+			    (declare (ignore s)
+				     (type (integer 0 #.in-buffer-length) char-count octet-count))
 			    #+(or debug-frc-sr)
 			    (progn
-			      (format t "new in-index = ~A~%" (lisp-stream-in-index stream))
-			      (format t "new sbuf = ~S~%" 
-				      (subseq sbuf 0 (1+ char-count))))
-			    (schar sbuf 1)))))
+			      (format t "char-count = ~A~%" char-count)
+			      (format t "octet-count = ~A~%" octet-count)
+			      (format t "in-index = ~A~%" (lisp-stream-in-index stream))
+			      (format t "old state = ~S~%" old-state)
+			      (format t "new state = ~S~%" new-state))
+			    ;; FIXME: We need to know if a BOM
+			    ;; character was read so that we can
+			    ;; adjust the octet count correctly
+			    ;; because OCTETS-TO-CHAR does not include
+			    ;; the BOM in the number of octets
+			    ;; processed.  To do that, we look into
+			    ;; the state, which is very fragile.
+			    ;; OCTETS-TO-CHAR and thus
+			    ;; OCTETS-TO-STRING-COUNTED should
+			    ;; indicate that instead of doing it here.
+			    ;;
+			    ;; So far, only utf-16 and utf-32 need to
+			    ;; handle BOM specially.  In both of these
+			    ;; cases, (cadr state) contains
+			    ;; information about whether a BOM
+			    ;; character was read or not.  If a BOM
+			    ;; was read, then we need to increment the
+			    ;; octet-count by 2 for the BOM because
+			    ;; OCTETS-TO-STRING doesn't include that
+			    ;; in its count.
+			    (when (not (eql (cadr old-state)
+					    (cadr new-state)))
+			      #+debug-frc-sr
+			      (format t "state changed from ~S to ~S~%" old-state new-state)
+			      ;; See utf-16.lisp and utf-32.lisp to
+			      ;; see where the 1 and 2 come from.
+			      ;; They indicate that the BOM was read,
+			      ;; and whether we're reading big-endian
+			      ;; or little-endian data.
+			      (when (member (cadr new-state) '(1 2))
+				;; We read a BOM.
+				(incf octet-count 2)))
+			    (when (> char-count 0)
+			      (setf (fd-stream-oc-state stream) new-state)
+			      (setf (lisp-stream-string-buffer-len stream) (1+ char-count))
+			      (setf (lisp-stream-string-index stream) 2)
+			      (setf (lisp-stream-in-index stream) octet-count)
+			      #+(or debug-frc-sr)
+			      (progn
+				(format t "new in-index = ~A~%" (lisp-stream-in-index stream))
+				(format t "new sbuf = ~S~%" 
+					(subseq sbuf 0 (1+ char-count))))
+			      (schar sbuf 1))))))
 		   (let ((out (convert-buffer)))
 		     (or out
 			 ;; There weren't enough octets to convert at
diff --git a/src/general-info/release-20d.txt b/src/general-info/release-20d.txt
index 44626fd..6bd52d5 100644
--- a/src/general-info/release-20d.txt
+++ b/src/general-info/release-20d.txt
@@ -67,6 +67,7 @@ New in this release:
     * #53: UTF-8 in core.
     * #52: UNICODE-COMPLETE-NAME misses a completion.
     * #55: blocked signals.
+    * #58: UTF-16 buffering problem.
 
   * Other changes:
     * The layout of the cmucl directories has been changed.

-----------------------------------------------------------------------

Summary of changes:
 src/code/stream.lisp             |   89 ++++++++++++++++++++++++++-----------
 src/general-info/release-20d.txt |    1 +
 2 files changed, 63 insertions(+), 27 deletions(-)


hooks/post-receive
-- 
CMU Common Lisp

    

Raymond Toy

tags

participants (1)