This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "CMU Common Lisp".
The branch, master has been updated via 63ecc76acc5a671c45af3a650a239ef59b825777 (commit) from f364ebbe14fa351117a843905b06ae1f7fbff3ae (commit)
Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below.
- Log ----------------------------------------------------------------- commit 63ecc76acc5a671c45af3a650a239ef59b825777 Author: Raymond Toy toy.raymond@gmail.com Date: Wed Jan 11 22:08:48 2012 -0800
Fix ticket:53 by adding UTF-8 to the core.
code/extfmts.lisp: * Move the utf-8 implementation to this file * Update %find-external-format to return quickly for :utf-8.
code/fd-stream-comp.lisp: * Precompile utf-8 functions.
pcl/simple-streams/external-formats/utf-8.lisp: * Add note that this is only used for reference now because it's in the core.
diff --git a/src/code/extfmts.lisp b/src/code/extfmts.lisp index 43adbcc..f7e684c 100644 --- a/src/code/extfmts.lisp +++ b/src/code/extfmts.lisp @@ -464,6 +464,9 @@ (and (eq name :default) (eq *default-external-format* :iso8859-1))) (return-from %find-external-format (gethash :iso8859-1 *external-formats*))) + (when (eq name :utf-8) + (return-from %find-external-format + (gethash :utf-8 *external-formats*)))
(when (zerop (hash-table-count *external-format-aliases*)) (setf (gethash :latin1 *external-format-aliases*) :iso8859-1) @@ -1158,3 +1161,119 @@ character and illegal outputs are replaced by a question mark.") `(setf (aref (ef-cache (find-external-format ,(ef-name ef))) ,slot) ,(subst (ef-name ef) ef (function-lambda-expression (aref (ef-cache ef) slot)))))) + +;; A safe UTF-8 external format. Any illegal UTF-8 sequences on input +;; are replaced with the Unicode REPLACEMENT CHARACTER (U+FFFD), or +;; signals an error as appropriate. +;; +;; See Table 3-7, Ch 3.9 in the Unicode book. + +(define-external-format :utf-8 (:min 1 :max 4 :documentation +"UTF-8 is a variable-length character encoding for Unicode. By +default, illegal input sequences are replaced by the Unicode +replacement character.") + + () + (octets-to-code (state input unput error c i j n) + `(labels ((utf8 (,c ,i) + (declare (type (unsigned-byte 8) ,c) + (type (integer 1 5) ,i)) + (let ((,n (ash (ldb (byte (- 6 ,i) 0) ,c) + (* 6 ,i)))) + (declare (type (unsigned-byte 31) ,n)) + (dotimes (,j ,i (check ,n ,i)) + (let ((,c ,input)) + ;; Following bytes must all have the form + ;; #b10xxxxxx. If not, put back the octet we + ;; just read and return the replacement character + ;; for the bad sequence. + (if (< (logxor ,c #x80) #x40) + (setf (ldb (byte 6 (* 6 (- ,i ,j 1))) ,n) + (ldb (byte 6 0) ,c)) + (progn + (,unput 1) + (return + (values + (locally + ;; No warnings about fdefinition + (declare (optimize (ext:inhibit-warnings 3))) + (if ,error + (funcall ,error "Invalid utf8 octet #x~X at offset ~D" + ,c (1+ ,j)) + +replacement-character-code+)) + (1+ ,j))))))))) + (check (,n ,i) + (declare (type (unsigned-byte 31) ,n) + (type (integer 1 5) ,i)) + ;; We check for overlong sequences (sequences that + ;; encode to codepoints that don't need that long of a + ;; sequence) and any surrogate values and any code + ;; outside the 21-bit Unicode range. + (if (or (>= ,n lisp:codepoint-limit) + (<= ,n (the (member 127 2047 65535) + (svref #(127 2047 65535) (1- ,i)))) ; overlong + (lisp::surrogatep ,n)) ; surrogate + (progn + ;; Replace the entire sequence with the + ;; replacment character + (values (if ,error + (cond + ((>= ,n lisp:codepoint-limit) + (locally + ;; No warnings about fdefinition + (declare (optimize (ext:inhibit-warnings 3))) + (funcall ,error "Invalid codepoint #x~X of ~D octets" + ,n (1+ ,i)))) + ((lisp::surrogatep ,n) + (locally + ;; No warnings about fdefinition + (declare (optimize (ext:inhibit-warnings 3))) + (funcall ,error "Invalid surrogate code #x~X" ,n (1+ ,i)))) + (t + (locally + ;; No warnings about fdefinition + (declare (optimize (ext:inhibit-warnings 3))) + (funcall ,error "Overlong utf8 sequence of ~*~D octets" nil (1+ ,i))))) + +replacement-character-code+) + (1+ ,i))) + (values ,n (1+ ,i))))) + (let ((,c ,input)) + (declare (optimize (ext:inhibit-warnings 3))) + (cond ((null ,c) (values nil 0)) + ((< ,c #b10000000) (values ,c 1)) + ((< ,c #b11000010) + (values + (locally + ;; No warnings about fdefinition + (declare (optimize (ext:inhibit-warnings 3))) + (if ,error + (funcall ,error "Invalid initial utf8 octet: #x~X" ,c 1) + +replacement-character-code+)) + 1)) + ((< ,c #b11100000) (utf8 ,c 1)) + ((< ,c #b11110000) (utf8 ,c 2)) + ((< ,c #b11111000) (utf8 ,c 3)) + (t + (values + (locally + ;; No warnings about fdefinition + (declare (optimize (ext:inhibit-warnings 3))) + (if ,error + (funcall ,error "Invalid initial utf8 octet: #x~X" ,c 1) + +replacement-character-code+)) + 1)))))) + (code-to-octets (code state output error i j n p init) + `(flet ((utf8 (,n ,i) + (let* ((,j (- 6 ,i)) + (,p (* 6 ,i)) + (,init (logand #xFF (ash #b01111110 ,j)))) + (,output (logior ,init (ldb (byte ,j ,p) ,n))) + (dotimes (,i ,i) + (decf ,p 6) + (,output (logior 128 (ldb (byte 6 ,p) ,n))))))) + (declare (optimize (ext:inhibit-warnings 3))) + (cond ((< ,code #x80) (,output ,code)) + ((< ,code #x800) (utf8 ,code 1)) + ((< ,code #x10000) (utf8 ,code 2)) + ((< ,code #x110000) (utf8 ,code 3)) + (t (error "How did this happen? Codepoint U+~X is illegal" ,code)))))) diff --git a/src/code/fd-stream-comp.lisp b/src/code/fd-stream-comp.lisp index a383499..e05dc1a 100644 --- a/src/code/fd-stream-comp.lisp +++ b/src/code/fd-stream-comp.lisp @@ -27,3 +27,11 @@ (stream::precompile-ef-slot :iso8859-1 #.stream::+ef-en+) (stream::precompile-ef-slot :iso8859-1 #.stream::+ef-de+)
+(stream::precompile-ef-slot :utf-8 #.stream::+ef-cin+) +(stream::precompile-ef-slot :utf-8 #.stream::+ef-cout+) +(stream::precompile-ef-slot :utf-8 #.stream::+ef-sout+) +(stream::precompile-ef-slot :utf-8 #.stream::+ef-os+) +(stream::precompile-ef-slot :utf-8 #.stream::+ef-so+) +(stream::precompile-ef-slot :utf-8 #.stream::+ef-en+) +(stream::precompile-ef-slot :utf-8 #.stream::+ef-de+) + diff --git a/src/pcl/simple-streams/external-formats/utf-8.lisp b/src/pcl/simple-streams/external-formats/utf-8.lisp index fe14f04..7d2084c 100644 --- a/src/pcl/simple-streams/external-formats/utf-8.lisp +++ b/src/pcl/simple-streams/external-formats/utf-8.lisp @@ -9,8 +9,13 @@ (in-package "STREAM") (intl:textdomain "cmucl")
+;; This is actually implemented in the external-formats code +;; It appears here only for reference, and will never get loaded + + ;; A safe UTF-8 external format. Any illegal UTF-8 sequences on input -;; are replaced with the Unicode REPLACEMENT CHARACTER (U+FFFD). +;; are replaced with the Unicode REPLACEMENT CHARACTER (U+FFFD), or +;; signals an error as appropriate. ;; ;; See Table 3-7, Ch 3.9 in the Unicode book.
-----------------------------------------------------------------------
Summary of changes: src/code/extfmts.lisp | 119 ++++++++++++++++++++ src/code/fd-stream-comp.lisp | 8 ++ src/pcl/simple-streams/external-formats/utf-8.lisp | 7 +- 3 files changed, 133 insertions(+), 1 deletions(-)
hooks/post-receive