Update of /project/cxml/cvsroot/closure-common In directory clnet:/tmp/cvs-serv1529
Modified Files: closure-common.asd encodings.lisp xstream.lisp ystream.lisp Log Message: Use 21 bit characters on Lisp offering them.
--- /project/cxml/cvsroot/closure-common/closure-common.asd 2007/10/21 17:07:38 1.3 +++ /project/cxml/cvsroot/closure-common/closure-common.asd 2007/12/22 15:19:25 1.4 @@ -15,19 +15,31 @@ (let (#+sbcl (*compile-print* nil)) (call-next-method))))
-#-(or rune-is-character rune-is-integer) (progn (format t "~&;;; Checking for wide character support...") (force-output) - (pushnew (dotimes (x 65536 - (progn - (format t " ok, characters have at least 16 bits.~%") - :rune-is-character)) - (unless (or (<= #xD800 x #xDFFF) - (and (< x char-code-limit) (code-char x))) - (format t " no, reverting to octet strings.~%") - (return :rune-is-integer))) - *features*)) + (flet ((test (code) + (and (< code char-code-limit) (code-char code)))) + (cond + ((not (test 50000)) + (format t " no, reverting to octet strings.~%") + #+rune-is-character + (error "conflicting unicode configuration. Please recompile.") + (pushnew :rune-is-integer *features*)) + ((code-char 70000) + (when (test #xD800) + (format t " WARNING: Lisp implementation doesn't use UTF-16, ~ + but accepts surrogate code points.~%")) + (format t " yes, using code points.~%") + #+(or rune-is-integer rune-is-utf-16) + (error "conflicting unicode configuration. Please recompile.") + (pushnew :rune-is-character *features*)) + (t + (format t " yes, using UTF-16.~%") + #+(or rune-is-integer (and rune-is-character (not rune-is-utf-16))) + (error "conflicting unicode configuration. Please recompile.") + (pushnew :rune-is-utf-16 *features*) + (pushnew :rune-is-character *features*)))))
#-rune-is-character (format t "~&;;; Building Closure with (UNSIGNED-BYTE 16) RUNES~%") --- /project/cxml/cvsroot/closure-common/encodings.lisp 2007/07/22 19:59:26 1.7 +++ /project/cxml/cvsroot/closure-common/encodings.lisp 2007/12/22 15:19:25 1.8 @@ -1,5 +1,10 @@ (in-package :runes-encoding)
+(eval-when (:compile-toplevel :load-toplevel :execute) + (defparameter +buffer-byte+ + #+rune-is-utf-16 '(unsigned-byte 16) + #-rune-is-utf-16 '(unsigned-byte 32))) + (define-condition encoding-error (simple-error) ())
(defun xerror (fmt &rest args) @@ -82,7 +87,7 @@
(defun make-simple-8-bit-encoding (&key charset) (make-instance 'simple-8-bit-encoding - :table (coerce (to-unicode-table charset) '(simple-array (unsigned-byte 16) (256))))) + :table (coerce (to-unicode-table charset) '(simple-array #.+buffer-byte+ (256)))))
;;;;;;;
@@ -150,16 +155,30 @@ (return)) (when (>= (%+ rptr 1) in-end) (return)) - (let ((hi (aref in rptr)) - (lo (aref in (%+ 1 rptr)))) + (let* ((hi (aref in rptr)) + (lo (aref in (%+ 1 rptr))) + (x (logior (ash hi 8) lo))) + (when (or (eql x #xFFFE) (eql x #xFFFF)) + (xerror "not a valid code point: #x~X" x)) + (when (<= #xDC00 x #xDFFF) + (xerror "unexpected high surrogate: #x~X" x)) + (when (<= #xD800 x #xDBFF) + ;; seen low surrogate, look for high surrogate now + (when (>= (%+ rptr 3) in-end) + (return)) + (let* ((hi2 (aref in (%+ 2 rptr))) + (lo2 (aref in (%+ 3 rptr))) + (y (logior (ash hi2 8) lo2))) + (unless (<= #xDC00 x #xDFFF) + (xerror "expected a high surrogate but found: #x~X" x)) + #-rune-is-utf-16 + (progn + (setf x (logior (ash (%- x #xd7c0) 10) (%and y #x3FF))) + (setf rptr (%+ 2 rptr)))) + ;; end of surrogate handling + ) + (setf (aref out wptr) x) (setf rptr (%+ 2 rptr)) - ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste - ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere - ;; Haelfte fehlt! - (let ((x (logior (ash hi 8) lo))) - (when (or (eql x #xFFFE) (eql x #xFFFF)) - (xerror "not a valid code point: #x~X" x)) - (setf (aref out wptr) x)) (setf wptr (%+ 1 wptr)))) (values wptr rptr)))
@@ -173,16 +192,30 @@ (return)) (when (>= (%+ rptr 1) in-end) (return)) - (let ((lo (aref in (%+ 0 rptr))) - (hi (aref in (%+ 1 rptr)))) + (let* ((lo (aref in rptr)) + (hi (aref in (%+ 1 rptr))) + (x (logior (ash hi 8) lo))) + (when (or (eql x #xFFFE) (eql x #xFFFF)) + (xerror "not a valid code point: #x~X" x)) + (when (<= #xDC00 x #xDFFF) + (xerror "unexpected high surrogate: #x~X" x)) + (when (<= #xD800 x #xDBFF) + ;; seen low surrogate, look for high surrogate now + (when (>= (%+ rptr 3) in-end) + (return)) + (let* ((lo2 (aref in (%+ 2 rptr))) + (hi2 (aref in (%+ 3 rptr))) + (y (logior (ash hi2 8) lo2))) + (unless (<= #xDC00 x #xDFFF) + (xerror "expected a high surrogate but found: #x~X" x)) + #-rune-is-utf-16 + (progn + (setf x (logior (ash (%- x #xd7c0) 10) (%and y #x3FF))) + (setf rptr (%+ 2 rptr)))) + ;; end of surrogate handling + ) + (setf (aref out wptr) x) (setf rptr (%+ 2 rptr)) - ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste - ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere - ;; Haelfte fehlt! - (let ((x (logior (ash hi 8) lo))) - (when (or (eql x #xFFFE) (eql x #xFFFF)) - (xerror "not a valid code point: #x~X" x)) - (setf (aref out wptr) x)) (setf wptr (%+ 1 wptr)))) (values wptr rptr)))
@@ -190,7 +223,8 @@ in in-start in-end out out-start out-end eof?) (declare (optimize (speed 3) (safety 0)) (type (simple-array (unsigned-byte 8) (*)) in) - (type (simple-array (unsigned-byte 16) (*)) out) + (type (simple-array #.+buffer-byte+ (*)) + out) (type fixnum in-start in-end out-start out-end)) (let ((wptr out-start) (rptr in-start) @@ -204,6 +238,7 @@ (eql x #xFFFE) (eql x #xFFFF)) (xerror "not a valid code point: #x~X" x)) + #+rune-is-utf-16 ((%> x #xFFFF) (setf (aref out (%+ 0 wptr)) (%+ #xD7C0 (ash x -10)) (aref out (%+ 1 wptr)) (%ior #xDC00 (%and x #x3FF))) @@ -325,7 +360,7 @@ eof?) (declare (optimize (speed 3) (safety 0)) (type (simple-array (unsigned-byte 8) (*)) in) - (type (simple-array (unsigned-byte 16) (*)) out) + (type (simple-array #.+buffer-byte+ (*)) out) (type fixnum in-start in-end out-start out-end)) (let ((wptr out-start) (rptr in-start) @@ -333,7 +368,7 @@ (table (slot-value encoding 'table))) (declare (type fixnum wptr rptr) (type (unsigned-byte 8) byte) - (type (simple-array (unsigned-byte 16) (*)) table)) + (type (simple-array #.+buffer-byte+ (*)) table)) (loop (when (%= wptr out-end) (return)) (when (%>= rptr in-end) (return)) @@ -387,7 +422,7 @@ :name ',name :to-unicode-table ',(make-array 256 - :element-type '(unsigned-byte 16) + :element-type '#.+buffer-byte+ :initial-contents codes))) ',name))
--- /project/cxml/cvsroot/closure-common/xstream.lisp 2007/10/14 21:14:08 1.8 +++ /project/cxml/cvsroot/closure-common/xstream.lisp 2007/12/22 15:19:25 1.9 @@ -83,7 +83,10 @@ `(unsigned-byte ,(integer-length array-total-size-limit)))
(deftype buffer-byte () - `(unsigned-byte 16)) + #+rune-is-utf-16 + `(unsigned-byte 16) + #-rune-is-utf-16 + `(unsigned-byte 32))
(deftype octet () `(unsigned-byte 8)) --- /project/cxml/cvsroot/closure-common/ystream.lisp 2007/06/16 11:27:19 1.6 +++ /project/cxml/cvsroot/closure-common/ystream.lisp 2007/12/22 15:19:25 1.7 @@ -76,14 +76,17 @@ (when (plusp ptr) (let* ((in (ystream-in-buffer ystream)) (out (ystream-out-buffer ystream)) + #+rune-is-utf-16 (surrogatep (<= #xD800 (rune-code (elt in (1- ptr))) #xDBFF)) n) + #+rune-is-utf-16 (when surrogatep (decf ptr)) (when (plusp ptr) (setf n (runes-to-utf8 out in ptr)) (ystream-device-write ystream out n) (cond + #+rune-is-utf-16 (surrogatep (setf (elt in 0) (elt in (1- ptr))) (setf (ystream-in-ptr ystream) 1)) @@ -98,7 +101,7 @@
(macrolet ((define-utf8-writer (name (byte &rest aux) result &body body) `(defun ,name (out in n) - (let ((high-surrogate nil) + (let (#+rune-is-utf-16 (high-surrogate nil) ,@aux) (labels ((write0 (,byte) @@ -134,13 +137,19 @@ (write0 (logior #b10000000 (ldb (byte 6 0) r)))))) (write2 (r) (cond + #+rune-is-utf-16 ((<= #xD800 r #xDBFF) (setf high-surrogate r)) + #+rune-is-utf-16 ((<= #xDC00 r #xDFFF) (let ((q (logior (ash (- high-surrogate #xD7C0) 10) (- r #xDC00)))) (write1 q)) (setf high-surrogate nil)) + #-rune-is-utf-16 + ((<= #xD800 r #xDFFF) + (error + "surrogates not allowed in this configuration")) (t (write1 r))))) (dotimes (j n) @@ -259,7 +268,7 @@
(defun utf8-string-to-rod (str) (let* ((bytes (map '(vector (unsigned-byte 8)) #'char-code str)) - (buffer (make-array (length bytes) :element-type '(unsigned-byte 16))) + (buffer (make-array (length bytes) :element-type 'buffer-byte)) (n (runes-encoding:decode-sequence :utf-8 bytes 0 (length bytes) buffer 0 0 nil)) (result (make-array n :element-type 'rune)))