Update of /project/cxml/cvsroot/closure-common
In directory clnet:/tmp/cvs-serv1529
Modified Files:
closure-common.asd encodings.lisp xstream.lisp ystream.lisp
Log Message:
Use 21 bit characters on Lisp offering them.
--- /project/cxml/cvsroot/closure-common/closure-common.asd 2007/10/21 17:07:38 1.3
+++ /project/cxml/cvsroot/closure-common/closure-common.asd 2007/12/22 15:19:25 1.4
@@ -15,19 +15,31 @@
(let (#+sbcl (*compile-print* nil))
(call-next-method))))
-#-(or rune-is-character rune-is-integer)
(progn
(format t "~&;;; Checking for wide character support...")
(force-output)
- (pushnew (dotimes (x 65536
- (progn
- (format t " ok, characters have at least 16 bits.~%")
- :rune-is-character))
- (unless (or (<= #xD800 x #xDFFF)
- (and (< x char-code-limit) (code-char x)))
- (format t " no, reverting to octet strings.~%")
- (return :rune-is-integer)))
- *features*))
+ (flet ((test (code)
+ (and (< code char-code-limit) (code-char code))))
+ (cond
+ ((not (test 50000))
+ (format t " no, reverting to octet strings.~%")
+ #+rune-is-character
+ (error "conflicting unicode configuration. Please recompile.")
+ (pushnew :rune-is-integer *features*))
+ ((code-char 70000)
+ (when (test #xD800)
+ (format t " WARNING: Lisp implementation doesn't use UTF-16, ~
+ but accepts surrogate code points.~%"))
+ (format t " yes, using code points.~%")
+ #+(or rune-is-integer rune-is-utf-16)
+ (error "conflicting unicode configuration. Please recompile.")
+ (pushnew :rune-is-character *features*))
+ (t
+ (format t " yes, using UTF-16.~%")
+ #+(or rune-is-integer (and rune-is-character (not rune-is-utf-16)))
+ (error "conflicting unicode configuration. Please recompile.")
+ (pushnew :rune-is-utf-16 *features*)
+ (pushnew :rune-is-character *features*)))))
#-rune-is-character
(format t "~&;;; Building Closure with (UNSIGNED-BYTE 16) RUNES~%")
--- /project/cxml/cvsroot/closure-common/encodings.lisp 2007/07/22 19:59:26 1.7
+++ /project/cxml/cvsroot/closure-common/encodings.lisp 2007/12/22 15:19:25 1.8
@@ -1,5 +1,10 @@
(in-package :runes-encoding)
+(eval-when (:compile-toplevel :load-toplevel :execute)
+ (defparameter +buffer-byte+
+ #+rune-is-utf-16 '(unsigned-byte 16)
+ #-rune-is-utf-16 '(unsigned-byte 32)))
+
(define-condition encoding-error (simple-error) ())
(defun xerror (fmt &rest args)
@@ -82,7 +87,7 @@
(defun make-simple-8-bit-encoding (&key charset)
(make-instance 'simple-8-bit-encoding
- :table (coerce (to-unicode-table charset) '(simple-array (unsigned-byte 16) (256)))))
+ :table (coerce (to-unicode-table charset) '(simple-array #.+buffer-byte+ (256)))))
;;;;;;;
@@ -150,16 +155,30 @@
(return))
(when (>= (%+ rptr 1) in-end)
(return))
- (let ((hi (aref in rptr))
- (lo (aref in (%+ 1 rptr))))
+ (let* ((hi (aref in rptr))
+ (lo (aref in (%+ 1 rptr)))
+ (x (logior (ash hi 8) lo)))
+ (when (or (eql x #xFFFE) (eql x #xFFFF))
+ (xerror "not a valid code point: #x~X" x))
+ (when (<= #xDC00 x #xDFFF)
+ (xerror "unexpected high surrogate: #x~X" x))
+ (when (<= #xD800 x #xDBFF)
+ ;; seen low surrogate, look for high surrogate now
+ (when (>= (%+ rptr 3) in-end)
+ (return))
+ (let* ((hi2 (aref in (%+ 2 rptr)))
+ (lo2 (aref in (%+ 3 rptr)))
+ (y (logior (ash hi2 8) lo2)))
+ (unless (<= #xDC00 x #xDFFF)
+ (xerror "expected a high surrogate but found: #x~X" x))
+ #-rune-is-utf-16
+ (progn
+ (setf x (logior (ash (%- x #xd7c0) 10) (%and y #x3FF)))
+ (setf rptr (%+ 2 rptr))))
+ ;; end of surrogate handling
+ )
+ (setf (aref out wptr) x)
(setf rptr (%+ 2 rptr))
- ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
- ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
- ;; Haelfte fehlt!
- (let ((x (logior (ash hi 8) lo)))
- (when (or (eql x #xFFFE) (eql x #xFFFF))
- (xerror "not a valid code point: #x~X" x))
- (setf (aref out wptr) x))
(setf wptr (%+ 1 wptr))))
(values wptr rptr)))
@@ -173,16 +192,30 @@
(return))
(when (>= (%+ rptr 1) in-end)
(return))
- (let ((lo (aref in (%+ 0 rptr)))
- (hi (aref in (%+ 1 rptr))))
+ (let* ((lo (aref in rptr))
+ (hi (aref in (%+ 1 rptr)))
+ (x (logior (ash hi 8) lo)))
+ (when (or (eql x #xFFFE) (eql x #xFFFF))
+ (xerror "not a valid code point: #x~X" x))
+ (when (<= #xDC00 x #xDFFF)
+ (xerror "unexpected high surrogate: #x~X" x))
+ (when (<= #xD800 x #xDBFF)
+ ;; seen low surrogate, look for high surrogate now
+ (when (>= (%+ rptr 3) in-end)
+ (return))
+ (let* ((lo2 (aref in (%+ 2 rptr)))
+ (hi2 (aref in (%+ 3 rptr)))
+ (y (logior (ash hi2 8) lo2)))
+ (unless (<= #xDC00 x #xDFFF)
+ (xerror "expected a high surrogate but found: #x~X" x))
+ #-rune-is-utf-16
+ (progn
+ (setf x (logior (ash (%- x #xd7c0) 10) (%and y #x3FF)))
+ (setf rptr (%+ 2 rptr))))
+ ;; end of surrogate handling
+ )
+ (setf (aref out wptr) x)
(setf rptr (%+ 2 rptr))
- ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
- ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
- ;; Haelfte fehlt!
- (let ((x (logior (ash hi 8) lo)))
- (when (or (eql x #xFFFE) (eql x #xFFFF))
- (xerror "not a valid code point: #x~X" x))
- (setf (aref out wptr) x))
(setf wptr (%+ 1 wptr))))
(values wptr rptr)))
@@ -190,7 +223,8 @@
in in-start in-end out out-start out-end eof?)
(declare (optimize (speed 3) (safety 0))
(type (simple-array (unsigned-byte 8) (*)) in)
- (type (simple-array (unsigned-byte 16) (*)) out)
+ (type (simple-array #.+buffer-byte+ (*))
+ out)
(type fixnum in-start in-end out-start out-end))
(let ((wptr out-start)
(rptr in-start)
@@ -204,6 +238,7 @@
(eql x #xFFFE)
(eql x #xFFFF))
(xerror "not a valid code point: #x~X" x))
+ #+rune-is-utf-16
((%> x #xFFFF)
(setf (aref out (%+ 0 wptr)) (%+ #xD7C0 (ash x -10))
(aref out (%+ 1 wptr)) (%ior #xDC00 (%and x #x3FF)))
@@ -325,7 +360,7 @@
eof?)
(declare (optimize (speed 3) (safety 0))
(type (simple-array (unsigned-byte 8) (*)) in)
- (type (simple-array (unsigned-byte 16) (*)) out)
+ (type (simple-array #.+buffer-byte+ (*)) out)
(type fixnum in-start in-end out-start out-end))
(let ((wptr out-start)
(rptr in-start)
@@ -333,7 +368,7 @@
(table (slot-value encoding 'table)))
(declare (type fixnum wptr rptr)
(type (unsigned-byte 8) byte)
- (type (simple-array (unsigned-byte 16) (*)) table))
+ (type (simple-array #.+buffer-byte+ (*)) table))
(loop
(when (%= wptr out-end) (return))
(when (%>= rptr in-end) (return))
@@ -387,7 +422,7 @@
:name ',name
:to-unicode-table
',(make-array 256
- :element-type '(unsigned-byte 16)
+ :element-type '#.+buffer-byte+
:initial-contents codes)))
',name))
--- /project/cxml/cvsroot/closure-common/xstream.lisp 2007/10/14 21:14:08 1.8
+++ /project/cxml/cvsroot/closure-common/xstream.lisp 2007/12/22 15:19:25 1.9
@@ -83,7 +83,10 @@
`(unsigned-byte ,(integer-length array-total-size-limit)))
(deftype buffer-byte ()
- `(unsigned-byte 16))
+ #+rune-is-utf-16
+ `(unsigned-byte 16)
+ #-rune-is-utf-16
+ `(unsigned-byte 32))
(deftype octet ()
`(unsigned-byte 8))
--- /project/cxml/cvsroot/closure-common/ystream.lisp 2007/06/16 11:27:19 1.6
+++ /project/cxml/cvsroot/closure-common/ystream.lisp 2007/12/22 15:19:25 1.7
@@ -76,14 +76,17 @@
(when (plusp ptr)
(let* ((in (ystream-in-buffer ystream))
(out (ystream-out-buffer ystream))
+ #+rune-is-utf-16
(surrogatep (<= #xD800 (rune-code (elt in (1- ptr))) #xDBFF))
n)
+ #+rune-is-utf-16
(when surrogatep
(decf ptr))
(when (plusp ptr)
(setf n (runes-to-utf8 out in ptr))
(ystream-device-write ystream out n)
(cond
+ #+rune-is-utf-16
(surrogatep
(setf (elt in 0) (elt in (1- ptr)))
(setf (ystream-in-ptr ystream) 1))
@@ -98,7 +101,7 @@
(macrolet ((define-utf8-writer (name (byte &rest aux) result &body body)
`(defun ,name (out in n)
- (let ((high-surrogate nil)
+ (let (#+rune-is-utf-16 (high-surrogate nil)
,@aux)
(labels
((write0 (,byte)
@@ -134,13 +137,19 @@
(write0 (logior #b10000000 (ldb (byte 6 0) r))))))
(write2 (r)
(cond
+ #+rune-is-utf-16
((<= #xD800 r #xDBFF)
(setf high-surrogate r))
+ #+rune-is-utf-16
((<= #xDC00 r #xDFFF)
(let ((q (logior (ash (- high-surrogate #xD7C0) 10)
(- r #xDC00))))
(write1 q))
(setf high-surrogate nil))
+ #-rune-is-utf-16
+ ((<= #xD800 r #xDFFF)
+ (error
+ "surrogates not allowed in this configuration"))
(t
(write1 r)))))
(dotimes (j n)
@@ -259,7 +268,7 @@
(defun utf8-string-to-rod (str)
(let* ((bytes (map '(vector (unsigned-byte 8)) #'char-code str))
- (buffer (make-array (length bytes) :element-type '(unsigned-byte 16)))
+ (buffer (make-array (length bytes) :element-type 'buffer-byte))
(n (runes-encoding:decode-sequence
:utf-8 bytes 0 (length bytes) buffer 0 0 nil))
(result (make-array n :element-type 'rune)))