Author: eweitz Date: Sat May 24 21:43:56 2008 New Revision: 54
Modified: branches/edi/decode.lisp branches/edi/doc/index.html branches/edi/packages.lisp branches/edi/specials.lisp branches/edi/strings.lisp branches/edi/util.lisp Log: Compute decoding length
Modified: branches/edi/decode.lisp ============================================================================== --- branches/edi/decode.lisp (original) +++ branches/edi/decode.lisp Sat May 24 21:43:56 2008 @@ -1,5 +1,5 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/flexi-streams/decode.lisp,v 1.16 2008/05/20 23:01:50 edi Exp $ +;;; $Header: /usr/local/cvsrep/flexi-streams/decode.lisp,v 1.18 2008/05/25 01:42:50 edi Exp $
;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved.
@@ -29,6 +29,234 @@
(in-package :flexi-streams)
+(defgeneric compute-number-of-chars (format sequence start end) + (declare #.*standard-optimize-settings*) + (:documentation "Computes the exact number of characters required to +decode the sequence of octets in SEQUENCE from START to END using the +external format FORMAT.")) + +(defmethod compute-number-of-chars :around (format (list list) start end) + (declare #.*standard-optimize-settings*) + (call-next-method format (coerce list 'vector) start end)) + +(defmethod compute-number-of-chars ((format flexi-8-bit-format) sequence start end) + (declare #.*fixnum-optimize-settings*) + (declare (fixnum start end)) + (declare (ignore sequence)) + (- end start)) + +(defmethod compute-number-of-chars ((format flexi-crlf-8-bit-format) sequence start end) + (declare #.*fixnum-optimize-settings*) + (declare (fixnum start end)) + (let ((i start) + (length (- end start))) + (declare (fixnum i length)) + (loop + (when (>= i end) + (return)) + (let ((position (search #.(vector +cr+ +lf+) sequence :start2 i :end2 end :test #'=))) + (unless position + (return)) + (setq i (1+ position)) + (decf length))) + length)) + +(defun check-end (format start end i) + (declare #.*fixnum-optimize-settings*) + (declare (fixnum start end i)) + (unless (= i end) + (signal-encoding-error format "These ~A octet~:P can't be ~ +decoded using ~A as the sequence is too short. ~A octet~:P ~ +missing at then end." + (- end start) + (external-format-name format) + (- i end)))) + +(defmethod compute-number-of-chars ((format flexi-utf-8-format) sequence start end) + (declare #.*fixnum-optimize-settings*) + (declare (fixnum start end)) + (let ((sum 0) + (i start)) + (declare (fixnum i sum)) + (loop + (when (>= i end) + (return)) + (let* ((octet (aref sequence i)) + (length (cond ((not (logbitp 7 octet)) 1) + ((= #b11000000 (logand octet #b11100000)) 2) + ((= #b11100000 (logand octet #b11110000)) 3) + (t 4)))) + (declare (fixnum length) (octet octet)) + (incf sum) + (incf i length))) + (check-end format start end i) + sum)) + +(defmethod compute-number-of-chars ((format flexi-crlf-utf-8-format) sequence start end) + (declare #.*fixnum-optimize-settings*) + (declare (fixnum start end)) + (let ((sum 0) + (i start) + (last-octet 0)) + (declare (fixnum i sum) (octet last-octet)) + (loop + (when (>= i end) + (return)) + (let* ((octet (aref sequence i)) + (length (cond ((not (logbitp 7 octet)) 1) + ((= #b11000000 (logand octet #b11100000)) 2) + ((= #b11100000 (logand octet #b11110000)) 3) + (t 4)))) + (declare (fixnum length) (octet octet)) + (unless (and (= octet +lf+) (= last-octet +cr+)) + (incf sum)) + (incf i length) + (setq last-octet octet))) + (check-end format start end i) + sum)) + +(defmethod compute-number-of-chars :before ((format flexi-utf-16-format) sequence start end) + (declare #.*fixnum-optimize-settings*) + (declare (fixnum start end)) + (unless (evenp (- end start)) + (signal-encoding-error format "~A octet~:P cannot be decoded using ~ +UTF-16 as ~:*~A is not even." + (- end start)))) + +(defmethod compute-number-of-chars ((format flexi-utf-16-le-format) sequence start end) + (declare #.*fixnum-optimize-settings*) + (declare (fixnum start end)) + (let ((sum 0) + (i start)) + (declare (fixnum i sum)) + (loop + (when (>= i end) + (return)) + (let* ((high-octet (aref sequence (1+ i))) + (length (cond ((<= #xd8 high-octet #xdf) 4) + (t 2)))) + (declare (fixnum length) (octet high-octet)) + (incf sum) + (incf i length))) + (check-end format start end i) + sum)) + +(defmethod compute-number-of-chars ((format flexi-utf-16-be-format) sequence start end) + (declare #.*fixnum-optimize-settings*) + (declare (fixnum start end)) + (let ((sum 0) + (i start)) + (declare (fixnum i sum)) + (loop + (when (>= i end) + (return)) + (let* ((high-octet (aref sequence i)) + (length (cond ((<= #xd8 high-octet #xdf) 4) + (t 2)))) + (declare (fixnum length) (octet high-octet)) + (incf sum) + (incf i length))) + (check-end format start end i) + sum)) + +(defmethod compute-number-of-chars ((format flexi-crlf-utf-16-le-format) sequence start end) + (declare #.*fixnum-optimize-settings*) + (declare (fixnum start end)) + (let ((sum 0) + (i start) + (last-octet 0)) + (declare (fixnum i sum) (octet last-octet)) + (loop + (when (>= i end) + (return)) + (let* ((high-octet (aref sequence (1+ i))) + (length (cond ((<= #xd8 high-octet #xdf) 4) + (t 2)))) + (declare (fixnum length) (octet high-octet)) + (unless (and (zerop high-octet) + (= (the octet (aref sequence i)) +lf+) + (= last-octet +cr+)) + (incf sum)) + (incf i length) + (setq last-octet (if (zerop high-octet) + (aref sequence i) + 0)))) + (check-end format start end i) + sum)) + +(defmethod compute-number-of-chars ((format flexi-crlf-utf-16-be-format) sequence start end) + (declare #.*fixnum-optimize-settings*) + (declare (fixnum start end)) + (let ((sum 0) + (i start) + (last-octet 0)) + (declare (fixnum i sum) (octet last-octet)) + (loop + (when (>= i end) + (return)) + (let* ((high-octet (aref sequence i)) + (length (cond ((<= #xd8 high-octet #xdf) 4) + (t 2)))) + (declare (fixnum length) (octet high-octet)) + (unless (and (zerop high-octet) + (= (the octet (aref sequence (1+ i))) +lf+) + (= last-octet +cr+)) + (incf sum)) + (incf i length) + (setq last-octet (if (zerop high-octet) + (aref sequence (1+ i)) + 0)))) + (check-end format start end i) + sum)) +(defmethod compute-number-of-chars :before ((format flexi-utf-32-format) sequence start end) + (declare #.*fixnum-optimize-settings*) + (declare (fixnum start end)) + (let ((length (- end start))) + (unless (zerop (mod length 4)) + (signal-encoding-error format "~A octet~:P cannot be decoded using ~ +UTF-32 as ~:*~A is not a multiple-value of four." + length)))) + +(defmethod compute-number-of-chars ((format flexi-utf-32-format) sequence start end) + (declare #.*fixnum-optimize-settings*) + (declare (fixnum start end)) + (declare (ignore sequence)) + (/ (- end start) 4)) + +(defmethod compute-number-of-chars ((format flexi-crlf-utf-32-le-format) sequence start end) + (declare #.*fixnum-optimize-settings*) + (declare (fixnum start end)) + (let ((i start) + (length (/ (- end start) 4))) + (decf end 8) + (loop + (when (> i end) + (return)) + (cond ((loop for j of-type fixnum from i + for octet across #.(vector +cr+ 0 0 0 +lf+ 0 0 0) + always (= octet (aref sequence j))) + (decf length) + (incf i 8)) + (t (incf i 4)))) + length)) + +(defmethod compute-number-of-chars ((format flexi-crlf-utf-32-be-format) sequence start end) + (declare #.*fixnum-optimize-settings*) + (declare (fixnum start end)) + (let ((i start) + (length (/ (- end start) 4))) + (decf end 8) + (loop + (when (> i end) + (return)) + (cond ((loop for j of-type fixnum from i + for octet across #.(vector 0 0 0 +cr+ 0 0 0 +lf+) + always (= octet (aref sequence j))) + (decf length) + (incf i 8)) + (t (incf i 4)))) + length)) + (defun recover-from-encoding-error (external-format format-control &rest format-args) "Helper function used by OCTETS-TO-CHAR-CODE below to deal with encoding errors. Checks if *SUBSTITUTION-CHAR* is not NIL and returns @@ -242,7 +470,7 @@ (declare (ignore reader)) (let ((char-code (call-next-method))) (case char-code - (#.(char-code #\Return) #.(char-code #\Newline)) + (#.+cr+ #.(char-code #\Newline)) (otherwise char-code))))
(defmethod octets-to-char-code ((format flexi-crlf-mixin) reader) @@ -251,13 +479,13 @@ (declare (ignore reader)) (let ((char-code (call-next-method))) (case char-code - (#.(char-code #\Return) + (#.+cr+ (let ((next-char-code (call-next-method))) (case next-char-code - (#.(char-code #\Linefeed) #.(char-code #\Newline)) + (#.+lf+ #.(char-code #\Newline)) ;; we saw a CR but no LF afterwards, but then the data ;; ended, so we just return #\Return - ((nil) #.(char-code #\Return)) + ((nil) +cr+) ;; if the character we peeked at wasn't a ;; linefeed character we unread its constituents (otherwise (funcall *current-unreader* (code-char next-char-code))
Modified: branches/edi/doc/index.html ============================================================================== --- branches/edi/doc/index.html (original) +++ branches/edi/doc/index.html Sat May 24 21:43:56 2008 @@ -116,6 +116,7 @@ <li><a href="#string-to-octets"><code>string-to-octets</code></a> <li><a href="#octets-to-string"><code>octets-to-string</code></a> <li><a href="#octet-length"><code>octet-length</code></a> + <li><a href="#char-length"><code>char-length</code></a> </ol> </ol> <li><a href="#position">File positions</a> @@ -1005,16 +1006,30 @@ </blockquote>
<p><br>[Function] -<br><a class=none name="octet-length"><b>octet-length</b> <i>string <tt>&key</tt> external-format start end</i> => <i>length-or-nil</i></a> +<br><a class=none name="octet-length"><b>octet-length</b> <i>string <tt>&key</tt> external-format start end</i> => <i>length</i></a>
<blockquote><br>
Returns the length of the substring of <code><i>string</i></code> from <code><i>start</i></code> to <code><i>end</i></code> in <a href="#octet">octets</a> if encoded using the <a href="#external-formats">external format</a> designated -by <code><i>external-format</i></code>. Might return <code>NIL</code> -if there's no efficient way to compute the length without iterating -through the whole string. +by <code><i>external-format</i></code>. +The defaults for +<code><i>start</i></code> and <code><i>end</i></code> +are <code>0</code> and the length of the string. The default +for <code><i>external-format</i></code> is <code>:LATIN1</code>. +</blockquote> + +<p><br>[Function] +<br><a class=none name="char-length"><b>char-length</b> <i>sequence <tt>&key</tt> external-format start end</i> => <i>length</i></a> + +<blockquote><br> + +Kind of the inverse of <a href="#octet-length"><code>OCTET-LENGTH</code></a>. +Returns the length of the subsequence (of <a href="#octet">octets</a>) of <code><i>sequence</i></code> from <code><i>start</i></code> to <code><i>end</i></code> in +characters if decoded using +the <a href="#external-formats">external format</a> designated +by <code><i>external-format</i></code>. The defaults for <code><i>start</i></code> and <code><i>end</i></code> are <code>0</code> and the length of the sequence. The default @@ -1060,7 +1075,7 @@ his work on making FLEXI-STREAMS faster.
<p> -$Header: /usr/local/cvsrep/flexi-streams/doc/index.html,v 1.111 2008/05/23 14:56:47 edi Exp $ +$Header: /usr/local/cvsrep/flexi-streams/doc/index.html,v 1.112 2008/05/25 01:41:25 edi Exp $ <p><a href="http://weitz.de/index.html">BACK TO MY HOMEPAGE</a>
</body>
Modified: branches/edi/packages.lisp ============================================================================== --- branches/edi/packages.lisp (original) +++ branches/edi/packages.lisp Sat May 24 21:43:56 2008 @@ -1,5 +1,5 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/flexi-streams/packages.lisp,v 1.35 2008/05/21 01:43:43 edi Exp $ +;;; $Header: /usr/local/cvsrep/flexi-streams/packages.lisp,v 1.36 2008/05/25 01:40:54 edi Exp $
;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved.
@@ -40,6 +40,7 @@ (:export :*default-eol-style* :*default-little-endian* :*substitution-char* + :char-length :external-format-eol-style :external-format-error :external-format-error-external-format
Modified: branches/edi/specials.lisp ============================================================================== --- branches/edi/specials.lisp (original) +++ branches/edi/specials.lisp Sat May 24 21:43:56 2008 @@ -1,5 +1,5 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/flexi-streams/specials.lisp,v 1.32 2008/05/20 23:01:51 edi Exp $ +;;; $Header: /usr/local/cvsrep/flexi-streams/specials.lisp,v 1.33 2008/05/25 01:40:54 edi Exp $
;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved.
@@ -49,6 +49,10 @@ "Like *STANDARD-OPTIMIZE-SETTINGS*, but (on LispWorks) with all arithmetic being fixnum arithmetic.")
+(defconstant +lf+ (char-code #\Linefeed)) + +(defconstant +cr+ (char-code #\Return)) + (defvar *current-unreader* nil "A unary function which might be called to `unread' a character (i.e. the sequence of octets it represents).
Modified: branches/edi/strings.lisp ============================================================================== --- branches/edi/strings.lisp (original) +++ branches/edi/strings.lisp Sat May 24 21:43:56 2008 @@ -1,5 +1,5 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/flexi-streams/strings.lisp,v 1.24 2008/05/24 23:15:25 edi Exp $ +;;; $Header: /usr/local/cvsrep/flexi-streams/strings.lisp,v 1.26 2008/05/25 01:41:32 edi Exp $
;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved.
@@ -116,12 +116,17 @@
(defun octet-length (string &key (external-format :latin1) (start 0) (end (length string))) "Returns the length of the substring of STRING from START to END in -octets if encoded using the external format EXTERNAL-FORMAT. Might -return NIL if there's no efficient way to compute the length without -iterating through the whole string." +octets if encoded using the external format EXTERNAL-FORMAT." (declare #.*standard-optimize-settings*) (declare (fixnum start end) (string string)) (setq external-format (maybe-convert-external-format external-format)) - (let ((factor (encoding-factor external-format))) - (typecase factor - (fixnum (* factor (- end start)))))) + (compute-number-of-octets external-format string start end)) + +(defun char-length (sequence &key (external-format :latin1) (start 0) (end (length sequence))) + "Kind of the inverse of OCTET-LENGTH. Returns the length of the +subsequence (of octets) of SEQUENCE from START to END in characters +if decoded using the external format EXTERNAL-FORMAT." + (declare #.*standard-optimize-settings*) + (declare (fixnum start end) (string string)) + (setq external-format (maybe-convert-external-format external-format)) + (compute-number-of-chars external-format sequence start end))
Modified: branches/edi/util.lisp ============================================================================== --- branches/edi/util.lisp (original) +++ branches/edi/util.lisp Sat May 24 21:43:56 2008 @@ -1,5 +1,5 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/flexi-streams/util.lisp,v 1.21 2008/05/20 23:44:45 edi Exp $ +;;; $Header: /usr/local/cvsrep/flexi-streams/util.lisp,v 1.22 2008/05/25 01:40:54 edi Exp $
;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved.
@@ -115,7 +115,7 @@ (unless (find real-name +name-map+ :test #'eq :key #'cdr) - (error 'external-format-error + (error 'external-format-simple-error :format-control "~S is not known to be a name for an external format." :format-arguments (list name))) real-name))