Update of /project/s-xml/cvsroot/s-xml/src In directory common-lisp.net:/tmp/cvs-serv2384/src
Modified Files: xml.lisp Log Message: first version with XML namespace parsing support
Date: Thu Aug 18 16:00:50 2005 Author: scaekenberghe
Index: s-xml/src/xml.lisp diff -u s-xml/src/xml.lisp:1.7 s-xml/src/xml.lisp:1.8 --- s-xml/src/xml.lisp:1.7 Wed Aug 17 15:44:29 2005 +++ s-xml/src/xml.lisp Thu Aug 18 16:00:48 2005 @@ -1,14 +1,15 @@ ;;;; -*- mode: lisp -*- ;;;; -;;;; $Id: xml.lisp,v 1.7 2005/08/17 13:44:29 scaekenberghe Exp $ +;;;; $Id: xml.lisp,v 1.8 2005/08/18 14:00:48 scaekenberghe Exp $ ;;;; -;;;; This is a Common Lisp implementation of a very basic XML parser. -;;;; The parser is non-validating and not at all complete (no CDATA). +;;;; This is a Common Lisp implementation of a basic but usable XML parser. +;;;; The parser is non-validating and not complete (no CDATA). +;;;; Namespace and entities are handled. ;;;; The API into the parser is a pure functional parser hook model that comes from SSAX, ;;;; see also http://pobox.com/~oleg/ftp/Scheme/xml.html or http://ssax.sourceforge.net ;;;; Different DOM models are provided, an XSML, an LXML and a xml-element struct based one. ;;;; -;;;; Copyright (C) 2002, 2004 Sven Van Caekenberghe, Beta Nine BVBA. +;;;; Copyright (C) 2002, 2003, 2004, 2005 Sven Van Caekenberghe, Beta Nine BVBA. ;;;; ;;;; You are granted the rights to distribute and use this software ;;;; as governed by the terms of the Lisp Lesser General Public License @@ -134,6 +135,100 @@ (error (parser-error "encountered unknown entity &~s;" (list entity) stream))))) extendable-string)
+;;; namespace support + +(defclass xml-namespace () + ((uri :documentation "The URI used to identify this namespace" + :accessor get-uri + :initarg :uri) + (prefix :documentation "The prefix assigned to this namespace" + :accessor get-prefix + :initarg :prefix + :initform nil) + (package :documentation "The Common Lisp package where this namespace's symbols are interned" + :accessor get-package + :initarg :package + :initform nil)) + (:documentation "Describes an XML namespace and how it is handled")) + +(defvar *local-namespace* (make-instance 'xml-namespace + :uri "local" + :prefix "" + :package (find-package :keyword)) + "The local (global default) XML namespace") + +(defvar *namespaces* (list *local-namespace*) + "Ordered list of XML namespaces currently in effect") + +(defun split-identifier (identifier) + "Split an identifier 'prefix:name' and return (values prefix identifier)" + (let ((colon-position (position #: identifier :test #'char=))) + (if colon-position + (values (subseq identifier 0 colon-position) + (subseq identifier (1+ colon-position))) + (values nil identifier)))) + +(defvar *require-existing-symbols* nil + "If t, each XML identifier must exist as symbol already") + +(defvar *auto-export-symbols* t + "If t, export newly interned symbols form their packages") + +(defun resolve-identifier (identifier namespaces &optional as-attribute) + "Resolve the string identifier in the list of namespaces" + (flet ((intern-symbol (string package) + (if *require-existing-symbols* + (let ((symbol (find-symbol string package))) + (or symbol + (error "Symbol ~s does not exist in ~s" string package))) + (let ((symbol (intern string package))) + (when (and *auto-export-symbols* + (not (eql package (find-package :keyword)))) + (export symbol package)) + symbol)))) + (multiple-value-bind (prefix name) + (split-identifier identifier) + (if (or (null prefix) (string= prefix "xmlns")) + (if as-attribute + (intern (if (string= prefix "xmlns") identifier name) (get-package *local-namespace*)) + (let ((default-namespace (find "" namespaces :key #'get-prefix :test #'string-equal))) + (intern-symbol name (get-package default-namespace)))) + (let ((namespace (find prefix namespaces :key #'get-prefix :test #'string-equal))) + (if namespace + (intern-symbol name (get-package namespace)) + (error "namespace not found for prefix ~s" prefix))))))) + +(defvar *auto-create-namespace-packages* t + "If t, new packages will be created for namespaces, if needed, named by the prefix") + +(defun extend-namespaces (attributes namespaces) + "Given possible 'xmlns[:prefix]' attributes, extend namespaces" + (let (default-namespace-uri) + (loop :for (key . value) :in attributes + :do (if (string= key "xmlns") + (setf default-namespace-uri value) + (multiple-value-bind (prefix name) + (split-identifier key) + (when (string= prefix "xmlns") + (push (make-instance 'xml-namespace + :uri value + :prefix name + :package (or (find-package name) + (if *auto-create-namespace-packages* + (make-package name :nicknames (list (string-upcase name))) + (error "Cannot find or create package ~s" name)))) + namespaces))))) + (when default-namespace-uri + (let ((namespace (find default-namespace-uri namespaces :key #'get-uri :test #'string-equal))) + (if namespace + (push (make-instance 'xml-namespace + :uri (get-uri namespace) + :prefix "" + :package (get-package namespace)) + namespaces) + (error "No prefix found for default namespace ~s" default-namespace-uri))))) + namespaces) + ;;; the parser state
(defclass xml-parser-state () @@ -326,6 +421,7 @@ "Parse XML element attributes from stream positioned after the tag identifier, returning the attributes as an assoc list, ending at either a '>' or a '/' which is peeked and also returned" + (declare (special *namespaces*)) (let (char attributes) (loop ;; skip whitespace separating items @@ -333,7 +429,7 @@ ;; start tag attributes ends with > or /> (when (and char (or (char= char #>) (char= char #/))) (return)) ;; read the attribute key - (let ((key (intern (parse-identifier stream (get-mini-buffer state)) :keyword))) + (let ((key (copy-seq (parse-identifier stream (get-mini-buffer state))))) ;; skip separating whitespace (setf char (skip-whitespace stream)) ;; require = sign (and consume it if present) @@ -350,6 +446,7 @@
(defun parse-xml-element (stream state) "Parse and return an XML element from stream, positioned after the opening '<'" + (declare (special *namespaces*)) ;; opening < has been read (when (char= (peek-char nil stream nil nil) #!) (skip-special-tag stream) @@ -357,62 +454,67 @@ (let (char buffer open-tag parent-seed has-children) (setf parent-seed (get-seed state)) ;; read tag name (no whitespace between < and name ?) - (setf open-tag (intern (parse-identifier stream (get-mini-buffer state)) :keyword)) + (setf open-tag (copy-seq (parse-identifier stream (get-mini-buffer state)))) ;; tag has been read, read attributes if any (multiple-value-bind (attributes peeked-char) (parse-xml-element-attributes stream state) - (setf (get-seed state) (funcall (get-new-element-hook state) - open-tag attributes (get-seed state))) - (setf char peeked-char) - (when (char= char #/) - ;; handle solitary tag of the form <tag .. /> - (read-char stream) - (setf char (read-char stream nil nil)) - (if (char= #> char) - (progn - (setf (get-seed state) (funcall (get-finish-element-hook state) - open-tag attributes parent-seed (get-seed state))) - (return-from parse-xml-element)) - (error (parser-error "expected >" nil stream)))) - ;; consume > - (read-char stream) - (loop - (setf buffer (get-buffer state)) - ;; read whitespace into buffer - (setf char (parse-whitespace stream buffer)) - ;; see what ended the whitespace scan - (cond ((null char) (error (parser-error "encountered unexpected eof handling ~a" (list open-tag)))) - ((char= char #<) - ;; consume the < - (read-char stream) - (if (char= (peek-char nil stream nil nil) #/) - (progn - ;; handle the matching closing tag </tag> and done - ;; if we read whitespace as this (leaf) element's contents, it is significant - (when (and (not has-children) (plusp (length buffer))) - (setf (get-seed state) (funcall (get-text-hook state) - (copy-seq buffer) (get-seed state)))) - (read-char stream) - (let ((close-tag (intern (parse-identifier stream (get-mini-buffer state)) :keyword))) - (unless (eq open-tag close-tag) - (error (parser-error "found <~a> not matched by </~a> but by <~a>" - (list open-tag open-tag close-tag) stream))) - (unless (char= (read-char stream nil nil) #>) - (error (parser-error "expected >" nil stream))) - (setf (get-seed state) (funcall (get-finish-element-hook state) - open-tag attributes parent-seed (get-seed state)))) - (return)) - ;; handle child tag and loop, no hooks to call here - ;; whitespace between child elements is skipped - (progn - (setf has-children t) - (parse-xml-element stream state)))) - (t - ;; no child tag, concatenate text to whitespace in buffer - ;; handle text content and loop - (setf char (parse-text stream state buffer)) - (setf (get-seed state) (funcall (get-text-hook state) - (copy-seq buffer) (get-seed state))))))))) + (let ((*namespaces* (extend-namespaces attributes *namespaces*))) + (setf open-tag (resolve-identifier open-tag *namespaces*) + attributes (loop :for (key . value) :in attributes + :collect (cons (resolve-identifier key *namespaces* t) value))) + (setf (get-seed state) (funcall (get-new-element-hook state) + open-tag attributes (get-seed state))) + (setf char peeked-char) + (when (char= char #/) + ;; handle solitary tag of the form <tag .. /> + (read-char stream) + (setf char (read-char stream nil nil)) + (if (char= #> char) + (progn + (setf (get-seed state) (funcall (get-finish-element-hook state) + open-tag attributes parent-seed (get-seed state))) + (return-from parse-xml-element)) + (error (parser-error "expected >" nil stream)))) + ;; consume > + (read-char stream) + (loop + (setf buffer (get-buffer state)) + ;; read whitespace into buffer + (setf char (parse-whitespace stream buffer)) + ;; see what ended the whitespace scan + (cond ((null char) (error (parser-error "encountered unexpected eof handling ~a" (list open-tag)))) + ((char= char #<) + ;; consume the < + (read-char stream) + (if (char= (peek-char nil stream nil nil) #/) + (progn + ;; handle the matching closing tag </tag> and done + ;; if we read whitespace as this (leaf) element's contents, it is significant + (when (and (not has-children) (plusp (length buffer))) + (setf (get-seed state) (funcall (get-text-hook state) + (copy-seq buffer) (get-seed state)))) + (read-char stream) + (let ((close-tag (resolve-identifier (parse-identifier stream (get-mini-buffer state)) + *namespaces*))) + (unless (eq open-tag close-tag) + (error (parser-error "found <~a> not matched by </~a> but by <~a>" + (list open-tag open-tag close-tag) stream))) + (unless (char= (read-char stream nil nil) #>) + (error (parser-error "expected >" nil stream))) + (setf (get-seed state) (funcall (get-finish-element-hook state) + open-tag attributes parent-seed (get-seed state)))) + (return)) + ;; handle child tag and loop, no hooks to call here + ;; whitespace between child elements is skipped + (progn + (setf has-children t) + (parse-xml-element stream state)))) + (t + ;; no child tag, concatenate text to whitespace in buffer + ;; handle text content and loop + (setf char (parse-text stream state buffer)) + (setf (get-seed state) (funcall (get-text-hook state) + (copy-seq buffer) (get-seed state))))))))))
(defun start-parse-xml (stream &optional (state (make-instance 'xml-parser-state))) "Parse and return a toplevel XML element from stream, using parser state"