#|
Hi.
I think there is a bug in cl-ppcre (do-register-groups). I wrote
two almost identical versions of function
'dtd-attributes-to-sexp'. One with 'do-register-groups' hangs,
another which uses 'scan' works.
example:
(dtd-attributes-to-sexp
"attr-foo (foo | bar) \"foo\" attr-bar CDATA #FIXED 'zonk' attr-baz NMTOKEN #REQUIRED")
result without 'read-dtd-parens':
(("attr-foo" "(foo | bar)" "foo")
("attr-bar" :CDATA (:FIXED "zonk"))
("attr-baz" :NMTOKEN :REQUIRED))
result with 'read-dtd-parens':
(("attr-foo" (:ALTERNATION 1 "foo" "bar") "foo")
("attr-bar" :CDATA (:FIXED "zonk"))
("attr-baz" :NMTOKEN :REQUIRED))
|#
(use-package :cl-ppcre)
(defparameter *dtd-attribute-name-regex*
"([a-zA-z0-9._\\-:]+)")
(defparameter *dtd-attribute-type-regex*
"(?:(\\(.*?\\))|(CDATA)|(ID)|(IDREF)|(IDREFS)|(NMTOKEN)|(NMTOKENS)|(ENTITY)|(ENTITIES)|(NOTATION)|(xml:\\w+))")
(defparameter *dtd-attribute-default-value-regex*
"(?:('[^<>\"']*')|(\"[^<>\"]*\")|(#REQUIRED)|(#IMPLIED)|(?:#FIXED\\s+(?:('[^<>\"']*')|(\"[^<>\"]*\"))))")
;;; ---------- buggy 'dtd-attributes-to-sexp' ----------
;; Version with 'do-register-groups' macro. It works only without call
;; to 'read-dtd-parens' (which 'read-dtd...' uses 'cl-ppcre:scan').
;; 'read-dtd-parens' itself seems to be ok.
;; I think 'do-register-groups' has a bug.
;;
(defun dtd-attributes-to-sexp (target &aux result)
(do-register-groups
(a b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 c1 c2 c3 c4 c5.1 c5.2)
((format nil "\\s*~A\\s+~A\\s+~A\\s*" ; no load-time-value beause it's an example.
*dtd-attribute-name-regex*
*dtd-attribute-type-regex*
*dtd-attribute-default-value-regex*)
target (nreverse result))
(push (list a
(cond (b1 (read-dtd-parens b1)) #| *** !!!!!! *** |#
(b2 :cdata)
(b3 :id)
(b4 :idref)
(b5 :idrefs)
(b6 :nmtoken)
(b7 :nmtokens)
(b8 :entity)
(b9 :entities)
(b10 :notation)
(b11 b11))
(cond (c1 (subseq c1 1 (1- (length c1))))
(c2 (subseq c2 1 (1- (length c2))))
(c3 :required)
(c4 :implied)
(c5.1 (list :fixed (subseq c5.1 1 (1- (length c5.1)))))
(c5.2 (list :fixed (subseq c5.2 1 (1- (length c5.2)))))))
result)))
;;; ---------- 'dtd-attributes-to-sexp' that works ----------
;; Version without 'do-register-groups', i used 'scan' instead of it.
;; It works as intended.
;;
(defun dtd-attributes-to-sexp (string &aux result (start 0))
(loop
(multiple-value-bind (m-start m-end r-start r-end)
(scan (format nil "\\s*~A\\s+~A\\s+~A\\s*" ; no load-time-value beause it's an example.
*dtd-attribute-name-regex*
*dtd-attribute-type-regex*
*dtd-attribute-default-value-regex*)
string
:start start)
(declare (ignore m-start))
(push (list (subseq string (svref r-start 0) (svref r-end 0))
(case (position-if #'identity r-start :start 1)
(1 (read-dtd-parens (subseq string (svref r-start 1) (svref r-end 1))))
(2 :cdata)
(3 :id)
(4 :idref)
(5 :idrefs)
(6 :nmtoken)
(7 :nmtokens)
(8 :entity)
(9 :entities)
(10 :notation)
(11 (subseq string (svref r-start 11) (svref r-end 11))))
(case (position-if #'identity r-start :start 12)
(12 (subseq string (1+ (svref r-start 12)) (1- (svref r-end 12))))
(13 (subseq string (1+ (svref r-start 13)) (1- (svref r-end 13))))
(14 :required)
(15 :implied)
(16 (list :fixed (subseq string (1+ (svref r-start 16)) (1- (svref r-end 16)))))
(17 (list :fixed (subseq string (1+ (svref r-start 17)) (1- (svref r-end 17)))))))
result)
(when (eql m-end (length string))
(return (nreverse result)))
(setq start m-end))))
;; This function is meant to be a pareser to fragments of dtd's
;; 'element' and "attribute' tags.
;; for example:
;;
;; (read-dtd-parens "(e | f)")
;;
;; ===> (:ALTERNATION 1 "e" "f")
;;
;; or
;;
;; (read-dtd-parens "((script|style|meta|link|object)*, ((title, (script|style|meta|link|object)*, (base, (script|style|meta|link|object)*)?) | (base, (script|style|meta|link|object)*, (title, (script|style|meta|link|object)*))))")
;;
;; ===> big sexp :)
;;
(defun read-dtd-parens (string
&optional (start 0)
&aux match-begin match-end result reg type stuff (index (1+ start)))
(loop (cond ((char= #\( (char string index))
(multiple-value-setq (stuff index) (read-dtd-parens string index))
(push stuff result))
((multiple-value-setq (match-begin match-end)
(scan (load-time-value (create-scanner "^\\w+") t) string :start index))
(setq index match-end)
(if (find (char string match-end) "+*?")
(progn
(push (list (intern (string (char string match-end)))
(subseq string match-begin match-end))
result)
(incf index))
(push (subseq string match-begin match-end) result)))
((multiple-value-setq (match-begin match-end reg)
(scan (case type
(:alternation
(load-time-value (create-scanner "^\\s*\\|\\s*") t))
(:sequence
(load-time-value (create-scanner "^\\s*,\\s*") t))
('nil
(load-time-value (create-scanner "^\\s*(?:(\\|)|(,))\\s*") t)))
string :start index))
(unless type (setq type (if (svref reg 0) :alternation :sequence)))
(setq index match-end))
((char= #\) (char string index))
(setq result (nreverse result))
(return (case (if (array-in-bounds-p string (1+ index))
(char string (1+ index)))
(#\* (values (cons type (cons '* result)) (+ 2 index)))
(#\? (values (cons type (cons '? result)) (+ 2 index)))
(#\+ (values (cons type (cons '+ result)) (+ 2 index)))
(t (values (cons type (cons '1 result)) (+ 1 index)))))))))
;;; ---------- END.