;;; -*- Mode: lisp; Syntax: ansi-common-lisp; Base: 10; Package: xml-parser; -*-
#|
the "tokenizer" for XML-encoded document is implemented in terms of a collection of
context charachter tables, one for each of the lexical contexts which stipulates
special handling for specific characters.
the table is rebound by the respective parse function to indicate the context. for each
special character, the table contains a function which transforms the input into the
appropriate result. possible results are meta symbols and entity references.
as they perform the lexical transform only, the functions need only replace the input
datum with the respetive result.
initial version
modified to work with an input ring-buffer which
holds character codes and intermediate tokenizing results.
xutils:*parsetable*
fixed parsetables for SystemId and ExternalId to recognoze
parameter entity references
added markup-context-wo-references to DoctypeDecl
context in order to support direct parse of document definitions
changes for 'fast token hashing'
|#
(in-package "XML-PARSER")
;; note that categories do not generate their own net and therefor can't bind their
;; own parsetable. note also, that a token is parsed once only. this won't work if
;; the effective token would change bepending on which bnf path is to be followed...
#|
the direct lexical contexts for quote processing are
AttValue EncodingDecl EntityValue SystemLiteral PubidLiteral VersionInfo SDDecl
these contexts enable the '-reader. from this selection, the indirect contexts are
{GEDecl/EntityDef,PEDecl/PEDef}/EntityValue
{DefaultDecl,Attribute}/AttValue
ExternalID/SystemLiteral
{PublicID,ExternalID}/PubidLiteral
{XMLDecl,TextDecl}/VersionInfo
XMLDecl/SDDecl
{XMLDecl,TextDecl}/EncodingDecl
to simplify the binding and catch lexical errors, the outer contexts serve as
quote binding contexts, as reflected in make-parser-bindings.
Attribute DefaultDecl ExternalID GEDecl PEDecl PublicID TextDecl XMLDecl
while the open and close quote themselves must be recognized in the direct contexts.
|#
(defClass xml-parsetable (parsetable) ())
(defMethod compute-token-reader ((table xml-parsetable) character)
(let ((token (intern (make-string 1 :initial-element character)
*token-package*)))
#'(lambda ()
(declare (ignore char reader arg))
(replace-input token))))
(defMethod compute-parser-macros ((property (eql 'attribute-value)))
'(( #\< . |character-error| )
( #\& . |&-reader| )))
(defMethod compute-parser-macros ((property (eql 'cdata-character-data)))
'(( #\] . |]-reader| )))
(defMethod compute-parser-macros ((property (eql 'character-data)))
'(( #\& . |&-reader| )
( #\< . |
#\[))
(defMethod compute-parser-macros ((property (eql 'decl-tag-content)))
(list #\= #\>))
(defMethod compute-parser-macros ((property (eql 'default-attribute-value)))
'(( #\% . |%-reader| )
( #\& . |&-reader| )
( #\< . |character-error| )))
(defMethod compute-parser-macros ((property (eql 'element-character-data)))
'(( #\< . |))
(defMethod compute-parser-macros ((property (eql 'ignorable-whitespace)))
(mapcar #'(lambda (c) (cons c '|ignorable-whitespace-reader|))
'(#\space #\return #\linefeed #\tab)))
(defMethod compute-parser-macros ((property (eql 'ignore)))
'(( #\< . |<-ignore-reader| )
( #\] . |]-reader| )))
(defMethod compute-parser-macros ((property (eql 'literal-whitespace)))
nil)
(defMethod compute-parser-macros ((property (eql 'markup-context)))
'(( #\& . |&-reader| )
( #\< . | . |>-if-in-tag-reader|)
( #\& . |&-if-not-in-tag-reader| )
( #\< . | . |>-reader|)))
(defMethod compute-parser-macros ((property (eql 'tag-start-and-content)))
'(#\=
( #\/ . |/-reader|)
( #\> . |>-reader|)
( #\< . | pe-ref-context quote-context tokenized-whitespace attribute-value)
(defParsetable (|CDataCharData| xml-parsetable) cdata-character-data)
(defParsetable (|CharData| xml-parsetable) character-data)
(defParsetable (|CommentCharData| xml-parsetable) comment-char-sequence)
(defParsetable (|Content| xml-parsetable) element-character-data ignorable-whitespace)
(defParsetable (|ContentSpec| xml-parsetable) expression-context tokenized-whitespace)
; the AttValue as a whole recognizes %, but the text itself does not
(defParsetable (|DefaultAttChild| xml-parsetable) quote-context attribute-value)
(defParsetable (|DefaultAttValue| xml-parsetable) quote-context tokenized-whitespace default-attribute-value)
(defParsetable (|Document| xml-parsetable) markup-context-wo-references tokenized-whitespace)
(defParsetable (|DoctypeDecl| xml-parsetable) #\] #\[ #\> markup-context-wo-references tokenized-whitespace)
;; (defParsetable (|Element| xml-parsetable) element-character-data)
;; (defParsetable (|EmptyElemTag| xml-parsetable) maybe-tag-content tokenized-whitespace-in-tag)
(defParsetable (|EncodingDecl| xml-parsetable) ?-tag-content decl-tag-content quote-context tokenized-whitespace)
;; (defParsetable (|EntityChild| xml-parsetable) quote-context entity-value)
(defParsetable (|EntityData| xml-parsetable) quote-context entity-value)
(defParsetable (|EntityValue| xml-parsetable) quote-context tokenized-whitespace entity-value)
(defParsetable (|Eq| xml-parsetable) quote-context #\=)
;;(defParsetable (|ETag| xml-parsetable) maybe-tag-content tokenized-whitespace-in-tag)
;;(defParsetable (|ETag| xml-parsetable) tag-start-and-content tokenized-whitespace)
(defParsetable (|ETag| xml-parsetable) tag-content tokenized-whitespace)
(defParsetable (|ExternalID| xml-parsetable) #\[ decl-tag-content quote-context tokenized-whitespace pe-ref-context)
(defParsetable (|ExtParsedEnt| xml-parsetable) element-character-data ignorable-whitespace)
(defParsetable (|ExtSubsetDecl| xml-parsetable) conditional-context expression-context quote-context markup-decl-context tokenized-whitespace)
(defParsetable (|IgnoreSectContents| xml-parsetable) ignore)
(defParsetable (|IntSubsetDecl| xml-parsetable) expression-context quote-context markup-decl-context tokenized-whitespace quote-context)
;; (defParsetable (|MarkupDeclOrPEReference| markup-decl-context tokenized-whitespace)
(defParsetable (|Pi| xml-parsetable) ?-tag-content tokenized-whitespace)
(defParsetable (|PiCharData| xml-parsetable) ?-tag-content)
(defParsetable (|PubidCharData| xml-parsetable) quote-context )
;; (defParsetable (|PubidLiteral| xml-parsetable) decl-tag-content quote-context tokenized-whitespace)
(defParsetable (|PublicID| xml-parsetable) decl-tag-content quote-context tokenized-whitespace pe-ref-context)
(defParsetable (|SDDecl| xml-parsetable) ?-tag-content decl-tag-content quote-context tokenized-whitespace)
;;(defParsetable (|STag| xml-parsetable) maybe-tag-content tokenized-whitespace-in-tag)
;;(defParsetable (|STag| xml-parsetable) tag-start-and-content tokenized-whitespace)
(defParsetable (|STag| xml-parsetable) tag-content tokenized-whitespace)
(defParsetable (|SystemCharData| xml-parsetable) quote-context )
;; (defParsetable (|SystemLiteral| xml-parsetable) decl-tag-content quote-context tokenized-whitespace)
(defParsetable (|TextDecl| xml-parsetable) ?-tag-content decl-tag-content markup-decl-context markup-context) ; tokenized-whitespace)
(defParsetable (|VersionInfo| xml-parsetable) ?-tag-content decl-tag-content quote-context tokenized-whitespace)
(defParsetable (|XMLDecl| xml-parsetable) ?-tag-content decl-tag-content markup-context tokenized-whitespace)
;; nb. this tokenizer interns markup tokens only. as they are recognized and replaced,
;; there is no need to bind the the token package.
;; nb.2. the macros are applied as soon as they are seen. this is in order that they
;; have the specified effect in the context in which they are recognized. if they were
;; to break only, then the interpretation might be deferred into a substructure's
;; lexical context.
;; nb.3. accumulated text is not built in a string. it is consed to a list which remains
;; local to the tokenizer. the result is extracted either as as string - in the
;; case of character data, or as a list - in the case of entity definitions. these strings
;; are managed in a pool, to which the constructors return strings where they denote
;; existing interned names.
;; parse process. these strings will be mostly names for tags and attributes, attribute values, or
;; character content.
;; while reading in the dtd, numerous keywords are left as text, are recognized by string
;; comparison, and are subsequently discarded. which entails a certain amount of garbage.
;; within the root element itself, on the other hand, all keywords are expressed in the grammar
;; as interned tokens and mapped to same by the reader macros. all other text is denotes tag or
;; attribute names or is character data. by yielding the buffer to the constructors, they can
;; determine whether to construct a string for the data, or to find (or generate) an interned
;; name, and thereby ovoid a consed string.
#|
the token reader iterates over the input buffer to construct a token.
the token extent is noted and the content extracted first when a boundary is hit.
the reader's state takes the form of four pointers
token-start : the first byte of the present token
token-fill : the position to add the next byte to the token
input-start : the position of the next input byte
input-fill : the position into which stream bytes are placed.
the boundary states are:
input-start == fill-fill : read stream bytes
input-fill == token-start : input stops
token-start is initially == token-fill == input-start
when a macro is recognized, it is invoked to operate on the buffer content.
the result can be that it extends the token, replaces the input, or skips the input.
the replacement may be a symbol, for a markup token, or a content or dtd declaration.
|#
(defun token-reader
(&aux datum)
(unless *parsetable*
(error "no parsetable."))
(let* ((macros (table.macros *parsetable*))
(macro-table-size (length macros))
(macro nil))
(declare (optimize (speed 3) (safety 0))
(type fixnum position macro-table-size)
(type simple-vector macros)
(ftype (function (t) t) parse-external-subset parse-external-general-entity))
(when bnfp::*atn-trace*
(format *trace-output* "~% (token-reader: ~@[in context ~s~] ~s)" *parsetable* bnfp::*atn-stack))
(loop
; (format *trace-output* "(input ~s) " *input-datum*)
(setf datum (get-input))
(typecase datum
;; raw input appears as a fixnum character code
(fixnum
(setf macro (when (< (the fixnum datum) macro-table-size)
(svref macros (the fixnum datum))))
(cond (macro
(when bnfp::*atn-trace*
(format *trace-output* " (dispatching with ~s using ~a(~s)->~a)"
(peek-token-string) (table.name *parsetable*)
(code-char datum) macro))
(funcall macro))
(t
(extend-token (eol-normalize-input datum))
(advance-input))))
;; reader macros replace input with token symbols
(symbol
(return (if *token-end*
(make-token)
(progn (advance-input) datum))))
;; separate multiple values
(cons
(return (if *token-end*
(make-token)
(progn (replace-input (rest datum))
(first datum)))))
(string
(advance-input)
(return datum))
;; internal entities are inserted in place, whereby a parameter entity
;; may be bracketed with spaces. where the replacement text is one byte long,
;; add it directly to the token so that markup is not recognized
(def-internal-entity
;; (print (list *atn-term* bnfp::*atn-stack))
(when (and (is-def-parameter-entity datum)
xutils::|REC-xml-19980210.PEs in Internal Subset|
*in-internal-subset*
(not *in-decl-sep*))
(xml-error |WFC: PEs in Internal Subset| :name (name datum)))
(when (children datum)
;; a single byte is appended directly to the token...
(if (> (length (children datum)) 1)
(push-input-source datum)
(extend-token (first (children datum)))))
(advance-input))
;; external entites are parsed and inserted in-line if not in a literal
(def-external-parameter-entity
(unless (in-dtd?)
(xml-error |WFC: In DTD| :node datum))
(if (in-entity-value?) ;; load the external entity data
(progn (push-input-source datum)
(advance-input))
(return (if *token-end*
(make-token)
(progn (advance-input)
(handler-case (parse-external-subset datum)
(error (error)
(|XML-ERROR-WFC: External Subset|
:system-id (system-id datum)
:public-id (public-id datum) :datum error))))))))
(def-external-general-entity
(if *token-end*
(return (make-token))
(progn (when (notation datum)
(|XML-ERROR-WFC: Parsed Entity| :name (name datum)))
(advance-input)
(return (handler-case (parse-external-general-entity datum)
(error (error)
(|XML-ERROR-WFC: External Parsed Entity|
:system-id (system-id datum)
:public-id (public-id datum) :datum error)))))))
;; null input implies end-of-file, which yields an in-progress string or NIL
(null (return (make-token)))
(ref-entity
(return (if *token-end*
(make-token)
(progn (advance-input) datum))))
(t
(warn "unrecognized input ignored: ~s." datum)
(advance-input))))))
#|
(defmacro tt (node string)
`(with-parsetable (or (parsetable ',node) (error "no readtable: ~s." ',node))
(with-ring-buffer (64)
(let ((stream (make-instance 'vector-input-stream :vector ,string))
(results nil)
(token nil))
(setf *input-fill* (fill-ring-buffer stream *input-fill* *token-start*))
(handler-case (loop (unless (setf token (token-reader stream)) (return (reverse results)))
(push token results))
(end-of-file (condition)
(declare (ignore condition))
(push `(*EOF* ,(make-token)) results)
(reverse results)))))))
(tt |Attribute| "asdf='zzzz'/>") ;; ok that quote not recognized
(let ((*quote-token* '|xml|:|"|)) (tt |AttChild| "some text' with a general&entity;in it\""))
(let ((*quote-token* '|xml|:|"|)) (tt |AttChild| "&entity;in it\""))
(let ((*quote-token* '|xml|:|"|)) (tt |AttChild| "some text' with a general&entity;\""))
(let ((*quote-token* '|xml|:|"|)) (tt |AttChild| "some text' with a char ref
in it\""))
(let ((*quote-token* t)) (tt |AttValue| "\"some-text-without-spaces\""))
(tt |AttDef| "elementName attName CDATA >")
(tt |CDataCharData| "asasas ][ aasas[ ]] asasas ]]< ]]a ]] > ]]>")
(tt |CharData| "< ")
(tt |CharData| "asasasas< ")
(tt |CharData| "&entity;< ")
(tt |CommentCharData| "aaas - asas - - - -> -->")
(tt |Content| "asasasa&entity; ] >")
(let ((*quote-token* t)) (tt |EncodingDecl| "encoding='asasas'"))
(let ((*quote-token* t)) (tt |EntityChild| "'before%entity;after¬Recognized;'"))
(let ((*quote-token* t)) (tt |EntityValue| "'break on space &asaas; %as;qwqw'"))
(let ((*quote-token* t)) (tt |Eq| "-'asas'"))
(tt |ETag| "asdf/> ")
(tt |ExtSubsetDecl| " ]]>")
(tt |ExtSubsetDecl| "INCLUDE[ ]]>")
(tt |ExtSubsetDecl| "[ ]]>")
(tt |Pi| "sometarget 1 2 3 4 ? version< with more > stuff ?>")
(tt |PiCharData| "sometarget 1 2 3 4 ? version< with more > stuff ?>")
(let ((*quote-token* t)) (tt |PubidCharData| "\"some text with spaces\""))
(let ((*quote-token* t)) (tt |PubidLiteral| "\"some-text-without-spaces\""))
(let ((*quote-token* t)) (tt |SDDecl| "standalone='yes'"))
(tt |STag| "qwer asdf='zzzz'/> ")
(let ((*quote-token* t)) (tt |SystemCharData| "\"some text with spaces\""))
(let ((*quote-token* t)) (tt |SystemLiteral| "\"some-text-without-spaces\""))
(let ((*quote-token* t)) (tt |VersionInfo| "version='1.*'"))
(tt |XMLDecl| "")
|#
:EOF