diff options
Diffstat (limited to 'lisp/nxml/nxml-parse.el')
-rw-r--r-- | lisp/nxml/nxml-parse.el | 323 |
1 files changed, 323 insertions, 0 deletions
diff --git a/lisp/nxml/nxml-parse.el b/lisp/nxml/nxml-parse.el new file mode 100644 index 00000000000..267c18cf887 --- /dev/null +++ b/lisp/nxml/nxml-parse.el @@ -0,0 +1,323 @@ +;;; nxml-parse.el --- XML parser, sharing infrastructure with nxml-mode + +;; Copyright (C) 2003, 2007 Free Software Foundation, Inc. + +;; Author: James Clark +;; Keywords: XML + +;; This file is part of GNU Emacs. + +;; GNU Emacs is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. + +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs; see the file COPYING. If not, write to the +;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +;; Boston, MA 02110-1301, USA. + +;;; Commentary: + +;; Entry point is `nxml-parse-file'. + +;;; Code: + +(require 'nxml-util) +(require 'xmltok) +(require 'nxml-enc) +(require 'nxml-ns) + +(defvar nxml-parse-file-name nil) + +(defvar nxml-validate-function nil + "Nil or a function to be called by `nxml-parse-file' to perform validation. +The function will be called once for each start-tag or end-tag. The +function is passed two arguments TEXT and START-TAG. For a start-tag, +START-TAG is a list (NAME ATTRIBUTES) where NAME and ATTRIBUTES are in +the same form as returned by `nxml-parse-file. For an end-tag, +START-TAG is nil. TEXT is a string containing the text immediately +preceding the tag, or nil if there was no such text. An empty element +is treated as a start-tag followed by an end-tag. + +For a start-tag, the namespace state will be the state after +processing the namespace declarations in the start-tag. For an +end-tag, the namespace state will be the state before popping the +namespace declarations for the corresponding start-tag. + +The function must return nil if no error is detected or a +cons (MESSAGE . LOCATION) where MESSAGE is a string containing +an error message and LOCATION indicates what caused the error +as follows: + +- nil indicates the tag as whole caused it; this is always allowed; + +- text indicates the text caused it; this is allowed only if +TEXT is non-nil; + +- tag-close indicates the close of the tag caused it; this is +allowed only if START-TAG is non-nil; + +- (attribute-name . N) indicates that the name of the Nth attribute +caused it; N counts from 0; this is allowed only if START-TAG is non-nil +and N must be less than the number of attributes; + +- (attribute-value . N) indicates that the value of the Nth attribute +caused it; N counts from 0; this is allowed only if START-TAG is non-nil +and N must be less than the number of attributes.") + +(defun nxml-parse-file (file) + "Parse the XML document in FILE and return it as a list. +An XML element is represented as a list (NAME ATTRIBUTES . CHILDREN). +NAME is either a string, in the case where the name does not have a +namespace, or a cons (NAMESPACE . LOCAL-NAME), where NAMESPACE is a +symbol and LOCAL-NAME is a string, in the case where the name does +have a namespace. NAMESPACE is a keyword whose name is `:URI', where +URI is the namespace name. ATTRIBUTES is an alist of attributes where +each attribute has the form (NAME . VALUE), where NAME has the same +form as an element name, and VALUE is a string. A namespace +declaration is represented as an attribute whose name is +\(:http://www.w3.org/2000/xmlns/ . LOCAL-NAME). CHILDREN is a list +containing strings and child elements; CHILDREN never contains two +consecutive strings and never contains an empty string. Processing +instructions and comments are not represented. The return value is a +list representing the document element. + +If the XML document is not well-formed, an error having the condition +`nxml-file-parse-error' will be signaled; the error data will be a +list of the \(FILE POSITION MESSAGE), where POSITION is an integer +specifying the position where the error was detected, and MESSAGE is a +string describing the error. + +The current contents of FILE will be parsed even if there is a +modified buffer currently visiting FILE. + +If the variable `nxml-validation-function' is non-nil, it will be +called twice for each element, and any reported error will be signaled +in the same way as well-formedness error." + (save-excursion + (set-buffer (nxml-parse-find-file file)) + (unwind-protect + (let ((nxml-parse-file-name file)) + (nxml-parse-instance)) + (kill-buffer nil)))) + +(defun nxml-parse-find-file (file) + (save-excursion + (set-buffer (get-buffer-create " *nXML Parse*")) + (erase-buffer) + (let ((set-auto-coding-function 'nxml-set-xml-coding)) + (insert-file-contents file)) + (current-buffer))) + +(defun nxml-parse-instance () + (let (xmltok-dtd) + (xmltok-save + (xmltok-forward-prolog) + (nxml-check-xmltok-errors) + (nxml-ns-save + (nxml-parse-instance-1))))) + +(defun nxml-parse-instance-1 () + (let* ((top (cons nil nil)) + ;; tail is a cons cell, whose cdr is nil + ;; additional elements will destructively appended to tail + (tail top) + ;; stack of tails one for each open element + tail-stack + ;; list of QNames of open elements + open-element-tags + ;; list of strings buffering a text node, in reverse order + text + ;; position of beginning of first (in buffer) string in text + text-pos) + (while (xmltok-forward) + (nxml-check-xmltok-errors) + (cond ((memq xmltok-type '(start-tag end-tag empty-element)) + (when text + (setq text (apply 'concat (nreverse text))) + (setcdr tail (cons text nil)) + (setq tail (cdr tail))) + (when (not (eq xmltok-type 'end-tag)) + (when (and (not open-element-tags) + (not (eq tail top))) + (nxml-parse-error nil "Multiple top-level elements")) + (setq open-element-tags + (cons (xmltok-start-tag-qname) + open-element-tags)) + (nxml-ns-push-state) + (let ((tag (nxml-parse-start-tag))) + (nxml-validate-tag text text-pos tag) + (setq text nil) + (setcdr tail (cons tag nil)) + (setq tail (cdr tail)) + (setq tail-stack (cons tail tail-stack)) + (setq tail (last tag)))) + (when (not (eq xmltok-type 'start-tag)) + (or (eq xmltok-type 'empty-element) + (equal (car open-element-tags) + (xmltok-end-tag-qname)) + (if open-element-tags + (nxml-parse-error nil + "Unbalanced end-tag; expected </%s>" + (car open-element-tags)) + (nxml-parse-error nil "Extra end-tag"))) + (nxml-validate-tag text text-pos nil) + (setq text nil) + (nxml-ns-pop-state) + (setq open-element-tags (cdr open-element-tags)) + (setq tail (car tail-stack)) + (setq tail-stack (cdr tail-stack))) + (setq text-pos nil)) + ((memq xmltok-type '(space data entity-ref char-ref cdata-section)) + (cond (open-element-tags + (unless text-pos + (setq text-pos xmltok-start)) + (setq text + (cons (nxml-current-text-string) text))) + ((not (eq xmltok-type 'space)) + (nxml-parse-error + nil + "%s at top-level" + (cdr (assq xmltok-type + '((data . "Text characters") + (entity-ref . "Entity reference") + (char-ref . "Character reference") + (cdata-section . "CDATA section")))))))))) + (unless (cdr top) + (nxml-parse-error (point-max) "Missing document element")) + (cadr top))) + +(defun nxml-parse-start-tag () + (let (parsed-attributes + parsed-namespace-attributes + atts att prefixes prefix ns value name) + (setq atts xmltok-namespace-attributes) + (while atts + (setq att (car atts)) + (setq value (or (xmltok-attribute-value att) + (nxml-parse-error nil "Invalid attribute value"))) + (setq ns (nxml-make-namespace value)) + (setq prefix (and (xmltok-attribute-prefix att) + (xmltok-attribute-local-name att))) + (cond ((member prefix prefixes) + (nxml-parse-error nil "Duplicate namespace declaration")) + ((not prefix) + (nxml-ns-set-default ns)) + (ns + (nxml-ns-set-prefix prefix ns)) + (t (nxml-parse-error nil "Cannot undeclare namespace prefix"))) + (setq prefixes (cons prefix prefixes)) + (setq parsed-namespace-attributes + (cons (cons (nxml-make-name nxml-xmlns-namespace-uri + (xmltok-attribute-local-name att)) + value) + parsed-namespace-attributes)) + (setq atts (cdr atts))) + (setq name + (nxml-make-name + (let ((prefix (xmltok-start-tag-prefix))) + (if prefix + (or (nxml-ns-get-prefix prefix) + (nxml-parse-error (1+ xmltok-start) + "Prefix `%s' undeclared" + prefix)) + (nxml-ns-get-default))) + (xmltok-start-tag-local-name))) + (setq atts xmltok-attributes) + (while atts + (setq att (car atts)) + (setq ns + (let ((prefix (xmltok-attribute-prefix att))) + (and prefix + (or (nxml-ns-get-prefix prefix) + (nxml-parse-error (xmltok-attribute-name-start att) + "Prefix `%s' undeclared" + prefix))))) + (setq parsed-attributes + (let ((nm (nxml-make-name ns + (xmltok-attribute-local-name att)))) + (when (assoc nm parsed-attributes) + (nxml-parse-error (xmltok-attribute-name-start att) + "Duplicate attribute")) + (cons (cons nm (or (xmltok-attribute-value att) + (nxml-parse-error nil "Invalid attribute value"))) + parsed-attributes))) + (setq atts (cdr atts))) + ;; We want to end up with the attributes followed by the + ;; the namespace attributes in the same order as + ;; xmltok-attributes and xmltok-namespace-attributes respectively. + (when parsed-namespace-attributes + (setq parsed-attributes + (nconc parsed-namespace-attributes parsed-attributes))) + (list name (nreverse parsed-attributes)))) + +(defun nxml-validate-tag (text text-pos tag) + (when nxml-validate-function + (let ((err (funcall nxml-validate-function text tag)) + pos) + (when err + (setq pos (nxml-validate-error-position (cdr err) + (and text text-pos) + tag)) + (or pos (error "Incorrect return value from %s" + nxml-validate-function)) + (nxml-parse-error pos (car err)))))) + +(defun nxml-validate-error-position (location text-pos tag) + (cond ((null location) xmltok-start) + ((eq location 'text) text-pos) + ((eq location 'tag-close) + (and tag (- (point) (if (eq xmltok-type 'empty-element ) 2 1)))) + ((consp location) + (let ((att (nth (cdr location) xmltok-attributes))) + (when (not att) + (setq att (nth (- (cdr location) (length xmltok-attributes)) + xmltok-namespace-attributes))) + (cond ((not att)) + ((eq (car location) 'attribute-name) + (xmltok-attribute-name-start att)) + ((eq (car location) 'attribute-value) + (xmltok-attribute-value-start att))))))) + +(defun nxml-make-name (ns local-name) + (if ns + (cons ns local-name) + local-name)) + +(defun nxml-current-text-string () + (cond ((memq xmltok-type '(space data)) + (buffer-substring-no-properties xmltok-start + (point))) + ((eq xmltok-type 'cdata-section) + (buffer-substring-no-properties (+ xmltok-start 9) + (- (point) 3))) + ((memq xmltok-type '(char-ref entity-ref)) + (unless xmltok-replacement + (nxml-parse-error nil + (if (eq xmltok-type 'char-ref) + "Reference to unsupported Unicode character" + "Unresolvable entity reference"))) + xmltok-replacement))) + +(defun nxml-parse-error (position &rest args) + (nxml-signal-file-parse-error nxml-parse-file-name + (or position xmltok-start) + (apply 'format args))) + +(defun nxml-check-xmltok-errors () + (when xmltok-errors + (let ((err (car (last xmltok-errors)))) + (nxml-signal-file-parse-error nxml-parse-file-name + (xmltok-error-start err) + (xmltok-error-message err))))) + +(provide 'nxml-parse) + +;; arch-tag: fc19639b-1bff-4673-9992-f539da89ba1e +;;; nxml-parse.el ends here |