diff options
author | Mark A. Hershberger <mah@everybody.org> | 2007-11-23 06:58:00 +0000 |
---|---|---|
committer | Mark A. Hershberger <mah@everybody.org> | 2007-11-23 06:58:00 +0000 |
commit | 8cd39fb3c4cf47d2464f00eaa69c587e17dd11cc (patch) | |
tree | 7bcd47a7dcbbad100dd3e8f8a7e08b48353c58a8 /lisp/nxml/nxml-rap.el | |
parent | f7cf8b2009b0bc2526d50c3455f737a543122dd4 (diff) | |
download | emacs-8cd39fb3c4cf47d2464f00eaa69c587e17dd11cc.tar.gz emacs-8cd39fb3c4cf47d2464f00eaa69c587e17dd11cc.tar.bz2 emacs-8cd39fb3c4cf47d2464f00eaa69c587e17dd11cc.zip |
Initial merge of nxml
Diffstat (limited to 'lisp/nxml/nxml-rap.el')
-rw-r--r-- | lisp/nxml/nxml-rap.el | 467 |
1 files changed, 467 insertions, 0 deletions
diff --git a/lisp/nxml/nxml-rap.el b/lisp/nxml/nxml-rap.el new file mode 100644 index 00000000000..8b0c55e573e --- /dev/null +++ b/lisp/nxml/nxml-rap.el @@ -0,0 +1,467 @@ +;;; nxml-rap.el --- low-level support for random access parsing for nXML mode + +;; Copyright (C) 2003, 2004 Free Software Foundation, Inc. + +;; Author: James Clark +;; Keywords: XML + +;; This program is free software; you can redistribute it and/or +;; modify it under the terms of the GNU General Public License as +;; published by the Free Software Foundation; either version 2 of +;; the License, or (at your option) any later version. + +;; This program is distributed in the hope that it will be +;; useful, but WITHOUT ANY WARRANTY; without even the implied +;; warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +;; PURPOSE. See the GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public +;; License along with this program; if not, write to the Free +;; Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, +;; MA 02111-1307 USA + +;;; Commentary: + +;; This uses xmltok.el to do XML parsing. The fundamental problem is +;; how to handle changes. We don't want to maintain a complete parse +;; tree. We also don't want to reparse from the start of the document +;; on every keystroke. However, it is not possible in general to +;; parse an XML document correctly starting at a random point in the +;; middle. The main problems are comments, CDATA sections and +;; processing instructions: these can all contain things that are +;; indistinguishable from elements. Literals in the prolog are also a +;; problem. Attribute value literals are not a problem because +;; attribute value literals cannot contain less-than signs. +;; +;; Our strategy is to keep track of just the problematic things. +;; Specifically, we keep track of all comments, CDATA sections and +;; processing instructions in the instance. We do this by marking all +;; except the first character of these with a non-nil nxml-inside text +;; property. The value of the nxml-inside property is comment, +;; cdata-section or processing-instruction. The first character does +;; not have the nxml-inside property so we can find the beginning of +;; the construct by looking for a change in a text property value +;; (Emacs provides primitives for this). We use text properties +;; rather than overlays, since the implementation of overlays doesn't +;; look like it scales to large numbers of overlays in a buffer. +;; +;; We don't in fact track all these constructs, but only track them in +;; some initial part of the instance. The variable `nxml-scan-end' +;; contains the limit of where we have scanned up to for them. +;; +;; Thus to parse some random point in the file we first ensure that we +;; have scanned up to that point. Then we search backwards for a +;; <. Then we check whether the < has an nxml-inside property. If it +;; does we go backwards to first character that does not have an +;; nxml-inside property (this character must be a <). Then we start +;; parsing forward from the < we have found. +;; +;; The prolog has to be parsed specially, so we also keep track of the +;; end of the prolog in `nxml-prolog-end'. The prolog is reparsed on +;; every change to the prolog. This won't work well if people try to +;; edit huge internal subsets. Hopefully that will be rare. +;; +;; We keep track of the changes by adding to the buffer's +;; after-change-functions hook. Scanning is also done as a +;; prerequisite to fontification by adding to fontification-functions +;; (in the same way as jit-lock). This means that scanning for these +;; constructs had better be quick. Fortunately it is. Firstly, the +;; typical proportion of comments, CDATA sections and processing +;; instructions is small relative to other things. Secondly, to scan +;; we just search for the regexp <[!?]. +;; +;; One problem is unclosed comments, processing instructions and CDATA +;; sections. Suppose, for example, we encounter a <!-- but there's no +;; matching -->. This is not an unexpected situation if the user is +;; creating a comment. It is not helpful to treat the whole of the +;; file starting from the <!-- onwards as a single unclosed comment +;; token. Instead we treat just the <!-- as a piece of not well-formed +;; markup and continue. The problem is that if at some later stage a +;; --> gets added to the buffer after the unclosed <!--, we will need +;; to reparse the buffer starting from the <!--. We need to keep +;; track of these reparse dependencies; they are called dependent +;; regions in the code. + +;;; Code: + +(require 'xmltok) +(require 'nxml-util) + +(defvar nxml-prolog-end nil + "Integer giving position following end of the prolog.") +(make-variable-buffer-local 'nxml-prolog-end) + +(defvar nxml-scan-end nil + "Marker giving position up to which we have scanned. +nxml-scan-end must be >= nxml-prolog-end. Furthermore, nxml-scan-end +must not an inside position in the following sense. A position is +inside if the following character is a part of, but not the first +character of, a CDATA section, comment or processing instruction. +Furthermore all positions >= nxml-prolog-end and < nxml-scan-end that +are inside positions must have a non-nil nxml-inside property whose +value is a symbol specifying what it is inside. Any characters with a +non-nil fontified property must have position < nxml-scan-end and the +correct face. Dependent regions must also be established for any +unclosed constructs starting before nxml-scan-end. +There must be no nxml-inside properties after nxml-scan-end.") +(make-variable-buffer-local 'nxml-scan-end) + +(defsubst nxml-get-inside (pos) + (get-text-property pos 'nxml-inside)) + +(defsubst nxml-clear-inside (start end) + (remove-text-properties start end '(nxml-inside nil))) + +(defsubst nxml-set-inside (start end type) + (put-text-property start end 'nxml-inside type)) + +(defun nxml-inside-end (pos) + "Return the end of the inside region containing POS. +Return nil if the character at POS is not inside." + (if (nxml-get-inside pos) + (or (next-single-property-change pos 'nxml-inside) + (point-max)) + nil)) + +(defun nxml-inside-start (pos) + "Return the start of the inside region containing POS. +Return nil if the character at POS is not inside." + (if (nxml-get-inside pos) + (or (previous-single-property-change (1+ pos) 'nxml-inside) + (point-min)) + nil)) + +;;; Change management + +(defun nxml-scan-after-change (start end) + "Restore `nxml-scan-end' invariants after a change. +The change happened between START and END. +Return position after which lexical state is unchanged. +END must be > nxml-prolog-end." + (if (>= start nxml-scan-end) + nxml-scan-end + (goto-char start) + (nxml-move-outside-backwards) + (setq start (point)) + (let ((inside-remove-start start) + xmltok-errors + xmltok-dependent-regions) + (while (or (when (xmltok-forward-special (min end nxml-scan-end)) + (when (memq xmltok-type + '(comment + cdata-section + processing-instruction)) + (nxml-clear-inside inside-remove-start + (1+ xmltok-start)) + (nxml-set-inside (1+ xmltok-start) + (point) + xmltok-type) + (setq inside-remove-start (point))) + (if (< (point) (min end nxml-scan-end)) + t + (setq end (point)) + nil)) + ;; The end of the change was inside but is now outside. + ;; Imagine something really weird like + ;; <![CDATA[foo <!-- bar ]]> <![CDATA[ stuff --> <!-- ]]> --> + ;; and suppose we deleted "<![CDATA[f" + (let ((inside-end (nxml-inside-end end))) + (when inside-end + (setq end inside-end) + t)))) + (nxml-clear-inside inside-remove-start end) + (nxml-clear-dependent-regions start end) + (nxml-mark-parse-dependent-regions)) + (when (> end nxml-scan-end) + (set-marker nxml-scan-end end)) + end)) + +(defun nxml-scan-prolog () + (goto-char (point-min)) + (let (xmltok-dtd + xmltok-errors + xmltok-dependent-regions) + (setq nxml-prolog-regions (xmltok-forward-prolog)) + (setq nxml-prolog-end (point)) + (nxml-clear-inside (point-min) nxml-prolog-end) + (nxml-clear-dependent-regions (point-min) nxml-prolog-end) + (nxml-mark-parse-dependent-regions)) + (when (< nxml-scan-end nxml-prolog-end) + (set-marker nxml-scan-end nxml-prolog-end))) + + +;;; Dependent regions + +(defun nxml-adjust-start-for-dependent-regions (start end pre-change-length) + (let ((overlays (overlays-in (1- start) start)) + (adjusted-start start)) + (while overlays + (let* ((overlay (car overlays)) + (ostart (overlay-start overlay))) + (when (and (eq (overlay-get overlay 'category) 'nxml-dependent) + (< ostart adjusted-start)) + (let ((funargs (overlay-get overlay 'nxml-funargs))) + (when (apply (car funargs) + (append (list start + end + pre-change-length + ostart + (overlay-end overlay)) + (cdr funargs))) + (setq adjusted-start ostart))))) + (setq overlays (cdr overlays))) + adjusted-start)) + +(defun nxml-mark-parse-dependent-regions () + (while xmltok-dependent-regions + (apply 'nxml-mark-parse-dependent-region + (car xmltok-dependent-regions)) + (setq xmltok-dependent-regions + (cdr xmltok-dependent-regions)))) + +(defun nxml-mark-parse-dependent-region (fun start end &rest args) + (let ((overlay (make-overlay start end nil t t))) + (overlay-put overlay 'category 'nxml-dependent) + (overlay-put overlay 'nxml-funargs (cons fun args)))) + +(put 'nxml-dependent 'evaporate t) + +(defun nxml-clear-dependent-regions (start end) + (let ((overlays (overlays-in start end))) + (while overlays + (let* ((overlay (car overlays)) + (category (overlay-get overlay 'category))) + (when (and (eq category 'nxml-dependent) + (<= start (overlay-start overlay))) + (delete-overlay overlay))) + (setq overlays (cdr overlays))))) + +;;; Random access parsing + +(defun nxml-token-after () + "Return the position after the token containing the char after point. +Sets up the variables `xmltok-type', `xmltok-start', +`xmltok-name-end', `xmltok-name-colon', `xmltok-attributes', +`xmltok-namespace-attributes' in the same was as does +`xmltok-forward'. The prolog will be treated as a single token with +type `prolog'." + (let ((pos (point))) + (if (< pos nxml-prolog-end) + (progn + (setq xmltok-type 'prolog + xmltok-start (point-min)) + (min nxml-prolog-end (point-max))) + (nxml-ensure-scan-up-to-date) + (if (nxml-get-inside pos) + (save-excursion + (nxml-move-outside-backwards) + (xmltok-forward) + (point)) + (save-excursion + (if (or (eq (char-after) ?<) + (search-backward "<" + (max (point-min) nxml-prolog-end) + t)) + (nxml-move-outside-backwards) + (goto-char (if (<= (point-min) nxml-prolog-end) + nxml-prolog-end + (or (nxml-inside-end (point-min)) + (point-min))))) + (while (and (nxml-tokenize-forward) + (<= (point) pos))) + (point)))))) + +(defun nxml-token-before () + "Return the position after the token containing the char before point. +Sets variables like `nxml-token-after'." + (if (/= (point-min) (point)) + (save-excursion + (goto-char (1- (point))) + (nxml-token-after)) + (setq xmltok-start (point)) + (setq xmltok-type nil) + (point))) + +(defun nxml-tokenize-forward () + (let (xmltok-dependent-regions + xmltok-errors) + (when (and (xmltok-forward) + (> (point) nxml-scan-end)) + (cond ((memq xmltok-type '(comment + cdata-section + processing-instruction)) + (nxml-with-unmodifying-text-property-changes + (nxml-set-inside (1+ xmltok-start) (point) xmltok-type))) + (xmltok-dependent-regions + (nxml-mark-parse-dependent-regions))) + (set-marker nxml-scan-end (point))) + xmltok-type)) + +(defun nxml-move-outside-backwards () + "Move point to first character of the containing special thing. +Leave point unmoved if it is not inside anything special." + (let ((start (nxml-inside-start (point)))) + (when start + (goto-char (1- start)) + (when (nxml-get-inside (point)) + (error "Char before inside-start at %s had nxml-inside property %s" + (point) + (nxml-get-inside (point))))))) + +(defun nxml-ensure-scan-up-to-date () + (let ((pos (point))) + (when (< nxml-scan-end pos) + (save-excursion + (goto-char nxml-scan-end) + (let (xmltok-errors + xmltok-dependent-regions) + (while (when (xmltok-forward-special pos) + (when (memq xmltok-type + '(comment + processing-instruction + cdata-section)) + (nxml-with-unmodifying-text-property-changes + (nxml-set-inside (1+ xmltok-start) + (point) + xmltok-type))) + (if (< (point) pos) + t + (setq pos (point)) + nil))) + (nxml-clear-dependent-regions nxml-scan-end pos) + (nxml-mark-parse-dependent-regions) + (set-marker nxml-scan-end pos)))))) + +;;; Element scanning + +(defun nxml-scan-element-forward (from &optional up) + "Scan forward from FROM over a single balanced element. +Point must between tokens. Return the position of the end of the tag +that ends the element. `xmltok-start' will contain the position of the +start of the tag. If UP is non-nil, then scan past end-tag of element +containing point. If no element is found, return nil. If a +well-formedness error prevents scanning, signal an nxml-scan-error. +Point is not moved." + (let ((open-tags (and up t)) + found) + (save-excursion + (goto-char from) + (while (cond ((not (nxml-tokenize-forward)) + (when (consp open-tags) + (nxml-scan-error (cadr open-tags) + "Start-tag has no end-tag")) + nil) + ((eq xmltok-type 'start-tag) + (setq open-tags + (cons (xmltok-start-tag-qname) + (cons xmltok-start + open-tags))) + t) + ((eq xmltok-type 'end-tag) + (cond ((not open-tags) nil) + ((not (consp open-tags)) (setq found (point)) nil) + ((not (string= (car open-tags) + (xmltok-end-tag-qname))) + (nxml-scan-error (+ 2 xmltok-start) + "Mismatched end-tag; \ +expected `%s'" + (car open-tags))) + ((setq open-tags (cddr open-tags)) t) + (t (setq found (point)) nil))) + ((memq xmltok-type '(empty-element + partial-empty-element)) + (if open-tags + t + (setq found (point)) + nil)) + ((eq xmltok-type 'partial-end-tag) + (cond ((not open-tags) nil) + ((not (consp open-tags)) (setq found (point)) nil) + ((setq open-tags (cddr open-tags)) t) + (t (setq found (point)) nil))) + ((eq xmltok-type 'partial-start-tag) + (nxml-scan-error xmltok-start + "Missing `>'")) + (t t)))) + found)) + +(defun nxml-scan-element-backward (from &optional up bound) + "Scan backward from FROM over a single balanced element. +Point must between tokens. Return the position of the end of the tag +that starts the element. `xmltok-start' will contain the position of +the start of the tag. If UP is non-nil, then scan past start-tag of +element containing point. If BOUND is non-nil, then don't scan back +past BOUND. If no element is found, return nil. If a well-formedness +error prevents scanning, signal an nxml-scan-error. Point is not +moved." + (let ((open-tags (and up t)) + token-end found) + (save-excursion + (goto-char from) + (while (cond ((or (< (point) nxml-prolog-end) + (not (search-backward "<" + (max (or bound 0) + nxml-prolog-end) + t))) + (when (and (consp open-tags) (not bound)) + (nxml-scan-error (cadr open-tags) + "End-tag has no start-tag")) + nil) + ((progn + (nxml-move-outside-backwards) + (save-excursion + (nxml-tokenize-forward) + (setq token-end (point))) + (eq xmltok-type 'end-tag)) + (setq open-tags + (cons (xmltok-end-tag-qname) + (cons xmltok-start open-tags))) + t) + ((eq xmltok-type 'start-tag) + (cond ((not open-tags) nil) + ((not (consp open-tags)) + (setq found token-end) + nil) + ((and (car open-tags) + (not (string= (car open-tags) + (xmltok-start-tag-qname)))) + (nxml-scan-error (1+ xmltok-start) + "Mismatched start-tag; \ +expected `%s'" + (car open-tags))) + ((setq open-tags (cddr open-tags)) t) + (t (setq found token-end) nil))) + ((memq xmltok-type '(empty-element + partial-empty-element)) + (if open-tags + t + (setq found token-end) + nil)) + ((eq xmltok-type 'partial-end-tag) + (setq open-tags + (cons nil (cons xmltok-start open-tags))) + t) + ((eq xmltok-type 'partial-start-tag) + ;; if we have only a partial-start-tag + ;; then it's unlikely that there's a matching + ;; end-tag, so it's probably not helpful + ;; to treat it as a complete start-tag + (nxml-scan-error xmltok-start + "Missing `>'")) + (t t)))) + found)) + +(defun nxml-scan-error (&rest args) + (signal 'nxml-scan-error args)) + +(put 'nxml-scan-error + 'error-conditions + '(error nxml-error nxml-scan-error)) + +(put 'nxml-scan-error + 'error-message + "Scan over element that is not well-formed") + +(provide 'nxml-rap) + +;;; nxml-rap.el ends here |