summaryrefslogtreecommitdiff
path: root/lisp/emacs-lisp/rx.el
diff options
context:
space:
mode:
Diffstat (limited to 'lisp/emacs-lisp/rx.el')
-rw-r--r--lisp/emacs-lisp/rx.el219
1 files changed, 146 insertions, 73 deletions
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index 30195cbae32..d46d0ca5a98 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -35,8 +35,43 @@
;; Olin Shivers's SRE, with concessions to Emacs regexp peculiarities,
;; and the older Emacs package Sregex.
+;;; Legacy syntax still accepted by rx:
+;;
+;; These are constructs from earlier rx and sregex implementations
+;; that were mistakes, accidents or just not very good ideas in hindsight.
+
+;; Obsolete: accepted but not documented
+;;
+;; Obsolete Preferred
+;; --------------------------------------------------------
+;; (not word-boundary) not-word-boundary
+;; (not-syntax X) (not (syntax X))
+;; not-wordchar (not wordchar)
+;; (not-char ...) (not (any ...))
+;; any nonl, not-newline
+;; (repeat N FORM) (= N FORM)
+;; (syntax CHARACTER) (syntax NAME)
+;; (syntax CHAR-SYM) [1] (syntax NAME)
+;; (category chinse-two-byte) (category chinese-two-byte)
+;; unibyte ascii
+;; multibyte nonascii
+;; --------------------------------------------------------
+;; [1] where CHAR-SYM is a symbol with single-character name
+
+;; Obsolescent: accepted and documented but discouraged
+;;
+;; Obsolescent Preferred
+;; --------------------------------------------------------
+;; (and ...) (seq ...), (: ...), (sequence ...)
+;; anything anychar
+;; minimal-match, maximal-match lazy ops: ??, *?, +?
+
+;; FIXME: Prepare a phase-out by emitting compile-time warnings about
+;; at least some of the legacy constructs above.
+
;;; Code:
+
;; The `rx--translate...' functions below return (REGEXP . PRECEDENCE),
;; where REGEXP is a list of string expressions that will be
;; concatenated into a regexp, and PRECEDENCE is one of
@@ -167,7 +202,7 @@ Each entry is:
('not-word-boundary (cons (list "\\B") t))
('symbol-start (cons (list "\\_<") t))
('symbol-end (cons (list "\\_>") t))
- ('not-wordchar (cons (list "\\W") t))
+ ('not-wordchar (rx--translate '(not wordchar)))
(_
(cond
((let ((class (cdr (assq sym rx--char-classes))))
@@ -419,80 +454,96 @@ a list of named character classes in the order they occur in BODY."
If NEGATED is non-nil, negate the result; INTERVALS is a sorted
list of disjoint intervals and CLASSES a list of named character
classes."
- (let ((items (append intervals classes)))
- ;; Move lone ] and range ]-x to the start.
- (let ((rbrac-l (assq ?\] items)))
- (when rbrac-l
- (setq items (cons rbrac-l (delq rbrac-l items)))))
-
- ;; Split x-] and move the lone ] to the start.
- (let ((rbrac-r (rassq ?\] items)))
- (when (and rbrac-r (not (eq (car rbrac-r) ?\])))
- (setcdr rbrac-r ?\\)
- (setq items (cons '(?\] . ?\]) items))))
-
- ;; Split ,-- (which would end up as ,- otherwise).
- (let ((dash-r (rassq ?- items)))
- (when (eq (car dash-r) ?,)
- (setcdr dash-r ?,)
- (setq items (nconc items '((?- . ?-))))))
-
- ;; Remove - (lone or at start of interval)
- (let ((dash-l (assq ?- items)))
- (when dash-l
- (if (eq (cdr dash-l) ?-)
- (setq items (delq dash-l items)) ; Remove lone -
- (setcar dash-l ?.)) ; Reduce --x to .-x
- (setq items (nconc items '((?- . ?-))))))
-
- ;; Deal with leading ^ and range ^-x in non-negated set.
- (when (and (eq (car-safe (car items)) ?^)
- (not negated))
- (if (eq (cdar items) ?^)
- ;; single leading ^
- (when (cdr items)
- ;; Move the ^ to second place.
- (setq items (cons (cadr items)
- (cons (car items) (cddr items)))))
- ;; Split ^-x to _-x^
- (setq items (cons (cons ?_ (cdar items))
- (cons '(?^ . ?^)
- (cdr items))))))
-
- (cond
- ;; Empty set: if negated, any char, otherwise match-nothing.
- ((null items)
+ ;; No, this is not pretty code. You try doing it in a way that is both
+ ;; elegant and efficient. Or just one of the two. I dare you.
+ (cond
+ ;; Single character.
+ ((and intervals (eq (caar intervals) (cdar intervals))
+ (null (cdr intervals))
+ (null classes))
+ (let ((ch (caar intervals)))
(if negated
- (rx--translate-symbol 'anything)
- (rx--empty)))
- ;; Single non-negated character.
- ((and (null (cdr items))
- (consp (car items))
- (eq (caar items) (cdar items))
- (not negated))
- (cons (list (regexp-quote (char-to-string (caar items))))
- t))
- ;; Negated newline.
- ((and (equal items '((?\n . ?\n)))
- negated)
- (rx--translate-symbol 'nonl))
- ;; At least one character or class, possibly negated.
- (t
+ (if (eq ch ?\n)
+ ;; Single negated newline.
+ (rx--translate-symbol 'nonl)
+ ;; Single negated character (other than newline).
+ (cons (list (string ?\[ ?^ ch ?\])) t))
+ ;; Single literal character.
+ (cons (list (regexp-quote (char-to-string ch))) t))))
+
+ ;; Empty set (or any char).
+ ((and (null intervals) (null classes))
+ (if negated
+ (rx--translate-symbol 'anything)
+ (rx--empty)))
+
+ ;; More than one character, or at least one class.
+ (t
+ (let ((dash nil) (caret nil))
+ ;; Move ] and range ]-x to the start.
+ (let ((rbrac-l (assq ?\] intervals)))
+ (when rbrac-l
+ (setq intervals (cons rbrac-l (remq rbrac-l intervals)))))
+
+ ;; Split x-] and move the lone ] to the start.
+ (let ((rbrac-r (rassq ?\] intervals)))
+ (when (and rbrac-r (not (eq (car rbrac-r) ?\])))
+ (setcdr rbrac-r ?\\)
+ (setq intervals (cons '(?\] . ?\]) intervals))))
+
+ ;; Split ,-- (which would end up as ,- otherwise).
+ (let ((dash-r (rassq ?- intervals)))
+ (when (eq (car dash-r) ?,)
+ (setcdr dash-r ?,)
+ (setq dash "-")))
+
+ ;; Remove - (lone or at start of interval)
+ (let ((dash-l (assq ?- intervals)))
+ (when dash-l
+ (if (eq (cdr dash-l) ?-)
+ (setq intervals (remq dash-l intervals)) ; Remove lone -
+ (setcar dash-l ?.)) ; Reduce --x to .-x
+ (setq dash "-")))
+
+ ;; Deal with leading ^ and range ^-x in non-negated set.
+ (when (and (eq (caar intervals) ?^)
+ (not negated))
+ (if (eq (cdar intervals) ?^)
+ ;; single leading ^
+ (if (or (cdr intervals) classes)
+ ;; something else to put before the ^
+ (progn
+ (setq intervals (cdr intervals)) ; remove lone ^
+ (setq caret "^")) ; put ^ (almost) last
+ ;; nothing else but a lone -
+ (setq intervals (cons '(?- . ?-) intervals)) ; move - first
+ (setq dash nil))
+ ;; split ^-x to _-x^
+ (setq intervals `((?_ . ,(cdar intervals)) (?^ . ?^)
+ . ,(cdr intervals)))))
+
(cons
(list
(concat
"["
(and negated "^")
- (mapconcat (lambda (item)
- (cond ((symbolp item)
- (format "[:%s:]" item))
- ((eq (car item) (cdr item))
- (char-to-string (car item)))
- ((eq (1+ (car item)) (cdr item))
- (string (car item) (cdr item)))
+ (mapconcat (lambda (iv)
+ (cond ((eq (car iv) (cdr iv))
+ (char-to-string (car iv)))
+ ((eq (1+ (car iv)) (cdr iv))
+ (string (car iv) (cdr iv)))
+ ;; Ranges that go between normal chars and raw bytes
+ ;; must be split to avoid being mutilated
+ ;; by Emacs's regexp parser.
+ ((<= (car iv) #x3fff7f (cdr iv))
+ (string (car iv) ?- #x3fff7f
+ #x3fff80 ?- (cdr iv)))
(t
- (string (car item) ?- (cdr item)))))
- items nil)
+ (string (car iv) ?- (cdr iv)))))
+ intervals)
+ (mapconcat (lambda (cls) (format "[:%s:]" cls)) classes)
+ caret ; ^ or nothing
+ dash ; - or nothing
"]"))
t)))))
@@ -596,10 +647,28 @@ If NEGATED, negate the sense (thus making it positive)."
(defun rx--union-intervals (ivs-a ivs-b)
"Union of the interval lists IVS-A and IVS-B."
- (rx--complement-intervals
- (rx--intersect-intervals
- (rx--complement-intervals ivs-a)
- (rx--complement-intervals ivs-b))))
+ (let ((union nil))
+ (while (and ivs-a ivs-b)
+ (let ((a (car ivs-a))
+ (b (car ivs-b)))
+ (cond
+ ((< (1+ (cdr a)) (car b)) ; a before b, not adacent
+ (push a union)
+ (setq ivs-a (cdr ivs-a)))
+ ((< (1+ (cdr b)) (car a)) ; b before a, not adacent
+ (push b union)
+ (setq ivs-b (cdr ivs-b)))
+ (t ; a and b adjacent or overlap
+ (setq ivs-a (cdr ivs-a))
+ (setq ivs-b (cdr ivs-b))
+ (if (< (cdr a) (cdr b))
+ (push (cons (min (car a) (car b))
+ (cdr b))
+ ivs-b)
+ (push (cons (min (car a) (car b))
+ (cdr a))
+ ivs-a))))))
+ (nconc (nreverse union) (or ivs-a ivs-b))))
(defun rx--charset-intervals (charset)
"Return a sorted list of non-adjacent disjoint intervals from CHARSET.
@@ -783,7 +852,10 @@ Return (REGEXP . PRECEDENCE)."
(setq syntax char)))))))
(unless syntax
(error "Unknown rx syntax name `%s'" sym)))
- (cons (list (string ?\\ (if negated ?S ?s) syntax))
+ ;; Produce \w and \W instead of \sw and \Sw, for smaller size.
+ (cons (list (if (eq syntax ?w)
+ (string ?\\ (if negated ?W ?w))
+ (string ?\\ (if negated ?S ?s) syntax)))
t)))
(defconst rx--categories
@@ -1150,6 +1222,7 @@ If NO-GROUP is non-nil, don't bracket the result in a non-capturing
group.
For extending the `rx' notation in FORM, use `rx-define' or `rx-let-eval'."
+ (declare (important-return-value t))
(let* ((item (rx--translate form))
(exprs (if no-group
(car item)