summaryrefslogtreecommitdiff
path: root/lisp/emacs-lisp/rx.el
diff options
context:
space:
mode:
author Mattias Engdegård <mattiase@acm.org> 2019-12-12 23:04:00 +0100
committer Mattias Engdegård <mattiase@acm.org> 2019-12-12 23:47:25 +0100
commitf16766a0eb2a78b58a4856d31306fc37f913d70e (patch)
treed3be560c8aaf4f4d3a59b285e27aab224922bb33 /lisp/emacs-lisp/rx.el
parentd7efe98951730842db4fc136e3b631c5ee0d8a53 (diff)
downloademacs-f16766a0eb2a78b58a4856d31306fc37f913d70e.tar.gz
emacs-f16766a0eb2a78b58a4856d31306fc37f913d70e.tar.bz2
emacs-f16766a0eb2a78b58a4856d31306fc37f913d70e.zip
Use `or' instead of `union' for charset union in rx
Design change suggested by Stefan Monnier. * doc/lispref/searching.texi (Rx Constructs): * etc/NEWS: Document. * lisp/emacs-lisp/rx.el (rx--translate-or): Detect charset arguments. (rx--charset-p): New. (rx--translate-not, rx--charset-intervals, rx--translate-union): Change from `union' to `or'. (rx--translate-form, rx--builtin-forms, rx): Remove `union'. * test/lisp/emacs-lisp/rx-tests.el (rx-union, rx-def-in-union) (rx-intersection): Rename tests and change `union' to `or' and `|'.
Diffstat (limited to 'lisp/emacs-lisp/rx.el')
-rw-r--r--lisp/emacs-lisp/rx.el41
1 files changed, 26 insertions, 15 deletions
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index d4b21c3c9ad..a5cab1db888 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -273,10 +273,8 @@ Return (REGEXP . PRECEDENCE)."
;; (or (+ digit) "CHARLIE" "CHAN" (+ blank))
;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
;;
- ;; - Fuse patterns into a single character alternative if they fit.
- ;; regexp-opt will do that if all are strings, but we want to do that for:
- ;; * symbols that expand to classes: space, alpha, ...
- ;; * character alternatives: (any ...)
+ ;; - Optimise single-character alternatives better:
+ ;; * classes: space, alpha, ...
;; * (syntax S), for some S (whitespace, word)
;; so that (or "@" "%" digit (any "A-Z" space) (syntax word))
;; -> (any "@" "%" digit "A-Z" space word)
@@ -294,6 +292,8 @@ Return (REGEXP . PRECEDENCE)."
((rx--every #'stringp body) ; All strings.
(cons (list (regexp-opt body nil t))
t))
+ ((rx--every #'rx--charset-p body) ; All charsets.
+ (rx--translate-union nil body))
(t
(cons (append (car (rx--translate (car body)))
(mapcan (lambda (item)
@@ -301,6 +301,19 @@ Return (REGEXP . PRECEDENCE)."
(cdr body)))
nil))))
+(defun rx--charset-p (form)
+  "Whether FORM looks like a charset.
+A charset consists only of character intervals and set operations."
+  (or (and (consp form)
+           (or (and (memq (car form) '(any in char)) ; No inner quotes.
+                    (rx--every (lambda (x) (not (symbolp x))) (cdr form)))
+               (and (memq (car form) '(not or | intersection))
+                    (rx--every #'rx--charset-p (cdr form)))))
+      (and (or (symbolp form) (consp form))
+           (let ((expanded (rx--expand-def form)))
+             (and expanded
+                  (rx--charset-p expanded))))))
+
(defun rx--string-to-intervals (str)
"Decode STR as intervals: A-Z becomes (?A . ?Z), and the single
character X becomes (?X . ?X). Return the intervals in a list."
@@ -477,7 +490,7 @@ If NEGATED, negate the sense."
(not negated) (rx--complement-intervals intervals) nil)))
;; FIXME: Consider turning `not' into a variadic operator, following SRE:
-;; (not A B) = (not (union A B)) = (intersection (not A) (not B)), and
+;; (not A B) = (not (or A B)) = (intersection (not A) (not B)), and
;; (not) = anychar.
;; Maybe allow singleton characters as arguments.
@@ -498,7 +511,7 @@ If NEGATED, negate the sense (thus making it positive)."
(rx--translate-category (not negated) (cdr arg)))
('not
(rx--translate-not (not negated) (cdr arg)))
- ('union
+ ((or 'or '|)
(rx--translate-union (not negated) (cdr arg)))
('intersection
(rx--translate-intersection (not negated) (cdr arg))))))
@@ -558,7 +571,7 @@ If NEGATED, negate the sense (thus making it positive)."
(defun rx--charset-intervals (charset)
"Return a sorted list of non-adjacent disjoint intervals from CHARSET.
CHARSET is any expression allowed in a character set expression:
-either `any' (no classes permitted), or `not', `union' or `intersection'
+either `any' (no classes permitted), or `not', `or' or `intersection'
forms whose arguments are charsets."
(pcase charset
(`(,(or 'any 'in 'char) . ,body)
@@ -569,8 +582,8 @@ forms whose arguments are charsets."
(cadr parsed)))
(car parsed)))
(`(not ,x) (rx--complement-intervals (rx--charset-intervals x)))
- (`(union . ,xs) (rx--charset-union xs))
- (`(intersection . ,xs) (rx--charset-intersection xs))
+ (`(,(or 'or '|) . ,body) (rx--charset-union body))
+ (`(intersection . ,body) (rx--charset-intersection body))
(_ (let ((expanded (rx--expand-def charset)))
(if expanded
(rx--charset-intervals expanded)
@@ -589,7 +602,7 @@ forms whose arguments are charsets."
(mapcar #'rx--charset-intervals charsets)))
(defun rx--translate-union (negated body)
- "Translate a (union ...) construct. Return (REGEXP . PRECEDENCE).
+ "Translate an (or ...) construct of charsets. Return (REGEXP . PRECEDENCE).
If NEGATED, negate the sense."
(rx--intervals-to-alt negated (rx--charset-union body)))
@@ -976,7 +989,6 @@ can expand to any number of values."
((or 'any 'in 'char) (rx--translate-any nil body))
('not-char (rx--translate-any t body))
('not (rx--translate-not nil body))
- ('union (rx--translate-union nil body))
('intersection (rx--translate-intersection nil body))
('repeat (rx--translate-repeat body))
@@ -1036,7 +1048,7 @@ can expand to any number of values."
(t (error "Unknown rx form `%s'" op)))))))
(defconst rx--builtin-forms
- '(seq sequence : and or | any in char not-char not union intersection
+ '(seq sequence : and or | any in char not-char not intersection
repeat = >= **
zero-or-more 0+ *
one-or-more 1+ +
@@ -1149,11 +1161,10 @@ CHAR Match a literal character.
character, a string, a range as string \"A-Z\" or cons
(?A . ?Z), or a character class (see below). Alias: in, char.
(not CHARSPEC) Match one character not matched by CHARSPEC. CHARSPEC
- can be (any ...), (union ...), (intersection ...),
+ can be (any ...), (or ...), (intersection ...),
(syntax ...), (category ...), or a character class.
-(union CHARSET...) Union of CHARSETs.
(intersection CHARSET...) Intersection of CHARSETs.
- CHARSET is (any...), (not...), (union...) or (intersection...).
+ CHARSET is (any...), (not...), (or...) or (intersection...).
not-newline Match any character except a newline. Alias: nonl.
anychar Match any character. Alias: anything.
unmatchable Never match anything at all.