diff options
author | Mattias Engdegård <mattiase@acm.org> | 2019-12-12 23:04:00 +0100 |
---|---|---|
committer | Mattias Engdegård <mattiase@acm.org> | 2019-12-12 23:47:25 +0100 |
commit | f16766a0eb2a78b58a4856d31306fc37f913d70e (patch) | |
tree | d3be560c8aaf4f4d3a59b285e27aab224922bb33 /lisp | |
parent | d7efe98951730842db4fc136e3b631c5ee0d8a53 (diff) | |
download | emacs-f16766a0eb2a78b58a4856d31306fc37f913d70e.tar.gz emacs-f16766a0eb2a78b58a4856d31306fc37f913d70e.tar.bz2 emacs-f16766a0eb2a78b58a4856d31306fc37f913d70e.zip |
Use `or' instead of `union' for charset union in rx
Design change suggested by Stefan Monnier.
* doc/lispref/searching.texi (Rx Constructs):
* etc/NEWS: Document.
* lisp/emacs-lisp/rx.el (rx--translate-or): Detect charset arguments.
(rx--charset-p): New.
(rx--translate-not, rx--charset-intervals, rx--translate-union):
Change from `union' to `or'.
(rx--translate-form, rx--builtin-forms, rx): Remove `union'.
* test/lisp/emacs-lisp/rx-tests.el (rx-union, rx-def-in-union)
(rx-intersection): Rename tests and change `union' to `or' and `|'.
Diffstat (limited to 'lisp')
-rw-r--r-- | lisp/emacs-lisp/rx.el | 41 |
1 files changed, 26 insertions, 15 deletions
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index d4b21c3c9ad..a5cab1db888 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -273,10 +273,8 @@ Return (REGEXP . PRECEDENCE)." ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank)) ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank)) ;; - ;; - Fuse patterns into a single character alternative if they fit. - ;; regexp-opt will do that if all are strings, but we want to do that for: - ;; * symbols that expand to classes: space, alpha, ... - ;; * character alternatives: (any ...) + ;; - Optimise single-character alternatives better: + ;; * classes: space, alpha, ... ;; * (syntax S), for some S (whitespace, word) ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word)) ;; -> (any "@" "%" digit "A-Z" space word) @@ -294,6 +292,8 @@ Return (REGEXP . PRECEDENCE)." ((rx--every #'stringp body) ; All strings. (cons (list (regexp-opt body nil t)) t)) + ((rx--every #'rx--charset-p body) ; All charsets. + (rx--translate-union nil body)) (t (cons (append (car (rx--translate (car body))) (mapcan (lambda (item) @@ -301,6 +301,19 @@ Return (REGEXP . PRECEDENCE)." (cdr body))) nil)))) +(defun rx--charset-p (form) + "Whether FORM looks like a charset, only consisting of character intervals +and set operations." + (or (and (consp form) + (or (and (memq (car form) '(any 'in 'char)) + (rx--every (lambda (x) (not (symbolp x))) (cdr form))) + (and (memq (car form) '(not or | intersection)) + (rx--every #'rx--charset-p (cdr form))))) + (and (or (symbolp form) (consp form)) + (let ((expanded (rx--expand-def form))) + (and expanded + (rx--charset-p expanded)))))) + (defun rx--string-to-intervals (str) "Decode STR as intervals: A-Z becomes (?A . ?Z), and the single character X becomes (?X . ?X). Return the intervals in a list." @@ -477,7 +490,7 @@ If NEGATED, negate the sense." 
(not negated) (rx--complement-intervals intervals) nil))) ;; FIXME: Consider turning `not' into a variadic operator, following SRE: -;; (not A B) = (not (union A B)) = (intersection (not A) (not B)), and +;; (not A B) = (not (or A B)) = (intersection (not A) (not B)), and ;; (not) = anychar. ;; Maybe allow singleton characters as arguments. @@ -498,7 +511,7 @@ If NEGATED, negate the sense (thus making it positive)." (rx--translate-category (not negated) (cdr arg))) ('not (rx--translate-not (not negated) (cdr arg))) - ('union + ((or 'or '|) (rx--translate-union (not negated) (cdr arg))) ('intersection (rx--translate-intersection (not negated) (cdr arg)))))) @@ -558,7 +571,7 @@ If NEGATED, negate the sense (thus making it positive)." (defun rx--charset-intervals (charset) "Return a sorted list of non-adjacent disjoint intervals from CHARSET. CHARSET is any expression allowed in a character set expression: -either `any' (no classes permitted), or `not', `union' or `intersection' +either `any' (no classes permitted), or `not', `or' or `intersection' forms whose arguments are charsets." (pcase charset (`(,(or 'any 'in 'char) . ,body) @@ -569,8 +582,8 @@ forms whose arguments are charsets." (cadr parsed))) (car parsed))) (`(not ,x) (rx--complement-intervals (rx--charset-intervals x))) - (`(union . ,xs) (rx--charset-union xs)) - (`(intersection . ,xs) (rx--charset-intersection xs)) + (`(,(or 'or '|) . ,body) (rx--charset-union body)) + (`(intersection . ,body) (rx--charset-intersection body)) (_ (let ((expanded (rx--expand-def charset))) (if expanded (rx--charset-intervals expanded) @@ -589,7 +602,7 @@ forms whose arguments are charsets." (mapcar #'rx--charset-intervals charsets))) (defun rx--translate-union (negated body) - "Translate a (union ...) construct. Return (REGEXP . PRECEDENCE). + "Translate an (or ...) construct of charsets. Return (REGEXP . PRECEDENCE). If NEGATED, negate the sense." 
(rx--intervals-to-alt negated (rx--charset-union body))) @@ -976,7 +989,6 @@ can expand to any number of values." ((or 'any 'in 'char) (rx--translate-any nil body)) ('not-char (rx--translate-any t body)) ('not (rx--translate-not nil body)) - ('union (rx--translate-union nil body)) ('intersection (rx--translate-intersection nil body)) ('repeat (rx--translate-repeat body)) @@ -1036,7 +1048,7 @@ can expand to any number of values." (t (error "Unknown rx form `%s'" op))))))) (defconst rx--builtin-forms - '(seq sequence : and or | any in char not-char not union intersection + '(seq sequence : and or | any in char not-char not intersection repeat = >= ** zero-or-more 0+ * one-or-more 1+ + @@ -1149,11 +1161,10 @@ CHAR Match a literal character. character, a string, a range as string \"A-Z\" or cons (?A . ?Z), or a character class (see below). Alias: in, char. (not CHARSPEC) Match one character not matched by CHARSPEC. CHARSPEC - can be (any ...), (union ...), (intersection ...), + can be (any ...), (or ...), (intersection ...), (syntax ...), (category ...), or a character class. -(union CHARSET...) Union of CHARSETs. (intersection CHARSET...) Intersection of CHARSETs. - CHARSET is (any...), (not...), (union...) or (intersection...). + CHARSET is (any...), (not...), (or...) or (intersection...). not-newline Match any character except a newline. Alias: nonl. anychar Match any character. Alias: anything. unmatchable Never match anything at all. |