diff options
author | Mattias EngdegÄrd <mattiase@acm.org> | 2020-02-11 20:04:42 +0100 |
---|---|---|
committer | Mattias EngdegÄrd <mattiase@acm.org> | 2020-03-01 10:47:14 +0100 |
commit | 49d3cd90bd80a225d5ec26027318ffb4606ff513 (patch) | |
tree | 13ba7b7faec7014b626d1c4dfdbfec98df44631f /lisp | |
parent | 6b48aedb6b3b1de0b41b61b727d14ab8277d2f73 (diff) | |
download | emacs-49d3cd90bd80a225d5ec26027318ffb4606ff513.tar.gz emacs-49d3cd90bd80a225d5ec26027318ffb4606ff513.tar.bz2 emacs-49d3cd90bd80a225d5ec26027318ffb4606ff513.zip |
rx: Improve 'or' compositionality (bug#37659)
Perform 'regexp-opt' on nested 'or' forms, and after expansion of
user-defined and 'eval' forms. Characters are now turned into strings
for wider 'regexp-opt' scope. This preserves the longest-match
semantics for string in 'or' forms over composition.
* doc/lispref/searching.texi (Rx Constructs): Document.
* lisp/emacs-lisp/rx.el (rx--normalise-or-arg)
(rx--all-string-or-args): New.
(rx--translate-or): Normalise arguments first, and check for strings
in subforms.
(rx--expand-eval): Extracted from rx--translate-eval.
(rx--translate-eval): Call rx--expand-eval.
* test/lisp/emacs-lisp/rx-tests.el (rx-or, rx-def-in-or): Add tests.
* etc/NEWS: Announce.
Diffstat (limited to 'lisp')
-rw-r--r-- | lisp/emacs-lisp/rx.el | 76 |
1 files changed, 48 insertions, 28 deletions
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index 1ee5e8294a6..a0b2444346a 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -254,22 +254,39 @@ Left-fold the list L, starting with X, by the binary function F." (setq l (cdr l))) x) +(defun rx--normalise-or-arg (form) + "Normalise the `or' argument FORM. +Characters become strings, user-definitions and `eval' forms are expanded, +and `or' forms are normalised recursively." + (cond ((characterp form) + (char-to-string form)) + ((and (consp form) (memq (car form) '(or |))) + (cons (car form) (mapcar #'rx--normalise-or-arg (cdr form)))) + ((and (consp form) (eq (car form) 'eval)) + (rx--normalise-or-arg (rx--expand-eval (cdr form)))) + (t + (let ((expanded (rx--expand-def form))) + (if expanded + (rx--normalise-or-arg expanded) + form))))) + +(defun rx--all-string-or-args (body) + "If BODY only consists of strings or such `or' forms, return all the strings. +Otherwise throw `rx--nonstring'." + (mapcan (lambda (form) + (cond ((stringp form) (list form)) + ((and (consp form) (memq (car form) '(or |))) + (rx--all-string-or-args (cdr form))) + (t (throw 'rx--nonstring nil)))) + body)) + (defun rx--translate-or (body) "Translate an or-pattern of zero or more rx items. Return (REGEXP . PRECEDENCE)." ;; FIXME: Possible improvements: ;; - ;; - Turn single characters to strings: (or ?a ?b) -> (or "a" "b"), - ;; so that they can be candidates for regexp-opt. - ;; - ;; - Translate compile-time strings (`eval' forms), again for regexp-opt. - ;; ;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D) - ;; in order to improve effectiveness of regexp-opt. - ;; This would also help composability. - ;; - ;; - Use associativity to run regexp-opt on contiguous subsets of arguments - ;; if not all of them are strings. Example: + ;; Then call regexp-opt on runs of string arguments. Example: ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank)) ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank)) ;; @@ -279,27 +296,26 @@ Return (REGEXP . PRECEDENCE)." ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word)) ;; -> (any "@" "%" digit "A-Z" space word) ;; -> "[A-Z@%[:digit:][:space:][:word:]]" - ;; - ;; Problem: If a subpattern is carefully written to be - ;; optimizable by regexp-opt, how do we prevent the transforms - ;; above from destroying that property? - ;; Example: (or "a" (or "abc" "abd" "abe")) (cond ((null body) ; No items: a never-matching regexp. (rx--empty)) ((null (cdr body)) ; Single item. (rx--translate (car body))) - ((rx--every #'stringp body) ; All strings. - (cons (list (regexp-opt body nil)) - t)) - ((rx--every #'rx--charset-p body) ; All charsets. - (rx--translate-union nil body)) (t - (cons (append (car (rx--translate (car body))) - (mapcan (lambda (item) - (cons "\\|" (car (rx--translate item)))) - (cdr body))) - nil)))) + (let* ((args (mapcar #'rx--normalise-or-arg body)) + (all-strings (catch 'rx--nonstring (rx--all-string-or-args args)))) + (cond + (all-strings ; Only strings. + (cons (list (regexp-opt all-strings nil)) + t)) + ((rx--every #'rx--charset-p args) ; All charsets. + (rx--translate-union nil args)) + (t + (cons (append (car (rx--translate (car args))) + (mapcan (lambda (item) + (cons "\\|" (car (rx--translate item)))) + (cdr args))) + nil))))))) (defun rx--charset-p (form) "Whether FORM looks like a charset, only consisting of character intervals @@ -840,11 +856,15 @@ Return (REGEXP . PRECEDENCE)." (cons (list (list 'regexp-quote arg)) 'seq)) (t (error "rx `literal' form with non-string argument"))))) -(defun rx--translate-eval (body) - "Translate the `eval' form. Return (REGEXP . PRECEDENCE)." +(defun rx--expand-eval (body) + "Expand `eval' arguments. Return a new rx form." (unless (and body (null (cdr body))) (error "rx `eval' form takes exactly one argument")) - (rx--translate (eval (car body)))) + (eval (car body))) + +(defun rx--translate-eval (body) + "Translate the `eval' form. Return (REGEXP . PRECEDENCE)." + (rx--translate (rx--expand-eval body))) (defvar rx--regexp-atomic-regexp nil) |