summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMattias EngdegÄrd <mattiase@acm.org>2019-02-24 22:12:52 +0100
committerMattias EngdegÄrd <mattiase@acm.org>2019-03-02 15:35:28 +0100
commitda758046da74e33273265cd2e72a8aa1a0c9c7e3 (patch)
tree4337523f0b56c12d69f27a91ee0a1b61376c0e7e
parentdbffbe08815644fd30404891ef81496277ed27da (diff)
downloademacs-da758046da74e33273265cd2e72a8aa1a0c9c7e3.tar.gz
emacs-da758046da74e33273265cd2e72a8aa1a0c9c7e3.tar.bz2
emacs-da758046da74e33273265cd2e72a8aa1a0c9c7e3.zip
rx: fix `or' ordering by adding argument to regexp-opt
The rx `or' form may reorder its arguments in an unpredictable way, contrary to user expectation, since it sometimes uses `regexp-opt'. Add a NOREORDER option to `regexp-opt' for preventing it from producing a reordered regexp (Bug#34641). * doc/lispref/searching.texi (Regular Expression Functions): * etc/NEWS (Lisp Changes in Emacs 27.1): Describe the new regexp-opt NOREORDER argument. * lisp/emacs-lisp/regexp-opt.el (regexp-opt): Add NOREORDER. Make no attempt at regexp improvement if the set of strings contains a prefix of another string. (regexp-opt--contains-prefix): New. * lisp/emacs-lisp/rx.el (rx-or): Call regexp-opt with NOREORDER. * test/lisp/emacs-lisp/rx-tests.el: Test rx `or' form match order.
-rw-r--r--doc/lispref/searching.texi13
-rw-r--r--etc/NEWS7
-rw-r--r--lisp/emacs-lisp/regexp-opt.el38
-rw-r--r--lisp/emacs-lisp/rx.el2
-rw-r--r--test/lisp/emacs-lisp/rx-tests.el13
5 files changed, 65 insertions, 8 deletions
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi
index cfbd2449b13..fb7f48474d5 100644
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@@ -950,7 +950,7 @@ whitespace:
@end defun
@cindex optimize regexp
-@defun regexp-opt strings &optional paren
+@defun regexp-opt strings &optional paren noreorder
This function returns an efficient regular expression that will match
any of the strings in the list @var{strings}. This is useful when you
need to make matching or searching as fast as possible---for example,
@@ -985,8 +985,15 @@ if it is necessary to ensure that a postfix operator appended to
it will apply to the whole expression.
@end table
-The resulting regexp of @code{regexp-opt} is equivalent to but usually
-more efficient than that of a simplified version:
+The optional argument @var{noreorder}, if @code{nil} or omitted,
+allows the returned regexp to match the strings in any order. If
+non-@code{nil}, the match is guaranteed to be performed in the order
+given, as if the strings were made into a regexp by joining them with
+the @samp{\|} operator.
+
+Up to reordering, the resulting regexp of @code{regexp-opt} is
+equivalent to but usually more efficient than that of a simplified
+version:
@example
(defun simplified-regexp-opt (strings &optional paren)
diff --git a/etc/NEWS b/etc/NEWS
index 29ed7ab4819..7c95988ff52 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -1642,6 +1642,13 @@ MS-Windows.
** New module environment function 'process_input' to process user
input while module code is running.
++++
+** The function 'regexp-opt' accepts an additional optional argument.
+By default, the regexp returned by 'regexp-opt' may match the strings
+in any order. If the new third argument is non-nil, the match is
+guaranteed to be performed in the order given, as if the strings were
+made into a regexp by joining them with '\|'.
+
* Changes in Emacs 27.1 on Non-Free Operating Systems
diff --git a/lisp/emacs-lisp/regexp-opt.el b/lisp/emacs-lisp/regexp-opt.el
index 63786c1508c..d0c5f2d3fc4 100644
--- a/lisp/emacs-lisp/regexp-opt.el
+++ b/lisp/emacs-lisp/regexp-opt.el
@@ -84,7 +84,7 @@
;;; Code:
;;;###autoload
-(defun regexp-opt (strings &optional paren)
+(defun regexp-opt (strings &optional paren noreorder)
"Return a regexp to match a string in the list STRINGS.
Each string should be unique in STRINGS and should not contain
any regexps, quoted or not. Optional PAREN specifies how the
@@ -111,8 +111,14 @@ nil
necessary to ensure that a postfix operator appended to it will
apply to the whole expression.
-The resulting regexp is equivalent to but usually more efficient
-than that of a simplified version:
+The optional argument NOREORDER, if nil or omitted, allows the
+returned regexp to match the strings in any order. If non-nil,
+the match is guaranteed to be performed in the order given, as if
+the strings were made into a regexp by joining them with the
+`\\|' operator.
+
+Up to reordering, the resulting regexp is equivalent to but
+usually more efficient than that of a simplified version:
(defun simplified-regexp-opt (strings &optional paren)
(let ((parens
@@ -133,7 +139,15 @@ than that of a simplified version:
(open (cond ((stringp paren) paren) (paren "\\(")))
(sorted-strings (delete-dups
(sort (copy-sequence strings) 'string-lessp)))
- (re (regexp-opt-group sorted-strings (or open t) (not open))))
+ (re
+ ;; If NOREORDER is non-nil and the list contains a prefix
+ ;; of another string, we give up all attempts at optimisation.
+ ;; There is plenty of room for improvement (Bug#34641).
+ (if (and noreorder (regexp-opt--contains-prefix sorted-strings))
+ (concat (or open "\\(?:")
+ (mapconcat #'regexp-quote strings "\\|")
+ "\\)")
+ (regexp-opt-group sorted-strings (or open t) (not open)))))
(cond ((eq paren 'words)
(concat "\\<" re "\\>"))
((eq paren 'symbols)
@@ -313,6 +327,22 @@ CHARS should be a list of characters."
(concat "[" dash caret "]"))
(concat "[" bracket charset caret dash "]"))))
+
+(defun regexp-opt--contains-prefix (strings)
+ "Whether STRINGS contains a proper prefix of one of its other elements.
+STRINGS must be a list of sorted strings without duplicates."
+ (let ((s strings))
+ ;; In a lexicographically sorted list, a string always immediately
+ ;; succeeds one of its prefixes.
+ (while (and (cdr s)
+ (not (string-equal
+ (car s)
+ (substring (cadr s) 0 (min (length (car s))
+ (length (cadr s)))))))
+ (setq s (cdr s)))
+ (cdr s)))
+
+
(provide 'regexp-opt)
;;; regexp-opt.el ends here
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index 715cd608c46..ca756efb493 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -393,7 +393,7 @@ FORM is of the form `(and FORM1 ...)'."
(rx-group-if
(if (memq nil (mapcar 'stringp (cdr form)))
(mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|")
- (regexp-opt (cdr form)))
+ (regexp-opt (cdr form) nil t))
(and (memq rx-parent '(: * t)) rx-parent)))
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el
index e14feda347f..fa3d9b0d5ea 100644
--- a/test/lisp/emacs-lisp/rx-tests.el
+++ b/test/lisp/emacs-lisp/rx-tests.el
@@ -92,5 +92,18 @@
(*? "e") (+? "f") (\?? "g") (?? "h"))))
"a*b+c?d?e*?f+?g??h??")))
+(ert-deftest rx-or ()
+ ;; Test or-pattern reordering (Bug#34641).
+ (let ((s "abc"))
+ (should (equal (and (string-match (rx (or "abc" "ab" "a")) s)
+ (match-string 0 s))
+ "abc"))
+ (should (equal (and (string-match (rx (or "ab" "abc" "a")) s)
+ (match-string 0 s))
+ "ab"))
+ (should (equal (and (string-match (rx (or "a" "ab" "abc")) s)
+ (match-string 0 s))
+ "a"))))
+
(provide 'rx-tests)
;; rx-tests.el ends here.