diff options
author | Mattias EngdegÄrd <mattiase@acm.org> | 2019-02-24 22:12:52 +0100 |
---|---|---|
committer | Mattias EngdegÄrd <mattiase@acm.org> | 2019-03-02 15:35:28 +0100 |
commit | da758046da74e33273265cd2e72a8aa1a0c9c7e3 (patch) | |
tree | 4337523f0b56c12d69f27a91ee0a1b61376c0e7e /lisp/emacs-lisp/regexp-opt.el | |
parent | dbffbe08815644fd30404891ef81496277ed27da (diff) | |
download | emacs-da758046da74e33273265cd2e72a8aa1a0c9c7e3.tar.gz emacs-da758046da74e33273265cd2e72a8aa1a0c9c7e3.tar.bz2 emacs-da758046da74e33273265cd2e72a8aa1a0c9c7e3.zip |
rx: fix `or' ordering by adding argument to regexp-opt
The rx `or' form may reorder its arguments in an unpredictable way,
contrary to user expectation, since it sometimes uses `regexp-opt'.
Add a NOREORDER option to `regexp-opt' for preventing it from
producing a reordered regexp (Bug#34641).
* doc/lispref/searching.texi (Regular Expression Functions):
* etc/NEWS (Lisp Changes in Emacs 27.1):
Describe the new regexp-opt NOREORDER argument.
* lisp/emacs-lisp/regexp-opt.el (regexp-opt): Add NOREORDER.
Make no attempt at regexp improvement if the set of strings contains
a prefix of another string.
(regexp-opt--contains-prefix): New.
* lisp/emacs-lisp/rx.el (rx-or): Call regexp-opt with NOREORDER.
* test/lisp/emacs-lisp/rx-tests.el: Test rx `or' form match order.
Diffstat (limited to 'lisp/emacs-lisp/regexp-opt.el')
-rw-r--r-- | lisp/emacs-lisp/regexp-opt.el | 38 |
1 files changed, 34 insertions, 4 deletions
diff --git a/lisp/emacs-lisp/regexp-opt.el b/lisp/emacs-lisp/regexp-opt.el index 63786c1508c..d0c5f2d3fc4 100644 --- a/lisp/emacs-lisp/regexp-opt.el +++ b/lisp/emacs-lisp/regexp-opt.el @@ -84,7 +84,7 @@ ;;; Code: ;;;###autoload -(defun regexp-opt (strings &optional paren) +(defun regexp-opt (strings &optional paren noreorder) "Return a regexp to match a string in the list STRINGS. Each string should be unique in STRINGS and should not contain any regexps, quoted or not. Optional PAREN specifies how the @@ -111,8 +111,14 @@ nil necessary to ensure that a postfix operator appended to it will apply to the whole expression. -The resulting regexp is equivalent to but usually more efficient -than that of a simplified version: +The optional argument NOREORDER, if nil or omitted, allows the +returned regexp to match the strings in any order. If non-nil, +the match is guaranteed to be performed in the order given, as if +the strings were made into a regexp by joining them with the +`\\|' operator. + +Up to reordering, the resulting regexp is equivalent to but +usually more efficient than that of a simplified version: (defun simplified-regexp-opt (strings &optional paren) (let ((parens @@ -133,7 +139,15 @@ than that of a simplified version: (open (cond ((stringp paren) paren) (paren "\\("))) (sorted-strings (delete-dups (sort (copy-sequence strings) 'string-lessp))) - (re (regexp-opt-group sorted-strings (or open t) (not open)))) + (re + ;; If NOREORDER is non-nil and the list contains a prefix + ;; of another string, we give up all attempts at optimisation. + ;; There is plenty of room for improvement (Bug#34641). + (if (and noreorder (regexp-opt--contains-prefix sorted-strings)) + (concat (or open "\\(?:") + (mapconcat #'regexp-quote strings "\\|") + "\\)") + (regexp-opt-group sorted-strings (or open t) (not open))))) (cond ((eq paren 'words) (concat "\\<" re "\\>")) ((eq paren 'symbols) @@ -313,6 +327,22 @@ CHARS should be a list of characters." (concat "[" dash caret "]")) (concat "[" bracket charset caret dash "]")))) + +(defun regexp-opt--contains-prefix (strings) + "Whether STRINGS contains a proper prefix of one of its other elements. +STRINGS must be a list of sorted strings without duplicates." + (let ((s strings)) + ;; In a lexicographically sorted list, a string always immediately + ;; succeeds one of its prefixes. + (while (and (cdr s) + (not (string-equal + (car s) + (substring (cadr s) 0 (min (length (car s)) + (length (cadr s))))))) + (setq s (cdr s))) + (cdr s))) + + (provide 'regexp-opt) ;;; regexp-opt.el ends here |