From 157e735ce89ede9cc939f4ed0f72c5af7ae60735 Mon Sep 17 00:00:00 2001 From: Mattias Engdegård Date: Mon, 17 Jul 2023 13:05:21 +0200 Subject: Don't distort character ranges in rx translation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Emacs regexp engine interprets character ranges from ASCII to raw bytes, such as [a-\xfe], as not including non-ASCII Unicode at all; ranges from non-ACII Unicode to raw bytes, such as [ü-\x91], are ignored entirely. To make rx produce a translation that works as intended, split ranges that that go from ordinary characters to raw bytes. Such ranges may appear from set manipulation and regexp optimisation. * lisp/emacs-lisp/rx.el (rx--generate-alt): Split intervals that straddle the char-raw boundary when rendering a string regexp from an interval set. * test/lisp/emacs-lisp/rx-tests.el (rx-char-any-raw-byte): Add test cases. --- test/lisp/emacs-lisp/rx-tests.el | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'test/lisp/emacs-lisp') diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el index 028250b7352..995d297ff08 100644 --- a/test/lisp/emacs-lisp/rx-tests.el +++ b/test/lisp/emacs-lisp/rx-tests.el @@ -98,7 +98,17 @@ "[\177Å\211\326-\377]")) ;; Split range; \177-\377ÿ should not be optimized to \177-\377. (should (equal (rx (any "\177-\377" ?ÿ)) - "[\177ÿ\200-\377]"))) + "[\177ÿ\200-\377]")) + ;; Range between normal chars and raw bytes: must be split to be parsed + ;; correctly by the Emacs regexp engine. + (should (equal + (rx (any (0 . #x3fffff)) (any (?G . #x3fff9a)) (any (?Ü . #x3ffff2))) + "[\0-\x3fff7f\x80-\xff][G-\x3fff7f\x80-\x9a][Ü-\x3fff7f\x80-\xf2]")) + ;; As above but with ranges in string form. For historical reasons, + ;; we special-case ASCII-to-raw ranges to exclude non-ASCII unicode. + (should (equal + (rx (any "\x00-\xff") (any "G-\x9a") (any "Ü-\xf2")) + "[\0-\x7f\x80-\xff][G-\x7f\x80-\x9a][Ü-\x3fff7f\x80-\xf2]"))) (ert-deftest rx-any () (should (equal (rx (any ?A (?C . ?D) "F-H" "J-L" "M" "N-P" "Q" "RS")) -- cgit v1.2.3