summaryrefslogtreecommitdiff
path: root/test/lisp/emacs-lisp
diff options
context:
space:
mode:
authorMattias Engdegård <mattiase@acm.org>2023-07-17 13:05:21 +0200
committerMattias Engdegård <mattiase@acm.org>2023-07-17 17:56:54 +0200
commit157e735ce89ede9cc939f4ed0f72c5af7ae60735 (patch)
tree58c17cfd219647bec129ccbc77ced41233d65844 /test/lisp/emacs-lisp
parent7446a8c34e2b793df52dbf56b630e20f8c10568c (diff)
downloademacs-157e735ce89ede9cc939f4ed0f72c5af7ae60735.tar.gz
emacs-157e735ce89ede9cc939f4ed0f72c5af7ae60735.tar.bz2
emacs-157e735ce89ede9cc939f4ed0f72c5af7ae60735.zip
Don't distort character ranges in rx translation
The Emacs regexp engine interprets character ranges from ASCII to raw bytes, such as [a-\xfe], as not including non-ASCII Unicode at all; ranges from non-ACII Unicode to raw bytes, such as [ü-\x91], are ignored entirely. To make rx produce a translation that works as intended, split ranges that that go from ordinary characters to raw bytes. Such ranges may appear from set manipulation and regexp optimisation. * lisp/emacs-lisp/rx.el (rx--generate-alt): Split intervals that straddle the char-raw boundary when rendering a string regexp from an interval set. * test/lisp/emacs-lisp/rx-tests.el (rx-char-any-raw-byte): Add test cases.
Diffstat (limited to 'test/lisp/emacs-lisp')
-rw-r--r--test/lisp/emacs-lisp/rx-tests.el12
1 files changed, 11 insertions, 1 deletions
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el
index 028250b7352..995d297ff08 100644
--- a/test/lisp/emacs-lisp/rx-tests.el
+++ b/test/lisp/emacs-lisp/rx-tests.el
@@ -98,7 +98,17 @@
"[\177Å\211\326-\377]"))
;; Split range; \177-\377ÿ should not be optimized to \177-\377.
(should (equal (rx (any "\177-\377" ?ÿ))
- "[\177ÿ\200-\377]")))
+ "[\177ÿ\200-\377]"))
+ ;; Range between normal chars and raw bytes: must be split to be parsed
+ ;; correctly by the Emacs regexp engine.
+ (should (equal
+ (rx (any (0 . #x3fffff)) (any (?G . #x3fff9a)) (any (?Ü . #x3ffff2)))
+ "[\0-\x3fff7f\x80-\xff][G-\x3fff7f\x80-\x9a][Ü-\x3fff7f\x80-\xf2]"))
+ ;; As above but with ranges in string form. For historical reasons,
+ ;; we special-case ASCII-to-raw ranges to exclude non-ASCII unicode.
+ (should (equal
+ (rx (any "\x00-\xff") (any "G-\x9a") (any "Ü-\xf2"))
+ "[\0-\x7f\x80-\xff][G-\x7f\x80-\x9a][Ü-\x3fff7f\x80-\xf2]")))
(ert-deftest rx-any ()
(should (equal (rx (any ?A (?C . ?D) "F-H" "J-L" "M" "N-P" "Q" "RS"))