From 478bbf7c80e71ff84f0e4e1363bf86e93d9c51c3 Mon Sep 17 00:00:00 2001
From: Mattias Engdegård <mattiase@acm.org>
Date: Fri, 15 Feb 2019 19:27:48 +0100
Subject: Prevent over-eager rx character range condensation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`rx' incorrectly considers character ranges between ASCII and raw bytes to
cover all codes in between, which includes all non-ASCII Unicode characters.
This causes (any "\000-\377" ?Å) to be simplified to (any "\000-\377"),
which is not at all the same thing: [\000-\377] really means
[\000-\177\200-\377] (Bug#34492).

* lisp/emacs-lisp/rx.el (rx-any-condense-range): Split ranges going
from ASCII to raw bytes.
* test/lisp/emacs-lisp/rx-tests.el (rx-char-any-raw-byte): Add test case.
* etc/NEWS: Mention the overall change (Bug#33205).
---
 lisp/emacs-lisp/rx.el | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'lisp/emacs-lisp')

diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index b2299030a1b..715cd608c46 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -429,6 +429,13 @@ Only both edges of each range is checked."
     ;; set L list of all ranges
     (mapc (lambda (e) (cond ((stringp e) (push e str))
 			    ((numberp e) (push (cons e e) l))
+                            ;; Ranges between ASCII and raw bytes are split,
+                            ;; to prevent accidental inclusion of Unicode
+                            ;; characters later on.
+                            ((and (<= (car e) #x7f)
+                                  (>= (cdr e) #x3fff80))
+                             (push (cons (car e) #x7f) l)
+                             (push (cons #x3fff80 (cdr e)) l))
 			    (t (push e l))))
 	  args)
     ;; condense overlapped ranges in L
-- 
cgit v1.2.3