diff options
Diffstat (limited to 'test/src/regex-emacs-tests.el')
-rw-r--r-- | test/src/regex-emacs-tests.el | 870 |
1 files changed, 870 insertions, 0 deletions
diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el new file mode 100644 index 00000000000..ff0d6be3f5d --- /dev/null +++ b/test/src/regex-emacs-tests.el @@ -0,0 +1,870 @@ +;;; regex-emacs-tests.el --- tests for regex-emacs.c -*- lexical-binding: t -*- + +;; Copyright (C) 2015-2022 Free Software Foundation, Inc. + +;; This file is part of GNU Emacs. + +;; GNU Emacs is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. + +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>. + +;;; Code: + +(require 'ert) + +(defvar regex-tests--resources-dir + (concat (concat (file-name-directory (or load-file-name buffer-file-name)) + "/regex-resources/")) + "Path to regex-resources directory next to the \"regex-emacs-tests.el\" file.") + +(ert-deftest regex-word-cc-fallback-test () + "Test that \"[[:cc:]]*x\" matches \"x\" (bug#24020). + +Test that a regex of the form \"[[:cc:]]*x\" where CC is +a character class which matches a multibyte character X, matches +string \"x\". + +For example, \"[[:word:]]*\u2620\" regex (note: \u2620 is a word +character) must match a string \"\u2420\"." + (dolist (class '("[[:word:]]" "\\sw")) + (dolist (repeat '("*" "+")) + (dolist (suffix '("" "b" "bar" "\u2620")) + (dolist (string '("" "foo")) + (when (not (and (string-equal repeat "+") + (string-equal string ""))) + (should (string-match (concat "^" class repeat suffix "$") + (concat string suffix))))))))) + +(defun regex--test-cc (name matching not-matching) + (let (case-fold-search) + (should (string-match-p (concat "^[[:" name ":]]*$") matching)) + (should (string-match-p (concat "^[[:" name ":]]*?\u2622$") + (concat matching "\u2622"))) + (should (string-match-p (concat "^[^[:" name ":]]*$") not-matching)) + (should (string-match-p (concat "^[^[:" name ":]]*\u2622$") + (concat not-matching "\u2622"))) + (with-temp-buffer + (insert matching) + (let ((p (point))) + (insert not-matching) + (goto-char (point-min)) + (skip-chars-forward (concat "[:" name ":]")) + (should (equal (point) p)) + (skip-chars-forward (concat "^[:" name ":]")) + (should (equal (point) (point-max))) + (goto-char (point-min)) + (skip-chars-forward (concat "[:" name ":]\u2622")) + (should (or (equal (point) p) (equal (point) (1+ p)))))))) + +(dolist (test '(("alnum" "abcABC012łąka" "-, \t\n") + ("alpha" "abcABCłąka" "-,012 \t\n") + ("digit" "012" "abcABCłąka-, \t\n") + ("xdigit" "0123aBc" "łąk-, \t\n") + ("upper" "ABCŁĄKA" "abc012-, \t\n") + ("lower" "abcłąka" "ABC012-, \t\n") + + ("word" "abcABC012\u2620" "-, \t\n") + + ("punct" ".,-" "abcABC012\u2620 \t\n") + ("cntrl" "\1\2\t\n" ".,-abcABC012\u2620 ") + ("graph" "abcłąka\u2620-," " \t\n\1") + ("print" "abcłąka\u2620-, " "\t\n\1") + + ("space" " \t\n\u2001" "abcABCł0123") + ("blank" " \t\u2001" "\n") + + ("ascii" "abcABC012 \t\n\1" "łą\u2620") + ("nonascii" "łą\u2622" "abcABC012 \t\n\1") + ("unibyte" "abcABC012 \t\n\1" "łą\u2622") + ("multibyte" "łą\u2622" "abcABC012 \t\n\1"))) + (let ((name (intern (concat "regex-tests-" (car test) "-character-class"))) + (doc (concat "Perform sanity test of regexes using " (car test) + " character class. + +Go over all the supported character classes and test whether the +classes and their inversions match what they are supposed to +match. The test is done using `string-match-p' as well as +`skip-chars-forward'."))) + (eval `(ert-deftest ,name () ,doc ,(cons 'regex--test-cc test)) t))) + + +(defmacro regex-tests-generic-line (comment-char test-file whitelist &rest body) + "Reads a line of the test file TEST-FILE, skipping +comments (defined by COMMENT-CHAR), and evaluates the tests in +this line as defined in the BODY. Line numbers in the WHITELIST +are known failures, and are skipped." + + `(with-temp-buffer + (modify-syntax-entry ?_ "w;; ") ; tests expect _ to be a word + (insert-file-contents (concat regex-tests--resources-dir ,test-file)) + (let ((case-fold-search nil) + (line-number 1) + (whitelist-idx 0)) + + (goto-char (point-min)) + + (while (not (eobp)) + (let ((start (point))) + (end-of-line) + (narrow-to-region start (point)) + + (goto-char (point-min)) + + (when + (and + ;; ignore comments + (save-excursion + (re-search-forward ,(concat "^[^" (string comment-char) "]") nil t)) + + ;; skip lines in the whitelist + (let ((whitelist-next + (condition-case nil + (aref ,whitelist whitelist-idx) (args-out-of-range nil)))) + (cond + ;; whitelist exhausted. do process this line + ((null whitelist-next) t) + + ;; we're not yet at the next whitelist element. do + ;; process this line + ((< line-number whitelist-next) t) + + ;; we're past the next whitelist element. This + ;; shouldn't happen + ((> line-number whitelist-next) + (error + (format + "We somehow skipped the next whitelist element: line %d" whitelist-next))) + + ;; we're at the next whitelist element. Skip this + ;; line, and advance the whitelist index + (t + (setq whitelist-idx (1+ whitelist-idx)) nil)))) + ,@body) + + (widen) + (forward-line) + (beginning-of-line) + (setq line-number (1+ line-number))))))) + +(defun regex-tests-compare (string what-failed bounds-ref &optional substring-ref) + "I just ran a search, looking at STRING. WHAT-FAILED describes +what failed, if anything; valid values are `search-failed', +`compilation-failed' and nil. I compare the beginning/end of each +group with their expected values. This is done with either +BOUNDS-REF or SUBSTRING-REF; one of those should be non-nil. +BOUNDS-REF is a sequence [start-ref0 end-ref0 start-ref1 +end-ref1 ....] while SUBSTRING-REF is the expected substring +obtained by indexing the input string by start/end-ref. + +If the search was supposed to fail then start-ref0/substring-ref0 +is `search-failed'. If the search wasn't even supposed to compile +successfully, then start-ref0/substring-ref0 is +`compilation-failed'. If I only care about a match succeeding, +this can be set to t. + +This function returns a string that describes the failure, or nil +on success" + + (when (or + (and bounds-ref substring-ref) + (not (or bounds-ref substring-ref))) + (error "Exactly one of bounds-ref and bounds-ref should be non-nil")) + + (let ((what-failed-ref (car (or bounds-ref substring-ref)))) + + (cond + ((eq what-failed 'search-failed) + (cond + ((eq what-failed-ref 'search-failed) + nil) + ((eq what-failed-ref 'compilation-failed) + "Expected pattern failure; but no match") + (t + "Expected match; but no match"))) + + ((eq what-failed 'compilation-failed) + (cond + ((eq what-failed-ref 'search-failed) + "Expected no match; but pattern failure") + ((eq what-failed-ref 'compilation-failed) + nil) + (t + "Expected match; but pattern failure"))) + + ;; The regex match succeeded + ((eq what-failed-ref 'search-failed) + "Expected no match; but match") + ((eq what-failed-ref 'compilation-failed) + "Expected pattern failure; but match") + + ;; The regex match succeeded, as expected. I now check all the + ;; bounds + (t + (let ((idx 0) + msg + ref next-ref-function compare-ref-function mismatched-ref-function) + + (if bounds-ref + (setq ref bounds-ref + next-ref-function (lambda (x) (cddr x)) + compare-ref-function (lambda (ref start-pos end-pos) + (or (eq (car ref) t) + (and (eq start-pos (car ref)) + (eq end-pos (cadr ref))))) + mismatched-ref-function (lambda (ref start-pos end-pos) + (format + "beginning/end positions: %d/%s and %d/%s" + start-pos (car ref) end-pos (cadr ref)))) + (setq ref substring-ref + next-ref-function (lambda (x) (cdr x)) + compare-ref-function (lambda (ref start-pos end-pos) + (or (eq (car ref) t) + (string= (substring string start-pos end-pos) (car ref)))) + mismatched-ref-function (lambda (ref start-pos end-pos) + (format + "beginning/end positions: %d/%s and %d/%s" + start-pos (car ref) end-pos (cadr ref))))) + + (while (not (or (null ref) msg)) + + (let ((start (match-beginning idx)) + (end (match-end idx))) + + (when (not (funcall compare-ref-function ref start end)) + (setq msg + (format + "Have expected match, but mismatch in group %d: %s" idx (funcall mismatched-ref-function ref start end)))) + + (setq ref (funcall next-ref-function ref) + idx (1+ idx)))) + + (or msg + nil)))))) + + + +(defun regex-tests-match (pattern string bounds-ref &optional substring-ref) + "I match the given STRING against PATTERN. I compare the +beginning/end of each group with their expected values. +BOUNDS-REF is a sequence [start-ref0 end-ref0 start-ref1 end-ref1 +....]. + +If the search was supposed to fail then start-ref0 is +`search-failed'. If the search wasn't even supposed to compile +successfully, then start-ref0 is `compilation-failed'. + +This function returns a string that describes the failure, or nil +on success" + + (if (string-match "\\[\\([\\.=]\\)..?\\1\\]" pattern) + ;; Skipping test: [.x.] and [=x=] forms not supported by emacs + nil + + (regex-tests-compare + string + (condition-case nil + (if (string-match pattern string) nil 'search-failed) + ('invalid-regexp 'compilation-failed)) + bounds-ref substring-ref))) + + +(defconst regex-tests-re-even-escapes + "\\(?:^\\|[^\\]\\)\\(?:\\\\\\\\\\)*" + "Regex that matches an even number of \\ characters.") + +(defconst regex-tests-re-odd-escapes + (concat regex-tests-re-even-escapes "\\\\") + "Regex that matches an odd number of \\ characters.") + + +(defun regex-tests-unextend (pattern) + "Basic conversion from extended regexes to emacs ones. This is +mostly a hack that adds \\ to () and | and {}, and removes it if +it already exists. We also change \\S (and \\s) to \\S- (and +\\s-) because extended regexes see the former as whitespace, but +emacs requires an extra symbol character" + + (with-temp-buffer + (insert pattern) + (goto-char (point-min)) + + (while (re-search-forward "[()|{}]" nil t) + ;; point is past special character. If it is escaped, unescape + ;; it + + (if (save-excursion + (re-search-backward (concat regex-tests-re-odd-escapes ".\\=") nil t)) + + ;; This special character is preceded by an odd number of \, + ;; so I unescape it by removing the last one + (progn + (forward-char -2) + (delete-char 1) + (forward-char 1)) + + ;; This special character is preceded by an even (possibly 0) + ;; number of \. I add an escape + (forward-char -1) + (insert "\\") + (forward-char 1))) + + ;; convert \s to \s- + (goto-char (point-min)) + (while (re-search-forward (concat regex-tests-re-odd-escapes "[Ss]") nil t) + (insert "-")) + + (buffer-string))) + +(defun regex-tests-BOOST-frob-escapes (s ispattern) + "Mangle \\ the way it is done in frob_escapes() in +regex-tests-BOOST.c in glibc: \\t, \\n, \\r are interpreted; +\\\\, \\^, \\{, \\|, \\} are unescaped for the string (not +pattern)" + + ;; this is all similar to (regex-tests-unextend) + (with-temp-buffer + (insert s) + + (let ((interpret-list (list "t" "n" "r"))) + (while interpret-list + (goto-char (point-min)) + (while (re-search-forward + (concat "\\(" regex-tests-re-even-escapes "\\)" + "\\\\" (car interpret-list)) + nil t) + (replace-match (concat "\\1" (car (read-from-string + (concat "\"\\" (car interpret-list) "\"")))))) + + (setq interpret-list (cdr interpret-list)))) + + (when (not ispattern) + ;; unescape \\, \^, \{, \|, \} + (let ((unescape-list (list "\\\\" "^" "{" "|" "}"))) + (while unescape-list + (goto-char (point-min)) + (while (re-search-forward + (concat "\\(" regex-tests-re-even-escapes "\\)" + "\\\\" (car unescape-list)) + nil t) + (replace-match (concat "\\1" (car unescape-list)))) + + (setq unescape-list (cdr unescape-list)))) + ) + (buffer-string))) + + + + +(defconst regex-tests-BOOST-whitelist + [ + ;; emacs is more stringent with regexes involving unbalanced ) + 63 65 69 + + ;; in emacs, regex . doesn't match \n + 91 + + ;; emacs is more forgiving with * and ? that don't apply to + ;; characters + 107 108 109 122 123 124 140 141 142 + + ;; emacs accepts regexes with {} + 161 + + ;; emacs doesn't fail on bogus ranges such as [3-1] or [1-3-5] + 222 223 + + ;; emacs doesn't match (ab*)[ab]*\1 greedily: only 4 chars of + ;; ababaaa match + 284 294 + + ;; ambiguous groupings are ambiguous + 443 444 445 446 448 449 450 + + ;; emacs doesn't know how to handle weird ranges such as [a-Z] and + ;; [[:alpha:]-a] + 539 580 581 + + ;; emacs matches non-greedy regex ab.*? non-greedily + 639 677 712 + ] + "Line numbers in the boost test that should be skipped. +These are false-positive test failures that represent +known/benign differences in behavior.") + +;; - Format +;; - Comments are lines starting with ; +;; - Lines starting with - set options passed to regcomp() and regexec(): +;; - if no "REG_BASIC" is found, with have an extended regex +;; - These set a flag: +;; - REG_ICASE +;; - REG_NEWLINE (ignored by this function) +;; - REG_NOTBOL +;; - REG_NOTEOL +;; +;; - Test lines are +;; pattern string start0 end0 start1 end1 ... +;; +;; - pattern, string can have escapes +;; - string can have whitespace if enclosed in "" +;; - if string is "!", then the pattern is supposed to fail compilation +;; - start/end are of group0, group1, etc. group 0 is the full match +;; - start<0 indicates "no match" +;; - start is the 0-based index of the first character +;; - end is the 0-based index of the first character past the group +(defun regex-tests-BOOST () + (let (failures + basic icase notbol noteol) + (regex-tests-generic-line + ?\; "BOOST.tests" regex-tests-BOOST-whitelist + (if (save-excursion (re-search-forward "^-" nil t)) + (setq basic (save-excursion (re-search-forward "REG_BASIC" nil t)) + icase (save-excursion (re-search-forward "REG_ICASE" nil t)) + notbol (save-excursion (re-search-forward "REG_NOTBOL" nil t)) + noteol (save-excursion (re-search-forward "REG_NOTEOL" nil t))) + + (save-excursion + (or (re-search-forward "\\(\\S-+\\)\\s-+\"\\(.*\\)\"\\s-+?\\(.+\\)" nil t) + (re-search-forward "\\(\\S-+\\)\\s-+\\(\\S-+\\)\\s-+?\\(.+\\)" nil t) + (re-search-forward "\\(\\S-+\\)\\s-+\\(!\\)" nil t))) + + (let* ((pattern-raw (match-string 1)) + (string-raw (match-string 2)) + (positions-raw (match-string 3)) + (pattern (regex-tests-BOOST-frob-escapes pattern-raw t)) + (string (regex-tests-BOOST-frob-escapes string-raw nil)) + (positions + (if (string= string "!") + (list 'compilation-failed 0) + (mapcar + (lambda (x) + (let ((x (string-to-number x))) + (if (< x 0) nil x))) + (split-string positions-raw))))) + + (when (null (car positions)) + (setcar positions 'search-failed)) + + (when (not basic) + (setq pattern (regex-tests-unextend pattern))) + + ;; great. I now have all the data parsed. Let's use it to do + ;; stuff + (let* ((case-fold-search icase) + (msg (regex-tests-match pattern string positions))) + + (if (and + ;; Skipping test: notbol/noteol not supported + (not notbol) (not noteol) + + msg) + + ;; store failure + (setq failures + (cons (format "line number %d: Regex '%s': %s" + line-number pattern msg) + failures))))))) + + failures)) + +(defconst regex-tests-PCRE-whitelist + [ + ;; ambiguous groupings are ambiguous + 610 611 1154 1157 1160 1168 1171 1176 1179 1182 1185 1188 1193 1196 1203 + ] + "Line numbers in the PCRE test that should be skipped. +These are false-positive test failures that represent +known/benign differences in behavior.") + +;; - Format +;; +;; regex +;; input_string +;; group_num: group_match | "No match" +;; input_string +;; group_num: group_match | "No match" +;; input_string +;; group_num: group_match | "No match" +;; input_string +;; group_num: group_match | "No match" +;; ... +(defun regex-tests-PCRE () + (let (failures + pattern icase string what-failed matches-observed) + (regex-tests-generic-line + ?# "PCRE.tests" regex-tests-PCRE-whitelist + + (cond + + ;; pattern + ((save-excursion (re-search-forward "^/\\(.*\\)/\\(.*\\)$" nil t)) + (setq icase (string= "i" (match-string 2)) + pattern (regex-tests-unextend (match-string 1)))) + + ;; string. read it in, match against pattern, and save all the results + ((save-excursion (re-search-forward "^ \\(.*\\)" nil t)) + (let ((case-fold-search icase)) + (setq string (match-string 1) + + ;; the regex match under test + what-failed + (condition-case nil + (if (string-match pattern string) nil 'search-failed) + ('invalid-regexp 'compilation-failed)) + + matches-observed + (cl-loop for x from 0 to 20 + collect (and (not what-failed) + (or (match-string x string) "<unset>"))))) + nil) + + ;; verification line: failed match + ((save-excursion (re-search-forward "^No match" nil t)) + (unless what-failed + (setq failures + (cons (format "line number %d: Regex '%s': Expected no match; but match" + line-number pattern) + failures)))) + + ;; verification line: succeeded match + ((save-excursion (re-search-forward "^ *\\([0-9]+\\): \\(.*\\)" nil t)) + (let* ((match-ref (match-string 2)) + (idx (string-to-number (match-string 1)))) + + (if what-failed + "Expected match; but no match" + (unless (string= match-ref (elt matches-observed idx)) + (setq failures + (cons (format "line number %d: Regex '%s': Have expected match, but group %d is wrong: '%s'/'%s'" + line-number pattern + idx match-ref (elt matches-observed idx)) + failures)))))) + + ;; reset + (t (setq pattern nil) nil))) + + failures)) + +(defconst regex-tests-PTESTS-whitelist + [ + ;; emacs doesn't see DEL (0x7f) as a [:cntrl:] character + 138 + + ;; emacs doesn't barf on weird ranges such as [b-a], but simply + ;; fails to match + 168 + ] + "Line numbers in the PTESTS test that should be skipped. +These are false-positive test failures that represent +known/benign differences in behavior.") + +;; - Format +;; - fields separated by ¦ (note: this is not a |) +;; - start¦end¦pattern¦string +;; - start is the 1-based index of the first character +;; - end is the 1-based index of the last character +(defun regex-tests-PTESTS () + (let (failures) + (regex-tests-generic-line + ?# "PTESTS" regex-tests-PTESTS-whitelist + (let* ((fields (split-string (buffer-string) "¦")) + + ;; string has 1-based index of first char in the + ;; match. -1 means "no match". -2 means "invalid + ;; regex". + ;; + ;; start-ref is 0-based index of first char in the + ;; match + ;; + ;; string==0 is a special case, and I have to treat + ;; it as start-ref = 0 + (start-ref (let ((raw (string-to-number (elt fields 0)))) + (cond + ((= raw -2) 'compilation-failed) + ((= raw -1) 'search-failed) + ((= raw 0) 0) + (t (1- raw))))) + + ;; string has 1-based index of last char in the + ;; match. end-ref is 0-based index of first char past + ;; the match + (end-ref (string-to-number (elt fields 1))) + (pattern (elt fields 2)) + (string (elt fields 3))) + + (let ((msg (regex-tests-match pattern string (list start-ref end-ref)))) + (when msg + (setq failures + (cons (format "line number %d: Regex '%s': %s" + line-number pattern msg) + failures)))))) + failures)) + +(defconst regex-tests-TESTS-whitelist + [ + ;; emacs doesn't barf on weird ranges such as [b-a], but simply + ;; fails to match + 42 + + ;; emacs is more forgiving with * and ? that don't apply to + ;; characters + 57 58 59 60 + + ;; emacs is more stringent with regexes involving unbalanced ) + 67 + ] + "Line numbers in the TESTS test that should be skipped. +These are false-positive test failures that represent +known/benign differences in behavior.") + +;; - Format +;; - fields separated by :. Watch for [\[:xxx:]] +;; - expected:pattern:string +;; +;; expected: +;; | 0 | successful match | +;; | 1 | failed match | +;; | 2 | regcomp() should fail | +(defun regex-tests-TESTS () + (let (failures) + (regex-tests-generic-line + ?# "TESTS" regex-tests-TESTS-whitelist + (if (save-excursion (re-search-forward "^\\([^:]+\\):\\(.*\\):\\([^:]*\\)$" nil t)) + (let* ((what-failed + (let ((raw (string-to-number (match-string 1)))) + (cond + ((= raw 2) 'compilation-failed) + ((= raw 1) 'search-failed) + (t t)))) + (string (match-string 3)) + (pattern (regex-tests-unextend (match-string 2)))) + + (let ((msg (regex-tests-match pattern string nil (list what-failed)))) + (when msg + (setq failures + (cons (format "line number %d: Regex '%s': %s" + line-number pattern msg) + failures))))) + + (error "Error parsing TESTS file line: '%s'" (buffer-string)))) + failures)) + +(ert-deftest regex-tests-BOOST () + "Tests of the regular expression engine. +This evaluates the BOOST test cases from glibc." + (should-not (regex-tests-BOOST))) + +(ert-deftest regex-tests-PCRE () + "Tests of the regular expression engine. +This evaluates the PCRE test cases from glibc." + (should-not (regex-tests-PCRE))) + +(ert-deftest regex-tests-PTESTS () + "Tests of the regular expression engine. +This evaluates the PTESTS test cases from glibc." + (should-not (regex-tests-PTESTS))) + +(ert-deftest regex-tests-TESTS () + "Tests of the regular expression engine. +This evaluates the TESTS test cases from glibc." + (should-not (regex-tests-TESTS))) + +(ert-deftest regex-repeat-limit () + "Test the #xFFFF repeat limit." + (should (string-match "\\`x\\{65535\\}" (make-string 65535 ?x))) + (should-not (string-match "\\`x\\{65535\\}" (make-string 65534 ?x))) + (should-error (string-match "\\`x\\{65536\\}" "X") :type 'invalid-regexp)) + +(ert-deftest regexp-unibyte-unibyte () + "Test matching a unibyte regexp against a unibyte string." + ;; Sanity check + (should-not (multibyte-string-p "ab")) + (should-not (multibyte-string-p "\xff")) + ;; ASCII + (should (string-match "a[b]" "ab")) + ;; Raw + (should (string-match "\xf1" "\xf1")) + (should-not (string-match "\xf1" "\xc1\xb1")) + ;; Raw, char alt + (should (string-match "[\xf1]" "\xf1")) + (should-not (string-match "[\xf1]" "\xc1\xb1")) + ;; Raw range + (should (string-match "[\x82-\xd3]" "\xbb")) + (should-not (string-match "[\x82-\xd3]" "a")) + (should-not (string-match "[\x82-\xd3]" "\x81")) + (should-not (string-match "[\x82-\xd3]" "\xd4")) + ;; ASCII-raw range + (should (string-match "[f-\xd3]" "q")) + (should (string-match "[f-\xd3]" "\xbb")) + (should-not (string-match "[f-\xd3]" "e")) + (should-not (string-match "[f-\xd3]" "\xd4"))) + +(ert-deftest regexp-multibyte-multibyte () + "Test matching a multibyte regexp against a multibyte string." + ;; Sanity check + (should (multibyte-string-p "åü")) + ;; ASCII + (should (string-match (string-to-multibyte "a[b]") + (string-to-multibyte "ab"))) + ;; Unicode + (should (string-match "å[ü]z" "åüz")) + (should-not (string-match "ü" (string-to-multibyte "\xc3\xbc"))) + ;; Raw + (should (string-match (string-to-multibyte "\xf1") + (string-to-multibyte "\xf1"))) + (should-not (string-match (string-to-multibyte "\xf1") + (string-to-multibyte "\xc1\xb1"))) + (should-not (string-match (string-to-multibyte "\xc1\xb1") + (string-to-multibyte "\xf1"))) + ;; Raw, char alt + (should (string-match (string-to-multibyte "[\xf1]") + (string-to-multibyte "\xf1"))) + ;; Raw range + (should (string-match (string-to-multibyte "[\x82-\xd3]") + (string-to-multibyte "\xbb"))) + (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "a")) + (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "Å")) + (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "ü")) + (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\x81")) + (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\xd4")) + ;; ASCII-raw range: should exclude U+0100..U+10FFFF + (should (string-match (string-to-multibyte "[f-\xd3]") + (string-to-multibyte "q"))) + (should (string-match (string-to-multibyte "[f-\xd3]") + (string-to-multibyte "\xbb"))) + (should-not (string-match (string-to-multibyte "[f-\xd3]") "e")) + (should-not (string-match (string-to-multibyte "[f-\xd3]") "Å")) + (should-not (string-match (string-to-multibyte "[f-\xd3]") "ü")) + (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4")) + ;; Unicode-raw range: should be empty + (should-not (string-match "[å-\xd3]" "å")) + (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xd3"))) + (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xbb"))) + (should-not (string-match "[å-\xd3]" "ü")) + ;; No equivalence between raw bytes and latin-1 + (should-not (string-match "å" (string-to-multibyte "\xe5"))) + (should-not (string-match "[å]" (string-to-multibyte "\xe5"))) + (should-not (string-match "\xe5" "å")) + (should-not (string-match "[\xe5]" "å"))) + +(ert-deftest regexp-unibyte-multibyte () + "Test matching a unibyte regexp against a multibyte string." + ;; ASCII + (should (string-match "a[b]" (string-to-multibyte "ab"))) + ;; Unicode + (should (string-match "a.[^b]c" (string-to-multibyte "aåüc"))) + ;; Raw + (should (string-match "\xf1" (string-to-multibyte "\xf1"))) + (should-not (string-match "\xc1\xb1" (string-to-multibyte "\xf1"))) + ;; Raw, char alt + (should (string-match "[\xf1]" (string-to-multibyte "\xf1"))) + (should-not (string-match "[\xc1][\xb1]" (string-to-multibyte "\xf1"))) + ;; ASCII-raw range: should exclude U+0100..U+10FFFF + (should (string-match "[f-\xd3]" (string-to-multibyte "q"))) + (should (string-match "[f-\xd3]" (string-to-multibyte "\xbb"))) + (should-not (string-match "[f-\xd3]" "e")) + (should-not (string-match "[f-\xd3]" "Å")) + (should-not (string-match "[f-\xd3]" "ü")) + (should-not (string-match "[f-\xd3]" "\xd4")) + ;; No equivalence between raw bytes and latin-1 + (should-not (string-match "\xe5" "å")) + (should-not (string-match "[\xe5]" "å"))) + +(ert-deftest regexp-multibyte-unibyte () + "Test matching a multibyte regexp against a unibyte string." + ;; ASCII + (should (string-match (string-to-multibyte "a[b]") "ab")) + ;; Unicode + (should (string-match "a[^ü]c" "abc")) + (should-not (string-match "ü" "\xc3\xbc")) + ;; Raw + (should (string-match (string-to-multibyte "\xf1") "\xf1")) + (should-not (string-match (string-to-multibyte "\xf1") "\xc1\xb1")) + ;; Raw, char alt + (should (string-match (string-to-multibyte "[\xf1]") "\xf1")) + (should-not (string-match (string-to-multibyte "[\xf1]") "\xc1\xb1")) + ;; ASCII-raw range: should exclude U+0100..U+10FFFF + (should (string-match (string-to-multibyte "[f-\xd3]") "q")) + (should (string-match (string-to-multibyte "[f-\xd3]") "\xbb")) + (should-not (string-match (string-to-multibyte "[f-\xd3]") "e")) + (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4")) + ;; Unicode-raw range: should be empty + (should-not (string-match "[å-\xd3]" "\xd3")) + (should-not (string-match "[å-\xd3]" "\xbb")) + ;; No equivalence between raw bytes and latin-1 + (should-not (string-match "å" "\xe5")) + (should-not (string-match "[å]" "\xe5"))) + +(ert-deftest regexp-case-fold () + "Test case-sensitive and case-insensitive matching." + (let ((case-fold-search nil)) + (should (equal (string-match "aB" "ABaB") 2)) + (should (equal (string-match "åÄ" "ÅäåäÅÄåÄ") 6)) + (should (equal (string-match "λΛ" "lΛλλΛ") 3)) + (should (equal (string-match "шШ" "zШшшШ") 3)) + (should (equal (string-match "[[:alpha:]]+" ".3aBåÄßλΛшШ中﷽") 2)) + (should (equal (match-end 0) 12)) + (should (equal (string-match "[[:alnum:]]+" ".3aBåÄßλΛшШ中﷽") 1)) + (should (equal (match-end 0) 12)) + (should (equal (string-match "[[:upper:]]+" ".3aåλшBÄΛШ中﷽") 6)) + (should (equal (match-end 0) 10)) + (should (equal (string-match "[[:lower:]]+" ".3BÄΛШaåλш中﷽") 6)) + (should (equal (match-end 0) 10))) + (let ((case-fold-search t)) + (should (equal (string-match "aB" "ABaB") 0)) + (should (equal (string-match "åÄ" "ÅäåäÅÄåÄ") 0)) + (should (equal (string-match "λΛ" "lΛλλΛ") 1)) + (should (equal (string-match "шШ" "zШшшШ") 1)) + (should (equal (string-match "[[:alpha:]]+" ".3aBåÄßλΛшШ中﷽") 2)) + (should (equal (match-end 0) 12)) + (should (equal (string-match "[[:alnum:]]+" ".3aBåÄßλΛшШ中﷽") 1)) + (should (equal (match-end 0) 12)) + (should (equal (string-match "[[:upper:]]+" ".3aåλшBÄΛШ中﷽") 2)) + (should (equal (match-end 0) 10)) + (should (equal (string-match "[[:lower:]]+" ".3BÄΛШaåλш中﷽") 2)) + (should (equal (match-end 0) 10)))) + +(ert-deftest regexp-eszett () + "Test matching of ß and ẞ." + ;; Sanity checks. + (should (equal (upcase "ß") "SS")) + (should (equal (downcase "ß") "ß")) + (should (equal (capitalize "ß") "Ss")) ; undeutsch... + (should (equal (upcase "ẞ") "ẞ")) + (should (equal (downcase "ẞ") "ß")) + (should (equal (capitalize "ẞ") "ẞ")) + ;; ß is a lower-case letter (Ll); ẞ is an upper-case letter (Lu). + (let ((case-fold-search nil)) + (should (equal (string-match "ß" "ß") 0)) + (should (equal (string-match "ß" "ẞ") nil)) + (should (equal (string-match "ẞ" "ß") nil)) + (should (equal (string-match "ẞ" "ẞ") 0)) + (should (equal (string-match "[[:alpha:]]" "ß") 0)) + ;; bug#11309 + (should (equal (string-match "[[:lower:]]" "ß") 0)) + (should (equal (string-match "[[:upper:]]" "ß") nil)) + (should (equal (string-match "[[:alpha:]]" "ẞ") 0)) + (should (equal (string-match "[[:lower:]]" "ẞ") nil)) + (should (equal (string-match "[[:upper:]]" "ẞ") 0))) + (let ((case-fold-search t)) + (should (equal (string-match "ß" "ß") 0)) + (should (equal (string-match "ß" "ẞ") 0)) + (should (equal (string-match "ẞ" "ß") 0)) + (should (equal (string-match "ẞ" "ẞ") 0)) + (should (equal (string-match "[[:alpha:]]" "ß") 0)) + ;; bug#11309 + (should (equal (string-match "[[:lower:]]" "ß") 0)) + (should (equal (string-match "[[:upper:]]" "ß") 0)) + (should (equal (string-match "[[:alpha:]]" "ẞ") 0)) + (should (equal (string-match "[[:lower:]]" "ẞ") 0)) + (should (equal (string-match "[[:upper:]]" "ẞ") 0)))) + +;;; regex-emacs-tests.el ends here |