diff options
-rw-r--r-- | doc/lispref/searching.texi | 6 | ||||
-rw-r--r-- | etc/NEWS | 6 | ||||
-rw-r--r-- | src/character.c | 17 | ||||
-rw-r--r-- | src/character.h | 1 | ||||
-rw-r--r-- | src/regex.c | 12 | ||||
-rw-r--r-- | test/lisp/subr-tests.el | 10 | ||||
-rw-r--r-- | test/src/regex-tests.el | 2 |
7 files changed, 48 insertions, 6 deletions
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index b011d14ee35..67d4c224647 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi @@ -553,7 +553,11 @@ characters whose Unicode @samp{general-category} property (@pxref{Character Properties}) indicates they are alphabetic characters. @item [:blank:] -This matches space and tab only. +This matches horizontal whitespace, as defined by Annex C of the +Unicode Technical Standard #18. In particular, it matches spaces, +tabs, and other characters whose Unicode @samp{general-category} +property (@pxref{Character Properties}) indicates they are spacing +separators. @item [:cntrl:] This matches any @acronym{ASCII} control character. @item [:digit:] @@ -710,6 +710,12 @@ of curved quotes in format arguments to functions like 'message' and now generate less chatter and more-compact diagnostics. The auxiliary function 'check-declare-errmsg' has been removed. ++++ +** The regular expression character class [:blank:] now matches +Unicode horizontal whitespace as defined in the Unicode Technical +Standard #18. If you only want to match space and tab, use [ \t] +instead. + * Lisp Changes in Emacs 26.1 diff --git a/src/character.c b/src/character.c index b594af040c1..bc99daf0df0 100644 --- a/src/character.c +++ b/src/character.c @@ -1038,6 +1038,23 @@ printablep (int c) || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */ } +/* Return true if C is a horizontal whitespace character, as defined + by http://www.unicode.org/reports/tr18/tr18-19.html#blank. */ +bool +blankp (int c) +{ + /* Fast path for ASCII characters that are always assumed to + constitute horizontal whitespace. */ + if (c == ' ' || c == '\t') + return true; + + Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c); + if (! INTEGERP (category)) + return false; + + return XINT (category) == UNICODE_CATEGORY_Zs; /* separator, space */ +} + void syms_of_character (void) { diff --git a/src/character.h b/src/character.h index fc8a0dd74d2..62d252e91ba 100644 --- a/src/character.h +++ b/src/character.h @@ -680,6 +680,7 @@ extern bool alphabeticp (int); extern bool alphanumericp (int); extern bool graphicp (int); extern bool printablep (int); +extern bool blankp (int); /* Return a translation table of id number ID. */ #define GET_TRANSLATION_TABLE(id) \ diff --git a/src/regex.c b/src/regex.c index ae3fde80c9e..7e70c494f47 100644 --- a/src/regex.c +++ b/src/regex.c @@ -310,11 +310,12 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; || ((c) >= 'a' && (c) <= 'f') \ || ((c) >= 'A' && (c) <= 'F')) -/* This is only used for single-byte characters. */ -# define ISBLANK(c) ((c) == ' ' || (c) == '\t') - /* The rest must handle multibyte characters. */ +# define ISBLANK(c) (IS_REAL_ASCII (c) \ + ? ((c) == ' ' || (c) == '\t') \ + : blankp (c)) + # define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \ ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240) \ : graphicp (c)) @@ -1790,6 +1791,7 @@ struct range_table_work_area #define BIT_ALNUM 0x80 #define BIT_GRAPH 0x100 #define BIT_PRINT 0x200 +#define BIT_BLANK 0x400 /* Set the bit for character C in a list. */ @@ -2066,8 +2068,9 @@ re_wctype_to_bit (re_wctype_t cc) case RECC_SPACE: return BIT_SPACE; case RECC_GRAPH: return BIT_GRAPH; case RECC_PRINT: return BIT_PRINT; + case RECC_BLANK: return BIT_BLANK; case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL: - case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0; + case RECC_UNIBYTE: case RECC_ERROR: return 0; default: abort (); } @@ -4658,6 +4661,7 @@ execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte) (class_bits & BIT_ALNUM && ISALNUM (c)) || (class_bits & BIT_ALPHA && ISALPHA (c)) || (class_bits & BIT_SPACE && ISSPACE (c)) || + (class_bits & BIT_BLANK && ISBLANK (c)) || (class_bits & BIT_WORD && ISWORD (c)) || ((class_bits & BIT_UPPER) && (ISUPPER (c) || (corig != c && diff --git a/test/lisp/subr-tests.el b/test/lisp/subr-tests.el index 3c5dbcdbd76..a3b08e96971 100644 --- a/test/lisp/subr-tests.el +++ b/test/lisp/subr-tests.el @@ -271,5 +271,15 @@ indirectly `mapbacktrace'." (let ((frame-lists (subr-test--frames-1 'subr-test--frames-2))) (should (equal (car frame-lists) (cdr frame-lists))))) +(ert-deftest subr-tests--string-match-p--blank () + "Test that [:blank:] matches horizontal whitespace, cf. Bug#25366." + (should (equal (string-match-p "\\`[[:blank:]]\\'" " ") 0)) + (should (equal (string-match-p "\\`[[:blank:]]\\'" "\t") 0)) + (should-not (string-match-p "\\`[[:blank:]]\\'" "\n")) + (should-not (string-match-p "\\`[[:blank:]]\\'" "a")) + (should (equal (string-match-p "\\`[[:blank:]]\\'" "\N{HAIR SPACE}") 0)) + (should (equal (string-match-p "\\`[[:blank:]]\\'" "\u3000") 0)) + (should-not (string-match-p "\\`[[:blank:]]\\'" "\N{LINE SEPARATOR}"))) + (provide 'subr-tests) ;;; subr-tests.el ends here diff --git a/test/src/regex-tests.el b/test/src/regex-tests.el index 74c27111cfe..db187fd4a6a 100644 --- a/test/src/regex-tests.el +++ b/test/src/regex-tests.el @@ -80,7 +80,7 @@ character) must match a string \"\u2420\"." ("print" "abcłąka\u2620-, " "\t\n\1") ("space" " \t\n\u2001" "abcABCł0123") - ("blank" " \t" "\n\u2001") + ("blank" " \t\u2001" "\n") ("ascii" "abcABC012 \t\n\1" "łą\u2620") ("nonascii" "łą\u2622" "abcABC012 \t\n\1") |