diff options
Diffstat (limited to 'lisp/emacs-lisp/rx.el')
-rw-r--r-- | lisp/emacs-lisp/rx.el | 765 |
1 files changed, 322 insertions, 443 deletions
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index a16c5da053a..249529e54e3 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -1,4 +1,4 @@ -;;; rx.el --- sexp notation for regular expressions +;;; rx.el --- sexp notation for regular expressions -*- lexical-binding: t -*- ;; Copyright (C) 2001-2019 Free Software Foundation, Inc. @@ -47,57 +47,58 @@ ;; Rx translates a sexp notation for regular expressions into the ;; usual string notation. The translation can be done at compile-time -;; by using the `rx' macro. It can be done at run-time by calling -;; function `rx-to-string'. See the documentation of `rx' for a -;; complete description of the sexp notation. +;; by using the `rx' macro. The `regexp' and `literal' forms accept +;; non-constant expressions, in which case `rx' will translate to a +;; `concat' expression. Translation can be done fully at run time by +;; calling function `rx-to-string'. See the documentation of `rx' for +;; a complete description of the sexp notation. ;; ;; Some examples of string regexps and their sexp counterparts: ;; ;; "^[a-z]*" -;; (rx (and line-start (0+ (in "a-z")))) +;; (rx line-start (0+ (in "a-z"))) ;; ;; "\n[^ \t]" -;; (rx (and "\n" (not (any " \t")))) +;; (rx ?\n (not (in " \t"))) ;; ;; "\\*\\*\\* EOOH \\*\\*\\*\n" ;; (rx "*** EOOH ***\n") ;; ;; "\\<\\(catch\\|finally\\)\\>[^_]" -;; (rx (and word-start (submatch (or "catch" "finally")) word-end -;; (not (any ?_)))) +;; (rx word-start (submatch (or "catch" "finally")) word-end +;; (not (in ?_))) ;; -;; "[ \t\n]*:\\([^:]+\\|$\\)" -;; (rx (and (zero-or-more (in " \t\n")) ":" -;; (submatch (or line-end (one-or-more (not (any ?:))))))) +;; "[ \t\n]*:\\($\\|[^:]+\\)" +;; (rx (* (in " \t\n")) ":" +;; (submatch (or line-end (+ (not (in ?:)))))) ;; -;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*" -;; (rx (and line-start -;; "content-transfer-encoding:" -;; (+ (? ?\n)) (any " \t") -;; "quoted-printable" -;; (+ (? ?\n)) (any " \t")) +;; "^content-transfer-encoding:\\(?:\n?[\t ]\\)*quoted-printable\\(?:\n?[\t ]\\)*" +;; (rx line-start +;; "content-transfer-encoding:" +;; (* (? ?\n) (in " \t")) +;; "quoted-printable" +;; (* (? ?\n) (in " \t"))) ;; ;; (concat "^\\(?:" something-else "\\)") -;; (rx (and line-start (eval something-else))), statically or -;; (rx-to-string '(and line-start ,something-else)), dynamically. +;; (rx line-start (regexp something-else)) ;; ;; (regexp-opt '(STRING1 STRING2 ...)) ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically ;; calls `regexp-opt' as needed. ;; ;; "^;;\\s-*\n\\|^\n" -;; (rx (or (and line-start ";;" (0+ space) ?\n) -;; (and line-start ?\n))) +;; (rx (or (seq line-start ";;" (0+ space) ?\n) +;; (seq line-start ?\n))) ;; ;; "\\$[I]d: [^ ]+ \\([^ ]+\\) " -;; (rx (and "$Id: " -;; (1+ (not (in " "))) -;; " " -;; (submatch (1+ (not (in " ")))) -;; " ")) +;; (rx "$Id: " +;; (1+ (not (in " "))) +;; " " +;; (submatch (1+ (not (in " ")))) +;; " ") ;; ;; "\\\\\\\\\\[\\w+" -;; (rx (and ?\\ ?\\ ?\[ (1+ word))) +;; (rx "\\\\[" (1+ word)) ;; ;; etc. @@ -106,14 +107,17 @@ ;;; Code: +(require 'cl-lib) +(require 'cl-extra) + ;; FIXME: support macros. (defvar rx-constituents ;Not `const' because some modes extend it. - '((and . (rx-and 1 nil)) + '((and . (rx-and 0 nil)) (seq . and) ; SRE (: . and) ; SRE (sequence . and) ; sregex - (or . (rx-or 1 nil)) + (or . (rx-or 0 nil)) (| . or) ; SRE (not-newline . ".") (nonl . not-newline) ; SRE @@ -173,6 +177,7 @@ (not-syntax . (rx-not-syntax 1 1)) ; sregex (category . (rx-category 1 1 rx-check-category)) (eval . (rx-eval 1 1)) + (literal . (rx-literal 1 1 stringp)) (regexp . (rx-regexp 1 1 stringp)) (regex . regexp) ; sregex (digit . "[[:digit:]]") @@ -244,7 +249,9 @@ regular expressions.") (defconst rx-categories - '((consonant . ?0) + '((space-for-indent . ?\s) + (base . ?.) + (consonant . ?0) (base-vowel . ?1) (upper-diacritical-mark . ?2) (lower-diacritical-mark . ?3) @@ -263,7 +270,9 @@ regular expressions.") (japanese-hiragana-two-byte . ?H) (indian-two-byte . ?I) (japanese-katakana-two-byte . ?K) + (strong-left-to-right . ?L) (korean-hangul-two-byte . ?N) + (strong-right-to-left . ?R) (cyrillic-two-byte . ?Y) (combining-diacritic . ?^) (ascii . ?a) @@ -295,6 +304,10 @@ regular expression strings.") "Non-nil means produce greedy regular expressions for `zero-or-one', `zero-or-more', and `one-or-more'. Dynamically bound.") +(defvar rx--compile-to-lisp nil + "Nil means return a regexp as a string. +Non-nil means we may return a lisp form which produces a +string (used for `rx' macro).") (defun rx-info (op head) "Return parsing/code generation info for OP. @@ -337,7 +350,7 @@ a standalone symbol." (> nargs max-args)) (error "rx form `%s' accepts at most %d args" (car form) max-args)) - (when (not (null type-pred)) + (when type-pred (dolist (sub-form (cdr form)) (unless (funcall type-pred sub-form) (error "rx form `%s' requires args satisfying `%s'" @@ -353,8 +366,9 @@ is non-nil." ;; for concatenation ((eq group ':) (if (rx-atomic-p - (if (string-match - "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp) + (if (and (stringp regexp) + (string-match + "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp)) (substring regexp 0 (match-beginning 0)) regexp)) (setq group nil))) @@ -363,9 +377,10 @@ is non-nil." ;; do anyway ((eq group t)) ((rx-atomic-p regexp t) (setq group nil))) - (if group - (concat "\\(?:" regexp "\\)") - regexp)) + (cond ((and group (stringp regexp)) + (concat "\\(?:" regexp "\\)")) + (group `("\\(?:" ,@regexp "\\)")) + (t regexp))) (defvar rx-parent) @@ -377,7 +392,7 @@ is non-nil." FORM is of the form `(and FORM1 ...)'." (rx-check form) (rx-group-if - (mapconcat (lambda (x) (rx-form x ':)) (cdr form) nil) + (rx--subforms (cdr form) ':) (and (memq rx-parent '(* t)) rx-parent))) @@ -385,9 +400,11 @@ FORM is of the form `(and FORM1 ...)'." "Parse and produce code from FORM, which is `(or FORM1 ...)'." (rx-check form) (rx-group-if - (if (memq nil (mapcar 'stringp (cdr form))) - (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|") - (regexp-opt (cdr form))) + (cond + ((null (cdr form)) regexp-unmatchable) + ((cl-every #'stringp (cdr form)) + (regexp-opt (cdr form) nil t)) + (t (rx--subforms (cdr form) '| "\\|"))) (and (memq rx-parent '(: * t)) rx-parent))) @@ -423,6 +440,13 @@ Only both edges of each range is checked." ;; set L list of all ranges (mapc (lambda (e) (cond ((stringp e) (push e str)) ((numberp e) (push (cons e e) l)) + ;; Ranges between ASCII and raw bytes are split, + ;; to prevent accidental inclusion of Unicode + ;; characters later on. + ((and (<= (car e) #x7f) + (>= (cdr e) #x3fff80)) + (push (cons (car e) #x7f) l) + (push (cons #x3fff80 (cdr e)) l)) (t (push e l)))) args) ;; condense overlapped ranges in L @@ -447,28 +471,38 @@ Only both edges of each range is checked." (defun rx-check-any-string (str) - "Check string argument STR for Rx `any'." - (let ((i 0) - c1 c2 l) - (if (= 0 (length str)) - (error "String arg for Rx `any' must not be empty")) - (while (string-match ".-." str i) - ;; string before range: convert it to characters - (if (< i (match-beginning 0)) - (setq l (nconc - l - (append (substring str i (match-beginning 0)) nil)))) - ;; range - (setq i (match-end 0) - c1 (aref str (match-beginning 0)) - c2 (aref str (1- i))) - (cond - ((< c1 c2) (setq l (nconc l (list (cons c1 c2))))) - ((= c1 c2) (setq l (nconc l (list c1)))))) - ;; rest? - (if (< i (length str)) - (setq l (nconc l (append (substring str i) nil)))) - l)) + "Turn the `any' argument string STR into a list of characters. +The original order is not preserved. Ranges, \"A-Z\", become pairs, (?A . ?Z)." + (let ((decode-char + ;; Make sure raw bytes are decoded as such, to avoid confusion with + ;; U+0080..U+00FF. + (if (multibyte-string-p str) + #'identity + (lambda (c) (if (<= #x80 c #xff) + (+ c #x3fff00) + c)))) + (len (length str)) + (i 0) + (ret nil)) + (if (= 0 len) + (error "String arg for Rx `any' must not be empty")) + (while (< i len) + (cond ((and (< i (- len 2)) + (= (aref str (+ i 1)) ?-)) + ;; Range. + (let ((start (funcall decode-char (aref str i))) + (end (funcall decode-char (aref str (+ i 2))))) + (cond ((< start end) (push (cons start end) ret)) + ((= start end) (push start ret)) + (t + (error "Rx character range `%c-%c' is reversed" + start end))) + (setq i (+ i 3)))) + (t + ;; Single character. + (push (funcall decode-char (aref str i)) ret) + (setq i (+ i 1))))) + ret)) (defun rx-check-any (arg) @@ -483,7 +517,10 @@ Only both edges of each range is checked." (null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation))) (error "Invalid char class `%s' in Rx `any'" arg)) (list (substring translation 1 -1)))) ; strip outer brackets - ((and (integerp (car-safe arg)) (integerp (cdr-safe arg))) + ((and (characterp (car-safe arg)) (characterp (cdr-safe arg))) + (unless (<= (car arg) (cdr arg)) + (error "Rx character range `%c-%c' is reversed" + (car arg) (cdr arg))) (list arg)) ((stringp arg) (rx-check-any-string arg)) ((error @@ -589,7 +626,7 @@ ARG is optional." (rx-check form) (let ((result (rx-form (cadr form) '!)) case-fold-search) - (cond ((string-match "\\`\\[^" result) + (cond ((string-match "\\`\\[\\^" result) (cond ((equal result "[^]") "[^^]") ((and (= (length result) 4) (null (eq rx-parent '!))) @@ -640,7 +677,10 @@ If SKIP is non-nil, allow that number of items after the head, i.e. (unless (and (integerp (nth 1 form)) (> (nth 1 form) 0)) (error "rx `=' requires positive integer first arg")) - (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form))) + (let ((subform (rx-form (nth 2 form) '*))) + (if (stringp subform) + (format "%s\\{%d\\}" subform (nth 1 form)) + `(,@subform ,(format "\\{%d\\}" (nth 1 form)))))) (defun rx->= (form) @@ -650,7 +690,10 @@ If SKIP is non-nil, allow that number of items after the head, i.e. (unless (and (integerp (nth 1 form)) (> (nth 1 form) 0)) (error "rx `>=' requires positive integer first arg")) - (format "%s\\{%d,\\}" (rx-form (nth 2 form) '*) (nth 1 form))) + (let ((subform (rx-form (nth 2 form) '*))) + (if (stringp subform) + (format "%s\\{%d,\\}" subform (nth 1 form)) + `(,@subform ,(format "\\{%d,\\}" (nth 1 form)))))) (defun rx-** (form) @@ -671,7 +714,10 @@ FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'." (unless (and (integerp (nth 1 form)) (> (nth 1 form) 0)) (error "rx `repeat' requires positive integer first arg")) - (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form))) + (let ((subform (rx-form (nth 2 form) '*))) + (if (stringp subform) + (format "%s\\{%d\\}" subform (nth 1 form)) + `(,@subform ,(format "\\{%d\\}" (nth 1 form)))))) ((or (not (integerp (nth 2 form))) (< (nth 2 form) 0) (not (integerp (nth 1 form))) @@ -679,30 +725,28 @@ FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'." (< (nth 2 form) (nth 1 form))) (error "rx `repeat' range error")) (t - (format "%s\\{%d,%d\\}" (rx-form (nth 3 form) '*) - (nth 1 form) (nth 2 form))))) + (let ((subform (rx-form (nth 3 form) '*))) + (if (stringp subform) + (format "%s\\{%d,%d\\}" subform (nth 1 form) (nth 2 form)) + `(,@subform ,(format "\\{%d,%d\\}" (nth 1 form) (nth 2 form)))))))) (defun rx-submatch (form) "Parse and produce code from FORM, which is `(submatch ...)'." - (concat "\\(" - (if (= 2 (length form)) - ;; Only one sub-form. - (rx-form (cadr form)) - ;; Several sub-forms implicitly concatenated. - (mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil)) - "\\)")) + (let ((subforms (rx--subforms (cdr form) ':))) + (if (stringp subforms) + (concat "\\(" subforms "\\)") + `("\\(" ,@subforms "\\)")))) (defun rx-submatch-n (form) "Parse and produce code from FORM, which is `(submatch-n N ...)'." - (let ((n (nth 1 form))) - (concat "\\(?" (number-to-string n) ":" - (if (= 3 (length form)) - ;; Only one sub-form. - (rx-form (nth 2 form)) - ;; Several sub-forms implicitly concatenated. - (mapconcat (lambda (re) (rx-form re ':)) (cddr form) nil)) - "\\)"))) + (let ((n (nth 1 form)) + (subforms (rx--subforms (cddr form) ':))) + (unless (and (integerp n) (> n 0)) + (error "rx `submatch-n' argument must be positive")) + (if (stringp subforms) + (concat "\\(?" (number-to-string n) ":" subforms "\\)") + `("\\(?" ,(number-to-string n) ":" ,@subforms "\\)")))) (defun rx-backref (form) "Parse and produce code from FORM, which is `(backref N)'." @@ -724,15 +768,18 @@ If OP is anything else, produce a greedy regexp if `rx-greedy-flag' is non-nil." (rx-check form) (setq form (rx-trans-forms form)) - (let ((suffix (cond ((memq (car form) '(* + ?\s)) "") - ((memq (car form) '(*? +? ??)) "?") + (let ((suffix (cond ((memq (car form) '(* + \? ?\s)) "") + ((memq (car form) '(*? +? \?? ??)) "?") (rx-greedy-flag "") (t "?"))) (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*") ((memq (car form) '(+ +? 1+ one-or-more)) "+") - (t "?")))) + (t "?"))) + (subform (rx-form (cadr form) '*))) (rx-group-if - (concat (rx-form (cadr form) '*) op suffix) + (if (stringp subform) + (concat subform op suffix) + `(,@subform ,(concat op suffix))) (and (memq rx-parent '(t *)) rx-parent)))) @@ -760,15 +807,18 @@ regexps that are atomic but end in operators, such as be detected without much effort. A guarantee of no false negatives would require a theoretic specification of the set of all atomic regexps." - (let ((l (length r))) - (cond - ((<= l 1)) - ((= l 2) (= (aref r 0) ?\\)) - ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r)) - ((null lax) + (if (and rx--compile-to-lisp + (not (stringp r))) + nil ;; Runtime value, we must assume non-atomic. + (let ((l (length r))) (cond - ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*\\]\\'" r)) - ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r))))))) + ((<= l 1)) + ((= l 2) (= (aref r 0) ?\\)) + ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r)) + ((null lax) + (cond + ((string-match "\\`\\[\\^?]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*]\\'" r)) + ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r)))))))) (defun rx-syntax (form) @@ -824,360 +874,197 @@ If FORM is `(minimal-match FORM1)', non-greedy versions of `*', (defun rx-regexp (form) "Parse and produce code from FORM, which is `(regexp STRING)'." - (rx-check form) - (rx-group-if (cadr form) rx-parent)) - - -(defun rx-form (form &optional rx-parent) + (cond ((stringp (cadr form)) + (rx-group-if (cadr form) rx-parent)) + (rx--compile-to-lisp + ;; Always group non-string forms, since we can't be sure they + ;; are atomic. + (rx-group-if (cdr form) t)) + (t (rx-check form)))) + +(defun rx-literal (form) + "Parse and produce code from FORM, which is `(literal STRING-EXP)'." + (cond ((stringp (cadr form)) + ;; This is allowed, but makes little sense, you could just + ;; use STRING directly. + (rx-group-if (regexp-quote (cadr form)) rx-parent)) + (rx--compile-to-lisp + (rx-group-if `((regexp-quote ,(cadr form))) rx-parent)) + (t (rx-check form)))) + +(defun rx-form (form &optional parent) "Parse and produce code for regular expression FORM. FORM is a regular expression in sexp form. -RX-PARENT shows which type of expression calls and controls putting of +PARENT shows which type of expression calls and controls putting of shy groups around the result and some more in other functions." - (cond - ((stringp form) - (rx-group-if (regexp-quote form) - (if (and (eq rx-parent '*) (< 1 (length form))) - rx-parent))) - ((integerp form) - (regexp-quote (char-to-string form))) - ((symbolp form) - (let ((info (rx-info form nil))) - (cond ((stringp info) - info) - ((null info) - (error "Unknown rx form `%s'" form)) - (t - (funcall (nth 0 info) form))))) - ((consp form) - (let ((info (rx-info (car form) 'head))) - (unless (consp info) - (error "Unknown rx form `%s'" (car form))) - (funcall (nth 0 info) form))) - (t - (error "rx syntax error at `%s'" form)))) + (let ((rx-parent parent)) + (cond + ((stringp form) + (rx-group-if (regexp-quote form) + (if (and (eq parent '*) (< 1 (length form))) + parent))) + ((integerp form) + (regexp-quote (char-to-string form))) + ((symbolp form) + (let ((info (rx-info form nil))) + (cond ((stringp info) + info) + ((null info) + (error "Unknown rx form `%s'" form)) + (t + (funcall (nth 0 info) form))))) + ((consp form) + (let ((info (rx-info (car form) 'head))) + (unless (consp info) + (error "Unknown rx form `%s'" (car form))) + (funcall (nth 0 info) form))) + (t + (error "rx syntax error at `%s'" form))))) + +(defun rx--subforms (subforms &optional parent separator) + "Produce code for regular expressions SUBFORMS. +SUBFORMS is a list of regular expression sexps. +PARENT controls grouping, as in `rx-form'. +Insert SEPARATOR between the code from each of SUBFORMS." + (if (null (cdr subforms)) + ;; Zero or one forms, no need for grouping. + (and subforms (rx-form (car subforms))) + (let ((listify (lambda (x) + (if (listp x) (copy-sequence x) + (list x))))) + (setq subforms (mapcar (lambda (x) (rx-form x parent)) subforms)) + (cond ((or (not rx--compile-to-lisp) + (cl-every #'stringp subforms)) + (mapconcat #'identity subforms separator)) + (separator + (nconc (funcall listify (car subforms)) + (mapcan (lambda (x) + (cons separator (funcall listify x))) + (cdr subforms)))) + (t (mapcan listify subforms)))))) ;;;###autoload (defun rx-to-string (form &optional no-group) "Parse and produce code for regular expression FORM. FORM is a regular expression in sexp form. -NO-GROUP non-nil means don't put shy groups around the result." +NO-GROUP non-nil means don't put shy groups around the result. + +In contrast to the `rx' macro, subforms `literal' and `regexp' +will not accept non-string arguments, i.e., (literal STRING) +becomes just a more verbose version of STRING." (rx-group-if (rx-form form) (null no-group))) ;;;###autoload (defmacro rx (&rest regexps) "Translate regular expressions REGEXPS in sexp form to a regexp string. -REGEXPS is a non-empty sequence of forms of the sort listed below. - -Note that `rx' is a Lisp macro; when used in a Lisp program being -compiled, the translation is performed by the compiler. -See `rx-to-string' for how to do such a translation at run-time. - -The following are valid subforms of regular expressions in sexp -notation. - -STRING - matches string STRING literally. - -CHAR - matches character CHAR literally. - -`not-newline', `nonl' - matches any character except a newline. - -`anything' - matches any character - -`(any SET ...)' -`(in SET ...)' -`(char SET ...)' - matches any character in SET .... SET may be a character or string. - Ranges of characters can be specified as `A-Z' in strings. - Ranges may also be specified as conses like `(?A . ?Z)'. - - SET may also be the name of a character class: `digit', - `control', `hex-digit', `blank', `graph', `print', `alnum', - `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper', - `word', or one of their synonyms. - -`(not (any SET ...))' - matches any character not in SET ... - -`line-start', `bol' - matches the empty string, but only at the beginning of a line - in the text being matched - -`line-end', `eol' - is similar to `line-start' but matches only at the end of a line - -`string-start', `bos', `bot' - matches the empty string, but only at the beginning of the - string being matched against. - -`string-end', `eos', `eot' - matches the empty string, but only at the end of the - string being matched against. - -`buffer-start' - matches the empty string, but only at the beginning of the - buffer being matched against. Actually equivalent to `string-start'. - -`buffer-end' - matches the empty string, but only at the end of the - buffer being matched against. Actually equivalent to `string-end'. - -`point' - matches the empty string, but only at point. - -`word-start', `bow' - matches the empty string, but only at the beginning of a word. - -`word-end', `eow' - matches the empty string, but only at the end of a word. - -`word-boundary' - matches the empty string, but only at the beginning or end of a - word. - -`(not word-boundary)' -`not-word-boundary' - matches the empty string, but not at the beginning or end of a - word. - -`symbol-start' - matches the empty string, but only at the beginning of a symbol. - -`symbol-end' - matches the empty string, but only at the end of a symbol. - -`digit', `numeric', `num' - matches 0 through 9. - -`control', `cntrl' - matches ASCII control characters. - -`hex-digit', `hex', `xdigit' - matches 0 through 9, a through f and A through F. - -`blank' - matches horizontal whitespace, as defined by Annex C of the - Unicode Technical Standard #18. In particular, it matches - spaces, tabs, and other characters whose Unicode - `general-category' property indicates they are spacing - separators. - -`graphic', `graph' - matches graphic characters--everything except whitespace, ASCII - and non-ASCII control characters, surrogates, and codepoints - unassigned by Unicode. - -`printing', `print' - matches whitespace and graphic characters. - -`alphanumeric', `alnum' - matches alphabetic characters and digits. For multibyte characters, - it matches characters whose Unicode `general-category' property - indicates they are alphabetic or decimal number characters. - -`letter', `alphabetic', `alpha' - matches alphabetic characters. For multibyte characters, - it matches characters whose Unicode `general-category' property - indicates they are alphabetic characters. - -`ascii' - matches ASCII (unibyte) characters. - -`nonascii' - matches non-ASCII (multibyte) characters. - -`lower', `lower-case' - matches anything lower-case, as determined by the current case - table. If `case-fold-search' is non-nil, this also matches any - upper-case letter. - -`upper', `upper-case' - matches anything upper-case, as determined by the current case - table. If `case-fold-search' is non-nil, this also matches any - lower-case letter. - -`punctuation', `punct' - matches punctuation. (But at present, for multibyte characters, - it matches anything that has non-word syntax.) - -`space', `whitespace', `white' - matches anything that has whitespace syntax. - -`word', `wordchar' - matches anything that has word syntax. - -`not-wordchar' - matches anything that has non-word syntax. - -`(syntax SYNTAX)' - matches a character with syntax SYNTAX. SYNTAX must be one - of the following symbols, or a symbol corresponding to the syntax - character, e.g. `\\.' for `\\s.'. - - `whitespace' (\\s- in string notation) - `punctuation' (\\s.) - `word' (\\sw) - `symbol' (\\s_) - `open-parenthesis' (\\s() - `close-parenthesis' (\\s)) - `expression-prefix' (\\s') - `string-quote' (\\s\") - `paired-delimiter' (\\s$) - `escape' (\\s\\) - `character-quote' (\\s/) - `comment-start' (\\s<) - `comment-end' (\\s>) - `string-delimiter' (\\s|) - `comment-delimiter' (\\s!) - -`(not (syntax SYNTAX))' - matches a character that doesn't have syntax SYNTAX. - -`(category CATEGORY)' - matches a character with category CATEGORY. CATEGORY must be - either a character to use for C, or one of the following symbols. - - `consonant' (\\c0 in string notation) - `base-vowel' (\\c1) - `upper-diacritical-mark' (\\c2) - `lower-diacritical-mark' (\\c3) - `tone-mark' (\\c4) - `symbol' (\\c5) - `digit' (\\c6) - `vowel-modifying-diacritical-mark' (\\c7) - `vowel-sign' (\\c8) - `semivowel-lower' (\\c9) - `not-at-end-of-line' (\\c<) - `not-at-beginning-of-line' (\\c>) - `alpha-numeric-two-byte' (\\cA) - `chinese-two-byte' (\\cC) - `greek-two-byte' (\\cG) - `japanese-hiragana-two-byte' (\\cH) - `indian-two-byte' (\\cI) - `japanese-katakana-two-byte' (\\cK) - `korean-hangul-two-byte' (\\cN) - `cyrillic-two-byte' (\\cY) - `combining-diacritic' (\\c^) - `ascii' (\\ca) - `arabic' (\\cb) - `chinese' (\\cc) - `ethiopic' (\\ce) - `greek' (\\cg) - `korean' (\\ch) - `indian' (\\ci) - `japanese' (\\cj) - `japanese-katakana' (\\ck) - `latin' (\\cl) - `lao' (\\co) - `tibetan' (\\cq) - `japanese-roman' (\\cr) - `thai' (\\ct) - `vietnamese' (\\cv) - `hebrew' (\\cw) - `cyrillic' (\\cy) - `can-break' (\\c|) - -`(not (category CATEGORY))' - matches a character that doesn't have category CATEGORY. - -`(and SEXP1 SEXP2 ...)' -`(: SEXP1 SEXP2 ...)' -`(seq SEXP1 SEXP2 ...)' -`(sequence SEXP1 SEXP2 ...)' - matches what SEXP1 matches, followed by what SEXP2 matches, etc. - -`(submatch SEXP1 SEXP2 ...)' -`(group SEXP1 SEXP2 ...)' - like `and', but makes the match accessible with `match-end', - `match-beginning', and `match-string'. - -`(submatch-n N SEXP1 SEXP2 ...)' -`(group-n N SEXP1 SEXP2 ...)' - like `group', but make it an explicitly-numbered group with - group number N. - -`(or SEXP1 SEXP2 ...)' -`(| SEXP1 SEXP2 ...)' - matches anything that matches SEXP1 or SEXP2, etc. If all - args are strings, use `regexp-opt' to optimize the resulting - regular expression. - -`(minimal-match SEXP)' - produce a non-greedy regexp for SEXP. Normally, regexps matching - zero or more occurrences of something are \"greedy\" in that they - match as much as they can, as long as the overall regexp can - still match. A non-greedy regexp matches as little as possible. - -`(maximal-match SEXP)' - produce a greedy regexp for SEXP. This is the default. - -Below, `SEXP ...' represents a sequence of regexp forms, treated as if -enclosed in `(and ...)'. - -`(zero-or-more SEXP ...)' -`(0+ SEXP ...)' - matches zero or more occurrences of what SEXP ... matches. - -`(* SEXP ...)' - like `zero-or-more', but always produces a greedy regexp, independent - of `rx-greedy-flag'. - -`(*? SEXP ...)' - like `zero-or-more', but always produces a non-greedy regexp, - independent of `rx-greedy-flag'. - -`(one-or-more SEXP ...)' -`(1+ SEXP ...)' - matches one or more occurrences of SEXP ... - -`(+ SEXP ...)' - like `one-or-more', but always produces a greedy regexp. - -`(+? SEXP ...)' - like `one-or-more', but always produces a non-greedy regexp. - -`(zero-or-one SEXP ...)' -`(optional SEXP ...)' -`(opt SEXP ...)' - matches zero or one occurrences of A. - -`(? SEXP ...)' - like `zero-or-one', but always produces a greedy regexp. - -`(?? SEXP ...)' - like `zero-or-one', but always produces a non-greedy regexp. - -`(repeat N SEXP)' -`(= N SEXP ...)' - matches N occurrences. - -`(>= N SEXP ...)' - matches N or more occurrences. - -`(repeat N M SEXP)' -`(** N M SEXP ...)' - matches N to M occurrences. - -`(backref N)' - matches what was matched previously by submatch N. - -`(eval FORM)' - evaluate FORM and insert result. If result is a string, - `regexp-quote' it. - -`(regexp REGEXP)' - include REGEXP in string notation in the result." - (cond ((null regexps) - (error "No regexp")) - ((cdr regexps) - (rx-to-string `(and ,@regexps) t)) - (t - (rx-to-string (car regexps) t)))) +Each argument is one of the forms below; RX is a subform, and RX... stands +for one or more RXs. For details, see Info node `(elisp) Rx Notation'. +See `rx-to-string' for the corresponding function. + +STRING Match a literal string. +CHAR Match a literal character. + +(seq RX...) Match the RXs in sequence. Alias: :, sequence, and. +(or RX...) Match one of the RXs. Alias: |. + +(zero-or-more RX...) Match RXs zero or more times. Alias: 0+. +(one-or-more RX...) Match RXs one or more times. Alias: 1+. +(zero-or-one RX...) Match RXs or the empty string. Alias: opt, optional. +(* RX...) Match RXs zero or more times; greedy. +(+ RX...) Match RXs one or more times; greedy. +(? RX...) Match RXs or the empty string; greedy. +(*? RX...) Match RXs zero or more times; non-greedy. +(+? RX...) Match RXs one or more times; non-greedy. +(?? RX...) Match RXs or the empty string; non-greedy. +(= N RX...) Match RXs exactly N times. +(>= N RX...) Match RXs N or more times. +(** N M RX...) Match RXs N to M times. Alias: repeat. +(minimal-match RX) Match RX, with zero-or-more, one-or-more, zero-or-one + and aliases using non-greedy matching. +(maximal-match RX) Match RX, with zero-or-more, one-or-more, zero-or-one + and aliases using greedy matching, which is the default. + +(any SET...) Match a character from one of the SETs. Each SET is a + character, a string, a range as string \"A-Z\" or cons + (?A . ?Z), or a character class (see below). Alias: in, char. +(not CHARSPEC) Match one character not matched by CHARSPEC. CHARSPEC + can be (any ...), (syntax ...), (category ...), + or a character class. +not-newline Match any character except a newline. Alias: nonl. +anything Match any character. + +CHARCLASS Match a character from a character class. One of: + alpha, alphabetic, letter Alphabetic characters (defined by Unicode). + alnum, alphanumeric Alphabetic or decimal digit chars (Unicode). + digit numeric, num 0-9. + xdigit, hex-digit, hex 0-9, A-F, a-f. + cntrl, control ASCII codes 0-31. + blank Horizontal whitespace (Unicode). + space, whitespace, white Chars with whitespace syntax. + lower, lower-case Lower-case chars, from current case table. + upper, upper-case Upper-case chars, from current case table. + graph, graphic Graphic characters (Unicode). + print, printing Whitespace or graphic (Unicode). + punct, punctuation Not control, space, letter or digit (ASCII); + not word syntax (non-ASCII). + word, wordchar Characters with word syntax. + ascii ASCII characters (codes 0-127). + nonascii Non-ASCII characters (but not raw bytes). + +(syntax SYNTAX) Match a character with syntax SYNTAX, being one of: + whitespace, punctuation, word, symbol, open-parenthesis, + close-parenthesis, expression-prefix, string-quote, + paired-delimiter, escape, character-quote, comment-start, + comment-end, string-delimiter, comment-delimiter + +(category CAT) Match a character in category CAT, being one of: + space-for-indent, base, consonant, base-vowel, + upper-diacritical-mark, lower-diacritical-mark, tone-mark, symbol, + digit, vowel-modifying-diacritical-mark, vowel-sign, + semivowel-lower, not-at-end-of-line, not-at-beginning-of-line, + alpha-numeric-two-byte, chinese-two-byte, greek-two-byte, + japanese-hiragana-two-byte, indian-two-byte, + japanese-katakana-two-byte, strong-left-to-right, + korean-hangul-two-byte, strong-right-to-left, cyrillic-two-byte, + combining-diacritic, ascii, arabic, chinese, ethiopic, greek, + korean, indian, japanese, japanese-katakana, latin, lao, + tibetan, japanese-roman, thai, vietnamese, hebrew, cyrillic, + can-break + +Zero-width assertions: these all match the empty string in specific places. + line-start At the beginning of a line. Alias: bol. + line-end At the end of a line. Alias: eol. + string-start At the start of the string or buffer. + Alias: buffer-start, bos, bot. + string-end At the end of the string or buffer. + Alias: buffer-end, eos, eot. + point At point. + word-start At the beginning of a word. + word-end At the end of a word. + word-boundary At the beginning or end of a word. + not-word-boundary Not at the beginning or end of a word. + symbol-start At the beginning of a symbol. + symbol-end At the end of a symbol. + +(group RX...) Match RXs and define a capture group. Alias: submatch. +(group-n N RX...) Match RXs and define capture group N. Alias: submatch-n. +(backref N) Match the text that capture group N matched. + +(literal EXPR) Match the literal string from evaluating EXPR at run time. +(regexp EXPR) Match the string regexp from evaluating EXPR at run time. +(eval EXPR) Match the rx sexp from evaluating EXPR at compile time." + (let* ((rx--compile-to-lisp t) + (re (cond ((null regexps) + (error "No regexp")) + ((cdr regexps) + (rx-to-string `(and ,@regexps) t)) + (t + (rx-to-string (car regexps) t))))) + (if (stringp re) + re + `(concat ,@re)))) (pcase-defmacro rx (&rest regexps) @@ -1239,14 +1126,6 @@ string as argument to `match-string'." for var in vars collect `(app (match-string ,i) ,var))))) -;; ;; sregex.el replacement - -;; ;;;###autoload (provide 'sregex) -;; ;;;###autoload (autoload 'sregex "rx") -;; (defalias 'sregex 'rx-to-string) -;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro) -;; (defalias 'sregexq 'rx) - (provide 'rx) ;;; rx.el ends here |