summary refs log tree commit diff
path: root/lisp/emacs-lisp/rx.el
diff options
context:
space:
mode:
Diffstat (limited to 'lisp/emacs-lisp/rx.el')
-rw-r--r-- lisp/emacs-lisp/rx.el 765
1 files changed, 322 insertions, 443 deletions
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index a16c5da053a..249529e54e3 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -1,4 +1,4 @@
-;;; rx.el --- sexp notation for regular expressions
+;;; rx.el --- sexp notation for regular expressions -*- lexical-binding: t -*-
;; Copyright (C) 2001-2019 Free Software Foundation, Inc.
@@ -47,57 +47,58 @@
;; Rx translates a sexp notation for regular expressions into the
;; usual string notation. The translation can be done at compile-time
-;; by using the `rx' macro. It can be done at run-time by calling
-;; function `rx-to-string'. See the documentation of `rx' for a
-;; complete description of the sexp notation.
+;; by using the `rx' macro. The `regexp' and `literal' forms accept
+;; non-constant expressions, in which case `rx' will translate to a
+;; `concat' expression. Translation can be done fully at run time by
+;; calling function `rx-to-string'. See the documentation of `rx' for
+;; a complete description of the sexp notation.
;;
;; Some examples of string regexps and their sexp counterparts:
;;
;; "^[a-z]*"
-;; (rx (and line-start (0+ (in "a-z"))))
+;; (rx line-start (0+ (in "a-z")))
;;
;; "\n[^ \t]"
-;; (rx (and "\n" (not (any " \t"))))
+;; (rx ?\n (not (in " \t")))
;;
;; "\\*\\*\\* EOOH \\*\\*\\*\n"
;; (rx "*** EOOH ***\n")
;;
;; "\\<\\(catch\\|finally\\)\\>[^_]"
-;; (rx (and word-start (submatch (or "catch" "finally")) word-end
-;; (not (any ?_))))
+;; (rx word-start (submatch (or "catch" "finally")) word-end
+;; (not (in ?_)))
;;
-;; "[ \t\n]*:\\([^:]+\\|$\\)"
-;; (rx (and (zero-or-more (in " \t\n")) ":"
-;; (submatch (or line-end (one-or-more (not (any ?:)))))))
+;; "[ \t\n]*:\\($\\|[^:]+\\)"
+;; (rx (* (in " \t\n")) ":"
+;; (submatch (or line-end (+ (not (in ?:))))))
;;
-;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
-;; (rx (and line-start
-;; "content-transfer-encoding:"
-;; (+ (? ?\n)) (any " \t")
-;; "quoted-printable"
-;; (+ (? ?\n)) (any " \t"))
+;; "^content-transfer-encoding:\\(?:\n?[\t ]\\)*quoted-printable\\(?:\n?[\t ]\\)*"
+;; (rx line-start
+;; "content-transfer-encoding:"
+;; (* (? ?\n) (in " \t"))
+;; "quoted-printable"
+;; (* (? ?\n) (in " \t")))
;;
;; (concat "^\\(?:" something-else "\\)")
-;; (rx (and line-start (eval something-else))), statically or
-;; (rx-to-string '(and line-start ,something-else)), dynamically.
+;; (rx line-start (regexp something-else))
;;
;; (regexp-opt '(STRING1 STRING2 ...))
;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
;; calls `regexp-opt' as needed.
;;
;; "^;;\\s-*\n\\|^\n"
-;; (rx (or (and line-start ";;" (0+ space) ?\n)
-;; (and line-start ?\n)))
+;; (rx (or (seq line-start ";;" (0+ space) ?\n)
+;; (seq line-start ?\n)))
;;
;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
-;; (rx (and "$Id: "
-;; (1+ (not (in " ")))
-;; " "
-;; (submatch (1+ (not (in " "))))
-;; " "))
+;; (rx "$Id: "
+;; (1+ (not (in " ")))
+;; " "
+;; (submatch (1+ (not (in " "))))
+;; " ")
;;
;; "\\\\\\\\\\[\\w+"
-;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
+;; (rx "\\\\[" (1+ word))
;;
;; etc.
@@ -106,14 +107,17 @@
;;; Code:
+(require 'cl-lib)
+(require 'cl-extra)
+
;; FIXME: support macros.
(defvar rx-constituents ;Not `const' because some modes extend it.
- '((and . (rx-and 1 nil))
+ '((and . (rx-and 0 nil))
(seq . and) ; SRE
(: . and) ; SRE
(sequence . and) ; sregex
- (or . (rx-or 1 nil))
+ (or . (rx-or 0 nil))
(| . or) ; SRE
(not-newline . ".")
(nonl . not-newline) ; SRE
@@ -173,6 +177,7 @@
(not-syntax . (rx-not-syntax 1 1)) ; sregex
(category . (rx-category 1 1 rx-check-category))
(eval . (rx-eval 1 1))
+ (literal . (rx-literal 1 1 stringp))
(regexp . (rx-regexp 1 1 stringp))
(regex . regexp) ; sregex
(digit . "[[:digit:]]")
@@ -244,7 +249,9 @@ regular expressions.")
(defconst rx-categories
- '((consonant . ?0)
+ '((space-for-indent . ?\s)
+ (base . ?.)
+ (consonant . ?0)
(base-vowel . ?1)
(upper-diacritical-mark . ?2)
(lower-diacritical-mark . ?3)
@@ -263,7 +270,9 @@ regular expressions.")
(japanese-hiragana-two-byte . ?H)
(indian-two-byte . ?I)
(japanese-katakana-two-byte . ?K)
+ (strong-left-to-right . ?L)
(korean-hangul-two-byte . ?N)
+ (strong-right-to-left . ?R)
(cyrillic-two-byte . ?Y)
(combining-diacritic . ?^)
(ascii . ?a)
@@ -295,6 +304,10 @@ regular expression strings.")
"Non-nil means produce greedy regular expressions for `zero-or-one',
`zero-or-more', and `one-or-more'. Dynamically bound.")
+(defvar rx--compile-to-lisp nil
+ "Nil means return a regexp as a string.
+Non-nil means we may return a lisp form which produces a
+string (used for `rx' macro).")
(defun rx-info (op head)
"Return parsing/code generation info for OP.
@@ -337,7 +350,7 @@ a standalone symbol."
(> nargs max-args))
(error "rx form `%s' accepts at most %d args"
(car form) max-args))
- (when (not (null type-pred))
+ (when type-pred
(dolist (sub-form (cdr form))
(unless (funcall type-pred sub-form)
(error "rx form `%s' requires args satisfying `%s'"
@@ -353,8 +366,9 @@ is non-nil."
;; for concatenation
((eq group ':)
(if (rx-atomic-p
- (if (string-match
- "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp)
+ (if (and (stringp regexp)
+ (string-match
+ "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp))
(substring regexp 0 (match-beginning 0))
regexp))
(setq group nil)))
@@ -363,9 +377,10 @@ is non-nil."
;; do anyway
((eq group t))
((rx-atomic-p regexp t) (setq group nil)))
- (if group
- (concat "\\(?:" regexp "\\)")
- regexp))
+ (cond ((and group (stringp regexp))
+ (concat "\\(?:" regexp "\\)"))
+ (group `("\\(?:" ,@regexp "\\)"))
+ (t regexp)))
(defvar rx-parent)
@@ -377,7 +392,7 @@ is non-nil."
FORM is of the form `(and FORM1 ...)'."
(rx-check form)
(rx-group-if
- (mapconcat (lambda (x) (rx-form x ':)) (cdr form) nil)
+ (rx--subforms (cdr form) ':)
(and (memq rx-parent '(* t)) rx-parent)))
@@ -385,9 +400,11 @@ FORM is of the form `(and FORM1 ...)'."
"Parse and produce code from FORM, which is `(or FORM1 ...)'."
(rx-check form)
(rx-group-if
- (if (memq nil (mapcar 'stringp (cdr form)))
- (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|")
- (regexp-opt (cdr form)))
+ (cond
+ ((null (cdr form)) regexp-unmatchable)
+ ((cl-every #'stringp (cdr form))
+ (regexp-opt (cdr form) nil t))
+ (t (rx--subforms (cdr form) '| "\\|")))
(and (memq rx-parent '(: * t)) rx-parent)))
@@ -423,6 +440,13 @@ Only both edges of each range is checked."
;; set L list of all ranges
(mapc (lambda (e) (cond ((stringp e) (push e str))
((numberp e) (push (cons e e) l))
+ ;; Ranges between ASCII and raw bytes are split,
+ ;; to prevent accidental inclusion of Unicode
+ ;; characters later on.
+ ((and (<= (car e) #x7f)
+ (>= (cdr e) #x3fff80))
+ (push (cons (car e) #x7f) l)
+ (push (cons #x3fff80 (cdr e)) l))
(t (push e l))))
args)
;; condense overlapped ranges in L
@@ -447,28 +471,38 @@ Only both edges of each range is checked."
(defun rx-check-any-string (str)
- "Check string argument STR for Rx `any'."
- (let ((i 0)
- c1 c2 l)
- (if (= 0 (length str))
- (error "String arg for Rx `any' must not be empty"))
- (while (string-match ".-." str i)
- ;; string before range: convert it to characters
- (if (< i (match-beginning 0))
- (setq l (nconc
- l
- (append (substring str i (match-beginning 0)) nil))))
- ;; range
- (setq i (match-end 0)
- c1 (aref str (match-beginning 0))
- c2 (aref str (1- i)))
- (cond
- ((< c1 c2) (setq l (nconc l (list (cons c1 c2)))))
- ((= c1 c2) (setq l (nconc l (list c1))))))
- ;; rest?
- (if (< i (length str))
- (setq l (nconc l (append (substring str i) nil))))
- l))
+ "Turn the `any' argument string STR into a list of characters.
+The original order is not preserved. Ranges, \"A-Z\", become pairs, (?A . ?Z)."
+ (let ((decode-char
+ ;; Make sure raw bytes are decoded as such, to avoid confusion with
+ ;; U+0080..U+00FF.
+ (if (multibyte-string-p str)
+ #'identity
+ (lambda (c) (if (<= #x80 c #xff)
+ (+ c #x3fff00)
+ c))))
+ (len (length str))
+ (i 0)
+ (ret nil))
+ (if (= 0 len)
+ (error "String arg for Rx `any' must not be empty"))
+ (while (< i len)
+ (cond ((and (< i (- len 2))
+ (= (aref str (+ i 1)) ?-))
+ ;; Range.
+ (let ((start (funcall decode-char (aref str i)))
+ (end (funcall decode-char (aref str (+ i 2)))))
+ (cond ((< start end) (push (cons start end) ret))
+ ((= start end) (push start ret))
+ (t
+ (error "Rx character range `%c-%c' is reversed"
+ start end)))
+ (setq i (+ i 3))))
+ (t
+ ;; Single character.
+ (push (funcall decode-char (aref str i)) ret)
+ (setq i (+ i 1)))))
+ ret))
(defun rx-check-any (arg)
@@ -483,7 +517,10 @@ Only both edges of each range is checked."
(null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation)))
(error "Invalid char class `%s' in Rx `any'" arg))
(list (substring translation 1 -1)))) ; strip outer brackets
- ((and (integerp (car-safe arg)) (integerp (cdr-safe arg)))
+ ((and (characterp (car-safe arg)) (characterp (cdr-safe arg)))
+ (unless (<= (car arg) (cdr arg))
+ (error "Rx character range `%c-%c' is reversed"
+ (car arg) (cdr arg)))
(list arg))
((stringp arg) (rx-check-any-string arg))
((error
@@ -589,7 +626,7 @@ ARG is optional."
(rx-check form)
(let ((result (rx-form (cadr form) '!))
case-fold-search)
- (cond ((string-match "\\`\\[^" result)
+ (cond ((string-match "\\`\\[\\^" result)
(cond
((equal result "[^]") "[^^]")
((and (= (length result) 4) (null (eq rx-parent '!)))
@@ -640,7 +677,10 @@ If SKIP is non-nil, allow that number of items after the head, i.e.
(unless (and (integerp (nth 1 form))
(> (nth 1 form) 0))
(error "rx `=' requires positive integer first arg"))
- (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
+ (let ((subform (rx-form (nth 2 form) '*)))
+ (if (stringp subform)
+ (format "%s\\{%d\\}" subform (nth 1 form))
+ `(,@subform ,(format "\\{%d\\}" (nth 1 form))))))
(defun rx->= (form)
@@ -650,7 +690,10 @@ If SKIP is non-nil, allow that number of items after the head, i.e.
(unless (and (integerp (nth 1 form))
(> (nth 1 form) 0))
(error "rx `>=' requires positive integer first arg"))
- (format "%s\\{%d,\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
+ (let ((subform (rx-form (nth 2 form) '*)))
+ (if (stringp subform)
+ (format "%s\\{%d,\\}" subform (nth 1 form))
+ `(,@subform ,(format "\\{%d,\\}" (nth 1 form))))))
(defun rx-** (form)
@@ -671,7 +714,10 @@ FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'."
(unless (and (integerp (nth 1 form))
(> (nth 1 form) 0))
(error "rx `repeat' requires positive integer first arg"))
- (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
+ (let ((subform (rx-form (nth 2 form) '*)))
+ (if (stringp subform)
+ (format "%s\\{%d\\}" subform (nth 1 form))
+ `(,@subform ,(format "\\{%d\\}" (nth 1 form))))))
((or (not (integerp (nth 2 form)))
(< (nth 2 form) 0)
(not (integerp (nth 1 form)))
@@ -679,30 +725,28 @@ FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'."
(< (nth 2 form) (nth 1 form)))
(error "rx `repeat' range error"))
(t
- (format "%s\\{%d,%d\\}" (rx-form (nth 3 form) '*)
- (nth 1 form) (nth 2 form)))))
+ (let ((subform (rx-form (nth 3 form) '*)))
+ (if (stringp subform)
+ (format "%s\\{%d,%d\\}" subform (nth 1 form) (nth 2 form))
+ `(,@subform ,(format "\\{%d,%d\\}" (nth 1 form) (nth 2 form))))))))
(defun rx-submatch (form)
"Parse and produce code from FORM, which is `(submatch ...)'."
- (concat "\\("
- (if (= 2 (length form))
- ;; Only one sub-form.
- (rx-form (cadr form))
- ;; Several sub-forms implicitly concatenated.
- (mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil))
- "\\)"))
+ (let ((subforms (rx--subforms (cdr form) ':)))
+ (if (stringp subforms)
+ (concat "\\(" subforms "\\)")
+ `("\\(" ,@subforms "\\)"))))
(defun rx-submatch-n (form)
"Parse and produce code from FORM, which is `(submatch-n N ...)'."
- (let ((n (nth 1 form)))
- (concat "\\(?" (number-to-string n) ":"
- (if (= 3 (length form))
- ;; Only one sub-form.
- (rx-form (nth 2 form))
- ;; Several sub-forms implicitly concatenated.
- (mapconcat (lambda (re) (rx-form re ':)) (cddr form) nil))
- "\\)")))
+ (let ((n (nth 1 form))
+ (subforms (rx--subforms (cddr form) ':)))
+ (unless (and (integerp n) (> n 0))
+ (error "rx `submatch-n' argument must be positive"))
+ (if (stringp subforms)
+ (concat "\\(?" (number-to-string n) ":" subforms "\\)")
+ `("\\(?" ,(number-to-string n) ":" ,@subforms "\\)"))))
(defun rx-backref (form)
"Parse and produce code from FORM, which is `(backref N)'."
@@ -724,15 +768,18 @@ If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
is non-nil."
(rx-check form)
(setq form (rx-trans-forms form))
- (let ((suffix (cond ((memq (car form) '(* + ?\s)) "")
- ((memq (car form) '(*? +? ??)) "?")
+ (let ((suffix (cond ((memq (car form) '(* + \? ?\s)) "")
+ ((memq (car form) '(*? +? \?? ??)) "?")
(rx-greedy-flag "")
(t "?")))
(op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
((memq (car form) '(+ +? 1+ one-or-more)) "+")
- (t "?"))))
+ (t "?")))
+ (subform (rx-form (cadr form) '*)))
(rx-group-if
- (concat (rx-form (cadr form) '*) op suffix)
+ (if (stringp subform)
+ (concat subform op suffix)
+ `(,@subform ,(concat op suffix)))
(and (memq rx-parent '(t *)) rx-parent))))
@@ -760,15 +807,18 @@ regexps that are atomic but end in operators, such as
be detected without much effort. A guarantee of no false
negatives would require a theoretic specification of the set
of all atomic regexps."
- (let ((l (length r)))
- (cond
- ((<= l 1))
- ((= l 2) (= (aref r 0) ?\\))
- ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
- ((null lax)
+ (if (and rx--compile-to-lisp
+ (not (stringp r)))
+ nil ;; Runtime value, we must assume non-atomic.
+ (let ((l (length r)))
(cond
- ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*\\]\\'" r))
- ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r)))))))
+ ((<= l 1))
+ ((= l 2) (= (aref r 0) ?\\))
+ ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
+ ((null lax)
+ (cond
+ ((string-match "\\`\\[\\^?]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*]\\'" r))
+ ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r))))))))
(defun rx-syntax (form)
@@ -824,360 +874,197 @@ If FORM is `(minimal-match FORM1)', non-greedy versions of `*',
(defun rx-regexp (form)
"Parse and produce code from FORM, which is `(regexp STRING)'."
- (rx-check form)
- (rx-group-if (cadr form) rx-parent))
-
-
-(defun rx-form (form &optional rx-parent)
+ (cond ((stringp (cadr form))
+ (rx-group-if (cadr form) rx-parent))
+ (rx--compile-to-lisp
+ ;; Always group non-string forms, since we can't be sure they
+ ;; are atomic.
+ (rx-group-if (cdr form) t))
+ (t (rx-check form))))
+
+(defun rx-literal (form)
+ "Parse and produce code from FORM, which is `(literal STRING-EXP)'."
+ (cond ((stringp (cadr form))
+ ;; This is allowed, but makes little sense, you could just
+ ;; use STRING directly.
+ (rx-group-if (regexp-quote (cadr form)) rx-parent))
+ (rx--compile-to-lisp
+ (rx-group-if `((regexp-quote ,(cadr form))) rx-parent))
+ (t (rx-check form))))
+
+(defun rx-form (form &optional parent)
"Parse and produce code for regular expression FORM.
FORM is a regular expression in sexp form.
-RX-PARENT shows which type of expression calls and controls putting of
+PARENT shows which type of expression calls and controls putting of
shy groups around the result and some more in other functions."
- (cond
- ((stringp form)
- (rx-group-if (regexp-quote form)
- (if (and (eq rx-parent '*) (< 1 (length form)))
- rx-parent)))
- ((integerp form)
- (regexp-quote (char-to-string form)))
- ((symbolp form)
- (let ((info (rx-info form nil)))
- (cond ((stringp info)
- info)
- ((null info)
- (error "Unknown rx form `%s'" form))
- (t
- (funcall (nth 0 info) form)))))
- ((consp form)
- (let ((info (rx-info (car form) 'head)))
- (unless (consp info)
- (error "Unknown rx form `%s'" (car form)))
- (funcall (nth 0 info) form)))
- (t
- (error "rx syntax error at `%s'" form))))
+ (let ((rx-parent parent))
+ (cond
+ ((stringp form)
+ (rx-group-if (regexp-quote form)
+ (if (and (eq parent '*) (< 1 (length form)))
+ parent)))
+ ((integerp form)
+ (regexp-quote (char-to-string form)))
+ ((symbolp form)
+ (let ((info (rx-info form nil)))
+ (cond ((stringp info)
+ info)
+ ((null info)
+ (error "Unknown rx form `%s'" form))
+ (t
+ (funcall (nth 0 info) form)))))
+ ((consp form)
+ (let ((info (rx-info (car form) 'head)))
+ (unless (consp info)
+ (error "Unknown rx form `%s'" (car form)))
+ (funcall (nth 0 info) form)))
+ (t
+ (error "rx syntax error at `%s'" form)))))
+
+(defun rx--subforms (subforms &optional parent separator)
+ "Produce code for regular expressions SUBFORMS.
+SUBFORMS is a list of regular expression sexps.
+PARENT controls grouping, as in `rx-form'.
+Insert SEPARATOR between the code from each of SUBFORMS."
+ (if (null (cdr subforms))
+ ;; Zero or one forms, no need for grouping.
+ (and subforms (rx-form (car subforms)))
+ (let ((listify (lambda (x)
+ (if (listp x) (copy-sequence x)
+ (list x)))))
+ (setq subforms (mapcar (lambda (x) (rx-form x parent)) subforms))
+ (cond ((or (not rx--compile-to-lisp)
+ (cl-every #'stringp subforms))
+ (mapconcat #'identity subforms separator))
+ (separator
+ (nconc (funcall listify (car subforms))
+ (mapcan (lambda (x)
+ (cons separator (funcall listify x)))
+ (cdr subforms))))
+ (t (mapcan listify subforms))))))
;;;###autoload
(defun rx-to-string (form &optional no-group)
"Parse and produce code for regular expression FORM.
FORM is a regular expression in sexp form.
-NO-GROUP non-nil means don't put shy groups around the result."
+NO-GROUP non-nil means don't put shy groups around the result.
+
+In contrast to the `rx' macro, subforms `literal' and `regexp'
+will not accept non-string arguments, i.e., (literal STRING)
+becomes just a more verbose version of STRING."
(rx-group-if (rx-form form) (null no-group)))
;;;###autoload
(defmacro rx (&rest regexps)
"Translate regular expressions REGEXPS in sexp form to a regexp string.
-REGEXPS is a non-empty sequence of forms of the sort listed below.
-
-Note that `rx' is a Lisp macro; when used in a Lisp program being
-compiled, the translation is performed by the compiler.
-See `rx-to-string' for how to do such a translation at run-time.
-
-The following are valid subforms of regular expressions in sexp
-notation.
-
-STRING
- matches string STRING literally.
-
-CHAR
- matches character CHAR literally.
-
-`not-newline', `nonl'
- matches any character except a newline.
-
-`anything'
- matches any character
-
-`(any SET ...)'
-`(in SET ...)'
-`(char SET ...)'
- matches any character in SET .... SET may be a character or string.
- Ranges of characters can be specified as `A-Z' in strings.
- Ranges may also be specified as conses like `(?A . ?Z)'.
-
- SET may also be the name of a character class: `digit',
- `control', `hex-digit', `blank', `graph', `print', `alnum',
- `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper',
- `word', or one of their synonyms.
-
-`(not (any SET ...))'
- matches any character not in SET ...
-
-`line-start', `bol'
- matches the empty string, but only at the beginning of a line
- in the text being matched
-
-`line-end', `eol'
- is similar to `line-start' but matches only at the end of a line
-
-`string-start', `bos', `bot'
- matches the empty string, but only at the beginning of the
- string being matched against.
-
-`string-end', `eos', `eot'
- matches the empty string, but only at the end of the
- string being matched against.
-
-`buffer-start'
- matches the empty string, but only at the beginning of the
- buffer being matched against. Actually equivalent to `string-start'.
-
-`buffer-end'
- matches the empty string, but only at the end of the
- buffer being matched against. Actually equivalent to `string-end'.
-
-`point'
- matches the empty string, but only at point.
-
-`word-start', `bow'
- matches the empty string, but only at the beginning of a word.
-
-`word-end', `eow'
- matches the empty string, but only at the end of a word.
-
-`word-boundary'
- matches the empty string, but only at the beginning or end of a
- word.
-
-`(not word-boundary)'
-`not-word-boundary'
- matches the empty string, but not at the beginning or end of a
- word.
-
-`symbol-start'
- matches the empty string, but only at the beginning of a symbol.
-
-`symbol-end'
- matches the empty string, but only at the end of a symbol.
-
-`digit', `numeric', `num'
- matches 0 through 9.
-
-`control', `cntrl'
- matches ASCII control characters.
-
-`hex-digit', `hex', `xdigit'
- matches 0 through 9, a through f and A through F.
-
-`blank'
- matches horizontal whitespace, as defined by Annex C of the
- Unicode Technical Standard #18. In particular, it matches
- spaces, tabs, and other characters whose Unicode
- `general-category' property indicates they are spacing
- separators.
-
-`graphic', `graph'
- matches graphic characters--everything except whitespace, ASCII
- and non-ASCII control characters, surrogates, and codepoints
- unassigned by Unicode.
-
-`printing', `print'
- matches whitespace and graphic characters.
-
-`alphanumeric', `alnum'
- matches alphabetic characters and digits. For multibyte characters,
- it matches characters whose Unicode `general-category' property
- indicates they are alphabetic or decimal number characters.
-
-`letter', `alphabetic', `alpha'
- matches alphabetic characters. For multibyte characters,
- it matches characters whose Unicode `general-category' property
- indicates they are alphabetic characters.
-
-`ascii'
- matches ASCII (unibyte) characters.
-
-`nonascii'
- matches non-ASCII (multibyte) characters.
-
-`lower', `lower-case'
- matches anything lower-case, as determined by the current case
- table. If `case-fold-search' is non-nil, this also matches any
- upper-case letter.
-
-`upper', `upper-case'
- matches anything upper-case, as determined by the current case
- table. If `case-fold-search' is non-nil, this also matches any
- lower-case letter.
-
-`punctuation', `punct'
- matches punctuation. (But at present, for multibyte characters,
- it matches anything that has non-word syntax.)
-
-`space', `whitespace', `white'
- matches anything that has whitespace syntax.
-
-`word', `wordchar'
- matches anything that has word syntax.
-
-`not-wordchar'
- matches anything that has non-word syntax.
-
-`(syntax SYNTAX)'
- matches a character with syntax SYNTAX. SYNTAX must be one
- of the following symbols, or a symbol corresponding to the syntax
- character, e.g. `\\.' for `\\s.'.
-
- `whitespace' (\\s- in string notation)
- `punctuation' (\\s.)
- `word' (\\sw)
- `symbol' (\\s_)
- `open-parenthesis' (\\s()
- `close-parenthesis' (\\s))
- `expression-prefix' (\\s')
- `string-quote' (\\s\")
- `paired-delimiter' (\\s$)
- `escape' (\\s\\)
- `character-quote' (\\s/)
- `comment-start' (\\s<)
- `comment-end' (\\s>)
- `string-delimiter' (\\s|)
- `comment-delimiter' (\\s!)
-
-`(not (syntax SYNTAX))'
- matches a character that doesn't have syntax SYNTAX.
-
-`(category CATEGORY)'
- matches a character with category CATEGORY. CATEGORY must be
- either a character to use for C, or one of the following symbols.
-
- `consonant' (\\c0 in string notation)
- `base-vowel' (\\c1)
- `upper-diacritical-mark' (\\c2)
- `lower-diacritical-mark' (\\c3)
- `tone-mark' (\\c4)
- `symbol' (\\c5)
- `digit' (\\c6)
- `vowel-modifying-diacritical-mark' (\\c7)
- `vowel-sign' (\\c8)
- `semivowel-lower' (\\c9)
- `not-at-end-of-line' (\\c<)
- `not-at-beginning-of-line' (\\c>)
- `alpha-numeric-two-byte' (\\cA)
- `chinese-two-byte' (\\cC)
- `greek-two-byte' (\\cG)
- `japanese-hiragana-two-byte' (\\cH)
- `indian-two-byte' (\\cI)
- `japanese-katakana-two-byte' (\\cK)
- `korean-hangul-two-byte' (\\cN)
- `cyrillic-two-byte' (\\cY)
- `combining-diacritic' (\\c^)
- `ascii' (\\ca)
- `arabic' (\\cb)
- `chinese' (\\cc)
- `ethiopic' (\\ce)
- `greek' (\\cg)
- `korean' (\\ch)
- `indian' (\\ci)
- `japanese' (\\cj)
- `japanese-katakana' (\\ck)
- `latin' (\\cl)
- `lao' (\\co)
- `tibetan' (\\cq)
- `japanese-roman' (\\cr)
- `thai' (\\ct)
- `vietnamese' (\\cv)
- `hebrew' (\\cw)
- `cyrillic' (\\cy)
- `can-break' (\\c|)
-
-`(not (category CATEGORY))'
- matches a character that doesn't have category CATEGORY.
-
-`(and SEXP1 SEXP2 ...)'
-`(: SEXP1 SEXP2 ...)'
-`(seq SEXP1 SEXP2 ...)'
-`(sequence SEXP1 SEXP2 ...)'
- matches what SEXP1 matches, followed by what SEXP2 matches, etc.
-
-`(submatch SEXP1 SEXP2 ...)'
-`(group SEXP1 SEXP2 ...)'
- like `and', but makes the match accessible with `match-end',
- `match-beginning', and `match-string'.
-
-`(submatch-n N SEXP1 SEXP2 ...)'
-`(group-n N SEXP1 SEXP2 ...)'
- like `group', but make it an explicitly-numbered group with
- group number N.
-
-`(or SEXP1 SEXP2 ...)'
-`(| SEXP1 SEXP2 ...)'
- matches anything that matches SEXP1 or SEXP2, etc. If all
- args are strings, use `regexp-opt' to optimize the resulting
- regular expression.
-
-`(minimal-match SEXP)'
- produce a non-greedy regexp for SEXP. Normally, regexps matching
- zero or more occurrences of something are \"greedy\" in that they
- match as much as they can, as long as the overall regexp can
- still match. A non-greedy regexp matches as little as possible.
-
-`(maximal-match SEXP)'
- produce a greedy regexp for SEXP. This is the default.
-
-Below, `SEXP ...' represents a sequence of regexp forms, treated as if
-enclosed in `(and ...)'.
-
-`(zero-or-more SEXP ...)'
-`(0+ SEXP ...)'
- matches zero or more occurrences of what SEXP ... matches.
-
-`(* SEXP ...)'
- like `zero-or-more', but always produces a greedy regexp, independent
- of `rx-greedy-flag'.
-
-`(*? SEXP ...)'
- like `zero-or-more', but always produces a non-greedy regexp,
- independent of `rx-greedy-flag'.
-
-`(one-or-more SEXP ...)'
-`(1+ SEXP ...)'
- matches one or more occurrences of SEXP ...
-
-`(+ SEXP ...)'
- like `one-or-more', but always produces a greedy regexp.
-
-`(+? SEXP ...)'
- like `one-or-more', but always produces a non-greedy regexp.
-
-`(zero-or-one SEXP ...)'
-`(optional SEXP ...)'
-`(opt SEXP ...)'
- matches zero or one occurrences of A.
-
-`(? SEXP ...)'
- like `zero-or-one', but always produces a greedy regexp.
-
-`(?? SEXP ...)'
- like `zero-or-one', but always produces a non-greedy regexp.
-
-`(repeat N SEXP)'
-`(= N SEXP ...)'
- matches N occurrences.
-
-`(>= N SEXP ...)'
- matches N or more occurrences.
-
-`(repeat N M SEXP)'
-`(** N M SEXP ...)'
- matches N to M occurrences.
-
-`(backref N)'
- matches what was matched previously by submatch N.
-
-`(eval FORM)'
- evaluate FORM and insert result. If result is a string,
- `regexp-quote' it.
-
-`(regexp REGEXP)'
- include REGEXP in string notation in the result."
- (cond ((null regexps)
- (error "No regexp"))
- ((cdr regexps)
- (rx-to-string `(and ,@regexps) t))
- (t
- (rx-to-string (car regexps) t))))
+Each argument is one of the forms below; RX is a subform, and RX... stands
+for one or more RXs. For details, see Info node `(elisp) Rx Notation'.
+See `rx-to-string' for the corresponding function.
+
+STRING Match a literal string.
+CHAR Match a literal character.
+
+(seq RX...) Match the RXs in sequence. Alias: :, sequence, and.
+(or RX...) Match one of the RXs. Alias: |.
+
+(zero-or-more RX...) Match RXs zero or more times. Alias: 0+.
+(one-or-more RX...) Match RXs one or more times. Alias: 1+.
+(zero-or-one RX...) Match RXs or the empty string. Alias: opt, optional.
+(* RX...) Match RXs zero or more times; greedy.
+(+ RX...) Match RXs one or more times; greedy.
+(? RX...) Match RXs or the empty string; greedy.
+(*? RX...) Match RXs zero or more times; non-greedy.
+(+? RX...) Match RXs one or more times; non-greedy.
+(?? RX...) Match RXs or the empty string; non-greedy.
+(= N RX...) Match RXs exactly N times.
+(>= N RX...) Match RXs N or more times.
+(** N M RX...) Match RXs N to M times. Alias: repeat.
+(minimal-match RX) Match RX, with zero-or-more, one-or-more, zero-or-one
+ and aliases using non-greedy matching.
+(maximal-match RX) Match RX, with zero-or-more, one-or-more, zero-or-one
+ and aliases using greedy matching, which is the default.
+
+(any SET...) Match a character from one of the SETs. Each SET is a
+ character, a string, a range as string \"A-Z\" or cons
+ (?A . ?Z), or a character class (see below). Alias: in, char.
+(not CHARSPEC) Match one character not matched by CHARSPEC. CHARSPEC
+ can be (any ...), (syntax ...), (category ...),
+ or a character class.
+not-newline Match any character except a newline. Alias: nonl.
+anything Match any character.
+
+CHARCLASS Match a character from a character class. One of:
+ alpha, alphabetic, letter Alphabetic characters (defined by Unicode).
+ alnum, alphanumeric Alphabetic or decimal digit chars (Unicode).
+ digit, numeric, num 0-9.
+ xdigit, hex-digit, hex 0-9, A-F, a-f.
+ cntrl, control ASCII codes 0-31.
+ blank Horizontal whitespace (Unicode).
+ space, whitespace, white Chars with whitespace syntax.
+ lower, lower-case Lower-case chars, from current case table.
+ upper, upper-case Upper-case chars, from current case table.
+ graph, graphic Graphic characters (Unicode).
+ print, printing Whitespace or graphic (Unicode).
+ punct, punctuation Not control, space, letter or digit (ASCII);
+ not word syntax (non-ASCII).
+ word, wordchar Characters with word syntax.
+ ascii ASCII characters (codes 0-127).
+ nonascii Non-ASCII characters (but not raw bytes).
+
+(syntax SYNTAX) Match a character with syntax SYNTAX, being one of:
+ whitespace, punctuation, word, symbol, open-parenthesis,
+ close-parenthesis, expression-prefix, string-quote,
+ paired-delimiter, escape, character-quote, comment-start,
+ comment-end, string-delimiter, comment-delimiter
+
+(category CAT) Match a character in category CAT, being one of:
+ space-for-indent, base, consonant, base-vowel,
+ upper-diacritical-mark, lower-diacritical-mark, tone-mark, symbol,
+ digit, vowel-modifying-diacritical-mark, vowel-sign,
+ semivowel-lower, not-at-end-of-line, not-at-beginning-of-line,
+ alpha-numeric-two-byte, chinese-two-byte, greek-two-byte,
+ japanese-hiragana-two-byte, indian-two-byte,
+ japanese-katakana-two-byte, strong-left-to-right,
+ korean-hangul-two-byte, strong-right-to-left, cyrillic-two-byte,
+ combining-diacritic, ascii, arabic, chinese, ethiopic, greek,
+ korean, indian, japanese, japanese-katakana, latin, lao,
+ tibetan, japanese-roman, thai, vietnamese, hebrew, cyrillic,
+ can-break
+
+Zero-width assertions: these all match the empty string in specific places.
+ line-start At the beginning of a line. Alias: bol.
+ line-end At the end of a line. Alias: eol.
+ string-start At the start of the string or buffer.
+ Alias: buffer-start, bos, bot.
+ string-end At the end of the string or buffer.
+ Alias: buffer-end, eos, eot.
+ point At point.
+ word-start At the beginning of a word.
+ word-end At the end of a word.
+ word-boundary At the beginning or end of a word.
+ not-word-boundary Not at the beginning or end of a word.
+ symbol-start At the beginning of a symbol.
+ symbol-end At the end of a symbol.
+
+(group RX...) Match RXs and define a capture group. Alias: submatch.
+(group-n N RX...) Match RXs and define capture group N. Alias: submatch-n.
+(backref N) Match the text that capture group N matched.
+
+(literal EXPR) Match the literal string from evaluating EXPR at run time.
+(regexp EXPR) Match the string regexp from evaluating EXPR at run time.
+(eval EXPR) Match the rx sexp from evaluating EXPR at compile time."
+ (let* ((rx--compile-to-lisp t)
+ (re (cond ((null regexps)
+ (error "No regexp"))
+ ((cdr regexps)
+ (rx-to-string `(and ,@regexps) t))
+ (t
+ (rx-to-string (car regexps) t)))))
+ (if (stringp re)
+ re
+ `(concat ,@re))))
(pcase-defmacro rx (&rest regexps)
@@ -1239,14 +1126,6 @@ string as argument to `match-string'."
for var in vars
collect `(app (match-string ,i) ,var)))))
-;; ;; sregex.el replacement
-
-;; ;;;###autoload (provide 'sregex)
-;; ;;;###autoload (autoload 'sregex "rx")
-;; (defalias 'sregex 'rx-to-string)
-;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
-;; (defalias 'sregexq 'rx)
-
(provide 'rx)
;;; rx.el ends here