diff options
Diffstat (limited to 'lisp/emacs-lisp/rx.el')
-rw-r--r-- | lisp/emacs-lisp/rx.el | 2416 |
1 files changed, 1221 insertions, 1195 deletions
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index a16c5da053a..d7677f14443 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -1,11 +1,7 @@ -;;; rx.el --- sexp notation for regular expressions +;;; rx.el --- S-exp notation for regexps --*- lexical-binding: t -*- ;; Copyright (C) 2001-2019 Free Software Foundation, Inc. -;; Author: Gerd Moellmann <gerd@gnu.org> -;; Maintainer: emacs-devel@gnu.org -;; Keywords: strings, regexps, extensions - ;; This file is part of GNU Emacs. ;; GNU Emacs is free software: you can redistribute it and/or modify @@ -23,1230 +19,1260 @@ ;;; Commentary: -;; This is another implementation of sexp-form regular expressions. -;; It was unfortunately written without being aware of the Sregex -;; package coming with Emacs, but as things stand, Rx completely -;; covers all regexp features, which Sregex doesn't, doesn't suffer -;; from the bugs mentioned in the commentary section of Sregex, and -;; uses a nicer syntax (IMHO, of course :-). - -;; This significantly extended version of the original, is almost -;; compatible with Sregex. The only incompatibility I (fx) know of is -;; that the `repeat' form can't have multiple regexp args. - -;; Now alternative forms are provided for a degree of compatibility -;; with Olin Shivers' attempted definitive SRE notation. SRE forms -;; not catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>, -;; ,<exp>, (word ...), word+, posix-string, and character class forms. -;; Some forms are inconsistent with SRE, either for historical reasons -;; or because of the implementation -- simple translation into Emacs -;; regexp strings. These include: any, word. Also, case-sensitivity -;; and greediness are controlled by variables external to the regexp, -;; and you need to feed the forms to the `posix-' functions to get -;; SRE's POSIX semantics. There are probably more difficulties. - -;; Rx translates a sexp notation for regular expressions into the -;; usual string notation. The translation can be done at compile-time -;; by using the `rx' macro. It can be done at run-time by calling -;; function `rx-to-string'. See the documentation of `rx' for a -;; complete description of the sexp notation. -;; -;; Some examples of string regexps and their sexp counterparts: -;; -;; "^[a-z]*" -;; (rx (and line-start (0+ (in "a-z")))) -;; -;; "\n[^ \t]" -;; (rx (and "\n" (not (any " \t")))) -;; -;; "\\*\\*\\* EOOH \\*\\*\\*\n" -;; (rx "*** EOOH ***\n") -;; -;; "\\<\\(catch\\|finally\\)\\>[^_]" -;; (rx (and word-start (submatch (or "catch" "finally")) word-end -;; (not (any ?_)))) -;; -;; "[ \t\n]*:\\([^:]+\\|$\\)" -;; (rx (and (zero-or-more (in " \t\n")) ":" -;; (submatch (or line-end (one-or-more (not (any ?:))))))) -;; -;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*" -;; (rx (and line-start -;; "content-transfer-encoding:" -;; (+ (? ?\n)) (any " \t") -;; "quoted-printable" -;; (+ (? ?\n)) (any " \t")) -;; -;; (concat "^\\(?:" something-else "\\)") -;; (rx (and line-start (eval something-else))), statically or -;; (rx-to-string '(and line-start ,something-else)), dynamically. +;; This facility allows writing regexps in a sexp-based language +;; instead of strings. Regexps in the `rx' notation are easier to +;; read, write and maintain; they can be indented and commented in a +;; natural way, and are easily composed by program code. +;; The translation to string regexp is done by a macro and does not +;; incur any extra processing during run time. Example: ;; -;; (regexp-opt '(STRING1 STRING2 ...)) -;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically -;; calls `regexp-opt' as needed. +;; (rx bos (or (not (any "^")) +;; (seq "^" (or " *" "[")))) ;; -;; "^;;\\s-*\n\\|^\n" -;; (rx (or (and line-start ";;" (0+ space) ?\n) -;; (and line-start ?\n))) -;; -;; "\\$[I]d: [^ ]+ \\([^ ]+\\) " -;; (rx (and "$Id: " -;; (1+ (not (in " "))) -;; " " -;; (submatch (1+ (not (in " ")))) -;; " ")) -;; -;; "\\\\\\\\\\[\\w+" -;; (rx (and ?\\ ?\\ ?\[ (1+ word))) -;; -;; etc. - -;;; History: +;; => "\\`\\(?:[^^]\\|\\^\\(?: \\*\\|\\[\\)\\)" ;; +;; The notation is much influenced by and retains some compatibility with +;; Olin Shivers's SRE, with concessions to Emacs regexp peculiarities, +;; and the older Emacs package Sregex. ;;; Code: -;; FIXME: support macros. - -(defvar rx-constituents ;Not `const' because some modes extend it. - '((and . (rx-and 1 nil)) - (seq . and) ; SRE - (: . and) ; SRE - (sequence . and) ; sregex - (or . (rx-or 1 nil)) - (| . or) ; SRE - (not-newline . ".") - (nonl . not-newline) ; SRE - (anything . (rx-anything 0 nil)) - (any . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE - (any . ".") ; sregex - (in . any) - (char . any) ; sregex - (not-char . (rx-not-char 1 nil rx-check-any)) ; sregex - (not . (rx-not 1 1 rx-check-not)) - (repeat . (rx-repeat 2 nil)) - (= . (rx-= 2 nil)) ; SRE - (>= . (rx->= 2 nil)) ; SRE - (** . (rx-** 2 nil)) ; SRE - (submatch . (rx-submatch 1 nil)) ; SRE - (group . submatch) ; sregex - (submatch-n . (rx-submatch-n 2 nil)) - (group-n . submatch-n) - (zero-or-more . (rx-kleene 1 nil)) - (one-or-more . (rx-kleene 1 nil)) - (zero-or-one . (rx-kleene 1 nil)) - (\? . zero-or-one) ; SRE - (\?? . zero-or-one) - (* . zero-or-more) ; SRE - (*? . zero-or-more) - (0+ . zero-or-more) - (+ . one-or-more) ; SRE - (+? . one-or-more) - (1+ . one-or-more) - (optional . zero-or-one) - (opt . zero-or-one) ; sregex - (minimal-match . (rx-greedy 1 1)) - (maximal-match . (rx-greedy 1 1)) - (backref . (rx-backref 1 1 rx-check-backref)) - (line-start . "^") - (bol . line-start) ; SRE - (line-end . "$") - (eol . line-end) ; SRE - (string-start . "\\`") - (bos . string-start) ; SRE - (bot . string-start) ; sregex - (string-end . "\\'") - (eos . string-end) ; SRE - (eot . string-end) ; sregex - (buffer-start . "\\`") - (buffer-end . "\\'") - (point . "\\=") - (word-start . "\\<") - (bow . word-start) ; SRE - (word-end . "\\>") - (eow . word-end) ; SRE - (word-boundary . "\\b") - (not-word-boundary . "\\B") ; sregex - (symbol-start . "\\_<") - (symbol-end . "\\_>") - (syntax . (rx-syntax 1 1)) - (not-syntax . (rx-not-syntax 1 1)) ; sregex - (category . (rx-category 1 1 rx-check-category)) - (eval . (rx-eval 1 1)) - (regexp . (rx-regexp 1 1 stringp)) - (regex . regexp) ; sregex - (digit . "[[:digit:]]") - (numeric . digit) ; SRE - (num . digit) ; SRE - (control . "[[:cntrl:]]") ; SRE - (cntrl . control) ; SRE - (hex-digit . "[[:xdigit:]]") ; SRE - (hex . hex-digit) ; SRE - (xdigit . hex-digit) ; SRE - (blank . "[[:blank:]]") ; SRE - (graphic . "[[:graph:]]") ; SRE - (graph . graphic) ; SRE - (printing . "[[:print:]]") ; SRE - (print . printing) ; SRE - (alphanumeric . "[[:alnum:]]") ; SRE - (alnum . alphanumeric) ; SRE - (letter . "[[:alpha:]]") - (alphabetic . letter) ; SRE - (alpha . letter) ; SRE - (ascii . "[[:ascii:]]") ; SRE - (nonascii . "[[:nonascii:]]") - (lower . "[[:lower:]]") ; SRE - (lower-case . lower) ; SRE - (punctuation . "[[:punct:]]") ; SRE - (punct . punctuation) ; SRE - (space . "[[:space:]]") ; SRE - (whitespace . space) ; SRE - (white . space) ; SRE - (upper . "[[:upper:]]") ; SRE - (upper-case . upper) ; SRE - (word . "[[:word:]]") ; inconsistent with SRE - (wordchar . word) ; sregex - (not-wordchar . "\\W")) - "Alist of sexp form regexp constituents. -Each element of the alist has the form (SYMBOL . DEFN). -SYMBOL is a valid constituent of sexp regular expressions. -If DEFN is a string, SYMBOL is translated into DEFN. -If DEFN is a symbol, use the definition of DEFN, recursively. -Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE). -FUNCTION is used to produce code for SYMBOL. MIN-ARGS and MAX-ARGS -are the minimum and maximum number of arguments the function-form -sexp constituent SYMBOL may have in sexp regular expressions. -MAX-ARGS nil means no limit. PREDICATE, if specified, means that -all arguments must satisfy PREDICATE.") - - -(defconst rx-syntax - '((whitespace . ?-) - (punctuation . ?.) - (word . ?w) - (symbol . ?_) - (open-parenthesis . ?\() - (close-parenthesis . ?\)) - (expression-prefix . ?\') - (string-quote . ?\") - (paired-delimiter . ?$) - (escape . ?\\) - (character-quote . ?/) - (comment-start . ?<) - (comment-end . ?>) - (string-delimiter . ?|) - (comment-delimiter . ?!)) - "Alist mapping Rx syntax symbols to syntax characters. -Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid -symbol in `(syntax SYMBOL)', and CHAR is the syntax character -corresponding to SYMBOL, as it would be used with \\s or \\S in -regular expressions.") - - -(defconst rx-categories - '((consonant . ?0) - (base-vowel . ?1) - (upper-diacritical-mark . ?2) - (lower-diacritical-mark . ?3) - (tone-mark . ?4) - (symbol . ?5) - (digit . ?6) - (vowel-modifying-diacritical-mark . ?7) - (vowel-sign . ?8) - (semivowel-lower . ?9) - (not-at-end-of-line . ?<) - (not-at-beginning-of-line . ?>) - (alpha-numeric-two-byte . ?A) - (chinese-two-byte . ?C) - (chinse-two-byte . ?C) ;; A typo in Emacs 21.1-24.3. - (greek-two-byte . ?G) - (japanese-hiragana-two-byte . ?H) - (indian-two-byte . ?I) - (japanese-katakana-two-byte . ?K) - (korean-hangul-two-byte . ?N) - (cyrillic-two-byte . ?Y) - (combining-diacritic . ?^) - (ascii . ?a) - (arabic . ?b) - (chinese . ?c) - (ethiopic . ?e) - (greek . ?g) - (korean . ?h) - (indian . ?i) - (japanese . ?j) - (japanese-katakana . ?k) - (latin . ?l) - (lao . ?o) - (tibetan . ?q) - (japanese-roman . ?r) - (thai . ?t) - (vietnamese . ?v) - (hebrew . ?w) - (cyrillic . ?y) - (can-break . ?|)) - "Alist mapping symbols to category characters. -Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid -symbol in `(category SYMBOL)', and CHAR is the category character -corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in -regular expression strings.") - - -(defvar rx-greedy-flag t - "Non-nil means produce greedy regular expressions for `zero-or-one', -`zero-or-more', and `one-or-more'. Dynamically bound.") - - -(defun rx-info (op head) - "Return parsing/code generation info for OP. -If OP is the space character ASCII 32, return info for the symbol `?'. -If OP is the character `?', return info for the symbol `??'. -See also `rx-constituents'. -If HEAD is non-nil, then OP is the head of a sexp, otherwise it's -a standalone symbol." - (cond ((eq op ? ) (setq op '\?)) - ((eq op ??) (setq op '\??))) - (let (old-op) - (while (and (not (null op)) (symbolp op)) - (setq old-op op) - (setq op (cdr (assq op rx-constituents))) - (when (if head (stringp op) (consp op)) - ;; We found something but of the wrong kind. Let's look for an - ;; alternate definition for the other case. - (let ((new-op - (cdr (assq old-op (cdr (memq (assq old-op rx-constituents) - rx-constituents)))))) - (if (and new-op (not (if head (stringp new-op) (consp new-op)))) - (setq op new-op)))))) - op) - - -(defun rx-check (form) - "Check FORM according to its car's parsing info." - (unless (listp form) - (error "rx `%s' needs argument(s)" form)) - (let* ((rx (rx-info (car form) 'head)) - (nargs (1- (length form))) - (min-args (nth 1 rx)) - (max-args (nth 2 rx)) - (type-pred (nth 3 rx))) - (when (and (not (null min-args)) - (< nargs min-args)) - (error "rx form `%s' requires at least %d args" - (car form) min-args)) - (when (and (not (null max-args)) - (> nargs max-args)) - (error "rx form `%s' accepts at most %d args" - (car form) max-args)) - (when (not (null type-pred)) - (dolist (sub-form (cdr form)) - (unless (funcall type-pred sub-form) - (error "rx form `%s' requires args satisfying `%s'" - (car form) type-pred)))))) - - -(defun rx-group-if (regexp group) - "Put shy groups around REGEXP if seemingly necessary when GROUP -is non-nil." +;; The `rx--translate...' functions below return (REGEXP . PRECEDENCE), +;; where REGEXP is a list of string expressions that will be +;; concatenated into a regexp, and PRECEDENCE is one of +;; +;; t -- can be used as argument to postfix operators (eg. "a") +;; seq -- can be concatenated in sequence with other seq or higher (eg. "ab") +;; lseq -- can be concatenated to the left of rseq or higher (eg. "^a") +;; rseq -- can be concatenated to the right of lseq or higher (eg. "a$") +;; nil -- can only be used in alternatives (eg. "a\\|b") +;; +;; They form a lattice: +;; +;; t highest precedence +;; | +;; seq +;; / \ +;; lseq rseq +;; \ / +;; nil lowest precedence + + +(defconst rx--char-classes + '((digit . digit) + (numeric . digit) + (num . digit) + (control . cntrl) + (cntrl . cntrl) + (hex-digit . xdigit) + (hex . xdigit) + (xdigit . xdigit) + (blank . blank) + (graphic . graph) + (graph . graph) + (printing . print) + (print . print) + (alphanumeric . alnum) + (alnum . alnum) + (letter . alpha) + (alphabetic . alpha) + (alpha . alpha) + (ascii . ascii) + (nonascii . nonascii) + (lower . lower) + (lower-case . lower) + (punctuation . punct) + (punct . punct) + (space . space) + (whitespace . space) + (white . space) + (upper . upper) + (upper-case . upper) + (word . word) + (wordchar . word) + (unibyte . unibyte) + (multibyte . multibyte)) + "Alist mapping rx symbols to character classes. +Most of the names are from SRE.") + +(defvar rx-constituents nil + "Alist of old-style rx extensions, for compatibility. +For new code, use `rx-define', `rx-let' or `rx-let-eval'. + +Each element is (SYMBOL . DEF). + +If DEF is a symbol, then SYMBOL is an alias of DEF. + +If DEF is a string, then SYMBOL is a plain rx symbol defined as the + regexp string DEF. + +If DEF is a list on the form (FUN MIN-ARGS MAX-ARGS PRED), then + SYMBOL is an rx form with at least MIN-ARGS and at most + MAX-ARGS arguments. If MAX-ARGS is nil, then there is no upper limit. + FUN is a function taking the entire rx form as single argument + and returning the translated regexp string. + If PRED is non-nil, it is a predicate that all actual arguments must + satisfy.") + +(defvar rx--local-definitions nil + "Alist of dynamic local rx definitions. +Each entry is: + (NAME DEF) -- NAME is an rx symbol defined as the rx form DEF. + (NAME ARGS DEF) -- NAME is an rx form with arglist ARGS, defined + as the rx form DEF (which can contain members of ARGS).") + +(defsubst rx--lookup-def (name) + "Current definition of NAME: (DEF) or (ARGS DEF), or nil if none." + (or (cdr (assq name rx--local-definitions)) + (get name 'rx-definition))) + +(defun rx--expand-def (form) + "FORM expanded (once) if a user-defined construct; otherwise nil." + (cond ((symbolp form) + (let ((def (rx--lookup-def form))) + (and def + (if (cdr def) + (error "Not an `rx' symbol definition: %s" form) + (car def))))) + ((consp form) + (let* ((op (car form)) + (def (rx--lookup-def op))) + (and def + (if (cdr def) + (rx--expand-template + op (cdr form) (nth 0 def) (nth 1 def)) + (error "Not an `rx' form definition: %s" op))))))) + +;; TODO: Additions to consider: +;; - A construct like `or' but without the match order guarantee, +;; maybe `unordered-or'. Useful for composition or generation of +;; alternatives; permits more effective use of regexp-opt. + +(defun rx--translate-symbol (sym) + "Translate an rx symbol. Return (REGEXP . PRECEDENCE)." + (pcase sym + ;; Use `list' instead of a quoted list to wrap the strings here, + ;; since the return value may be mutated. + ((or 'nonl 'not-newline 'any) (cons (list ".") t)) + ((or 'anychar 'anything) (cons (list "[^z-a]") t)) + ('unmatchable (rx--empty)) + ((or 'bol 'line-start) (cons (list "^") 'lseq)) + ((or 'eol 'line-end) (cons (list "$") 'rseq)) + ((or 'bos 'string-start 'bot 'buffer-start) (cons (list "\\`") t)) + ((or 'eos 'string-end 'eot 'buffer-end) (cons (list "\\'") t)) + ('point (cons (list "\\=") t)) + ((or 'bow 'word-start) (cons (list "\\<") t)) + ((or 'eow 'word-end) (cons (list "\\>") t)) + ('word-boundary (cons (list "\\b") t)) + ('not-word-boundary (cons (list "\\B") t)) + ('symbol-start (cons (list "\\_<") t)) + ('symbol-end (cons (list "\\_>") t)) + ('not-wordchar (cons (list "\\W") t)) + (_ + (cond + ((let ((class (cdr (assq sym rx--char-classes)))) + (and class (cons (list (concat "[[:" (symbol-name class) ":]]")) t)))) + + ((let ((expanded (rx--expand-def sym))) + (and expanded (rx--translate expanded)))) + + ;; For compatibility with old rx. + ((let ((entry (assq sym rx-constituents))) + (and (progn + (while (and entry (not (stringp (cdr entry)))) + (setq entry + (if (symbolp (cdr entry)) + ;; Alias for another entry. + (assq (cdr entry) rx-constituents) + ;; Wrong type, try further down the list. + (assq (car entry) + (cdr (memq entry rx-constituents)))))) + entry) + (cons (list (cdr entry)) nil)))) + (t (error "Unknown rx symbol `%s'" sym)))))) + +(defun rx--enclose (left-str rexp right-str) + "Bracket REXP by LEFT-STR and RIGHT-STR." + (append (list left-str) rexp (list right-str))) + +(defun rx--bracket (rexp) + (rx--enclose "\\(?:" rexp "\\)")) + +(defun rx--sequence (left right) + "Return the sequence (concatenation) of two translated items, +each on the form (REGEXP . PRECEDENCE), returning (REGEXP . PRECEDENCE)." + ;; Concatenation rules: + ;; seq ++ seq -> seq + ;; lseq ++ seq -> lseq + ;; seq ++ rseq -> rseq + ;; lseq ++ rseq -> nil + (cond ((not (car left)) right) + ((not (car right)) left) + (t + (let ((l (if (memq (cdr left) '(nil rseq)) + (cons (rx--bracket (car left)) t) + left)) + (r (if (memq (cdr right) '(nil lseq)) + (cons (rx--bracket (car right)) t) + right))) + (cons (append (car l) (car r)) + (if (eq (cdr l) 'lseq) + (if (eq (cdr r) 'rseq) + nil ; lseq ++ rseq + 'lseq) ; lseq ++ seq + (if (eq (cdr r) 'rseq) + 'rseq ; seq ++ rseq + 'seq))))))) ; seq ++ seq + +(defun rx--translate-seq (body) + "Translate a sequence of zero or more rx items. +Return (REGEXP . PRECEDENCE)." + (if body + (let* ((items (mapcar #'rx--translate body)) + (result (car items))) + (dolist (item (cdr items)) + (setq result (rx--sequence result item))) + result) + (cons nil 'seq))) + +(defun rx--empty () + "Regexp that never matches anything." + (cons (list regexp-unmatchable) 'seq)) + +;; `cl-every' replacement to avoid bootstrapping problems. +(defun rx--every (pred list) + "Whether PRED is true for every element of LIST." + (while (and list (funcall pred (car list))) + (setq list (cdr list))) + (null list)) + +(defun rx--translate-or (body) + "Translate an or-pattern of zero or more rx items. +Return (REGEXP . PRECEDENCE)." + ;; FIXME: Possible improvements: + ;; + ;; - Turn single characters to strings: (or ?a ?b) -> (or "a" "b"), + ;; so that they can be candidates for regexp-opt. + ;; + ;; - Translate compile-time strings (`eval' forms), again for regexp-opt. + ;; + ;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D) + ;; in order to improve effectiveness of regexp-opt. + ;; This would also help composability. + ;; + ;; - Use associativity to run regexp-opt on contiguous subsets of arguments + ;; if not all of them are strings. Example: + ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank)) + ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank)) + ;; + ;; - Fuse patterns into a single character alternative if they fit. + ;; regexp-opt will do that if all are strings, but we want to do that for: + ;; * symbols that expand to classes: space, alpha, ... + ;; * character alternatives: (any ...) + ;; * (syntax S), for some S (whitespace, word) + ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word)) + ;; -> (any "@" "%" digit "A-Z" space word) + ;; -> "[A-Z@%[:digit:][:space:][:word:]]" + ;; + ;; Problem: If a subpattern is carefully written to be + ;; optimisable by regexp-opt, how do we prevent the transforms + ;; above from destroying that property? + ;; Example: (or "a" (or "abc" "abd" "abe")) (cond - ;; for some repetition - ((eq group '*) (if (rx-atomic-p regexp) (setq group nil))) - ;; for concatenation - ((eq group ':) - (if (rx-atomic-p - (if (string-match - "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp) - (substring regexp 0 (match-beginning 0)) - regexp)) - (setq group nil))) - ;; for OR - ((eq group '|) (setq group nil)) - ;; do anyway - ((eq group t)) - ((rx-atomic-p regexp t) (setq group nil))) - (if group - (concat "\\(?:" regexp "\\)") - regexp)) - - -(defvar rx-parent) -;; dynamically bound in some functions. - - -(defun rx-and (form) - "Parse and produce code from FORM. -FORM is of the form `(and FORM1 ...)'." - (rx-check form) - (rx-group-if - (mapconcat (lambda (x) (rx-form x ':)) (cdr form) nil) - (and (memq rx-parent '(* t)) rx-parent))) - - -(defun rx-or (form) - "Parse and produce code from FORM, which is `(or FORM1 ...)'." - (rx-check form) - (rx-group-if - (if (memq nil (mapcar 'stringp (cdr form))) - (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|") - (regexp-opt (cdr form))) - (and (memq rx-parent '(: * t)) rx-parent))) - - -(defun rx-anything (form) - "Match any character." - (if (consp form) - (error "rx `anything' syntax error: %s" form)) - (rx-or (list 'or 'not-newline ?\n))) - - -(defun rx-any-delete-from-range (char ranges) - "Delete by side effect character CHAR from RANGES. -Only both edges of each range is checked." - (let (m) - (cond - ((memq char ranges) (setq ranges (delq char ranges))) - ((setq m (assq char ranges)) - (if (eq (1+ char) (cdr m)) - (setcar (memq m ranges) (1+ char)) - (setcar m (1+ char)))) - ((setq m (rassq char ranges)) - (if (eq (1- char) (car m)) - (setcar (memq m ranges) (1- char)) - (setcdr m (1- char))))) - ranges)) - - -(defun rx-any-condense-range (args) - "Condense by side effect ARGS as range for Rx `any'." - (let (str - l) - ;; set STR list of all strings - ;; set L list of all ranges - (mapc (lambda (e) (cond ((stringp e) (push e str)) - ((numberp e) (push (cons e e) l)) - (t (push e l)))) - args) - ;; condense overlapped ranges in L - (let ((tail (setq l (sort l #'car-less-than-car))) - d) - (while (setq d (cdr tail)) - (if (>= (cdar tail) (1- (caar d))) - (progn - (setcdr (car tail) (max (cdar tail) (cdar d))) - (setcdr tail (cdr d))) - (setq tail d)))) - ;; Separate small ranges to single number, and delete dups. - (nconc - (apply #'nconc - (mapcar (lambda (e) - (cond - ((= (car e) (cdr e)) (list (car e))) - ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e))) - ((list e)))) - l)) - (delete-dups str)))) - - -(defun rx-check-any-string (str) - "Check string argument STR for Rx `any'." - (let ((i 0) - c1 c2 l) - (if (= 0 (length str)) - (error "String arg for Rx `any' must not be empty")) - (while (string-match ".-." str i) - ;; string before range: convert it to characters - (if (< i (match-beginning 0)) - (setq l (nconc - l - (append (substring str i (match-beginning 0)) nil)))) - ;; range - (setq i (match-end 0) - c1 (aref str (match-beginning 0)) - c2 (aref str (1- i))) + ((null body) ; No items: a never-matching regexp. + (rx--empty)) + ((null (cdr body)) ; Single item. + (rx--translate (car body))) + ((rx--every #'stringp body) ; All strings. + (cons (list (regexp-opt body nil t)) + t)) + (t + (cons (append (car (rx--translate (car body))) + (mapcan (lambda (item) + (cons "\\|" (car (rx--translate item)))) + (cdr body))) + nil)))) + +(defun rx--string-to-intervals (str) + "Decode STR as intervals: A-Z becomes (?A . ?Z), and the single +character X becomes (?X . ?X). Return the intervals in a list." + ;; We could just do string-to-multibyte on the string and work with + ;; that instead of this `decode-char' workaround. + (let ((decode-char + (if (multibyte-string-p str) + #'identity + #'unibyte-char-to-multibyte)) + (len (length str)) + (i 0) + (intervals nil)) + (while (< i len) + (cond ((and (< i (- len 2)) + (= (aref str (1+ i)) ?-)) + ;; Range. + (let ((start (funcall decode-char (aref str i))) + (end (funcall decode-char (aref str (+ i 2))))) + (cond ((and (<= start #x7f) (>= end #x3fff80)) + ;; Ranges between ASCII and raw bytes are split to + ;; avoid having them absorb Unicode characters + ;; caught in-between. + (push (cons start #x7f) intervals) + (push (cons #x3fff80 end) intervals)) + ((<= start end) + (push (cons start end) intervals)) + (t + (error "Invalid rx `any' range: %s" + (substring str i 3)))) + (setq i (+ i 3)))) + (t + ;; Single character. + (let ((char (funcall decode-char (aref str i)))) + (push (cons char char) intervals)) + (setq i (+ i 1))))) + intervals)) + +(defun rx--condense-intervals (intervals) + "Merge adjacent and overlapping intervals by mutation, preserving the order. +INTERVALS is a list of (START . END) with START ≤ END, sorted by START." + (let ((tail intervals) + d) + (while (setq d (cdr tail)) + (if (>= (cdar tail) (1- (caar d))) + (progn + (setcdr (car tail) (max (cdar tail) (cdar d))) + (setcdr tail (cdr d))) + (setq tail d))) + intervals)) + +;; FIXME: Consider expanding definitions inside (any ...) and (not ...), +;; and perhaps allow (any ...) inside (any ...). +;; It would be benefit composability (build a character alternative by pieces) +;; and be handy for obtaining the complement of a defined set of +;; characters. (See, for example, python.el:421, `not-simple-operator'.) +;; (Expansion in other non-rx positions is probably not a good idea: +;; syntax, category, backref, and the integer parameters of group-n, +;; =, >=, **, repeat) +;; Similar effect could be attained by ensuring that +;; (or (any X) (any Y)) -> (any X Y), and find a way to compose negative +;; sets. `and' is taken, but we could add +;; (intersection (not (any X)) (not (any Y))) -> (not (any X Y)). + +(defun rx--translate-any (negated body) + "Translate an (any ...) construct. Return (REGEXP . PRECEDENCE). +If NEGATED, negate the sense." + (let ((classes nil) + (strings nil) + (conses nil)) + ;; Collect strings, conses and characters, and classes in separate bins. + (dolist (arg body) + (cond ((stringp arg) + (push arg strings)) + ((and (consp arg) + (characterp (car arg)) + (characterp (cdr arg)) + (<= (car arg) (cdr arg))) + ;; Copy the cons, in case we need to modify it. + (push (cons (car arg) (cdr arg)) conses)) + ((characterp arg) + (push (cons arg arg) conses)) + ((and (symbolp arg) + (let ((class (cdr (assq arg rx--char-classes)))) + (and class (push class classes))))) + (t (error "Invalid rx `any' argument: %s" arg)))) + (let ((items + ;; Translate strings and conses into nonoverlapping intervals, + ;; and add classes as symbols at the end. + (append + (rx--condense-intervals + (sort (append conses + (mapcan #'rx--string-to-intervals strings)) + #'car-less-than-car)) + (reverse classes)))) + + ;; Move lone ] and range ]-x to the start. + (let ((rbrac-l (assq ?\] items))) + (when rbrac-l + (setq items (cons rbrac-l (delq rbrac-l items))))) + + ;; Split x-] and move the lone ] to the start. + (let ((rbrac-r (rassq ?\] items))) + (when (and rbrac-r (not (eq (car rbrac-r) ?\]))) + (setcdr rbrac-r ?\\) + (setq items (cons '(?\] . ?\]) items)))) + + ;; Split ,-- (which would end up as ,- otherwise). + (let ((dash-r (rassq ?- items))) + (when (eq (car dash-r) ?,) + (setcdr dash-r ?,) + (setq items (nconc items '((?- . ?-)))))) + + ;; Remove - (lone or at start of interval) + (let ((dash-l (assq ?- items))) + (when dash-l + (if (eq (cdr dash-l) ?-) + (setq items (delq dash-l items)) ; Remove lone - + (setcar dash-l ?.)) ; Reduce --x to .-x + (setq items (nconc items '((?- . ?-)))))) + + ;; Deal with leading ^ and range ^-x. + (when (and (consp (car items)) + (eq (caar items) ?^) + (cdr items)) + ;; Move ^ and ^-x to second place. + (setq items (cons (cadr items) + (cons (car items) (cddr items))))) + (cond - ((< c1 c2) (setq l (nconc l (list (cons c1 c2))))) - ((= c1 c2) (setq l (nconc l (list c1)))))) - ;; rest? - (if (< i (length str)) - (setq l (nconc l (append (substring str i) nil)))) - l)) - - -(defun rx-check-any (arg) - "Check arg ARG for Rx `any'." - (cond - ((integerp arg) (list arg)) - ((symbolp arg) - (let ((translation (condition-case nil - (rx-form arg) - (error nil)))) - (if (or (null translation) - (null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation))) - (error "Invalid char class `%s' in Rx `any'" arg)) - (list (substring translation 1 -1)))) ; strip outer brackets - ((and (integerp (car-safe arg)) (integerp (cdr-safe arg))) - (list arg)) - ((stringp arg) (rx-check-any-string arg)) - ((error - "rx `any' requires string, character, char pair or char class args")))) - - -(defun rx-any (form) - "Parse and produce code from FORM, which is `(any ARG ...)'. -ARG is optional." - (rx-check form) - (let* ((args (rx-any-condense-range - (apply - #'nconc - (mapcar #'rx-check-any (cdr form))))) - m - s) - (cond - ;; single close bracket - ;; => "[]...-]" or "[]...--.]" - ((memq ?\] args) - ;; set ] at the beginning - (setq args (cons ?\] (delq ?\] args))) - ;; set - at the end - (if (or (memq ?- args) (assq ?- args)) - (setq args (nconc (rx-any-delete-from-range ?- args) - (list ?-))))) - ;; close bracket starts a range - ;; => "[]-....-]" or "[]-.--....]" - ((setq m (assq ?\] args)) - ;; bring it to the beginning - (setq args (cons m (delq m args))) - (cond ((memq ?- args) - ;; to the end - (setq args (nconc (delq ?- args) (list ?-)))) - ((setq m (assq ?- args)) - ;; next to the bracket's range, make the second range - (setcdr args (cons m (delq m (cdr args))))))) - ;; bracket in the end range - ;; => "[]...-]" - ((setq m (rassq ?\] args)) - ;; set ] at the beginning - (setq args (cons ?\] (rx-any-delete-from-range ?\] args))) - ;; set - at the end - (if (or (memq ?- args) (assq ?- args)) - (setq args (nconc (rx-any-delete-from-range ?- args) - (list ?-))))) - ;; {no close bracket appears} - ;; - ;; bring single bar to the beginning - ((memq ?- args) - (setq args (cons ?- (delq ?- args)))) - ;; bar start a range, bring it to the beginning - ((setq m (assq ?- args)) - (setq args (cons m (delq m args)))) - ;; - ;; hat at the beginning? - ((or (eq (car args) ?^) (eq (car-safe (car args)) ?^)) - (setq args (if (cdr args) - `(,(cadr args) ,(car args) ,@(cddr args)) - (nconc (rx-any-delete-from-range ?^ args) - (list ?^)))))) - ;; some 1-char? - (if (and (null (cdr args)) (numberp (car args)) - (or (= 1 (length - (setq s (regexp-quote (string (car args)))))) - (and (equal (car args) ?^) ;; unnecessary predicate? - (null (eq rx-parent '!))))) - s - (concat "[" - (mapconcat - (lambda (e) (cond - ((numberp e) (string e)) - ((consp e) - (if (and (= (1+ (car e)) (cdr e)) - ;; rx-any-condense-range should - ;; prevent this case from happening. - (null (memq (car e) '(?\] ?-))) - (null (memq (cdr e) '(?\] ?-)))) - (string (car e) (cdr e)) - (string (car e) ?- (cdr e)))) - (e))) - args - nil) - "]")))) - - -(defun rx-check-not (arg) - "Check arg ARG for Rx `not'." - (unless (or (and (symbolp arg) - (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" - (condition-case nil - (rx-form arg) - (error "")))) - (eq arg 'word-boundary) - (and (consp arg) - (memq (car arg) '(not any in syntax category)))) - (error "rx `not' syntax error: %s" arg)) - t) - - -(defun rx-not (form) - "Parse and produce code from FORM. FORM is `(not ...)'." - (rx-check form) - (let ((result (rx-form (cadr form) '!)) - case-fold-search) - (cond ((string-match "\\`\\[^" result) - (cond - ((equal result "[^]") "[^^]") - ((and (= (length result) 4) (null (eq rx-parent '!))) - (regexp-quote (substring result 2 3))) - ((concat "[" (substring result 2))))) - ((eq ?\[ (aref result 0)) - (concat "[^" (substring result 1))) - ((string-match "\\`\\\\[scbw]" result) - (concat (upcase (substring result 0 2)) - (substring result 2))) - ((string-match "\\`\\\\[SCBW]" result) - (concat (downcase (substring result 0 2)) - (substring result 2))) - (t - (concat "[^" result "]"))))) - - -(defun rx-not-char (form) - "Parse and produce code from FORM. FORM is `(not-char ...)'." - (rx-check form) - (rx-not `(not (in ,@(cdr form))))) - - -(defun rx-not-syntax (form) - "Parse and produce code from FORM. FORM is `(not-syntax SYNTAX)'." - (rx-check form) - (rx-not `(not (syntax ,@(cdr form))))) - - -(defun rx-trans-forms (form &optional skip) - "If FORM's length is greater than two, transform it to length two. -A form (HEAD REST ...) becomes (HEAD (and REST ...)). -If SKIP is non-nil, allow that number of items after the head, i.e. -`(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1." - (unless skip (setq skip 0)) - (let ((tail (nthcdr (1+ skip) form))) - (if (= (length tail) 1) - form - (let ((form (copy-sequence form))) - (setcdr (nthcdr skip form) (list (cons 'and tail))) - form)))) - - -(defun rx-= (form) - "Parse and produce code from FORM `(= N ...)'." - (rx-check form) - (setq form (rx-trans-forms form 1)) - (unless (and (integerp (nth 1 form)) - (> (nth 1 form) 0)) - (error "rx `=' requires positive integer first arg")) - (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form))) - - -(defun rx->= (form) - "Parse and produce code from FORM `(>= N ...)'." - (rx-check form) - (setq form (rx-trans-forms form 1)) - (unless (and (integerp (nth 1 form)) - (> (nth 1 form) 0)) - (error "rx `>=' requires positive integer first arg")) - (format "%s\\{%d,\\}" (rx-form (nth 2 form) '*) (nth 1 form))) - - -(defun rx-** (form) - "Parse and produce code from FORM `(** N M ...)'." - (rx-check form) - (rx-form (cons 'repeat (cdr (rx-trans-forms form 2))) '*)) - - -(defun rx-repeat (form) - "Parse and produce code from FORM. -FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'." - (rx-check form) - (if (> (length form) 4) - (setq form (rx-trans-forms form 2))) - (if (null (nth 2 form)) - (setq form (cons (nth 0 form) (cons (nth 1 form) (nthcdr 3 form))))) - (cond ((= (length form) 3) - (unless (and (integerp (nth 1 form)) - (> (nth 1 form) 0)) - (error "rx `repeat' requires positive integer first arg")) - (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form))) - ((or (not (integerp (nth 2 form))) - (< (nth 2 form) 0) - (not (integerp (nth 1 form))) - (< (nth 1 form) 0) - (< (nth 2 form) (nth 1 form))) - (error "rx `repeat' range error")) - (t - (format "%s\\{%d,%d\\}" (rx-form (nth 3 form) '*) - (nth 1 form) (nth 2 form))))) - - -(defun rx-submatch (form) - "Parse and produce code from FORM, which is `(submatch ...)'." - (concat "\\(" - (if (= 2 (length form)) - ;; Only one sub-form. - (rx-form (cadr form)) - ;; Several sub-forms implicitly concatenated. - (mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil)) - "\\)")) - -(defun rx-submatch-n (form) - "Parse and produce code from FORM, which is `(submatch-n N ...)'." - (let ((n (nth 1 form))) - (concat "\\(?" (number-to-string n) ":" - (if (= 3 (length form)) - ;; Only one sub-form. - (rx-form (nth 2 form)) - ;; Several sub-forms implicitly concatenated. - (mapconcat (lambda (re) (rx-form re ':)) (cddr form) nil)) - "\\)"))) - -(defun rx-backref (form) - "Parse and produce code from FORM, which is `(backref N)'." - (rx-check form) - (format "\\%d" (nth 1 form))) - -(defun rx-check-backref (arg) - "Check arg ARG for Rx `backref'." - (or (and (integerp arg) (>= arg 1) (<= arg 9)) - (error "rx `backref' requires numeric 1<=arg<=9: %s" arg))) - -(defun rx-kleene (form) - "Parse and produce code from FORM. -FORM is `(OP FORM1)', where OP is one of the `zero-or-one', -`zero-or-more' etc. operators. -If OP is one of `*', `+', `?', produce a greedy regexp. -If OP is one of `*?', `+?', `??', produce a non-greedy regexp. -If OP is anything else, produce a greedy regexp if `rx-greedy-flag' -is non-nil." - (rx-check form) - (setq form (rx-trans-forms form)) - (let ((suffix (cond ((memq (car form) '(* + ?\s)) "") - ((memq (car form) '(*? +? ??)) "?") - (rx-greedy-flag "") - (t "?"))) - (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*") - ((memq (car form) '(+ +? 1+ one-or-more)) "+") - (t "?")))) - (rx-group-if - (concat (rx-form (cadr form) '*) op suffix) - (and (memq rx-parent '(t *)) rx-parent)))) - - -(defun rx-atomic-p (r &optional lax) - "Return non-nil if regexp string R is atomic. -An atomic regexp R is one such that a suffix operator -appended to R will apply to all of R. For example, \"a\" -\"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\", -\"[ab]c\", and \"ab\\|ab*c\" are not atomic. - -This function may return false negatives, but it will not -return false positives. It is nevertheless useful in -situations where an efficiency shortcut can be taken only if a -regexp is atomic. The function can be improved to detect -more cases of atomic regexps. Presently, this function -detects the following categories of atomic regexp; - - a group or shy group: \\(...\\) - a character class: [...] - a single character: a - -On the other hand, false negatives will be returned for -regexps that are atomic but end in operators, such as -\"a+\". I think these are rare. Probably such cases could -be detected without much effort. A guarantee of no false -negatives would require a theoretic specification of the set -of all atomic regexps." - (let ((l (length r))) + ;; Empty set: if negated, any char, otherwise match-nothing. + ((null items) + (if negated + (rx--translate-symbol 'anything) + (rx--empty))) + ;; Single non-negated character. + ((and (null (cdr items)) + (consp (car items)) + (eq (caar items) (cdar items)) + (not negated)) + (cons (list (regexp-quote (char-to-string (caar items)))) + t)) + ;; At least one character or class, possibly negated. + (t + (cons + (list + (concat + "[" + (and negated "^") + (mapconcat (lambda (item) + (cond ((symbolp item) + (format "[:%s:]" item)) + ((eq (car item) (cdr item)) + (char-to-string (car item))) + ((eq (1+ (car item)) (cdr item)) + (string (car item) (cdr item))) + (t + (string (car item) ?- (cdr item))))) + items nil) + "]")) + t)))))) + +(defun rx--translate-not (negated body) + "Translate a (not ...) construct. Return (REGEXP . PRECEDENCE). +If NEGATED, negate the sense (thus making it positive)." + (unless (and body (null (cdr body))) + (error "rx `not' form takes exactly one argument")) + (let ((arg (car body))) (cond - ((<= l 1)) - ((= l 2) (= (aref r 0) ?\\)) - ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r)) - ((null lax) - (cond - ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*\\]\\'" r)) - ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r))))))) - - -(defun rx-syntax (form) - "Parse and produce code from FORM, which is `(syntax SYMBOL)'." - (rx-check form) - (let* ((sym (cadr form)) - (syntax (cdr (assq sym rx-syntax)))) + ((and (consp arg) + (pcase (car arg) + ((or 'any 'in 'char) + (rx--translate-any (not negated) (cdr arg))) + ('syntax + (rx--translate-syntax (not negated) (cdr arg))) + ('category + (rx--translate-category (not negated) (cdr arg))) + ('not + (rx--translate-not (not negated) (cdr arg)))))) + ((let ((class (cdr (assq arg rx--char-classes)))) + (and class + (rx--translate-any (not negated) (list class))))) + ((eq arg 'word-boundary) + (rx--translate-symbol + (if negated 'word-boundary 'not-word-boundary))) + (t (error "Illegal argument to rx `not': %S" arg))))) + +(defun rx--atomic-regexp (item) + "ITEM is (REGEXP . PRECEDENCE); return a regexp of precedence t." + (if (eq (cdr item) t) + (car item) + (rx--bracket (car item)))) + +(defun rx--translate-counted-repetition (min-count max-count body) + (let ((operand (rx--translate-seq body))) + (if (car operand) + (cons (append + (rx--atomic-regexp operand) + (list (concat "\\{" + (number-to-string min-count) + (cond ((null max-count) ",") + ((< min-count max-count) + (concat "," (number-to-string max-count)))) + "\\}"))) + t) + operand))) + +(defun rx--check-repeat-arg (name min-args body) + (unless (>= (length body) min-args) + (error "rx `%s' requires at least %d argument%s" + name min-args (if (= min-args 1) "" "s"))) + ;; There seems to be no reason to disallow zero counts. + (unless (natnump (car body)) + (error "rx `%s' first argument must be nonnegative" name))) + +(defun rx--translate-bounded-repetition (name body) + (let ((min-count (car body)) + (max-count (cadr body)) + (items (cddr body))) + (unless (and (natnump min-count) + (natnump max-count) + (<= min-count max-count)) + (error "rx `%s' range error" name)) + (rx--translate-counted-repetition min-count max-count items))) + +(defun rx--translate-repeat (body) + (rx--check-repeat-arg 'repeat 2 body) + (if (= (length body) 2) + (rx--translate-counted-repetition (car body) (car body) (cdr body)) + (rx--translate-bounded-repetition 'repeat body))) + +(defun rx--translate-** (body) + (rx--check-repeat-arg '** 2 body) + (rx--translate-bounded-repetition '** body)) + +(defun rx--translate->= (body) + (rx--check-repeat-arg '>= 1 body) + (rx--translate-counted-repetition (car body) nil (cdr body))) + +(defun rx--translate-= (body) + (rx--check-repeat-arg '= 1 body) + (rx--translate-counted-repetition (car body) (car body) (cdr body))) + +(defvar rx--greedy t) + +(defun rx--translate-rep (op-string greedy body) + "Translate a repetition; OP-STRING is one of \"*\", \"+\" or \"?\". +GREEDY is a boolean. Return (REGEXP . PRECEDENCE)." + (let ((operand (rx--translate-seq body))) + (if (car operand) + (cons (append (rx--atomic-regexp operand) + (list (concat op-string (unless greedy "?")))) + ;; The result has precedence seq to avoid (? (* "a")) -> "a*?" + 'seq) + operand))) + +(defun rx--control-greedy (greedy body) + "Translate the sequence BODY with greediness GREEDY. +Return (REGEXP . PRECEDENCE)." + (let ((rx--greedy greedy)) + (rx--translate-seq body))) + +(defun rx--translate-group (body) + "Translate the `group' form. Return (REGEXP . PRECEDENCE)." + (cons (rx--enclose "\\(" + (car (rx--translate-seq body)) + "\\)") + t)) + +(defun rx--translate-group-n (body) + "Translate the `group-n' form. Return (REGEXP . PRECEDENCE)." + (unless (and (integerp (car body)) (> (car body) 0)) + (error "rx `group-n' requires a positive number as first argument")) + (cons (rx--enclose (concat "\\(?" (number-to-string (car body)) ":") + (car (rx--translate-seq (cdr body))) + "\\)") + t)) + +(defun rx--translate-backref (body) + "Translate the `backref' form. Return (REGEXP . PRECEDENCE)." + (unless (and (= (length body) 1) (integerp (car body)) (<= 1 (car body) 9)) + (error "rx `backref' requires an argument in the range 1..9")) + (cons (list "\\" (number-to-string (car body))) t)) + +(defconst rx--syntax-codes + '((whitespace . ?-) ; SPC also accepted + (punctuation . ?.) + (word . ?w) ; W also accepted + (symbol . ?_) + (open-parenthesis . ?\() + (close-parenthesis . ?\)) + (expression-prefix . ?\') + (string-quote . ?\") + (paired-delimiter . ?$) + (escape . ?\\) + (character-quote . ?/) + (comment-start . ?<) + (comment-end . ?>) + (string-delimiter . ?|) + (comment-delimiter . ?!))) + +(defun rx--translate-syntax (negated body) + "Translate the `syntax' form. Return (REGEXP . PRECEDENCE)." + (unless (and body (null (cdr body))) + (error "rx `syntax' form takes exactly one argument")) + (let* ((sym (car body)) + (syntax (cdr (assq sym rx--syntax-codes)))) (unless syntax - ;; Try sregex compatibility. (cond - ((characterp sym) (setq syntax sym)) + ;; Syntax character directly (sregex compatibility) + ((and (characterp sym) (rassq sym rx--syntax-codes)) + (setq syntax sym)) + ;; Syntax character as symbol (sregex compatibility) ((symbolp sym) (let ((name (symbol-name sym))) - (if (= 1 (length name)) - (setq syntax (aref name 0)))))) + (when (= (length name) 1) + (let ((char (string-to-char name))) + (when (rassq char rx--syntax-codes) + (setq syntax char))))))) (unless syntax - (error "Unknown rx syntax `%s'" sym))) - (format "\\s%c" syntax))) - - -(defun rx-check-category (form) - "Check the argument FORM of a `(category FORM)'." - (unless (or (integerp form) - (cdr (assq form rx-categories))) - (error "Unknown category `%s'" form)) - t) - - -(defun rx-category (form) - "Parse and produce code from FORM, which is `(category SYMBOL)'." - (rx-check form) - (let ((char (if (integerp (cadr form)) - (cadr form) - (cdr (assq (cadr form) rx-categories))))) - (format "\\c%c" char))) - - -(defun rx-eval (form) - "Parse and produce code from FORM, which is `(eval FORM)'." - (rx-check form) - (rx-form (eval (cadr form)) rx-parent)) - - -(defun rx-greedy (form) - "Parse and produce code from FORM. -If FORM is `(minimal-match FORM1)', non-greedy versions of `*', -`+', and `?' operators will be used in FORM1. If FORM is -`(maximal-match FORM1)', greedy operators will be used." - (rx-check form) - (let ((rx-greedy-flag (eq (car form) 'maximal-match))) - (rx-form (cadr form) rx-parent))) - - -(defun rx-regexp (form) - "Parse and produce code from FORM, which is `(regexp STRING)'." - (rx-check form) - (rx-group-if (cadr form) rx-parent)) - + (error "Unknown rx syntax name `%s'" sym))) + (cons (list (string ?\\ (if negated ?S ?s) syntax)) + t))) + +(defconst rx--categories + '((space-for-indent . ?\s) + (base . ?.) + (consonant . ?0) + (base-vowel . ?1) + (upper-diacritical-mark . ?2) + (lower-diacritical-mark . ?3) + (tone-mark . ?4) + (symbol . ?5) + (digit . ?6) + (vowel-modifying-diacritical-mark . ?7) + (vowel-sign . ?8) + (semivowel-lower . ?9) + (not-at-end-of-line . ?<) + (not-at-beginning-of-line . ?>) + (alpha-numeric-two-byte . ?A) + (chinese-two-byte . ?C) + (chinse-two-byte . ?C) ; A typo in Emacs 21.1-24.3. + (greek-two-byte . ?G) + (japanese-hiragana-two-byte . ?H) + (indian-two-byte . ?I) + (japanese-katakana-two-byte . ?K) + (strong-left-to-right . ?L) + (korean-hangul-two-byte . ?N) + (strong-right-to-left . ?R) + (cyrillic-two-byte . ?Y) + (combining-diacritic . ?^) + (ascii . ?a) + (arabic . ?b) + (chinese . ?c) + (ethiopic . ?e) + (greek . ?g) + (korean . ?h) + (indian . ?i) + (japanese . ?j) + (japanese-katakana . ?k) + (latin . ?l) + (lao . ?o) + (tibetan . ?q) + (japanese-roman . ?r) + (thai . ?t) + (vietnamese . ?v) + (hebrew . ?w) + (cyrillic . ?y) + (can-break . ?|))) + +(defun rx--translate-category (negated body) + "Translate the `category' form. Return (REGEXP . PRECEDENCE)." + (unless (and body (null (cdr body))) + (error "rx `category' form takes exactly one argument")) + (let* ((arg (car body)) + (category + (cond ((symbolp arg) + (let ((cat (assq arg rx--categories))) + (unless cat + (error "Unknown rx category `%s'" arg)) + (cdr cat))) + ((characterp arg) arg) + (t (error "Invalid rx `category' argument `%s'" arg))))) + (cons (list (string ?\\ (if negated ?C ?c) category)) + t))) + +(defvar rx--delayed-evaluation nil + "Whether to allow certain forms to be evaluated at runtime.") + +(defun rx--translate-literal (body) + "Translate the `literal' form. Return (REGEXP . PRECEDENCE)." + (unless (and body (null (cdr body))) + (error "rx `literal' form takes exactly one argument")) + (let ((arg (car body))) + (cond ((stringp arg) + (cons (list (regexp-quote arg)) (if (= (length arg) 1) t 'seq))) + (rx--delayed-evaluation + (cons (list (list 'regexp-quote arg)) 'seq)) + (t (error "rx `literal' form with non-string argument"))))) + +(defun rx--translate-eval (body) + "Translate the `eval' form. Return (REGEXP . PRECEDENCE)." + (unless (and body (null (cdr body))) + (error "rx `eval' form takes exactly one argument")) + (rx--translate (eval (car body)))) + +(defvar rx--regexp-atomic-regexp nil) + +(defun rx--translate-regexp (body) + "Translate the `regexp' form. Return (REGEXP . PRECEDENCE)." + (unless (and body (null (cdr body))) + (error "rx `regexp' form takes exactly one argument")) + (let ((arg (car body))) + (cond ((stringp arg) + ;; Generate the regexp when needed, since rx isn't + ;; necessarily present in the byte-compilation environment. + (unless rx--regexp-atomic-regexp + (setq rx--regexp-atomic-regexp + ;; Match atomic (precedence t) regexps: may give + ;; false negatives but no false positives, assuming + ;; the target string is syntactically correct. + (rx-to-string + '(seq + bos + (or (seq "[" + (opt "^") + (opt "]") + (* (or (seq "[:" (+ (any "a-z")) ":]") + (not (any "]")))) + "]") + anything + (seq "\\" + (or anything + (seq (any "sScC_") anything) + (seq "(" + (* (or (not (any "\\")) + (seq "\\" (not (any ")"))))) + "\\)")))) + eos) + t))) + (cons (list arg) + (if (string-match-p rx--regexp-atomic-regexp arg) t nil))) + (rx--delayed-evaluation + (cons (list arg) nil)) + (t (error "rx `regexp' form with non-string argument"))))) + +(defun rx--translate-compat-form (def form) + "Translate a compatibility form from `rx-constituents'. +DEF is the definition tuple. Return (REGEXP . PRECEDENCE)." + (let* ((fn (nth 0 def)) + (min-args (nth 1 def)) + (max-args (nth 2 def)) + (predicate (nth 3 def)) + (nargs (1- (length form)))) + (when (< nargs min-args) + (error "The `%s' form takes at least %d argument(s)" + (car form) min-args)) + (when (and max-args (> nargs max-args)) + (error "The `%s' form takes at most %d argument(s)" + (car form) max-args)) + (when (and predicate (not (rx--every predicate (cdr form)))) + (error "The `%s' form requires arguments satisfying `%s'" + (car form) predicate)) + (let ((regexp (funcall fn form))) + (unless (stringp regexp) + (error "The `%s' form did not expand to a string" (car form))) + (cons (list regexp) nil)))) + +(defun rx--substitute (bindings form) + "Substitute BINDINGS in FORM. BINDINGS is an alist of (NAME . VALUES) +where VALUES is a list to splice into FORM wherever NAME occurs. +Return the substitution result wrapped in a list, since a single value +can expand to any number of values." + (cond ((symbolp form) + (let ((binding (assq form bindings))) + (if binding + (cdr binding) + (list form)))) + ((consp form) + (if (listp (cdr form)) + ;; Proper list. We substitute variables even in the head + ;; position -- who knows, might be handy one day. + (list (mapcan (lambda (x) (copy-sequence + (rx--substitute bindings x))) + form)) + ;; Cons pair (presumably an interval). + (let ((first (rx--substitute bindings (car form))) + (second (rx--substitute bindings (cdr form)))) + (if (and first (not (cdr first)) + second (not (cdr second))) + (list (cons (car first) (car second))) + (error + "Cannot substitute a &rest parameter into a dotted pair"))))) + (t (list form)))) + +;; FIXME: Consider adding extensions in Lisp macro style, where +;; arguments are passed unevaluated to code that returns the rx form +;; to use. Example: +;; +;; (rx-let ((radix-digit (radix) +;; :lisp (list 'any (cons ?0 (+ ?0 (eval radix) -1))))) +;; (rx (radix-digit (+ 5 3)))) +;; => +;; "[0-7]" +;; +;; While this would permit more powerful extensions, it's unclear just +;; how often they would be used in practice. Let's wait until there is +;; demand for it. -(defun rx-form (form &optional rx-parent) - "Parse and produce code for regular expression FORM. -FORM is a regular expression in sexp form. -RX-PARENT shows which type of expression calls and controls putting of -shy groups around the result and some more in other functions." +;; FIXME: An alternative binding syntax would be +;; +;; (NAME RXs...) +;; and +;; ((NAME ARGS...) RXs...) +;; +;; which would have two minor advantages: multiple RXs with implicit +;; `seq' in the definition, and the arglist is no longer an optional +;; element in the middle of the list. On the other hand, it's less +;; like traditional lisp arglist constructs (defun, defmacro). +;; Since it's a Scheme-like syntax, &rest parameters could be done using +;; dotted lists: +;; (rx-let (((name arg1 arg2 . rest) ...definition...)) ...) + +(defun rx--expand-template (op values arglist template) + "Return TEMPLATE with variables in ARGLIST replaced with VALUES." + (let ((bindings nil) + (value-tail values) + (formals arglist)) + (while formals + (pcase (car formals) + ('&rest + (unless (cdr formals) + (error + "Expanding rx def `%s': missing &rest parameter name" op)) + (push (cons (cadr formals) value-tail) bindings) + (setq formals nil) + (setq value-tail nil)) + (name + (unless value-tail + (error + "Expanding rx def `%s': too few arguments (got %d, need %s%d)" + op (length values) + (if (memq '&rest arglist) "at least " "") + (- (length arglist) (length (memq '&rest arglist))))) + (push (cons name (list (car value-tail))) bindings) + (setq value-tail (cdr value-tail)))) + (setq formals (cdr formals))) + (when value-tail + (error + "Expanding rx def `%s': too many arguments (got %d, need %d)" + op (length values) (length arglist))) + (let ((subst (rx--substitute bindings template))) + (if (and subst (not (cdr subst))) + (car subst) + (error "Expanding rx def `%s': must result in a single value" op))))) + +(defun rx--translate-form (form) + "Translate an rx form (list structure). Return (REGEXP . PRECEDENCE)." + (let ((body (cdr form))) + (pcase (car form) + ((or 'seq : 'and 'sequence) (rx--translate-seq body)) + ((or 'or '|) (rx--translate-or body)) + ((or 'any 'in 'char) (rx--translate-any nil body)) + ('not-char (rx--translate-any t body)) + ('not (rx--translate-not nil body)) + + ('repeat (rx--translate-repeat body)) + ('= (rx--translate-= body)) + ('>= (rx--translate->= body)) + ('** (rx--translate-** body)) + + ((or 'zero-or-more '0+) (rx--translate-rep "*" rx--greedy body)) + ((or 'one-or-more '1+) (rx--translate-rep "+" rx--greedy body)) + ((or 'zero-or-one 'opt 'optional) (rx--translate-rep "?" rx--greedy body)) + + ('* (rx--translate-rep "*" t body)) + ('+ (rx--translate-rep "+" t body)) + ((or '\? ?\s) (rx--translate-rep "?" t body)) + + ('*? (rx--translate-rep "*" nil body)) + ('+? (rx--translate-rep "+" nil body)) + ((or '\?? ??) (rx--translate-rep "?" nil body)) + + ('minimal-match (rx--control-greedy nil body)) + ('maximal-match (rx--control-greedy t body)) + + ((or 'group 'submatch) (rx--translate-group body)) + ((or 'group-n 'submatch-n) (rx--translate-group-n body)) + ('backref (rx--translate-backref body)) + + ('syntax (rx--translate-syntax nil body)) + ('not-syntax (rx--translate-syntax t body)) + ('category (rx--translate-category nil body)) + + ('literal (rx--translate-literal body)) + ('eval (rx--translate-eval body)) + ((or 'regexp 'regex) (rx--translate-regexp body)) + + (op + (cond + ((not (symbolp op)) (error "Bad rx operator `%S'" op)) + + ((let ((expanded (rx--expand-def form))) + (and expanded + (rx--translate expanded)))) + + ;; For compatibility with old rx. + ((let ((entry (assq op rx-constituents))) + (and (progn + (while (and entry (not (consp (cdr entry)))) + (setq entry + (if (symbolp (cdr entry)) + ;; Alias for another entry. + (assq (cdr entry) rx-constituents) + ;; Wrong type, try further down the list. + (assq (car entry) + (cdr (memq entry rx-constituents)))))) + entry) + (rx--translate-compat-form (cdr entry) form)))) + + (t (error "Unknown rx form `%s'" op))))))) + +(defconst rx--builtin-forms + '(seq sequence : and or | any in char not-char not + repeat = >= ** + zero-or-more 0+ * + one-or-more 1+ + + zero-or-one opt optional \? + *? +? \?? + minimal-match maximal-match + group submatch group-n submatch-n backref + syntax not-syntax category + literal eval regexp regex) + "List of built-in rx function-like symbols.") + +(defconst rx--builtin-symbols + (append '(nonl not-newline any anychar anything unmatchable + bol eol line-start line-end + bos eos string-start string-end + bow eow word-start word-end + symbol-start symbol-end + point word-boundary not-word-boundary not-wordchar) + (mapcar #'car rx--char-classes)) + "List of built-in rx variable-like symbols.") + +(defconst rx--builtin-names + (append rx--builtin-forms rx--builtin-symbols) + "List of built-in rx names. These cannot be redefined by the user.") + +(defun rx--translate (item) + "Translate the rx-expression ITEM. Return (REGEXP . PRECEDENCE)." (cond - ((stringp form) - (rx-group-if (regexp-quote form) - (if (and (eq rx-parent '*) (< 1 (length form))) - rx-parent))) - ((integerp form) - (regexp-quote (char-to-string form))) - ((symbolp form) - (let ((info (rx-info form nil))) - (cond ((stringp info) - info) - ((null info) - (error "Unknown rx form `%s'" form)) - (t - (funcall (nth 0 info) form))))) - ((consp form) - (let ((info (rx-info (car form) 'head))) - (unless (consp info) - (error "Unknown rx form `%s'" (car form))) - (funcall (nth 0 info) form))) - (t - (error "rx syntax error at `%s'" form)))) + ((stringp item) + (if (= (length item) 0) + (cons nil 'seq) + (cons (list (regexp-quote item)) (if (= (length item) 1) t 'seq)))) + ((characterp item) + (cons (list (regexp-quote (char-to-string item))) t)) + ((symbolp item) + (rx--translate-symbol item)) + ((consp item) + (rx--translate-form item)) + (t (error "Bad rx expression: %S" item)))) ;;;###autoload (defun rx-to-string (form &optional no-group) - "Parse and produce code for regular expression FORM. -FORM is a regular expression in sexp form. -NO-GROUP non-nil means don't put shy groups around the result." - (rx-group-if (rx-form form) (null no-group))) + "Translate FORM from `rx' sexp syntax into a string regexp. +The arguments to `literal' and `regexp' forms inside FORM must be +constant strings. +If NO-GROUP is non-nil, don't bracket the result in a non-capturing +group. + +For extending the `rx' notation in FORM, use `rx-define' or `rx-let-eval'." + (let* ((item (rx--translate form)) + (exprs (if no-group + (car item) + (rx--atomic-regexp item)))) + (apply #'concat exprs))) + +(defun rx--to-expr (form) + "Translate the rx-expression FORM to a Lisp expression yielding a regexp." + (let* ((rx--delayed-evaluation t) + (elems (car (rx--translate form))) + (args nil)) + ;; Merge adjacent strings. + (while elems + (let ((strings nil)) + (while (and elems (stringp (car elems))) + (push (car elems) strings) + (setq elems (cdr elems))) + (let ((s (apply #'concat (nreverse strings)))) + (unless (zerop (length s)) + (push s args)))) + (when elems + (push (car elems) args) + (setq elems (cdr elems)))) + (cond ((null args) "") ; 0 args + ((cdr args) (cons 'concat (nreverse args))) ; ≥2 args + (t (car args))))) ; 1 arg ;;;###autoload (defmacro rx (&rest regexps) "Translate regular expressions REGEXPS in sexp form to a regexp string. -REGEXPS is a non-empty sequence of forms of the sort listed below. - -Note that `rx' is a Lisp macro; when used in a Lisp program being -compiled, the translation is performed by the compiler. -See `rx-to-string' for how to do such a translation at run-time. - -The following are valid subforms of regular expressions in sexp -notation. - -STRING - matches string STRING literally. - -CHAR - matches character CHAR literally. +Each argument is one of the forms below; RX is a subform, and RX... stands +for zero or more RXs. For details, see Info node `(elisp) Rx Notation'. +See `rx-to-string' for the corresponding function. + +STRING Match a literal string. +CHAR Match a literal character. + +(seq RX...) Match the RXs in sequence. Alias: :, sequence, and. +(or RX...) Match one of the RXs. Alias: |. + +(zero-or-more RX...) Match RXs zero or more times. Alias: 0+. +(one-or-more RX...) Match RXs one or more times. Alias: 1+. +(zero-or-one RX...) Match RXs or the empty string. Alias: opt, optional. +(* RX...) Match RXs zero or more times; greedy. +(+ RX...) Match RXs one or more times; greedy. +(? RX...) Match RXs or the empty string; greedy. +(*? RX...) Match RXs zero or more times; non-greedy. +(+? RX...) Match RXs one or more times; non-greedy. +(?? RX...) Match RXs or the empty string; non-greedy. +(= N RX...) Match RXs exactly N times. +(>= N RX...) Match RXs N or more times. +(** N M RX...) Match RXs N to M times. Alias: repeat. +(minimal-match RX) Match RX, with zero-or-more, one-or-more, zero-or-one + and aliases using non-greedy matching. +(maximal-match RX) Match RX, with zero-or-more, one-or-more, zero-or-one + and aliases using greedy matching, which is the default. + +(any SET...) Match a character from one of the SETs. Each SET is a + character, a string, a range as string \"A-Z\" or cons + (?A . ?Z), or a character class (see below). Alias: in, char. +(not CHARSPEC) Match one character not matched by CHARSPEC. CHARSPEC + can be (any ...), (syntax ...), (category ...), + or a character class. +not-newline Match any character except a newline. Alias: nonl. +anychar Match any character. Alias: anything. +unmatchable Never match anything at all. + +CHARCLASS Match a character from a character class. One of: + alpha, alphabetic, letter Alphabetic characters (defined by Unicode). + alnum, alphanumeric Alphabetic or decimal digit chars (Unicode). + digit numeric, num 0-9. + xdigit, hex-digit, hex 0-9, A-F, a-f. + cntrl, control ASCII codes 0-31. + blank Horizontal whitespace (Unicode). + space, whitespace, white Chars with whitespace syntax. + lower, lower-case Lower-case chars, from current case table. + upper, upper-case Upper-case chars, from current case table. + graph, graphic Graphic characters (Unicode). + print, printing Whitespace or graphic (Unicode). + punct, punctuation Not control, space, letter or digit (ASCII); + not word syntax (non-ASCII). + word, wordchar Characters with word syntax. + ascii ASCII characters (codes 0-127). + nonascii Non-ASCII characters (but not raw bytes). + +(syntax SYNTAX) Match a character with syntax SYNTAX, being one of: + whitespace, punctuation, word, symbol, open-parenthesis, + close-parenthesis, expression-prefix, string-quote, + paired-delimiter, escape, character-quote, comment-start, + comment-end, string-delimiter, comment-delimiter + +(category CAT) Match a character in category CAT, being one of: + space-for-indent, base, consonant, base-vowel, + upper-diacritical-mark, lower-diacritical-mark, tone-mark, symbol, + digit, vowel-modifying-diacritical-mark, vowel-sign, + semivowel-lower, not-at-end-of-line, not-at-beginning-of-line, + alpha-numeric-two-byte, chinese-two-byte, greek-two-byte, + japanese-hiragana-two-byte, indian-two-byte, + japanese-katakana-two-byte, strong-left-to-right, + korean-hangul-two-byte, strong-right-to-left, cyrillic-two-byte, + combining-diacritic, ascii, arabic, chinese, ethiopic, greek, + korean, indian, japanese, japanese-katakana, latin, lao, + tibetan, japanese-roman, thai, vietnamese, hebrew, cyrillic, + can-break + +Zero-width assertions: these all match the empty string in specific places. + line-start At the beginning of a line. Alias: bol. + line-end At the end of a line. Alias: eol. + string-start At the start of the string or buffer. + Alias: buffer-start, bos, bot. + string-end At the end of the string or buffer. + Alias: buffer-end, eos, eot. + point At point. + word-start At the beginning of a word. Alias: bow. + word-end At the end of a word. Alias: eow. + word-boundary At the beginning or end of a word. + not-word-boundary Not at the beginning or end of a word. + symbol-start At the beginning of a symbol. + symbol-end At the end of a symbol. + +(group RX...) Match RXs and define a capture group. Alias: submatch. +(group-n N RX...) Match RXs and define capture group N. Alias: submatch-n. +(backref N) Match the text that capture group N matched. + +(literal EXPR) Match the literal string from evaluating EXPR at run time. +(regexp EXPR) Match the string regexp from evaluating EXPR at run time. +(eval EXPR) Match the rx sexp from evaluating EXPR at compile time. + +Additional constructs can be defined using `rx-define' and `rx-let', +which see. + +\(fn REGEXPS...)" + ;; Retrieve local definitions from the macroexpansion environment. + ;; (It's unclear whether the previous value of `rx--local-definitions' + ;; should be included, and if so, in which order.) + (let ((rx--local-definitions + (cdr (assq :rx-locals macroexpand-all-environment)))) + (rx--to-expr (cons 'seq regexps)))) + +(defun rx--make-binding (name tail) + "Make a definitions entry out of TAIL. +TAIL is on the form ([ARGLIST] DEFINITION)." + (unless (symbolp name) + (error "Bad `rx' definition name: %S" name)) + ;; FIXME: Consider using a hash table or symbol property, for speed. + (when (memq name rx--builtin-names) + (error "Cannot redefine built-in rx name `%s'" name)) + (pcase tail + (`(,def) + (list def)) + (`(,args ,def) + (unless (and (listp args) (rx--every #'symbolp args)) + (error "Bad argument list for `rx' definition %s: %S" name args)) + (list args def)) + (_ (error "Bad `rx' definition of %s: %S" name tail)))) + +(defun rx--make-named-binding (bindspec) + "Make a definitions entry out of BINDSPEC. +BINDSPEC is on the form (NAME [ARGLIST] DEFINITION)." + (unless (consp bindspec) + (error "Bad `rx-let' binding: %S" bindspec)) + (cons (car bindspec) + (rx--make-binding (car bindspec) (cdr bindspec)))) + +(defun rx--extend-local-defs (bindspecs) + (append (mapcar #'rx--make-named-binding bindspecs) + rx--local-definitions)) -`not-newline', `nonl' - matches any character except a newline. - -`anything' - matches any character - -`(any SET ...)' -`(in SET ...)' -`(char SET ...)' - matches any character in SET .... SET may be a character or string. - Ranges of characters can be specified as `A-Z' in strings. - Ranges may also be specified as conses like `(?A . ?Z)'. - - SET may also be the name of a character class: `digit', - `control', `hex-digit', `blank', `graph', `print', `alnum', - `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper', - `word', or one of their synonyms. - -`(not (any SET ...))' - matches any character not in SET ... - -`line-start', `bol' - matches the empty string, but only at the beginning of a line - in the text being matched - -`line-end', `eol' - is similar to `line-start' but matches only at the end of a line - -`string-start', `bos', `bot' - matches the empty string, but only at the beginning of the - string being matched against. - -`string-end', `eos', `eot' - matches the empty string, but only at the end of the - string being matched against. - -`buffer-start' - matches the empty string, but only at the beginning of the - buffer being matched against. Actually equivalent to `string-start'. - -`buffer-end' - matches the empty string, but only at the end of the - buffer being matched against. Actually equivalent to `string-end'. - -`point' - matches the empty string, but only at point. +;;;###autoload +(defmacro rx-let-eval (bindings &rest body) + "Evaluate BODY with local BINDINGS for `rx-to-string'. +BINDINGS, after evaluation, is a list of definitions each on the form +(NAME [(ARGS...)] RX), in effect for calls to `rx-to-string' +in BODY. + +For bindings without an ARGS list, NAME is defined as an alias +for the `rx' expression RX. Where ARGS is supplied, NAME is +defined as an `rx' form with ARGS as argument list. The +parameters are bound from the values in the (NAME ...) form and +are substituted in RX. ARGS can contain `&rest' parameters, +whose values are spliced into RX where the parameter name occurs. + +Any previous definitions with the same names are shadowed during +the expansion of BODY only. +For extensions when using the `rx' macro, use `rx-let'. +To make global rx extensions, use `rx-define'. +For more details, see Info node `(elisp) Extending Rx'. + +\(fn BINDINGS BODY...)" + (declare (indent 1) (debug (form body))) + ;; FIXME: this way, `rx--extend-local-defs' may need to be autoloaded. + `(let ((rx--local-definitions (rx--extend-local-defs ,bindings))) + ,@body)) -`word-start', `bow' - matches the empty string, but only at the beginning of a word. - -`word-end', `eow' - matches the empty string, but only at the end of a word. - -`word-boundary' - matches the empty string, but only at the beginning or end of a - word. - -`(not word-boundary)' -`not-word-boundary' - matches the empty string, but not at the beginning or end of a - word. - -`symbol-start' - matches the empty string, but only at the beginning of a symbol. - -`symbol-end' - matches the empty string, but only at the end of a symbol. - -`digit', `numeric', `num' - matches 0 through 9. - -`control', `cntrl' - matches ASCII control characters. - -`hex-digit', `hex', `xdigit' - matches 0 through 9, a through f and A through F. - -`blank' - matches horizontal whitespace, as defined by Annex C of the - Unicode Technical Standard #18. In particular, it matches - spaces, tabs, and other characters whose Unicode - `general-category' property indicates they are spacing - separators. - -`graphic', `graph' - matches graphic characters--everything except whitespace, ASCII - and non-ASCII control characters, surrogates, and codepoints - unassigned by Unicode. - -`printing', `print' - matches whitespace and graphic characters. - -`alphanumeric', `alnum' - matches alphabetic characters and digits. For multibyte characters, - it matches characters whose Unicode `general-category' property - indicates they are alphabetic or decimal number characters. - -`letter', `alphabetic', `alpha' - matches alphabetic characters. For multibyte characters, - it matches characters whose Unicode `general-category' property - indicates they are alphabetic characters. - -`ascii' - matches ASCII (unibyte) characters. - -`nonascii' - matches non-ASCII (multibyte) characters. - -`lower', `lower-case' - matches anything lower-case, as determined by the current case - table. If `case-fold-search' is non-nil, this also matches any - upper-case letter. - -`upper', `upper-case' - matches anything upper-case, as determined by the current case - table. If `case-fold-search' is non-nil, this also matches any - lower-case letter. - -`punctuation', `punct' - matches punctuation. (But at present, for multibyte characters, - it matches anything that has non-word syntax.) - -`space', `whitespace', `white' - matches anything that has whitespace syntax. - -`word', `wordchar' - matches anything that has word syntax. - -`not-wordchar' - matches anything that has non-word syntax. - -`(syntax SYNTAX)' - matches a character with syntax SYNTAX. SYNTAX must be one - of the following symbols, or a symbol corresponding to the syntax - character, e.g. `\\.' for `\\s.'. - - `whitespace' (\\s- in string notation) - `punctuation' (\\s.) - `word' (\\sw) - `symbol' (\\s_) - `open-parenthesis' (\\s() - `close-parenthesis' (\\s)) - `expression-prefix' (\\s') - `string-quote' (\\s\") - `paired-delimiter' (\\s$) - `escape' (\\s\\) - `character-quote' (\\s/) - `comment-start' (\\s<) - `comment-end' (\\s>) - `string-delimiter' (\\s|) - `comment-delimiter' (\\s!) - -`(not (syntax SYNTAX))' - matches a character that doesn't have syntax SYNTAX. - -`(category CATEGORY)' - matches a character with category CATEGORY. CATEGORY must be - either a character to use for C, or one of the following symbols. - - `consonant' (\\c0 in string notation) - `base-vowel' (\\c1) - `upper-diacritical-mark' (\\c2) - `lower-diacritical-mark' (\\c3) - `tone-mark' (\\c4) - `symbol' (\\c5) - `digit' (\\c6) - `vowel-modifying-diacritical-mark' (\\c7) - `vowel-sign' (\\c8) - `semivowel-lower' (\\c9) - `not-at-end-of-line' (\\c<) - `not-at-beginning-of-line' (\\c>) - `alpha-numeric-two-byte' (\\cA) - `chinese-two-byte' (\\cC) - `greek-two-byte' (\\cG) - `japanese-hiragana-two-byte' (\\cH) - `indian-two-byte' (\\cI) - `japanese-katakana-two-byte' (\\cK) - `korean-hangul-two-byte' (\\cN) - `cyrillic-two-byte' (\\cY) - `combining-diacritic' (\\c^) - `ascii' (\\ca) - `arabic' (\\cb) - `chinese' (\\cc) - `ethiopic' (\\ce) - `greek' (\\cg) - `korean' (\\ch) - `indian' (\\ci) - `japanese' (\\cj) - `japanese-katakana' (\\ck) - `latin' (\\cl) - `lao' (\\co) - `tibetan' (\\cq) - `japanese-roman' (\\cr) - `thai' (\\ct) - `vietnamese' (\\cv) - `hebrew' (\\cw) - `cyrillic' (\\cy) - `can-break' (\\c|) - -`(not (category CATEGORY))' - matches a character that doesn't have category CATEGORY. - -`(and SEXP1 SEXP2 ...)' -`(: SEXP1 SEXP2 ...)' -`(seq SEXP1 SEXP2 ...)' -`(sequence SEXP1 SEXP2 ...)' - matches what SEXP1 matches, followed by what SEXP2 matches, etc. - -`(submatch SEXP1 SEXP2 ...)' -`(group SEXP1 SEXP2 ...)' - like `and', but makes the match accessible with `match-end', - `match-beginning', and `match-string'. - -`(submatch-n N SEXP1 SEXP2 ...)' -`(group-n N SEXP1 SEXP2 ...)' - like `group', but make it an explicitly-numbered group with - group number N. - -`(or SEXP1 SEXP2 ...)' -`(| SEXP1 SEXP2 ...)' - matches anything that matches SEXP1 or SEXP2, etc. If all - args are strings, use `regexp-opt' to optimize the resulting - regular expression. - -`(minimal-match SEXP)' - produce a non-greedy regexp for SEXP. Normally, regexps matching - zero or more occurrences of something are \"greedy\" in that they - match as much as they can, as long as the overall regexp can - still match. A non-greedy regexp matches as little as possible. - -`(maximal-match SEXP)' - produce a greedy regexp for SEXP. This is the default. - -Below, `SEXP ...' represents a sequence of regexp forms, treated as if -enclosed in `(and ...)'. - -`(zero-or-more SEXP ...)' -`(0+ SEXP ...)' - matches zero or more occurrences of what SEXP ... matches. - -`(* SEXP ...)' - like `zero-or-more', but always produces a greedy regexp, independent - of `rx-greedy-flag'. - -`(*? SEXP ...)' - like `zero-or-more', but always produces a non-greedy regexp, - independent of `rx-greedy-flag'. - -`(one-or-more SEXP ...)' -`(1+ SEXP ...)' - matches one or more occurrences of SEXP ... - -`(+ SEXP ...)' - like `one-or-more', but always produces a greedy regexp. - -`(+? SEXP ...)' - like `one-or-more', but always produces a non-greedy regexp. - -`(zero-or-one SEXP ...)' -`(optional SEXP ...)' -`(opt SEXP ...)' - matches zero or one occurrences of A. - -`(? SEXP ...)' - like `zero-or-one', but always produces a greedy regexp. - -`(?? SEXP ...)' - like `zero-or-one', but always produces a non-greedy regexp. - -`(repeat N SEXP)' -`(= N SEXP ...)' - matches N occurrences. - -`(>= N SEXP ...)' - matches N or more occurrences. - -`(repeat N M SEXP)' -`(** N M SEXP ...)' - matches N to M occurrences. - -`(backref N)' - matches what was matched previously by submatch N. - -`(eval FORM)' - evaluate FORM and insert result. If result is a string, - `regexp-quote' it. - -`(regexp REGEXP)' - include REGEXP in string notation in the result." - (cond ((null regexps) - (error "No regexp")) - ((cdr regexps) - (rx-to-string `(and ,@regexps) t)) - (t - (rx-to-string (car regexps) t)))) +;;;###autoload +(defmacro rx-let (bindings &rest body) + "Evaluate BODY with local BINDINGS for `rx'. +BINDINGS is an unevaluated list of bindings each on the form +(NAME [(ARGS...)] RX). +They are bound lexically and are available in `rx' expressions in +BODY only. + +For bindings without an ARGS list, NAME is defined as an alias +for the `rx' expression RX. Where ARGS is supplied, NAME is +defined as an `rx' form with ARGS as argument list. The +parameters are bound from the values in the (NAME ...) form and +are substituted in RX. ARGS can contain `&rest' parameters, +whose values are spliced into RX where the parameter name occurs. + +Any previous definitions with the same names are shadowed during +the expansion of BODY only. +For local extensions to `rx-to-string', use `rx-let-eval'. +To make global rx extensions, use `rx-define'. +For more details, see Info node `(elisp) Extending Rx'. + +\(fn BINDINGS BODY...)" + (declare (indent 1) (debug (sexp body))) + (let ((prev-locals (cdr (assq :rx-locals macroexpand-all-environment))) + (new-locals (mapcar #'rx--make-named-binding bindings))) + (macroexpand-all (cons 'progn body) + (cons (cons :rx-locals (append new-locals prev-locals)) + macroexpand-all-environment)))) +;;;###autoload +(defmacro rx-define (name &rest definition) + "Define NAME as a global `rx' definition. +If the ARGS list is omitted, define NAME as an alias for the `rx' +expression RX. + +If the ARGS list is supplied, define NAME as an `rx' form with +ARGS as argument list. The parameters are bound from the values +in the (NAME ...) form and are substituted in RX. +ARGS can contain `&rest' parameters, whose values are spliced +into RX where the parameter name occurs. + +Any previous global definition of NAME is overwritten with the new one. +To make local rx extensions, use `rx-let' for `rx', +`rx-let-eval' for `rx-to-string'. +For more details, see Info node `(elisp) Extending Rx'. + +\(fn NAME [(ARGS...)] RX)" + (declare (indent 1)) + `(eval-and-compile + (put ',name 'rx-definition ',(rx--make-binding name definition)) + ',name)) + +;; During `rx--pcase-transform', list of defined variables in right-to-left +;; order. +(defvar rx--pcase-vars) + +;; FIXME: The rewriting strategy for pcase works so-so with extensions; +;; definitions cannot expand to `let' or named `backref'. If this ever +;; becomes a problem, we can handle those forms in the ordinary parser, +;; using a dynamic variable for activating the augmented forms. + +(defun rx--pcase-transform (rx) + "Transform RX, an rx-expression augmented with `let' and named `backref', +into a plain rx-expression, collecting names into `rx--pcase-vars'." + (pcase rx + (`(let ,name . ,body) + (let* ((index (length (memq name rx--pcase-vars))) + (i (if (zerop index) + (length (push name rx--pcase-vars)) + index))) + `(group-n ,i ,(rx--pcase-transform (cons 'seq body))))) + ((and `(backref ,ref) + (guard (symbolp ref))) + (let ((index (length (memq ref rx--pcase-vars)))) + (when (zerop index) + (error "rx `backref' variable must be one of: %s" + (mapconcat #'symbol-name rx--pcase-vars " "))) + `(backref ,index))) + ((and `(,head . ,rest) + (guard (and (symbolp head) + (not (memq head '(literal regexp regex eval)))))) + (cons head (mapcar #'rx--pcase-transform rest))) + (_ rx))) (pcase-defmacro rx (&rest regexps) - "Build a `pcase' pattern matching `rx' REGEXPS in sexp form. -The REGEXPS are interpreted as in `rx'. The pattern matches any -string that is a match for the regular expression so constructed, -as if by `string-match'. + "A pattern that matches strings against `rx' REGEXPS in sexp form. +REGEXPS are interpreted as in `rx'. The pattern matches any +string that is a match for REGEXPS, as if by `string-match'. -In addition to the usual `rx' constructs, REGEXPS can contain the +In addition to the usual `rx' syntax, REGEXPS can contain the following constructs: - (let REF SEXP...) creates a new explicitly named reference to - a submatch that matches regular expressions - SEXP, and binds the match to REF. - (backref REF) creates a backreference to the submatch - introduced by a previous (let REF ...) - construct. REF can be the same symbol - in the first argument of the corresponding - (let REF ...) construct, or it can be a - submatch number. It matches the referenced - submatch. - -The REFs are associated with explicitly named submatches starting -from 1. Multiple occurrences of the same REF refer to the same -submatch. - -If a case matches, the match data is modified as usual so you can -use it in the case body, but you still have to pass the correct -string as argument to `match-string'." - (let* ((vars ()) - (rx-constituents - `((let - ,(lambda (form) - (rx-check form) - (let ((var (cadr form))) - (cl-check-type var symbol) - (let ((i (or (cl-position var vars :test #'eq) - (prog1 (length vars) - (setq vars `(,@vars ,var)))))) - (rx-form `(submatch-n ,(1+ i) ,@(cddr form)))))) - 1 nil) - (backref - ,(lambda (form) - (rx-check form) - (rx-backref - `(backref ,(let ((var (cadr form))) - (if (integerp var) var - (1+ (cl-position var vars :test #'eq))))))) - 1 1 - ,(lambda (var) - (cond ((integerp var) (rx-check-backref var)) - ((memq var vars) t) - (t (error "rx `backref' variable must be one of %s: %s" - vars var))))) - ,@rx-constituents)) - (regexp (rx-to-string `(seq ,@regexps) :no-group))) + (let REF RX...) binds the symbol REF to a submatch that matches + the regular expressions RX. REF is bound in + CODE to the string of the submatch or nil, but + can also be used in `backref'. + (backref REF) matches whatever the submatch REF matched. + REF can be a number, as usual, or a name + introduced by a previous (let REF ...) + construct." + (let* ((rx--pcase-vars nil) + (regexp (rx--to-expr (rx--pcase-transform (cons 'seq regexps))))) `(and (pred (string-match ,regexp)) - ,@(cl-loop for i from 1 - for var in vars - collect `(app (match-string ,i) ,var))))) - -;; ;; sregex.el replacement - -;; ;;;###autoload (provide 'sregex) -;; ;;;###autoload (autoload 'sregex "rx") -;; (defalias 'sregex 'rx-to-string) -;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro) -;; (defalias 'sregexq 'rx) - + ,@(let ((i 0)) + (mapcar (lambda (name) + (setq i (1+ i)) + `(app (match-string ,i) ,name)) + (reverse rx--pcase-vars)))))) + +;; Obsolete internal symbol, used in old versions of the `flycheck' package. +(define-obsolete-function-alias 'rx-submatch-n 'rx-to-string "27.1") + (provide 'rx) ;;; rx.el ends here |