summaryrefslogtreecommitdiff
path: root/lisp/char-fold.el
diff options
context:
space:
mode:
authorPaul Eggert <eggert@cs.ucla.edu>2016-05-26 12:55:06 -0700
committerPaul Eggert <eggert@cs.ucla.edu>2016-05-26 12:55:06 -0700
commit0bf5739b77c75f13d46fc49d5e3c098fe49a5070 (patch)
treed342da7bc9065197736a8184e55c9925a40f04f0 /lisp/char-fold.el
parent764f04871d67a5aad8943136d5142ed59bfa9a51 (diff)
parentc3489d050405ccb026cd44a280ead3a5f6b456d9 (diff)
downloademacs-0bf5739b77c75f13d46fc49d5e3c098fe49a5070.tar.gz
emacs-0bf5739b77c75f13d46fc49d5e3c098fe49a5070.tar.bz2
emacs-0bf5739b77c75f13d46fc49d5e3c098fe49a5070.zip
Merge from origin/emacs-25
c3489d0 * lisp/w32-fns.el (set-message-beep, w32-get-locale-info) (w3... a4d882c Correct old cell name unbinding when renaming cell. 6c12c53 Merge branch 'emacs-25' of git.sv.gnu.org:/srv/git/emacs into... 0be6725 Document problem: slow screen refresh on missing font. 853b9b9 * admin/admin.el (add-release-logs): Basic check of existing ... 5fa80cf * build-aux/gitlog-to-emacslog: Handle empty generated Change... 3c79e51 * admin/admin.el (add-release-logs): Generate ChangeLog if ne... 42275df * doc/misc/texinfo.tex: Revert previous change (Bug#23611). 3f4a9d9 * admin/authors.el (authors): First update the ChangeLog. 897fb6f ; 'Changes from the pre-25.1 API' copyedits 825ca25 Rename vc-stay-local back to vc-cvs-stay-local 4efb3e8 * doc/emacs/files.texi (Comparing Files): * doc/emacs/trouble... b995d1e * doc/misc/eww.texi (Advanced): Fix xref. 2e589c0 Fix cross-references between manuals f3d2ded * doc/misc/vhdl-mode.texi (Sample Init File): Rename node to ... 906c810 ; * admin/release-process: Move etc/HISTORY from here... ; * ... bea1b65 * admin/admin.el (add-release-logs): Also update etc/HISTORY. 503e752 ; * CONTRIBUTE: Fix a typo. fbfd478 Avoid aborting due to errors in arguments of 'set-face-attrib... bdfbe6d ; * admin/release-process: Copyedits. 44a6aed ; * test/automated/data-tests.el: Standardize license notice. c33ed39 ; * test/automated/viper-tests.el: Standardize license notice. df4a14b Add automated test for viper-tests.el c0139e3 Fix viper undo breakage from undo-boundary changes 920d76c Fix reference to obsolete fn ps-eval-switch 18a9bc1 Do not trash symlinks to init file 2671179 Don't print the "decomposition" line for control chars in wha... 869092c Bring back xterm pasting with middle mouse 5ab0830 Provide workaround for xftfont rendering problem c9f7ec7 * lisp/desktop.el: Disable restore frameset if in non-graphic... 30989a0 Mention GTK+ problems in etc/PROBLEMS 421e3c4 * lisp/emacs-lisp/package.el (package-refresh-contents): dadfc30 Revert "epg: Add a way to detect gpg1 executable for tests" e41a5cb Avoid errors with Czech and Slovak input methods d4ae6d7 epg: Add a way to detect gpg1 executable for tests ebc3a94 * lisp/emacs-lisp/package.el: Fix free variable warnings. 6e71295 * lisp/emacs-lisp/package.el (package--with-response-buffer): c45d9f6 Improve documentation of 'server-name' 3b5e38c Modernize ASLR advice in etc/PROBLEMS 1fe1e0a * lisp/char-fold.el: Rename from character-fold.el.
Diffstat (limited to 'lisp/char-fold.el')
-rw-r--r--lisp/char-fold.el242
1 files changed, 242 insertions, 0 deletions
diff --git a/lisp/char-fold.el b/lisp/char-fold.el
new file mode 100644
index 00000000000..68bea29ea45
--- /dev/null
+++ b/lisp/char-fold.el
@@ -0,0 +1,242 @@
+;;; char-fold.el --- match unicode to similar ASCII -*- lexical-binding: t; -*-
+
+;; Copyright (C) 2015-2016 Free Software Foundation, Inc.
+
+;; Maintainer: emacs-devel@gnu.org
+;; Keywords: matching
+
+;; This file is part of GNU Emacs.
+
+;; GNU Emacs is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+
+;; GNU Emacs is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
+
+;;; Code:
+
+(eval-and-compile (put 'char-fold-table 'char-table-extra-slots 1))
+
+(defconst char-fold-table
+ (eval-when-compile
+ (let ((equiv (make-char-table 'char-fold-table))
+ (equiv-multi (make-char-table 'char-fold-table))
+ (table (unicode-property-table-internal 'decomposition)))
+ (set-char-table-extra-slot equiv 0 equiv-multi)
+
+ ;; Ensure the table is populated.
+ (let ((func (char-table-extra-slot table 1)))
+ (map-char-table (lambda (char v)
+ (when (consp char)
+ (funcall func (car char) v table)))
+ table))
+
+ ;; Compile a list of all complex characters that each simple
+ ;; character should match.
+ ;; In summary this loop does 3 things:
+ ;; - A complex character might be allowed to match its decomp.
+ ;; - The decomp is allowed to match the complex character.
+ ;; - A single char of the decomp might be allowed to match the
+ ;; character.
+ ;; Some examples in the comments below.
+ (map-char-table
+ (lambda (char decomp)
+ (when (consp decomp)
+ ;; Skip trivial cases like ?a decomposing to (?a).
+ (unless (and (not (cdr decomp))
+ (eq char (car decomp)))
+ (if (symbolp (car decomp))
+ ;; Discard a possible formatting tag.
+ (setq decomp (cdr decomp))
+ ;; If there's no formatting tag, ensure that char matches
+ ;; its decomp exactly. This is because we want 'ä' to
+ ;; match 'ä', but we don't want '¹' to match '1'.
+ (aset equiv char
+ (cons (apply #'string decomp)
+ (aref equiv char))))
+
+ ;; Allow the entire decomp to match char. If decomp has
+ ;; multiple characters, this is done by adding an entry
+ ;; to the alist of the first character in decomp. This
+ ;; allows 'ff' to match 'ff', 'ä' to match 'ä', and '1' to
+ ;; match '¹'.
+ (let ((make-decomp-match-char
+ (lambda (decomp char)
+ (if (cdr decomp)
+ (aset equiv-multi (car decomp)
+ (cons (cons (apply #'string (cdr decomp))
+ (regexp-quote (string char)))
+ (aref equiv-multi (car decomp))))
+ (aset equiv (car decomp)
+ (cons (char-to-string char)
+ (aref equiv (car decomp))))))))
+ (funcall make-decomp-match-char decomp char)
+ ;; Do it again, without the non-spacing characters.
+ ;; This allows 'a' to match 'ä'.
+ (let ((simpler-decomp nil)
+ (found-one nil))
+ (dolist (c decomp)
+ (if (> (get-char-code-property c 'canonical-combining-class) 0)
+ (setq found-one t)
+ (push c simpler-decomp)))
+ (when (and simpler-decomp found-one)
+ (funcall make-decomp-match-char simpler-decomp char)
+ ;; Finally, if the decomp only had one spacing
+ ;; character, we allow this character to match the
+ ;; decomp. This is to let 'a' match 'ä'.
+ (unless (cdr simpler-decomp)
+ (aset equiv (car simpler-decomp)
+ (cons (apply #'string decomp)
+ (aref equiv (car simpler-decomp)))))))))))
+ table)
+
+ ;; Add some manual entries.
+ (dolist (it '((?\" """ "“" "”" "”" "„" "⹂" "〞" "‟" "‟" "❞" "❝" "❠" "“" "„" "〝" "〟" "🙷" "🙶" "🙸" "«" "»")
+ (?' "❟" "❛" "❜" "‘" "’" "‚" "‛" "‚" "󠀢" "❮" "❯" "‹" "›")
+ (?` "❛" "‘" "‛" "󠀢" "❮" "‹")))
+ (let ((idx (car it))
+ (chars (cdr it)))
+ (aset equiv idx (append chars (aref equiv idx)))))
+
+ ;; Convert the lists of characters we compiled into regexps.
+ (map-char-table
+ (lambda (char dec-list)
+ (let ((re (regexp-opt (cons (char-to-string char) dec-list))))
+ (if (consp char)
+ (set-char-table-range equiv char re)
+ (aset equiv char re))))
+ equiv)
+ equiv))
+ "Used for folding characters of the same group during search.
+This is a char-table with the `char-fold-table' subtype.
+
+Let us refer to the character in question by char-x.
+Each entry is either nil (meaning char-x only matches literally)
+or a regexp. This regexp should match anything that char-x can
+match by itself \(including char-x). For instance, the default
+regexp for the ?+ character is \"[+⁺₊﬩﹢+]\".
+
+This table also has one extra slot which is also a char-table.
+Each entry in the extra slot is an alist used for multi-character
+matching (which may be nil). The elements of the alist should
+have the form (SUFFIX . OTHER-REGEXP). If the characters after
+char-x are equal to SUFFIX, then this combination of char-x +
+SUFFIX is allowed to match OTHER-REGEXP. This is in addition to
+char-x being allowed to match REGEXP.
+For instance, the default alist for ?f includes:
+ \((\"fl\" . \"ffl\") (\"fi\" . \"ffi\")
+ (\"i\" . \"fi\") (\"f\" . \"ff\"))
+
+Exceptionally for the space character (32), ALIST is ignored.")
+
+(defun char-fold--make-space-string (n)
+ "Return a string that matches N spaces."
+ (format "\\(?:%s\\|%s\\)"
+ (make-string n ?\s)
+ (apply #'concat
+ (make-list n (or (aref char-fold-table ?\s) " ")))))
+
+;;;###autoload
+(defun char-fold-to-regexp (string &optional _lax from)
+ "Return a regexp matching anything that char-folds into STRING.
+Any character in STRING that has an entry in
+`char-fold-table' is replaced with that entry (which is a
+regexp) and other characters are `regexp-quote'd.
+
+If the resulting regexp would be too long for Emacs to handle,
+just return the result of calling `regexp-quote' on STRING.
+
+FROM is for internal use. It specifies an index in the STRING
+from which to start."
+ (let* ((spaces 0)
+ (multi-char-table (char-table-extra-slot char-fold-table 0))
+ (i (or from 0))
+ (end (length string))
+ (out nil))
+ ;; When the user types a space, we want to match the table entry
+ ;; for ?\s, which is generally a regexp like "[ ...]". However,
+ ;; the `search-spaces-regexp' variable doesn't "see" spaces inside
+ ;; these regexp constructs, so we need to use "\\( \\|[ ...]\\)"
+ ;; instead (to manually expose a space). Furthermore, the lax
+ ;; search engine acts on a bunch of spaces, not on individual
+ ;; spaces, so if the string contains sequential spaces like " ", we
+ ;; need to keep them grouped together like this: "\\( \\|[ ...][ ...]\\)".
+ (while (< i end)
+ (pcase (aref string i)
+ (`?\s (setq spaces (1+ spaces)))
+ (c (when (> spaces 0)
+ (push (char-fold--make-space-string spaces) out)
+ (setq spaces 0))
+ (let ((regexp (or (aref char-fold-table c)
+ (regexp-quote (string c))))
+ ;; Long string. The regexp would probably be too long.
+ (alist (unless (> end 50)
+ (aref multi-char-table c))))
+ (push (let ((matched-entries nil)
+ (max-length 0))
+ (dolist (entry alist)
+ (let* ((suffix (car entry))
+ (len-suf (length suffix)))
+ (when (eq (compare-strings suffix 0 nil
+ string (1+ i) (+ i 1 len-suf)
+ nil)
+ t)
+ (push (cons len-suf (cdr entry)) matched-entries)
+ (setq max-length (max max-length len-suf)))))
+ ;; If no suffixes matched, just go on.
+ (if (not matched-entries)
+ regexp
+;;; If N suffixes match, we "branch" out into N+1 executions for the
+;;; length of the longest match. This means "fix" will match "fix" but
+;;; not "fⅸ", but it's necessary to keep the regexp size from scaling
+;;; exponentially. See https://lists.gnu.org/archive/html/emacs-devel/2015-11/msg02562.html
+ (let ((subs (substring string (1+ i) (+ i 1 max-length))))
+ ;; `i' is still going to inc by 1 below.
+ (setq i (+ i max-length))
+ (concat
+ "\\(?:"
+ (mapconcat (lambda (entry)
+ (let ((length (car entry))
+ (suffix-regexp (cdr entry)))
+ (concat suffix-regexp
+ (char-fold-to-regexp subs nil length))))
+ `((0 . ,regexp) . ,matched-entries) "\\|")
+ "\\)"))))
+ out))))
+ (setq i (1+ i)))
+ (when (> spaces 0)
+ (push (char-fold--make-space-string spaces) out))
+ (let ((regexp (apply #'concat (nreverse out))))
+ ;; Limited by `MAX_BUF_SIZE' in `regex.c'.
+ (if (> (length regexp) 5000)
+ (regexp-quote string)
+ regexp))))
+
+
+;;; Commands provided for completeness.
+(defun char-fold-search-forward (string &optional bound noerror count)
+ "Search forward for a char-folded version of STRING.
+STRING is converted to a regexp with `char-fold-to-regexp',
+which is searched for with `re-search-forward'.
+BOUND NOERROR COUNT are passed to `re-search-forward'."
+ (interactive "sSearch: ")
+ (re-search-forward (char-fold-to-regexp string) bound noerror count))
+
+(defun char-fold-search-backward (string &optional bound noerror count)
+ "Search backward for a char-folded version of STRING.
+STRING is converted to a regexp with `char-fold-to-regexp',
+which is searched for with `re-search-backward'.
+BOUND NOERROR COUNT are passed to `re-search-backward'."
+ (interactive "sSearch: ")
+ (re-search-backward (char-fold-to-regexp string) bound noerror count))
+
+(provide 'char-fold)
+
+;;; char-fold.el ends here