summaryrefslogtreecommitdiff
path: root/lisp/international/textsec.el
diff options
context:
space:
mode:
Diffstat (limited to 'lisp/international/textsec.el')
-rw-r--r--lisp/international/textsec.el467
1 files changed, 467 insertions, 0 deletions
diff --git a/lisp/international/textsec.el b/lisp/international/textsec.el
new file mode 100644
index 00000000000..82eba1b5d51
--- /dev/null
+++ b/lisp/international/textsec.el
@@ -0,0 +1,467 @@
+;;; textsec.el --- Functions for handling homoglyphs and the like -*- lexical-binding: t; -*-
+
+;; Copyright (C) 2022 Free Software Foundation, Inc.
+
+;; This file is part of GNU Emacs.
+
+;; GNU Emacs is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+
+;; GNU Emacs is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>.
+
+;;; Commentary:
+
+;;
+
+;;; Code:
+
+(require 'cl-lib)
+(require 'uni-confusable)
+(require 'ucs-normalize)
+(require 'idna-mapping)
+(require 'puny)
+(require 'mail-parse)
+(require 'url)
+
+(defvar textsec--char-scripts nil)
+
+(eval-and-compile
+ (defun textsec--create-script-table (data)
+ "Create the textsec--char-scripts char table."
+ (setq textsec--char-scripts (make-char-table nil))
+ (dolist (scripts data)
+ (dolist (range (cadr scripts))
+ (set-char-table-range textsec--char-scripts
+ range (car scripts)))))
+ (require 'uni-scripts))
+
+(defun textsec-scripts (string)
+ "Return a list of Unicode scripts used in STRING.
+The scripts returned by this function use the Unicode Script property
+as defined by the Unicode Standard Annex 24 (UAX#24)."
+ (seq-map (lambda (char)
+ (elt textsec--char-scripts char))
+ string))
+
+(defun textsec-single-script-p (string)
+ "Return non-nil if STRING is all in a single Unicode script.
+
+Note that the concept of \"single script\" used by this function
+isn't obvious -- some mixtures of scripts count as a \"single
+script\". See
+
+ https://www.unicode.org/reports/tr39/#Mixed_Script_Detection
+
+for details. The Unicode scripts are as defined by the
+Unicode Standard Annex 24 (UAX#24)."
+ (let ((scripts (mapcar
+ (lambda (s)
+ (append s
+ ;; Some scripts used in East Asia are
+ ;; commonly used across borders, so we add
+ ;; those.
+ (mapcan (lambda (script)
+ (copy-sequence
+ (textsec--augment-script script)))
+ s)))
+ (textsec-scripts string))))
+ (catch 'empty
+ (cl-loop for s1 in scripts
+ do (cl-loop for s2 in scripts
+ ;; Common/inherited chars can be used in
+ ;; text with all scripts.
+ when (and (not (memq 'common s1))
+ (not (memq 'common s2))
+ (not (memq 'inherited s1))
+ (not (memq 'inherited s2))
+ (not (seq-intersection s1 s2)))
+ do (throw 'empty nil)))
+ t)))
+
+(defun textsec--augment-script (script)
+ (cond
+ ((eq script 'han)
+ '(hangul japan korea))
+ ((or (eq script 'hiragana)
+ (eq script 'katakana))
+ '(japan))
+ ((or (eq script 'hangul)
+ (eq script 'bopomofo))
+ '(korea))))
+
+(defun textsec-covering-scripts (string)
+ "Return a minimal list of scripts used in STRING.
+Note that a string may have several different minimal cover sets.
+The scripts are as defined by the Unicode Standard Annex 24 (UAX#24)."
+ (let* ((scripts (textsec-scripts string))
+ (set (car scripts)))
+ (dolist (s scripts)
+ (setq set (seq-union set (seq-difference s set))))
+ (sort (delq 'common (delq 'inherited set)) #'string<)))
+
+(defun textsec-restriction-level (string)
+ "Say what restriction level STRING qualifies for.
+Levels are (in decreasing order of restrictiveness) `ascii-only',
+`single-script', `highly-restrictive', `moderately-restrictive',
+`minimally-restrictive' and `unrestricted'."
+ (let ((scripts (textsec-covering-scripts string)))
+ (cond
+ ((string-match "\\`[[:ascii:]]+\\'" string)
+ 'ascii-only)
+ ((textsec-single-script-p string)
+ 'single-script)
+ ((or (null (seq-difference scripts '(latin han hiragana katakana)))
+ (null (seq-difference scripts '(latin han bopomofo)))
+ (null (seq-difference scripts '(latin han hangul))))
+ 'highly-restrictive)
+ ((and (= (length scripts) 2)
+ (memq 'latin scripts)
+ ;; This list comes from
+ ;; https://www.unicode.org/reports/tr31/#Table_Recommended_Scripts
+ ;; (but without latin, cyrillic and greek).
+ (seq-intersection scripts
+ '(arabic
+ armenian
+ bengali
+ bopomofo
+ devanagari
+ ethiopic
+ georgian
+ gujarati
+ gurmukhi
+ hangul
+ han
+ hebrew
+ hiragana
+ katakana
+ kannada
+ khmer
+ lao
+ malayalam
+ myanmar
+ oriya
+ sinhala
+ tamil
+ telugu
+ thaana
+ thai
+ tibetan)))
+ ;; The string is covered by Latin and any one other Recommended
+ ;; script, except Cyrillic, Greek.
+ 'moderately-retrictive)
+ ;; Fixme `minimally-restrictive' -- needs well-formedness criteria
+ ;; and Identifier Profile.
+ (t
+ 'unrestricted))))
+
+(defun textsec-mixed-numbers-p (string)
+ "Return non-nil if STRING includes numbers from different decimal systems."
+ (>
+ (length
+ (seq-uniq
+ (mapcar
+ (lambda (char)
+ ;; Compare zeros in the respective decimal systems.
+ (- char (get-char-code-property char 'numeric-value)))
+ (seq-filter (lambda (char)
+ ;; We're selecting the characters that
+ ;; have a numeric property.
+ (eq (get-char-code-property char 'general-category)
+ 'Nd))
+ string))))
+ 1))
+
+(defun textsec-ascii-confusable-p (string)
+ "Return non-nil if non-ASCII STRING can be confused with ASCII on display."
+ (and (not (eq (textsec-restriction-level string) 'ascii-only))
+ (eq (textsec-restriction-level (textsec-unconfuse-string string))
+ 'ascii-only)))
+
+(defun textsec-unconfuse-string (string)
+ "Return a de-confused version of STRING.
+This algorithm is described in:
+
+ https://www.unicode.org/reports/tr39/#Confusable_Detection"
+ (ucs-normalize-NFD-string
+ (apply #'concat
+ (seq-map (lambda (char)
+ (or (gethash char uni-confusable-table)
+ (string char)))
+ (ucs-normalize-NFD-string string)))))
+
+(defun textsec-resolved-script-set (string)
+ "Return the resolved script set for STRING.
+This is the minimal covering script set for STRING, but is nil is
+STRING isn't a single script string.
+The scripts are as defined by the Unicode Standard Annex 24 (UAX#24)."
+ (and (textsec-single-script-p string)
+ (textsec-covering-scripts string)))
+
+(defun textsec-single-script-confusable-p (string1 string2)
+ "Say whether STRING1 and STRING2 are single-script confusables.
+The scripts are as defined by the Unicode Standard Annex 24 (UAX#24)."
+ (and (equal (textsec-unconfuse-string string1)
+ (textsec-unconfuse-string string2))
+ ;; And they have to have at least one resolved script in
+ ;; common.
+ (seq-intersection (textsec-resolved-script-set string1)
+ (textsec-resolved-script-set string2))))
+
+(defun textsec-mixed-script-confusable-p (string1 string2)
+ "Say whether STRING1 and STRING2 are mixed-script confusables.
+The scripts are as defined by the Unicode Standard Annex 24 (UAX#24)."
+ (and (equal (textsec-unconfuse-string string1)
+ (textsec-unconfuse-string string2))
+ ;; And they have no resolved scripts in common.
+ (null (seq-intersection (textsec-resolved-script-set string1)
+ (textsec-resolved-script-set string2)))))
+
+(defun textsec-whole-script-confusable-p (string1 string2)
+ "Say whether STRING1 and STRING2 are whole-script confusables.
+The scripts are as defined by the Unicode Standard Annex 24 (UAX#24)."
+ (and (textsec-mixed-script-confusable-p string1 string2)
+ (textsec-single-script-p string1)
+ (textsec-single-script-p string2)))
+
+(defun textsec--ipvx-address-p (domain)
+ "Return non-nil if DOMAIN is an ipv4 or ipv6 address."
+ ;; This is a very relaxed pattern for IPv4 or IPv6 addresses. The
+ ;; assumption is that any malformed address accepted by this rule
+ ;; will be rejected by the actual address parser eventually.
+ (let ((case-fold-search t))
+ (rx-let ((ipv4 (** 1 4
+ (** 1 3 (in "0-9"))
+ (? ".")))
+ (ipv6 (: (** 1 7
+ (** 0 4 (in "0-9a-f"))
+ ":")
+ (** 0 4 (in "0-9a-f"))
+ (? ":" ipv4))))
+ (string-match-p (rx bos (or ipv4 ipv6 (: "[" ipv6 "]")) eos) domain))))
+
+(defun textsec-domain-suspicious-p (domain)
+ "Say whether DOMAIN's name looks suspicious.
+Return nil if it isn't suspicious. If it is, return a string explaining
+the potential problem.
+
+Domain names are considered suspicious if they use characters
+that can look similar to other characters when displayed, or
+use characters that are not allowed by Unicode's IDNA mapping,
+or use certain other unusual mixtures of characters."
+ (catch 'found
+ ;; Plain domains aren't suspicious.
+ (when (textsec--ipvx-address-p domain)
+ (throw 'found nil))
+ (seq-do
+ (lambda (char)
+ (when (eq (elt idna-mapping-table char) t)
+ (throw 'found
+ (format "Disallowed character%s (#x%x, %s)"
+ (if (eq (get-char-code-property char 'general-category)
+ 'Cf)
+ ""
+ (concat ": " (string char)))
+ char
+ (get-char-code-property char 'name)))))
+ domain)
+ ;; Does IDNA allow it?
+ (unless (puny-highly-restrictive-domain-p domain)
+ (throw
+ 'found
+ (format "`%s' mixes characters from different scripts in suspicious ways"
+ domain)))
+ ;; Check whether any segment of the domain name is confusable with
+ ;; an ASCII-only segment.
+ (dolist (elem (split-string domain "\\."))
+ (when (textsec-ascii-confusable-p elem)
+ (throw 'found (format "`%s' is confusable with ASCII" elem))))
+ nil))
+
+(defun textsec-local-address-suspicious-p (local)
+ "Say whether LOCAL part of an email address looks suspicious.
+LOCAL is the bit before \"@\" in an email address.
+
+If it isn't suspicious, return nil. If it is, return a string explaining
+the potential problem.
+
+Email addresses are considered suspicious if they use characters
+that can look similar to other characters when displayed, or use
+certain other unusual mixtures of characters."
+ (cond
+ ((not (equal local (ucs-normalize-NFKC-string local)))
+ (format "`%s' is not in normalized format `%s'"
+ local (ucs-normalize-NFKC-string local)))
+ ((textsec-mixed-numbers-p local)
+ (format "`%s' contains numbers from different number systems" local))
+ ((eq (textsec-restriction-level local) 'unrestricted)
+ (format "`%s' isn't restrictive enough" local))
+ ((string-match-p "\\`\\.\\|\\.\\'\\|\\.\\." local)
+ (format "`%s' contains invalid dots" local))))
+
+(defun textsec-bidi-controls-suspicious-p (string)
+ "Return non-nil of STRING uses bidi controls in suspicious ways.
+If STRING doesn't include any suspicious uses of bidirectional
+formatting control characters, return nil. Otherwise, return the
+index of the first character in STRING affected by such suspicious
+use of bidi controls. If the returned value is beyond the length
+of STRING, it means any text following STRING on display might be
+affected by bidi controls in STRING."
+ (with-temp-buffer
+ ;; We add a string that's representative of some text that could
+ ;; follow STRING, with the purpose of detecting residual bidi
+ ;; state at end of STRING which could then affect the following
+ ;; text.
+ (insert string "a1א:!")
+ (let ((pos (bidi-find-overridden-directionality 1 (point-max) nil)))
+ (and (fixnump pos)
+ (1- pos)))))
+
+(defun textsec-name-suspicious-p (name)
+ "Say whether NAME looks suspicious.
+NAME is (for instance) the free-text display name part of an
+email address.
+
+If it isn't suspicious, return nil. If it is, return a string
+explaining the potential problem.
+
+Names are considered suspicious if they use characters that can
+look similar to other characters when displayed, or use certain
+other unusual mixtures of characters."
+ (cond
+ ((not (equal name (ucs-normalize-NFC-string name)))
+ (format "`%s' is not in normalized format `%s'"
+ name (ucs-normalize-NFC-string name)))
+ ((and (seq-find (lambda (char)
+ (and (member char bidi-control-characters)
+ (not (member char
+ '( ?\N{left-to-right mark}
+ ?\N{right-to-left mark}
+ ?\N{arabic letter mark})))))
+ name)
+ ;; We have bidirectional formatting characters, but check
+ ;; whether they affect any other characters in suspicious
+ ;; ways. If not, NAME is not suspicious.
+ (fixnump (textsec-bidi-controls-suspicious-p name)))
+ (format "`%s' contains suspicious uses of bidirectional control characters"
+ name))
+ ((textsec-suspicious-nonspacing-p name))))
+
+(defun textsec-suspicious-nonspacing-p (string)
+ "Say whether STRING uses nonspacing characters in suspicious ways.
+If it doesn't, return nil. If it does, return a string explaining
+the potential problem.
+
+Use of nonspacing characters is considered suspicious if there are
+two or more consecutive identical nonspacing characters, or too many
+consecutive nonspacing characters."
+ (let ((prev nil)
+ (nonspace-count 0))
+ (catch 'found
+ (seq-do
+ (lambda (char)
+ (let ((nonspacing
+ (memq (get-char-code-property char 'general-category)
+ '(Mn Me))))
+ (when (and nonspacing
+ (equal char prev))
+ (throw 'found "Two identical consecutive nonspacing characters"))
+ (setq nonspace-count (if nonspacing
+ (1+ nonspace-count)
+ 0))
+ (when (> nonspace-count 4)
+ (throw 'found
+ "Too many consecutive nonspacing characters"))
+ (setq prev char)))
+ string)
+ nil)))
+
+(defun textsec-email-address-suspicious-p (address)
+ "Say whether EMAIL address looks suspicious.
+If it isn't, return nil. If it is, return a string explaining the
+potential problem.
+
+An email address is considered suspicious if either of its two
+parts -- the local address name or the domain -- are found to be
+suspicious by, respectively, `textsec-local-address-suspicious-p'
+and `textsec-domain-suspicious-p'."
+ (pcase-let ((`(,local ,domain) (split-string address "@")))
+ (or
+ (textsec-domain-suspicious-p domain)
+ (textsec-local-address-suspicious-p local))))
+
+(defun textsec-email-address-header-suspicious-p (email)
+ "Say whether EMAIL looks suspicious.
+If it isn't, return nil. If it is, return a string explaining the
+potential problem.
+
+Note that EMAIL has to be a valid email specification according
+to RFC2047bis -- strings that can't be parsed will be flagged as
+suspicious.
+
+An email specification is considered suspicious if either of its
+two parts -- the address or the name -- are found to be
+suspicious by, respectively, `textsec-email-address-suspicious-p'
+and `textsec-name-suspicious-p'."
+ (catch 'end
+ (pcase-let ((`(,address . ,name)
+ (condition-case nil
+ (mail-header-parse-address email t)
+ (error (throw 'end "Email address can't be parsed.")))))
+ (or
+ (textsec-email-address-suspicious-p address)
+ (and name (textsec-name-suspicious-p name))))))
+
+(defun textsec-url-suspicious-p (url)
+ "Say whether URL looks suspicious.
+If it isn't, return nil. If it is, return a string explaining the
+potential problem."
+ (let ((parsed (url-generic-parse-url url)))
+ ;; The URL may not have a domain.
+ (and (url-host parsed)
+ (textsec-domain-suspicious-p (url-host parsed)))))
+
+(defun textsec-link-suspicious-p (link)
+ "Say whether LINK is suspicious.
+LINK should be a cons cell where the first element is the URL,
+and the second element is the link text.
+
+This function will return non-nil if it seems like the link text
+is misleading about where the URL takes you. This is typical
+when the link text looks like an URL itself, but doesn't lead to
+the same domain as the URL."
+ (let* ((url (car link))
+ (text (string-trim (cdr link))))
+ (catch 'found
+ (let ((udomain (url-host (url-generic-parse-url url)))
+ (tdomain (url-host (url-generic-parse-url text))))
+ (cond
+ ((and udomain
+ tdomain
+ (not (equal udomain tdomain))
+ ;; One may be a sub-domain of the other, but don't allow too
+ ;; short domains.
+ (not (or (and (string-suffix-p udomain tdomain)
+ (url-domsuf-cookie-allowed-p udomain))
+ (and (string-suffix-p tdomain udomain)
+ (url-domsuf-cookie-allowed-p tdomain)))))
+ (throw 'found
+ (format "Text `%s' doesn't point to link URL `%s'"
+ text url)))
+ ((and tdomain
+ (textsec-domain-suspicious-p tdomain))
+ (throw 'found
+ (format "Domain `%s' in the link text is suspicious"
+ (bidi-string-strip-control-characters
+ tdomain)))))))))
+
+(provide 'textsec)
+
+;;; textsec.el ends here