diff options
Diffstat (limited to 'lisp/international/textsec.el')
-rw-r--r-- | lisp/international/textsec.el | 467 |
1 files changed, 467 insertions, 0 deletions
diff --git a/lisp/international/textsec.el b/lisp/international/textsec.el new file mode 100644 index 00000000000..82eba1b5d51 --- /dev/null +++ b/lisp/international/textsec.el @@ -0,0 +1,467 @@ +;;; textsec.el --- Functions for handling homoglyphs and the like -*- lexical-binding: t; -*- + +;; Copyright (C) 2022 Free Software Foundation, Inc. + +;; This file is part of GNU Emacs. + +;; GNU Emacs is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. + +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>. + +;;; Commentary: + +;; + +;;; Code: + +(require 'cl-lib) +(require 'uni-confusable) +(require 'ucs-normalize) +(require 'idna-mapping) +(require 'puny) +(require 'mail-parse) +(require 'url) + +(defvar textsec--char-scripts nil) + +(eval-and-compile + (defun textsec--create-script-table (data) + "Create the textsec--char-scripts char table." + (setq textsec--char-scripts (make-char-table nil)) + (dolist (scripts data) + (dolist (range (cadr scripts)) + (set-char-table-range textsec--char-scripts + range (car scripts))))) + (require 'uni-scripts)) + +(defun textsec-scripts (string) + "Return a list of Unicode scripts used in STRING. +The scripts returned by this function use the Unicode Script property +as defined by the Unicode Standard Annex 24 (UAX#24)." + (seq-map (lambda (char) + (elt textsec--char-scripts char)) + string)) + +(defun textsec-single-script-p (string) + "Return non-nil if STRING is all in a single Unicode script. + +Note that the concept of \"single script\" used by this function +isn't obvious -- some mixtures of scripts count as a \"single +script\". See + + https://www.unicode.org/reports/tr39/#Mixed_Script_Detection + +for details. The Unicode scripts are as defined by the +Unicode Standard Annex 24 (UAX#24)." + (let ((scripts (mapcar + (lambda (s) + (append s + ;; Some scripts used in East Asia are + ;; commonly used across borders, so we add + ;; those. + (mapcan (lambda (script) + (copy-sequence + (textsec--augment-script script))) + s))) + (textsec-scripts string)))) + (catch 'empty + (cl-loop for s1 in scripts + do (cl-loop for s2 in scripts + ;; Common/inherited chars can be used in + ;; text with all scripts. + when (and (not (memq 'common s1)) + (not (memq 'common s2)) + (not (memq 'inherited s1)) + (not (memq 'inherited s2)) + (not (seq-intersection s1 s2))) + do (throw 'empty nil))) + t))) + +(defun textsec--augment-script (script) + (cond + ((eq script 'han) + '(hangul japan korea)) + ((or (eq script 'hiragana) + (eq script 'katakana)) + '(japan)) + ((or (eq script 'hangul) + (eq script 'bopomofo)) + '(korea)))) + +(defun textsec-covering-scripts (string) + "Return a minimal list of scripts used in STRING. +Note that a string may have several different minimal cover sets. +The scripts are as defined by the Unicode Standard Annex 24 (UAX#24)." + (let* ((scripts (textsec-scripts string)) + (set (car scripts))) + (dolist (s scripts) + (setq set (seq-union set (seq-difference s set)))) + (sort (delq 'common (delq 'inherited set)) #'string<))) + +(defun textsec-restriction-level (string) + "Say what restriction level STRING qualifies for. +Levels are (in decreasing order of restrictiveness) `ascii-only', +`single-script', `highly-restrictive', `moderately-restrictive', +`minimally-restrictive' and `unrestricted'." + (let ((scripts (textsec-covering-scripts string))) + (cond + ((string-match "\\`[[:ascii:]]+\\'" string) + 'ascii-only) + ((textsec-single-script-p string) + 'single-script) + ((or (null (seq-difference scripts '(latin han hiragana katakana))) + (null (seq-difference scripts '(latin han bopomofo))) + (null (seq-difference scripts '(latin han hangul)))) + 'highly-restrictive) + ((and (= (length scripts) 2) + (memq 'latin scripts) + ;; This list comes from + ;; https://www.unicode.org/reports/tr31/#Table_Recommended_Scripts + ;; (but without latin, cyrillic and greek). + (seq-intersection scripts + '(arabic + armenian + bengali + bopomofo + devanagari + ethiopic + georgian + gujarati + gurmukhi + hangul + han + hebrew + hiragana + katakana + kannada + khmer + lao + malayalam + myanmar + oriya + sinhala + tamil + telugu + thaana + thai + tibetan))) + ;; The string is covered by Latin and any one other Recommended + ;; script, except Cyrillic, Greek. + 'moderately-retrictive) + ;; Fixme `minimally-restrictive' -- needs well-formedness criteria + ;; and Identifier Profile. + (t + 'unrestricted)))) + +(defun textsec-mixed-numbers-p (string) + "Return non-nil if STRING includes numbers from different decimal systems." + (> + (length + (seq-uniq + (mapcar + (lambda (char) + ;; Compare zeros in the respective decimal systems. + (- char (get-char-code-property char 'numeric-value))) + (seq-filter (lambda (char) + ;; We're selecting the characters that + ;; have a numeric property. + (eq (get-char-code-property char 'general-category) + 'Nd)) + string)))) + 1)) + +(defun textsec-ascii-confusable-p (string) + "Return non-nil if non-ASCII STRING can be confused with ASCII on display." + (and (not (eq (textsec-restriction-level string) 'ascii-only)) + (eq (textsec-restriction-level (textsec-unconfuse-string string)) + 'ascii-only))) + +(defun textsec-unconfuse-string (string) + "Return a de-confused version of STRING. +This algorithm is described in: + + https://www.unicode.org/reports/tr39/#Confusable_Detection" + (ucs-normalize-NFD-string + (apply #'concat + (seq-map (lambda (char) + (or (gethash char uni-confusable-table) + (string char))) + (ucs-normalize-NFD-string string))))) + +(defun textsec-resolved-script-set (string) + "Return the resolved script set for STRING. +This is the minimal covering script set for STRING, but is nil is +STRING isn't a single script string. +The scripts are as defined by the Unicode Standard Annex 24 (UAX#24)." + (and (textsec-single-script-p string) + (textsec-covering-scripts string))) + +(defun textsec-single-script-confusable-p (string1 string2) + "Say whether STRING1 and STRING2 are single-script confusables. +The scripts are as defined by the Unicode Standard Annex 24 (UAX#24)." + (and (equal (textsec-unconfuse-string string1) + (textsec-unconfuse-string string2)) + ;; And they have to have at least one resolved script in + ;; common. + (seq-intersection (textsec-resolved-script-set string1) + (textsec-resolved-script-set string2)))) + +(defun textsec-mixed-script-confusable-p (string1 string2) + "Say whether STRING1 and STRING2 are mixed-script confusables. +The scripts are as defined by the Unicode Standard Annex 24 (UAX#24)." + (and (equal (textsec-unconfuse-string string1) + (textsec-unconfuse-string string2)) + ;; And they have no resolved scripts in common. + (null (seq-intersection (textsec-resolved-script-set string1) + (textsec-resolved-script-set string2))))) + +(defun textsec-whole-script-confusable-p (string1 string2) + "Say whether STRING1 and STRING2 are whole-script confusables. +The scripts are as defined by the Unicode Standard Annex 24 (UAX#24)." + (and (textsec-mixed-script-confusable-p string1 string2) + (textsec-single-script-p string1) + (textsec-single-script-p string2))) + +(defun textsec--ipvx-address-p (domain) + "Return non-nil if DOMAIN is an ipv4 or ipv6 address." + ;; This is a very relaxed pattern for IPv4 or IPv6 addresses. The + ;; assumption is that any malformed address accepted by this rule + ;; will be rejected by the actual address parser eventually. + (let ((case-fold-search t)) + (rx-let ((ipv4 (** 1 4 + (** 1 3 (in "0-9")) + (? "."))) + (ipv6 (: (** 1 7 + (** 0 4 (in "0-9a-f")) + ":") + (** 0 4 (in "0-9a-f")) + (? ":" ipv4)))) + (string-match-p (rx bos (or ipv4 ipv6 (: "[" ipv6 "]")) eos) domain)))) + +(defun textsec-domain-suspicious-p (domain) + "Say whether DOMAIN's name looks suspicious. +Return nil if it isn't suspicious. If it is, return a string explaining +the potential problem. + +Domain names are considered suspicious if they use characters +that can look similar to other characters when displayed, or +use characters that are not allowed by Unicode's IDNA mapping, +or use certain other unusual mixtures of characters." + (catch 'found + ;; Plain domains aren't suspicious. + (when (textsec--ipvx-address-p domain) + (throw 'found nil)) + (seq-do + (lambda (char) + (when (eq (elt idna-mapping-table char) t) + (throw 'found + (format "Disallowed character%s (#x%x, %s)" + (if (eq (get-char-code-property char 'general-category) + 'Cf) + "" + (concat ": " (string char))) + char + (get-char-code-property char 'name))))) + domain) + ;; Does IDNA allow it? + (unless (puny-highly-restrictive-domain-p domain) + (throw + 'found + (format "`%s' mixes characters from different scripts in suspicious ways" + domain))) + ;; Check whether any segment of the domain name is confusable with + ;; an ASCII-only segment. + (dolist (elem (split-string domain "\\.")) + (when (textsec-ascii-confusable-p elem) + (throw 'found (format "`%s' is confusable with ASCII" elem)))) + nil)) + +(defun textsec-local-address-suspicious-p (local) + "Say whether LOCAL part of an email address looks suspicious. +LOCAL is the bit before \"@\" in an email address. + +If it isn't suspicious, return nil. If it is, return a string explaining +the potential problem. + +Email addresses are considered suspicious if they use characters +that can look similar to other characters when displayed, or use +certain other unusual mixtures of characters." + (cond + ((not (equal local (ucs-normalize-NFKC-string local))) + (format "`%s' is not in normalized format `%s'" + local (ucs-normalize-NFKC-string local))) + ((textsec-mixed-numbers-p local) + (format "`%s' contains numbers from different number systems" local)) + ((eq (textsec-restriction-level local) 'unrestricted) + (format "`%s' isn't restrictive enough" local)) + ((string-match-p "\\`\\.\\|\\.\\'\\|\\.\\." local) + (format "`%s' contains invalid dots" local)))) + +(defun textsec-bidi-controls-suspicious-p (string) + "Return non-nil of STRING uses bidi controls in suspicious ways. +If STRING doesn't include any suspicious uses of bidirectional +formatting control characters, return nil. Otherwise, return the +index of the first character in STRING affected by such suspicious +use of bidi controls. If the returned value is beyond the length +of STRING, it means any text following STRING on display might be +affected by bidi controls in STRING." + (with-temp-buffer + ;; We add a string that's representative of some text that could + ;; follow STRING, with the purpose of detecting residual bidi + ;; state at end of STRING which could then affect the following + ;; text. + (insert string "a1א:!") + (let ((pos (bidi-find-overridden-directionality 1 (point-max) nil))) + (and (fixnump pos) + (1- pos))))) + +(defun textsec-name-suspicious-p (name) + "Say whether NAME looks suspicious. +NAME is (for instance) the free-text display name part of an +email address. + +If it isn't suspicious, return nil. If it is, return a string +explaining the potential problem. + +Names are considered suspicious if they use characters that can +look similar to other characters when displayed, or use certain +other unusual mixtures of characters." + (cond + ((not (equal name (ucs-normalize-NFC-string name))) + (format "`%s' is not in normalized format `%s'" + name (ucs-normalize-NFC-string name))) + ((and (seq-find (lambda (char) + (and (member char bidi-control-characters) + (not (member char + '( ?\N{left-to-right mark} + ?\N{right-to-left mark} + ?\N{arabic letter mark}))))) + name) + ;; We have bidirectional formatting characters, but check + ;; whether they affect any other characters in suspicious + ;; ways. If not, NAME is not suspicious. + (fixnump (textsec-bidi-controls-suspicious-p name))) + (format "`%s' contains suspicious uses of bidirectional control characters" + name)) + ((textsec-suspicious-nonspacing-p name)))) + +(defun textsec-suspicious-nonspacing-p (string) + "Say whether STRING uses nonspacing characters in suspicious ways. +If it doesn't, return nil. If it does, return a string explaining +the potential problem. + +Use of nonspacing characters is considered suspicious if there are +two or more consecutive identical nonspacing characters, or too many +consecutive nonspacing characters." + (let ((prev nil) + (nonspace-count 0)) + (catch 'found + (seq-do + (lambda (char) + (let ((nonspacing + (memq (get-char-code-property char 'general-category) + '(Mn Me)))) + (when (and nonspacing + (equal char prev)) + (throw 'found "Two identical consecutive nonspacing characters")) + (setq nonspace-count (if nonspacing + (1+ nonspace-count) + 0)) + (when (> nonspace-count 4) + (throw 'found + "Too many consecutive nonspacing characters")) + (setq prev char))) + string) + nil))) + +(defun textsec-email-address-suspicious-p (address) + "Say whether EMAIL address looks suspicious. +If it isn't, return nil. If it is, return a string explaining the +potential problem. + +An email address is considered suspicious if either of its two +parts -- the local address name or the domain -- are found to be +suspicious by, respectively, `textsec-local-address-suspicious-p' +and `textsec-domain-suspicious-p'." + (pcase-let ((`(,local ,domain) (split-string address "@"))) + (or + (textsec-domain-suspicious-p domain) + (textsec-local-address-suspicious-p local)))) + +(defun textsec-email-address-header-suspicious-p (email) + "Say whether EMAIL looks suspicious. +If it isn't, return nil. If it is, return a string explaining the +potential problem. + +Note that EMAIL has to be a valid email specification according +to RFC2047bis -- strings that can't be parsed will be flagged as +suspicious. + +An email specification is considered suspicious if either of its +two parts -- the address or the name -- are found to be +suspicious by, respectively, `textsec-email-address-suspicious-p' +and `textsec-name-suspicious-p'." + (catch 'end + (pcase-let ((`(,address . ,name) + (condition-case nil + (mail-header-parse-address email t) + (error (throw 'end "Email address can't be parsed."))))) + (or + (textsec-email-address-suspicious-p address) + (and name (textsec-name-suspicious-p name)))))) + +(defun textsec-url-suspicious-p (url) + "Say whether URL looks suspicious. +If it isn't, return nil. If it is, return a string explaining the +potential problem." + (let ((parsed (url-generic-parse-url url))) + ;; The URL may not have a domain. + (and (url-host parsed) + (textsec-domain-suspicious-p (url-host parsed))))) + +(defun textsec-link-suspicious-p (link) + "Say whether LINK is suspicious. +LINK should be a cons cell where the first element is the URL, +and the second element is the link text. + +This function will return non-nil if it seems like the link text +is misleading about where the URL takes you. This is typical +when the link text looks like an URL itself, but doesn't lead to +the same domain as the URL." + (let* ((url (car link)) + (text (string-trim (cdr link)))) + (catch 'found + (let ((udomain (url-host (url-generic-parse-url url))) + (tdomain (url-host (url-generic-parse-url text)))) + (cond + ((and udomain + tdomain + (not (equal udomain tdomain)) + ;; One may be a sub-domain of the other, but don't allow too + ;; short domains. + (not (or (and (string-suffix-p udomain tdomain) + (url-domsuf-cookie-allowed-p udomain)) + (and (string-suffix-p tdomain udomain) + (url-domsuf-cookie-allowed-p tdomain))))) + (throw 'found + (format "Text `%s' doesn't point to link URL `%s'" + text url))) + ((and tdomain + (textsec-domain-suspicious-p tdomain)) + (throw 'found + (format "Domain `%s' in the link text is suspicious" + (bidi-string-strip-control-characters + tdomain))))))))) + +(provide 'textsec) + +;;; textsec.el ends here |