blob: 884425d49227cea64921710827d12ab4ac317009 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
;;; textsec.el --- Functions for handling homoglyphs and the like -*- lexical-binding: t; -*-
;; Copyright (C) 2022 Free Software Foundation, Inc.
;; This file is part of GNU Emacs.
;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>.
;;; Commentary:
;;
;;; Code:
(require 'cl-lib)
(defvar textsec--char-scripts nil)
(eval-and-compile
(defun textsec--create-script-table (data)
"Create the textsec--char-scripts char table."
(setq textsec--char-scripts (make-char-table nil))
(dolist (scripts data)
(dolist (range (cadr scripts))
(set-char-table-range textsec--char-scripts
range (car scripts)))))
(require 'uni-scripts))
(defun textsec-scripts (string)
"Return a list of scripts used in STRING."
(seq-map (lambda (char)
(elt textsec--char-scripts char))
string))
(defun textsec-single-script-p (string)
"Return non-nil if STRING is all in a single script.
Note that the concept of \"single script\" used by this function
isn't obvious -- some mixtures of scripts count as a \"single
script\". See
https://www.unicode.org/reports/tr39/#Mixed_Script_Detection
for details."
(let ((scripts (mapcar
(lambda (s)
(append s
;; Some scripts used in East Asia are
;; commonly used across borders, so we add
;; those.
(mapcan (lambda (script)
(copy-sequence
(textsec--augment-script script)))
s)))
(textsec-scripts string))))
(catch 'empty
(cl-loop for s1 in scripts
do (cl-loop for s2 in scripts
;; Common/inherited chars can be used in
;; text with all scripts.
when (and (not (memq 'common s1))
(not (memq 'common s2))
(not (memq 'inherited s1))
(not (memq 'inherited s2))
(not (seq-intersection s1 s2)))
do (throw 'empty nil)))
t)))
(defun textsec--augment-script (script)
(cond
((eq script 'han)
'(hangul japan korea))
((or (eq script 'hiragana)
(eq script 'katakana))
'(japan))
((or (eq script 'hangul)
(eq script 'bopomofo))
'(korea))))
(defun textsec-covering-scripts (string)
"Return a minimal list of scripts used in STRING.
Not that a string may have several different minimal cover sets."
(let* ((scripts (textsec-scripts string))
(set (car scripts)))
(dolist (s scripts)
(setq set (seq-union set (seq-difference s set))))
(sort (delq 'common (delq 'inherited set)) #'string<)))
(provide 'textsec)
;;; textsec.el ends here
|