summary | refs | log | tree | commit | diff
path: root/src/character.h
diff options
context:
space:
mode:
author Stefan Monnier <monnier@iro.umontreal.ca> 2022-09-25 16:15:16 -0400
committer Stefan Monnier <monnier@iro.umontreal.ca> 2022-09-25 16:15:16 -0400
commit 650c20f1ca4e07591a727e1cfcc74b3363d15985 (patch)
tree 85d11f6437cde22f410c25e0e5f71a3131ebd07d /src/character.h
parent 8869332684c2302b5ba1ead4568bbc7ba1c0183e (diff)
parent 4b85ae6a24380fb67a3315eaec9233f17a872473 (diff)
download emacs-650c20f1ca4e07591a727e1cfcc74b3363d15985.tar.gz
emacs-650c20f1ca4e07591a727e1cfcc74b3363d15985.tar.bz2
emacs-650c20f1ca4e07591a727e1cfcc74b3363d15985.zip
Merge 'master' into noverlay
Diffstat (limited to 'src/character.h')
-rw-r--r-- src/character.h 869
1 files changed, 381 insertions, 488 deletions
diff --git a/src/character.h b/src/character.h
index c716885d46b..6d0f035c2bb 100644
--- a/src/character.h
+++ b/src/character.h
@@ -31,34 +31,39 @@ INLINE_HEADER_BEGIN
/* character code 1st byte byte sequence
-------------- -------- -------------
0-7F 00..7F 0xxxxxxx
- 80-7FF C2..DF 110xxxxx 10xxxxxx
- 800-FFFF E0..EF 1110xxxx 10xxxxxx 10xxxxxx
- 10000-1FFFFF F0..F7 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- 200000-3FFF7F F8 11111000 1000xxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ 80-7FF C2..DF 110yyyyx 10xxxxxx
+ 800-FFFF E0..EF 1110yyyy 10yxxxxx 10xxxxxx
+ 10000-1FFFFF F0..F7 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
+ 200000-3FFF7F F8 11111000 1000yxxx 10xxxxxx 10xxxxxx 10xxxxxx
3FFF80-3FFFFF C0..C1 1100000x 10xxxxxx (for eight-bit-char)
400000-... invalid
invalid 1st byte 80..BF 10xxxxxx
- F9..FF 11111xxx (xxx != 000)
+ F9..FF 11111yyy
+
+ In each bit pattern, 'x' and 'y' each represent a single bit of the
+ character code payload, and at least one 'y' must be a 1 bit.
+ In the 5-byte sequence, the 22-bit payload cannot exceed 3FFF7F.
*/
/* Maximum character code ((1 << CHARACTERBITS) - 1). */
-#define MAX_CHAR 0x3FFFFF
+enum { MAX_CHAR = 0x3FFFFF };
/* Maximum Unicode character code. */
-#define MAX_UNICODE_CHAR 0x10FFFF
+enum { MAX_UNICODE_CHAR = 0x10FFFF };
/* Maximum N-byte character codes. */
-#define MAX_1_BYTE_CHAR 0x7F
-#define MAX_2_BYTE_CHAR 0x7FF
-#define MAX_3_BYTE_CHAR 0xFFFF
-#define MAX_4_BYTE_CHAR 0x1FFFFF
-#define MAX_5_BYTE_CHAR 0x3FFF7F
+enum { MAX_1_BYTE_CHAR = 0x7F };
+enum { MAX_2_BYTE_CHAR = 0x7FF };
+enum { MAX_3_BYTE_CHAR = 0xFFFF };
+enum { MAX_4_BYTE_CHAR = 0x1FFFFF };
+enum { MAX_5_BYTE_CHAR = 0x3FFF7F };
/* Minimum leading code of multibyte characters. */
-#define MIN_MULTIBYTE_LEADING_CODE 0xC0
-/* Maximum leading code of multibyte characters. */
-#define MAX_MULTIBYTE_LEADING_CODE 0xF8
+enum { MIN_MULTIBYTE_LEADING_CODE = 0xC0 };
+/* Maximum leading code of multibyte characters. Note: this must be
+ updated if we ever increase MAX_CHAR above. */
+enum { MAX_MULTIBYTE_LEADING_CODE = 0xF8 };
/* Unicode character values. */
enum
@@ -77,534 +82,436 @@ enum
LEFT_ANGLE_BRACKET = 0x3008,
RIGHT_ANGLE_BRACKET = 0x3009,
OBJECT_REPLACEMENT_CHARACTER = 0xFFFC,
+ TAG_SPACE = 0xE0020,
+ CANCEL_TAG = 0xE007F,
};
+extern int char_string (unsigned, unsigned char *);
+
/* UTF-8 encodings. Use \x escapes, so they are portable to pre-C11
compilers and can be concatenated with ordinary string literals. */
#define uLSQM "\xE2\x80\x98" /* U+2018 LEFT SINGLE QUOTATION MARK */
#define uRSQM "\xE2\x80\x99" /* U+2019 RIGHT SINGLE QUOTATION MARK */
-/* Nonzero iff C is a character that corresponds to a raw 8-bit
+/* True iff C is a character of code less than 0x100. */
+INLINE bool
+SINGLE_BYTE_CHAR_P (intmax_t c)
+{
+ return 0 <= c && c < 0x100;
+}
+
+/* True iff C is a character that corresponds to a raw 8-bit
byte. */
-#define CHAR_BYTE8_P(c) ((c) > MAX_5_BYTE_CHAR)
+INLINE bool
+CHAR_BYTE8_P (int c)
+{
+ return MAX_5_BYTE_CHAR < c;
+}
/* Return the character code for raw 8-bit byte BYTE. */
-#define BYTE8_TO_CHAR(byte) ((byte) + 0x3FFF00)
+INLINE int
+BYTE8_TO_CHAR (int byte)
+{
+ return byte + 0x3FFF00;
+}
-#define UNIBYTE_TO_CHAR(byte) \
- (ASCII_CHAR_P (byte) ? (byte) : BYTE8_TO_CHAR (byte))
+INLINE int
+UNIBYTE_TO_CHAR (int byte)
+{
+ return ASCII_CHAR_P (byte) ? byte : BYTE8_TO_CHAR (byte);
+}
/* Return the raw 8-bit byte for character C. */
-#define CHAR_TO_BYTE8(c) (CHAR_BYTE8_P (c) ? (c) - 0x3FFF00 : (c & 0xFF))
+INLINE int
+CHAR_TO_BYTE8 (int c)
+{
+ return CHAR_BYTE8_P (c) ? c - 0x3FFF00 : c & 0xFF;
+}
/* Return the raw 8-bit byte for character C,
or -1 if C doesn't correspond to a byte. */
-#define CHAR_TO_BYTE_SAFE(c) \
- (ASCII_CHAR_P (c) ? c : (CHAR_BYTE8_P (c) ? (c) - 0x3FFF00 : -1))
+INLINE int
+CHAR_TO_BYTE_SAFE (int c)
+{
+ return ASCII_CHAR_P (c) ? c : CHAR_BYTE8_P (c) ? c - 0x3FFF00 : -1;
+}
-/* Nonzero iff BYTE is the 1st byte of a multibyte form of a character
+/* True iff BYTE is the 1st byte of a multibyte form of a character
that corresponds to a raw 8-bit byte. */
-#define CHAR_BYTE8_HEAD_P(byte) ((byte) == 0xC0 || (byte) == 0xC1)
-
-/* If C is not ASCII, make it unibyte. */
-#define MAKE_CHAR_UNIBYTE(c) \
- do { \
- if (! ASCII_CHAR_P (c)) \
- c = CHAR_TO_BYTE8 (c); \
- } while (false)
-
+INLINE bool
+CHAR_BYTE8_HEAD_P (int byte)
+{
+ return byte == 0xC0 || byte == 0xC1;
+}
/* If C is not ASCII, make it multibyte. Assumes C < 256. */
-#define MAKE_CHAR_MULTIBYTE(c) \
- (eassert ((c) >= 0 && (c) < 256), (c) = UNIBYTE_TO_CHAR (c))
+INLINE int
+make_char_multibyte (int c)
+{
+ eassert (SINGLE_BYTE_CHAR_P (c));
+ return UNIBYTE_TO_CHAR (c);
+}
/* This is the maximum byte length of multibyte form. */
-#define MAX_MULTIBYTE_LENGTH 5
-
-/* Nonzero iff X is a character. */
-#define CHARACTERP(x) (NATNUMP (x) && XFASTINT (x) <= MAX_CHAR)
+enum { MAX_MULTIBYTE_LENGTH = 5 };
/* Nonzero iff C is valid as a character code. */
-#define CHAR_VALID_P(c) UNSIGNED_CMP (c, <=, MAX_CHAR)
+INLINE bool
+CHAR_VALID_P (intmax_t c)
+{
+ return 0 <= c && c <= MAX_CHAR;
+}
-/* Check if Lisp object X is a character or not. */
-#define CHECK_CHARACTER(x) \
- CHECK_TYPE (CHARACTERP (x), Qcharacterp, x)
+/* Nonzero iff X is a character. */
+INLINE bool
+CHARACTERP (Lisp_Object x)
+{
+ return FIXNUMP (x) && CHAR_VALID_P (XFIXNUM (x));
+}
-#define CHECK_CHARACTER_CAR(x) \
- do { \
- Lisp_Object tmp = XCAR (x); \
- CHECK_CHARACTER (tmp); \
- } while (false)
+/* Check if Lisp object X is a character or not. */
+INLINE void
+CHECK_CHARACTER (Lisp_Object x)
+{
+ CHECK_TYPE (CHARACTERP (x), Qcharacterp, x);
+}
-#define CHECK_CHARACTER_CDR(x) \
- do { \
- Lisp_Object tmp = XCDR (x); \
- CHECK_CHARACTER (tmp); \
- } while (false)
+INLINE void
+CHECK_CHARACTER_CAR (Lisp_Object x)
+{
+ CHECK_CHARACTER (XCAR (x));
+}
-/* Nonzero iff C is a character of code less than 0x100. */
-#define SINGLE_BYTE_CHAR_P(c) UNSIGNED_CMP (c, <, 0x100)
+INLINE void
+CHECK_CHARACTER_CDR (Lisp_Object x)
+{
+ CHECK_CHARACTER (XCDR (x));
+}
-/* Nonzero if character C has a printable glyph. */
-#define CHAR_PRINTABLE_P(c) \
- (((c) >= 32 && (c) < 127) \
- || ! NILP (CHAR_TABLE_REF (Vprintable_chars, (c))))
+/* True if character C has a printable glyph. */
+INLINE bool
+CHAR_PRINTABLE_P (int c)
+{
+ return ((32 <= c && c < 127)
+ || ! NILP (CHAR_TABLE_REF (Vprintable_chars, c)));
+}
/* Return byte length of multibyte form for character C. */
-#define CHAR_BYTES(c) \
- ( (c) <= MAX_1_BYTE_CHAR ? 1 \
- : (c) <= MAX_2_BYTE_CHAR ? 2 \
- : (c) <= MAX_3_BYTE_CHAR ? 3 \
- : (c) <= MAX_4_BYTE_CHAR ? 4 \
- : (c) <= MAX_5_BYTE_CHAR ? 5 \
- : 2)
-
+INLINE int
+CHAR_BYTES (int c)
+{
+ return ((MAX_5_BYTE_CHAR < c ? -2 : 1)
+ + (MAX_1_BYTE_CHAR < c)
+ + (MAX_2_BYTE_CHAR < c)
+ + (MAX_3_BYTE_CHAR < c)
+ + (MAX_4_BYTE_CHAR < c));
+}
/* Return the leading code of multibyte form of C. */
-#define CHAR_LEADING_CODE(c) \
- ((c) <= MAX_1_BYTE_CHAR ? c \
- : (c) <= MAX_2_BYTE_CHAR ? (0xC0 | ((c) >> 6)) \
- : (c) <= MAX_3_BYTE_CHAR ? (0xE0 | ((c) >> 12)) \
- : (c) <= MAX_4_BYTE_CHAR ? (0xF0 | ((c) >> 18)) \
- : (c) <= MAX_5_BYTE_CHAR ? 0xF8 \
- : (0xC0 | (((c) >> 6) & 0x01)))
+INLINE int
+CHAR_LEADING_CODE (int c)
+{
+ return (c <= MAX_1_BYTE_CHAR ? c
+ : c <= MAX_2_BYTE_CHAR ? 0xC0 | (c >> 6)
+ : c <= MAX_3_BYTE_CHAR ? 0xE0 | (c >> 12)
+ : c <= MAX_4_BYTE_CHAR ? 0xF0 | (c >> 18)
+ : c <= MAX_5_BYTE_CHAR ? 0xF8
+ : 0xC0 | ((c >> 6) & 0x01));
+}
/* Store multibyte form of the character C in P. The caller should
allocate at least MAX_MULTIBYTE_LENGTH bytes area at P in advance.
Returns the length of the multibyte form. */
-#define CHAR_STRING(c, p) \
- (UNSIGNED_CMP (c, <=, MAX_1_BYTE_CHAR) \
- ? ((p)[0] = (c), \
- 1) \
- : UNSIGNED_CMP (c, <=, MAX_2_BYTE_CHAR) \
- ? ((p)[0] = (0xC0 | ((c) >> 6)), \
- (p)[1] = (0x80 | ((c) & 0x3F)), \
- 2) \
- : UNSIGNED_CMP (c, <=, MAX_3_BYTE_CHAR) \
- ? ((p)[0] = (0xE0 | ((c) >> 12)), \
- (p)[1] = (0x80 | (((c) >> 6) & 0x3F)), \
- (p)[2] = (0x80 | ((c) & 0x3F)), \
- 3) \
- : verify_expr (sizeof (c) <= sizeof (unsigned), char_string (c, p)))
+INLINE int
+CHAR_STRING (int c, unsigned char *p)
+{
+ eassume (0 <= c);
+ if (c <= MAX_1_BYTE_CHAR)
+ {
+ p[0] = c;
+ return 1;
+ }
+ if (c <= MAX_2_BYTE_CHAR)
+ {
+ p[0] = 0xC0 | (c >> 6);
+ p[1] = 0x80 | (c & 0x3F);
+ return 2;
+ }
+ if (c <= MAX_3_BYTE_CHAR)
+ {
+ p[0] = 0xE0 | (c >> 12);
+ p[1] = 0x80 | ((c >> 6) & 0x3F);
+ p[2] = 0x80 | (c & 0x3F);
+ return 3;
+ }
+ int len = char_string (c, p);
+ eassume (0 < len && len <= MAX_MULTIBYTE_LENGTH);
+ return len;
+}
/* Store multibyte form of byte B in P. The caller should allocate at
least MAX_MULTIBYTE_LENGTH bytes area at P in advance. Returns the
length of the multibyte form. */
-#define BYTE8_STRING(b, p) \
- ((p)[0] = (0xC0 | (((b) >> 6) & 0x01)), \
- (p)[1] = (0x80 | ((b) & 0x3F)), \
- 2)
-
-
-/* Store multibyte form of the character C in P and advance P to the
- end of the multibyte form. The caller should allocate at least
- MAX_MULTIBYTE_LENGTH bytes area at P in advance. */
-
-#define CHAR_STRING_ADVANCE(c, p) \
- do { \
- if ((c) <= MAX_1_BYTE_CHAR) \
- *(p)++ = (c); \
- else if ((c) <= MAX_2_BYTE_CHAR) \
- *(p)++ = (0xC0 | ((c) >> 6)), \
- *(p)++ = (0x80 | ((c) & 0x3F)); \
- else if ((c) <= MAX_3_BYTE_CHAR) \
- *(p)++ = (0xE0 | ((c) >> 12)), \
- *(p)++ = (0x80 | (((c) >> 6) & 0x3F)), \
- *(p)++ = (0x80 | ((c) & 0x3F)); \
- else \
- { \
- verify (sizeof (c) <= sizeof (unsigned)); \
- (p) += char_string (c, p); \
- } \
- } while (false)
-
-
-/* Nonzero iff BYTE starts a non-ASCII character in a multibyte
- form. */
-#define LEADING_CODE_P(byte) (((byte) & 0xC0) == 0xC0)
-
-/* Nonzero iff BYTE is a trailing code of a non-ASCII character in a
+INLINE int
+BYTE8_STRING (int b, unsigned char *p)
+{
+ p[0] = 0xC0 | ((b >> 6) & 0x01);
+ p[1] = 0x80 | (b & 0x3F);
+ return 2;
+}
+
+
+/* True iff BYTE starts a non-ASCII character in a multibyte form. */
+INLINE bool
+LEADING_CODE_P (int byte)
+{
+ return (byte & 0xC0) == 0xC0;
+}
+
+/* True iff BYTE is a trailing code of a non-ASCII character in a
multibyte form. */
-#define TRAILING_CODE_P(byte) (((byte) & 0xC0) == 0x80)
+INLINE bool
+TRAILING_CODE_P (int byte)
+{
+ return (byte & 0xC0) == 0x80;
+}
-/* Nonzero iff BYTE starts a character in a multibyte form.
+/* True iff BYTE starts a character in a multibyte form.
This is equivalent to:
(ASCII_CHAR_P (byte) || LEADING_CODE_P (byte)) */
-#define CHAR_HEAD_P(byte) (((byte) & 0xC0) != 0x80)
+INLINE bool
+CHAR_HEAD_P (int byte)
+{
+ return (byte & 0xC0) != 0x80;
+}
/* How many bytes a character that starts with BYTE occupies in a
- multibyte form. Unlike MULTIBYTE_LENGTH below, this macro does not
+ multibyte form. Unlike multibyte_length, this function does not
validate the multibyte form, but looks only at its first byte. */
-#define BYTES_BY_CHAR_HEAD(byte) \
- (!((byte) & 0x80) ? 1 \
- : !((byte) & 0x20) ? 2 \
- : !((byte) & 0x10) ? 3 \
- : !((byte) & 0x08) ? 4 \
- : 5)
+INLINE int
+BYTES_BY_CHAR_HEAD (int byte)
+{
+ return (!(byte & 0x80) ? 1
+ : !(byte & 0x20) ? 2
+ : !(byte & 0x10) ? 3
+ : !(byte & 0x08) ? 4
+ : 5);
+}
-/* The byte length of multibyte form at unibyte string P ending at
- PEND. If the string doesn't point to a valid multibyte form,
- return 0. Unlike BYTES_BY_CHAR_HEAD, this macro validates the
- multibyte form. */
+/* The byte length of the multibyte form at the unibyte string P,
+ ending at PEND if CHECK, and without a length check if !CHECK.
+ If ALLOW_8BIT, allow multibyte forms of eight-bit characters.
+ If the string doesn't point to a valid multibyte form, return 0.
+ Unlike BYTES_BY_CHAR_HEAD, this function validates the multibyte form. */
-#define MULTIBYTE_LENGTH(p, pend) \
- (p >= pend ? 0 \
- : !((p)[0] & 0x80) ? 1 \
- : ((p + 1 >= pend) || (((p)[1] & 0xC0) != 0x80)) ? 0 \
- : ((p)[0] & 0xE0) == 0xC0 ? 2 \
- : ((p + 2 >= pend) || (((p)[2] & 0xC0) != 0x80)) ? 0 \
- : ((p)[0] & 0xF0) == 0xE0 ? 3 \
- : ((p + 3 >= pend) || (((p)[3] & 0xC0) != 0x80)) ? 0 \
- : ((p)[0] & 0xF8) == 0xF0 ? 4 \
- : ((p + 4 >= pend) || (((p)[4] & 0xC0) != 0x80)) ? 0 \
- : (p)[0] == 0xF8 && ((p)[1] & 0xF0) == 0x80 ? 5 \
- : 0)
-
-
-/* Like MULTIBYTE_LENGTH, but don't check the ending address. The
- multibyte form is still validated, unlike BYTES_BY_CHAR_HEAD. */
-
-#define MULTIBYTE_LENGTH_NO_CHECK(p) \
- (!((p)[0] & 0x80) ? 1 \
- : ((p)[1] & 0xC0) != 0x80 ? 0 \
- : ((p)[0] & 0xE0) == 0xC0 ? 2 \
- : ((p)[2] & 0xC0) != 0x80 ? 0 \
- : ((p)[0] & 0xF0) == 0xE0 ? 3 \
- : ((p)[3] & 0xC0) != 0x80 ? 0 \
- : ((p)[0] & 0xF8) == 0xF0 ? 4 \
- : ((p)[4] & 0xC0) != 0x80 ? 0 \
- : (p)[0] == 0xF8 && ((p)[1] & 0xF0) == 0x80 ? 5 \
- : 0)
-
-/* If P is before LIMIT, advance P to the next character boundary.
+INLINE int
+multibyte_length (unsigned char const *p, unsigned char const *pend,
+ bool check, bool allow_8bit)
+{
+ if (!check || p < pend)
+ {
+ unsigned char c = p[0];
+ if (c < 0x80)
+ return 1;
+ if (!check || p + 1 < pend)
+ {
+ unsigned char d = p[1];
+ int w = ((d & 0xC0) << 2) + c;
+ if ((allow_8bit ? 0x2C0 : 0x2C2) <= w && w <= 0x2DF)
+ return 2;
+ if (!check || p + 2 < pend)
+ {
+ unsigned char e = p[2];
+ w += (e & 0xC0) << 4;
+ int w1 = w | ((d & 0x20) >> 2);
+ if (0xAE1 <= w1 && w1 <= 0xAEF)
+ return 3;
+ if (!check || p + 3 < pend)
+ {
+ unsigned char f = p[3];
+ w += (f & 0xC0) << 6;
+ int w2 = w | ((d & 0x30) >> 3);
+ if (0x2AF1 <= w2 && w2 <= 0x2AF7)
+ return 4;
+ if (!check || p + 4 < pend)
+ {
+ int_fast64_t lw = w + ((p[4] & 0xC0) << 8),
+ w3 = (lw << 24) + (d << 16) + (e << 8) + f;
+ if (0xAAF8888080 <= w3 && w3 <= 0xAAF88FBFBD)
+ return 5;
+ }
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+/* Return number of bytes in the multibyte character just before P.
Assumes that P is already at a character boundary of the same
- multibyte form whose end address is LIMIT. */
+ multibyte form, and is not at the start of that form. */
+
+INLINE int
+raw_prev_char_len (unsigned char const *p)
+{
+ for (int len = 1; ; len++)
+ if (CHAR_HEAD_P (p[-len]))
+ return len;
+}
-#define NEXT_CHAR_BOUNDARY(p, limit) \
- do { \
- if ((p) < (limit)) \
- (p) += BYTES_BY_CHAR_HEAD (*(p)); \
- } while (false)
+/* Return the character code of character whose multibyte form is at P,
+ and set *LENGTH to its length. */
-/* If P is after LIMIT, advance P to the previous character boundary.
- Assumes that P is already at a character boundary of the same
- multibyte form whose beginning address is LIMIT. */
-
-#define PREV_CHAR_BOUNDARY(p, limit) \
- do { \
- if ((p) > (limit)) \
- { \
- const unsigned char *chp = (p); \
- do { \
- chp--; \
- } while (chp >= limit && ! CHAR_HEAD_P (*chp)); \
- (p) = (BYTES_BY_CHAR_HEAD (*chp) == (p) - chp) ? chp : (p) - 1; \
- } \
- } while (false)
+INLINE int
+string_char_and_length (unsigned char const *p, int *length)
+{
+ int c = p[0];
+ if (! (c & 0x80))
+ {
+ *length = 1;
+ return c;
+ }
+ eassume (0xC0 <= c);
+
+ int d = (c << 6) + p[1] - ((0xC0 << 6) + 0x80);
+ if (! (c & 0x20))
+ {
+ *length = 2;
+ return d + (c < 0xC2 ? 0x3FFF80 : 0);
+ }
+
+ d = (d << 6) + p[2] - ((0x20 << 12) + 0x80);
+ if (! (c & 0x10))
+ {
+ *length = 3;
+ eassume (MAX_2_BYTE_CHAR < d && d <= MAX_3_BYTE_CHAR);
+ return d;
+ }
+
+ d = (d << 6) + p[3] - ((0x10 << 18) + 0x80);
+ if (! (c & 0x08))
+ {
+ *length = 4;
+ eassume (MAX_3_BYTE_CHAR < d && d <= MAX_4_BYTE_CHAR);
+ return d;
+ }
+
+ d = (d << 6) + p[4] - ((0x08 << 24) + 0x80);
+ *length = 5;
+ eassume (MAX_4_BYTE_CHAR < d && d <= MAX_5_BYTE_CHAR);
+ return d;
+}
/* Return the character code of character whose multibyte form is at P. */
-#define STRING_CHAR(p) \
- (!((p)[0] & 0x80) \
- ? (p)[0] \
- : ! ((p)[0] & 0x20) \
- ? (((((p)[0] & 0x1F) << 6) \
- | ((p)[1] & 0x3F)) \
- + (((unsigned char) (p)[0]) < 0xC2 ? 0x3FFF80 : 0)) \
- : ! ((p)[0] & 0x10) \
- ? ((((p)[0] & 0x0F) << 12) \
- | (((p)[1] & 0x3F) << 6) \
- | ((p)[2] & 0x3F)) \
- : string_char ((p), NULL, NULL))
-
-
-/* Like STRING_CHAR, but set ACTUAL_LEN to the length of multibyte
- form. */
-
-#define STRING_CHAR_AND_LENGTH(p, actual_len) \
- (!((p)[0] & 0x80) \
- ? ((actual_len) = 1, (p)[0]) \
- : ! ((p)[0] & 0x20) \
- ? ((actual_len) = 2, \
- (((((p)[0] & 0x1F) << 6) \
- | ((p)[1] & 0x3F)) \
- + (((unsigned char) (p)[0]) < 0xC2 ? 0x3FFF80 : 0))) \
- : ! ((p)[0] & 0x10) \
- ? ((actual_len) = 3, \
- ((((p)[0] & 0x0F) << 12) \
- | (((p)[1] & 0x3F) << 6) \
- | ((p)[2] & 0x3F))) \
- : string_char ((p), NULL, &actual_len))
-
-
-/* Like STRING_CHAR, but advance P to the end of multibyte form. */
-
-#define STRING_CHAR_ADVANCE(p) \
- (!((p)[0] & 0x80) \
- ? *(p)++ \
- : ! ((p)[0] & 0x20) \
- ? ((p) += 2, \
- ((((p)[-2] & 0x1F) << 6) \
- | ((p)[-1] & 0x3F) \
- | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0))) \
- : ! ((p)[0] & 0x10) \
- ? ((p) += 3, \
- ((((p)[-3] & 0x0F) << 12) \
- | (((p)[-2] & 0x3F) << 6) \
- | ((p)[-1] & 0x3F))) \
- : string_char ((p), &(p), NULL))
-
-
-/* Fetch the "next" character from Lisp string STRING at byte position
- BYTEIDX, character position CHARIDX. Store it into OUTPUT.
-
- All the args must be side-effect-free.
- BYTEIDX and CHARIDX must be lvalues;
- we increment them past the character fetched. */
-
-#define FETCH_STRING_CHAR_ADVANCE(OUTPUT, STRING, CHARIDX, BYTEIDX) \
- do \
- { \
- CHARIDX++; \
- if (STRING_MULTIBYTE (STRING)) \
- { \
- unsigned char *chp = &SDATA (STRING)[BYTEIDX]; \
- int chlen; \
- \
- OUTPUT = STRING_CHAR_AND_LENGTH (chp, chlen); \
- BYTEIDX += chlen; \
- } \
- else \
- { \
- OUTPUT = SREF (STRING, BYTEIDX); \
- BYTEIDX++; \
- } \
- } \
- while (false)
-
-/* Like FETCH_STRING_CHAR_ADVANCE, but return a multibyte character
- even if STRING is unibyte. */
+INLINE int
+STRING_CHAR (unsigned char const *p)
+{
+ int len;
+ return string_char_and_length (p, &len);
+}
+
+
+/* Like STRING_CHAR (*PP), but advance *PP to the end of multibyte form. */
-#define FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE(OUTPUT, STRING, CHARIDX, BYTEIDX) \
- do \
- { \
- CHARIDX++; \
- if (STRING_MULTIBYTE (STRING)) \
- { \
- unsigned char *chp = &SDATA (STRING)[BYTEIDX]; \
- int chlen; \
- \
- OUTPUT = STRING_CHAR_AND_LENGTH (chp, chlen); \
- BYTEIDX += chlen; \
- } \
- else \
- { \
- OUTPUT = SREF (STRING, BYTEIDX); \
- BYTEIDX++; \
- MAKE_CHAR_MULTIBYTE (OUTPUT); \
- } \
- } \
- while (false)
-
-
-/* Like FETCH_STRING_CHAR_ADVANCE, but assumes STRING is multibyte. */
-
-#define FETCH_STRING_CHAR_ADVANCE_NO_CHECK(OUTPUT, STRING, CHARIDX, BYTEIDX) \
- do \
- { \
- unsigned char *fetch_ptr = &SDATA (STRING)[BYTEIDX]; \
- int fetch_len; \
- \
- OUTPUT = STRING_CHAR_AND_LENGTH (fetch_ptr, fetch_len); \
- BYTEIDX += fetch_len; \
- CHARIDX++; \
- } \
- while (false)
-
-
-/* Like FETCH_STRING_CHAR_ADVANCE, but fetch character from the current
- buffer. */
-
-#define FETCH_CHAR_ADVANCE(OUTPUT, CHARIDX, BYTEIDX) \
- do \
- { \
- CHARIDX++; \
- if (!NILP (BVAR (current_buffer, enable_multibyte_characters))) \
- { \
- unsigned char *chp = BYTE_POS_ADDR (BYTEIDX); \
- int chlen; \
- \
- OUTPUT = STRING_CHAR_AND_LENGTH (chp, chlen); \
- BYTEIDX += chlen; \
- } \
- else \
- { \
- OUTPUT = *(BYTE_POS_ADDR (BYTEIDX)); \
- BYTEIDX++; \
- } \
- } \
- while (false)
-
-
-/* Like FETCH_CHAR_ADVANCE, but assumes the current buffer is multibyte. */
-
-#define FETCH_CHAR_ADVANCE_NO_CHECK(OUTPUT, CHARIDX, BYTEIDX) \
- do \
- { \
- unsigned char *chp = BYTE_POS_ADDR (BYTEIDX); \
- int chlen; \
- \
- OUTPUT = STRING_CHAR_AND_LENGTH (chp, chlen); \
- BYTEIDX += chlen; \
- CHARIDX++; \
- } \
- while (false)
-
-
-/* Increment the buffer byte position POS_BYTE of the current buffer to
- the next character boundary. No range checking of POS. */
-
-#define INC_POS(pos_byte) \
- do { \
- unsigned char *chp = BYTE_POS_ADDR (pos_byte); \
- pos_byte += BYTES_BY_CHAR_HEAD (*chp); \
- } while (false)
-
-
-/* Decrement the buffer byte position POS_BYTE of the current buffer to
- the previous character boundary. No range checking of POS. */
-
-#define DEC_POS(pos_byte) \
- do { \
- unsigned char *chp; \
- \
- pos_byte--; \
- if (pos_byte < GPT_BYTE) \
- chp = BEG_ADDR + pos_byte - BEG_BYTE; \
- else \
- chp = BEG_ADDR + GAP_SIZE + pos_byte - BEG_BYTE; \
- while (!CHAR_HEAD_P (*chp)) \
- { \
- chp--; \
- pos_byte--; \
- } \
- } while (false)
-
-/* Increment both CHARPOS and BYTEPOS, each in the appropriate way. */
-
-#define INC_BOTH(charpos, bytepos) \
- do \
- { \
- (charpos)++; \
- if (NILP (BVAR (current_buffer, enable_multibyte_characters))) \
- (bytepos)++; \
- else \
- INC_POS ((bytepos)); \
- } \
- while (false)
-
-
-/* Decrement both CHARPOS and BYTEPOS, each in the appropriate way. */
-
-#define DEC_BOTH(charpos, bytepos) \
- do \
- { \
- (charpos)--; \
- if (NILP (BVAR (current_buffer, enable_multibyte_characters))) \
- (bytepos)--; \
- else \
- DEC_POS ((bytepos)); \
- } \
- while (false)
-
-
-/* Increment the buffer byte position POS_BYTE of the current buffer to
- the next character boundary. This macro relies on the fact that
- *GPT_ADDR and *Z_ADDR are always accessible and the values are
- '\0'. No range checking of POS_BYTE. */
-
-#define BUF_INC_POS(buf, pos_byte) \
- do { \
- unsigned char *chp = BUF_BYTE_ADDRESS (buf, pos_byte); \
- pos_byte += BYTES_BY_CHAR_HEAD (*chp); \
- } while (false)
-
-
-/* Decrement the buffer byte position POS_BYTE of the current buffer to
- the previous character boundary. No range checking of POS_BYTE. */
-
-#define BUF_DEC_POS(buf, pos_byte) \
- do { \
- unsigned char *chp; \
- pos_byte--; \
- if (pos_byte < BUF_GPT_BYTE (buf)) \
- chp = BUF_BEG_ADDR (buf) + pos_byte - BEG_BYTE; \
- else \
- chp = BUF_BEG_ADDR (buf) + BUF_GAP_SIZE (buf) + pos_byte - BEG_BYTE;\
- while (!CHAR_HEAD_P (*chp)) \
- { \
- chp--; \
- pos_byte--; \
- } \
- } while (false)
-
-
-/* Return a non-outlandish value for the tab width. */
-
-#define SANE_TAB_WIDTH(buf) \
- sanitize_tab_width (XFASTINT (BVAR (buf, tab_width)))
INLINE int
-sanitize_tab_width (EMACS_INT width)
+string_char_advance (unsigned char const **pp)
{
- return 0 < width && width <= 1000 ? width : 8;
+ unsigned char const *p = *pp;
+ int len, c = string_char_and_length (p, &len);
+ *pp = p + len;
+ return c;
}
-/* Return the width of ASCII character C. The width is measured by
- how many columns C will occupy on the screen when displayed in the
- current buffer. */
-#define ASCII_CHAR_WIDTH(c) \
- (c < 0x20 \
- ? (c == '\t' \
- ? SANE_TAB_WIDTH (current_buffer) \
- : (c == '\n' ? 0 : (NILP (BVAR (current_buffer, ctl_arrow)) ? 4 : 2))) \
- : (c < 0x7f \
- ? 1 \
- : ((NILP (BVAR (current_buffer, ctl_arrow)) ? 4 : 2))))
+/* Return the next character from Lisp string STRING at byte position
+ *BYTEIDX, character position *CHARIDX. Update *BYTEIDX and
+ *CHARIDX past the character fetched. */
+
+INLINE int
+fetch_string_char_advance (Lisp_Object string,
+ ptrdiff_t *charidx, ptrdiff_t *byteidx)
+{
+ int output;
+ ptrdiff_t b = *byteidx;
+ unsigned char *chp = SDATA (string) + b;
+ if (STRING_MULTIBYTE (string))
+ {
+ int chlen;
+ output = string_char_and_length (chp, &chlen);
+ b += chlen;
+ }
+ else
+ {
+ output = *chp;
+ b++;
+ }
+ (*charidx)++;
+ *byteidx = b;
+ return output;
+}
-/* Return a non-outlandish value for a character width. */
+/* Like fetch_string_char_advance, but return a multibyte character
+ even if STRING is unibyte. */
INLINE int
-sanitize_char_width (EMACS_INT width)
+fetch_string_char_as_multibyte_advance (Lisp_Object string,
+ ptrdiff_t *charidx, ptrdiff_t *byteidx)
{
- return 0 <= width && width <= 1000 ? width : 1000;
+ int output;
+ ptrdiff_t b = *byteidx;
+ unsigned char *chp = SDATA (string) + b;
+ if (STRING_MULTIBYTE (string))
+ {
+ int chlen;
+ output = string_char_and_length (chp, &chlen);
+ b += chlen;
+ }
+ else
+ {
+ output = make_char_multibyte (*chp);
+ b++;
+ }
+ (*charidx)++;
+ *byteidx = b;
+ return output;
}
-/* Return the width of character C. The width is measured by how many
- columns C will occupy on the screen when displayed in the current
- buffer. The name CHARACTER_WIDTH avoids a collision with <limits.h>
- CHAR_WIDTH when enabled; see ISO/IEC TS 18661-1:2014. */
-#define CHARACTER_WIDTH(c) \
- (ASCII_CHAR_P (c) \
- ? ASCII_CHAR_WIDTH (c) \
- : sanitize_char_width (XINT (CHAR_TABLE_REF (Vchar_width_table, c))))
+/* Like fetch_string_char_advance, but assumes STRING is multibyte. */
+
+INLINE int
+fetch_string_char_advance_no_check (Lisp_Object string,
+ ptrdiff_t *charidx, ptrdiff_t *byteidx)
+{
+ ptrdiff_t b = *byteidx;
+ unsigned char *chp = SDATA (string) + b;
+ int chlen, output = string_char_and_length (chp, &chlen);
+ (*charidx)++;
+ *byteidx = b + chlen;
+ return output;
+}
+
/* If C is a variation selector, return the index of the
variation selector (1..256). Otherwise, return 0. */
-#define CHAR_VARIATION_SELECTOR_P(c) \
- ((c) < 0xFE00 ? 0 \
- : (c) <= 0xFE0F ? (c) - 0xFE00 + 1 \
- : (c) < 0xE0100 ? 0 \
- : (c) <= 0xE01EF ? (c) - 0xE0100 + 17 \
- : 0)
+INLINE int
+CHAR_VARIATION_SELECTOR_P (int c)
+{
+ return (c < 0xFE00 ? 0
+ : c <= 0xFE0F ? c - 0xFE00 + 1
+ : c < 0xE0100 ? 0
+ : c <= 0xE01EF ? c - 0xE0100 + 17
+ : 0);
+}
/* Return true if C is a surrogate. */
@@ -655,23 +562,19 @@ typedef enum {
} unicode_category_t;
extern EMACS_INT char_resolve_modifier_mask (EMACS_INT) ATTRIBUTE_CONST;
-extern int char_string (unsigned, unsigned char *);
-extern int string_char (const unsigned char *,
- const unsigned char **, int *);
extern int translate_char (Lisp_Object, int c);
extern ptrdiff_t count_size_as_multibyte (const unsigned char *, ptrdiff_t);
extern ptrdiff_t str_as_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t,
ptrdiff_t *);
-extern ptrdiff_t str_to_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t);
+extern ptrdiff_t str_to_multibyte (unsigned char *dst, const unsigned char *src,
+ ptrdiff_t nchars);
extern ptrdiff_t str_as_unibyte (unsigned char *, ptrdiff_t);
-extern ptrdiff_t str_to_unibyte (const unsigned char *, unsigned char *,
- ptrdiff_t);
extern ptrdiff_t strwidth (const char *, ptrdiff_t);
extern ptrdiff_t c_string_width (const unsigned char *, ptrdiff_t, int,
ptrdiff_t *, ptrdiff_t *);
-extern ptrdiff_t lisp_string_width (Lisp_Object, ptrdiff_t,
- ptrdiff_t *, ptrdiff_t *);
+extern ptrdiff_t lisp_string_width (Lisp_Object, ptrdiff_t, ptrdiff_t,
+ ptrdiff_t, ptrdiff_t *, ptrdiff_t *, bool);
extern Lisp_Object Vchar_unify_table;
extern Lisp_Object string_escape_byte8 (Lisp_Object);
@@ -681,10 +584,7 @@ extern bool alphanumericp (int);
extern bool graphicp (int);
extern bool printablep (int);
extern bool blankp (int);
-
-/* Return a translation table of id number ID. */
-#define GET_TRANSLATION_TABLE(id) \
- (XCDR (XVECTOR (Vtranslation_table_vector)->contents[(id)]))
+extern bool graphic_base_p (int);
/* Look up the element in char table OBJ at index CH, and return it as
an integer. If the element is not a character, return CH itself. */
@@ -693,21 +593,14 @@ INLINE int
char_table_translate (Lisp_Object obj, int ch)
{
/* This internal function is expected to be called with valid arguments,
- so there is a eassert instead of CHECK_xxx for the sake of speed. */
+ so there is an eassert instead of CHECK_xxx for the sake of speed. */
eassert (CHAR_VALID_P (ch));
eassert (CHAR_TABLE_P (obj));
obj = CHAR_TABLE_REF (obj, ch);
- return CHARACTERP (obj) ? XINT (obj) : ch;
+ return CHARACTERP (obj) ? XFIXNUM (obj) : ch;
}
-#if defined __GNUC__ && !defined __STRICT_ANSI__
-# define HEXDIGIT_CONST const
-# define HEXDIGIT_IS_CONST true
-#else
-# define HEXDIGIT_CONST
-# define HEXDIGIT_IS_CONST false
-#endif
-extern signed char HEXDIGIT_CONST hexdigit[];
+extern signed char const hexdigit[];
/* If C is a hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F'), return its
value (0-15). Otherwise return -1. */
@@ -715,7 +608,7 @@ extern signed char HEXDIGIT_CONST hexdigit[];
INLINE int
char_hexdigit (int c)
{
- return 0 <= c && c <= UCHAR_MAX ? hexdigit[c] : -1;
+ return 0 <= c && c <= UCHAR_MAX ? hexdigit[c] - 1 : -1;
}
INLINE_HEADER_END