diff options
author | Mattias Engdegård <mattiase@acm.org> | 2022-07-11 10:34:40 +0200 |
---|---|---|
committer | Mattias Engdegård <mattiase@acm.org> | 2022-07-11 10:38:49 +0200 |
commit | 69b68099ecfb053ac77e0a954ab7467c440321ff (patch) | |
tree | 57f810ec7d6addf847ca4b1c70ff5d0cf5d02e21 | |
parent | 96846877930f580e122e9af85b4653918c542f89 (diff) | |
download | emacs-69b68099ecfb053ac77e0a954ab7467c440321ff.tar.gz emacs-69b68099ecfb053ac77e0a954ab7467c440321ff.tar.bz2 emacs-69b68099ecfb053ac77e0a954ab7467c440321ff.zip |
Simplify and speed up string-to-multibyte
* src/character.h (str_to_multibyte):
* src/character.c (str_to_multibyte): Change signature and simplify;
the conversion is no longer done in-place.
* src/fns.c (string_to_multibyte): Drop temporary buffer and memcpy;
adapt to new str_to_multibyte signature.
* src/print.c (print_string): Drop memcpy; adapt call to str_to_multibyte.
* test/src/fns-tests.el (fns--string-to-unibyte): Rename to...
(fns--string-to-unibyte-multibyte): ... this and strengthen, so that
the test covers string-to-multibyte reasonably well.
-rw-r--r-- | src/character.c | 43 | ||||
-rw-r--r-- | src/character.h | 3 | ||||
-rw-r--r-- | src/fns.c | 23 | ||||
-rw-r--r-- | src/print.c | 3 | ||||
-rw-r--r-- | test/src/fns-tests.el | 14 |
5 files changed, 37 insertions, 49 deletions
diff --git a/src/character.c b/src/character.c index d12df23f8ea..841e46c0917 100644 --- a/src/character.c +++ b/src/character.c @@ -666,35 +666,26 @@ count_size_as_multibyte (const unsigned char *str, ptrdiff_t len) } -/* Convert unibyte text at STR of BYTES bytes to a multibyte text - that contains the same single-byte characters. It actually - converts all 8-bit characters to multibyte forms. It is assured - that we can use LEN bytes at STR as a work area and that is - enough. */ - -ptrdiff_t -str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes) +/* Convert unibyte text at SRC of NCHARS bytes to a multibyte text + at DST of NBYTES bytes, that contains the same single-byte characters. */ +void +str_to_multibyte (unsigned char *dst, const unsigned char *src, + ptrdiff_t nchars, ptrdiff_t nbytes) { - unsigned char *p = str, *endp = str + bytes; - unsigned char *to; - - while (p < endp && *p < 0x80) p++; - if (p == endp) - return bytes; - to = p; - bytes = endp - p; - endp = str + len; - memmove (endp - bytes, p, bytes); - p = endp - bytes; - while (p < endp) + const unsigned char *s = src + nchars; + unsigned char *d = dst + nbytes; + for (ptrdiff_t i = 0; i < nchars; i++) { - int c = *p++; - - if (c >= 0x80) - c = BYTE8_TO_CHAR (c); - to += CHAR_STRING (c, to); + unsigned char c = *--s; + if (c <= 0x7f) + *--d = c; + else + { + *--d = 0x80 + (c & 0x3f); + *--d = 0xc0 + ((c >> 6) & 1); + } } - return (to - str); + eassert (d == dst && s == src); } /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It diff --git a/src/character.h b/src/character.h index 2ca935ba04c..36e2b06ee1b 100644 --- a/src/character.h +++ b/src/character.h @@ -567,7 +567,8 @@ extern int translate_char (Lisp_Object, int c); extern ptrdiff_t count_size_as_multibyte (const unsigned char *, ptrdiff_t); extern ptrdiff_t str_as_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t, ptrdiff_t *); -extern ptrdiff_t str_to_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t); +extern void str_to_multibyte (unsigned char *dst, const unsigned char *src, + ptrdiff_t nchars, ptrdiff_t nbytes); extern ptrdiff_t str_as_unibyte (unsigned char *, ptrdiff_t); extern ptrdiff_t strwidth (const char *, ptrdiff_t); extern ptrdiff_t c_string_width (const unsigned char *, ptrdiff_t, int, diff --git a/src/fns.c b/src/fns.c index 61ed01eee4e..7d8f957ef98 100644 --- a/src/fns.c +++ b/src/fns.c @@ -1237,33 +1237,24 @@ string_make_multibyte (Lisp_Object string) /* Convert STRING (if unibyte) to a multibyte string without changing - the number of characters. Characters 0200 through 0237 are - converted to eight-bit characters. */ + the number of characters. Characters 0x80..0xff are interpreted as + raw bytes. */ Lisp_Object string_to_multibyte (Lisp_Object string) { - unsigned char *buf; - ptrdiff_t nbytes; - Lisp_Object ret; - USE_SAFE_ALLOCA; - if (STRING_MULTIBYTE (string)) return string; - nbytes = count_size_as_multibyte (SDATA (string), SBYTES (string)); + ptrdiff_t nchars = SCHARS (string); + ptrdiff_t nbytes = count_size_as_multibyte (SDATA (string), nchars); /* If all the chars are ASCII, they won't need any more bytes once converted. */ - if (nbytes == SBYTES (string)) + if (nbytes == nchars) return make_multibyte_string (SSDATA (string), nbytes, nbytes); - buf = SAFE_ALLOCA (nbytes); - memcpy (buf, SDATA (string), SBYTES (string)); - str_to_multibyte (buf, nbytes, SBYTES (string)); - - ret = make_multibyte_string ((char *) buf, SCHARS (string), nbytes); - SAFE_FREE (); - + Lisp_Object ret = make_uninit_multibyte_string (nchars, nbytes); + str_to_multibyte (SDATA (ret), SDATA (string), nchars, nbytes); return ret; } diff --git a/src/print.c b/src/print.c index 4d7e42df1e8..9a31e386f5e 100644 --- a/src/print.c +++ b/src/print.c @@ -467,8 +467,7 @@ print_string (Lisp_Object string, Lisp_Object printcharfun) if (chars < bytes) { newstr = make_uninit_multibyte_string (chars, bytes); - memcpy (SDATA (newstr), SDATA (string), chars); - str_to_multibyte (SDATA (newstr), bytes, chars); + str_to_multibyte (SDATA (newstr), SDATA (string), chars, bytes); string = newstr; } } diff --git a/test/src/fns-tests.el b/test/src/fns-tests.el index 0119e31df11..20074ca0d21 100644 --- a/test/src/fns-tests.el +++ b/test/src/fns-tests.el @@ -1344,18 +1344,24 @@ (should (equal (plist-member plist (copy-sequence "a") #'equal) '("a" "c"))))) -(ert-deftest fns--string-to-unibyte () - (dolist (str '("" "a" "abc" "a\x00\x7fz" "a\xaa\xbbz ""\x80\xdd\xff")) +(ert-deftest fns--string-to-unibyte-multibyte () + (dolist (str (list "" "a" "abc" "a\x00\x7fz" "a\xaa\xbbz" "\x80\xdd\xff" + (apply #'unibyte-string (number-sequence 0 255)))) (ert-info ((prin1-to-string str) :prefix "str: ") (should-not (multibyte-string-p str)) (let* ((u (string-to-unibyte str)) ; should be identity (m (string-to-multibyte u)) ; lossless conversion - (uu (string-to-unibyte m))) ; also lossless + (mm (string-to-multibyte m)) ; should be identity + (uu (string-to-unibyte m)) ; also lossless + (ml (mapcar (lambda (c) (if (<= c #x7f) c (+ c #x3fff00))) u))) (should-not (multibyte-string-p u)) (should (multibyte-string-p m)) + (should (multibyte-string-p mm)) (should-not (multibyte-string-p uu)) (should (equal str u)) - (should (equal str uu))))) + (should (equal m mm)) + (should (equal str uu)) + (should (equal (append m nil) ml))))) (should-error (string-to-unibyte "å")) (should-error (string-to-unibyte "ABC∀BC"))) |