Simplify and speed up string-to-multibyte

* src/character.h (str_to_multibyte):
* src/character.c (str_to_multibyte): Change signature and simplify;
the conversion is no longer done in-place.
* src/fns.c (string_to_multibyte): Drop temporary buffer and memcpy;
adapt to new str_to_multibyte signature.
* src/print.c (print_string): Drop memcpy; adapt call to
str_to_multibyte.
* test/src/fns-tests.el (fns--string-to-unibyte): Rename to...
(fns--string-to-unibyte-multibyte): ... this and strengthen, so that
the test covers string-to-multibyte reasonably well.
author: Mattias Engdegård <mattiase@acm.org> 2022-07-11 10:34:40 +0200
committer: Mattias Engdegård <mattiase@acm.org> 2022-07-11 10:38:49 +0200
commit: 69b68099ecfb053ac77e0a954ab7467c440321ff (patch)
tree: 57f810ec7d6addf847ca4b1c70ff5d0cf5d02e21
parent: 96846877930f580e122e9af85b4653918c542f89 (diff)
download: emacs-69b68099ecfb053ac77e0a954ab7467c440321ff.tar.gz
emacs-69b68099ecfb053ac77e0a954ab7467c440321ff.tar.bz2
emacs-69b68099ecfb053ac77e0a954ab7467c440321ff.zip
5 files changed, 37 insertions, 49 deletions
diff --git a/src/character.c b/src/character.c
index d12df23f8ea..841e46c0917 100644
--- a/src/character.c
+++ b/src/character.c
@@ -666,35 +666,26 @@ count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 }
 
 
-/* Convert unibyte text at STR of BYTES bytes to a multibyte text
-   that contains the same single-byte characters.  It actually
-   converts all 8-bit characters to multibyte forms.  It is assured
-   that we can use LEN bytes at STR as a work area and that is
-   enough.  */
-
-ptrdiff_t
-str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
+/* Convert unibyte text at SRC of NCHARS bytes to a multibyte text
+   at DST of NBYTES bytes, that contains the same single-byte characters.  */
+void
+str_to_multibyte (unsigned char *dst, const unsigned char *src,
+		  ptrdiff_t nchars, ptrdiff_t nbytes)
 {
-  unsigned char *p = str, *endp = str + bytes;
-  unsigned char *to;
-
-  while (p < endp && *p < 0x80) p++;
-  if (p == endp)
-    return bytes;
-  to = p;
-  bytes = endp - p;
-  endp = str + len;
-  memmove (endp - bytes, p, bytes);
-  p = endp - bytes;
-  while (p < endp)
+  const unsigned char *s = src + nchars;
+  unsigned char *d = dst + nbytes;
+  for (ptrdiff_t i = 0; i < nchars; i++)
     {
-      int c = *p++;
-
-      if (c >= 0x80)
-	c = BYTE8_TO_CHAR (c);
-      to += CHAR_STRING (c, to);
+      unsigned char c = *--s;
+      if (c <= 0x7f)
+	*--d = c;
+      else
+	{
+	  *--d = 0x80 + (c & 0x3f);
+	  *--d = 0xc0 + ((c >> 6) & 1);
+	}
     }
-  return (to - str);
+  eassert (d == dst && s == src);
 }
 
 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
diff --git a/src/character.h b/src/character.h
index 2ca935ba04c..36e2b06ee1b 100644
--- a/src/character.h
+++ b/src/character.h
@@ -567,7 +567,8 @@ extern int translate_char (Lisp_Object, int c);
 extern ptrdiff_t count_size_as_multibyte (const unsigned char *, ptrdiff_t);
 extern ptrdiff_t str_as_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t,
 				   ptrdiff_t *);
-extern ptrdiff_t str_to_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t);
+extern void str_to_multibyte (unsigned char *dst, const unsigned char *src,
+			      ptrdiff_t nchars, ptrdiff_t nbytes);
 extern ptrdiff_t str_as_unibyte (unsigned char *, ptrdiff_t);
 extern ptrdiff_t strwidth (const char *, ptrdiff_t);
 extern ptrdiff_t c_string_width (const unsigned char *, ptrdiff_t, int,
diff --git a/src/fns.c b/src/fns.c
index 61ed01eee4e..7d8f957ef98 100644
--- a/src/fns.c
+++ b/src/fns.c
@@ -1237,33 +1237,24 @@ string_make_multibyte (Lisp_Object string)
 
 
 /* Convert STRING (if unibyte) to a multibyte string without changing
-   the number of characters.  Characters 0200 through 0237 are
-   converted to eight-bit characters. */
+   the number of characters.  Characters 0x80..0xff are interpreted as
+   raw bytes. */
 
 Lisp_Object
 string_to_multibyte (Lisp_Object string)
 {
-  unsigned char *buf;
-  ptrdiff_t nbytes;
-  Lisp_Object ret;
-  USE_SAFE_ALLOCA;
-
   if (STRING_MULTIBYTE (string))
     return string;
 
-  nbytes = count_size_as_multibyte (SDATA (string), SBYTES (string));
+  ptrdiff_t nchars = SCHARS (string);
+  ptrdiff_t nbytes = count_size_as_multibyte (SDATA (string), nchars);
   /* If all the chars are ASCII, they won't need any more bytes once
      converted.  */
-  if (nbytes == SBYTES (string))
+  if (nbytes == nchars)
     return make_multibyte_string (SSDATA (string), nbytes, nbytes);
 
-  buf = SAFE_ALLOCA (nbytes);
-  memcpy (buf, SDATA (string), SBYTES (string));
-  str_to_multibyte (buf, nbytes, SBYTES (string));
-
-  ret = make_multibyte_string ((char *) buf, SCHARS (string), nbytes);
-  SAFE_FREE ();
-
+  Lisp_Object ret = make_uninit_multibyte_string (nchars, nbytes);
+  str_to_multibyte (SDATA (ret), SDATA (string), nchars, nbytes);
   return ret;
 }
 
diff --git a/src/print.c b/src/print.c
index 4d7e42df1e8..9a31e386f5e 100644
--- a/src/print.c
+++ b/src/print.c
@@ -467,8 +467,7 @@ print_string (Lisp_Object string, Lisp_Object printcharfun)
 	  if (chars < bytes)
 	    {
 	      newstr = make_uninit_multibyte_string (chars, bytes);
-	      memcpy (SDATA (newstr), SDATA (string), chars);
-	      str_to_multibyte (SDATA (newstr), bytes, chars);
+	      str_to_multibyte (SDATA (newstr), SDATA (string), chars, bytes);
 	      string = newstr;
 	    }
 	}
diff --git a/test/src/fns-tests.el b/test/src/fns-tests.el
index 0119e31df11..20074ca0d21 100644
--- a/test/src/fns-tests.el
+++ b/test/src/fns-tests.el
@@ -1344,18 +1344,24 @@
     (should (equal (plist-member plist (copy-sequence "a") #'equal)
                    '("a" "c")))))
 
-(ert-deftest fns--string-to-unibyte ()
-  (dolist (str '("" "a" "abc" "a\x00\x7fz" "a\xaa\xbbz ""\x80\xdd\xff"))
+(ert-deftest fns--string-to-unibyte-multibyte ()
+  (dolist (str (list "" "a" "abc" "a\x00\x7fz" "a\xaa\xbbz" "\x80\xdd\xff"
+                     (apply #'unibyte-string (number-sequence 0 255))))
     (ert-info ((prin1-to-string str) :prefix "str: ")
       (should-not (multibyte-string-p str))
       (let* ((u (string-to-unibyte str))   ; should be identity
              (m (string-to-multibyte u))   ; lossless conversion
-             (uu (string-to-unibyte m)))   ; also lossless
+             (mm (string-to-multibyte m))  ; should be identity
+             (uu (string-to-unibyte m))    ; also lossless
+             (ml (mapcar (lambda (c) (if (<= c #x7f) c (+ c #x3fff00))) u)))
         (should-not (multibyte-string-p u))
         (should (multibyte-string-p m))
+        (should (multibyte-string-p mm))
         (should-not (multibyte-string-p uu))
         (should (equal str u))
-        (should (equal str uu)))))
+        (should (equal m mm))
+        (should (equal str uu))
+        (should (equal (append m nil) ml)))))
   (should-error (string-to-unibyte "å"))
   (should-error (string-to-unibyte "ABC∀BC")))
author	Mattias Engdegård <mattiase@acm.org>	2022-07-11 10:34:40 +0200
committer	Mattias Engdegård <mattiase@acm.org>	2022-07-11 10:38:49 +0200
commit	69b68099ecfb053ac77e0a954ab7467c440321ff (patch)
tree	57f810ec7d6addf847ca4b1c70ff5d0cf5d02e21
parent	96846877930f580e122e9af85b4653918c542f89 (diff)
download	emacs-69b68099ecfb053ac77e0a954ab7467c440321ff.tar.gz emacs-69b68099ecfb053ac77e0a954ab7467c440321ff.tar.bz2 emacs-69b68099ecfb053ac77e0a954ab7467c440321ff.zip