From 93be35e038bbb19e8d64d3c1f9d1be76a9083d09 Mon Sep 17 00:00:00 2001
From: Philipp Stephani <phst@google.com>
Date: Mon, 24 Oct 2016 21:54:51 +0200
Subject: Fix encoding of JSON surrogate pairs

JSON requires that such pairs be treated as UTF-16 surrogate pairs, not
individual code points; cf. Bug #24784.

* lisp/json.el (json-read-escaped-char): Fix decoding of surrogate
pairs.
(json--decode-utf-16-surrogates): New defun.

* test/lisp/json-tests.el (test-json-read-string): Add test for
surrogate pairs.
---
 lisp/json.el | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'lisp/json.el')

diff --git a/lisp/json.el b/lisp/json.el
index 38f828e8fbb..b2ac356641b 100644
--- a/lisp/json.el
+++ b/lisp/json.el
@@ -363,6 +363,10 @@ representation will be parsed correctly."
 
 ;; String parsing
 
+(defun json--decode-utf-16-surrogates (high low)
+  "Return the code point represented by the UTF-16 surrogates HIGH and LOW."
+  (+ (lsh (- high #xD800) 10) (- low #xDC00) #x10000))
+
 (defun json-read-escaped-char ()
   "Read the JSON string escaped character at point."
   ;; Skip over the '\'
@@ -372,6 +376,17 @@ representation will be parsed correctly."
     (cond
      (special (cdr special))
      ((not (eq char ?u)) char)
+     ;; Special-case UTF-16 surrogate pairs,
+     ;; cf. https://tools.ietf.org/html/rfc7159#section-7.  Note that
+     ;; this clause overlaps with the next one and therefore has to
+     ;; come first.
+     ((looking-at
+       (rx (group (any "Dd") (any "89ABab") (= 2 (any "0-9A-Fa-f")))
+           "\\u" (group (any "Dd") (any "C-Fc-f") (= 2 (any "0-9A-Fa-f")))))
+      (json-advance 10)
+      (json--decode-utf-16-surrogates
+       (string-to-number (match-string 1) 16)
+       (string-to-number (match-string 2) 16)))
      ((looking-at "[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]")
       (let ((hex (match-string 0)))
         (json-advance 4)
-- 
cgit v1.2.3