From 6f2e3b88649543d625b76ac5542dd8838b28f0b1 Mon Sep 17 00:00:00 2001
From: John Wiegley
Date: Thu, 12 Feb 2009 02:34:39 -0400
Subject: Properly handle UTF-8 characters in commodity strings.

---
 src/commodity.cc | 50 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

(limited to 'src/commodity.cc')

diff --git a/src/commodity.cc b/src/commodity.cc
index 5169e3af..2dfba880 100644
--- a/src/commodity.cc
+++ b/src/commodity.cc
@@ -571,7 +571,55 @@ void commodity_t::parse_symbol(std::istream& in, string& symbol)
     else
       throw_(amount_error, "Quoted commodity symbol lacks closing quote");
   } else {
-    READ_INTO(in, buf, 255, c, ! invalid_chars[static_cast<unsigned char>(c)]);
+    char * _p = buf;
+    c = in.peek();
+    while (_p - buf < 255 && in.good() && ! in.eof() && c != '\n') {
+      int bytes = 0;
+      int size = _p - buf;
+
+      unsigned char d = c;
+
+      // Check for the start of a UTF-8 multi-byte encoded string
+      if (d >= 192 && d <= 223 && size < 254)
+        bytes = 2;
+      else if (d >= 224 && d <= 239 && size < 253)
+        bytes = 3;
+      else if (d >= 240 && d <= 247 && size < 252)
+        bytes = 4;
+      else if (d >= 248 && d <= 251 && size < 251)
+        bytes = 5;
+      else if (d >= 252 && d <= 253 && size < 250)
+        bytes = 6;
+      else if (d >= 254) // UTF-8 encoding error
+        break;
+
+      if (bytes > 0) { // we're looking at a UTF-8 encoding
+        for (int i = 0; i < bytes; i++) {
+          in.get(c);
+          if (in.bad() || in.eof())
+            break;
+          *_p++ = c;
+        }
+      }
+      else if (invalid_chars[static_cast<unsigned char>(c)]) {
+        break;
+      }
+      else {
+        in.get(c);
+        if (in.eof())
+          break;
+        if (c == '\\') {
+          in.get(c);
+          if (in.eof())
+            break;
+        }
+        *_p++ = c;
+      }
+
+      c = in.peek();
+    }
+    *_p = '\0';
+
     if (is_reserved_token(buf))
       buf[0] = '\0';
   }
--
cgit v1.2.3