Added support for Unicode text in Ledger files, thanks to 'utfcpp', which can
be located at http://utfcpp.sourceforge.net.
author: John Wiegley <johnw@newartisans.com> 2009-01-23 19:49:22 -0400
committer: John Wiegley <johnw@newartisans.com> 2009-01-23 19:50:00 -0400
commit: 900a92e1158cb178335d16ff0912f5fc5701da32 (patch)
tree: 7da8de9f4a95a18659434c1968c8af43ee34f435 /src
parent: f52e04c2bac1d4900bfe8963f369178f7f76023f (diff)
download: fork-ledger-900a92e1158cb178335d16ff0912f5fc5701da32.tar.gz
fork-ledger-900a92e1158cb178335d16ff0912f5fc5701da32.tar.bz2
fork-ledger-900a92e1158cb178335d16ff0912f5fc5701da32.zip
5 files changed, 114 insertions, 53 deletions
diff --git a/src/format.cc b/src/format.cc
index d1f606d3..a4596761 100644
--- a/src/format.cc
+++ b/src/format.cc
@@ -308,58 +308,60 @@ void format_t::format(std::ostream& out_str, scope_t& scope)
       break;
     }
 
-    string temp = out.str();
-
-    DEBUG("format.expr", "output = \"" << temp << "\"");
+    unistring temp(out.str());
 
+    string result;
     if (! elem->has_flags(ELEMENT_FORMATTED) &&
-	elem->max_width > 0 && elem->max_width < temp.length())
-      out_str << truncate(temp, elem->max_width);
-    else
-      out_str << temp;
+	elem->max_width > 0 && elem->max_width < temp.length()) {
+      result = truncate(temp, elem->max_width);
+    } else {
+      result = temp.extract();
+      for (int i = 0; i < (int)elem->min_width - (int)temp.length(); i++)
+	result += " ";
+    }
+
+    out_str << result;
   }
 }
 
-string format_t::truncate(const string& str, unsigned int width,
+string format_t::truncate(const unistring& ustr, unsigned int width,
 			  const bool is_account)
 {
-  const unsigned int len = str.length();
-  if (len <= width)
-    return str;
-
   assert(width < 4095);
 
-  char buf[4096];
+  const unsigned int len = ustr.length();
+  if (len <= width)
+    return ustr.extract();
+
+  std::ostringstream buf;
 
   switch (elision_style) {
   case TRUNCATE_LEADING:
     // This method truncates at the beginning.
-    std::strncpy(buf, str.c_str() + (len - width), width);
-    buf[0] = '.';
-    buf[1] = '.';
+    buf << ".." << ustr.extract(len - width, width);
     break;
 
   case TRUNCATE_MIDDLE:
     // This method truncates in the middle.
-    std::strncpy(buf, str.c_str(), width / 2);
-    std::strncpy(buf + width / 2,
-		 str.c_str() + (len - (width / 2 + width % 2)),
-		 width / 2 + width % 2);
-    buf[width / 2 - 1] = '.';
-    buf[width / 2] = '.';
+    buf << ustr.extract(0, width / 2)
+	<< ".."
+	<< ustr.extract(len - (width / 2 + width % 2),
+			width / 2 + width % 2);
     break;
 
   case ABBREVIATE:
     if (is_account) {
       std::list<string> parts;
       string::size_type beg = 0;
-      for (string::size_type pos = str.find(':');
+      string strcopy(ustr.extract());
+      for (string::size_type pos = strcopy.find(':');
 	   pos != string::npos;
-	   beg = pos + 1, pos = str.find(':', beg))
-	parts.push_back(string(str, beg, pos - beg));
-      parts.push_back(string(str, beg));
+	   beg = pos + 1, pos = strcopy.find(':', beg))
+	parts.push_back(string(strcopy, beg, pos - beg));
+      parts.push_back(string(strcopy, beg));
+
+      std::ostringstream result;
 
-      string result;
       unsigned int newlen = len;
       for (std::list<string>::iterator i = parts.begin();
 	   i != parts.end();
@@ -367,28 +369,26 @@ string format_t::truncate(const string& str, unsigned int width,
 	// Don't contract the last element
 	std::list<string>::iterator x = i;
 	if (++x == parts.end()) {
-	  result += *i;
+	  result << *i;
 	  break;
 	}
 
 	if (newlen > width) {
-	  result += string(*i, 0, abbrev_length);
-	  result += ":";
-	  newlen -= (*i).length() - abbrev_length;
+	  unistring temp(*i);
+	  result << temp.extract(0, abbrev_length) << ":";
+	  newlen -= temp.length() - abbrev_length;
 	} else {
-	  result += *i;
-	  result += ":";
+	  result << *i << ":";
 	}
       }
 
       if (newlen > width) {
 	// Even abbreviated its too big to show the last account, so
 	// abbreviate all but the last and truncate at the beginning.
-	std::strncpy(buf, result.c_str() + (result.length() - width), width);
-	buf[0] = '.';
-	buf[1] = '.';
+	unistring temp(result.str());
+	buf << ".." << temp.extract(temp.length() - width, width);
       } else {
-	std::strcpy(buf, result.c_str());
+	buf << result.str();
       }
       break;
     }
@@ -396,14 +396,11 @@ string format_t::truncate(const string& str, unsigned int width,
 
   case TRUNCATE_TRAILING:
     // This method truncates at the end (the default).
-    std::strncpy(buf, str.c_str(), width - 2);
-    buf[width - 2] = '.';
-    buf[width - 1] = '.';
+    buf << ustr.extract(0, width -2) << "..";
     break;
   }
-  buf[width] = '\0';
 
-  return buf;
+  return buf.str();
 }
 
 } // namespace ledger
diff --git a/src/format.h b/src/format.h
index 13a2fff2..fbfe452e 100644
--- a/src/format.h
+++ b/src/format.h
@@ -32,13 +32,63 @@
 #ifndef _FORMAT_H
 #define _FORMAT_H
 
+#define SUPPORT_UNICODE 1
+
 #include "journal.h"
 #include "expr.h"
+#if defined(SUPPORT_UNICODE)
+#include "utf8.h"
+#endif
 
 namespace ledger {
 
 DECLARE_EXCEPTION(format_error, std::runtime_error);
 
+#if defined(SUPPORT_UNICODE)
+/**
+ * @class unistring
+ *
+ * @brief Holds a string as UTF-32 code points for per-character work
+ *
+ * Built from a UTF-8 encoded ledger::string; length() counts code
+ * points, and extract() re-encodes a range of them back to UTF-8.
+ */
+class unistring
+{
+  std::vector<uint32_t> utf32chars; // decoded input, one element per code point
+
+public:
+  unistring(const string& input)
+  {
+    TRACE_CTOR(unistring, "");
+    // Work on the raw byte range [p, p + len) of the input string.
+    const char * p   = input.c_str();
+    std::size_t	 len = input.length();
+
+    VERIFY(utf8::is_valid(p, p + len)); // presumably a debug-build check -- TODO confirm
+    // Decode the UTF-8 bytes, appending each resulting code point to utf32chars.
+    utf8::utf8to32(p, p + len, std::back_inserter(utf32chars));
+  }
+  ~unistring() {
+    TRACE_DTOR(unistring);
+  }
+  // Number of decoded code points (the vector size), not the UTF-8 byte count.
+  std::size_t length() const {
+    return utf32chars.size();
+  }
+  // Re-encode code points [begin, begin + len) as UTF-8; len == 0 selects length().
+  string extract(const std::size_t begin = 0,
+		 const std::size_t len   = 0) const
+  { // no bounds checking of begin/len is done in this function
+    string utf8result;
+    utf8::utf32to8(utf32chars.begin() + begin,
+		   utf32chars.begin() + begin + (len ? len : length()), // NOTE(review): len == 0 with begin > 0 lands past end()
+		   std::back_inserter(utf8result));
+    return utf8result;
+  }
+};
+#endif
+
 class report_t;
 
 class format_t : public noncopyable
@@ -135,7 +185,7 @@ public:
       elem->dump(out);
   }
 
-  static string truncate(const string& str, unsigned int width,
+  static string truncate(const unistring& str, unsigned int width,
 			 const bool is_account = false);
 };
 
diff --git a/src/report.cc b/src/report.cc
index 2dfc9d76..4b45577b 100644
--- a/src/report.cc
+++ b/src/report.cc
@@ -789,6 +789,8 @@ expr_t::ptr_op_t report_t::lookup(const string& name)
   case 't':
     if (std::strcmp(p, "total_expr") == 0)
 	return MAKE_FUNCTOR(report_t::get_total_expr);
+    else if (std::strcmp(p, "truncate") == 0)
+	return MAKE_FUNCTOR(report_t::get_total_expr);
     break;
   }
 
diff --git a/src/session.cc b/src/session.cc
index 9a3cccd5..8e78e9e9 100644
--- a/src/session.cc
+++ b/src/session.cc
@@ -71,9 +71,9 @@ void release_session_context()
 
 session_t::session_t()
   : register_format
-    ("%-.9(date) %-.20(payee) %-.23(account) %!12(print_balance(amount_expr, 12, 67)) "
+    ("%-.9(date) %-.20(payee) %-.23(account(23)) %!12(print_balance(amount_expr, 12, 67)) "
      "%!12(print_balance(display_total, 12, 80, true))\n%/"
-     "%31|%-.23(account) %!12(print_balance(amount_expr, 12, 67)) "
+     "%31|%-.23(account(23)) %!12(print_balance(amount_expr, 12, 67)) "
      "%!12(print_balance(display_total, 12, 80, true))\n"),
     wide_register_format
     ("%-.9D  %-.35P %-.39A %22.108t %!22.132T\n%/"
diff --git a/src/textual.cc b/src/textual.cc
index e1a08f69..4600f3f3 100644
--- a/src/textual.cc
+++ b/src/textual.cc
@@ -596,10 +596,16 @@ static inline void parse_symbol(char *& p, string& symbol)
 
 bool textual_parser_t::test(std::istream& in) const
 {
-  char buf[5];
+  char   buf[12];
+  char * p;
 
-  in.read(buf, 5);
-  if (std::strncmp(buf, "<?xml", 5) == 0) {
+  in.read(buf, 11);
+  if (utf8::is_bom(buf))
+    p = &buf[3];
+  else
+    p = buf;
+
+  if (std::strncmp(p, "<?xml", 5) == 0) {
 #if defined(HAVE_EXPAT) || defined(HAVE_XMLPARSE)
     throw parse_error("Ledger file contains XML data, but format was not recognized");
 #else
@@ -688,10 +694,11 @@ unsigned int textual_parser_t::parse(std::istream& in,
 {
   TRACE_START(parsing_total, 1, "Total time spent parsing text:");
 
-  static bool  added_auto_entry_hook = false;
-  static char  line[MAX_LINE + 1];
-  unsigned int count  = 0;
-  unsigned int errors = 0;
+  static bool	added_auto_entry_hook = false;
+  static char	linebuf[MAX_LINE + 1];
+         char * line;
+  unsigned int	count  = 0;
+  unsigned int	errors = 0;
 
   std::list<account_t *>  account_stack;
   auto_entry_finalizer_t  auto_entry_finalizer(&journal);
@@ -714,10 +721,15 @@ unsigned int textual_parser_t::parse(std::istream& in,
 
   while (in.good() && ! in.eof()) {
     try {
-      in.getline(line, MAX_LINE);
+      in.getline(linebuf, MAX_LINE);
       if (in.eof())
 	break;
 
+      if (linenum == 1 && utf8::is_bom(linebuf))
+	line = &linebuf[3];
+      else
+	line = linebuf;
+
       int len = std::strlen(line);
       if (line[len - 1] == '\r')
 	line[--len] = '\0';
author	John Wiegley <johnw@newartisans.com>	2009-01-23 19:49:22 -0400
committer	John Wiegley <johnw@newartisans.com>	2009-01-23 19:50:00 -0400
commit	900a92e1158cb178335d16ff0912f5fc5701da32 (patch)
tree	7da8de9f4a95a18659434c1968c8af43ee34f435 /src
parent	f52e04c2bac1d4900bfe8963f369178f7f76023f (diff)
download	fork-ledger-900a92e1158cb178335d16ff0912f5fc5701da32.tar.gz fork-ledger-900a92e1158cb178335d16ff0912f5fc5701da32.tar.bz2 fork-ledger-900a92e1158cb178335d16ff0912f5fc5701da32.zip