10 files changed, 151 insertions, 55 deletions
diff --git a/.gitmodules b/.gitmodules
index b685b019..33b949c0 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,6 @@
 [submodule "lib/libofx"]
 	path = lib/libofx
 	url = git://newartisans.com/libofx.git
+[submodule "lib/utfcpp"]
+	path = lib/utfcpp
+	url = git://github.com/jwiegley/utfcpp.git
diff --git a/Makefile.am b/Makefile.am
index c360bf96..5c8384a9 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -5,7 +5,8 @@ EXTRA_DIST = autogen.sh contrib
 
 lib_LTLIBRARIES = libamounts.la libledger.la
 
-libamounts_la_CPPFLAGS = -I$(srcdir)/src -I$(srcdir)/lib
+libamounts_la_CPPFLAGS = -I$(srcdir)/src -I$(srcdir)/lib \
+			 -I$(srcdir)/lib/utfcpp/source
 if HAVE_GDTOA
 libamounts_la_CPPFLAGS += -I$(top_builddir)/lib/gdtoa -I$(srcdir)/lib/gdtoa
 endif
diff --git a/doc/LICENSE-utfcpp b/doc/LICENSE-utfcpp
new file mode 100644
index 00000000..1751a003
--- /dev/null
+++ b/doc/LICENSE-utfcpp
@@ -0,0 +1,23 @@
+Copyright 2006 Nemanja Trifunovic
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/doc/sample.dat b/doc/sample.dat
index 907f19ab..a579dd0a 100644
--- a/doc/sample.dat
+++ b/doc/sample.dat
@@ -15,10 +15,18 @@ N $
   Assets:Brokerage              50 AAPL @ $30.00
   Equity:Opening Balances
 
-2004/05/14 * Pay day
+2004/05/14 * Páy dày
   Assets:Bank:Checking          $500.00
   Income:Salary
 
+2004/05/14 * Another dày in which there is Páying
+  Asséts:Bánk:Chécking:Asséts:Bánk:Chécking          $500.00
+  Income:Salary
+
+2004/05/14 * Another dày in which there is Páying
+  Русский язык:Русский язык:Русский язык:Русский язык  $1000.00
+  Income:Salary
+
 2004/05/27 Book Store
   Expenses:Books                 $20.00
   Liabilities:MasterCard
diff --git a/lib/utfcpp b/lib/utfcpp
new file mode 160000
+Subproject 54676a423c356bf128f9c8fc0e7ea68cbcb7587
diff --git a/src/format.cc b/src/format.cc
index d1f606d3..a4596761 100644
--- a/src/format.cc
+++ b/src/format.cc
@@ -308,58 +308,60 @@ void format_t::format(std::ostream& out_str, scope_t& scope)
       break;
     }
 
-    string temp = out.str();
-
-    DEBUG("format.expr", "output = \"" << temp << "\"");
+    unistring temp(out.str());
 
+    string result;
     if (! elem->has_flags(ELEMENT_FORMATTED) &&
-	elem->max_width > 0 && elem->max_width < temp.length())
-      out_str << truncate(temp, elem->max_width);
-    else
-      out_str << temp;
+	elem->max_width > 0 && elem->max_width < temp.length()) {
+      result = truncate(temp, elem->max_width);
+    } else {
+      result = temp.extract();
+      for (int i = 0; i < (int)elem->min_width - (int)temp.length(); i++)
+	result += " ";
+    }
+
+    out_str << result;
   }
 }
 
-string format_t::truncate(const string& str, unsigned int width,
+string format_t::truncate(const unistring& ustr, unsigned int width,
 			  const bool is_account)
 {
-  const unsigned int len = str.length();
-  if (len <= width)
-    return str;
-
   assert(width < 4095);
 
-  char buf[4096];
+  const unsigned int len = ustr.length();
+  if (len <= width || width <= 2) // fits as is, or only ".." itself fits
+    return len <= width ? ustr.extract() : string(width, '.');
+
+  std::ostringstream buf;
 
   switch (elision_style) {
   case TRUNCATE_LEADING:
     // This method truncates at the beginning.
-    std::strncpy(buf, str.c_str() + (len - width), width);
-    buf[0] = '.';
-    buf[1] = '.';
+    buf << ".." << ustr.extract(len - (width - 2), width - 2);
     break;
 
   case TRUNCATE_MIDDLE:
     // This method truncates in the middle.
-    std::strncpy(buf, str.c_str(), width / 2);
-    std::strncpy(buf + width / 2,
-		 str.c_str() + (len - (width / 2 + width % 2)),
-		 width / 2 + width % 2);
-    buf[width / 2 - 1] = '.';
-    buf[width / 2] = '.';
+    if (width / 2 > 1) // extract() reads a length of 0 as "everything"
+      buf << ustr.extract(0, width / 2 - 1);
+    buf << ".." << ustr.extract(len - (width / 2 + width % 2 - 1),
+				width / 2 + width % 2 - 1);
     break;
 
   case ABBREVIATE:
     if (is_account) {
       std::list<string> parts;
       string::size_type beg = 0;
-      for (string::size_type pos = str.find(':');
+      string strcopy(ustr.extract());
+      for (string::size_type pos = strcopy.find(':');
 	   pos != string::npos;
-	   beg = pos + 1, pos = str.find(':', beg))
-	parts.push_back(string(str, beg, pos - beg));
-      parts.push_back(string(str, beg));
+	   beg = pos + 1, pos = strcopy.find(':', beg))
+	parts.push_back(string(strcopy, beg, pos - beg));
+      parts.push_back(string(strcopy, beg));
+
+      std::ostringstream result;
 
-      string result;
       unsigned int newlen = len;
       for (std::list<string>::iterator i = parts.begin();
 	   i != parts.end();
@@ -367,28 +369,26 @@ string format_t::truncate(const string& str, unsigned int width,
 	// Don't contract the last element
 	std::list<string>::iterator x = i;
 	if (++x == parts.end()) {
-	  result += *i;
+	  result << *i;
 	  break;
 	}
 
 	if (newlen > width) {
-	  result += string(*i, 0, abbrev_length);
-	  result += ":";
-	  newlen -= (*i).length() - abbrev_length;
+	  unistring temp(*i);
+	  result << temp.extract(0, abbrev_length) << ":";
+	  newlen -= temp.length() - abbrev_length;
 	} else {
-	  result += *i;
-	  result += ":";
+	  result << *i << ":";
 	}
       }
 
       if (newlen > width) {
 	// Even abbreviated its too big to show the last account, so
 	// abbreviate all but the last and truncate at the beginning.
-	std::strncpy(buf, result.c_str() + (result.length() - width), width);
-	buf[0] = '.';
-	buf[1] = '.';
+	unistring temp(result.str());
+	buf << ".." << temp.extract(temp.length() - (width - 2), width - 2);
       } else {
-	std::strcpy(buf, result.c_str());
+	buf << result.str();
       }
       break;
     }
@@ -396,14 +396,11 @@ string format_t::truncate(const string& str, unsigned int width,
 
   case TRUNCATE_TRAILING:
     // This method truncates at the end (the default).
-    std::strncpy(buf, str.c_str(), width - 2);
-    buf[width - 2] = '.';
-    buf[width - 1] = '.';
+    buf << ustr.extract(0, width - 2) << "..";
     break;
   }
-  buf[width] = '\0';
 
-  return buf;
+  return buf.str();
 }
 
 } // namespace ledger
diff --git a/src/format.h b/src/format.h
index 13a2fff2..fbfe452e 100644
--- a/src/format.h
+++ b/src/format.h
@@ -32,13 +32,63 @@
 #ifndef _FORMAT_H
 #define _FORMAT_H
 
+#define SUPPORT_UNICODE 1
+
 #include "journal.h"
 #include "expr.h"
+#if defined(SUPPORT_UNICODE)
+#include "utf8.h"
+#endif
 
 namespace ledger {
 
 DECLARE_EXCEPTION(format_error, std::runtime_error);
 
+#if defined(SUPPORT_UNICODE)
+/**
+ * @class unistring
+ *
+ * @brief Abstract working with UTF-32 encoded Unicode strings
+ *
+ * The input is a UTF-8 encoded ledger::string; it is decoded to UTF-32 so
+ * that its true length can be taken, or characters extracted by position.
+ */
+class unistring
+{
+  std::vector<uint32_t> utf32chars;
+
+public:
+  unistring(const string& input)
+  {
+    TRACE_CTOR(unistring, "");
+
+    const char * p   = input.c_str();
+    std::size_t	 len = input.length();
+
+    VERIFY(utf8::is_valid(p, p + len));
+    utf8::utf8to32(p, p + len, std::back_inserter(utf32chars));
+  }
+  ~unistring() { TRACE_DTOR(unistring); }
+
+  /// Number of code points held, as opposed to bytes of UTF-8.
+  std::size_t length() const { return utf32chars.size(); }
+
+  /// Return up to @a len code points starting at @a begin, re-encoded as
+  /// UTF-8.  A @a len of 0 means "through the end"; ranges are clamped.
+  string extract(const std::size_t begin = 0,
+		 const std::size_t len   = 0) const
+  {
+    const std::size_t size = length();
+    const std::size_t from = begin < size ? begin : size;
+    const std::size_t to   = (len && len < size - from) ? from + len : size;
+    string utf8result;
+    utf8::utf32to8(utf32chars.begin() + from, utf32chars.begin() + to,
+		   std::back_inserter(utf8result));
+    return utf8result;
+  }
+};
+#endif
+
 class report_t;
 
 class format_t : public noncopyable
@@ -135,7 +185,7 @@ public:
       elem->dump(out);
   }
 
-  static string truncate(const string& str, unsigned int width,
+  static string truncate(const unistring& str, unsigned int width,
 			 const bool is_account = false);
 };
 
diff --git a/src/report.cc b/src/report.cc
index 2dfc9d76..4b45577b 100644
--- a/src/report.cc
+++ b/src/report.cc
@@ -789,6 +789,8 @@ expr_t::ptr_op_t report_t::lookup(const string& name)
   case 't':
     if (std::strcmp(p, "total_expr") == 0)
 	return MAKE_FUNCTOR(report_t::get_total_expr);
+    else if (std::strcmp(p, "truncate") == 0)
+	return MAKE_FUNCTOR(report_t::get_total_expr);
     break;
   }
 
diff --git a/src/session.cc b/src/session.cc
index 9a3cccd5..8e78e9e9 100644
--- a/src/session.cc
+++ b/src/session.cc
@@ -71,9 +71,9 @@ void release_session_context()
 
 session_t::session_t()
   : register_format
-    ("%-.9(date) %-.20(payee) %-.23(account) %!12(print_balance(amount_expr, 12, 67)) "
+    ("%-.9(date) %-.20(payee) %-.23(account(23)) %!12(print_balance(amount_expr, 12, 67)) "
      "%!12(print_balance(display_total, 12, 80, true))\n%/"
-     "%31|%-.23(account) %!12(print_balance(amount_expr, 12, 67)) "
+     "%31|%-.23(account(23)) %!12(print_balance(amount_expr, 12, 67)) "
      "%!12(print_balance(display_total, 12, 80, true))\n"),
     wide_register_format
     ("%-.9D  %-.35P %-.39A %22.108t %!22.132T\n%/"
diff --git a/src/textual.cc b/src/textual.cc
index e1a08f69..4600f3f3 100644
--- a/src/textual.cc
+++ b/src/textual.cc
@@ -596,10 +596,16 @@ static inline void parse_symbol(char *& p, string& symbol)
 
 bool textual_parser_t::test(std::istream& in) const
 {
-  char buf[5];
+  char   buf[12];
+  char * p;
 
-  in.read(buf, 5);
-  if (std::strncmp(buf, "<?xml", 5) == 0) {
+  in.read(buf, 11);
+  if (utf8::is_bom(buf))
+    p = &buf[3];
+  else
+    p = buf;
+
+  if (std::strncmp(p, "<?xml", 5) == 0) {
 #if defined(HAVE_EXPAT) || defined(HAVE_XMLPARSE)
     throw parse_error("Ledger file contains XML data, but format was not recognized");
 #else
@@ -688,10 +694,11 @@ unsigned int textual_parser_t::parse(std::istream& in,
 {
   TRACE_START(parsing_total, 1, "Total time spent parsing text:");
 
-  static bool  added_auto_entry_hook = false;
-  static char  line[MAX_LINE + 1];
-  unsigned int count  = 0;
-  unsigned int errors = 0;
+  static bool	added_auto_entry_hook = false;
+  static char	linebuf[MAX_LINE + 1];
+         char * line;
+  unsigned int	count  = 0;
+  unsigned int	errors = 0;
 
   std::list<account_t *>  account_stack;
   auto_entry_finalizer_t  auto_entry_finalizer(&journal);
@@ -714,10 +721,15 @@ unsigned int textual_parser_t::parse(std::istream& in,
 
   while (in.good() && ! in.eof()) {
     try {
-      in.getline(line, MAX_LINE);
+      in.getline(linebuf, MAX_LINE);
       if (in.eof())
 	break;
 
+      if (linenum == 1 && utf8::is_bom(linebuf))
+	line = &linebuf[3];
+      else
+	line = linebuf;
+
       int len = std::strlen(line);
       if (line[len - 1] == '\r')
 	line[--len] = '\0';