From c8641a6de65670b8833992c94c51a586a6434a74 Mon Sep 17 00:00:00 2001 From: John Wiegley Date: Sat, 7 Nov 2009 08:32:44 -0500 Subject: Added support for Boost.Regex w/ ICU This allows for correct searching of UTF-8 encoded strings, such as lower-case versions of Russian words to find mixed-case words. --- src/derive.cc | 6 +++--- src/mask.cc | 6 +++++- src/mask.h | 43 +++++++++++++++++++++++++++++++++++++------ src/post.cc | 2 +- src/report.cc | 2 +- src/system.hh.in | 4 ++++ src/unistring.h | 7 +++++-- 7 files changed, 56 insertions(+), 14 deletions(-) (limited to 'src') diff --git a/src/derive.cc b/src/derive.cc index d3a7a37d..081b96b2 100644 --- a/src/derive.cc +++ b/src/derive.cc @@ -307,7 +307,7 @@ namespace { DEBUG("derive.xact", "Setting note from match: " << *added->note); #endif } else { - added->payee = tmpl.payee_mask.expr.str(); + added->payee = tmpl.payee_mask.str(); DEBUG("derive.xact", "Setting payee from template: " << added->payee); } @@ -403,14 +403,14 @@ namespace { account_t * acct = NULL; if (! acct) { - acct = journal.find_account_re(post.account_mask->expr.str()); + acct = journal.find_account_re(post.account_mask->str()); #if defined(DEBUG_ON) if (acct) DEBUG("derive.xact", "Found account as a regular expression"); #endif } if (! acct) { - acct = journal.find_account(post.account_mask->expr.str()); + acct = journal.find_account(post.account_mask->str()); #if defined(DEBUG_ON) if (acct) DEBUG("derive.xact", "Found (or created) account by name"); diff --git a/src/mask.cc b/src/mask.cc index 135f6669..c1e66ced 100644 --- a/src/mask.cc +++ b/src/mask.cc @@ -43,7 +43,11 @@ mask_t::mask_t(const string& pat) : expr() mask_t& mask_t::operator=(const string& pat) { - expr.assign(pat.c_str(), regex::perl | regex::icase); +#if defined(HAVE_BOOST_REGEX_UNICODE) + expr = boost::make_u32regex(pat.c_str(), boost::regex::perl | boost::regex::icase); +#else + expr.assign(pat.c_str(), boost::regex::perl | boost::regex::icase); +#endif VERIFY(valid()); return *this; } diff --git a/src/mask.h b/src/mask.h index 32d27f42..62df9b63 100644 --- a/src/mask.h +++ b/src/mask.h @@ -45,6 +45,9 @@ #define _MASK_H #include "utils.h" +#if defined(HAVE_BOOST_REGEX_UNICODE) +#include "unistring.h" +#endif namespace ledger { @@ -56,7 +59,11 @@ namespace ledger { class mask_t { public: +#if defined(HAVE_BOOST_REGEX_UNICODE) + boost::u32regex expr; +#else boost::regex expr; +#endif explicit mask_t(const string& pattern); @@ -76,17 +83,41 @@ public: return expr == other.expr; } - bool match(const string& str) const { + bool match(const string& text) const { +#if defined(HAVE_BOOST_REGEX_UNICODE) DEBUG("mask.match", - "Matching: \"" << str << "\" =~ /" << expr.str() << "/ = " - << (boost::regex_search(str, expr) ? "true" : "false")); - return boost::regex_search(str, expr); + "Matching: \"" << text << "\" =~ /" << str() << "/ = " + << (boost::u32regex_search(text, expr) ? "true" : "false")); + return boost::u32regex_search(text, expr); +#else + DEBUG("mask.match", + "Matching: \"" << text << "\" =~ /" << str() << "/ = " + << (boost::regex_search(text, expr) ? "true" : "false")); + return boost::regex_search(text, expr); +#endif } bool empty() const { return expr.empty(); } + string str() const { + if (! empty()) { +#if defined(HAVE_BOOST_REGEX_UNICODE) + assert(sizeof(boost::uint32_t) == sizeof(UChar32)); + unistring ustr; + std::basic_string expr_str = expr.str(); + std::copy(expr_str.begin(), expr_str.end(), + std::back_inserter(ustr.utf32chars)); + return ustr.extract(); +#else + return expr.str(); +#endif + } else { + return empty_string; + } + } + bool valid() const { if (expr.status() != 0) { DEBUG("ledger.validate", "mask_t: expr.status() != 0"); @@ -108,7 +139,7 @@ private: ar & temp; *this = temp; } else { - temp = expr.str(); + temp = str(); ar & temp; } } @@ -116,7 +147,7 @@ private: }; inline std::ostream& operator<<(std::ostream& out, const mask_t& mask) { - out << mask.expr.str(); + out << mask.str(); return out; } diff --git a/src/post.cc b/src/post.cc index 4f45592f..0fd763a9 100644 --- a/src/post.cc +++ b/src/post.cc @@ -246,7 +246,7 @@ namespace { if (env.value_at(0).is_string()) account = master->find_account(env.get(0), false); else if (env.value_at(0).is_mask()) - account = master->find_account_re(env.get(0).expr.str()); + account = master->find_account_re(env.get(0).str()); } else { account = env->reported_account(); } diff --git a/src/report.cc b/src/report.cc index 77548cce..fbe8d37c 100644 --- a/src/report.cc +++ b/src/report.cc @@ -322,7 +322,7 @@ value_t report_t::fn_account_total(call_scope_t& args) acct = session.journal->find_account(name, false); } else if (args[0].is_mask()) { - name = args[0].as_mask().expr.str(); + name = args[0].as_mask().str(); acct = session.journal->find_account_re(name); } else { diff --git a/src/system.hh.in b/src/system.hh.in index b0b8f1eb..12f257eb 100644 --- a/src/system.hh.in +++ b/src/system.hh.in @@ -164,7 +164,11 @@ typedef std::ostream::pos_type ostream_pos_type; #include #include #include +#if defined(HAVE_BOOST_REGEX_UNICODE) +#include +#else #include +#endif // HAVE_BOOST_REGEX_UNICODE #include #include diff --git a/src/unistring.h b/src/unistring.h index 268f60e3..bc55b016 100644 --- a/src/unistring.h +++ b/src/unistring.h @@ -59,12 +59,15 @@ namespace ledger { */ class unistring { +public: std::vector utf32chars; -public: + unistring() { + TRACE_CTOR(unistring, ""); + } unistring(const std::string& input) { - TRACE_CTOR(unistring, ""); + TRACE_CTOR(unistring, "std::string"); const char * p = input.c_str(); std::size_t len = input.length(); -- cgit v1.2.3