diff options
author | John Wiegley <johnw@newartisans.com> | 2009-11-07 08:32:44 -0500 |
---|---|---|
committer | John Wiegley <johnw@newartisans.com> | 2009-11-07 08:34:13 -0500 |
commit | c8641a6de65670b8833992c94c51a586a6434a74 (patch) | |
tree | eb59642cd3296a98ec4c7a73ca319b1c57c2f7ad | |
parent | 95a068f5e4b0e5c06fd9824f7f999248e28fee7b (diff) | |
download | fork-ledger-c8641a6de65670b8833992c94c51a586a6434a74.tar.gz fork-ledger-c8641a6de65670b8833992c94c51a586a6434a74.tar.bz2 fork-ledger-c8641a6de65670b8833992c94c51a586a6434a74.zip |
Added support for Boost.Regex w/ ICU
This allows for correct case-insensitive searching of UTF-8 encoded strings;
for example, a lower-case Russian search term can now find mixed-case words.
-rwxr-xr-x | acprep | 20 | ||||
-rw-r--r-- | doc/sample.dat | 2 | ||||
-rw-r--r-- | lib/Makefile | 39 | ||||
-rw-r--r-- | src/derive.cc | 6 | ||||
-rw-r--r-- | src/mask.cc | 6 | ||||
-rw-r--r-- | src/mask.h | 43 | ||||
-rw-r--r-- | src/post.cc | 2 | ||||
-rw-r--r-- | src/report.cc | 2 | ||||
-rw-r--r-- | src/system.hh.in | 4 | ||||
-rw-r--r-- | src/unistring.h | 7 | ||||
-rw-r--r-- | tools/configure.ac | 23 |
11 files changed, 127 insertions, 27 deletions
@@ -751,6 +751,10 @@ class PrepareBuild(CommandLineApp): self.sys_include_dirs.insert(0, '/usr/local/stow/cppunit/include') self.sys_library_dirs.insert(0, '/usr/local/stow/cppunit/lib') + if exists('/usr/local/stow/icu/include'): + self.sys_include_dirs.insert(0, '/usr/local/stow/icu/include') + self.sys_library_dirs.insert(0, '/usr/local/stow/icu/lib') + self.CXXFLAGS.append('-march=nocona') self.CXXFLAGS.append('-msse3') self.CPPFLAGS.append('-D_GLIBCXX_FULLY_DYNAMIC_STRING=1') @@ -979,6 +983,14 @@ class PrepareBuild(CommandLineApp): self.sys_include_dirs.insert(0, '/usr/local/stow/cppunit-debug/include') self.sys_library_dirs.insert(0, '/usr/local/stow/cppunit-debug/lib') + if exists('/usr/local/stow/icu-debug/include'): + if '/usr/local/stow/icu/include' in self.sys_include_dirs: + self.sys_include_dirs.remove('/usr/local/stow/icu/include') + self.sys_library_dirs.remove('/usr/local/stow/icu/lib') + + self.sys_include_dirs.insert(0, '/usr/local/stow/icu-debug/include') + self.sys_library_dirs.insert(0, '/usr/local/stow/icu-debug/lib') + if exists('/opt/local/lib/libboost_regex-d.a'): self.envvars['BOOST_HOME'] = '/opt/local' self.envvars['BOOST_SUFFIX'] = '-d' @@ -988,9 +1000,9 @@ class PrepareBuild(CommandLineApp): self.sys_include_dirs.append('/opt/local/include/boost') - elif exists('/usr/local/lib/libboost_regex-xgcc44-sd-1_40.a'): + elif exists('/usr/local/lib/libboost_regex-xgcc44-d-1_40.a'): self.envvars['BOOST_HOME'] = '/usr/local' - self.envvars['BOOST_SUFFIX'] = '-xgcc44-sd-1_40' + self.envvars['BOOST_SUFFIX'] = '-xgcc44-d-1_40' self.log.info('Setting BOOST_SUFFIX => %s' % self.envvars['BOOST_SUFFIX']) @@ -1005,9 +1017,9 @@ class PrepareBuild(CommandLineApp): self.sys_include_dirs.append('/opt/local/include/boost') - elif exists('/usr/local/lib/libboost_regex-xgcc44-s-1_40.a'): + elif exists('/usr/local/lib/libboost_regex-xgcc44-1_40.a'): self.envvars['BOOST_HOME'] = '/usr/local' - self.envvars['BOOST_SUFFIX'] = '-xgcc44-s-1_40' + 
self.envvars['BOOST_SUFFIX'] = '-xgcc44-1_40' self.log.info('Setting BOOST_SUFFIX => %s' % self.envvars['BOOST_SUFFIX']) diff --git a/doc/sample.dat b/doc/sample.dat index 002d20ee..e773d6df 100644 --- a/doc/sample.dat +++ b/doc/sample.dat @@ -24,7 +24,7 @@ N $ Income:Salary 2004/05/14 * Another dày in which there is Páying - Русский язык:Русский язык:Русский язык:Русский язык $1000.00 + Русский язык:Активы:Русский язык:Русский язык $1000.00 Income:Salary 2004/05/27 Book Store diff --git a/lib/Makefile b/lib/Makefile index 07cf77ea..3a9c3214 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -2,34 +2,57 @@ # This is only important if you intend to produce a Ledger binary for # installation. -#ARCH_CFLAGS = -g -arch i386 -arch ppc -isysroot /Developer/SDKs/MacOSX10.5.sdk -#ARCH_LDFLAGS = -g -arch i386 -arch ppc -Wl,-syslibroot,/Developer/SDKs/MacOSX10.5.sdk - STOW_ROOT = /usr/local/stow BOOST_SOURCE = boost BOOST_VERSION = 1_40_0 -# architecture=combined +icu-release: + -(cd icu/source; make distclean) + (cd icu/source; sh autogen.sh; \ + ./configure CPPFLAGS="" \ + CFLAGS="$(ARCH_CFLAGS)" \ + LDFLAGS="$(ARCH_LDFLAGS)" \ + CC="$(CC)" CXX="$(CXX)" LD="$(LD)" \ + --enable-static \ + --prefix=$(STOW_ROOT)/icu && \ + make install) + +icu-debug: + -(cd icu/source; make distclean) + (cd icu/source; sh autogen.sh; \ + ./configure CPPFLAGS="-D_GLIBCXX_DEBUG=1" \ + CFLAGS="-g $(ARCH_CFLAGS)" \ + LDFLAGS="-g $(ARCH_LDFLAGS)" \ + CC="$(CC)" CXX="$(CXX)" LD="$(LD)" \ + --enable-static --enable-debug \ + --prefix=$(STOW_ROOT)/icu-debug && \ + make install) + +icu-build: icu-release icu-debug + boost-release: (cd $(BOOST_SOURCE) && \ bjam release --prefix=$(STOW_ROOT)/boost_$(BOOST_VERSION) \ --build-dir=$(HOME)/Products/boost_$(BOOST_VERSION) \ - --toolset=darwin --build-type=complete --layout=versioned install) + --toolset=darwin --build-type=complete --layout=versioned \ + -sHAVE_ICU=1 -sICU_PATH=/usr/local/stow/icu install) boost-debug: (cd $(BOOST_SOURCE) && \ bjam debug 
--prefix=$(STOW_ROOT)/boost_$(BOOST_VERSION) \ --build-dir=$(HOME)/Products/boost_$(BOOST_VERSION) \ --toolset=darwin --build-type=complete --layout=versioned \ - define=_GLIBCXX_DEBUG=1 install) + define=_GLIBCXX_DEBUG=1 \ + -sHAVE_ICU=1 -sICU_PATH=/usr/local/stow/icu-debug install) boost-build: boost-release boost-debug cppunit-release: -(cd cppunit; make distclean) (cd cppunit; sh autogen.sh; \ - ./configure CFLAGS="$(ARCH_CFLAGS)" \ + ./configure CPPFLAGS="" \ + CFLAGS="$(ARCH_CFLAGS)" \ LDFLAGS="$(ARCH_LDFLAGS)" \ CC="$(CC)" CXX="$(CXX)" LD="$(LD)" \ --prefix=$(STOW_ROOT)/cppunit && \ @@ -47,4 +70,4 @@ cppunit-debug: cppunit-build: cppunit-release cppunit-debug -build-all: boost-build cppunit-build +all: boost-build cppunit-build diff --git a/src/derive.cc b/src/derive.cc index d3a7a37d..081b96b2 100644 --- a/src/derive.cc +++ b/src/derive.cc @@ -307,7 +307,7 @@ namespace { DEBUG("derive.xact", "Setting note from match: " << *added->note); #endif } else { - added->payee = tmpl.payee_mask.expr.str(); + added->payee = tmpl.payee_mask.str(); DEBUG("derive.xact", "Setting payee from template: " << added->payee); } @@ -403,14 +403,14 @@ namespace { account_t * acct = NULL; if (! acct) { - acct = journal.find_account_re(post.account_mask->expr.str()); + acct = journal.find_account_re(post.account_mask->str()); #if defined(DEBUG_ON) if (acct) DEBUG("derive.xact", "Found account as a regular expression"); #endif } if (! 
acct) { - acct = journal.find_account(post.account_mask->expr.str()); + acct = journal.find_account(post.account_mask->str()); #if defined(DEBUG_ON) if (acct) DEBUG("derive.xact", "Found (or created) account by name"); diff --git a/src/mask.cc b/src/mask.cc index 135f6669..c1e66ced 100644 --- a/src/mask.cc +++ b/src/mask.cc @@ -43,7 +43,11 @@ mask_t::mask_t(const string& pat) : expr() mask_t& mask_t::operator=(const string& pat) { - expr.assign(pat.c_str(), regex::perl | regex::icase); +#if defined(HAVE_BOOST_REGEX_UNICODE) + expr = boost::make_u32regex(pat.c_str(), boost::regex::perl | boost::regex::icase); +#else + expr.assign(pat.c_str(), boost::regex::perl | boost::regex::icase); +#endif VERIFY(valid()); return *this; } @@ -45,6 +45,9 @@ #define _MASK_H #include "utils.h" +#if defined(HAVE_BOOST_REGEX_UNICODE) +#include "unistring.h" +#endif namespace ledger { @@ -56,7 +59,11 @@ namespace ledger { class mask_t { public: +#if defined(HAVE_BOOST_REGEX_UNICODE) + boost::u32regex expr; +#else boost::regex expr; +#endif explicit mask_t(const string& pattern); @@ -76,17 +83,41 @@ public: return expr == other.expr; } - bool match(const string& str) const { + bool match(const string& text) const { +#if defined(HAVE_BOOST_REGEX_UNICODE) DEBUG("mask.match", - "Matching: \"" << str << "\" =~ /" << expr.str() << "/ = " - << (boost::regex_search(str, expr) ? "true" : "false")); - return boost::regex_search(str, expr); + "Matching: \"" << text << "\" =~ /" << str() << "/ = " + << (boost::u32regex_search(text, expr) ? "true" : "false")); + return boost::u32regex_search(text, expr); +#else + DEBUG("mask.match", + "Matching: \"" << text << "\" =~ /" << str() << "/ = " + << (boost::regex_search(text, expr) ? "true" : "false")); + return boost::regex_search(text, expr); +#endif } bool empty() const { return expr.empty(); } + string str() const { + if (! 
empty()) { +#if defined(HAVE_BOOST_REGEX_UNICODE) + assert(sizeof(boost::uint32_t) == sizeof(UChar32)); + unistring ustr; + std::basic_string<UChar32> expr_str = expr.str(); + std::copy(expr_str.begin(), expr_str.end(), + std::back_inserter(ustr.utf32chars)); + return ustr.extract(); +#else + return expr.str(); +#endif + } else { + return empty_string; + } + } + bool valid() const { if (expr.status() != 0) { DEBUG("ledger.validate", "mask_t: expr.status() != 0"); @@ -108,7 +139,7 @@ private: ar & temp; *this = temp; } else { - temp = expr.str(); + temp = str(); ar & temp; } } @@ -116,7 +147,7 @@ private: }; inline std::ostream& operator<<(std::ostream& out, const mask_t& mask) { - out << mask.expr.str(); + out << mask.str(); return out; } diff --git a/src/post.cc b/src/post.cc index 4f45592f..0fd763a9 100644 --- a/src/post.cc +++ b/src/post.cc @@ -246,7 +246,7 @@ namespace { if (env.value_at(0).is_string()) account = master->find_account(env.get<string>(0), false); else if (env.value_at(0).is_mask()) - account = master->find_account_re(env.get<mask_t>(0).expr.str()); + account = master->find_account_re(env.get<mask_t>(0).str()); } else { account = env->reported_account(); } diff --git a/src/report.cc b/src/report.cc index 77548cce..fbe8d37c 100644 --- a/src/report.cc +++ b/src/report.cc @@ -322,7 +322,7 @@ value_t report_t::fn_account_total(call_scope_t& args) acct = session.journal->find_account(name, false); } else if (args[0].is_mask()) { - name = args[0].as_mask().expr.str(); + name = args[0].as_mask().str(); acct = session.journal->find_account_re(name); } else { diff --git a/src/system.hh.in b/src/system.hh.in index b0b8f1eb..12f257eb 100644 --- a/src/system.hh.in +++ b/src/system.hh.in @@ -164,7 +164,11 @@ typedef std::ostream::pos_type ostream_pos_type; #include <boost/random/uniform_int.hpp> #include <boost/random/uniform_real.hpp> #include <boost/random/variate_generator.hpp> +#if defined(HAVE_BOOST_REGEX_UNICODE) +#include <boost/regex/icu.hpp> +#else 
#include <boost/regex.hpp> +#endif // HAVE_BOOST_REGEX_UNICODE #include <boost/variant.hpp> #include <boost/version.hpp> diff --git a/src/unistring.h b/src/unistring.h index 268f60e3..bc55b016 100644 --- a/src/unistring.h +++ b/src/unistring.h @@ -59,12 +59,15 @@ namespace ledger { */ class unistring { +public: std::vector<boost::uint32_t> utf32chars; -public: + unistring() { + TRACE_CTOR(unistring, ""); + } unistring(const std::string& input) { - TRACE_CTOR(unistring, ""); + TRACE_CTOR(unistring, "std::string"); const char * p = input.c_str(); std::size_t len = input.length(); diff --git a/tools/configure.ac b/tools/configure.ac index 22b4b96a..747d940d 100644 --- a/tools/configure.ac +++ b/tools/configure.ac @@ -193,6 +193,29 @@ else AC_MSG_FAILURE("Could not find boost_regex library (set CPPFLAGS and LDFLAGS?)") fi +AC_CACHE_CHECK( + [if boost_regex w/ICU is available], + [boost_regex_icu_avail_cv_], + [boost_regex_icu_save_libs=$LIBS + LIBS="-licuuc $LIBS" + AC_LANG_PUSH(C++) + AC_LINK_IFELSE( + [AC_LANG_PROGRAM( + [[#include <boost/regex/icu.hpp> + using namespace boost;]], + [[std::string text = "Активы"; + u32regex r = make_u32regex("активы", regex::perl | regex::icase); + return u32regex_search(text, r) ? 0 : 1;]])], + [boost_regex_icu_avail_cv_=true], + [boost_regex_icu_avail_cv_=false]) + AC_LANG_POP + LIBS=$boost_regex_icu_save_libs]) + +if [test x$boost_regex_icu_avail_cv_ = xtrue ]; then + AC_DEFINE([HAVE_BOOST_REGEX_UNICODE], [1], [If the boost_regex library w/ICU is available]) + LIBS="-licuuc $LIBS" +fi + # check for boost_date_time AC_CACHE_CHECK( [if boost_date_time is available], |