summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Wiegley <johnw@newartisans.com>2010-03-07 22:53:57 -0500
committerJohn Wiegley <johnw@newartisans.com>2010-03-08 01:11:55 -0500
commit1bf0220f246c5e984d9a43e5d8b3979b9091bc5a (patch)
tree710291f43415b9aed741a948ee8687316df13bda
parente070cdfc8ddcf9d6a25b593502f1c5ade56c849c (diff)
downloadfork-ledger-1bf0220f246c5e984d9a43e5d8b3979b9091bc5a.tar.gz
fork-ledger-1bf0220f246c5e984d9a43e5d8b3979b9091bc5a.tar.bz2
fork-ledger-1bf0220f246c5e984d9a43e5d8b3979b9091bc5a.zip
Added experimental account lookup logic
This is used by the "xact" command, and the CSV importer. It is very slow O(xacts*records).
-rw-r--r--src/convert.cc50
-rw-r--r--src/draft.cc26
-rw-r--r--src/lookup.cc296
-rw-r--r--src/lookup.h56
-rw-r--r--tools/Makefile.am2
5 files changed, 406 insertions, 24 deletions
diff --git a/src/convert.cc b/src/convert.cc
index e47fb6c0..6c02cff3 100644
--- a/src/convert.cc
+++ b/src/convert.cc
@@ -38,7 +38,8 @@
#include "iterators.h"
#include "report.h"
#include "xact.h"
-#include "output.h"
+#include "print.h"
+#include "lookup.h"
namespace ledger {
@@ -54,7 +55,8 @@ value_t convert_command(call_scope_t& scope)
else
bucket_name = "Equity:Unknown";
- account_t * bucket = journal.master->find_account(bucket_name);
+ account_t * bucket = journal.master->find_account(bucket_name);
+ account_t * unknown = journal.master->find_account(_("Expenses:Unknown"));
// Make an amounts mapping for the account under consideration
@@ -81,14 +83,15 @@ value_t convert_command(call_scope_t& scope)
}
}
+ // Create a flat list o
+ xacts_list current_xacts(journal.xacts_begin(), journal.xacts_end());
+
// Read in the series of transactions from the CSV file
- format_posts formatter(report,
- report.report_format(report.HANDLER(print_format_)),
- false);
+ print_xacts formatter(report);
+ ifstream data(path(args.get<string>(0)));
+ csv_reader reader(data);
- ifstream data(path(args.get<string>(0)));
- csv_reader reader(data);
while (xact_t * xact = reader.read_xact(journal, bucket)) {
bool matched = false;
post_map_t::iterator i = post_map.find(- xact->posts.front()->amount);
@@ -111,17 +114,34 @@ value_t convert_command(call_scope_t& scope)
DEBUG("convert.csv", "Ignored xact with code: " << *xact->code);
delete xact; // ignore it
}
- else if (! journal.add_xact(xact)) {
- delete xact;
- throw_(std::runtime_error,
- _("Failed to finalize derived transaction (check commodities)"));
- }
else {
- xact_posts_iterator xact_iter(*xact);
- while (post_t * post = xact_iter())
- formatter(*post);
+ if (xact->posts.front()->account == NULL) {
+ xacts_iterator xi;
+ xi.xacts_i = current_xacts.begin();
+ xi.xacts_end = current_xacts.end();
+ xi.xacts_uninitialized = false;
+
+ // jww (2010-03-07): Bind this logic to an option: --auto-match
+ if (account_t * acct =
+ lookup_probable_account(xact->payee, xi, bucket).second)
+ xact->posts.front()->account = acct;
+ else
+ xact->posts.front()->account = unknown;
+ }
+
+ if (! journal.add_xact(xact)) {
+ delete xact;
+ throw_(std::runtime_error,
+ _("Failed to finalize derived transaction (check commodities)"));
+ }
+ else {
+ xact_posts_iterator xact_iter(*xact);
+ while (post_t * post = xact_iter())
+ formatter(*post);
+ }
}
}
+ formatter.flush();
// If not, transform the payee according to regexps
diff --git a/src/draft.cc b/src/draft.cc
index 74e17dba..89c68541 100644
--- a/src/draft.cc
+++ b/src/draft.cc
@@ -38,6 +38,7 @@
#include "journal.h"
#include "session.h"
#include "report.h"
+#include "lookup.h"
#include "print.h"
namespace ledger {
@@ -242,18 +243,25 @@ xact_t * draft_t::insert(journal_t& journal)
if (tmpl->payee_mask.empty())
throw std::runtime_error(_("xact' command requires at least a payee"));
- xact_t * matching = NULL;
+ xact_t * matching = NULL;
std::auto_ptr<xact_t> added(new xact_t);
- for (xacts_list::reverse_iterator j = journal.xacts.rbegin();
- j != journal.xacts.rend();
- j++) {
- if (tmpl->payee_mask.match((*j)->payee)) {
- matching = *j;
- DEBUG("derive.xact",
- "Found payee match: transaction on line " << (*j)->pos->beg_line);
- break;
+ xacts_iterator xi(journal);
+ if (xact_t * xact = lookup_probable_account(tmpl->payee_mask.str(), xi).first) {
+ DEBUG("derive.xact", "Found payee by lookup: transaction on line "
+ << xact->pos->beg_line);
+ matching = xact;
+ } else {
+ for (xacts_list::reverse_iterator j = journal.xacts.rbegin();
+ j != journal.xacts.rend();
+ j++) {
+ if (tmpl->payee_mask.match((*j)->payee)) {
+ matching = *j;
+ DEBUG("derive.xact",
+ "Found payee match: transaction on line " << (*j)->pos->beg_line);
+ break;
+ }
}
}
diff --git a/src/lookup.cc b/src/lookup.cc
new file mode 100644
index 00000000..d2056718
--- /dev/null
+++ b/src/lookup.cc
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2003-2010, John Wiegley. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of New Artisans LLC nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <system.hh>
+
+#include "lookup.h"
+#include "unistring.h"
+
+namespace ledger {
+
+namespace {
+ typedef std::pair<xact_t *, int> score_entry_t;
+ typedef std::deque<score_entry_t> scorecard_t;
+ typedef std::map<uint32_t, std::size_t> char_positions_map;
+
+ struct score_sorter {
+ bool operator()(const score_entry_t& left,
+ const score_entry_t& right) const {
+ return left.second > right.second;
+ }
+ };
+
+ typedef std::map<account_t *, int> account_use_map;
+ typedef std::pair<account_t *, int> account_use_pair;
+ typedef std::vector<account_use_pair> account_use_vector;
+
+ struct usage_sorter {
+ bool operator()(const account_use_pair& left,
+ const account_use_pair& right) const {
+ return left.second > right.second;
+ }
+ };
+}
+
+std::pair<xact_t *, account_t *>
+lookup_probable_account(const string& ident,
+ xacts_iterator& iter_func,
+ account_t * ref_account)
+{
+ scorecard_t scores;
+
+#if !defined(HAVE_BOOST_REGEX_UNICODE)
+ string lident = ident;
+ to_lower(lident);
+ unistring lowered_ident(lident);
+#else
+ // jww (2010-03-07): Not yet implemented
+ unistring lowered_ident(ident);
+#endif
+
+ DEBUG("lookup.account",
+ "Looking up identifier '" << lowered_ident.extract() << "'");
+#if defined(DEBUG_ON)
+ if (ref_account != NULL)
+ DEBUG("lookup.account",
+ " with reference account: " << ref_account->fullname());
+#endif
+
+ while (xact_t * xact = iter_func()) {
+ // Only consider transactions from the last two years (jww (2010-03-07):
+ // make this an option)
+ if ((CURRENT_DATE() - xact->date()).days() > 700)
+ continue;
+
+ // An exact match is worth a score of 100
+ if (ident == xact->payee) {
+ DEBUG("lookup", " we have an exact match, score = 100");
+ scores.push_back(score_entry_t(xact, 100));
+ break;
+ }
+
+#if !defined(HAVE_BOOST_REGEX_UNICODE)
+ string payee = xact->payee;
+ to_lower(payee);
+ unistring value_key(payee);
+#else
+ // jww (2010-03-07): Not yet implemented
+ unistring value_key(xact->payee);
+#endif
+
+ DEBUG("lookup", "Considering payee: " << value_key.extract());
+
+ std::size_t index = 0;
+ std::size_t last_match_pos = unistring::npos;
+ int bonus = 0;
+ int score = 0;
+ std::size_t pos;
+ char_positions_map positions;
+
+ // Walk each letter in the source identifier
+ foreach (const uint32_t& ch, lowered_ident.utf32chars) {
+ int addend = 0;
+
+ pos = value_key.find(ch);
+
+ // If it occurs in the same order as the source identifier -- that is,
+ // without intervening letters to break the pattern -- it's worth 10
+ // points. Plus, an extra point is added for every letter in chains of
+ // 3 or more.
+
+ bool added_bonus = false;
+ if (pos == last_match_pos + 1) {
+ DEBUG("lookup",
+ " char " << index << " in-sequence match with bonus " << bonus);
+ addend += 10;
+ if (bonus > 2)
+ addend += bonus - 2;
+ bonus++;
+ added_bonus = true;
+
+ last_match_pos = pos;
+ }
+
+ // If it occurs in the same general sequence as the source identifier,
+ // it's worth 5 points, plus an extra point if it's within the next 3
+ // characters, and an extra point if it's preceded by a non-alphabetic
+ // character.
+ //
+ // If the letter occurs at all in the target identifier, it's worth 1
+ // point, plus an extra point if it's within 3 characters, and an extra
+ // point if it's preceded by a non-alphabetic character.
+
+ else if (pos != unistring::npos) {
+ // Ensure that a letter which has been matched is not matched twice,
+ // so that the two x's of Exxon don't both match to the single x in
+ // Oxford.
+ char_positions_map::iterator pi = positions.find(ch);
+ if (pi != positions.end()) {
+ while (pi != positions.end() &&
+ pos != unistring::npos && pos <= (*pi).second &&
+ (*pi).second + 1 < value_key.length()) {
+ pos = value_key.find(ch, (*pi).second + 1);
+ }
+ }
+
+ if (pos != unistring::npos) {
+ bool in_order_match = (last_match_pos != unistring::npos &&
+ pos > last_match_pos);
+ DEBUG("lookup", " char " << index << " " <<
+ (in_order_match ? "in-order" : "out-of-order")
+ << " match" << (in_order_match && pos - index < 3 ?
+ " with proximity bonus of 1" : ""));
+
+ if (pi != positions.end())
+ (*pi).second = pos;
+ else
+ positions.insert(char_positions_map::value_type(ch, pos));
+
+ if (pos < index)
+ addend += 1;
+ else
+ addend += 5;
+
+ if (in_order_match && pos - index < 3)
+ addend++;
+
+#if !defined(HAVE_BOOST_REGEX_UNICODE)
+ if (pos == 0 || (pos > 0 && !std::isalnum(value_key[pos - 1])))
+ addend++;
+#else
+ // jww (2010-03-07): Not yet implemented
+#endif
+
+ last_match_pos = pos;
+ } else {
+ last_match_pos = unistring::npos;
+
+ DEBUG("lookup", " char " << index << " does not match here");
+ addend--;
+ }
+ }
+
+ // If the letter does not appear at all, decrease the score by 1
+
+ else {
+ last_match_pos = unistring::npos;
+
+ DEBUG("lookup", " char " << index << " does not match");
+ addend--;
+ }
+
+ // Finally, decay what is to be added to the score based on its position
+ // in the word. Since credit card payees in particular often share
+ // information at the end (such as the location where the purchase was
+ // made), we want to give much more credence to what occurs at the
+ // beginning. Every 5 character positions from the beginning becomes a
+ // divisor for the addend.
+
+ if ((int(index / 5) + 1) > 1) {
+ DEBUG("lookup",
+ " discounting the addend by / " << (int(index / 5) + 1));
+ addend = int(double(addend) / (int(index / 5) + 1));
+ }
+
+ DEBUG("lookup", " final addend is " << addend);
+ score += addend;
+ DEBUG("lookup", " score is " << score);
+
+ if (! added_bonus)
+ bonus = 0;
+
+ index++;
+ }
+
+ scores.push_back(score_entry_t(xact, score));
+ }
+
+ // Sort the results by descending score, then look at every account ever
+ // used among the top ten. Rank these by number of times used. Lastly,
+ // "decay" any latter accounts, so that we give recently used accounts a
+ // slightly higher rating in case of a tie.
+
+ std::stable_sort(scores.begin(), scores.end(), score_sorter());
+
+ account_use_map account_usage;
+
+ scorecard_t::iterator si = scores.begin();
+
+ int decay = 0;
+ xact_t * best_xact = si != scores.end() ? (*si).first : NULL;
+
+ for (int i = 0; i < 5 && si != scores.end(); i++, si++) {
+ DEBUG("lookup.account",
+ "Payee: " << std::setw(5) << std::right << (*si).second <<
+ " - " << (*si).first->payee);
+
+ // Only consider payees with a score of 20 or greater
+ if ((*si).second >= 50) {
+ foreach (post_t * post, (*si).first->posts) {
+ if (! post->has_flags(ITEM_TEMP | ITEM_GENERATED) &&
+ post->account != ref_account &&
+ ! post->account->has_flags(ACCOUNT_TEMP | ACCOUNT_GENERATED)) {
+ account_use_map::iterator x = account_usage.find(post->account);
+ if (x == account_usage.end())
+ account_usage.insert(account_use_pair(post->account,
+ ((*si).second - decay)));
+ else
+ (*x).second += ((*si).second - decay);
+ }
+ decay++;
+ }
+ }
+ }
+
+ if (account_usage.size() > 0) {
+ account_use_vector flat_account_usage(account_usage.begin(),
+ account_usage.end());
+
+ std::stable_sort(flat_account_usage.begin(), flat_account_usage.end(),
+ usage_sorter());
+
+#if defined(DEBUG_ON)
+ if (SHOW_DEBUG("lookup.account")) {
+ foreach (account_use_pair& value, flat_account_usage) {
+ DEBUG("lookup.account",
+ "Account: " << value.second << " - " << value.first->fullname());
+ }
+ }
+#endif
+
+ return std::pair<xact_t *, account_t *>(best_xact,
+ flat_account_usage.front().first);
+ } else {
+ return std::pair<xact_t *, account_t *>(best_xact, NULL);
+ }
+}
+
+} // namespace ledger
diff --git a/src/lookup.h b/src/lookup.h
new file mode 100644
index 00000000..53cb5837
--- /dev/null
+++ b/src/lookup.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2003-2010, John Wiegley. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of New Artisans LLC nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @addtogroup data
+ */
+
+/**
+ * @file lookup.h
+ * @author John Wiegley
+ *
+ * @ingroup data
+ */
+#ifndef _LOOKUP_H
+#define _LOOKUP_H
+
+#include "iterators.h"
+
+namespace ledger {
+
+std::pair<xact_t *, account_t *>
+lookup_probable_account(const string& ident,
+ xacts_iterator& iter_func,
+ account_t * ref_account = NULL);
+
+} // namespace ledger
+
+#endif // _LOOKUP_H
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 5e1d3775..e821f150 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -55,6 +55,7 @@ libledger_expr_la_CPPFLAGS = $(lib_cppflags)
libledger_expr_la_LDFLAGS = -release $(VERSION)
libledger_data_la_SOURCES = \
+ src/lookup.cc \
src/compare.cc \
src/iterators.cc \
src/timelog.cc \
@@ -129,6 +130,7 @@ pkginclude_HEADERS = \
src/timelog.h \
src/iterators.h \
src/compare.h \
+ src/lookup.h \
\
src/session.h \
src/report.h \