diff options
author | John Wiegley <johnw@newartisans.com> | 2010-03-07 22:53:57 -0500 |
---|---|---|
committer | John Wiegley <johnw@newartisans.com> | 2010-03-08 01:11:55 -0500 |
commit | 1bf0220f246c5e984d9a43e5d8b3979b9091bc5a (patch) | |
tree | 710291f43415b9aed741a948ee8687316df13bda /src | |
parent | e070cdfc8ddcf9d6a25b593502f1c5ade56c849c (diff) | |
download | fork-ledger-1bf0220f246c5e984d9a43e5d8b3979b9091bc5a.tar.gz fork-ledger-1bf0220f246c5e984d9a43e5d8b3979b9091bc5a.tar.bz2 fork-ledger-1bf0220f246c5e984d9a43e5d8b3979b9091bc5a.zip |
Added experimental account lookup logic
This is used by the "xact" command, and the CSV importer. It is very
slow O(xacts*records).
Diffstat (limited to 'src')
-rw-r--r-- | src/convert.cc | 50 | ||||
-rw-r--r-- | src/draft.cc | 26 | ||||
-rw-r--r-- | src/lookup.cc | 296 | ||||
-rw-r--r-- | src/lookup.h | 56 |
4 files changed, 404 insertions, 24 deletions
diff --git a/src/convert.cc b/src/convert.cc index e47fb6c0..6c02cff3 100644 --- a/src/convert.cc +++ b/src/convert.cc @@ -38,7 +38,8 @@ #include "iterators.h" #include "report.h" #include "xact.h" -#include "output.h" +#include "print.h" +#include "lookup.h" namespace ledger { @@ -54,7 +55,8 @@ value_t convert_command(call_scope_t& scope) else bucket_name = "Equity:Unknown"; - account_t * bucket = journal.master->find_account(bucket_name); + account_t * bucket = journal.master->find_account(bucket_name); + account_t * unknown = journal.master->find_account(_("Expenses:Unknown")); // Make an amounts mapping for the account under consideration @@ -81,14 +83,15 @@ value_t convert_command(call_scope_t& scope) } } + // Create a flat list o + xacts_list current_xacts(journal.xacts_begin(), journal.xacts_end()); + // Read in the series of transactions from the CSV file - format_posts formatter(report, - report.report_format(report.HANDLER(print_format_)), - false); + print_xacts formatter(report); + ifstream data(path(args.get<string>(0))); + csv_reader reader(data); - ifstream data(path(args.get<string>(0))); - csv_reader reader(data); while (xact_t * xact = reader.read_xact(journal, bucket)) { bool matched = false; post_map_t::iterator i = post_map.find(- xact->posts.front()->amount); @@ -111,17 +114,34 @@ value_t convert_command(call_scope_t& scope) DEBUG("convert.csv", "Ignored xact with code: " << *xact->code); delete xact; // ignore it } - else if (! journal.add_xact(xact)) { - delete xact; - throw_(std::runtime_error, - _("Failed to finalize derived transaction (check commodities)")); - } else { - xact_posts_iterator xact_iter(*xact); - while (post_t * post = xact_iter()) - formatter(*post); + if (xact->posts.front()->account == NULL) { + xacts_iterator xi; + xi.xacts_i = current_xacts.begin(); + xi.xacts_end = current_xacts.end(); + xi.xacts_uninitialized = false; + + // jww (2010-03-07): Bind this logic to an option: --auto-match + if (account_t * acct = + lookup_probable_account(xact->payee, xi, bucket).second) + xact->posts.front()->account = acct; + else + xact->posts.front()->account = unknown; + } + + if (! journal.add_xact(xact)) { + delete xact; + throw_(std::runtime_error, + _("Failed to finalize derived transaction (check commodities)")); + } + else { + xact_posts_iterator xact_iter(*xact); + while (post_t * post = xact_iter()) + formatter(*post); + } } } + formatter.flush(); // If not, transform the payee according to regexps diff --git a/src/draft.cc b/src/draft.cc index 74e17dba..89c68541 100644 --- a/src/draft.cc +++ b/src/draft.cc @@ -38,6 +38,7 @@ #include "journal.h" #include "session.h" #include "report.h" +#include "lookup.h" #include "print.h" namespace ledger { @@ -242,18 +243,25 @@ xact_t * draft_t::insert(journal_t& journal) if (tmpl->payee_mask.empty()) throw std::runtime_error(_("xact' command requires at least a payee")); - xact_t * matching = NULL; + xact_t * matching = NULL; std::auto_ptr<xact_t> added(new xact_t); - for (xacts_list::reverse_iterator j = journal.xacts.rbegin(); - j != journal.xacts.rend(); - j++) { - if (tmpl->payee_mask.match((*j)->payee)) { - matching = *j; - DEBUG("derive.xact", - "Found payee match: transaction on line " << (*j)->pos->beg_line); - break; + xacts_iterator xi(journal); + if (xact_t * xact = lookup_probable_account(tmpl->payee_mask.str(), xi).first) { + DEBUG("derive.xact", "Found payee by lookup: transaction on line " + << xact->pos->beg_line); + matching = xact; + } else { + for (xacts_list::reverse_iterator j = journal.xacts.rbegin(); + j != journal.xacts.rend(); + j++) { + if (tmpl->payee_mask.match((*j)->payee)) { + matching = *j; + DEBUG("derive.xact", + "Found payee match: transaction on line " << (*j)->pos->beg_line); + break; + } } } diff --git a/src/lookup.cc b/src/lookup.cc new file mode 100644 index 00000000..d2056718 --- /dev/null +++ b/src/lookup.cc @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2003-2010, John Wiegley. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * - Neither the name of New Artisans LLC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <system.hh> + +#include "lookup.h" +#include "unistring.h" + +namespace ledger { + +namespace { + typedef std::pair<xact_t *, int> score_entry_t; + typedef std::deque<score_entry_t> scorecard_t; + typedef std::map<uint32_t, std::size_t> char_positions_map; + + struct score_sorter { + bool operator()(const score_entry_t& left, + const score_entry_t& right) const { + return left.second > right.second; + } + }; + + typedef std::map<account_t *, int> account_use_map; + typedef std::pair<account_t *, int> account_use_pair; + typedef std::vector<account_use_pair> account_use_vector; + + struct usage_sorter { + bool operator()(const account_use_pair& left, + const account_use_pair& right) const { + return left.second > right.second; + } + }; +} + +std::pair<xact_t *, account_t *> +lookup_probable_account(const string& ident, + xacts_iterator& iter_func, + account_t * ref_account) +{ + scorecard_t scores; + +#if !defined(HAVE_BOOST_REGEX_UNICODE) + string lident = ident; + to_lower(lident); + unistring lowered_ident(lident); +#else + // jww (2010-03-07): Not yet implemented + unistring lowered_ident(ident); +#endif + + DEBUG("lookup.account", + "Looking up identifier '" << lowered_ident.extract() << "'"); +#if defined(DEBUG_ON) + if (ref_account != NULL) + DEBUG("lookup.account", + " with reference account: " << ref_account->fullname()); +#endif + + while (xact_t * xact = iter_func()) { + // Only consider transactions from the last two years (jww (2010-03-07): + // make this an option) + if ((CURRENT_DATE() - xact->date()).days() > 700) + continue; + + // An exact match is worth a score of 100 + if (ident == xact->payee) { + DEBUG("lookup", " we have an exact match, score = 100"); + scores.push_back(score_entry_t(xact, 100)); + break; + } + +#if !defined(HAVE_BOOST_REGEX_UNICODE) + string payee = xact->payee; + to_lower(payee); + unistring value_key(payee); +#else + // jww (2010-03-07): Not yet implemented + unistring value_key(xact->payee); +#endif + + DEBUG("lookup", "Considering payee: " << value_key.extract()); + + std::size_t index = 0; + std::size_t last_match_pos = unistring::npos; + int bonus = 0; + int score = 0; + std::size_t pos; + char_positions_map positions; + + // Walk each letter in the source identifier + foreach (const uint32_t& ch, lowered_ident.utf32chars) { + int addend = 0; + + pos = value_key.find(ch); + + // If it occurs in the same order as the source identifier -- that is, + // without intervening letters to break the pattern -- it's worth 10 + // points. Plus, an extra point is added for every letter in chains of + // 3 or more. + + bool added_bonus = false; + if (pos == last_match_pos + 1) { + DEBUG("lookup", + " char " << index << " in-sequence match with bonus " << bonus); + addend += 10; + if (bonus > 2) + addend += bonus - 2; + bonus++; + added_bonus = true; + + last_match_pos = pos; + } + + // If it occurs in the same general sequence as the source identifier, + // it's worth 5 points, plus an extra point if it's within the next 3 + // characters, and an extra point if it's preceded by a non-alphabetic + // character. + // + // If the letter occurs at all in the target identifier, it's worth 1 + // point, plus an extra point if it's within 3 characters, and an extra + // point if it's preceded by a non-alphabetic character. + + else if (pos != unistring::npos) { + // Ensure that a letter which has been matched is not matched twice, + // so that the two x's of Exxon don't both match to the single x in + // Oxford. + char_positions_map::iterator pi = positions.find(ch); + if (pi != positions.end()) { + while (pi != positions.end() && + pos != unistring::npos && pos <= (*pi).second && + (*pi).second + 1 < value_key.length()) { + pos = value_key.find(ch, (*pi).second + 1); + } + } + + if (pos != unistring::npos) { + bool in_order_match = (last_match_pos != unistring::npos && + pos > last_match_pos); + DEBUG("lookup", " char " << index << " " << + (in_order_match ? "in-order" : "out-of-order") + << " match" << (in_order_match && pos - index < 3 ? + " with proximity bonus of 1" : "")); + + if (pi != positions.end()) + (*pi).second = pos; + else + positions.insert(char_positions_map::value_type(ch, pos)); + + if (pos < index) + addend += 1; + else + addend += 5; + + if (in_order_match && pos - index < 3) + addend++; + +#if !defined(HAVE_BOOST_REGEX_UNICODE) + if (pos == 0 || (pos > 0 && !std::isalnum(value_key[pos - 1]))) + addend++; +#else + // jww (2010-03-07): Not yet implemented +#endif + + last_match_pos = pos; + } else { + last_match_pos = unistring::npos; + + DEBUG("lookup", " char " << index << " does not match here"); + addend--; + } + } + + // If the letter does not appear at all, decrease the score by 1 + + else { + last_match_pos = unistring::npos; + + DEBUG("lookup", " char " << index << " does not match"); + addend--; + } + + // Finally, decay what is to be added to the score based on its position + // in the word. Since credit card payees in particular often share + // information at the end (such as the location where the purchase was + // made), we want to give much more credence to what occurs at the + // beginning. Every 5 character positions from the beginning becomes a + // divisor for the addend. + + if ((int(index / 5) + 1) > 1) { + DEBUG("lookup", + " discounting the addend by / " << (int(index / 5) + 1)); + addend = int(double(addend) / (int(index / 5) + 1)); + } + + DEBUG("lookup", " final addend is " << addend); + score += addend; + DEBUG("lookup", " score is " << score); + + if (! added_bonus) + bonus = 0; + + index++; + } + + scores.push_back(score_entry_t(xact, score)); + } + + // Sort the results by descending score, then look at every account ever + // used among the top ten. Rank these by number of times used. Lastly, + // "decay" any latter accounts, so that we give recently used accounts a + // slightly higher rating in case of a tie. + + std::stable_sort(scores.begin(), scores.end(), score_sorter()); + + account_use_map account_usage; + + scorecard_t::iterator si = scores.begin(); + + int decay = 0; + xact_t * best_xact = si != scores.end() ? (*si).first : NULL; + + for (int i = 0; i < 5 && si != scores.end(); i++, si++) { + DEBUG("lookup.account", + "Payee: " << std::setw(5) << std::right << (*si).second << + " - " << (*si).first->payee); + + // Only consider payees with a score of 20 or greater + if ((*si).second >= 50) { + foreach (post_t * post, (*si).first->posts) { + if (! post->has_flags(ITEM_TEMP | ITEM_GENERATED) && + post->account != ref_account && + ! post->account->has_flags(ACCOUNT_TEMP | ACCOUNT_GENERATED)) { + account_use_map::iterator x = account_usage.find(post->account); + if (x == account_usage.end()) + account_usage.insert(account_use_pair(post->account, + ((*si).second - decay))); + else + (*x).second += ((*si).second - decay); + } + decay++; + } + } + } + + if (account_usage.size() > 0) { + account_use_vector flat_account_usage(account_usage.begin(), + account_usage.end()); + + std::stable_sort(flat_account_usage.begin(), flat_account_usage.end(), + usage_sorter()); + +#if defined(DEBUG_ON) + if (SHOW_DEBUG("lookup.account")) { + foreach (account_use_pair& value, flat_account_usage) { + DEBUG("lookup.account", + "Account: " << value.second << " - " << value.first->fullname()); + } + } +#endif + + return std::pair<xact_t *, account_t *>(best_xact, + flat_account_usage.front().first); + } else { + return std::pair<xact_t *, account_t *>(best_xact, NULL); + } +} + +} // namespace ledger diff --git a/src/lookup.h b/src/lookup.h new file mode 100644 index 00000000..53cb5837 --- /dev/null +++ b/src/lookup.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2003-2010, John Wiegley. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * - Neither the name of New Artisans LLC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @addtogroup data + */ + +/** + * @file lookup.h + * @author John Wiegley + * + * @ingroup data + */ +#ifndef _LOOKUP_H +#define _LOOKUP_H + +#include "iterators.h" + +namespace ledger { + +std::pair<xact_t *, account_t *> +lookup_probable_account(const string& ident, + xacts_iterator& iter_func, + account_t * ref_account = NULL); + +} // namespace ledger + +#endif // _LOOKUP_H |