summaryrefslogtreecommitdiff
path: root/src/lookup.cc
diff options
context:
space:
mode:
authorJohn Wiegley <johnw@newartisans.com>2010-03-10 01:42:25 -0500
committerJohn Wiegley <johnw@newartisans.com>2010-03-10 01:42:25 -0500
commit94c30dcf7f6fc5be8130b6ecb9fdba40079de6e2 (patch)
tree6225d28b3b85eb5f12db16760a644870ef06c003 /src/lookup.cc
parent485872537756cbee26c0f3b18f0788af4d7e1a58 (diff)
downloadfork-ledger-94c30dcf7f6fc5be8130b6ecb9fdba40079de6e2.tar.gz
fork-ledger-94c30dcf7f6fc5be8130b6ecb9fdba40079de6e2.tar.bz2
fork-ledger-94c30dcf7f6fc5be8130b6ecb9fdba40079de6e2.zip
Cleaned up the intelligent lookup algorithm a bit
Diffstat (limited to 'src/lookup.cc')
-rw-r--r--src/lookup.cc159
1 files changed, 72 insertions, 87 deletions
diff --git a/src/lookup.cc b/src/lookup.cc
index d2056718..76e05bc4 100644
--- a/src/lookup.cc
+++ b/src/lookup.cc
@@ -48,9 +48,8 @@ namespace {
}
};
- typedef std::map<account_t *, int> account_use_map;
- typedef std::pair<account_t *, int> account_use_pair;
- typedef std::vector<account_use_pair> account_use_vector;
+ typedef std::map<account_t *, int> account_use_map;
+ typedef std::pair<account_t *, int> account_use_pair;
struct usage_sorter {
bool operator()(const account_use_pair& left,
@@ -90,7 +89,7 @@ lookup_probable_account(const string& ident,
if ((CURRENT_DATE() - xact->date()).days() > 700)
continue;
- // An exact match is worth a score of 100
+ // An exact match is worth a score of 100 and terminates the search
if (ident == xact->payee) {
DEBUG("lookup", " we have an exact match, score = 100");
scores.push_back(score_entry_t(xact, 100));
@@ -117,51 +116,57 @@ lookup_probable_account(const string& ident,
// Walk each letter in the source identifier
foreach (const uint32_t& ch, lowered_ident.utf32chars) {
- int addend = 0;
+ int addend = 0;
+ bool added_bonus = false;
+ std::size_t value_len = value_key.length();
pos = value_key.find(ch);
- // If it occurs in the same order as the source identifier -- that is,
- // without intervening letters to break the pattern -- it's worth 10
- // points. Plus, an extra point is added for every letter in chains of
- // 3 or more.
+ // Ensure that a letter which has been matched is not matched twice, so
+ // that the two x's of Exxon don't both match to the single x in Oxford.
+ // This part of the loop is very expensive, but avoids a lot of bogus
+ // matches.
+
+ char_positions_map::iterator pi = positions.find(ch);
+ while (pi != positions.end() &&
+ pos != unistring::npos && pos <= (*pi).second &&
+ (*pi).second + 1 < value_len)
+ pos = value_key.find(ch, (*pi).second + 1);
+
+ if (pos != unistring::npos) {
+ if (pi != positions.end())
+ (*pi).second = pos;
+ else
+ positions.insert(char_positions_map::value_type(ch, pos));
+
+ // If it occurs in the same order as the source identifier -- that is,
+ // without intervening letters to break the pattern -- it's worth 10
+ // points. Plus, an extra point is added for every letter in chains
+ // of 3 or more.
+
+ if (last_match_pos == unistring::npos ?
+ index == 0 && pos == 0 : pos == last_match_pos + 1) {
+ DEBUG("lookup",
+ " char " << index << " in-sequence match with bonus " << bonus);
+ addend += 10;
+ if (bonus > 2)
+ addend += bonus - 2;
+ bonus++;
+ added_bonus = true;
- bool added_bonus = false;
- if (pos == last_match_pos + 1) {
- DEBUG("lookup",
- " char " << index << " in-sequence match with bonus " << bonus);
- addend += 10;
- if (bonus > 2)
- addend += bonus - 2;
- bonus++;
- added_bonus = true;
-
- last_match_pos = pos;
- }
-
- // If it occurs in the same general sequence as the source identifier,
- // it's worth 5 points, plus an extra point if it's within the next 3
- // characters, and an extra point if it's preceded by a non-alphabetic
- // character.
- //
- // If the letter occurs at all in the target identifier, it's worth 1
- // point, plus an extra point if it's within 3 characters, and an extra
- // point if it's preceded by a non-alphabetic character.
-
- else if (pos != unistring::npos) {
- // Ensure that a letter which has been matched is not matched twice,
- // so that the two x's of Exxon don't both match to the single x in
- // Oxford.
- char_positions_map::iterator pi = positions.find(ch);
- if (pi != positions.end()) {
- while (pi != positions.end() &&
- pos != unistring::npos && pos <= (*pi).second &&
- (*pi).second + 1 < value_key.length()) {
- pos = value_key.find(ch, (*pi).second + 1);
- }
+ last_match_pos = pos;
}
- if (pos != unistring::npos) {
+ // If it occurs in the same general sequence as the source identifier,
+ // it's worth 5 points, plus an extra point if it's within the next 3
+ // characters, and an extra point if it's preceded by a non-alphabetic
+ // character.
+ //
+ // If the letter occurs at all in the target identifier, it's worth 1
+ // point, plus an extra point if it's within 3 characters, and an
+ // extra point if it's preceded by a non-alphabetic character.
+
+ else {
bool in_order_match = (last_match_pos != unistring::npos &&
pos > last_match_pos);
DEBUG("lookup", " char " << index << " " <<
@@ -169,11 +174,6 @@ lookup_probable_account(const string& ident,
<< " match" << (in_order_match && pos - index < 3 ?
" with proximity bonus of 1" : ""));
- if (pi != positions.end())
- (*pi).second = pos;
- else
- positions.insert(char_positions_map::value_type(ch, pos));
-
if (pos < index)
addend += 1;
else
@@ -190,17 +190,11 @@ lookup_probable_account(const string& ident,
#endif
last_match_pos = pos;
- } else {
- last_match_pos = unistring::npos;
-
- DEBUG("lookup", " char " << index << " does not match here");
- addend--;
}
- }
// If the letter does not appear at all, decrease the score by 1
- else {
+ } else {
last_match_pos = unistring::npos;
DEBUG("lookup", " char " << index << " does not match");
@@ -230,64 +224,55 @@ lookup_probable_account(const string& ident,
index++;
}
- scores.push_back(score_entry_t(xact, score));
+ // Only consider payees with a score of 30 or greater
+ if (score >= 30)
+ scores.push_back(score_entry_t(xact, score));
}
// Sort the results by descending score, then look at every account ever
- // used among the top ten. Rank these by number of times used. Lastly,
+ // used among the top five. Rank these by number of times used. Lastly,
// "decay" any latter accounts, so that we give recently used accounts a
// slightly higher rating in case of a tie.
std::stable_sort(scores.begin(), scores.end(), score_sorter());
- account_use_map account_usage;
-
- scorecard_t::iterator si = scores.begin();
-
- int decay = 0;
- xact_t * best_xact = si != scores.end() ? (*si).first : NULL;
+ scorecard_t::iterator si = scores.begin();
+ int decay = 0;
+ xact_t * best_xact = si != scores.end() ? (*si).first : NULL;
+ account_use_map account_usage;
for (int i = 0; i < 5 && si != scores.end(); i++, si++) {
DEBUG("lookup.account",
"Payee: " << std::setw(5) << std::right << (*si).second <<
" - " << (*si).first->payee);
- // Only consider payees with a score of 20 or greater
- if ((*si).second >= 50) {
- foreach (post_t * post, (*si).first->posts) {
- if (! post->has_flags(ITEM_TEMP | ITEM_GENERATED) &&
- post->account != ref_account &&
- ! post->account->has_flags(ACCOUNT_TEMP | ACCOUNT_GENERATED)) {
- account_use_map::iterator x = account_usage.find(post->account);
- if (x == account_usage.end())
- account_usage.insert(account_use_pair(post->account,
- ((*si).second - decay)));
- else
- (*x).second += ((*si).second - decay);
- }
- decay++;
+ foreach (post_t * post, (*si).first->posts) {
+ if (! post->has_flags(ITEM_TEMP | ITEM_GENERATED) &&
+ post->account != ref_account &&
+ ! post->account->has_flags(ACCOUNT_TEMP | ACCOUNT_GENERATED)) {
+ account_use_map::iterator x = account_usage.find(post->account);
+ if (x == account_usage.end())
+ account_usage.insert(account_use_pair(post->account,
+ ((*si).second - decay)));
+ else
+ (*x).second += ((*si).second - decay);
}
+ decay++;
}
}
if (account_usage.size() > 0) {
- account_use_vector flat_account_usage(account_usage.begin(),
- account_usage.end());
-
- std::stable_sort(flat_account_usage.begin(), flat_account_usage.end(),
- usage_sorter());
-
#if defined(DEBUG_ON)
if (SHOW_DEBUG("lookup.account")) {
- foreach (account_use_pair& value, flat_account_usage) {
+ foreach (const account_use_pair& value, account_usage) {
DEBUG("lookup.account",
"Account: " << value.second << " - " << value.first->fullname());
}
}
#endif
-
- return std::pair<xact_t *, account_t *>(best_xact,
- flat_account_usage.front().first);
+ return std::pair<xact_t *, account_t *>
+ (best_xact, (*std::max_element(account_usage.begin(), account_usage.end(),
+ usage_sorter())).first);
} else {
return std::pair<xact_t *, account_t *>(best_xact, NULL);
}