diff options
author | John Wiegley <johnw@newartisans.com> | 2010-03-10 01:42:25 -0500 |
---|---|---|
committer | John Wiegley <johnw@newartisans.com> | 2010-03-10 01:42:25 -0500 |
commit | 94c30dcf7f6fc5be8130b6ecb9fdba40079de6e2 (patch) | |
tree | 6225d28b3b85eb5f12db16760a644870ef06c003 /src/lookup.cc | |
parent | 485872537756cbee26c0f3b18f0788af4d7e1a58 (diff) | |
download | fork-ledger-94c30dcf7f6fc5be8130b6ecb9fdba40079de6e2.tar.gz fork-ledger-94c30dcf7f6fc5be8130b6ecb9fdba40079de6e2.tar.bz2 fork-ledger-94c30dcf7f6fc5be8130b6ecb9fdba40079de6e2.zip |
Cleaned up the intelligent lookup algorithm a bit
Diffstat (limited to 'src/lookup.cc')
-rw-r--r-- | src/lookup.cc | 159 |
1 files changed, 72 insertions, 87 deletions
diff --git a/src/lookup.cc b/src/lookup.cc index d2056718..76e05bc4 100644 --- a/src/lookup.cc +++ b/src/lookup.cc @@ -48,9 +48,8 @@ namespace { } }; - typedef std::map<account_t *, int> account_use_map; - typedef std::pair<account_t *, int> account_use_pair; - typedef std::vector<account_use_pair> account_use_vector; + typedef std::map<account_t *, int> account_use_map; + typedef std::pair<account_t *, int> account_use_pair; struct usage_sorter { bool operator()(const account_use_pair& left, @@ -90,7 +89,7 @@ lookup_probable_account(const string& ident, if ((CURRENT_DATE() - xact->date()).days() > 700) continue; - // An exact match is worth a score of 100 + // An exact match is worth a score of 100 and terminates the search if (ident == xact->payee) { DEBUG("lookup", " we have an exact match, score = 100"); scores.push_back(score_entry_t(xact, 100)); @@ -117,51 +116,57 @@ lookup_probable_account(const string& ident, // Walk each letter in the source identifier foreach (const uint32_t& ch, lowered_ident.utf32chars) { - int addend = 0; + int addend = 0; + bool added_bonus = false; + std::size_t value_len = value_key.length(); pos = value_key.find(ch); - // If it occurs in the same order as the source identifier -- that is, - // without intervening letters to break the pattern -- it's worth 10 - // points. Plus, an extra point is added for every letter in chains of - // 3 or more. + // Ensure that a letter which has been matched is not matched twice, so + // that the two x's of Exxon don't both match to the single x in Oxford. + // This part of the loop is very expensive, but avoids a lot of bogus + // matches. + + char_positions_map::iterator pi = positions.find(ch); + while (pi != positions.end() && + pos != unistring::npos && pos <= (*pi).second && + (*pi).second + 1 < value_len) + pos = value_key.find(ch, (*pi).second + 1); + + if (pos != unistring::npos) { + if (pi != positions.end()) + (*pi).second = pos; + else + positions.insert(char_positions_map::value_type(ch, pos)); + + // If it occurs in the same order as the source identifier -- that is, + // without intervening letters to break the pattern -- it's worth 10 + // points. Plus, an extra point is added for every letter in chains + // of 3 or more. + + if (last_match_pos == unistring::npos ? + index == 0 && pos == 0 : pos == last_match_pos + 1) { + DEBUG("lookup", + " char " << index << " in-sequence match with bonus " << bonus); + addend += 10; + if (bonus > 2) + addend += bonus - 2; + bonus++; + added_bonus = true; - bool added_bonus = false; - if (pos == last_match_pos + 1) { - DEBUG("lookup", - " char " << index << " in-sequence match with bonus " << bonus); - addend += 10; - if (bonus > 2) - addend += bonus - 2; - bonus++; - added_bonus = true; - - last_match_pos = pos; - } - - // If it occurs in the same general sequence as the source identifier, - // it's worth 5 points, plus an extra point if it's within the next 3 - // characters, and an extra point if it's preceded by a non-alphabetic - // character. - // - // If the letter occurs at all in the target identifier, it's worth 1 - // point, plus an extra point if it's within 3 characters, and an extra - // point if it's preceded by a non-alphabetic character. - - else if (pos != unistring::npos) { - // Ensure that a letter which has been matched is not matched twice, - // so that the two x's of Exxon don't both match to the single x in - // Oxford. - char_positions_map::iterator pi = positions.find(ch); - if (pi != positions.end()) { - while (pi != positions.end() && - pos != unistring::npos && pos <= (*pi).second && - (*pi).second + 1 < value_key.length()) { - pos = value_key.find(ch, (*pi).second + 1); - } + last_match_pos = pos; } - if (pos != unistring::npos) { + // If it occurs in the same general sequence as the source identifier, + // it's worth 5 points, plus an extra point if it's within the next 3 + // characters, and an extra point if it's preceded by a non-alphabetic + // character. + // + // If the letter occurs at all in the target identifier, it's worth 1 + // point, plus an extra point if it's within 3 characters, and an + // extra point if it's preceded by a non-alphabetic character. + + else { bool in_order_match = (last_match_pos != unistring::npos && pos > last_match_pos); DEBUG("lookup", " char " << index << " " << @@ -169,11 +174,6 @@ lookup_probable_account(const string& ident, << " match" << (in_order_match && pos - index < 3 ? " with proximity bonus of 1" : "")); - if (pi != positions.end()) - (*pi).second = pos; - else - positions.insert(char_positions_map::value_type(ch, pos)); - if (pos < index) addend += 1; else @@ -190,17 +190,11 @@ lookup_probable_account(const string& ident, #endif last_match_pos = pos; - } else { - last_match_pos = unistring::npos; - - DEBUG("lookup", " char " << index << " does not match here"); - addend--; } - } // If the letter does not appear at all, decrease the score by 1 - else { + } else { last_match_pos = unistring::npos; DEBUG("lookup", " char " << index << " does not match"); @@ -230,64 +224,55 @@ lookup_probable_account(const string& ident, index++; } - scores.push_back(score_entry_t(xact, score)); + // Only consider payees with a score of 30 or greater + if (score >= 30) + scores.push_back(score_entry_t(xact, score)); } // Sort the results by descending score, then look at every account ever - // used among the top ten. Rank these by number of times used. Lastly, + // used among the top five. Rank these by number of times used. Lastly, // "decay" any latter accounts, so that we give recently used accounts a // slightly higher rating in case of a tie. std::stable_sort(scores.begin(), scores.end(), score_sorter()); - account_use_map account_usage; - - scorecard_t::iterator si = scores.begin(); - - int decay = 0; - xact_t * best_xact = si != scores.end() ? (*si).first : NULL; + scorecard_t::iterator si = scores.begin(); + int decay = 0; + xact_t * best_xact = si != scores.end() ? (*si).first : NULL; + account_use_map account_usage; for (int i = 0; i < 5 && si != scores.end(); i++, si++) { DEBUG("lookup.account", "Payee: " << std::setw(5) << std::right << (*si).second << " - " << (*si).first->payee); - // Only consider payees with a score of 20 or greater - if ((*si).second >= 50) { - foreach (post_t * post, (*si).first->posts) { - if (! post->has_flags(ITEM_TEMP | ITEM_GENERATED) && - post->account != ref_account && - ! post->account->has_flags(ACCOUNT_TEMP | ACCOUNT_GENERATED)) { - account_use_map::iterator x = account_usage.find(post->account); - if (x == account_usage.end()) - account_usage.insert(account_use_pair(post->account, - ((*si).second - decay))); - else - (*x).second += ((*si).second - decay); - } - decay++; + foreach (post_t * post, (*si).first->posts) { + if (! post->has_flags(ITEM_TEMP | ITEM_GENERATED) && + post->account != ref_account && + ! post->account->has_flags(ACCOUNT_TEMP | ACCOUNT_GENERATED)) { + account_use_map::iterator x = account_usage.find(post->account); + if (x == account_usage.end()) + account_usage.insert(account_use_pair(post->account, + ((*si).second - decay))); + else + (*x).second += ((*si).second - decay); } + decay++; } } if (account_usage.size() > 0) { - account_use_vector flat_account_usage(account_usage.begin(), - account_usage.end()); - - std::stable_sort(flat_account_usage.begin(), flat_account_usage.end(), - usage_sorter()); - #if defined(DEBUG_ON) if (SHOW_DEBUG("lookup.account")) { - foreach (account_use_pair& value, flat_account_usage) { + foreach (const account_use_pair& value, account_usage) { DEBUG("lookup.account", "Account: " << value.second << " - " << value.first->fullname()); } } #endif - - return std::pair<xact_t *, account_t *>(best_xact, - flat_account_usage.front().first); + return std::pair<xact_t *, account_t *> + (best_xact, (*std::max_element(account_usage.begin(), account_usage.end(), + usage_sorter())).first); } else { return std::pair<xact_t *, account_t *>(best_xact, NULL); } |