/*
 * Copyright (c) 2003-2010, John Wiegley.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of New Artisans LLC nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <system.hh>

#include "lookup.h"
#include "unistring.h"

namespace ledger {

namespace {
  typedef std::pair<xact_t *, int> score_entry_t;
  typedef std::deque<score_entry_t> scorecard_t;
  typedef std::map<uint32_t, std::size_t> char_positions_map;

  struct score_sorter {
    bool operator()(const score_entry_t& left,
                    const score_entry_t& right) const {
      return left.second > right.second;
    }
  };

  typedef std::map<account_t *, int>  account_use_map;
  typedef std::pair<account_t *, int> account_use_pair;

  struct usage_sorter {
    bool operator()(const account_use_pair& left,
                    const account_use_pair& right) const {
      return left.second > right.second;
    }
  };
}

std::pair<xact_t *, account_t *>
lookup_probable_account(const string& ident,
                        xacts_list::reverse_iterator iter,
                        xacts_list::reverse_iterator end,
                        account_t * ref_account)
{
  scorecard_t scores;

#if !defined(HAVE_BOOST_REGEX_UNICODE)
    string lident = ident;
    to_lower(lident);
    unistring lowered_ident(lident);
#else
    // jww (2010-03-07): Not yet implemented
    unistring lowered_ident(ident);
#endif

  DEBUG("lookup.account",
        "Looking up identifier '" << lowered_ident.extract() << "'");
#if defined(DEBUG_ON)
  if (ref_account != NULL)
    DEBUG("lookup.account",
          "  with reference account: " << ref_account->fullname());
#endif

  xact_t * xact;
  while (iter != end && (xact = *iter++) != NULL) {
#if 0
    // Only consider transactions from the last two years (jww (2010-03-07):
    // make this an option)
    if ((CURRENT_DATE() - xact->date()).days() > 700)
      continue;
#endif

    // An exact match is worth a score of 100 and terminates the search
    if (ident == xact->payee) {
      DEBUG("lookup", "  we have an exact match, score = 100");
      scores.push_back(score_entry_t(xact, 100));
      break;
    }

#if !defined(HAVE_BOOST_REGEX_UNICODE)
    string payee = xact->payee;
    to_lower(payee);
    unistring value_key(payee);
#else
    // jww (2010-03-07): Not yet implemented
    unistring value_key(xact->payee);
#endif

    DEBUG("lookup", "Considering payee: " << value_key.extract());

    std::size_t        index          = 0;
    std::size_t        last_match_pos = unistring::npos;
    int                bonus          = 0;
    int                score          = 0;
    std::size_t        pos;
    char_positions_map positions;

    // Walk each letter in the source identifier
    foreach (const uint32_t& ch, lowered_ident.utf32chars) {
      int         addend      = 0;
      bool        added_bonus = false;
      std::size_t value_len   = value_key.length();

      pos = value_key.find(ch);

      // Ensure that a letter which has been matched is not matched twice, so
      // that the two x's of Exxon don't both match to the single x in Oxford.
      // This part of the loop is very expensive, but avoids a lot of bogus
      // matches.

      char_positions_map::iterator pi = positions.find(ch);
      while (pi != positions.end() &&
             pos != unistring::npos && pos <= (*pi).second &&
             (*pi).second + 1 < value_len)
        pos = value_key.find(ch, (*pi).second + 1);

      if (pos != unistring::npos) {
        if (pi != positions.end())
          (*pi).second = pos;
        else
          positions.insert(char_positions_map::value_type(ch, pos));

        // If it occurs in the same order as the source identifier -- that is,
        // without intervening letters to break the pattern -- it's worth 10
        // points.  Plus, an extra point is added for every letter in chains
        // of 3 or more.

        if (last_match_pos == unistring::npos ?
            index == 0 && pos == 0 : pos == last_match_pos + 1) {
          DEBUG("lookup",
                "  char " << index << " in-sequence match with bonus " << bonus);
          addend += 10;
          if (bonus > 2)
            addend += bonus - 2;
          bonus++;
          added_bonus = true;

          last_match_pos = pos;
        }

        // If it occurs in the same general sequence as the source identifier,
        // it's worth 5 points, plus an extra point if it's within the next 3
        // characters, and an extra point if it's preceded by a non-alphabetic
        // character.
        //
        // If the letter occurs at all in the target identifier, it's worth 1
        // point, plus an extra point if it's within 3 characters, and an
        // extra point if it's preceded by a non-alphabetic character.

        else {
          bool in_order_match = (last_match_pos != unistring::npos &&
                                 pos > last_match_pos);
          DEBUG("lookup", "  char " << index << " " <<
                (in_order_match ? "in-order" : "out-of-order")
                << " match" << (in_order_match && pos - index < 3 ?
                                " with proximity bonus of 1" : ""));

          if (pos < index)
            addend += 1;
          else
            addend += 5;

          if (in_order_match && pos - index < 3)
            addend++;

#if !defined(HAVE_BOOST_REGEX_UNICODE)
          if (pos == 0 || (pos > 0 && !std::isalnum(value_key[pos - 1])))
            addend++;
#else
          // jww (2010-03-07): Not yet implemented
#endif

          last_match_pos = pos;
        }

      // If the letter does not appear at all, decrease the score by 1

      } else {
        last_match_pos = unistring::npos;

        DEBUG("lookup", "  char " << index << " does not match");
        addend--;
      }

      // Finally, decay what is to be added to the score based on its position
      // in the word.  Since credit card payees in particular often share
      // information at the end (such as the location where the purchase was
      // made), we want to give much more credence to what occurs at the
      // beginning.  Every 5 character positions from the beginning becomes a
      // divisor for the addend.

      if ((int(index / 5) + 1) > 1) {
        DEBUG("lookup",
              "  discounting the addend by / " << (int(index / 5) + 1));
        addend = int(double(addend) / (int(index / 5) + 1));
      }

      DEBUG("lookup", "  final addend is " << addend);
      score += addend;
      DEBUG("lookup", "  score is " << score);

      if (! added_bonus)
        bonus = 0;

      index++;
    }

    // Only consider payees with a score of 30 or greater
    if (score >= 30)
      scores.push_back(score_entry_t(xact, score));
  }

  // Sort the results by descending score, then look at every account ever
  // used among the top five.  Rank these by number of times used.  Lastly,
  // "decay" any latter accounts, so that we give recently used accounts a
  // slightly higher rating in case of a tie.

  std::stable_sort(scores.begin(), scores.end(), score_sorter());

  scorecard_t::iterator si        = scores.begin();
  int                   decay     = 0;
  xact_t *              best_xact = si != scores.end() ? (*si).first : NULL;
  account_use_map       account_usage;

  for (int i = 0; i < 5 && si != scores.end(); i++, si++) {
    DEBUG("lookup.account",
          "Payee: " << std::setw(5) << std::right << (*si).second <<
          " - " << (*si).first->payee);

    foreach (post_t * post, (*si).first->posts) {
      if (! post->has_flags(ITEM_TEMP | ITEM_GENERATED) &&
          post->account != ref_account &&
          ! post->account->has_flags(ACCOUNT_TEMP | ACCOUNT_GENERATED)) {
        account_use_map::iterator x = account_usage.find(post->account);
        if (x == account_usage.end())
          account_usage.insert(account_use_pair(post->account,
                                                ((*si).second - decay)));
        else
          (*x).second += ((*si).second - decay);
      }
      decay++;
    }
  }

  if (account_usage.size() > 0) {
#if defined(DEBUG_ON)
    if (SHOW_DEBUG("lookup.account")) {
      foreach (const account_use_pair& value, account_usage) {
        DEBUG("lookup.account",
              "Account: " << value.second << " - " << value.first->fullname());
      }
    }
#endif
    return std::pair<xact_t *, account_t *>
      (best_xact, (*std::max_element(account_usage.begin(), account_usage.end(),
                                     usage_sorter())).first);
  } else {
    return std::pair<xact_t *, account_t *>(best_xact, NULL);
  }
}

} // namespace ledger