From fe95280492a35fb2e8ee8d43f63754defc3b56e7 Mon Sep 17 00:00:00 2001 From: John Wiegley Date: Fri, 5 Mar 2010 22:12:59 -0500 Subject: Added some preliminary code for convert CSV to Ledger --- src/csv.cc | 181 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 src/csv.cc (limited to 'src/csv.cc') diff --git a/src/csv.cc b/src/csv.cc new file mode 100644 index 00000000..c0f8cd0e --- /dev/null +++ b/src/csv.cc @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2003-2010, John Wiegley. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * - Neither the name of New Artisans LLC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "csv.h" +#include "xact.h" +#include "post.h" +#include "account.h" +#include "journal.h" +#include "pool.h" + +namespace ledger { + +string csv_reader::read_field() +{ + string field; + + char c; + if (in.peek() == '"' || in.peek() == '|') { + in.get(c); + char x; + while (in.good() && ! in.eof()) { + in.get(x); + if (x == '\\') { + in.get(x); + } + else if (x == c) { + if (x == '|') + in.unget(); + else if (in.peek() == ',') + in.get(c); + break; + } + field += x; + } + } + else { + + } + return field; +} + +xact_t * csv_reader::read_xact(journal_t& journal, account_t * bucket) +{ + static char linebuf[MAX_LINE + 1]; + + if (! in.good() || in.eof()) + return NULL; + + std::auto_ptr xact; + + while (in.good() && ! in.eof() && in.peek() == '#') + in.getline(linebuf, MAX_LINE); + + xact.reset(new xact_t); + + xact->pos = position_t(); + xact->pos->pathname = "jww (2010-03-05): unknown"; + xact->pos->beg_pos = in.tellg(); + xact->pos->beg_line = 0; + xact->pos->sequence = 0; + + string date = read_field(); trim(date); + string code = read_field(); trim(code); + string payee = read_field(); trim(payee); + + if (date.empty()) + return NULL; + + xact->set_state(item_t::CLEARED); + xact->_date = parse_date(date); + if (! code.empty()) + xact->code = code; + + bool found = false; + foreach (payee_mapping_t& value, journal.payee_mappings) { + DEBUG("csv.mappings", "Looking for payee mapping: " << value.first); + if (value.first.match(payee)) { + xact->payee = value.second; + found = true; + break; + } + } + if (! found) + xact->payee = payee; + + string amount = read_field(); trim(amount); + string total = read_field(); trim(total); + in.getline(linebuf, MAX_LINE); // skip to the next line + + std::auto_ptr post(new post_t); + + post->xact = xact.get(); + +#if 0 + post->pos = position_t(); + post->pos->pathname = pathname; + post->pos->beg_pos = line_beg_pos; + post->pos->beg_line = linenum; + post->pos->sequence = context.sequence++; +#endif + + post->set_state(item_t::CLEARED); + post->account = journal.master->find_account(_("Expenses:Unknown")); + + foreach (account_mapping_t& value, journal.account_mappings) { + if (value.first.match(xact->payee)) { + post->account = value.second; + break; + } + } + + std::istringstream amount_str(amount); + amount_t amt; + amt.parse(amount_str, PARSE_NO_REDUCE); + if (! amt.has_commodity() && + commodity_pool_t::current_pool->default_commodity) + amt.set_commodity + (*commodity_pool_t::current_pool->default_commodity); + post->amount = amt; + + xact->add_post(post.release()); + + post.reset(new post_t); + + post->xact = xact.get(); + +#if 0 + post->pos = position_t(); + post->pos->pathname = pathname; + post->pos->beg_pos = line_beg_pos; + post->pos->beg_line = linenum; + post->pos->sequence = context.sequence++; +#endif + + post->set_state(item_t::CLEARED); + post->account = bucket; + post->amount = - amt; + + if (! total.empty()) { + std::istringstream assigned_amount_str(total); + amount_t assigned_amount; + assigned_amount.parse(assigned_amount_str, PARSE_NO_REDUCE); + post->assigned_amount = assigned_amount; + } + + xact->add_post(post.release()); + + return xact.release(); +} + +} // namespace ledger -- cgit v1.2.3 From e070cdfc8ddcf9d6a25b593502f1c5ade56c849c Mon Sep 17 00:00:00 2001 From: John Wiegley Date: Sun, 7 Mar 2010 22:53:03 -0500 Subject: The CSV reader now auto-correlates fields by regex --- src/csv.cc | 217 +++++++++++++++++++++++++++++++++++++++++++++++-------------- src/csv.h | 45 ++++++++++++- 2 files changed, 210 insertions(+), 52 deletions(-) (limited to 'src/csv.cc') diff --git a/src/csv.cc b/src/csv.cc index c0f8cd0e..5a74232f 100644 --- a/src/csv.cc +++ b/src/csv.cc @@ -40,7 +40,7 @@ namespace ledger { -string csv_reader::read_field() +string csv_reader::read_field(std::istream& in) { string field; @@ -53,6 +53,9 @@ string csv_reader::read_field() if (x == '\\') { in.get(x); } + else if (x == '"' && in.peek() == '"') { + in.get(x); + } else if (x == c) { if (x == '|') in.unget(); @@ -60,65 +63,93 @@ string csv_reader::read_field() in.get(c); break; } - field += x; + if (x != '\0') + field += x; } } else { - + while (in.good() && ! in.eof()) { + in.get(c); + if (c == ',') + break; + if (c != '\0') + field += c; + } } + trim(field); return field; } -xact_t * csv_reader::read_xact(journal_t& journal, account_t * bucket) +char * csv_reader::next_line(std::istream& in) { static char linebuf[MAX_LINE + 1]; + while (in.good() && ! in.eof() && in.peek() == '#') + in.getline(linebuf, MAX_LINE); + if (! in.good() || in.eof()) return NULL; - std::auto_ptr xact; + in.getline(linebuf, MAX_LINE); - while (in.good() && ! in.eof() && in.peek() == '#') - in.getline(linebuf, MAX_LINE); + return linebuf; +} - xact.reset(new xact_t); +void csv_reader::read_index(std::istream& in) +{ + char * line = next_line(in); + if (! line) + return; - xact->pos = position_t(); - xact->pos->pathname = "jww (2010-03-05): unknown"; - xact->pos->beg_pos = in.tellg(); - xact->pos->beg_line = 0; - xact->pos->sequence = 0; + std::istringstream instr(line); - string date = read_field(); trim(date); - string code = read_field(); trim(code); - string payee = read_field(); trim(payee); + while (instr.good() && ! instr.eof()) { + string field = read_field(instr); + names.push_back(field); - if (date.empty()) - return NULL; + if (date_mask.match(field)) + index.push_back(FIELD_DATE); + else if (date_eff_mask.match(field)) + index.push_back(FIELD_DATE_EFF); + else if (code_mask.match(field)) + index.push_back(FIELD_CODE); + else if (payee_mask.match(field)) + index.push_back(FIELD_PAYEE); + else if (amount_mask.match(field)) + index.push_back(FIELD_AMOUNT); + else if (cost_mask.match(field)) + index.push_back(FIELD_COST); + else if (total_mask.match(field)) + index.push_back(FIELD_TOTAL); + else if (note_mask.match(field)) + index.push_back(FIELD_NOTE); + else + index.push_back(FIELD_UNKNOWN); - xact->set_state(item_t::CLEARED); - xact->_date = parse_date(date); - if (! code.empty()) - xact->code = code; - - bool found = false; - foreach (payee_mapping_t& value, journal.payee_mappings) { - DEBUG("csv.mappings", "Looking for payee mapping: " << value.first); - if (value.first.match(payee)) { - xact->payee = value.second; - found = true; - break; - } + DEBUG("csv.parse", "Header field: " << field); } - if (! found) - xact->payee = payee; +} + +xact_t * csv_reader::read_xact(journal_t& journal, account_t * bucket) +{ + restart: + char * line = next_line(in); + if (! line || index.empty()) + return NULL; - string amount = read_field(); trim(amount); - string total = read_field(); trim(total); - in.getline(linebuf, MAX_LINE); // skip to the next line + std::istringstream instr(line); + std::auto_ptr xact(new xact_t); std::auto_ptr post(new post_t); + xact->set_state(item_t::CLEARED); + + xact->pos = position_t(); + xact->pos->pathname = "jww (2010-03-05): unknown"; + xact->pos->beg_pos = in.tellg(); + xact->pos->beg_line = 0; + xact->pos->sequence = 0; + post->xact = xact.get(); #if 0 @@ -130,7 +161,96 @@ xact_t * csv_reader::read_xact(journal_t& journal, account_t * bucket) #endif post->set_state(item_t::CLEARED); - post->account = journal.master->find_account(_("Expenses:Unknown")); + post->account = NULL; + + int n = 0; + amount_t amt; + string total; + + while (instr.good() && ! instr.eof()) { + string field = read_field(instr); + + switch (index[n]) { + case FIELD_DATE: + if (field.empty()) + goto restart; + try { + xact->_date = parse_date(field); + } + catch (date_error&) { + goto restart; + } + break; + + case FIELD_DATE_EFF: + xact->_date_eff = parse_date(field); + break; + + case FIELD_CODE: + if (! field.empty()) + xact->code = field; + break; + + case FIELD_PAYEE: { + bool found = false; + foreach (payee_mapping_t& value, journal.payee_mappings) { + DEBUG("csv.mappings", "Looking for payee mapping: " << value.first); + if (value.first.match(field)) { + xact->payee = value.second; + found = true; + break; + } + } + if (! found) + xact->payee = field; + break; + } + + case FIELD_AMOUNT: { + std::istringstream amount_str(field); + amt.parse(amount_str, PARSE_NO_REDUCE); + if (! amt.has_commodity() && + commodity_pool_t::current_pool->default_commodity) + amt.set_commodity(*commodity_pool_t::current_pool->default_commodity); + post->amount = amt; + break; + } + + case FIELD_COST: { + std::istringstream amount_str(field); + amt.parse(amount_str, PARSE_NO_REDUCE); + if (! amt.has_commodity() && + commodity_pool_t::current_pool->default_commodity) + amt.set_commodity + (*commodity_pool_t::current_pool->default_commodity); + post->cost = amt; + break; + } + + case FIELD_TOTAL: + total = field; + break; + + case FIELD_NOTE: + xact->note = field; + break; + + case FIELD_UNKNOWN: + if (! names[n].empty() && ! field.empty()) + xact->set_tag(names[n], field); + break; + } + n++; + } + +#if 0 + xact->set_tag(_("Imported"), + string(format_date(CURRENT_DATE(), FMT_WRITTEN))); + xact->set_tag(_("Original"), string(line)); + xact->set_tag(_("SHA1"), string(sha1sum(line))); +#endif + + // Translate the account name, if we have enough information to do so foreach (account_mapping_t& value, journal.account_mappings) { if (value.first.match(xact->payee)) { @@ -139,17 +259,10 @@ xact_t * csv_reader::read_xact(journal_t& journal, account_t * bucket) } } - std::istringstream amount_str(amount); - amount_t amt; - amt.parse(amount_str, PARSE_NO_REDUCE); - if (! amt.has_commodity() && - commodity_pool_t::current_pool->default_commodity) - amt.set_commodity - (*commodity_pool_t::current_pool->default_commodity); - post->amount = amt; - xact->add_post(post.release()); + // Create the "balancing post", which refers to the account for this data + post.reset(new post_t); post->xact = xact.get(); @@ -164,13 +277,17 @@ xact_t * csv_reader::read_xact(journal_t& journal, account_t * bucket) post->set_state(item_t::CLEARED); post->account = bucket; - post->amount = - amt; + + if (! amt.is_null()) + post->amount = - amt; if (! total.empty()) { std::istringstream assigned_amount_str(total); - amount_t assigned_amount; - assigned_amount.parse(assigned_amount_str, PARSE_NO_REDUCE); - post->assigned_amount = assigned_amount; + amt.parse(assigned_amount_str, PARSE_NO_REDUCE); + if (! amt.has_commodity() && + commodity_pool_t::current_pool->default_commodity) + amt.set_commodity(*commodity_pool_t::current_pool->default_commodity); + post->assigned_amount = amt; } xact->add_post(post.release()); diff --git a/src/csv.h b/src/csv.h index 7029d482..5ff8b59e 100644 --- a/src/csv.h +++ b/src/csv.h @@ -56,10 +56,51 @@ class csv_reader std::istream& in; + enum headers_t { + FIELD_DATE = 0, + FIELD_DATE_EFF, + FIELD_CODE, + FIELD_PAYEE, + FIELD_AMOUNT, + FIELD_COST, + FIELD_TOTAL, + FIELD_NOTE, + + FIELD_UNKNOWN + }; + + mask_t date_mask; + mask_t date_eff_mask; + mask_t code_mask; + mask_t payee_mask; + mask_t amount_mask; + mask_t cost_mask; + mask_t total_mask; + mask_t note_mask; + + std::vector index; + std::vector names; + std::vector fields; + + typedef std::map string_map; + public: - csv_reader(std::istream& _in) : in(_in) {} + csv_reader(std::istream& _in) + : in(_in), + date_mask("date"), + date_eff_mask("posted( ?date)?"), + code_mask("code"), + payee_mask("(payee|desc(ription)?|title)"), + amount_mask("amount"), + cost_mask("cost"), + total_mask("total"), + note_mask("note") { + read_index(in); + } - string read_field(); + string read_field(std::istream& in); + char * next_line(std::istream& in); + void read_index(std::istream& in); xact_t * read_xact(journal_t& journal, account_t * bucket); }; -- cgit v1.2.3