Patch adds bayesian matching to gnucash using transaction token frequency to
auto match a transactions split to a given destination account.
Comments/questions welcome.
Chris
Index: import-export/import-backend.c
===================================================================
RCS file: /home/cvs/cvsroot/gnucash/src/import-export/import-backend.c,v
retrieving revision 1.21
diff -u -r1.21 import-backend.c
--- import-export/import-backend.c 26 Feb 2003 19:00:43 -0000 1.21
+++ import-export/import-backend.c 4 Mar 2003 02:12:54 -0000
@@ -54,6 +54,8 @@
* Constants, should idealy be defined a user preference dialog *
\********************************************************************/
+gboolean useBayes = TRUE; /* FIXME: possibly hook this to something user configurable */
+
static const int MATCH_DATE_THRESHOLD=4; /*within 4 days*/
static const int MATCH_DATE_NOT_THRESHOLD = 21;
/**Transaction's who have an online_id kvp frame have been downloaded
@@ -343,6 +345,86 @@
* MatchMap- related functions (storing and retrieving)
*/
+/* Tokenize a string and append to an existing GList(or an empty GList) */
+/* the tokens */
+static GList* tokenize_string(GList* existing_tokens, const char *string)
+{
+ char **tokenized_strings; /* array of strings returned by g_strsplit() */
+ char **stringpos;
+
+ tokenized_strings = g_strsplit(string, " ", 0);
+ stringpos = tokenized_strings;
+
+ /* add each token to the token GList */
+ while(stringpos && *stringpos)
+ {
+ existing_tokens = g_list_prepend(existing_tokens, g_strdup(*stringpos)); /* prepend the char* to the token GList */
+ stringpos++; /* move to the next string */
+ }
+
+ g_strfreev(tokenized_strings); /* free up the strings that g_strsplit() created */
+
+ return existing_tokens;
+}
+
+/* create and return a list of tokens for a given transaction */
+/* NOTE: the user must iterate through the returned GList* and free */
+/* each string like foreach(GList*){g_free()}; as well as free the GList */
+/* itself */
+static GList* TransactionGetTokens(Transaction* transaction)
+{
+ GList* tokens;
+ const char* text;
+ time_t transtime;
+ struct tm *tm_struct;
+ char local_day_of_week[16];
+ Split* split;
+ int split_index;
+
+ tokens = 0; /* start off with an empty list */
+
+ /* make tokens from the transaction description */
+ text = xaccTransGetDescription(transaction);
+ tokens = tokenize_string(tokens, text);
+
+ /* the day of week the transaction occured is a good indicator of */
+ /* what account this transaction belongs in */
+ /* get the date and covert it to day of week as a token */
+ transtime = xaccTransGetDate(transaction);
+ tm_struct = gmtime(&transtime);
+ if(!strftime(local_day_of_week, sizeof(local_day_of_week), "%A", tm_struct))
+ {
+ PERR("TransactionGetTokens: error, strftime failed\n");
+ }
+ tokens = g_list_prepend(tokens, g_strdup(local_day_of_week)); /* we cannot add a localy allocated string to this array, dup it so it frees the same way the rest do */
+
+ /* make tokens from the memo of each split of this transaction */
+ split_index = 0;
+ while((split = xaccTransGetSplit(transaction, split_index)))
+ {
+ text = xaccSplitGetMemo(split);
+ tokens = tokenize_string(tokens, text);
+ split_index++; /* next split */
+ }
+
+ return tokens; /* return the pointer to the GList */
+}
+
+static gboolean TransactionFreeTokens(GList *tokens)
+{
+ GList *node;
+
+ /* free up each g_strdup'ed string in the GList */
+ for(node = tokens; node; node = node->next)
+ {
+ g_free(node->data);
+ }
+
+ g_list_free(tokens); /* free the GList */
+
+ return TRUE;
+}
+
/* searches using the GNCImportTransInfo through all existing transactions */
/* if there is an exact match of the description and memo */
static Account *
@@ -351,16 +433,25 @@
{
GncImportMatchMap *tmp_map;
Account *result;
+ Transaction* transaction;
+ GList* tokens;
g_assert (info);
-
tmp_map = ((matchmap != NULL) ? matchmap :
gnc_imap_create_from_account
(xaccSplitGetAccount
(gnc_import_TransInfo_get_fsplit (info))));
-
- result = gnc_imap_find_account
- (tmp_map, GNCIMPORT_DESC,
- xaccTransGetDescription (gnc_import_TransInfo_get_trans (info)));
+ if(useBayes)
+ {
+ transaction = gnc_import_TransInfo_get_trans(info); /* convert from transaction_info* to transaction* */
+ tokens = TransactionGetTokens(transaction); /* get the tokens for this transaction* */
+ result = gnc_imap_find_account_bayes(tmp_map, tokens); /* try to find the destination account for this transaction from its tokens */
+ TransactionFreeTokens(tokens); /* free up the strings and the GList */
+ } else /* old system of transaction to account matching */
+ {
+ result = gnc_imap_find_account
+ (tmp_map, GNCIMPORT_DESC,
+ xaccTransGetDescription (gnc_import_TransInfo_get_trans (info)));
+ }
/* Disable matching by memo, until bayesian filtering is implemented.
It's currently unlikely to help, and has adverse effects, causing false positives,
@@ -390,6 +481,9 @@
GncImportMatchMap *tmp_matchmap = NULL;
Account *dest;
const char *descr, *memo;
+ Transaction *transaction;
+ GList *tokens;
+
g_assert (trans_info);
/* This will store the destination account of the selected match if
@@ -410,20 +504,30 @@
(xaccSplitGetAccount
(gnc_import_TransInfo_get_fsplit (trans_info))));
- descr = xaccTransGetDescription
- (gnc_import_TransInfo_get_trans (trans_info));
- if (descr && (strlen (descr) > 0))
- gnc_imap_add_account (tmp_matchmap,
+ /* see what matching system we are currently using */
+ if(useBayes)
+ {
+ transaction = gnc_import_TransInfo_get_trans(trans_info); /* convert from transaction_info* into transaction* */
+ tokens = TransactionGetTokens(transaction); /* tokenize this transaction */
+ gnc_imap_add_account_bayes(tmp_matchmap, tokens, dest); /* add the tokens to the imap with the given destination account */
+ TransactionFreeTokens(tokens); /* free the strings and the GList */
+ } else /* old matching system */
+ {
+ descr = xaccTransGetDescription
+ (gnc_import_TransInfo_get_trans (trans_info));
+ if (descr && (strlen (descr) > 0))
+ gnc_imap_add_account (tmp_matchmap,
GNCIMPORT_DESC,
descr,
dest);
- memo = xaccSplitGetMemo
- (gnc_import_TransInfo_get_fsplit (trans_info));
- if (memo && (strlen (memo) > 0))
- gnc_imap_add_account (tmp_matchmap,
+ memo = xaccSplitGetMemo
+ (gnc_import_TransInfo_get_fsplit (trans_info));
+ if (memo && (strlen (memo) > 0))
+ gnc_imap_add_account (tmp_matchmap,
GNCIMPORT_MEMO,
memo,
dest);
+ } /* if(useBayes) */
if (matchmap == NULL)
gnc_imap_destroy (tmp_matchmap);
@@ -935,7 +1039,7 @@
/* if we haven't manually selected a destination account for this transaction */
if(gnc_import_TransInfo_get_destacc_selected_manually(transaction_info) == FALSE)
{
- /* Try to find a previous selected destination account string match for the ADD action */
+ /* Try to find the destination account for this transaction based on prior ones */
new_destacc = matchmap_find_destination(matchmap, transaction_info);
gnc_import_TransInfo_set_destacc(transaction_info, new_destacc, FALSE);
} else
Index: import-export/import-match-map.c
===================================================================
RCS file: /home/cvs/cvsroot/gnucash/src/import-export/import-match-map.c,v
retrieving revision 1.4
diff -u -r1.4 import-match-map.c
--- import-export/import-match-map.c 14 Dec 2002 19:25:50 -0000 1.4
+++ import-export/import-match-map.c 4 Mar 2003 02:12:54 -0000
@@ -28,8 +28,19 @@
@author Copyright (C) 2002 Derek Atkins <[EMAIL PROTECTED]>
*/
#include <string.h>
+#include <glib.h>
#include "import-match-map.h"
#include "kvp_frame.h"
+#include "Group.h"
+#include "gnc-ui-util.h"
+#include "gnc-engine-util.h"
+
+/********************************************************************\
+ * Constants *
+\********************************************************************/
+
+static short module = MOD_IMPORT;
+
struct _GncImportMatchMap {
kvp_frame * frame;
@@ -37,7 +48,8 @@
GNCBook * book;
};
-#define IMAP_FRAME "import-map"
+#define IMAP_FRAME "import-map"
+#define IMAP_FRAME_BAYES "import-map-bayes"
static GncImportMatchMap *
gnc_imap_create_from_frame (kvp_frame *frame, Account *acc, GNCBook *book)
@@ -99,6 +111,9 @@
/* Clear the IMAP_FRAME kvp */
kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME);
+ /* Clear the bayes kvp, IMAP_FRAME_BAYES */
+ kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME_BAYES);
+
/* XXX: mark the account (or book) as dirty! */
}
@@ -141,6 +156,294 @@
kvp_value_delete (value);
/* XXX Mark the account (or book) as dirty! */
+}
+
+
+
+
+/* Below here is the bayes transaction to account matching system */
+struct account_token_count
+{
+ char* account_name;
+ gint64 token_count; /* occurances of a given token for this account_name */
+};
+
+/* total_count and the token_count for a given account let us calculate the */
+/* probability of a given account with any single token */
+struct token_accounts_info
+{
+ GList *accounts; /* array of struct account_token_count */
+ gint64 total_count;
+};
+
+/* gpointer is a pointer to a struct token_accounts_info */
+/* NOTE: can always assume that keys are unique, reduces code in this function */
+static void buildTokenInfo(const char *key, kvp_value *value, gpointer data)
+{
+ struct token_accounts_info *tokenInfo = (struct token_accounts_info*)data;
+ struct account_token_count* this_account;
+
+ // PINFO("buildTokenInfo: account '%s', token_count: '%ld'\n", (char*)key, (long)kvp_value_get_gint64(value));
+
+ /* add the count to the total_count */
+ tokenInfo->total_count += kvp_value_get_gint64(value);
+
+ /* allocate a new structure for this account and it's token count */
+ this_account = (struct account_token_count*)(g_new0(struct account_token_count, 1));
+ this_account->account_name = (char*)key; /* fill in the account name */
+ this_account->token_count = kvp_value_get_gint64(value); /* fill in the number of tokens found for this accout name */
+
+ tokenInfo->accounts = g_list_prepend(tokenInfo->accounts, this_account); /* append onto the glist a pointer to the new account_token_count structure */
+}
+
+/* intermediate values used to calculate the bayes probability of a given account */
+/* where p(AB) = (a*b)/[a*b + (1-a)(1-b)], product is (a*b), */
+/* product_difference is (1-a) * (1-b) */
+struct account_probability
+{
+ double product; /* product of probabilities */
+ double product_difference; /* product of (1-probabilities) */
+};
+
+/* convert a hash table of account names and (struct account_probability*) */
+/* into a hash table of 100000x the percentage match value, ie. 10% would be */
+/* 0.10 * 100000 = 10000 */
+#define PROBABILITY_FACTOR 100000
+static void buildProbabilities(gpointer key, gpointer value, gpointer data)
+{
+ GHashTable *final_probabilities = (GHashTable*)data;
+ struct account_probability *account_p = (struct account_probability*)value;
+
+ /* P(AB) = A*B / [A*B + (1-A)*(1-B)] */
+ /* NOTE: so we only keep track of a running product(A*B*C...) and product difference ((1-A)(1-B)...) */
+ gint32 probability = (account_p->product / (account_p->product + account_p->product_difference)) * PROBABILITY_FACTOR;
+
+ PINFO("P('%s') = '%d'\n", (char*)key, probability);
+
+ g_hash_table_insert(final_probabilities, key, (gpointer)probability);
+}
+
+/* Frees an array of the same time that buildProperties built */
+static void freeProbabilities(gpointer key, gpointer value, gpointer data)
+{
+ g_free(value); /* free up the struct account_probability that was allocated in gnc_imap_find_account_bayes() */
+}
+
+/* holds an account name and its corresponding integer probability */
+/* the integer probability is some factor of 10 */
+struct account_info
+{
+ char* account_name;
+ gint32 probability;
+};
+
+/* Find the highest probability and the corresponding account name */
+/* store in data, a (struct account_info*) */
+/* NOTE: this is a g_hash_table_foreach() function for a hash table of entries */
+/* key is a pointer to the account name, value is a gint32, 100000x the probability for this account */
+static void highestProbability(gpointer key, gpointer value, gpointer data)
+{
+ struct account_info *account_i = (struct account_info*)data;
+
+ /* if the current probability is greater than the stored, store the current */
+ if((gint32)value > account_i->probability)
+ {
+ account_i->probability = (gint32)value; /* save the new highest probability */
+ account_i->account_name = key; /* copy the account name over */
+ }
+}
+
+
+#define threshold (.90 * PROBABILITY_FACTOR) /* 90% */
+
+/* Look up an Account in the map */
+Account* gnc_imap_find_account_bayes(GncImportMatchMap *imap, GList *tokens)
+{
+ struct token_accounts_info tokenInfo; /* holds the accounts and total token count for a single token */
+
+ GList *current_token; /* pointer to the current token from the input GList *tokens */
+ GList *current_account_token; /* pointer to the struct account_token_count */
+
+ struct account_token_count *account_c; /* an account name and the number of times a token has appeared for the account */
+
+ GHashTable *running_probabilities = g_hash_table_new(g_str_hash, g_str_equal);
+ GHashTable *final_probabilities = g_hash_table_new(g_str_hash, g_str_equal);
+ struct account_probability *account_p; /* intermediate storage of values to compute the bayes probability of an account */
+
+ struct account_info account_i;
+
+ kvp_value* value;
+ kvp_frame* token_frame;
+
+ ENTER(" ");
+
+ /* check to see if the imap is NULL */
+ if(!imap)
+ {
+ PINFO("imap is null, returning null");
+ LEAVE(" ");
+ return NULL;
+ }
+
+ /* find the probability for each account that contains any of the tokens */
+ /* in the input tokens list */
+ for(current_token = tokens; current_token; current_token = current_token->next)
+ {
+ memset(&tokenInfo, 0, sizeof(struct token_accounts_info)); /* zero out the token_accounts_info structure */
+
+ PINFO("token: '%s'", (char*)current_token->data);
+
+ value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES, (char*)current_token->data, NULL); /* find the slot for the given token off of the source account for these tokens, search off of the IMAP_FRAME_BAYES path so we aren't looking from the parent of the entire kvp tree */
+
+ /* if value is null we should skip over this token */
+ if(!value)
+ continue;
+
+ token_frame = kvp_value_get_frame(value); /* convert the slot(value) into a the frame that contains the list of accounts */
+
+ /* token_frame should NEVER be null */
+ if(!token_frame)
+ {
+ PERR("token '%s' has no accounts", (char*)current_token->data);
+ continue; /* skip over this token */
+ }
+
+ /* process the accounts for this token, adding the account if it */
+ /* doesn't already exist or adding to the existing accounts token */
+ /* count if it does */
+ kvp_frame_for_each_slot(token_frame, buildTokenInfo, &tokenInfo);
+
+ /* for each account we have just found, see if the account already exists */
+ /* in the list of account probabilities, if not add it */
+ for(current_account_token = tokenInfo.accounts; current_account_token; current_account_token = current_account_token->next)
+ {
+ account_c = (struct account_token_count*)current_account_token->data; /* get the account name and corresponding token count */
+
+ PINFO("account_c->account_name('%s'), account_c->token_count('%ld')/total_count('%ld')", account_c->account_name, (long)account_c->token_count, (long)tokenInfo.total_count);
+
+ account_p = g_hash_table_lookup(running_probabilities, account_c->account_name);
+
+ /* if the account exists in the list then continue the running probablities */
+ if(account_p)
+ {
+ account_p->product = ((double)account_c->token_count / (double)tokenInfo.total_count) * account_p->product;
+ account_p->product_difference = ((double)1 - ((double)account_c->token_count / (double)tokenInfo.total_count)) * account_p->product_difference;
+ PINFO("product == %f, product_difference == %f", account_p->product, account_p->product_difference);
+ } else /* add a new entry */
+ {
+ PINFO("adding a new entry for this account");
+ account_p = (struct account_probability*)g_new0(struct account_probability, 1);
+
+ /* set the product and product difference values */
+ account_p->product = ((double)account_c->token_count / (double)tokenInfo.total_count);
+ account_p->product_difference = (double)1 - ((double)account_c->token_count / (double)tokenInfo.total_count);
+
+ PINFO("product == %f, product_difference == %f", account_p->product, account_p->product_difference);
+
+ /* add the account name and (struct account_probability*) to the hash table */
+ g_hash_table_insert(running_probabilities, account_c->account_name, account_p);
+ }
+ } /* for all accounts in tokenInfo */
+
+ /* free the data in tokenInfo */
+ for(current_account_token = tokenInfo.accounts; current_account_token; current_account_token = current_account_token->next)
+ {
+ /* free up each struct account_token_count we allocated */
+ g_free((struct account_token_count*)current_account_token->data);
+ }
+
+ g_list_free(tokenInfo.accounts); /* free the accounts GList */
+ }
+
+ /* build a hash table of account names and their final probabilities */
+ /* from each entry in the running_probabilties hash table */
+ g_hash_table_foreach(running_probabilities, buildProbabilities, final_probabilities);
+
+ /* find the highest probabilty and the corresponding account */
+ memset(&account_i, 0, sizeof(struct account_info));
+ g_hash_table_foreach(final_probabilities, highestProbability, &account_i);
+
+ /* free each element of the running_probabilities hash */
+ g_hash_table_foreach(running_probabilities, freeProbabilities, NULL);
+
+ /* free the hash tables */
+ g_hash_table_destroy(running_probabilities);
+ g_hash_table_destroy(final_probabilities);
+
+ PINFO("highest P('%s') = '%d'", account_i.account_name, account_i.probability);
+
+ /* has this probability met our threshold? */
+ if(account_i.probability >= threshold)
+ {
+ PINFO("found match");
+ LEAVE(" ");
+ return xaccGetAccountFromFullName(gnc_book_get_group(imap->book), account_i.account_name, gnc_get_account_separator());
+ }
+
+ PINFO("no match");
+ LEAVE(" ");
+
+ return NULL; /* we didn't meet our threshold, return NULL for an account */
+}
+
+
+/* Updates the imap for a given account using a list of tokens */
+void gnc_imap_add_account_bayes(GncImportMatchMap *imap, GList *tokens, Account *acc)
+{
+ GList *current_token;
+ kvp_value *value;
+ gint64 token_count;
+ char* account_fullname;
+ kvp_value *new_value; /* the value that will be added back into the kvp tree */
+
+ ENTER(" ");
+
+ /* if imap is null return */
+ if(!imap)
+ {
+ LEAVE(" ");
+ return;
+ }
+
+ account_fullname = xaccAccountGetFullName(acc, gnc_get_account_separator());
+
+ PINFO("account name: '%s'\n", account_fullname);
+
+ /* process each token in the list */
+ for(current_token = g_list_first(tokens); current_token; current_token = current_token->next)
+ {
+ token_count = 0; /* start off with no tokens for this account */
+
+ PINFO("adding token '%s'\n", (char*)current_token->data);
+
+ /* is this token/account_name already in the kvp tree? */
+ value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES, (char*)current_token->data, account_fullname, NULL);
+
+ /* if the token/account is already in the tree, read the current value from the tree and use this for the basis of the value we are putting back */
+ if(value)
+ {
+ PINFO("found existing value of '%ld'\n", (long)kvp_value_get_gint64(value));
+
+ /* convert this value back into an integer */
+ token_count+=kvp_value_get_gint64(value);
+ }
+
+ token_count++; /* add one to the existing token count */
+
+ /* create a new value */
+ new_value = kvp_value_new_gint64(token_count);
+
+ /* insert the value into the kvp tree at */
+ /* /imap->frame/IMAP_FRAME/token_string/account_name_string */
+ kvp_frame_set_slot_path(imap->frame, new_value, IMAP_FRAME_BAYES, (char*)current_token->data, account_fullname, NULL);
+
+ /* kvp_frame_set_slot_path() copied the value so we need to delete this one ;-) */
+ kvp_value_delete(new_value);
+ }
+
+ g_free(account_fullname); /* free up the account fullname string */
+
+ LEAVE(" ");
}
/** @} */
Index: import-export/import-match-map.h
===================================================================
RCS file: /home/cvs/cvsroot/gnucash/src/import-export/import-match-map.h,v
retrieving revision 1.2
diff -u -r1.2 import-match-map.h
--- import-export/import-match-map.h 7 Dec 2002 08:13:42 -0000 1.2
+++ import-export/import-match-map.h 4 Mar 2003 02:12:54 -0000
@@ -48,14 +48,23 @@
void gnc_imap_clear (GncImportMatchMap *imap);
/** Look up an Account in the map */
-Account * gnc_imap_find_account (GncImportMatchMap *imap, const char *category,
- const char *key);
+Account* gnc_imap_find_account(GncImportMatchMap *imap, const char* category, const char *key);
/** Store an Account in the map. This mapping is immediatly stored in
the underlying kvp frame, regardless of whether the MatchMap is
destroyed later or not. */
void gnc_imap_add_account (GncImportMatchMap *imap, const char *category,
const char *key, Account *acc);
+
+/** Look up an Account in the map from a GList* of pointers to strings(tokens)
+ from the current transaction */
+Account* gnc_imap_find_account_bayes(GncImportMatchMap *imap, GList* tokens);
+
+/** Store an Account in the map. This mapping is immediatly stored in
+ the underlying kvp frame, regardless of whether the MatchMap is
+ destroyed later or not. */
+void gnc_imap_add_account_bayes (GncImportMatchMap *imap, GList* tokens,
+ Account *acc);
/** @name Some well-known categories
_______________________________________________
gnucash-devel mailing list
[EMAIL PROTECTED]
http://www.gnucash.org/cgi-bin/mailman/listinfo/gnucash-devel