Patch adds bayesian matching to gnucash using transaction token frequency to 
auto match a transactions split to a given destination account.  
Comments/questions welcome.

Chris

Index: import-export/import-backend.c
===================================================================
RCS file: /home/cvs/cvsroot/gnucash/src/import-export/import-backend.c,v
retrieving revision 1.21
diff -u -r1.21 import-backend.c
--- import-export/import-backend.c	26 Feb 2003 19:00:43 -0000	1.21
+++ import-export/import-backend.c	4 Mar 2003 02:12:54 -0000
@@ -54,6 +54,8 @@
  *   Constants, should idealy be defined a user preference dialog    *
 \********************************************************************/
 
+gboolean useBayes = TRUE; /* FIXME: possibly hook this to something user configurable */
+
 static const int MATCH_DATE_THRESHOLD=4; /*within 4 days*/
 static const int MATCH_DATE_NOT_THRESHOLD = 21;
 /**Transaction's who have an online_id kvp frame have been downloaded 
@@ -343,6 +345,86 @@
  * MatchMap- related functions (storing and retrieving)
  */
 
+/* Tokenize a string and append to an existing GList(or an empty GList) */
+/* the tokens */
+static GList* tokenize_string(GList* existing_tokens, const char *string)
+{
+  char **tokenized_strings; /* array of strings returned by g_strsplit() */
+  char **stringpos;
+
+  tokenized_strings = g_strsplit(string, " ", 0);
+  stringpos = tokenized_strings;
+
+  /* add each token to the token GList */
+  while(stringpos && *stringpos)
+  {
+    existing_tokens = g_list_prepend(existing_tokens, g_strdup(*stringpos)); /* prepend the char* to the token GList */
+    stringpos++; /* move to the next string */
+  }
+
+  g_strfreev(tokenized_strings); /* free up the strings that g_strsplit() created */
+
+  return existing_tokens;
+}
+
+/* create and return a list of tokens for a given transaction */
+/* NOTE: the user must iterate through the returned GList* and free */
+/* each string like foreach(GList*){g_free()}; as well as free the GList */
+/* itself */
+static GList* TransactionGetTokens(Transaction* transaction)
+{
+  GList* tokens;
+  const char* text;
+  time_t transtime;
+  struct tm *tm_struct;
+  char local_day_of_week[16];
+  Split* split;
+  int split_index;
+
+  tokens = 0; /* start off with an empty list */
+
+  /* make tokens from the transaction description */
+  text = xaccTransGetDescription(transaction);
+  tokens = tokenize_string(tokens, text);
+
+  /* the day of week the transaction occured is a good indicator of */
+  /* what account this transaction belongs in */
+  /* get the date and covert it to day of week as a token */
+  transtime = xaccTransGetDate(transaction);
+  tm_struct = gmtime(&transtime);
+  if(!strftime(local_day_of_week, sizeof(local_day_of_week), "%A", tm_struct))
+    {
+      PERR("TransactionGetTokens: error, strftime failed\n");
+    }
+   tokens = g_list_prepend(tokens, g_strdup(local_day_of_week)); /* we cannot add a localy allocated string to this array, dup it so it frees the same way the rest do */
+
+  /* make tokens from the memo of each split of this transaction */
+  split_index = 0;
+  while((split = xaccTransGetSplit(transaction, split_index)))
+  {
+       text = xaccSplitGetMemo(split);
+       tokens = tokenize_string(tokens, text);
+       split_index++; /* next split */
+  }
+
+  return tokens; /* return the pointer to the GList */
+}
+
+static gboolean TransactionFreeTokens(GList *tokens)
+{
+  GList *node;
+  
+  /* free up each g_strdup'ed string in the GList */
+  for(node = tokens; node; node = node->next)
+    {
+      g_free(node->data);
+    }
+
+  g_list_free(tokens); /* free the GList */
+
+  return TRUE;
+}
+
 /* searches using the GNCImportTransInfo through all existing transactions */
 /* if there is an exact match of the description and memo */
 static Account *
@@ -351,16 +433,25 @@
 {
   GncImportMatchMap *tmp_map;
   Account *result;
+  Transaction* transaction;
+  GList* tokens;
   g_assert (info);
-  
   tmp_map = ((matchmap != NULL) ? matchmap : 
 	     gnc_imap_create_from_account 
 	     (xaccSplitGetAccount
 	      (gnc_import_TransInfo_get_fsplit (info))));
-
-  result = gnc_imap_find_account 
-    (tmp_map, GNCIMPORT_DESC, 
-     xaccTransGetDescription (gnc_import_TransInfo_get_trans (info)));
+  if(useBayes)
+    {
+      transaction = gnc_import_TransInfo_get_trans(info); /* convert from transaction_info* to transaction* */
+      tokens = TransactionGetTokens(transaction); /* get the tokens for this transaction* */
+      result = gnc_imap_find_account_bayes(tmp_map, tokens); /* try to find the destination account for this transaction from its tokens */
+      TransactionFreeTokens(tokens); /* free up the strings and the GList */
+    } else /* old system of transaction to account matching */
+    {
+      result = gnc_imap_find_account 
+        (tmp_map, GNCIMPORT_DESC, 
+         xaccTransGetDescription (gnc_import_TransInfo_get_trans (info)));
+    }
 
   /* Disable matching by memo, until bayesian filtering is implemented. 
      It's currently unlikely to help, and has adverse effects, causing false positives,
@@ -390,6 +481,9 @@
   GncImportMatchMap *tmp_matchmap = NULL;
   Account *dest;
   const char *descr, *memo;
+  Transaction *transaction;
+  GList *tokens;
+
   g_assert (trans_info);
 
   /* This will store the destination account of the selected match if 
@@ -410,20 +504,30 @@
 		  (xaccSplitGetAccount
 		   (gnc_import_TransInfo_get_fsplit (trans_info))));
 
-  descr = xaccTransGetDescription 
-      (gnc_import_TransInfo_get_trans (trans_info));
-  if (descr && (strlen (descr) > 0))
-      gnc_imap_add_account (tmp_matchmap, 
+  /* see what matching system we are currently using */
+  if(useBayes)
+    {
+      transaction = gnc_import_TransInfo_get_trans(trans_info); /* convert from transaction_info* into transaction* */
+      tokens = TransactionGetTokens(transaction); /* tokenize this transaction */
+      gnc_imap_add_account_bayes(tmp_matchmap, tokens, dest); /* add the tokens to the imap with the given destination account */
+      TransactionFreeTokens(tokens); /* free the strings and the GList */
+  } else /* old matching system */
+    {
+      descr = xaccTransGetDescription 
+          (gnc_import_TransInfo_get_trans (trans_info));
+      if (descr && (strlen (descr) > 0))
+          gnc_imap_add_account (tmp_matchmap, 
 			    GNCIMPORT_DESC, 
 			    descr,
 			    dest);
-  memo = xaccSplitGetMemo 
-      (gnc_import_TransInfo_get_fsplit (trans_info));
-  if (memo && (strlen (memo) > 0))
-      gnc_imap_add_account (tmp_matchmap, 
+      memo = xaccSplitGetMemo 
+          (gnc_import_TransInfo_get_fsplit (trans_info));
+      if (memo && (strlen (memo) > 0))
+          gnc_imap_add_account (tmp_matchmap, 
 			    GNCIMPORT_MEMO, 
 			    memo,
 			    dest);
+    } /* if(useBayes) */
 
   if (matchmap == NULL)
     gnc_imap_destroy (tmp_matchmap);
@@ -935,7 +1039,7 @@
   /* if we haven't manually selected a destination account for this transaction */
   if(gnc_import_TransInfo_get_destacc_selected_manually(transaction_info) == FALSE)
   {
-    /* Try to find a previous selected destination account string match for the ADD action */
+    /* Try to find the destination account for this transaction based on prior ones */
     new_destacc = matchmap_find_destination(matchmap, transaction_info);
     gnc_import_TransInfo_set_destacc(transaction_info, new_destacc, FALSE);
   } else
Index: import-export/import-match-map.c
===================================================================
RCS file: /home/cvs/cvsroot/gnucash/src/import-export/import-match-map.c,v
retrieving revision 1.4
diff -u -r1.4 import-match-map.c
--- import-export/import-match-map.c	14 Dec 2002 19:25:50 -0000	1.4
+++ import-export/import-match-map.c	4 Mar 2003 02:12:54 -0000
@@ -28,8 +28,19 @@
     @author Copyright (C) 2002 Derek Atkins <[EMAIL PROTECTED]>
  */
 #include <string.h>
+#include <glib.h>
 #include "import-match-map.h"
 #include "kvp_frame.h"
+#include "Group.h"
+#include "gnc-ui-util.h"
+#include "gnc-engine-util.h"
+
+/********************************************************************\
+ *   Constants   *
+\********************************************************************/
+
+static short module = MOD_IMPORT;
+
 
 struct _GncImportMatchMap {
   kvp_frame *	frame;
@@ -37,7 +48,8 @@
   GNCBook *	book;
 };
 
-#define IMAP_FRAME	"import-map"
+#define IMAP_FRAME              "import-map"
+#define IMAP_FRAME_BAYES	"import-map-bayes"
 
 static GncImportMatchMap *
 gnc_imap_create_from_frame (kvp_frame *frame, Account *acc, GNCBook *book)
@@ -99,6 +111,9 @@
   /* Clear the IMAP_FRAME kvp */
   kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME);
 
+  /* Clear the bayes kvp, IMAP_FRAME_BAYES */
+  kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME_BAYES);
+
   /* XXX: mark the account (or book) as dirty! */
 }
 
@@ -141,6 +156,294 @@
   kvp_value_delete (value);
 
   /* XXX Mark the account (or book) as dirty! */
+}
+
+
+
+
+/* Below here is the bayes transaction to account matching system */
+struct account_token_count
+{
+  char* account_name;
+  gint64 token_count; /* occurances of a given token for this account_name */
+};
+
+/* total_count and the token_count for a given account let us calculate the */
+/* probability of a given account with any single token */
+struct token_accounts_info
+{
+  GList *accounts; /* array of struct account_token_count */
+  gint64 total_count;
+};
+
+/* gpointer is a pointer to a struct token_accounts_info */
+/* NOTE: can always assume that keys are unique, reduces code in this function */
+static void buildTokenInfo(const char *key, kvp_value *value, gpointer data)
+{
+  struct token_accounts_info *tokenInfo = (struct token_accounts_info*)data;
+  struct account_token_count* this_account;
+
+  //  PINFO("buildTokenInfo: account '%s', token_count: '%ld'\n", (char*)key, (long)kvp_value_get_gint64(value));
+
+  /* add the count to the total_count */
+  tokenInfo->total_count += kvp_value_get_gint64(value);
+
+  /* allocate a new structure for this account and it's token count */
+  this_account = (struct account_token_count*)(g_new0(struct account_token_count, 1));
+  this_account->account_name = (char*)key;   /* fill in the account name */
+  this_account->token_count = kvp_value_get_gint64(value);   /* fill in the number of tokens found for this accout name */
+
+  tokenInfo->accounts = g_list_prepend(tokenInfo->accounts, this_account); /* append onto the glist a pointer to the new account_token_count structure */
+}
+
+/* intermediate values used to calculate the bayes probability of a given account */
+/* where p(AB) = (a*b)/[a*b + (1-a)(1-b)], product is (a*b), */
+/* product_difference is (1-a) * (1-b) */
+struct account_probability
+{
+  double product; /* product of probabilities */
+  double product_difference; /* product of (1-probabilities) */
+};
+
+/* convert a hash table of account names and (struct account_probability*) */
+/* into a hash table of 100000x the percentage match value, ie. 10% would be */
+/* 0.10 * 100000 = 10000 */
+#define PROBABILITY_FACTOR 100000
+static void buildProbabilities(gpointer key, gpointer value, gpointer data)
+{
+    GHashTable *final_probabilities = (GHashTable*)data;
+    struct account_probability *account_p = (struct account_probability*)value;
+
+    /* P(AB) = A*B / [A*B + (1-A)*(1-B)] */
+    /* NOTE: so we only keep track of a running product(A*B*C...) and product difference ((1-A)(1-B)...) */
+    gint32 probability = (account_p->product / (account_p->product + account_p->product_difference)) * PROBABILITY_FACTOR;
+
+    PINFO("P('%s') = '%d'\n", (char*)key, probability);
+
+    g_hash_table_insert(final_probabilities, key, (gpointer)probability);
+}
+
+/* Frees an array of the same time that buildProperties built */
+static void freeProbabilities(gpointer key, gpointer value, gpointer data)
+{
+  g_free(value); /* free up the struct account_probability that was allocated in gnc_imap_find_account_bayes() */
+}
+
+/* holds an account name and its corresponding integer probability */
+/* the integer probability is some factor of 10 */
+struct account_info
+{
+  char* account_name;
+  gint32 probability;
+};
+
+/* Find the highest probability and the corresponding account name */
+/* store in data, a (struct account_info*) */
+/* NOTE: this is a g_hash_table_foreach() function for a hash table of entries */
+/* key is a  pointer to the account name, value is a gint32, 100000x the probability for this account */
+static void highestProbability(gpointer key, gpointer value, gpointer data)
+{
+  struct account_info *account_i = (struct account_info*)data;
+
+  /* if the current probability is greater than the stored, store the current */
+  if((gint32)value > account_i->probability)
+    {
+      account_i->probability = (gint32)value; /* save the new highest probability */
+      account_i->account_name = key; /* copy the account name over */
+    }
+}
+
+
+#define threshold (.90 * PROBABILITY_FACTOR) /* 90% */
+
+/* Look up an Account in the map */
+Account* gnc_imap_find_account_bayes(GncImportMatchMap *imap, GList *tokens)
+{
+  struct token_accounts_info tokenInfo; /* holds the accounts and total token count for a single token */
+
+  GList *current_token; /* pointer to the current token from the input GList *tokens */
+  GList *current_account_token; /* pointer to the struct account_token_count */
+
+  struct account_token_count *account_c; /* an account name and the number of times a token has appeared for the account */
+
+  GHashTable *running_probabilities = g_hash_table_new(g_str_hash, g_str_equal);
+  GHashTable *final_probabilities = g_hash_table_new(g_str_hash, g_str_equal);
+  struct account_probability *account_p; /* intermediate storage of values to compute the bayes probability of an account */
+
+  struct account_info account_i;
+
+  kvp_value* value;
+  kvp_frame* token_frame;
+
+  ENTER(" ");
+
+  /* check to see if the imap is NULL */
+  if(!imap)
+  {
+    PINFO("imap is null, returning null");
+    LEAVE(" ");
+    return NULL;
+  }
+
+  /* find the probability for each account that contains any of the tokens */
+  /* in the input tokens list */
+  for(current_token = tokens; current_token; current_token = current_token->next)
+    {
+      memset(&tokenInfo, 0, sizeof(struct token_accounts_info)); /* zero out the token_accounts_info structure */
+
+      PINFO("token: '%s'", (char*)current_token->data);
+
+      value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES, (char*)current_token->data, NULL); /* find the slot for the given token off of the source account for these tokens, search off of the IMAP_FRAME_BAYES path so we aren't looking from the parent of the entire kvp tree */
+
+      /* if value is null we should skip over this token */
+      if(!value)
+      	continue;
+
+      token_frame = kvp_value_get_frame(value); /* convert the slot(value) into a the frame that contains the list of accounts */
+
+      /* token_frame should NEVER be null */
+      if(!token_frame)
+      {
+	PERR("token '%s' has no accounts", (char*)current_token->data);
+        continue; /* skip over this token */
+      }
+
+      /* process the accounts for this token, adding the account if it */
+      /* doesn't already exist or adding to the existing accounts token */
+      /* count if it does */
+      kvp_frame_for_each_slot(token_frame, buildTokenInfo, &tokenInfo);
+
+      /* for each account we have just found, see if the account already exists */
+      /* in the list of account probabilities, if not add it */
+      for(current_account_token = tokenInfo.accounts; current_account_token; current_account_token = current_account_token->next)
+      {
+	account_c = (struct account_token_count*)current_account_token->data; /* get the account name and corresponding token count */
+
+	PINFO("account_c->account_name('%s'), account_c->token_count('%ld')/total_count('%ld')", account_c->account_name, (long)account_c->token_count, (long)tokenInfo.total_count); 
+
+	account_p = g_hash_table_lookup(running_probabilities, account_c->account_name);
+
+	/* if the account exists in the list then continue the running probablities */
+	if(account_p)
+	  {
+	    account_p->product = ((double)account_c->token_count / (double)tokenInfo.total_count) * account_p->product;
+	    account_p->product_difference = ((double)1 - ((double)account_c->token_count / (double)tokenInfo.total_count)) * account_p->product_difference;
+	    PINFO("product == %f, product_difference == %f", account_p->product, account_p->product_difference);
+	  } else /* add a new entry */
+	  {
+	    PINFO("adding a new entry for this account");
+	    account_p = (struct account_probability*)g_new0(struct account_probability, 1);
+
+	    /* set the product and product difference values */
+	    account_p->product = ((double)account_c->token_count / (double)tokenInfo.total_count);
+	    account_p->product_difference = (double)1 - ((double)account_c->token_count / (double)tokenInfo.total_count);
+
+	    PINFO("product == %f, product_difference == %f", account_p->product, account_p->product_difference);
+	    
+	    /* add the account name and (struct account_probability*) to the hash table */
+	    g_hash_table_insert(running_probabilities, account_c->account_name, account_p);
+	  }
+      } /* for all accounts in tokenInfo */
+
+      /* free the data in tokenInfo */
+      for(current_account_token = tokenInfo.accounts; current_account_token; current_account_token = current_account_token->next)
+      {
+	/* free up each struct account_token_count we allocated */
+	g_free((struct account_token_count*)current_account_token->data);
+      }
+
+      g_list_free(tokenInfo.accounts); /* free the accounts GList */
+    }
+
+  /* build a hash table of account names and their final probabilities */
+  /* from each entry in the running_probabilties hash table */
+  g_hash_table_foreach(running_probabilities, buildProbabilities, final_probabilities);
+
+  /* find the highest probabilty and the corresponding account */
+  memset(&account_i, 0, sizeof(struct account_info));
+  g_hash_table_foreach(final_probabilities, highestProbability, &account_i);
+
+  /* free each element of the running_probabilities hash */
+  g_hash_table_foreach(running_probabilities, freeProbabilities, NULL);
+
+  /* free the hash tables */
+  g_hash_table_destroy(running_probabilities);
+  g_hash_table_destroy(final_probabilities);
+
+  PINFO("highest P('%s') = '%d'", account_i.account_name, account_i.probability);
+
+  /* has this probability met our threshold? */
+  if(account_i.probability >= threshold)
+    {
+      PINFO("found match");
+      LEAVE(" ");
+      return xaccGetAccountFromFullName(gnc_book_get_group(imap->book), account_i.account_name, gnc_get_account_separator());
+    }
+
+  PINFO("no match");
+  LEAVE(" ");
+
+  return NULL; /* we didn't meet our threshold, return NULL for an account */
+}
+
+
+/* Updates the imap for a given account using a list of tokens */
+void gnc_imap_add_account_bayes(GncImportMatchMap *imap, GList *tokens, Account *acc)
+{
+  GList *current_token;
+  kvp_value *value;
+  gint64 token_count;
+  char* account_fullname;
+  kvp_value *new_value; /* the value that will be added back into the kvp tree */
+
+  ENTER(" ");
+
+  /* if imap is null return */
+  if(!imap)
+  {
+    LEAVE(" ");
+    return;
+  }
+
+  account_fullname = xaccAccountGetFullName(acc, gnc_get_account_separator());
+
+  PINFO("account name: '%s'\n", account_fullname);
+
+  /* process each token in the list */
+  for(current_token = g_list_first(tokens); current_token; current_token = current_token->next)
+    {
+      token_count = 0; /* start off with no tokens for this account */
+
+      PINFO("adding token '%s'\n", (char*)current_token->data);
+
+      /* is this token/account_name already in the kvp tree? */
+      value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES, (char*)current_token->data, account_fullname, NULL);
+
+      /* if the token/account is already in the tree, read the current value from the tree and use this for the basis of the value we are putting back */
+      if(value)
+	{
+	  PINFO("found existing value of '%ld'\n", (long)kvp_value_get_gint64(value));
+
+	  /* convert this value back into an integer */
+	  token_count+=kvp_value_get_gint64(value);
+	}
+
+        token_count++; /* add one to the existing token count */
+
+        /* create a new value */
+        new_value = kvp_value_new_gint64(token_count);
+
+        /* insert the value into the kvp tree at */
+        /* /imap->frame/IMAP_FRAME/token_string/account_name_string */
+        kvp_frame_set_slot_path(imap->frame, new_value, IMAP_FRAME_BAYES, (char*)current_token->data, account_fullname, NULL);
+
+        /* kvp_frame_set_slot_path() copied the value so we need to delete this one ;-) */
+	kvp_value_delete(new_value);
+    }
+
+  g_free(account_fullname); /* free up the account fullname string */
+ 
+  LEAVE(" ");
 }
 
 /** @} */
Index: import-export/import-match-map.h
===================================================================
RCS file: /home/cvs/cvsroot/gnucash/src/import-export/import-match-map.h,v
retrieving revision 1.2
diff -u -r1.2 import-match-map.h
--- import-export/import-match-map.h	7 Dec 2002 08:13:42 -0000	1.2
+++ import-export/import-match-map.h	4 Mar 2003 02:12:54 -0000
@@ -48,14 +48,23 @@
 void gnc_imap_clear (GncImportMatchMap *imap);
 
 /** Look up an Account in the map */
-Account * gnc_imap_find_account (GncImportMatchMap *imap, const char *category,
-				 const char *key);
+Account* gnc_imap_find_account(GncImportMatchMap *imap, const char* category, const char *key);
 
 /** Store an Account in the map. This mapping is immediatly stored in
   the underlying kvp frame, regardless of whether the MatchMap is
   destroyed later or not. */
 void gnc_imap_add_account (GncImportMatchMap *imap, const char *category,
 			   const char *key, Account *acc);
+
+/** Look up an Account in the map from a GList* of pointers to strings(tokens)
+  from the current transaction */
+Account* gnc_imap_find_account_bayes(GncImportMatchMap *imap, GList* tokens);
+
+/** Store an Account in the map. This mapping is immediatly stored in
+  the underlying kvp frame, regardless of whether the MatchMap is
+  destroyed later or not. */
+void gnc_imap_add_account_bayes (GncImportMatchMap *imap, GList* tokens,
+			   Account *acc);
 
 
 /** @name Some well-known categories
_______________________________________________
gnucash-devel mailing list
[EMAIL PROTECTED]
http://www.gnucash.org/cgi-bin/mailman/listinfo/gnucash-devel

Reply via email to