GnuCash 2.4.99
import-match-map.c
00001 /********************************************************************\
00002  * This program is free software; you can redistribute it and/or    *
00003  * modify it under the terms of the GNU General Public License as   *
00004  * published by the Free Software Foundation; either version 2 of   *
00005  * the License, or (at your option) any later version.              *
00006  *                                                                  *
00007  * This program is distributed in the hope that it will be useful,  *
00008  * but WITHOUT ANY WARRANTY; without even the implied warranty of   *
00009  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    *
00010  * GNU General Public License for more details.                     *
00011  *                                                                  *
00012  * You should have received a copy of the GNU General Public License*
00013  * along with this program; if not, contact:                        *
00014  *                                                                  *
00015  * Free Software Foundation           Voice:  +1-617-542-5942       *
00016  * 51 Franklin Street, Fifth Floor    Fax:    +1-617-542-2652       *
00017  * Boston, MA  02110-1301,  USA       gnu@gnu.org                   *
00018 \********************************************************************/
00030 #include "config.h"
00031 #include <string.h>
00032 #include <glib.h>
00033 #include "import-match-map.h"
00034 #include "gnc-ui-util.h"
00035 #include "gnc-engine.h"
00036 
00037 /********************************************************************\
00038  *   Constants   *
00039 \********************************************************************/
00040 
00041 static QofLogModule log_module = GNC_MOD_IMPORT;
00042 
00043 
00044 struct _GncImportMatchMap
00045 {
00046     kvp_frame * frame;
00047     Account *   acc;
00048     QofBook *   book;
00049 };
00050 
/* Top-level kvp keys below which the two kinds of match data live. */

/* category/key -> account GUID (see gnc_imap_add_account) */
#define IMAP_FRAME              "import-map"
/* token/account full name -> occurrence count (see gnc_imap_add_account_bayes) */
#define IMAP_FRAME_BAYES        "import-map-bayes"
00053 
00054 static GncImportMatchMap *
00055 gnc_imap_create_from_frame (kvp_frame *frame, Account *acc, QofBook *book)
00056 {
00057     GncImportMatchMap *imap;
00058 
00059     g_return_val_if_fail (frame != NULL, NULL);
00060     g_return_val_if_fail ((acc && !book) || (!acc && book), NULL);
00061 
00062     imap = g_new0(GncImportMatchMap, 1);
00063     imap->frame = frame;
00064 
00065     /* Cache the book for easy lookups; store the account/book for
00066      * marking dirtiness
00067      */
00068     if (acc)
00069         book = gnc_account_get_book (acc);
00070     imap->acc = acc;
00071     imap->book = book;
00072 
00073     return imap;
00074 }
00075 
00077 GncImportMatchMap * gnc_imap_create_from_account (Account *acc)
00078 {
00079     kvp_frame * frame;
00080 
00081     if (!acc) return NULL;
00082     frame = xaccAccountGetSlots (acc);
00083     g_return_val_if_fail (frame != NULL, NULL);
00084 
00085     return gnc_imap_create_from_frame (frame, acc, NULL);
00086 }
00087 
00088 GncImportMatchMap * gnc_imap_create_from_book (QofBook *book)
00089 {
00090     kvp_frame * frame;
00091 
00092     if (!book) return NULL;
00093     frame = qof_book_get_slots (book);
00094     g_return_val_if_fail (frame != NULL, NULL);
00095 
00096     return gnc_imap_create_from_frame (frame, NULL, book);
00097 }
00098 
00100 void gnc_imap_destroy (GncImportMatchMap *imap)
00101 {
00102     if (!imap) return;
00103     g_free (imap);
00104 }
00105 
00107 void gnc_imap_clear (GncImportMatchMap *imap)
00108 {
00109     if (!imap) return;
00110 
00111     /* Clear the IMAP_FRAME kvp */
00112     kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME);
00113 
00114     /* Clear the bayes kvp, IMAP_FRAME_BAYES */
00115     kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME_BAYES);
00116 
00117     /* XXX: mark the account (or book) as dirty! */
00118 }
00119 
00121 Account * gnc_imap_find_account (GncImportMatchMap *imap, const char *category,
00122                                  const char *key)
00123 {
00124     kvp_value *value;
00125     GncGUID * guid;
00126 
00127     if (!imap || !key) return NULL;
00128     if (!category)
00129     {
00130         category = key;
00131         key = NULL;
00132     }
00133 
00134     value = kvp_frame_get_slot_path (imap->frame, IMAP_FRAME, category, key, NULL);
00135     if (!value) return NULL;
00136 
00137     guid = kvp_value_get_guid (value);
00138     return xaccAccountLookup (guid, imap->book);
00139 }
00140 
00142 void gnc_imap_add_account (GncImportMatchMap *imap, const char *category,
00143                            const char *key, Account *acc)
00144 {
00145     kvp_value *value;
00146 
00147     if (!imap || !key || !acc || (strlen (key) == 0)) return;
00148     if (!category)
00149     {
00150         category = key;
00151         key = NULL;
00152     }
00153     g_return_if_fail (acc != NULL);
00154 
00155     value = kvp_value_new_guid (xaccAccountGetGUID (acc));
00156     g_return_if_fail (value != NULL);
00157     xaccAccountBeginEdit (imap->acc);
00158     kvp_frame_set_slot_path (imap->frame, value, IMAP_FRAME, category, key, NULL);
00159     qof_instance_set_dirty (QOF_INSTANCE (imap->acc));
00160     xaccAccountCommitEdit (imap->acc);
00161     kvp_value_delete (value);
00162 
00163     /* XXX Mark the account (or book) as dirty! */
00164 }
00165 
00166 
00167 
00168 
00169 /*--------------------------------------------------------------------------
00170  Below here is the bayes transaction to account matching system
00171 --------------------------------------------------------------------------*/
00172 
00173 
00174 struct account_token_count
00175 {
00176     char* account_name;
00177     gint64 token_count; 
00178 };
00179 
00183 struct token_accounts_info
00184 {
00185     GList *accounts; 
00186     gint64 total_count;
00187 };
00188 
00192 static void buildTokenInfo(const char *key, kvp_value *value, gpointer data)
00193 {
00194     struct token_accounts_info *tokenInfo = (struct token_accounts_info*)data;
00195     struct account_token_count* this_account;
00196 
00197     //  PINFO("buildTokenInfo: account '%s', token_count: '%ld'\n", (char*)key,
00198     //                  (long)kvp_value_get_gint64(value));
00199 
00200     /* add the count to the total_count */
00201     tokenInfo->total_count += kvp_value_get_gint64(value);
00202 
00203     /* allocate a new structure for this account and it's token count */
00204     this_account = (struct account_token_count*)
00205                    g_new0(struct account_token_count, 1);
00206 
00207     /* fill in the account name and number of tokens found for this account name */
00208     this_account->account_name = (char*)key;
00209     this_account->token_count = kvp_value_get_gint64(value);
00210 
00211     /* append onto the glist a pointer to the new account_token_count structure */
00212     tokenInfo->accounts = g_list_prepend(tokenInfo->accounts, this_account);
00213 }
00214 
/* Running products kept per account while combining the probabilities
 * of the individual tokens. */
struct account_probability
{
    /* product of probabilities: A * B * C ... */
    double product;
    /* product of (1 - probability): (1-A) * (1-B) * (1-C) ... */
    double product_difference;
};
00224 
00229 #define PROBABILITY_FACTOR 100000
00230 static void buildProbabilities(gpointer key, gpointer value, gpointer data)
00231 {
00232     GHashTable *final_probabilities = (GHashTable*)data;
00233     struct account_probability *account_p = (struct account_probability*)value;
00234 
00235     /* P(AB) = A*B / [A*B + (1-A)*(1-B)]
00236      * NOTE: so we only keep track of a running product(A*B*C...)
00237      * and product difference ((1-A)(1-B)...)
00238      */
00239     gint32 probability =
00240         (account_p->product /
00241          (account_p->product + account_p->product_difference))
00242         * PROBABILITY_FACTOR;
00243 
00244     PINFO("P('%s') = '%d'\n", (char*)key, probability);
00245 
00246     g_hash_table_insert(final_probabilities, key, GINT_TO_POINTER(probability));
00247 }
00248 
00250 static void freeProbabilities(gpointer key, gpointer value, gpointer data)
00251 {
00252     /* free up the struct account_probability that was allocated
00253      * in gnc_imap_find_account_bayes()
00254      */
00255     g_free(value);
00256 }
00257 
00261 struct account_info
00262 {
00263     char* account_name;
00264     gint32 probability;
00265 };
00266 
00273 static void highestProbability(gpointer key, gpointer value, gpointer data)
00274 {
00275     struct account_info *account_i = (struct account_info*)data;
00276 
00277     /* if the current probability is greater than the stored, store the current */
00278     if (GPOINTER_TO_INT(value) > account_i->probability)
00279     {
00280         /* Save the new highest probability and the assoaciated account name */
00281         account_i->probability = GPOINTER_TO_INT(value);
00282         account_i->account_name = key;
00283     }
00284 }
00285 
00286 
00287 #define threshold (.90 * PROBABILITY_FACTOR) /* 90% */
00288 
00290 Account* gnc_imap_find_account_bayes(GncImportMatchMap *imap, GList *tokens)
00291 {
00292     struct token_accounts_info tokenInfo; 
00294     GList *current_token;                       
00296     GList *current_account_token;               
00298     struct account_token_count *account_c; 
00301     struct account_probability *account_p; 
00304     GHashTable *running_probabilities = g_hash_table_new(g_str_hash, g_str_equal);
00305     GHashTable *final_probabilities = g_hash_table_new(g_str_hash, g_str_equal);
00306     struct account_info account_i;
00307     kvp_value* value;
00308     kvp_frame* token_frame;
00309 
00310     ENTER(" ");
00311 
00312     /* check to see if the imap is NULL */
00313     if (!imap)
00314     {
00315         PINFO("imap is null, returning null");
00316         LEAVE(" ");
00317         return NULL;
00318     }
00319 
00320     /* find the probability for each account that contains any of the tokens
00321      * in the input tokens list
00322      */
00323     for (current_token = tokens; current_token; current_token = current_token->next)
00324     {
00325         /* zero out the token_accounts_info structure */
00326         memset(&tokenInfo, 0, sizeof(struct token_accounts_info));
00327 
00328         PINFO("token: '%s'", (char*)current_token->data);
00329 
00330         /* find the slot for the given token off of the source account
00331          * for these tokens, search off of the IMAP_FRAME_BAYES path so
00332          * we aren't looking from the parent of the entire kvp tree
00333          */
00334         value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES,
00335                                         (char*)current_token->data, NULL);
00336 
00337         /* if value is null we should skip over this token */
00338         if (!value)
00339             continue;
00340 
00341         /* convert the slot(value) into a the frame that contains the
00342          * list of accounts
00343          */
00344         token_frame = kvp_value_get_frame(value);
00345 
00346         /* token_frame should NEVER be null */
00347         if (!token_frame)
00348         {
00349             PERR("token '%s' has no accounts", (char*)current_token->data);
00350             continue; /* skip over this token */
00351         }
00352 
00353         /* process the accounts for this token, adding the account if it
00354          * doesn't already exist or adding to the existing accounts token
00355          * count if it does
00356          */
00357         kvp_frame_for_each_slot(token_frame, buildTokenInfo, &tokenInfo);
00358 
00359         /* for each account we have just found, see if the account already exists
00360          * in the list of account probabilities, if not add it
00361          */
00362         for (current_account_token = tokenInfo.accounts; current_account_token;
00363                 current_account_token = current_account_token->next)
00364         {
00365             /* get the account name and corresponding token count */
00366             account_c = (struct account_token_count*)current_account_token->data;
00367 
00368             PINFO("account_c->account_name('%s'), "
00369                   "account_c->token_count('%ld')/total_count('%ld')",
00370                   account_c->account_name, (long)account_c->token_count,
00371                   (long)tokenInfo.total_count);
00372 
00373             account_p = g_hash_table_lookup(running_probabilities,
00374                                             account_c->account_name);
00375 
00376             /* if the account exists in the list then continue
00377              * the running probablities
00378              */
00379             if (account_p)
00380             {
00381                 account_p->product =
00382                     ((double)account_c->token_count / (double)tokenInfo.total_count)
00383                     * account_p->product;
00384                 account_p->product_difference =
00385                     ((double)1 - ((double)account_c->token_count /
00386                                   (double)tokenInfo.total_count))
00387                     * account_p->product_difference;
00388                 PINFO("product == %f, product_difference == %f",
00389                       account_p->product, account_p->product_difference);
00390             }
00391             else
00392             {
00393                 /* add a new entry */
00394                 PINFO("adding a new entry for this account");
00395                 account_p = (struct account_probability*)
00396                             g_new0(struct account_probability, 1);
00397 
00398                 /* set the product and product difference values */
00399                 account_p->product = ((double)account_c->token_count /
00400                                       (double)tokenInfo.total_count);
00401                 account_p->product_difference =
00402                     (double)1 - ((double)account_c->token_count /
00403                                  (double)tokenInfo.total_count);
00404 
00405                 PINFO("product == %f, product_difference == %f",
00406                       account_p->product, account_p->product_difference);
00407 
00408                 /* add the account name and (struct account_probability*)
00409                  * to the hash table */
00410                 g_hash_table_insert(running_probabilities,
00411                                     account_c->account_name, account_p);
00412             }
00413         } /* for all accounts in tokenInfo */
00414 
00415         /* free the data in tokenInfo */
00416         for (current_account_token = tokenInfo.accounts; current_account_token;
00417                 current_account_token = current_account_token->next)
00418         {
00419             /* free up each struct account_token_count we allocated */
00420             g_free((struct account_token_count*)current_account_token->data);
00421         }
00422 
00423         g_list_free(tokenInfo.accounts); /* free the accounts GList */
00424     }
00425 
00426     /* build a hash table of account names and their final probabilities
00427      * from each entry in the running_probabilties hash table
00428      */
00429     g_hash_table_foreach(running_probabilities, buildProbabilities,
00430                          final_probabilities);
00431 
00432     /* find the highest probabilty and the corresponding account */
00433     memset(&account_i, 0, sizeof(struct account_info));
00434     g_hash_table_foreach(final_probabilities, highestProbability, &account_i);
00435 
00436     /* free each element of the running_probabilities hash */
00437     g_hash_table_foreach(running_probabilities, freeProbabilities, NULL);
00438 
00439     /* free the hash tables */
00440     g_hash_table_destroy(running_probabilities);
00441     g_hash_table_destroy(final_probabilities);
00442 
00443     PINFO("highest P('%s') = '%d'",
00444           account_i.account_name ? account_i.account_name : "(null)",
00445           account_i.probability);
00446 
00447     /* has this probability met our threshold? */
00448     if (account_i.probability >= threshold)
00449     {
00450         PINFO("found match");
00451         LEAVE(" ");
00452         return gnc_account_lookup_by_full_name(gnc_book_get_root_account(imap->book),
00453                                                account_i.account_name);
00454     }
00455 
00456     PINFO("no match");
00457     LEAVE(" ");
00458 
00459     return NULL; /* we didn't meet our threshold, return NULL for an account */
00460 }
00461 
00462 
00464 void gnc_imap_add_account_bayes(GncImportMatchMap *imap, GList *tokens, Account *acc)
00465 {
00466     GList *current_token;
00467     kvp_value *value;
00468     gint64 token_count;
00469     char* account_fullname;
00470     kvp_value *new_value; /* the value that will be added back into the kvp tree */
00471 
00472     ENTER(" ");
00473 
00474     /* if imap is null return */
00475     if (!imap)
00476     {
00477         LEAVE(" ");
00478         return;
00479     }
00480 
00481     g_return_if_fail (acc != NULL);
00482     account_fullname = gnc_account_get_full_name(acc);
00483     xaccAccountBeginEdit (imap->acc);
00484 
00485     PINFO("account name: '%s'\n", account_fullname);
00486 
00487     /* process each token in the list */
00488     for (current_token = g_list_first(tokens); current_token;
00489             current_token = current_token->next)
00490     {
00491         /* Jump to next iteration if the pointer is not valid or if the
00492                  string is empty. In HBCI import we almost always get an empty
00493                  string, which doesn't work in the kvp loopkup later. So we
00494                  skip this case here. */
00495         if (!current_token->data || (*((char*)current_token->data) == '\0'))
00496             continue;
00497 
00498         /* start off with no tokens for this account */
00499         token_count = 0;
00500 
00501         PINFO("adding token '%s'\n", (char*)current_token->data);
00502 
00503         /* is this token/account_name already in the kvp tree? */
00504         value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES,
00505                                         (char*)current_token->data, account_fullname,
00506                                         NULL);
00507 
00508         /* if the token/account is already in the tree, read the current
00509          * value from the tree and use this for the basis of the value we
00510          * are putting back
00511          */
00512         if (value)
00513         {
00514             PINFO("found existing value of '%ld'\n",
00515                   (long)kvp_value_get_gint64(value));
00516 
00517             /* convert this value back into an integer */
00518             token_count += kvp_value_get_gint64(value);
00519         }
00520 
00521         /* increment the token count */
00522         token_count++;
00523 
00524         /* create a new value */
00525         new_value = kvp_value_new_gint64(token_count);
00526 
00527         /* insert the value into the kvp tree at
00528          * /imap->frame/IMAP_FRAME/token_string/account_name_string
00529          */
00530         kvp_frame_set_slot_path(imap->frame, new_value, IMAP_FRAME_BAYES,
00531                                 (char*)current_token->data, account_fullname, NULL);
00532         /* kvp_frame_set_slot_path() copied the value so we
00533          * need to delete this one ;-) */
00534         kvp_value_delete(new_value);
00535     }
00536 
00537     /* free up the account fullname string */
00538     qof_instance_set_dirty (QOF_INSTANCE (imap->acc));
00539     xaccAccountCommitEdit (imap->acc);
00540     g_free(account_fullname);
00541 
00542     LEAVE(" ");
00543 }
00544 
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines