|
GnuCash 2.4.99
|
00001 /********************************************************************\ 00002 * This program is free software; you can redistribute it and/or * 00003 * modify it under the terms of the GNU General Public License as * 00004 * published by the Free Software Foundation; either version 2 of * 00005 * the License, or (at your option) any later version. * 00006 * * 00007 * This program is distributed in the hope that it will be useful, * 00008 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 00009 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 00010 * GNU General Public License for more details. * 00011 * * 00012 * You should have received a copy of the GNU General Public License* 00013 * along with this program; if not, contact: * 00014 * * 00015 * Free Software Foundation Voice: +1-617-542-5942 * 00016 * 51 Franklin Street, Fifth Floor Fax: +1-617-542-2652 * 00017 * Boston, MA 02110-1301, USA gnu@gnu.org * 00018 \********************************************************************/ 00030 #include "config.h" 00031 #include <string.h> 00032 #include <glib.h> 00033 #include "import-match-map.h" 00034 #include "gnc-ui-util.h" 00035 #include "gnc-engine.h" 00036 00037 /********************************************************************\ 00038 * Constants * 00039 \********************************************************************/ 00040 00041 static QofLogModule log_module = GNC_MOD_IMPORT; 00042 00043 00044 struct _GncImportMatchMap 00045 { 00046 kvp_frame * frame; 00047 Account * acc; 00048 QofBook * book; 00049 }; 00050 00051 #define IMAP_FRAME "import-map" 00052 #define IMAP_FRAME_BAYES "import-map-bayes" 00053 00054 static GncImportMatchMap * 00055 gnc_imap_create_from_frame (kvp_frame *frame, Account *acc, QofBook *book) 00056 { 00057 GncImportMatchMap *imap; 00058 00059 g_return_val_if_fail (frame != NULL, NULL); 00060 g_return_val_if_fail ((acc && !book) || (!acc && book), NULL); 00061 00062 imap = g_new0(GncImportMatchMap, 1); 00063 imap->frame = frame; 00064 00065 /* Cache the book for easy lookups; store the account/book for 00066 * marking dirtiness 00067 */ 00068 if (acc) 00069 book = gnc_account_get_book (acc); 00070 imap->acc = acc; 00071 imap->book = book; 00072 00073 return imap; 00074 } 00075 00077 GncImportMatchMap * gnc_imap_create_from_account (Account *acc) 00078 { 00079 kvp_frame * frame; 00080 00081 if (!acc) return NULL; 00082 frame = xaccAccountGetSlots (acc); 00083 g_return_val_if_fail (frame != NULL, NULL); 00084 00085 return gnc_imap_create_from_frame (frame, acc, NULL); 00086 } 00087 00088 GncImportMatchMap * gnc_imap_create_from_book (QofBook *book) 00089 { 00090 kvp_frame * frame; 00091 00092 if (!book) return NULL; 00093 frame = qof_book_get_slots (book); 00094 g_return_val_if_fail (frame != NULL, NULL); 00095 00096 return gnc_imap_create_from_frame (frame, NULL, book); 00097 } 00098 00100 void gnc_imap_destroy (GncImportMatchMap *imap) 00101 { 00102 if (!imap) return; 00103 g_free (imap); 00104 } 00105 00107 void gnc_imap_clear (GncImportMatchMap *imap) 00108 { 00109 if (!imap) return; 00110 00111 /* Clear the IMAP_FRAME kvp */ 00112 kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME); 00113 00114 /* Clear the bayes kvp, IMAP_FRAME_BAYES */ 00115 kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME_BAYES); 00116 00117 /* XXX: mark the account (or book) as dirty! */ 00118 } 00119 00121 Account * gnc_imap_find_account (GncImportMatchMap *imap, const char *category, 00122 const char *key) 00123 { 00124 kvp_value *value; 00125 GncGUID * guid; 00126 00127 if (!imap || !key) return NULL; 00128 if (!category) 00129 { 00130 category = key; 00131 key = NULL; 00132 } 00133 00134 value = kvp_frame_get_slot_path (imap->frame, IMAP_FRAME, category, key, NULL); 00135 if (!value) return NULL; 00136 00137 guid = kvp_value_get_guid (value); 00138 return xaccAccountLookup (guid, imap->book); 00139 } 00140 00142 void gnc_imap_add_account (GncImportMatchMap *imap, const char *category, 00143 const char *key, Account *acc) 00144 { 00145 kvp_value *value; 00146 00147 if (!imap || !key || !acc || (strlen (key) == 0)) return; 00148 if (!category) 00149 { 00150 category = key; 00151 key = NULL; 00152 } 00153 g_return_if_fail (acc != NULL); 00154 00155 value = kvp_value_new_guid (xaccAccountGetGUID (acc)); 00156 g_return_if_fail (value != NULL); 00157 xaccAccountBeginEdit (imap->acc); 00158 kvp_frame_set_slot_path (imap->frame, value, IMAP_FRAME, category, key, NULL); 00159 qof_instance_set_dirty (QOF_INSTANCE (imap->acc)); 00160 xaccAccountCommitEdit (imap->acc); 00161 kvp_value_delete (value); 00162 00163 /* XXX Mark the account (or book) as dirty! */ 00164 } 00165 00166 00167 00168 00169 /*-------------------------------------------------------------------------- 00170 Below here is the bayes transaction to account matching system 00171 --------------------------------------------------------------------------*/ 00172 00173 00174 struct account_token_count 00175 { 00176 char* account_name; 00177 gint64 token_count; 00178 }; 00179 00183 struct token_accounts_info 00184 { 00185 GList *accounts; 00186 gint64 total_count; 00187 }; 00188 00192 static void buildTokenInfo(const char *key, kvp_value *value, gpointer data) 00193 { 00194 struct token_accounts_info *tokenInfo = (struct token_accounts_info*)data; 00195 struct account_token_count* this_account; 00196 00197 // PINFO("buildTokenInfo: account '%s', token_count: '%ld'\n", (char*)key, 00198 // (long)kvp_value_get_gint64(value)); 00199 00200 /* add the count to the total_count */ 00201 tokenInfo->total_count += kvp_value_get_gint64(value); 00202 00203 /* allocate a new structure for this account and it's token count */ 00204 this_account = (struct account_token_count*) 00205 g_new0(struct account_token_count, 1); 00206 00207 /* fill in the account name and number of tokens found for this account name */ 00208 this_account->account_name = (char*)key; 00209 this_account->token_count = kvp_value_get_gint64(value); 00210 00211 /* append onto the glist a pointer to the new account_token_count structure */ 00212 tokenInfo->accounts = g_list_prepend(tokenInfo->accounts, this_account); 00213 } 00214 00219 struct account_probability 00220 { 00221 double product; /* product of probabilities */ 00222 double product_difference; /* product of (1-probabilities) */ 00223 }; 00224 00229 #define PROBABILITY_FACTOR 100000 00230 static void buildProbabilities(gpointer key, gpointer value, gpointer data) 00231 { 00232 GHashTable *final_probabilities = (GHashTable*)data; 00233 struct account_probability *account_p = (struct account_probability*)value; 00234 00235 /* P(AB) = A*B / [A*B + (1-A)*(1-B)] 00236 * NOTE: so we only keep track of a running product(A*B*C...) 00237 * and product difference ((1-A)(1-B)...) 00238 */ 00239 gint32 probability = 00240 (account_p->product / 00241 (account_p->product + account_p->product_difference)) 00242 * PROBABILITY_FACTOR; 00243 00244 PINFO("P('%s') = '%d'\n", (char*)key, probability); 00245 00246 g_hash_table_insert(final_probabilities, key, GINT_TO_POINTER(probability)); 00247 } 00248 00250 static void freeProbabilities(gpointer key, gpointer value, gpointer data) 00251 { 00252 /* free up the struct account_probability that was allocated 00253 * in gnc_imap_find_account_bayes() 00254 */ 00255 g_free(value); 00256 } 00257 00261 struct account_info 00262 { 00263 char* account_name; 00264 gint32 probability; 00265 }; 00266 00273 static void highestProbability(gpointer key, gpointer value, gpointer data) 00274 { 00275 struct account_info *account_i = (struct account_info*)data; 00276 00277 /* if the current probability is greater than the stored, store the current */ 00278 if (GPOINTER_TO_INT(value) > account_i->probability) 00279 { 00280 /* Save the new highest probability and the assoaciated account name */ 00281 account_i->probability = GPOINTER_TO_INT(value); 00282 account_i->account_name = key; 00283 } 00284 } 00285 00286 00287 #define threshold (.90 * PROBABILITY_FACTOR) /* 90% */ 00288 00290 Account* gnc_imap_find_account_bayes(GncImportMatchMap *imap, GList *tokens) 00291 { 00292 struct token_accounts_info tokenInfo; 00294 GList *current_token; 00296 GList *current_account_token; 00298 struct account_token_count *account_c; 00301 struct account_probability *account_p; 00304 GHashTable *running_probabilities = g_hash_table_new(g_str_hash, g_str_equal); 00305 GHashTable *final_probabilities = g_hash_table_new(g_str_hash, g_str_equal); 00306 struct account_info account_i; 00307 kvp_value* value; 00308 kvp_frame* token_frame; 00309 00310 ENTER(" "); 00311 00312 /* check to see if the imap is NULL */ 00313 if (!imap) 00314 { 00315 PINFO("imap is null, returning null"); 00316 LEAVE(" "); 00317 return NULL; 00318 } 00319 00320 /* find the probability for each account that contains any of the tokens 00321 * in the input tokens list 00322 */ 00323 for (current_token = tokens; current_token; current_token = current_token->next) 00324 { 00325 /* zero out the token_accounts_info structure */ 00326 memset(&tokenInfo, 0, sizeof(struct token_accounts_info)); 00327 00328 PINFO("token: '%s'", (char*)current_token->data); 00329 00330 /* find the slot for the given token off of the source account 00331 * for these tokens, search off of the IMAP_FRAME_BAYES path so 00332 * we aren't looking from the parent of the entire kvp tree 00333 */ 00334 value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES, 00335 (char*)current_token->data, NULL); 00336 00337 /* if value is null we should skip over this token */ 00338 if (!value) 00339 continue; 00340 00341 /* convert the slot(value) into a the frame that contains the 00342 * list of accounts 00343 */ 00344 token_frame = kvp_value_get_frame(value); 00345 00346 /* token_frame should NEVER be null */ 00347 if (!token_frame) 00348 { 00349 PERR("token '%s' has no accounts", (char*)current_token->data); 00350 continue; /* skip over this token */ 00351 } 00352 00353 /* process the accounts for this token, adding the account if it 00354 * doesn't already exist or adding to the existing accounts token 00355 * count if it does 00356 */ 00357 kvp_frame_for_each_slot(token_frame, buildTokenInfo, &tokenInfo); 00358 00359 /* for each account we have just found, see if the account already exists 00360 * in the list of account probabilities, if not add it 00361 */ 00362 for (current_account_token = tokenInfo.accounts; current_account_token; 00363 current_account_token = current_account_token->next) 00364 { 00365 /* get the account name and corresponding token count */ 00366 account_c = (struct account_token_count*)current_account_token->data; 00367 00368 PINFO("account_c->account_name('%s'), " 00369 "account_c->token_count('%ld')/total_count('%ld')", 00370 account_c->account_name, (long)account_c->token_count, 00371 (long)tokenInfo.total_count); 00372 00373 account_p = g_hash_table_lookup(running_probabilities, 00374 account_c->account_name); 00375 00376 /* if the account exists in the list then continue 00377 * the running probablities 00378 */ 00379 if (account_p) 00380 { 00381 account_p->product = 00382 ((double)account_c->token_count / (double)tokenInfo.total_count) 00383 * account_p->product; 00384 account_p->product_difference = 00385 ((double)1 - ((double)account_c->token_count / 00386 (double)tokenInfo.total_count)) 00387 * account_p->product_difference; 00388 PINFO("product == %f, product_difference == %f", 00389 account_p->product, account_p->product_difference); 00390 } 00391 else 00392 { 00393 /* add a new entry */ 00394 PINFO("adding a new entry for this account"); 00395 account_p = (struct account_probability*) 00396 g_new0(struct account_probability, 1); 00397 00398 /* set the product and product difference values */ 00399 account_p->product = ((double)account_c->token_count / 00400 (double)tokenInfo.total_count); 00401 account_p->product_difference = 00402 (double)1 - ((double)account_c->token_count / 00403 (double)tokenInfo.total_count); 00404 00405 PINFO("product == %f, product_difference == %f", 00406 account_p->product, account_p->product_difference); 00407 00408 /* add the account name and (struct account_probability*) 00409 * to the hash table */ 00410 g_hash_table_insert(running_probabilities, 00411 account_c->account_name, account_p); 00412 } 00413 } /* for all accounts in tokenInfo */ 00414 00415 /* free the data in tokenInfo */ 00416 for (current_account_token = tokenInfo.accounts; current_account_token; 00417 current_account_token = current_account_token->next) 00418 { 00419 /* free up each struct account_token_count we allocated */ 00420 g_free((struct account_token_count*)current_account_token->data); 00421 } 00422 00423 g_list_free(tokenInfo.accounts); /* free the accounts GList */ 00424 } 00425 00426 /* build a hash table of account names and their final probabilities 00427 * from each entry in the running_probabilties hash table 00428 */ 00429 g_hash_table_foreach(running_probabilities, buildProbabilities, 00430 final_probabilities); 00431 00432 /* find the highest probabilty and the corresponding account */ 00433 memset(&account_i, 0, sizeof(struct account_info)); 00434 g_hash_table_foreach(final_probabilities, highestProbability, &account_i); 00435 00436 /* free each element of the running_probabilities hash */ 00437 g_hash_table_foreach(running_probabilities, freeProbabilities, NULL); 00438 00439 /* free the hash tables */ 00440 g_hash_table_destroy(running_probabilities); 00441 g_hash_table_destroy(final_probabilities); 00442 00443 PINFO("highest P('%s') = '%d'", 00444 account_i.account_name ? account_i.account_name : "(null)", 00445 account_i.probability); 00446 00447 /* has this probability met our threshold? */ 00448 if (account_i.probability >= threshold) 00449 { 00450 PINFO("found match"); 00451 LEAVE(" "); 00452 return gnc_account_lookup_by_full_name(gnc_book_get_root_account(imap->book), 00453 account_i.account_name); 00454 } 00455 00456 PINFO("no match"); 00457 LEAVE(" "); 00458 00459 return NULL; /* we didn't meet our threshold, return NULL for an account */ 00460 } 00461 00462 00464 void gnc_imap_add_account_bayes(GncImportMatchMap *imap, GList *tokens, Account *acc) 00465 { 00466 GList *current_token; 00467 kvp_value *value; 00468 gint64 token_count; 00469 char* account_fullname; 00470 kvp_value *new_value; /* the value that will be added back into the kvp tree */ 00471 00472 ENTER(" "); 00473 00474 /* if imap is null return */ 00475 if (!imap) 00476 { 00477 LEAVE(" "); 00478 return; 00479 } 00480 00481 g_return_if_fail (acc != NULL); 00482 account_fullname = gnc_account_get_full_name(acc); 00483 xaccAccountBeginEdit (imap->acc); 00484 00485 PINFO("account name: '%s'\n", account_fullname); 00486 00487 /* process each token in the list */ 00488 for (current_token = g_list_first(tokens); current_token; 00489 current_token = current_token->next) 00490 { 00491 /* Jump to next iteration if the pointer is not valid or if the 00492 string is empty. In HBCI import we almost always get an empty 00493 string, which doesn't work in the kvp loopkup later. So we 00494 skip this case here. */ 00495 if (!current_token->data || (*((char*)current_token->data) == '\0')) 00496 continue; 00497 00498 /* start off with no tokens for this account */ 00499 token_count = 0; 00500 00501 PINFO("adding token '%s'\n", (char*)current_token->data); 00502 00503 /* is this token/account_name already in the kvp tree? */ 00504 value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES, 00505 (char*)current_token->data, account_fullname, 00506 NULL); 00507 00508 /* if the token/account is already in the tree, read the current 00509 * value from the tree and use this for the basis of the value we 00510 * are putting back 00511 */ 00512 if (value) 00513 { 00514 PINFO("found existing value of '%ld'\n", 00515 (long)kvp_value_get_gint64(value)); 00516 00517 /* convert this value back into an integer */ 00518 token_count += kvp_value_get_gint64(value); 00519 } 00520 00521 /* increment the token count */ 00522 token_count++; 00523 00524 /* create a new value */ 00525 new_value = kvp_value_new_gint64(token_count); 00526 00527 /* insert the value into the kvp tree at 00528 * /imap->frame/IMAP_FRAME/token_string/account_name_string 00529 */ 00530 kvp_frame_set_slot_path(imap->frame, new_value, IMAP_FRAME_BAYES, 00531 (char*)current_token->data, account_fullname, NULL); 00532 /* kvp_frame_set_slot_path() copied the value so we 00533 * need to delete this one ;-) */ 00534 kvp_value_delete(new_value); 00535 } 00536 00537 /* free up the account fullname string */ 00538 qof_instance_set_dirty (QOF_INSTANCE (imap->acc)); 00539 xaccAccountCommitEdit (imap->acc); 00540 g_free(account_fullname); 00541 00542 LEAVE(" "); 00543 } 00544
1.7.4