diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-01-13 17:41:57 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-01-13 17:41:57 +0000 |
commit | b72c8f94ccbbe8362b38a4a9f35823367ad21a9c (patch) | |
tree | c8c79328cf10c669021b9a3df761e3bd9c5d36ab /src/libmime/lang_detection.c | |
parent | 88950e4e4f563caba44a14a40b1180be27b772d0 (diff) | |
download | rspamd-b72c8f94ccbbe8362b38a4a9f35823367ad21a9c.tar.gz rspamd-b72c8f94ccbbe8362b38a4a9f35823367ad21a9c.zip |
[Project] Preliminary version of ngramms based language detector
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r-- | src/libmime/lang_detection.c | 43 |
1 files changed, 24 insertions, 19 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index fb9af6df7..66901e6b9 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -263,7 +263,7 @@ rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d, } static void -rspamd_language_detector_random_select (GPtrArray *ucs_tokens, guint nwords, +rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords, goffset *offsets_out) { guint step_len, remainder, i, out_idx; @@ -362,7 +362,7 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window, } } else { - if (tok->len >= cur_off) { + if (tok->len <= cur_off) { return -1; } @@ -406,17 +406,21 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d, freq = ((gdouble)GPOINTER_TO_UINT ( g_hash_table_lookup (ngramms, window))) / class_freq; - cand = g_hash_table_lookup (candidates, elt->name); - if (cand == NULL) { - cand = g_malloc (sizeof (*cand)); - cand->elt = elt; - cand->lang = elt->name; - cand->prob = freq; - } - else { - /* Update guess */ - cand->prob += freq; + if (freq > 0) { + cand = g_hash_table_lookup (candidates, elt->name); + + if (cand == NULL) { + cand = g_malloc (sizeof (*cand)); + cand->elt = elt; + cand->lang = elt->name; + cand->prob = freq; + + g_hash_table_insert (candidates, (gpointer)elt->name, cand); + } else { + /* Update guess */ + cand->prob += freq; + } } } } @@ -583,7 +587,7 @@ rspamd_language_detector_filter_negligible (GHashTable *candidates) static void rspamd_language_detector_detect_type (struct rspamd_lang_detector *d, - GPtrArray *ucs_tokens, + GArray *ucs_tokens, GHashTable *candidates, enum rspamd_language_gramm_type type, gboolean start_over) @@ -597,7 +601,7 @@ rspamd_language_detector_detect_type (struct rspamd_lang_detector *d, rspamd_language_detector_random_select (ucs_tokens, nparts, selected_words); /* Deal with the first word in a special case */ - tok = g_ptr_array_index (ucs_tokens, selected_words[0]); + tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, selected_words[0]); if (start_over) { rspamd_language_detector_detect_word (d, tok, candidates, type); @@ -607,7 +611,7 @@ rspamd_language_detector_detect_type (struct rspamd_lang_detector *d, } for (i = 1; i < nparts; i ++) { - tok = g_ptr_array_index (ucs_tokens, selected_words[i]); + tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, selected_words[i]); rspamd_language_detector_update_guess (d, tok, candidates, type); } @@ -620,13 +624,13 @@ rspamd_language_detector_cmp (gconstpointer a, gconstpointer b) { const struct rspamd_lang_detector_res *canda = *(const struct rspamd_lang_detector_res **)a, - *candb = *(const struct rspamd_lang_detector_res **)a; + *candb = *(const struct rspamd_lang_detector_res **)b; if (canda->prob > candb->prob) { - return 1; + return -1; } else if (candb->prob > canda->prob) { - return -1; + return 1; } return 0; @@ -634,7 +638,7 @@ rspamd_language_detector_cmp (gconstpointer a, gconstpointer b) GPtrArray * rspamd_language_detector_detect (struct rspamd_lang_detector *d, - GPtrArray *ucs_tokens, gsize words_len) + GArray *ucs_tokens, gsize words_len) { GHashTable *candidates; GPtrArray *result; @@ -690,6 +694,7 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d, while (g_hash_table_iter_next (&it, &k, &v)) { cand = (struct rspamd_lang_detector_res *) v; + msg_err ("%s -> %.2f", cand->lang, cand->prob); g_ptr_array_add (result, cand); g_hash_table_iter_steal (&it); } |