diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-01-24 20:56:00 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-01-24 20:57:09 +0000 |
commit | f74c2585fdd51f8f759701a48be8a449f889fff7 (patch) | |
tree | e38eb8ae7a327d32a9cc4c9cf1905e71d4d01c6b /src | |
parent | 54b527ee1e2d43dd586d53bea32ec1de91af773b (diff) | |
download | rspamd-f74c2585fdd51f8f759701a48be8a449f889fff7.tar.gz rspamd-f74c2585fdd51f8f759701a48be8a449f889fff7.zip |
[Rework] Use frequencies distribution in language detector
Diffstat (limited to 'src')
-rw-r--r-- | src/libmime/lang_detection.c | 206 |
1 files changed, 108 insertions, 98 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 2bdda3004..ed2aa1681 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -769,18 +769,70 @@ rspamd_language_detector_try_ngramm (struct rspamd_task *task, return rs_detect_multiple; } +struct rspamd_frequency_sort_cbdata { + struct rspamd_lang_detector *d; + gdouble std; + gdouble mean; +}; + +static gint +rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b, + gpointer ud) +{ + struct rspamd_frequency_sort_cbdata *cbd = ud; + const struct rspamd_lang_detector_res + *canda = *(const struct rspamd_lang_detector_res **)a, + *candb = *(const struct rspamd_lang_detector_res **)b; + gdouble diff; + + diff = fabs (canda->prob - candb->prob); + + if (diff > cbd->std) { + /* Generic case */ + if (canda->prob > candb->prob) { + return -1; + } else if (candb->prob > canda->prob) { + return 1; + } + + return 0; + } + else { + gdouble proba_adjusted, probb_adjusted, freqa, freqb; + + freqa = ((gdouble)canda->elt->occurencies) / + (gdouble)cbd->d->total_occurencies; + freqb = ((gdouble)candb->elt->occurencies) / + (gdouble)cbd->d->total_occurencies; + + proba_adjusted = canda->prob * freqa; + probb_adjusted = candb->prob * freqb; + + if (proba_adjusted > probb_adjusted) { + return -1; + } else if (probb_adjusted > proba_adjusted) { + return 1; + } + + return 0; + } +} + GPtrArray * rspamd_language_detector_detect (struct rspamd_task *task, struct rspamd_lang_detector *d, GArray *ucs_tokens, gsize words_len) { - GHashTable *candidates, *tcandidates; + GHashTable *candidates; GPtrArray *result; GHashTableIter it; gpointer k, v; gdouble mean, std; struct rspamd_lang_detector_res *cand; enum rspamd_language_detected_type r; + struct rspamd_frequency_sort_cbdata cbd; + /* Check if we have sorted candidates based on frequency */ + gboolean frequency_heuristic_applied = FALSE; if (ucs_tokens->len == 0) { return g_ptr_array_new (); @@ -789,110 +841,61 @@ rspamd_language_detector_detect (struct rspamd_task *task, candidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, NULL, g_free); - if (words_len < d->short_text_limit) { - /* For short text, start directly from trigramms */ - msg_debug_lang_det ("text is less than %z words: %z, start with trigramms", - d->short_text_limit, words_len); + msg_debug_lang_det ("text is less than %z words: %z, start with trigramms", + d->short_text_limit, words_len); + r = rspamd_language_detector_try_ngramm (task, default_words, d, + ucs_tokens, rs_trigramm, + candidates); + + if (r == rs_detect_none) { + msg_debug_lang_det ("short mode; no trigramms found, switch to bigramms"); r = rspamd_language_detector_try_ngramm (task, default_words, d, - ucs_tokens, rs_trigramm, + ucs_tokens, rs_bigramm, candidates); if (r == rs_detect_none) { - msg_debug_lang_det ("short mode; no trigramms found, switch to bigramms"); - r = rspamd_language_detector_try_ngramm (task, default_words, d, - ucs_tokens, rs_bigramm, + msg_debug_lang_det ("short mode; no trigramms found, " + "switch to unigramms"); + r = rspamd_language_detector_try_ngramm (task, default_words, + d, ucs_tokens, rs_unigramm, candidates); - - if (r == rs_detect_none) { - msg_debug_lang_det ("short mode; no trigramms found, " - "switch to unigramms"); - r = rspamd_language_detector_try_ngramm (task, default_words, - d, ucs_tokens, rs_unigramm, - candidates); - } } } - else { - /* Start with unigramms */ - r = rspamd_language_detector_try_ngramm (task, default_words, - d, ucs_tokens, rs_unigramm, - candidates); + else if (r == rs_detect_multiple) { + /* Check our guess */ + msg_debug_lang_det ("unigramms pass finished, found %d candidates", + (gint)g_hash_table_size (candidates)); + mean = 0.0; + std = 0.0; + g_hash_table_iter_init (&it, candidates); + + /* Check distirbution */ + while (g_hash_table_iter_next (&it, &k, &v)) { + cand = (struct rspamd_lang_detector_res *) v; + mean += cand->prob; + } - switch (r) { - case rs_detect_none: - case rs_detect_single: - msg_debug_lang_det ("no unigramms found, try bigramms"); - break; - case rs_detect_multiple: - /* Try to improve guess */ - msg_debug_lang_det ("unigramms pass finished, found %d candidates", - (gint)g_hash_table_size (candidates)); - tcandidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, - NULL, g_free); - r = rspamd_language_detector_try_ngramm (task, default_words, - d, ucs_tokens, rs_trigramm, - tcandidates); + mean /= g_hash_table_size (candidates); - switch (r) { - case rs_detect_none: - /* Revert to unigramms result */ - g_hash_table_unref (tcandidates); - break; - case rs_detect_single: - /* We have good enough result, return it */ - g_hash_table_unref (candidates); - candidates = tcandidates; - break; - case rs_detect_multiple: - mean = 0.0; - std = 0.0; - g_hash_table_iter_init (&it, tcandidates); - - /* Check distirbution */ - while (g_hash_table_iter_next (&it, &k, &v)) { - cand = (struct rspamd_lang_detector_res *) v; - mean += cand->prob; - } - - mean /= g_hash_table_size (tcandidates); - - g_hash_table_iter_init (&it, tcandidates); - while (g_hash_table_iter_next (&it, &k, &v)) { - gdouble err; - cand = (struct rspamd_lang_detector_res *) v; - err = cand->prob - mean; - std += fabs (err); - } - - std /= g_hash_table_size (tcandidates); - g_hash_table_unref (candidates); - candidates = tcandidates; - - msg_debug_lang_det ("trigramms checked, %.3f mean, %.4f stddev", - mean, std); - - if (std / fabs (mean) < 0.01) { - /* Try trigramms */ - tcandidates = g_hash_table_new_full (rspamd_str_hash, - rspamd_str_equal, - NULL, g_free); - - r = rspamd_language_detector_try_ngramm (task, - default_words * 2, - d, - ucs_tokens, - rs_trigramm, - tcandidates); - - if (r != rs_detect_none) { - /* TODO: check if we have better distribution here */ - g_hash_table_unref (candidates); - candidates = tcandidates; - } - } - break; - } - break; + g_hash_table_iter_init (&it, candidates); + while (g_hash_table_iter_next (&it, &k, &v)) { + gdouble err; + cand = (struct rspamd_lang_detector_res *) v; + err = cand->prob - mean; + std += fabs (err); + } + + std /= g_hash_table_size (candidates); + + msg_debug_lang_det ("trigramms checked, %.3f mean, %.4f stddev", + mean, std); + + if (std / fabs (mean) < 0.1) { + msg_debug_lang_det ("apply frequency heuristic sorting"); + frequency_heuristic_applied = TRUE; + cbd.d = d; + cbd.mean = mean; + cbd.std = std; } } @@ -907,10 +910,17 @@ rspamd_language_detector_detect (struct rspamd_task *task, g_hash_table_iter_steal (&it); } - g_ptr_array_sort (result, rspamd_language_detector_cmp); + if (frequency_heuristic_applied) { + g_ptr_array_sort_with_data (result, + rspamd_language_detector_cmp_heuristic, (gpointer)&cbd); + } + else { + g_ptr_array_sort (result, rspamd_language_detector_cmp); + } g_hash_table_unref (candidates); - if (result->len > 0) { + + if (result->len > 0 && !frequency_heuristic_applied) { cand = g_ptr_array_index (result, 0); cand->elt->occurencies ++; d->total_occurencies ++; |