diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-01-13 20:13:18 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-01-13 20:13:18 +0000 |
commit | 374afc0e77fdca6bff25c44ebfe467780f461d87 (patch) | |
tree | f772e35d87624563cf3d3027671e1915aa8a5e17 /src/libmime | |
parent | b72c8f94ccbbe8362b38a4a9f35823367ad21a9c (diff) | |
download | rspamd-374afc0e77fdca6bff25c44ebfe467780f461d87.tar.gz rspamd-374afc0e77fdca6bff25c44ebfe467780f461d87.zip |
[Fix] Various improvements in language detection
Diffstat (limited to 'src/libmime')
-rw-r--r-- | src/libmime/lang_detection.c | 162 | ||||
-rw-r--r-- | src/libmime/message.c | 12 |
2 files changed, 132 insertions, 42 deletions
```diff
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 66901e6b9..ead12b8e8 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -24,7 +24,8 @@
 #include <math.h>
 static const gsize default_short_text_limit = 200;
-static const gsize default_words = 20;
+static const gsize default_words = 30;
+static const gdouble update_prob = 0.6;
 static const gchar *default_languages_path = RSPAMD_PLUGINSDIR "/languages";
 struct rspamd_language_elt {
@@ -503,9 +504,17 @@ rspamd_language_detector_update_guess (struct rspamd_lang_detector *d,
 	/* Split words */
 	while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur)) != -1) {
-		if (!rspamd_language_detector_process_ngramm_update (d, window,
-				type, candidates)) {
-			ret = FALSE;
+
+		if (rspamd_random_double_fast () > update_prob) {
+			if (!rspamd_language_detector_process_ngramm_update (d, window,
+					type, candidates)) {
+				ret = FALSE;
+			}
+		}
+		else {
+			/* Try to do full update in case if we are missing some candidates */
+			rspamd_language_detector_process_ngramm_full (d, window, type,
+					candidates);
 		}
 	}
@@ -576,10 +585,10 @@ rspamd_language_detector_filter_negligible (GHashTable *candidates)
 		cand = (struct rspamd_lang_detector_res *) v;
 		/*
-		 * Probabilities are logarifmic, so if prob1 - prob2 > 4, it means that
+		 * Probabilities are logarithmic, so if prob1 - prob2 > 4, it means that
 		 * prob2 is 2^4 less than prob1
 		 */
-		if (max_prob - cand->prob > 256) {
+		if (max_prob - cand->prob > 4) {
 			g_hash_table_iter_remove (&it);
 		}
 	}
@@ -636,55 +645,134 @@ rspamd_language_detector_cmp (gconstpointer a, gconstpointer b)
 	return 0;
 }
+enum rspamd_language_detected_type {
+	rs_detect_none = 0,
+	rs_detect_single,
+	rs_detect_multiple,
+};
+
+static enum rspamd_language_detected_type
+rspamd_language_detector_try_ngramm (struct rspamd_lang_detector *d,
+		GArray *ucs_tokens,
+		enum rspamd_language_gramm_type type,
+		GHashTable *candidates)
+{
+	guint cand_len;
+	rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
+			type, TRUE);
+
+	cand_len = g_hash_table_size (candidates);
+
+	if (cand_len == 0) {
+		return rs_detect_none;
+	}
+	else if (cand_len == 1) {
+		return rs_detect_single;
+	}
+
+	return rs_detect_multiple;
+}
+
 GPtrArray *
 rspamd_language_detector_detect (struct rspamd_lang_detector *d,
 		GArray *ucs_tokens, gsize words_len)
 {
-	GHashTable *candidates;
+	GHashTable *candidates, *tcandidates;
 	GPtrArray *result;
 	GHashTableIter it;
 	gpointer k, v;
+	gdouble mean, std;
 	struct rspamd_lang_detector_res *cand;
-	guint cand_len, prev_len;
+	enum rspamd_language_detected_type r;
 	candidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
 			NULL, g_free);
 	if (words_len < d->short_text_limit) {
 		/* For short text, start directly from trigramms */
-		rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
-				rs_trigramm, TRUE);
+		r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_trigramm,
+				candidates);
+
+		if (r == rs_detect_none) {
+			r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_bigramm,
+					candidates);
+
+			if (r == rs_detect_none) {
+				r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_unigramm,
+						candidates);
+			}
+		}
 	}
 	else {
 		/* Start with unigramms */
-		rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
-				rs_unigramm, TRUE);
-		cand_len = g_hash_table_size (candidates);
-
-		if (cand_len > 1) {
-			/* Try bigramms */
-			rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
-					rs_unigramm, FALSE);
-
-			cand_len = g_hash_table_size (candidates);
-			if (cand_len > 1) {
-				prev_len = cand_len;
-				/* Try trigramms */
-				GHashTable *ncandidates;
-				ncandidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
-						NULL, g_free);
-				rspamd_language_detector_detect_type (d, ucs_tokens, ncandidates,
-						rs_trigramm, TRUE);
-				cand_len = g_hash_table_size (ncandidates);
-
-				if (cand_len < prev_len) {
-					g_hash_table_unref (candidates);
-					candidates = ncandidates;
+		r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_unigramm,
+				candidates);
+
+		switch (r) {
+		case rs_detect_none:
+		case rs_detect_single:
+			/* No unigramms found or single set found, no reason to continue */;
+			break;
+		case rs_detect_multiple:
+			/* Try to improve guess */
+			tcandidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
+					NULL, g_free);
+			r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_bigramm,
+					tcandidates);
+
+			switch (r) {
+			case rs_detect_none:
+				/* Revert to unigramms result */
+				g_hash_table_unref (tcandidates);
+				break;
+			case rs_detect_single:
+				/* We have good enough result, return it */
+				g_hash_table_unref (candidates);
+				candidates = tcandidates;
+				break;
+			case rs_detect_multiple:
+				mean = 0.0;
+				std = 0.0;
+				g_hash_table_iter_init (&it, tcandidates);
+
+				/* Check distirbution */
+				while (g_hash_table_iter_next (&it, &k, &v)) {
+					cand = (struct rspamd_lang_detector_res *) v;
+					mean += cand->prob;
 				}
-				else {
-					/* Not a better guess */
-					g_hash_table_unref (ncandidates);
+
+				mean /= g_hash_table_size (tcandidates);
+
+				g_hash_table_iter_init (&it, tcandidates);
+				while (g_hash_table_iter_next (&it, &k, &v)) {
+					gdouble err;
+					cand = (struct rspamd_lang_detector_res *) v;
+					err = cand->prob - mean;
+					std += err * err;
+				}
+
+				std /= g_hash_table_size (tcandidates);
+				g_hash_table_unref (candidates);
+				candidates = tcandidates;
+
+				if (std < mean / 100) {
+					/* Try trigramms */
+					tcandidates = g_hash_table_new_full (rspamd_str_hash,
+							rspamd_str_equal,
+							NULL, g_free);
+
+					r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_trigramm,
+							tcandidates);
+
+					if (r != rs_detect_none) {
+						/* TODO: check if we have better distribution here */
+						g_hash_table_unref (candidates);
+						candidates = tcandidates;
+					}
 				}
+				break;
 			}
+			break;
 		}
 	}
@@ -694,7 +782,7 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d,
 	while (g_hash_table_iter_next (&it, &k, &v)) {
 		cand = (struct rspamd_lang_detector_res *) v;
-		msg_err ("%s -> %.2f", cand->lang, cand->prob);
+		msg_debug ("%s -> %.2f", cand->lang, cand->prob);
 		g_ptr_array_add (result, cand);
 		g_hash_table_iter_steal (&it);
 	}
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 4bac77062..2a7801100 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -98,11 +98,13 @@ rspamd_extract_words (struct rspamd_task *task,
 	for (i = 0; i < part->normalized_words->len; i++) {
 		w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
-
-		rspamd_language_detector_to_ucs (task->lang_det, task->task_pool,
-				w, &ucs_w);
-		g_array_append_val (part->ucs32_words, ucs_w);
-		ucs_len += ucs_w.len;
+		if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+			rspamd_language_detector_to_ucs (task->lang_det,
+					task->task_pool,
+					w, &ucs_w);
+			g_array_append_val (part->ucs32_words, ucs_w);
+			ucs_len += ucs_w.len;
+		}
 	}
 	part->languages = rspamd_language_detector_detect (task->lang_det,
```