diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-01-13 15:57:56 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-01-13 15:57:56 +0000 |
commit | cb8f401b5fa59ec5dbb76a7f22db7a616ecc3821 (patch) | |
tree | b0edfc38c2047404f5408e84496d3d1e24cd7f81 /src/libmime/lang_detection.c | |
parent | f581bcea9161fe5086d45b355b8f675153047c3f (diff) | |
download | rspamd-cb8f401b5fa59ec5dbb76a7f22db7a616ecc3821.tar.gz rspamd-cb8f401b5fa59ec5dbb76a7f22db7a616ecc3821.zip |
[Project] Add detection logic for words
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r-- | src/libmime/lang_detection.c | 104 |
1 files changed, 100 insertions, 4 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index aa5df86cc..e579580db 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -21,6 +21,7 @@ #include <glob.h> #include <unicode/utf8.h> #include <unicode/ucnv.h> +#include <math.h> static const gsize default_short_text_limit = 200; static const gsize default_words = 20; @@ -267,6 +268,7 @@ rspamd_language_detector_random_select (GPtrArray *ucs_tokens, guint nwords, { guint step_len, remainder, i, out_idx; guint64 coin, sel; + goffset tmp; g_assert (nwords != 0); g_assert (offsets_out != NULL); @@ -301,6 +303,23 @@ rspamd_language_detector_random_select (GPtrArray *ucs_tokens, guint nwords, sel = (coin % step_len) + i; offsets_out[out_idx] = sel; } + + /* + * Fisher-Yates algorithm: + * for i from 0 to n-2 do + * j <- random integer such that i <= j < n + * exchange a[i] and a[j] -- NOTE(review): the loop below uses i < out_idx - 2, so it stops one iteration short of n-2 + */ + if (out_idx > 2) { + for (i = 0; i < out_idx - 2; i++) { + coin = rspamd_random_uint64_fast (); + sel = (coin % (out_idx - i)) + i; + /* swap */ + tmp = offsets_out[i]; + offsets_out[i] = offsets_out[sel]; + offsets_out[sel] = tmp; + } + } } enum rspamd_language_gramm_type { @@ -466,7 +485,7 @@ rspamd_language_detector_update_guess (struct rspamd_lang_detector *d, /* Split words */ while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur)) != -1) { - + rspamd_language_detector_process_ngramm_update (d, window, type, candidates); } } @@ -494,18 +513,95 @@ rspamd_language_detector_detect_word (struct rspamd_lang_detector *d, /* Split words */ while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur)) != -1) { + rspamd_language_detector_process_ngramm_full (d, window, type, candidates); + } +} + +/* + * Converts frequencies to log probabilities and filters out the candidates + * that have the lowest probabilities + */ +static void +rspamd_language_detector_filter_negligible (GHashTable *candidates) +{ + GHashTableIter it; + gpointer k, v; + struct 
rspamd_lang_detector_res *cand; + gdouble max_prob = -(G_MAXDOUBLE); + + /* Normalize step */ + g_hash_table_iter_init (&it, candidates); + + while (g_hash_table_iter_next (&it, &k, &v)) { + cand = (struct rspamd_lang_detector_res *)v; + + if (cand->prob == 0) { + g_hash_table_iter_remove (&it); + } + else { + cand->prob = log2 (cand->prob / cand->total_words); + + if (cand->prob > max_prob) { + max_prob = cand->prob; + } + } + } + /* Filter step. NOTE(review): the iterator is not re-initialised after the normalize loop above, so this loop appears never to execute - confirm */ + while (g_hash_table_iter_next (&it, &k, &v)) { + cand = (struct rspamd_lang_detector_res *) v; + + /* + * Probabilities are logarithmic, so if prob1 - prob2 > N, it means that + * prob2 is 2^N times smaller than prob1 (the threshold compared below is 256) + */ + if (max_prob - cand->prob > 256) { + g_hash_table_iter_remove (&it); + } } } +static void +rspamd_language_detector_detect_type (struct rspamd_lang_detector *d, + GPtrArray *ucs_tokens, + GHashTable *candidates, + enum rspamd_language_gramm_type type) +{ + guint nparts = MIN (ucs_tokens->len, default_words); + goffset *selected_words; + rspamd_stat_token_t *tok; + guint i; + + selected_words = g_new0 (goffset, nparts); + rspamd_language_detector_random_select (ucs_tokens, nparts, selected_words); + + /* Deal with the first word in a special case */ + tok = g_ptr_array_index (ucs_tokens, selected_words[0]); + rspamd_language_detector_detect_word (d, tok, candidates, type); + + for (i = 1; i < nparts; i ++) { + tok = g_ptr_array_index (ucs_tokens, selected_words[i]); + rspamd_language_detector_update_guess (d, tok, candidates, type); + } + + /* Filter negligible candidates. NOTE(review): selected_words is allocated with g_new0 above and is not freed anywhere in this function */ + rspamd_language_detector_filter_negligible (candidates); +} + const gchar * rspamd_language_detector_detect (struct rspamd_lang_detector *d, GPtrArray *ucs_tokens, gsize words_len) { + GHashTable *candidates; + + candidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, + NULL, g_free); if (words_len < d->short_text_limit) { /* For short text, start directly from trigramms */ + rspamd_language_detector_detect_type (d, 
ucs_tokens, candidates, + rs_trigramm); + } + else { + /* Start with unigramms */ } - - /* Start with unigramms */ - }
\ No newline at end of file |