From: Vsevolod Stakhov Date: Sat, 13 Jan 2018 14:54:11 +0000 (+0000) Subject: [Project] Add ngramms frequencies detector X-Git-Tag: 1.7.0~286 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=f581bcea9161fe5086d45b355b8f675153047c3f;p=rspamd.git [Project] Add ngramms frequencies detector --- diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 81c521b46..aa5df86cc 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -303,13 +303,207 @@ rspamd_language_detector_random_select (GPtrArray *ucs_tokens, guint nwords, } } +enum rspamd_language_gramm_type { /* Selects the per-language frequency table (unigramms, bigramms or trigramms) and the window length (1, 2 or 3) used by the functions below */ + rs_unigramm = 0, + rs_bigramm, + rs_trigramm +}; + /* Fills window with the wlen-character ngramm of tok that starts at cur_off. Returns cur_off + 1, or -1 when no ngramm is produced. For wlen > 1 a space is placed before the first character (cur_off == 0) and after the last one (cur_off + wlen == tok->len + 1). */ +static goffset +rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window, + guint wlen, goffset cur_off) +{ + guint i; + + if (wlen > 1) { + /* Deal with spaces at the beginning and ending */ + + if (cur_off == 0) { + window[0] = (UChar)' '; + + for (i = 0; i < wlen - 1; i ++) { + window[i + 1] = *(((UChar *)tok->begin) + i); + } + } + else if (cur_off + wlen == tok->len + 1) { + /* Add trailing space */ + for (i = 0; i < wlen - 1; i ++) { + window[i] = *(((UChar *)tok->begin) + cur_off + i); + } + window[wlen - 1] = (UChar)' '; + } + else if (cur_off + wlen > tok->len + 1) { + /* No more fun */ + return -1; + } + + /* Normal case. NOTE(review): there is no else or return before this loop, so it also runs after the leading-space and trailing-space branches above and rewrites the whole window with unpadded characters; in the trailing case its last read is at index tok->len. Confirm whether this loop was meant to be a final else branch. */ + for (i = 0; i < wlen; i ++) { + window[i] = *(((UChar *)tok->begin) + cur_off + i); + } + } + else { /* Single character window. NOTE(review): the callers in this patch start with cur_off == 0, for which the check below is true for any non-negative tok->len, so this path returns -1 on the first call; the comparison looks inverted - confirm. */ + if (tok->len >= cur_off) { + return -1; + } + + window[0] = *(((UChar *)tok->begin) + cur_off); + } + + return cur_off + 1; +} + +/* + * Do full guess for a specific ngramm, checking all languages defined: + * the frequency stored for window in each language table is added to the + * candidate looked up by the language name + */ +static void +rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d, + UChar *window, enum rspamd_language_gramm_type type, + GHashTable *candidates) +{ + guint i, freq; + struct rspamd_language_elt *elt; + struct rspamd_lang_detector_res *cand; + GHashTable *ngramms; + + for (i = 0; i < d->languages->len; i ++) { + elt = 
g_ptr_array_index (d->languages, i); + + switch (type) { /* NOTE(review): no default case, so ngramms is left unset for any other value of type */ + case rs_unigramm: + ngramms = elt->unigramms; + break; + case rs_bigramm: + ngramms = elt->bigramms; + break; + case rs_trigramm: + ngramms = elt->trigramms; + break; + } + + freq = GPOINTER_TO_UINT (g_hash_table_lookup (ngramms, window)); + cand = g_hash_table_lookup (candidates, elt->name); + + if (cand == NULL) { + cand = g_malloc (sizeof (*cand)); + cand->elt = elt; + cand->lang = elt->name; + cand->prob = freq; /* NOTE(review): the new entry is not inserted into candidates anywhere in this function, so later lookups will not find it and nothing here frees it - confirm whether an insert is missing. */ + } + else { + /* Update guess */ + cand->prob += freq; + } + } +} + +/* + * Check only candidates, if none found, switch to full version. + * The frequency of window is added to the prob of every existing candidate; + * the full scan runs only when the summed frequency is zero + */ +static void +rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d, + UChar *window, enum rspamd_language_gramm_type type, + GHashTable *candidates) +{ + guint freq, total_freq = 0; + struct rspamd_language_elt *elt; + struct rspamd_lang_detector_res *cand; + GHashTableIter it; + gpointer k, v; + GHashTable *ngramms; + + g_hash_table_iter_init (&it, candidates); + + while (g_hash_table_iter_next (&it, &k, &v)) { + cand = (struct rspamd_lang_detector_res *)v; + elt = cand->elt; + + switch (type) { /* NOTE(review): no default case, so ngramms is left unset for any other value of type */ + case rs_unigramm: + ngramms = elt->unigramms; + break; + case rs_bigramm: + ngramms = elt->bigramms; + break; + case rs_trigramm: + ngramms = elt->trigramms; + break; + } + + freq = GPOINTER_TO_UINT (g_hash_table_lookup (ngramms, window)); + + cand->prob += freq; + total_freq += freq; + } + + if (total_freq == 0) { + /* Nothing found, do full scan over all languages */ + rspamd_language_detector_process_ngramm_full (d, window, type, candidates); + } +} + /* Steps through the ngramms of tok using a window length of 1, 2 or 3 chosen from type. NOTE(review): the loop body is empty, so d, candidates and the window contents are not used yet, and no value is returned although the return type is gboolean. */ +static gboolean +rspamd_language_detector_update_guess (struct rspamd_lang_detector *d, + rspamd_stat_token_t *tok, GHashTable *candidates, + enum rspamd_language_gramm_type type) +{ + guint wlen; + UChar window[3]; + goffset cur = 0; + + switch (type) { /* NOTE(review): no default case, so wlen is left unset for any other value of type */ + case rs_unigramm: + wlen = 1; + break; + case rs_bigramm: + wlen = 2; + break; + case rs_trigramm: + wlen = 3; + 
break; + } + + /* Split words */ + while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur)) + != -1) { + /* NOTE(review): empty body - each window is produced and then discarded */ + } +} + /* Steps through the ngramms of tok using a window length of 1, 2 or 3 chosen from type. NOTE(review): the loop body is empty, so d, candidates and the window contents are not used yet. */ +static void +rspamd_language_detector_detect_word (struct rspamd_lang_detector *d, + rspamd_stat_token_t *tok, GHashTable *candidates, + enum rspamd_language_gramm_type type) +{ + guint wlen; + UChar window[3]; + goffset cur = 0; + + switch (type) { /* NOTE(review): no default case, so wlen is left unset for any other value of type */ + case rs_unigramm: + wlen = 1; + break; + case rs_bigramm: + wlen = 2; + break; + case rs_trigramm: + wlen = 3; + break; + } + + /* Split words */ + while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur)) + != -1) { + /* NOTE(review): empty body - each window is produced and then discarded */ + } +} + const gchar * rspamd_language_detector_detect (struct rspamd_lang_detector *d, GPtrArray *ucs_tokens, gsize words_len) { if (words_len < d->short_text_limit) { /* For short text, start directly from trigramms */ - return rspamd_language_detector_detect_trigramm (); } /* Start with unigramms */ diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h index f3f16b1ea..77cd09081 100644 --- a/src/libmime/lang_detection.h +++ b/src/libmime/lang_detection.h @@ -22,10 +22,12 @@ #include "libstat/stat_api.h" struct rspamd_lang_detector; +struct rspamd_language_elt; struct rspamd_lang_detector_res { gdouble prob; const gchar *lang; + struct rspamd_language_elt *elt; /* Language element this result was built from */ }; /**