]> source.dussan.org Git - rspamd.git/commitdiff
[Project] Add ngramms frequencies detector
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 13 Jan 2018 14:54:11 +0000 (14:54 +0000)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 13 Jan 2018 14:54:11 +0000 (14:54 +0000)
src/libmime/lang_detection.c
src/libmime/lang_detection.h

index 81c521b4663af9e82db9d88b950ed4c460a5f6de..aa5df86cc787b94014da8dcdc331fb1b9c906eff 100644 (file)
@@ -303,13 +303,207 @@ rspamd_language_detector_random_select (GPtrArray *ucs_tokens, guint nwords,
        }
 }
 
+enum rspamd_language_gramm_type {
+       rs_unigramm = 0,
+       rs_bigramm,
+       rs_trigramm
+};
+
+static goffset
+rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
+               guint wlen, goffset cur_off)
+{
+       guint i;
+
+       if (wlen > 1) {
+               /* Deal with spaces at the beginning and ending */
+
+               if (cur_off == 0) {
+                       window[0] = (UChar)' ';
+
+                       for (i = 0; i < wlen - 1; i ++) {
+                               window[i + 1] = *(((UChar *)tok->begin) + i);
+                       }
+               }
+               else if (cur_off + wlen == tok->len + 1) {
+                       /* Add trailing space */
+                       for (i = 0; i < wlen - 1; i ++) {
+                               window[i] = *(((UChar *)tok->begin) + cur_off + i);
+                       }
+                       window[wlen - 1] = (UChar)' ';
+               }
+               else if (cur_off + wlen > tok->len + 1) {
+                       /* No more fun */
+                       return -1;
+               }
+
+               /* Normal case */
+               for (i = 0; i < wlen; i ++) {
+                       window[i] = *(((UChar *)tok->begin) + cur_off + i);
+               }
+       }
+       else {
+               if (tok->len >= cur_off) {
+                       return -1;
+               }
+
+               window[0] = *(((UChar *)tok->begin) + cur_off);
+       }
+
+       return cur_off + 1;
+}
+
+/*
+ * Do full guess for a specific ngramm, checking all languages defined
+ */
+static void
+rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d,
+               UChar *window, enum rspamd_language_gramm_type type,
+               GHashTable *candidates)
+{
+       guint i, freq;
+       struct rspamd_language_elt *elt;
+       struct rspamd_lang_detector_res *cand;
+       GHashTable *ngramms;
+
+       for (i = 0; i < d->languages->len; i ++) {
+               elt = g_ptr_array_index (d->languages, i);
+
+               switch (type) {
+               case rs_unigramm:
+                       ngramms = elt->unigramms;
+                       break;
+               case rs_bigramm:
+                       ngramms = elt->bigramms;
+                       break;
+               case rs_trigramm:
+                       ngramms = elt->trigramms;
+                       break;
+               }
+
+               freq = GPOINTER_TO_UINT (g_hash_table_lookup (ngramms, window));
+               cand = g_hash_table_lookup (candidates, elt->name);
+
+               if (cand == NULL) {
+                       cand = g_malloc (sizeof (*cand));
+                       cand->elt = elt;
+                       cand->lang = elt->name;
+                       cand->prob = freq;
+               }
+               else {
+                       /* Update guess */
+                       cand->prob += freq;
+               }
+       }
+}
+
+/*
+ * Check only candidates, if none found, switch to full version
+ */
+static void
+rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d,
+               UChar *window, enum rspamd_language_gramm_type type,
+               GHashTable *candidates)
+{
+       guint freq, total_freq = 0;
+       struct rspamd_language_elt *elt;
+       struct rspamd_lang_detector_res *cand;
+       GHashTableIter it;
+       gpointer k, v;
+       GHashTable *ngramms;
+
+       g_hash_table_iter_init (&it, candidates);
+
+       while (g_hash_table_iter_next (&it, &k, &v)) {
+               cand = (struct rspamd_lang_detector_res *)v;
+               elt = cand->elt;
+
+               switch (type) {
+               case rs_unigramm:
+                       ngramms = elt->unigramms;
+                       break;
+               case rs_bigramm:
+                       ngramms = elt->bigramms;
+                       break;
+               case rs_trigramm:
+                       ngramms = elt->trigramms;
+                       break;
+               }
+
+               freq = GPOINTER_TO_UINT (g_hash_table_lookup (ngramms, window));
+
+               cand->prob += freq;
+               total_freq += freq;
+       }
+
+       if (total_freq == 0) {
+               /* Nothing found , do full scan which will also update candidates */
+               rspamd_language_detector_process_ngramm_full (d, window, type, candidates);
+       }
+}
+
+static gboolean
+rspamd_language_detector_update_guess (struct rspamd_lang_detector *d,
+               rspamd_stat_token_t *tok, GHashTable *candidates,
+               enum rspamd_language_gramm_type type)
+{
+       guint wlen;
+       UChar window[3];
+       goffset cur = 0;
+
+       switch (type) {
+       case rs_unigramm:
+               wlen = 1;
+               break;
+       case rs_bigramm:
+               wlen = 2;
+               break;
+       case rs_trigramm:
+               wlen = 3;
+               break;
+       }
+
+       /* Split words */
+       while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
+                       != -1) {
+
+       }
+}
+
+static void
+rspamd_language_detector_detect_word (struct rspamd_lang_detector *d,
+               rspamd_stat_token_t *tok, GHashTable *candidates,
+               enum rspamd_language_gramm_type type)
+{
+       guint wlen;
+       UChar window[3];
+       goffset cur = 0;
+
+       switch (type) {
+       case rs_unigramm:
+               wlen = 1;
+               break;
+       case rs_bigramm:
+               wlen = 2;
+               break;
+       case rs_trigramm:
+               wlen = 3;
+               break;
+       }
+
+       /* Split words */
+       while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
+                       != -1) {
+
+       }
+}
+
 const gchar *
 rspamd_language_detector_detect (struct rspamd_lang_detector *d,
                GPtrArray *ucs_tokens, gsize words_len)
 {
        if (words_len < d->short_text_limit) {
                /* For short text, start directly from trigramms */
-               return rspamd_language_detector_detect_trigramm ();
        }
 
        /* Start with unigramms */
index f3f16b1ea974b34ca61bb7f4e1abeded3d504472..77cd09081aefdefd37c419ed93a51c536730fcfd 100644 (file)
 #include "libstat/stat_api.h"
 
 struct rspamd_lang_detector;
+struct rspamd_language_elt;
 
 struct rspamd_lang_detector_res {
        gdouble prob;
        const gchar *lang;
+       struct rspamd_language_elt *elt;
 };
 
 /**