[Project] Add unigramms to language detection as well

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 1 Jan 2018 20:28:59 +0000 (20:28 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 1 Jan 2018 20:28:59 +0000 (20:28 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 1 Jan 2018 20:28:59 +0000 (20:28 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 1 Jan 2018 20:28:59 +0000 (20:28 +0000)
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c

index 730733dfb8b074fcfc27f81af609a3114a16f5f8..3dd0a8d63289f79146977449181d28dc6fb7ac33 100644 (file)
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -24,8 +24,10 @@
  
  struct rspamd_language_elt {
         const gchar *name; /* e.g. "en" or "ru" */
+       guint unigramms_total; /* total frequencies for unigramms */
+       GHashTable *unigramms; /* unigramms frequencies */
         guint bigramms_total; /* total frequencies for bigramms */
-       GHashTable *bigramms; /* bigrams frequencies */
+       GHashTable *bigramms; /* bigramms frequencies */
         guint trigramms_total; /* total frequencies for trigramms */
         GHashTable *trigramms; /* trigramms frequencies */
  };
@@ -35,6 +37,18 @@ struct rspamd_lang_detector {
         UConverter *uchar_converter;
  };
  
+/*
+ * Hash callback for the unigramms table: exactly one UChar is read from
+ * `key` and hashed with the seed returned by rspamd_hash_seed ().
+ */
+static guint
+rspamd_unigram_hash (gconstpointer key)
+{
+       const gsize key_len = sizeof (UChar);
+
+       return rspamd_cryptobox_fast_hash (key, key_len, rspamd_hash_seed ());
+}
+
+/*
+ * Equality callback for the unigramms table: two keys are equal when their
+ * first UChar is bytewise identical.
+ */
+static gboolean
+rspamd_unigram_equal (gconstpointer v, gconstpointer v2)
+{
+       if (memcmp (v, v2, sizeof (UChar)) != 0) {
+               return FALSE;
+       }
+
+       return TRUE;
+}
+
  static guint
  rspamd_bigram_hash (gconstpointer key)
  {
@@ -101,6 +115,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
         pos = strchr (nelt->name, '.');
         g_assert (pos != NULL);
         *pos = '\0';
+       nelt->unigramms = g_hash_table_new (rspamd_unigram_hash, rspamd_unigram_equal);
         nelt->bigramms = g_hash_table_new (rspamd_bigram_hash, rspamd_bigram_equal);
         nelt->trigramms = g_hash_table_new (rspamd_trigram_hash, rspamd_trigram_equal);
  
@@ -138,14 +153,21 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
                                                 GUINT_TO_POINTER (freq));
                                 nelt->trigramms_total += freq;
                         }
+                       else if (nsym == 1) {
+                               g_hash_table_insert (nelt->unigramms, ucs_key,
+                                               GUINT_TO_POINTER (freq));
+                               nelt->unigramms_total += freq;
+                       }
                         else if (nsym > 3) {
                                 msg_warn_config ("have more than 3 characters in key: %d", nsym);
                         }
                 }
         }
  
-       msg_info_config ("loaded %s language, %d digramms, %d trigramms",
-                       nelt->name, (gint)g_hash_table_size (nelt->bigramms),
+       msg_info_config ("loaded %s language, %d unigramms, %d digramms, %d trigramms",
+                       nelt->name,
+                       (gint)g_hash_table_size (nelt->unigramms),
+                       (gint)g_hash_table_size (nelt->bigramms),
                         (gint)g_hash_table_size (nelt->trigramms));
  
         g_ptr_array_add (d->languages, nelt);
@@ -202,3 +224,26 @@ end:
  
         return ret;
  }
+
+
+/**
+ * Convert a token with the detector's ICU converter into UChar units.
+ *
+ * The output buffer holds (utf_token->len + 1) UChar elements and is
+ * allocated from `pool`. The flags of `utf_token` are copied to `ucs_token`.
+ * `ucs_token->begin` always points to that buffer after the call, and
+ * `ucs_token->len` is the number of UChar units produced, or 0 if the
+ * converter reported an error.
+ *
+ * @param d language detector, its uchar_converter is used for conversion
+ * @param pool memory pool used to allocate the output buffer
+ * @param utf_token input token (begin/len/flags are read)
+ * @param ucs_token output token (begin/len/flags are written)
+ */
+void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
+               rspamd_mempool_t *pool,
+               rspamd_stat_token_t *utf_token, rspamd_stat_token_t *ucs_token)
+{
+       UChar *out;
+       int32_t nsym;
+       UErrorCode uc_err = U_ZERO_ERROR;
+
+       ucs_token->flags = utf_token->flags;
+       out = rspamd_mempool_alloc (pool, sizeof (*out) * (utf_token->len + 1));
+       nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->len + 1),
+                       utf_token->begin, utf_token->len, &uc_err);
+
+       /*
+        * Callers may pass an uninitialised token, so never leave `begin`
+        * unset: it is stored in the words array even when conversion fails.
+        */
+       ucs_token->begin = (const gchar *) out;
+
+       /*
+        * The returned length alone is not a success indicator: on error
+        * (e.g. U_BUFFER_OVERFLOW_ERROR) it may exceed the buffer capacity,
+        * so the error code must be checked as well.
+        */
+       if (U_SUCCESS (uc_err) && nsym >= 0) {
+               ucs_token->len = nsym;
+       }
+       else {
+               ucs_token->len = 0;
+       }
+}
+\ No newline at end of file
diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h

index aa1bf54949f15295f5125aeb6b2a6ea0cdf64402..c0d05cf1d8444c7f455f200d89126992a357e43c 100644 (file)
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -19,9 +19,25 @@
  
  #include "config.h"
  #include "libserver/cfg_file.h"
+#include "libstat/stat_api.h"
  
  struct rspamd_lang_detector;
  
+/**
+ * Create new language detector object using configuration object
+ * @param cfg configuration object the detector is created from
+ * @return pointer to the language detector object
+ *         (NOTE(review): behaviour on failure is not visible here - confirm)
+ */
  struct rspamd_lang_detector* rspamd_language_detector_init (struct rspamd_config *cfg);
+/**
+ * Convert string from utf8 to ucs32
+ * NOTE(review): the definition writes ICU UChar units; UChar is normally a
+ * 16-bit UTF-16 code unit rather than a 32-bit one - confirm the wording
+ * @param d language detector whose converter is used
+ * @param pool memory pool the output buffer is allocated from
+ * @param utf_token input token
+ * @param ucs_token output token, filled by this function
+ */
+void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
+               rspamd_mempool_t *pool,
+               rspamd_stat_token_t *utf_token,
+               rspamd_stat_token_t *ucs_token);
  
  #endif
diff --git a/src/libmime/message.c b/src/libmime/message.c

index a2ea7d68599a379b7e14c97ad2c391b7617a1ff3..672d78806e7158077bbf6c83fc6130ab24ac4cd7 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -24,6 +24,7 @@
  #include "smtp_parsers.h"
  #include "mime_parser.h"
  #include "mime_encoding.h"
+#include "lang_detection.h"
  #include "libutil/multipattern.h"
  #include "libserver/mempool_vars_internal.h"
  
@@ -204,10 +205,10 @@ rspamd_extract_words (struct rspamd_task *task,
  #ifdef WITH_SNOWBALL
         struct sb_stemmer *stem = NULL;
  #endif
-       rspamd_stat_token_t *w;
+       rspamd_stat_token_t *w, ucs_w;
         gchar *temp_word;
         const guchar *r;
-       guint i, nlen, total_len = 0, short_len = 0;
+       guint i, nlen, total_len = 0, short_len = 0, ucs_len = 0;
         gdouble avg_len = 0;
  
  #ifdef WITH_SNOWBALL
@@ -257,10 +258,23 @@ rspamd_extract_words (struct rspamd_task *task,
                 part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
                                 sizeof (guint64), part->normalized_words->len);
  
+               if (IS_PART_UTF (part) && task->lang_det) {
+                       part->ucs32_words =  g_array_sized_new (FALSE, FALSE,
+                                       sizeof (rspamd_stat_token_t), part->normalized_words->len);
+               }
+
                 for (i = 0; i < part->normalized_words->len; i ++) {
                         guint64 h;
  
                         w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+
+                       if (part->ucs32_words) {
+                               rspamd_language_detector_to_ucs (task->lang_det, task->task_pool,
+                                               w, &ucs_w);
+                               g_array_append_val (part->ucs32_words, ucs_w);
+                               ucs_len += ucs_w.len;
+                       }
+
                         r = NULL;
  #ifdef WITH_SNOWBALL
                         if (stem) {
diff --git a/src/libmime/message.h b/src/libmime/message.h

index 3092f3da580f2be1e656f972356c819fe803415d..90f86b3bdcc1dd8dc4b844d0e409e7184a31d810 100644 (file)
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -93,6 +93,7 @@ struct rspamd_mime_text_part {
         GList *exceptions;      /**< list of offsets of urls                                            */
         struct rspamd_mime_part *mime_part;
         GArray *normalized_words;
+       GArray *ucs32_words;
         GArray *normalized_hashes;
         guint nlines;
         guint spaces;
diff --git a/src/libserver/task.c b/src/libserver/task.c

index 2c014a7d1b81249d8e66d426bf2de440655da567..7b665d983ef4d4977a05f833fd052a367c44e8b8 100644 (file)
--- a/src/libserver/task.c
+++ b/src/libserver/task.c
@@ -239,6 +239,9 @@ rspamd_task_free (struct rspamd_task *task)
                         if (tp->normalized_hashes) {
                                 g_array_free (tp->normalized_hashes, TRUE);
                         }
+                       if (tp->ucs32_words) {
+                               g_array_free (tp->ucs32_words, TRUE);
+                       }
                 }
  
                 if (task->rcpt_envelope) {
diff --git a/src/worker.c b/src/worker.c

index e0d2b4a0bf4b284b2365a96318007dadca14bcf4..8b01205eba769501fc3b246f5e923be57c9599a6 100644 (file)
--- a/src/worker.c
+++ b/src/worker.c
@@ -660,7 +660,7 @@ rspamd_worker_init_scanner (struct rspamd_worker *worker,
                         rspamd_worker_monitored_handler,
                         worker->srv->cfg);
  
-       *plang_det = worker->srv->cfg;
+       *plang_det = worker->srv->cfg->lang_det;
  }
  
  /*
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 1 Jan 2018 20:28:59 +0000 (20:28 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 1 Jan 2018 20:28:59 +0000 (20:28 +0000)
src/libmime/lang_detection.c		patch \| blob \| history
src/libmime/lang_detection.h		patch \| blob \| history
src/libmime/message.c		patch \| blob \| history
src/libmime/message.h		patch \| blob \| history
src/libserver/task.c		patch \| blob \| history
src/worker.c		patch \| blob \| history