[Project] Add random words selection logic

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Sat, 13 Jan 2018 13:49:18 +0000 (13:49 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Sat, 13 Jan 2018 13:49:18 +0000 (13:49 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 13 Jan 2018 13:49:18 +0000 (13:49 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 13 Jan 2018 13:49:18 +0000 (13:49 +0000)
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c

index fdece62f18854e6f23cc40caca4182af1a0c7f28..81c521b4663af9e82db9d88b950ed4c460a5f6de 100644 (file)
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -22,6 +22,10 @@
  #include <unicode/utf8.h>
  #include <unicode/ucnv.h>
  
+static const gsize default_short_text_limit = 200; /* used unless "short_text_limit" is set in config */
+static const gsize default_words = 20; /* NOTE(review): not referenced in the visible hunks; presumably the number of words to sample - confirm */
+static const gchar *default_languages_path = RSPAMD_PLUGINSDIR "/languages"; /* initial value of languages_path in detector init */
+
  struct rspamd_language_elt {
         const gchar *name; /* e.g. "en" or "ru" */
         guint unigramms_total; /* total frequencies for unigramms */
@@ -35,6 +39,7 @@ struct rspamd_language_elt {
  struct rspamd_lang_detector {
         GPtrArray *languages; /* sized at init by the number of glob matches; presumably holds rspamd_language_elt items - confirm */
         UConverter *uchar_converter; /* ICU converter opened for "UTF-8" at init */
+       gsize short_text_limit; /* inputs with words_len below this go directly to trigramm detection */
  };
  
  static guint
@@ -178,9 +183,9 @@ struct rspamd_lang_detector*
  rspamd_language_detector_init (struct rspamd_config *cfg)
  {
         const ucl_object_t *section, *elt;
-       const gchar *languages_path = RSPAMD_PLUGINSDIR "/languages";
+       const gchar *languages_path = default_languages_path;
         glob_t gl;
-       size_t i;
+       size_t i, short_text_limit = default_short_text_limit;
         UErrorCode uc_err = U_ZERO_ERROR;
         GString *languages_pattern;
         struct rspamd_lang_detector *ret = NULL;
@@ -193,6 +198,12 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
                 if (elt) {
                         languages_path = ucl_object_tostring (elt);
                 }
+
+               elt = ucl_object_lookup (section, "short_text_limit");
+
+               if (elt) {
+                       short_text_limit = ucl_object_toint (elt);
+               }
         }
  
         languages_pattern = g_string_sized_new (PATH_MAX);
@@ -207,6 +218,7 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
         ret = rspamd_mempool_alloc (cfg->cfg_pool, sizeof (*ret));
         ret->languages = g_ptr_array_sized_new (gl.gl_pathc);
         ret->uchar_converter = ucnv_open ("UTF-8", &uc_err);
+       ret->short_text_limit = short_text_limit;
  
         g_assert (uc_err == U_ZERO_ERROR);
  
@@ -247,4 +259,59 @@ rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
         else {
                 ucs_token->len = 0;
         }
+}
+
+/**
+ * Select `nwords` random token indexes spread over the whole input.
+ * The input is split into `nwords` consecutive parts and one index is drawn
+ * at random from each part; the first part also takes the division remainder.
+ * Example for nwords=2, input_len=5:
+ *
+ * w1 w2 w3   w4 w5
+ * ^          ^
+ * part1      part2
+ *  vv         vv
+ *  w2         w5
+ *
+ * Not a uniform distribution, but words come from different text parts.
+ * @param ucs_tokens input array, must hold at least `nwords` elements
+ * @param nwords number of indexes to select, must not be zero
+ * @param offsets_out output array, receives `nwords` indexes
+ */
+static void
+rspamd_language_detector_random_select (GPtrArray *ucs_tokens, guint nwords,
+               goffset *offsets_out)
+{
+       guint step_len, remainder, i, out_idx = 0;
+       guint64 coin, sel;
+
+       g_assert (nwords != 0);
+       g_assert (offsets_out != NULL);
+       g_assert (ucs_tokens->len >= nwords);
+       step_len = ucs_tokens->len / nwords;
+       remainder = ucs_tokens->len % nwords;
+
+       coin = rspamd_random_uint64_fast ();
+       sel = coin % (step_len + remainder);
+       offsets_out[out_idx ++] = sel; /* advance, or part 2 overwrites slot 0 */
+
+       for (i = step_len + remainder; i < ucs_tokens->len;
+                       i += step_len, out_idx ++) {
+               coin = rspamd_random_uint64_fast ();
+               sel = (coin % step_len) + i;
+               offsets_out[out_idx] = sel;
+       }
+}
+
+/* Detect language of `ucs_tokens`; NULL means "not detected" */
+const gchar *
+rspamd_language_detector_detect (struct rspamd_lang_detector *d,
+               GPtrArray *ucs_tokens, gsize words_len)
+{
+       if (words_len < d->short_text_limit) {
+               /* For short text, start directly from trigramms */
+               return rspamd_language_detector_detect_trigramm ();
+       }
+       /* Unigramm stage is not implemented yet, so nothing is detected */
+       return NULL;
  }
 \ No newline at end of file
diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h

index c0d05cf1d8444c7f455f200d89126992a357e43c..f3f16b1ea974b34ca61bb7f4e1abeded3d504472 100644 (file)
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -23,6 +23,11 @@
  
  struct rspamd_lang_detector;
  
+struct rspamd_lang_detector_res { /* NOTE(review): not used in the visible diff; presumably one detection candidate - confirm */
+       gdouble prob; /* NOTE(review): presumably the probability/score assigned to `lang` - confirm range */
+       const gchar *lang; /* NOTE(review): presumably a language code such as "en" - confirm ownership */
+};
+
  /**
   * Create new language detector object using configuration object
   * @param cfg
@@ -40,4 +45,14 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
                 rspamd_stat_token_t *utf_token,
                 rspamd_stat_token_t *ucs_token);
  
+/**
+ * Try to detect language of words
+ * @param d detector object; its short text limit is compared with `words_len`
+ * @param ucs_tokens array of tokens to analyse (presumably the output of rspamd_language_detector_to_ucs - confirm)
+ * @param words_len length measure of the input; values below the detector's short text limit go directly to trigramm detection
+ * @return language code or NULL if language has not been detected
+ */
+const gchar * rspamd_language_detector_detect (struct rspamd_lang_detector *d,
+               GPtrArray *ucs_tokens, gsize words_len);
+
  #endif
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Sat, 13 Jan 2018 13:49:18 +0000 (13:49 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Sat, 13 Jan 2018 13:49:18 +0000 (13:49 +0000)
src/libmime/lang_detection.c		patch \| blob \| history
src/libmime/lang_detection.h		patch \| blob \| history