aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2018-01-13 13:49:18 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2018-01-13 13:49:18 +0000
commit47d23f7ff0988398a813cc340f89783de24f8596 (patch)
tree58e9fad63ab632da7779e2189feab13c90e6925b
parentea6af2a9940ef79051051e764e430336be73179f (diff)
downloadrspamd-47d23f7ff0988398a813cc340f89783de24f8596.tar.gz
rspamd-47d23f7ff0988398a813cc340f89783de24f8596.zip
[Project] Add random words selection logic
-rw-r--r--src/libmime/lang_detection.c71
-rw-r--r--src/libmime/lang_detection.h15
2 files changed, 84 insertions, 2 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index fdece62f1..81c521b46 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -22,6 +22,10 @@
#include <unicode/utf8.h>
#include <unicode/ucnv.h>
+static const gsize default_short_text_limit = 200;
+static const gsize default_words = 20;
+static const gchar *default_languages_path = RSPAMD_PLUGINSDIR "/languages";
+
struct rspamd_language_elt {
const gchar *name; /* e.g. "en" or "ru" */
guint unigramms_total; /* total frequencies for unigramms */
@@ -35,6 +39,7 @@ struct rspamd_language_elt {
struct rspamd_lang_detector {
GPtrArray *languages;
UConverter *uchar_converter;
+ gsize short_text_limit;
};
static guint
@@ -178,9 +183,9 @@ struct rspamd_lang_detector*
rspamd_language_detector_init (struct rspamd_config *cfg)
{
const ucl_object_t *section, *elt;
- const gchar *languages_path = RSPAMD_PLUGINSDIR "/languages";
+ const gchar *languages_path = default_languages_path;
glob_t gl;
- size_t i;
+ size_t i, short_text_limit = default_short_text_limit;
UErrorCode uc_err = U_ZERO_ERROR;
GString *languages_pattern;
struct rspamd_lang_detector *ret = NULL;
@@ -193,6 +198,12 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
if (elt) {
languages_path = ucl_object_tostring (elt);
}
+
+ elt = ucl_object_lookup (section, "short_text_limit");
+
+ if (elt) {
+ short_text_limit = ucl_object_toint (elt);
+ }
}
languages_pattern = g_string_sized_new (PATH_MAX);
@@ -207,6 +218,7 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
ret = rspamd_mempool_alloc (cfg->cfg_pool, sizeof (*ret));
ret->languages = g_ptr_array_sized_new (gl.gl_pathc);
ret->uchar_converter = ucnv_open ("UTF-8", &uc_err);
+ ret->short_text_limit = short_text_limit;
g_assert (uc_err == U_ZERO_ERROR);
@@ -247,4 +259,59 @@ rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
else {
ucs_token->len = 0;
}
+}
+
+static void
+rspamd_language_detector_random_select (GPtrArray *ucs_tokens, guint nwords,
+ goffset *offsets_out)
+{
+ guint step_len, remainder, i, out_idx;
+ guint64 coin, sel;
+
+ g_assert (nwords != 0);
+ g_assert (offsets_out != NULL);
+ g_assert (ucs_tokens->len >= nwords);
+ /*
+ * We split input array into `nwords` parts. For each part we randomly select
+ * an element from this particular split. Here is an example:
+ *
+ * nwords=2, input_len=5
+ *
+ * w1 w2 w3 w4 w5
+ * ^ ^
+ * part1 part2
+ * vv vv
+ * w2 w5
+ *
+ * So we have 2 output words from 5 input words selected randomly within
+ * their splits. It is not uniform distribution but it seems to be better
+ * to include words from different text parts
+ */
+ step_len = ucs_tokens->len / nwords;
+ remainder = ucs_tokens->len % nwords;
+
+ out_idx = 0;
+ coin = rspamd_random_uint64_fast ();
+ sel = coin % (step_len + remainder);
+ offsets_out[out_idx] = sel;
+
+ for (i = step_len + remainder; i < ucs_tokens->len;
+ i += step_len, out_idx ++) {
+ coin = rspamd_random_uint64_fast ();
+ sel = (coin % step_len) + i;
+ offsets_out[out_idx] = sel;
+ }
+}
+
+const gchar *
+rspamd_language_detector_detect (struct rspamd_lang_detector *d,
+ GPtrArray *ucs_tokens, gsize words_len)
+{
+ if (words_len < d->short_text_limit) {
+ /* For short text, start directly from trigramms */
+ return rspamd_language_detector_detect_trigramm ();
+ }
+
+ /* Start with unigramms */
+
} \ No newline at end of file
diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h
index c0d05cf1d..f3f16b1ea 100644
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -23,6 +23,11 @@
struct rspamd_lang_detector;
+struct rspamd_lang_detector_res {
+ gdouble prob;
+ const gchar *lang;
+};
+
/**
* Create new language detector object using configuration object
* @param cfg
@@ -40,4 +45,14 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
rspamd_stat_token_t *utf_token,
rspamd_stat_token_t *ucs_token);
+/**
+ * Try to detect language of words
+ * @param d
+ * @param ucs_tokens
+ * @param words_len
+ * @return language code or NULL if language has not been detected
+ */
+const gchar * rspamd_language_detector_detect (struct rspamd_lang_detector *d,
+ GPtrArray *ucs_tokens, gsize words_len);
+
#endif