aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-02-01 08:00:59 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-02-01 08:02:49 +0000
commit c72f016088608c17297335e2bb57e4f405b2b254 (patch)
tree aaac3018560d0590742821fd6e18927517c48bf3
parent e5f405519caf3a6a4fa52999ab72a720d5da96a1 (diff)
download rspamd-c72f016088608c17297335e2bb57e4f405b2b254.tar.gz
rspamd-c72f016088608c17297335e2bb57e4f405b2b254.zip
[Feature] Further improvements in language detection
-rw-r--r-- src/libmime/lang_detection.c | 34
1 files changed, 31 insertions, 3 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 940e7a8e5..2420e1c5f 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -111,6 +111,10 @@ struct rspamd_lang_detector {
rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \
G_STRFUNC, \
__VA_ARGS__)
+#define msg_debug_lang_det_cfg(...) rspamd_conditional_debug_fast (NULL, NULL, \
+ rspamd_langdet_log_id, "langdet", cfg->cfg_pool->tag.uid, \
+ G_STRFUNC, \
+ __VA_ARGS__)
INIT_LOG_MODULE(langdet)
@@ -183,7 +187,8 @@ rspamd_language_detector_ucs_is_latin (UChar *s, gsize len)
gboolean ret = TRUE;
for (i = 0; i < len; i ++) {
- if (!((s[i] >= 'A' && s[i] <= 'Z') || (s[i] >= 'a' && s[i] <= 'z'))) {
+ if (!((s[i] >= 'A' && s[i] <= 'Z') || (s[i] >= 'a' && s[i] <= 'z')
+ || s[i] == ' ')) {
ret = FALSE;
break;
}
@@ -254,6 +259,7 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
struct rspamd_language_ucs_elt {
guint freq;
+ const gchar *utf;
UChar s[0];
};
@@ -286,6 +292,15 @@ rspamd_language_detector_print_flags (struct rspamd_language_elt *elt)
return flags_buf;
}
+/* Comparator used with g_ptr_array_sort: orders ngramm elements by descending freq */
+static gint
+rspamd_language_detector_cmp_ngramm (gconstpointer a, gconstpointer b)
+{
+	const struct rspamd_language_ucs_elt *e1 = *(const struct rspamd_language_ucs_elt * const *)a;
+	const struct rspamd_language_ucs_elt *e2 = *(const struct rspamd_language_ucs_elt * const *)b;
+	return (e2->freq > e1->freq) - (e2->freq < e1->freq); /* no gint cast: correct for any guint */
+}
+
static void
rspamd_language_detector_read_file (struct rspamd_config *cfg,
struct rspamd_lang_detector *d,
@@ -409,6 +424,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
ucs_elt->s, keylen + 1,
key,
keylen, &uc_err);
+ ucs_elt->utf = key;
if (uc_err != U_ZERO_ERROR) {
msg_warn_config ("cannot convert key to unicode: %s",
@@ -464,7 +480,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
}
/* Now, discriminate low frequency ngramms */
- if (ucs_elt->freq < mean + std / 8.0) {
+ if (ucs_elt->freq < mean) {
ucs_elt->freq = 0;
skipped ++;
continue;
@@ -474,6 +490,8 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
loaded ++;
}
+ g_ptr_array_sort (ngramms, rspamd_language_detector_cmp_ngramm);
+
PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
if (ucs_elt->freq > 0) {
rspamd_language_detector_init_ngramm (cfg, d, nelt, ucs_elt->s,
@@ -482,6 +500,14 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
}
}
+	/* Useful for debug: log the first ngramms (at most 10, fewer if the array is shorter) */
+	for (i = 0; i < 10 && i < ngramms->len; i ++) {
+		ucs_elt = g_ptr_array_index (ngramms, i);
+
+		msg_debug_lang_det_cfg ("%s -> %s: %d", nelt->name,
+				ucs_elt->utf, ucs_elt->freq);
+	}
+
g_ptr_array_free (ngramms, TRUE);
msg_info_config ("loaded %s language, %d unigramms, %d trigramms, "
"%d ngramms loaded; "
@@ -754,6 +780,8 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
struct rspamd_ngramm_elt *elt;
struct rspamd_lang_detector_res *cand;
GHashTable *ngramms;
+ /* Ignore an ngramm if it is found in at least this many languages */
+ static const guint languages_cutoff = 10;
switch (type) {
case rs_unigramm:
@@ -767,7 +795,7 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
ar = g_hash_table_lookup (ngramms, window);
- if (ar) {
+ if (ar && ar->len < languages_cutoff) {
PTR_ARRAY_FOREACH (ar, i, elt) {
cand = g_hash_table_lookup (candidates, elt->elt->name);