aboutsummaryrefslogtreecommitdiffstats
path: root/src/libmime
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-01-13 20:13:18 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-01-13 20:13:18 +0000
commit 374afc0e77fdca6bff25c44ebfe467780f461d87 (patch)
tree f772e35d87624563cf3d3027671e1915aa8a5e17 /src/libmime
parent b72c8f94ccbbe8362b38a4a9f35823367ad21a9c (diff)
downloadrspamd-374afc0e77fdca6bff25c44ebfe467780f461d87.tar.gz
rspamd-374afc0e77fdca6bff25c44ebfe467780f461d87.zip
[Fix] Various improvements in language detection
Diffstat (limited to 'src/libmime')
-rw-r--r-- src/libmime/lang_detection.c 162
-rw-r--r-- src/libmime/message.c 12
2 files changed, 132 insertions, 42 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 66901e6b9..ead12b8e8 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -24,7 +24,8 @@
#include <math.h>
static const gsize default_short_text_limit = 200;
-static const gsize default_words = 20;
+static const gsize default_words = 30;
+static const gdouble update_prob = 0.6;
static const gchar *default_languages_path = RSPAMD_PLUGINSDIR "/languages";
struct rspamd_language_elt {
@@ -503,9 +504,17 @@ rspamd_language_detector_update_guess (struct rspamd_lang_detector *d,
/* Split words */
while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
!= -1) {
- if (!rspamd_language_detector_process_ngramm_update (d, window,
- type, candidates)) {
- ret = FALSE;
+
+ if (rspamd_random_double_fast () > update_prob) {
+ if (!rspamd_language_detector_process_ngramm_update (d, window,
+ type, candidates)) {
+ ret = FALSE;
+ }
+ }
+ else {
+ /* Do a full update in case we are missing some candidates */
+ rspamd_language_detector_process_ngramm_full (d, window, type,
+ candidates);
}
}
@@ -576,10 +585,10 @@ rspamd_language_detector_filter_negligible (GHashTable *candidates)
cand = (struct rspamd_lang_detector_res *) v;
/*
- * Probabilities are logarifmic, so if prob1 - prob2 > 4, it means that
+ * Probabilities are logarithmic, so if prob1 - prob2 > 4, it means that
* prob2 is 2^4 less than prob1
*/
- if (max_prob - cand->prob > 256) {
+ if (max_prob - cand->prob > 4) {
g_hash_table_iter_remove (&it);
}
}
@@ -636,55 +645,134 @@ rspamd_language_detector_cmp (gconstpointer a, gconstpointer b)
return 0;
}
+enum rspamd_language_detected_type {
+ rs_detect_none = 0,
+ rs_detect_single,
+ rs_detect_multiple,
+};
+
+static enum rspamd_language_detected_type
+rspamd_language_detector_try_ngramm (struct rspamd_lang_detector *d,
+ GArray *ucs_tokens,
+ enum rspamd_language_gramm_type type,
+ GHashTable *candidates)
+{
+ guint cand_len;
+
+ rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
+ type, TRUE);
+
+ cand_len = g_hash_table_size (candidates);
+
+ if (cand_len == 0) {
+ return rs_detect_none;
+ }
+ else if (cand_len == 1) {
+ return rs_detect_single;
+ }
+
+ return rs_detect_multiple;
+}
+
GPtrArray *
rspamd_language_detector_detect (struct rspamd_lang_detector *d,
GArray *ucs_tokens, gsize words_len)
{
- GHashTable *candidates;
+ GHashTable *candidates, *tcandidates;
GPtrArray *result;
GHashTableIter it;
gpointer k, v;
+ gdouble mean, std;
struct rspamd_lang_detector_res *cand;
- guint cand_len, prev_len;
+ enum rspamd_language_detected_type r;
candidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
NULL, g_free);
if (words_len < d->short_text_limit) {
/* For short text, start directly from trigramms */
- rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
- rs_trigramm, TRUE);
+ r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_trigramm,
+ candidates);
+
+ if (r == rs_detect_none) {
+ r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_bigramm,
+ candidates);
+
+ if (r == rs_detect_none) {
+ r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_unigramm,
+ candidates);
+ }
+ }
}
else {
/* Start with unigramms */
- rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
- rs_unigramm, TRUE);
- cand_len = g_hash_table_size (candidates);
-
- if (cand_len > 1) {
- /* Try bigramms */
- rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
- rs_unigramm, FALSE);
-
- cand_len = g_hash_table_size (candidates);
- if (cand_len > 1) {
- prev_len = cand_len;
- /* Try trigramms */
- GHashTable *ncandidates;
- ncandidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
- NULL, g_free);
- rspamd_language_detector_detect_type (d, ucs_tokens, ncandidates,
- rs_trigramm, TRUE);
- cand_len = g_hash_table_size (ncandidates);
-
- if (cand_len < prev_len) {
- g_hash_table_unref (candidates);
- candidates = ncandidates;
+ r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_unigramm,
+ candidates);
+
+ switch (r) {
+ case rs_detect_none:
+ case rs_detect_single:
+ /* No unigramms found or single set found, no reason to continue */;
+ break;
+ case rs_detect_multiple:
+ /* Try to improve guess */
+ tcandidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
+ NULL, g_free);
+ r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_bigramm,
+ tcandidates);
+
+ switch (r) {
+ case rs_detect_none:
+ /* Revert to unigramms result */
+ g_hash_table_unref (tcandidates);
+ break;
+ case rs_detect_single:
+ /* We have good enough result, return it */
+ g_hash_table_unref (candidates);
+ candidates = tcandidates;
+ break;
+ case rs_detect_multiple:
+ mean = 0.0;
+ std = 0.0;
+ g_hash_table_iter_init (&it, tcandidates);
+
+ /* Check distribution */
+ while (g_hash_table_iter_next (&it, &k, &v)) {
+ cand = (struct rspamd_lang_detector_res *) v;
+ mean += cand->prob;
}
- else {
- /* Not a better guess */
- g_hash_table_unref (ncandidates);
+
+ mean /= g_hash_table_size (tcandidates);
+
+ g_hash_table_iter_init (&it, tcandidates);
+ while (g_hash_table_iter_next (&it, &k, &v)) {
+ gdouble err;
+ cand = (struct rspamd_lang_detector_res *) v;
+ err = cand->prob - mean;
+ std += err * err;
+ }
+
+ std /= g_hash_table_size (tcandidates);
+ g_hash_table_unref (candidates);
+ candidates = tcandidates;
+
+ if (std < mean / 100) {
+ /* Try trigramms */
+ tcandidates = g_hash_table_new_full (rspamd_str_hash,
+ rspamd_str_equal,
+ NULL, g_free);
+
+ r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_trigramm,
+ tcandidates);
+
+ if (r != rs_detect_none) {
+ /* TODO: check if we have better distribution here */
+ g_hash_table_unref (candidates);
+ candidates = tcandidates;
+ }
}
+ break;
}
+ break;
}
}
@@ -694,7 +782,7 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d,
while (g_hash_table_iter_next (&it, &k, &v)) {
cand = (struct rspamd_lang_detector_res *) v;
- msg_err ("%s -> %.2f", cand->lang, cand->prob);
+ msg_debug ("%s -> %.2f", cand->lang, cand->prob);
g_ptr_array_add (result, cand);
g_hash_table_iter_steal (&it);
}
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 4bac77062..2a7801100 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -98,11 +98,13 @@ rspamd_extract_words (struct rspamd_task *task,
for (i = 0; i < part->normalized_words->len; i++) {
w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
-
- rspamd_language_detector_to_ucs (task->lang_det, task->task_pool,
- w, &ucs_w);
- g_array_append_val (part->ucs32_words, ucs_w);
- ucs_len += ucs_w.len;
+ if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+ rspamd_language_detector_to_ucs (task->lang_det,
+ task->task_pool,
+ w, &ucs_w);
+ g_array_append_val (part->ucs32_words, ucs_w);
+ ucs_len += ucs_w.len;
+ }
}
part->languages = rspamd_language_detector_detect (task->lang_det,