about | summary | refs | log | tree | commit | diff | stats
path: root/src/libmime/lang_detection.c
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-08 16:46:49 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-08 16:46:49 +0100
commit: 2ef4d7f2356eb44e71d2906c6ed6ad08c23773c4 (patch)
tree: aded10d9bd89ce19d3fbed4a3a61e6b1e4def05a /src/libmime/lang_detection.c
parent: e81e8ffb30b953c42e52e5bf20d97f820e8b08e0 (diff)
download: rspamd-2ef4d7f2356eb44e71d2906c6ed6ad08c23773c4.tar.gz
download: rspamd-2ef4d7f2356eb44e71d2906c6ed6ad08c23773c4.zip
[Minor] Do not apply ngramms detection for short texts
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r-- src/libmime/lang_detection.c | 103
1 files changed, 56 insertions, 47 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index fbc5f56c9..dfd62d4e5 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -27,7 +27,7 @@
#include <unicode/ustring.h>
#include <math.h>
-static const gsize default_short_text_limit = 200;
+static const gsize default_short_text_limit = 20;
static const gsize default_words = 80;
static const gdouble update_prob = 0.6;
static const gchar *default_languages_path = RSPAMD_PLUGINSDIR "/languages";
@@ -1597,62 +1597,71 @@ rspamd_language_detector_detect (struct rspamd_task *task,
}
if (!ret) {
- candidates = kh_init (rspamd_candidates_hash);
- kh_resize (rspamd_candidates_hash, candidates, 32);
-
- r = rspamd_language_detector_try_ngramm (task,
- default_words,
- d,
- part->utf_words,
- cat,
- candidates);
-
- if (r == rs_detect_none) {
- msg_debug_lang_det ("no trigramms found, fallback to english");
+ if (part->utf_words->len < default_short_text_limit) {
+ r = rs_detect_none;
+ msg_debug_lang_det ("text is too short for trigramms detection: "
+ "%d words; at least %d words required",
+ (int)part->utf_words->len,
+ (int)default_short_text_limit);
rspamd_language_detector_set_language (task, part, "en");
}
- else if (r == rs_detect_multiple) {
- /* Check our guess */
-
- mean = 0.0;
- std = 0.0;
- cand_len = 0;
-
- /* Check distirbution */
- kh_foreach_value (candidates, cand, {
- if (!isnan (cand->prob)) {
- mean += cand->prob;
- cand_len++;
- }
- });
-
- if (cand_len > 0) {
- mean /= cand_len;
-
+ else {
+ candidates = kh_init (rspamd_candidates_hash);
+ kh_resize (rspamd_candidates_hash, candidates, 32);
+
+ r = rspamd_language_detector_try_ngramm (task,
+ default_words,
+ d,
+ part->utf_words,
+ cat,
+ candidates);
+
+ if (r == rs_detect_none) {
+ msg_debug_lang_det ("no trigramms found, fallback to english");
+ rspamd_language_detector_set_language (task, part, "en");
+ } else if (r == rs_detect_multiple) {
+ /* Check our guess */
+
+ mean = 0.0;
+ std = 0.0;
+ cand_len = 0;
+
+ /* Check distirbution */
kh_foreach_value (candidates, cand, {
- gdouble err;
if (!isnan (cand->prob)) {
- err = cand->prob - mean;
- std += fabs (err);
+ mean += cand->prob;
+ cand_len++;
}
});
- std /= cand_len;
- }
+ if (cand_len > 0) {
+ mean /= cand_len;
- msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev",
- cand_len, mean, std);
+ kh_foreach_value (candidates, cand, {
+ gdouble err;
+ if (!isnan (cand->prob)) {
+ err = cand->prob - mean;
+ std += fabs (err);
+ }
+ });
- if (cand_len > 0 && std / fabs (mean) < 0.25) {
- msg_debug_lang_det ("apply frequency heuristic sorting");
- frequency_heuristic_applied = TRUE;
- cbd.d = d;
- cbd.mean = mean;
- cbd.std = std;
- cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
+ std /= cand_len;
+ }
- if (part->utf_words->len < default_words / 2) {
- cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
+ msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev",
+ cand_len, mean, std);
+
+ if (cand_len > 0 && std / fabs (mean) < 0.25) {
+ msg_debug_lang_det ("apply frequency heuristic sorting");
+ frequency_heuristic_applied = TRUE;
+ cbd.d = d;
+ cbd.mean = mean;
+ cbd.std = std;
+ cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
+
+ if (part->utf_words->len < default_words / 2) {
+ cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
+ }
}
}
}