about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-02-08 19:30:52 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-02-08 19:30:52 +0000
commit: cbf07c7c5a4f8a6ae556eb833c7f130e9810ffbf (patch)
tree: 397cfaeb14d686bc36b5c9dfaba4be3b953f0add
parent: 347cf7c15b0d94bda8677a776c01509a966195e2 (diff)
download: rspamd-cbf07c7c5a4f8a6ae556eb833c7f130e9810ffbf.tar.gz
download: rspamd-cbf07c7c5a4f8a6ae556eb833c7f130e9810ffbf.zip
[Minor] Add extra logic for short texts in language detector
-rw-r--r-- src/libmime/lang_detection.c | 96
1 file changed, 53 insertions, 43 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index e38834999..fadb78d95 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1150,8 +1150,14 @@ rspamd_language_detector_try_ngramm (struct rspamd_task *task,
return rs_detect_multiple;
}
+enum rspamd_language_sort_flags {
+ RSPAMD_LANG_FLAG_DEFAULT = 0,
+ RSPAMD_LANG_FLAG_SHORT = 1 << 0,
+};
+
struct rspamd_frequency_sort_cbdata {
struct rspamd_lang_detector *d;
+ enum rspamd_language_sort_flags flags;
gdouble std;
gdouble mean;
};
@@ -1168,60 +1174,59 @@ rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b,
const struct rspamd_lang_detector_res
*canda = *(const struct rspamd_lang_detector_res **)a,
*candb = *(const struct rspamd_lang_detector_res **)b;
- gdouble diff;
+ gdouble adj;
+ gdouble proba_adjusted, probb_adjusted, freqa, freqb;
- diff = fabs (canda->prob - candb->prob);
+ freqa = ((gdouble)canda->elt->occurencies) /
+ (gdouble)cbd->d->total_occurencies;
+ freqb = ((gdouble)candb->elt->occurencies) /
+ (gdouble)cbd->d->total_occurencies;
- if (diff > cbd->std) {
- /* Generic case */
- if (canda->prob > candb->prob) {
- return -1;
- } else if (candb->prob > canda->prob) {
- return 1;
- }
+ proba_adjusted = canda->prob;
+ probb_adjusted = candb->prob;
- return 0;
+ if (isnormal (freqa) && isnormal (freqb)) {
+ proba_adjusted += cbd->std * (frequency_adjustment * freqa);
+ probb_adjusted += cbd->std * (frequency_adjustment * freqb);
}
- else {
- gdouble proba_adjusted, probb_adjusted, freqa, freqb;
-
- freqa = ((gdouble)canda->elt->occurencies) /
- (gdouble)cbd->d->total_occurencies;
- freqb = ((gdouble)candb->elt->occurencies) /
- (gdouble)cbd->d->total_occurencies;
-
- proba_adjusted = canda->prob;
- probb_adjusted = candb->prob;
-
- if (isnormal (freqa) && isnormal (freqb)) {
- proba_adjusted += cbd->std * (frequency_adjustment * freqa);
- probb_adjusted += cbd->std * (frequency_adjustment * freqb);
- }
- if (canda->elt->flags & RS_LANGUAGE_TIER1) {
- proba_adjusted += cbd->std * tier1_adjustment;
- }
+ if (cbd->flags & RSPAMD_LANG_FLAG_SHORT) {
+ adj = tier1_adjustment * 2.0;
+ }
+ else {
+ adj = tier1_adjustment;
+ }
+ if (canda->elt->flags & RS_LANGUAGE_TIER1) {
+ proba_adjusted += cbd->std * adj;
+ }
- if (candb->elt->flags & RS_LANGUAGE_TIER1) {
- probb_adjusted += cbd->std * tier1_adjustment;
- }
+ if (candb->elt->flags & RS_LANGUAGE_TIER1) {
+ probb_adjusted += cbd->std * adj;
+ }
- if (canda->elt->flags & RS_LANGUAGE_TIER0) {
- proba_adjusted += cbd->std * tier0_adjustment;
- }
+ if (cbd->flags & RSPAMD_LANG_FLAG_SHORT) {
+ adj = tier0_adjustment * 16.0;
+ }
+ else {
+ adj = tier0_adjustment;
+ }
- if (candb->elt->flags & RS_LANGUAGE_TIER0) {
- probb_adjusted += cbd->std * tier0_adjustment;
- }
+ if (canda->elt->flags & RS_LANGUAGE_TIER0) {
+ proba_adjusted += cbd->std * adj;
+ }
- if (proba_adjusted > probb_adjusted) {
- return -1;
- } else if (probb_adjusted > proba_adjusted) {
- return 1;
- }
+ if (candb->elt->flags & RS_LANGUAGE_TIER0) {
+ probb_adjusted += cbd->std * adj;
+ }
- return 0;
+ if (proba_adjusted > probb_adjusted) {
+ return -1;
+ }
+ else if (probb_adjusted > proba_adjusted) {
+ return 1;
}
+
+ return 0;
}
GPtrArray *
@@ -1294,6 +1299,11 @@ rspamd_language_detector_detect (struct rspamd_task *task,
cbd.d = d;
cbd.mean = mean;
cbd.std = std;
+ cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
+
+ if (ucs_tokens->len < default_words / 2) {
+ cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
+ }
}
}