about | summary | refs | log | tree | commit | diff | stats
path: root/src/libmime/lang_detection.c
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@rspamd.com> 2024-01-18 14:00:44 +0000
committer Vsevolod Stakhov <vsevolod@rspamd.com> 2024-01-18 14:00:44 +0000
commit 1fee9a39017d946152eab4f29e0d143db13ae951 (patch)
tree d57687863ff00a6ef149843d7023850d8613fe61 /src/libmime/lang_detection.c
parent 5570daed2d0faafbde564cae36a0b4764e266c4a (diff)
download rspamd-1fee9a39017d946152eab4f29e0d143db13ae951.tar.gz
rspamd-1fee9a39017d946152eab4f29e0d143db13ae951.zip
[Fix] Make words selection random deterministic upon content
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r-- src/libmime/lang_detection.c 31
1 file changed, 19 insertions, 12 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index aa5447c8b..c44aa2b04 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Vsevolod Stakhov
+ * Copyright 2024 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -359,7 +359,7 @@ rspamd_language_detector_read_file(struct rspamd_config *cfg,
khash_t(rspamd_trigram_hash) *htb = NULL;
gchar *pos;
guint total = 0, total_latin = 0, total_ngramms = 0, i, skipped,
- loaded, nstop = 0;
+ loaded;
gdouble mean = 0, std = 0, delta = 0, delta2 = 0, m2 = 0;
enum rspamd_language_category cat = RSPAMD_LANGUAGE_MAX;
@@ -492,7 +492,6 @@ rspamd_language_detector_read_file(struct rspamd_config *cfg,
word, wlen,
mp_flags);
nelt->stop_words++;
- nstop++;
/* Also lemmatise and store normalised */
if (stem) {
@@ -938,7 +937,8 @@ end:
static void
rspamd_language_detector_random_select(GArray *ucs_tokens, guint nwords,
- goffset *offsets_out)
+ goffset *offsets_out,
+ guint64 *seed)
{
guint step_len, remainder, i, out_idx;
guint64 coin, sel;
@@ -967,14 +967,14 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, guint nwords,
remainder = ucs_tokens->len % nwords;
out_idx = 0;
- coin = rspamd_random_uint64_fast();
+ coin = rspamd_random_uint64_fast_seed(seed);
sel = coin % (step_len + remainder);
offsets_out[out_idx] = sel;
for (i = step_len + remainder; i < ucs_tokens->len;
i += step_len, out_idx++) {
guint ntries = 0;
- coin = rspamd_random_uint64_fast();
+ coin = rspamd_random_uint64_fast_seed(seed);
sel = (coin % step_len) + i;
for (;;) {
@@ -990,7 +990,7 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, guint nwords,
}
else {
ntries++;
- coin = rspamd_random_uint64_fast();
+ coin = rspamd_random_uint64_fast_seed(seed);
if (ntries < step_len) {
sel = (coin % step_len) + i;
@@ -1225,15 +1225,19 @@ rspamd_language_detector_detect_type(struct rspamd_task *task,
struct rspamd_lang_detector *d,
GArray *words,
enum rspamd_language_category cat,
- khash_t(rspamd_candidates_hash) * candidates)
+ khash_t(rspamd_candidates_hash) * candidates,
+ struct rspamd_mime_text_part *part)
{
guint nparts = MIN(words->len, nwords);
goffset *selected_words;
rspamd_stat_token_t *tok;
guint i;
+ guint64 seed;
+ /* Seed PRNG with part digest to provide some sort of determinism */
+ memcpy(&seed, part->mime_part->digest, sizeof(seed));
selected_words = g_new0(goffset, nparts);
- rspamd_language_detector_random_select(words, nparts, selected_words);
+ rspamd_language_detector_random_select(words, nparts, selected_words, &seed);
msg_debug_lang_det("randomly selected %d words", nparts);
for (i = 0; i < nparts; i++) {
@@ -1280,7 +1284,8 @@ rspamd_language_detector_try_ngramm(struct rspamd_task *task,
struct rspamd_lang_detector *d,
GArray *ucs_tokens,
enum rspamd_language_category cat,
- khash_t(rspamd_candidates_hash) * candidates)
+ khash_t(rspamd_candidates_hash) * candidates,
+ struct rspamd_mime_text_part *part)
{
guint cand_len = 0;
struct rspamd_lang_detector_res *cand;
@@ -1290,7 +1295,8 @@ rspamd_language_detector_try_ngramm(struct rspamd_task *task,
d,
ucs_tokens,
cat,
- candidates);
+ candidates,
+ part);
kh_foreach_value(candidates, cand, {
if (!isnan(cand->prob)) {
@@ -1931,7 +1937,8 @@ rspamd_language_detector_detect(struct rspamd_task *task,
d,
part->utf_words,
cat,
- candidates);
+ candidates,
+ part);
if (r == rs_detect_none) {
msg_debug_lang_det("no trigrams found, fallback to english");