aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/client/rspamc.cxx59
-rw-r--r--src/libserver/cfg_rcl.cxx2
-rw-r--r--src/libstat/classifiers/bayes.c63
-rw-r--r--src/plugins/lua/bayes_expiry.lua182
-rw-r--r--test/functional/cases/110_statistics/multiclass_lib.robot61
-rw-r--r--test/functional/configs/multiclass_bayes.conf13
-rw-r--r--test/functional/messages/newsletter.eml56
7 files changed, 306 insertions, 130 deletions
diff --git a/src/client/rspamc.cxx b/src/client/rspamc.cxx
index af88acb33..04bbaeac8 100644
--- a/src/client/rspamc.cxx
+++ b/src/client/rspamc.cxx
@@ -91,6 +91,8 @@ static gboolean skip_attachments = FALSE;
static const char *pubkey = nullptr;
static const char *user_agent = "rspamc";
static const char *files_list = nullptr;
+static const char *queue_id = nullptr;
+static std::string settings;
std::vector<GPid> children;
static GPatternSpec **exclude_compiled = nullptr;
@@ -103,6 +105,11 @@ static gboolean rspamc_password_callback(const char *option_name,
gpointer data,
GError **error);
+static gboolean rspamc_settings_callback(const char *option_name,
+ const char *value,
+ gpointer data,
+ GError **error);
+
static GOptionEntry entries[] =
{
{"connect", 'h', 0, G_OPTION_ARG_STRING, &connect_str,
@@ -183,6 +190,10 @@ static GOptionEntry entries[] =
"Use specific User-Agent instead of \"rspamc\"", nullptr},
{"files-list", '\0', 0, G_OPTION_ARG_FILENAME, &files_list,
"Read one or more newline separated filenames to scan from file", nullptr},
+ {"queue-id", '\0', 0, G_OPTION_ARG_STRING, &queue_id,
+ "Set Queue-ID header for the request", nullptr},
+ {"settings", '\0', 0, G_OPTION_ARG_CALLBACK, (void *) &rspamc_settings_callback,
+ "Set Settings header as JSON/UCL for the request", nullptr},
{nullptr, 0, 0, G_OPTION_ARG_NONE, nullptr, nullptr, nullptr}};
static void rspamc_symbols_output(FILE *out, ucl_object_t *obj);
@@ -567,6 +578,46 @@ rspamc_password_callback(const char *option_name,
return TRUE;
}
+/*
+ * GOption callback for --settings: checks that the argument parses as
+ * JSON/UCL and keeps the raw text in the file-level `settings` string.
+ *
+ * option_name and data are not used. Returns TRUE once `settings` has been
+ * assigned; otherwise fills *error with G_OPTION_ERROR_BAD_VALUE and returns
+ * FALSE without touching `settings`.
+ */
+static gboolean
+rspamc_settings_callback(const char *option_name,
+						 const char *value,
+						 gpointer data,
+						 GError **error)
+{
+	/*
+	 * Reject "" as well as nullptr: an empty string that got past the parser
+	 * would be stored and then silently ignored, since the Settings header is
+	 * only added when `settings` is non-empty.
+	 */
+	if (value == nullptr || *value == '\0') {
+		g_set_error(error, G_OPTION_ERROR, G_OPTION_ERROR_BAD_VALUE,
+					"Settings parameter cannot be empty");
+		return FALSE;
+	}
+
+	// Parse the settings string using UCL to validate it
+	struct ucl_parser *parser = ucl_parser_new(UCL_PARSER_KEY_LOWERCASE);
+	if (!ucl_parser_add_string(parser, value, strlen(value))) {
+		// Read the message before the parser is released
+		auto *ucl_error = ucl_parser_get_error(parser);
+		g_set_error(error, G_OPTION_ERROR, G_OPTION_ERROR_BAD_VALUE,
+					"Invalid JSON/UCL in settings: %s", ucl_error);
+		ucl_parser_free(parser);
+		return FALSE;
+	}
+
+	// Get the parsed object and validate it
+	auto *obj = ucl_parser_get_object(parser);
+	if (obj == nullptr) {
+		g_set_error(error, G_OPTION_ERROR, G_OPTION_ERROR_BAD_VALUE,
+					"Failed to parse settings as JSON/UCL");
+		ucl_parser_free(parser);
+		return FALSE;
+	}
+
+	// Store the validated settings string (the original text, not a re-emitted form)
+	settings = value;
+
+	// The parsed object was only needed for validation
+	ucl_object_unref(obj);
+	ucl_parser_free(parser);
+
+	return TRUE;
+}
+
/*
* Parse command line
*/
@@ -890,6 +941,14 @@ add_options(GQueue *opts)
hdr++;
}
+ if (queue_id != nullptr) {
+ add_client_header(opts, "Queue-Id", queue_id);
+ }
+
+ if (!settings.empty()) {
+ add_client_header(opts, "Settings", settings.c_str());
+ }
+
if (!flagbuf.empty()) {
if (flagbuf.back() == ',') {
flagbuf.pop_back();
diff --git a/src/libserver/cfg_rcl.cxx b/src/libserver/cfg_rcl.cxx
index 3978b23b0..da5845917 100644
--- a/src/libserver/cfg_rcl.cxx
+++ b/src/libserver/cfg_rcl.cxx
@@ -2631,7 +2631,7 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections)
rspamd_rcl_add_default_handler(sub,
"min_prob_strength",
rspamd_rcl_parse_struct_double,
- G_STRUCT_OFFSET(struct rspamd_classifier_config, min_token_hits),
+ G_STRUCT_OFFSET(struct rspamd_classifier_config, min_prob_strength),
0,
"Use only tokens with probability in [0.5 - MPS, 0.5 + MPS]");
rspamd_rcl_add_default_handler(sub,
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 66d84a14d..f851fbb36 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -333,9 +333,17 @@ bayes_classify_token_multiclass(struct rspamd_classifier *ctx,
/* Apply multinomial model for each class */
for (j = 0; j < cl->num_classes; j++) {
+ /* Skip classes with insufficient learns */
+ if (ctx->cfg->min_learns > 0 && cl->class_learns[j] < ctx->cfg->min_learns) {
+ continue;
+ }
+
double class_freq = (double) class_counts[j] / MAX(1.0, (double) cl->class_learns[j]);
double class_prob = PROB_COMBINE(class_freq, total_count, w, 1.0 / cl->num_classes);
+ /* Ensure probability is properly bounded [0, 1] */
+ class_prob = MAX(0.0, MIN(1.0, class_prob));
+
/* Skip probabilities too close to uniform (1/num_classes) */
double uniform_prior = 1.0 / cl->num_classes;
if (fabs(class_prob - uniform_prior) < ctx->cfg->min_prob_strength) {
@@ -428,16 +436,26 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx,
}
}
- /* Check minimum learns requirement */
+ /* Check minimum learns requirement - count viable classes */
+ unsigned int viable_classes = 0;
if (ctx->cfg->min_learns > 0) {
for (i = 0; i < cl.num_classes; i++) {
- if (cl.class_learns[i] < ctx->cfg->min_learns) {
- msg_info_task("not classified as %s. The class needs more "
- "training samples. Currently: %uL; minimum %ud required",
+ if (cl.class_learns[i] >= ctx->cfg->min_learns) {
+ viable_classes++;
+ }
+ else {
+ msg_info_task("class %s excluded from classification: %uL learns < %ud minimum",
cl.class_names[i], cl.class_learns[i], ctx->cfg->min_learns);
- return TRUE;
}
}
+
+ if (viable_classes == 0) {
+ msg_info_task("no classes have sufficient training samples for classification");
+ return TRUE;
+ }
+
+ msg_info_bayes("multiclass classification: %ud/%ud classes have sufficient learns",
+ viable_classes, cl.num_classes);
}
/* Count text tokens */
@@ -535,7 +553,28 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx,
/* Calculate confidence using Fisher method for the winning class */
if (max_log_prob > -300) {
- confidence = 1.0 - inv_chi_square(task, max_log_prob, cl.processed_tokens);
+ if (max_log_prob > 0) {
+ /* Positive log prob means very strong evidence - high confidence */
+ confidence = 0.95; /* High confidence for positive log probabilities */
+ msg_debug_bayes("positive log_prob (%g), setting high confidence", max_log_prob);
+ }
+ else {
+ /* Negative log prob - use Fisher method as intended */
+ double fisher_result = inv_chi_square(task, max_log_prob, cl.processed_tokens);
+ confidence = 1.0 - fisher_result;
+
+ msg_debug_bayes("fisher_result: %g, max_log_prob: %g, condition check: fisher_result > 0.999 = %s, max_log_prob > -50 = %s",
+ fisher_result, max_log_prob,
+ fisher_result > 0.999 ? "true" : "false",
+ max_log_prob > -50 ? "true" : "false");
+
+ /* Handle case where Fisher method indicates extreme confidence */
+ if (fisher_result > 0.999 && max_log_prob > -100) {
+ /* Large magnitude negative log prob means strong evidence */
+ confidence = 0.90;
+ msg_debug_bayes("extreme negative log_prob (%g), setting high confidence", max_log_prob);
+ }
+ }
}
else {
confidence = normalized_probs[winning_class_idx];
@@ -556,6 +595,10 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx,
rspamd_task_set_multiclass_result(task, result);
+ msg_info_bayes("MULTICLASS_RESULT: winning_class='%s', confidence=%.3f, normalized_prob=%.3f, tokens=%uL",
+ cl.class_names[winning_class_idx], confidence,
+ normalized_probs[winning_class_idx], cl.processed_tokens);
+
/* Insert symbol for winning class if confidence is significant */
if (confidence > 0.05) {
char sumbuf[32];
@@ -570,6 +613,8 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx,
if (st->stcf->class_name &&
strcmp(st->stcf->class_name, cl.class_names[winning_class_idx]) == 0) {
+ msg_info_bayes("SYMBOL_INSERT: symbol='%s', final_prob=%.3f, confidence_display='%s'",
+ st->stcf->symbol, final_prob, sumbuf);
rspamd_task_insert_result(task, st->stcf->symbol, final_prob, sumbuf);
break;
}
@@ -581,6 +626,9 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx,
normalized_probs[winning_class_idx],
confidence, cl.processed_tokens);
}
+ else {
+ msg_info_bayes("SYMBOL_SKIPPED: confidence=%.3f <= 0.05, no symbol inserted", confidence);
+ }
return TRUE;
}
@@ -939,6 +987,9 @@ bayes_learn_class(struct rspamd_classifier *ctx,
g_assert(tokens != NULL);
g_assert(class_name != NULL);
+ msg_info_bayes("LEARN_CLASS: class='%s', unlearn=%s, tokens=%ud",
+ class_name, unlearn ? "true" : "false", tokens->len);
+
incrementing = ctx->cfg->flags & RSPAMD_FLAG_CLASSIFIER_INCREMENTING_BACKEND;
/* Count classes and prepare arrays for multi-class learning */
diff --git a/src/plugins/lua/bayes_expiry.lua b/src/plugins/lua/bayes_expiry.lua
index 44ff9dafa..0d78f2272 100644
--- a/src/plugins/lua/bayes_expiry.lua
+++ b/src/plugins/lua/bayes_expiry.lua
@@ -41,32 +41,38 @@ local template = {}
local function check_redis_classifier(cls, cfg)
-- Skip old classifiers
if cls.new_schema then
- local symbol_spam, symbol_ham
+ local class_symbols = {}
+ local class_labels = {}
local expiry = (cls.expiry or cls.expire)
if type(expiry) == 'table' then
expiry = expiry[1]
end
- -- Load symbols from statfiles
+ -- Extract class_labels mapping from classifier config
+ if cls.class_labels then
+ class_labels = cls.class_labels
+ end
+ -- Load symbols from statfiles for multi-class support
local function check_statfile_table(tbl, def_sym)
local symbol = tbl.symbol or def_sym
-
- local spam
- if tbl.spam then
- spam = tbl.spam
- else
- if string.match(symbol:upper(), 'SPAM') then
- spam = true
+ local class_name = tbl.class
+
+ -- Handle legacy spam/ham detection for backward compatibility
+ if not class_name then
+ if tbl.spam ~= nil then
+ class_name = tbl.spam and 'spam' or 'ham'
+ elseif string.match(tostring(symbol):upper(), 'SPAM') then
+ class_name = 'spam'
+ elseif string.match(tostring(symbol):upper(), 'HAM') then
+ class_name = 'ham'
else
- spam = false
+ class_name = def_sym
end
end
- if spam then
- symbol_spam = symbol
- else
- symbol_ham = symbol
+ if class_name then
+ class_symbols[class_name] = symbol
end
end
@@ -87,10 +93,9 @@ local function check_redis_classifier(cls, cfg)
end
end
- if not symbol_spam or not symbol_ham or type(expiry) ~= 'number' then
+ if next(class_symbols) == nil or type(expiry) ~= 'number' then
logger.debugm(N, rspamd_config,
- 'disable expiry for classifier %s: no expiry %s',
- symbol_spam, cls)
+ 'disable expiry for classifier: no class symbols or expiry configured')
return
end
-- Now try to load redis_params if needed
@@ -108,17 +113,16 @@ local function check_redis_classifier(cls, cfg)
end
if redis_params['read_only'] then
- logger.infox(rspamd_config, 'disable expiry for classifier %s: read only redis configuration',
- symbol_spam)
+ logger.infox(rspamd_config, 'disable expiry for classifier: read only redis configuration')
return
end
- logger.debugm(N, rspamd_config, "enabled expiry for %s/%s -> %s expiry",
- symbol_spam, symbol_ham, expiry)
+ logger.debugm(N, rspamd_config, "enabled expiry for classes %s -> %s expiry",
+ table.concat(lutil.keys(class_symbols), ', '), expiry)
table.insert(settings.classifiers, {
- symbol_spam = symbol_spam,
- symbol_ham = symbol_ham,
+ class_symbols = class_symbols,
+ class_labels = class_labels,
redis_params = redis_params,
expiry = expiry
})
@@ -249,12 +253,11 @@ local expiry_script = [[
local keys = ret[2]
local tokens = {}
- -- Tokens occurrences distribution counters
+ -- Dynamic occurrence tracking for all classes
local occur = {
- ham = {},
- spam = {},
total = {}
}
+ local classes_found = {}
-- Expiry step statistics counters
local nelts, extended, discriminated, sum, sum_squares, common, significant,
@@ -264,24 +267,44 @@ local expiry_script = [[
for _,key in ipairs(keys) do
local t = redis.call('TYPE', key)["ok"]
if t == 'hash' then
- local values = redis.call('HMGET', key, 'H', 'S')
- local ham = tonumber(values[1]) or 0
- local spam = tonumber(values[2]) or 0
+ -- Get all hash fields to support multi-class
+ local hash_data = redis.call('HGETALL', key)
+ local class_counts = {}
+ local total = 0
local ttl = redis.call('TTL', key)
+
+ -- Parse hash data into class counts
+ for i = 1, #hash_data, 2 do
+ local class_label = hash_data[i]
+ local count = tonumber(hash_data[i + 1]) or 0
+ class_counts[class_label] = count
+ total = total + count
+
+ -- Track classes we've seen
+ if not classes_found[class_label] then
+ classes_found[class_label] = true
+ occur[class_label] = {}
+ end
+ end
+
tokens[key] = {
- ham,
- spam,
- ttl
+ class_counts = class_counts,
+ total = total,
+ ttl = ttl
}
- local total = spam + ham
+
sum = sum + total
sum_squares = sum_squares + total * total
nelts = nelts + 1
- for k,v in pairs({['ham']=ham, ['spam']=spam, ['total']=total}) do
- if tonumber(v) > 19 then v = 20 end
- occur[k][v] = occur[k][v] and occur[k][v] + 1 or 1
+ -- Update occurrence counters for all classes and total
+ for class_label, count in pairs(class_counts) do
+ local bucket = count > 19 and 20 or count
+ occur[class_label][bucket] = (occur[class_label][bucket] or 0) + 1
end
+
+ local total_bucket = total > 19 and 20 or total
+ occur.total[total_bucket] = (occur.total[total_bucket] or 0) + 1
end
end
@@ -293,9 +316,10 @@ local expiry_script = [[
end
for key,token in pairs(tokens) do
- local ham, spam, ttl = token[1], token[2], tonumber(token[3])
+ local class_counts = token.class_counts
+ local total = token.total
+ local ttl = tonumber(token.ttl)
local threshold = mean
- local total = spam + ham
local function set_ttl()
if expire < 0 then
@@ -310,14 +334,39 @@ local expiry_script = [[
return 0
end
- if total == 0 or math.abs(ham - spam) <= total * ${epsilon_common} then
+ -- Check if token is common (balanced across classes)
+ local is_common = false
+ if total == 0 then
+ is_common = true
+ else
+ -- For multi-class, check if any class dominates significantly
+ local max_count = 0
+ for _, count in pairs(class_counts) do
+ if count > max_count then
+ max_count = count
+ end
+ end
+ -- Token is common if no class has more than (1 - epsilon) of total
+ is_common = (max_count / total) <= (1 - ${epsilon_common})
+ end
+
+ if is_common then
common = common + 1
if ttl > ${common_ttl} then
discriminated = discriminated + 1
redis.call('EXPIRE', key, ${common_ttl})
end
elseif total >= threshold and total > 0 then
- if ham / total > ${significant_factor} or spam / total > ${significant_factor} then
+ -- Check if any class is significant
+ local is_significant = false
+ for _, count in pairs(class_counts) do
+ if count / total > ${significant_factor} then
+ is_significant = true
+ break
+ end
+ end
+
+ if is_significant then
significant = significant + 1
if ttl ~= -1 then
redis.call('PERSIST', key)
@@ -361,33 +410,50 @@ local expiry_script = [[
redis.call('DEL', lock_key)
local occ_distr = {}
- for _,cl in pairs({'ham', 'spam', 'total'}) do
+
+ -- Process all classes found plus total
+ local all_classes = {'total'}
+ for class_label in pairs(classes_found) do
+ table.insert(all_classes, class_label)
+ end
+
+ for _, cl in ipairs(all_classes) do
local occur_key = pattern_sha1 .. '_occurrence_' .. cl
if cursor ~= 0 then
- local n
- for i,v in ipairs(redis.call('HGETALL', occur_key)) do
- if i % 2 == 1 then
- n = tonumber(v)
- else
- occur[cl][n] = occur[cl][n] and occur[cl][n] + v or v
+ local existing_data = redis.call('HGETALL', occur_key)
+ if #existing_data > 0 then
+ for i = 1, #existing_data, 2 do
+ local bucket = tonumber(existing_data[i])
+ local count = tonumber(existing_data[i + 1])
+ if occur[cl] and occur[cl][bucket] then
+ occur[cl][bucket] = occur[cl][bucket] + count
+ elseif occur[cl] then
+ occur[cl][bucket] = count
+ end
end
end
- local str = ''
- if occur[cl][0] ~= nil then
- str = '0:' .. occur[cl][0] .. ','
- end
- for k,v in ipairs(occur[cl]) do
- if k == 20 then k = '>19' end
- str = str .. k .. ':' .. v .. ','
+ if occur[cl] and next(occur[cl]) then
+ local str = ''
+ if occur[cl][0] then
+ str = '0:' .. occur[cl][0] .. ','
+ end
+ for k = 1, 20 do
+ if occur[cl][k] then
+ local label = k == 20 and '>19' or tostring(k)
+ str = str .. label .. ':' .. occur[cl][k] .. ','
+ end
+ end
+ table.insert(occ_distr, cl .. '=' .. str)
+ else
+ table.insert(occ_distr, cl .. '=no_data')
end
- table.insert(occ_distr, str)
else
redis.call('DEL', occur_key)
end
- if next(occur[cl]) ~= nil then
+ if occur[cl] and next(occur[cl]) then
redis.call('HMSET', occur_key, unpack_function(hash2list(occur[cl])))
end
end
@@ -446,8 +512,8 @@ local function expire_step(cls, ev_base, worker)
'%s infrequent (%s %s), %s mean, %s std',
lutil.unpack(d))
if cycle then
- for i, cl in ipairs({ 'in ham', 'in spam', 'total' }) do
- logger.infox(rspamd_config, 'tokens occurrences, %s: {%s}', cl, occ_distr[i])
+ for _, distr_info in ipairs(occ_distr) do
+ logger.infox(rspamd_config, 'tokens occurrences: {%s}', distr_info)
end
end
end
diff --git a/test/functional/cases/110_statistics/multiclass_lib.robot b/test/functional/cases/110_statistics/multiclass_lib.robot
index 4fa4284bb..e8b0b3b64 100644
--- a/test/functional/cases/110_statistics/multiclass_lib.robot
+++ b/test/functional/cases/110_statistics/multiclass_lib.robot
@@ -1,4 +1,5 @@
*** Settings ***
+Library OperatingSystem
Resource lib.robot
*** Variables ***
@@ -6,7 +7,6 @@ ${CONFIG} ${RSPAMD_TESTDIR}/configs/multiclass_bayes.conf
${MESSAGE_HAM} ${RSPAMD_TESTDIR}/messages/ham.eml
${MESSAGE_SPAM} ${RSPAMD_TESTDIR}/messages/spam_message.eml
${MESSAGE_NEWSLETTER} ${RSPAMD_TESTDIR}/messages/newsletter.eml
-${MESSAGE_TRANSACTIONAL} ${RSPAMD_TESTDIR}/messages/transactional.eml
${REDIS_SCOPE} Suite
${RSPAMD_REDIS_SERVER} null
${RSPAMD_SCOPE} Suite
@@ -18,20 +18,24 @@ ${RSPAMD_STATS_PER_USER} ${EMPTY}
*** Keywords ***
Learn Multiclass
[Arguments] ${user} ${class} ${message}
+ # Extract filename from message path for queue-id
+ ${path} ${filename} = Split Path ${message}
IF "${user}"
- ${result} = Run Rspamc -d ${user} -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_class:${class} ${message}
+ ${result} = Run Rspamc -d ${user} --queue-id ${filename} -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_class:${class} ${message}
ELSE
- ${result} = Run Rspamc -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_class:${class} ${message}
+ ${result} = Run Rspamc --queue-id ${filename} -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_class:${class} ${message}
END
Check Rspamc ${result}
Learn Multiclass Legacy
[Arguments] ${user} ${class} ${message}
# Test backward compatibility with old learn_spam/learn_ham commands
+ # Extract filename from message path for queue-id
+ ${path} ${filename} = Split Path ${message}
IF "${user}"
- ${result} = Run Rspamc -d ${user} -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_${class} ${message}
+ ${result} = Run Rspamc -d ${user} --queue-id ${filename} -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_${class} ${message}
ELSE
- ${result} = Run Rspamc -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_${class} ${message}
+ ${result} = Run Rspamc --queue-id ${filename} -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_${class} ${message}
END
Check Rspamc ${result}
@@ -47,7 +51,6 @@ Multiclass Basic Learn Test
Learn Multiclass ${user} spam ${MESSAGE_SPAM}
Learn Multiclass ${user} ham ${MESSAGE_HAM}
Learn Multiclass ${user} newsletter ${MESSAGE_NEWSLETTER}
- Learn Multiclass ${user} transactional ${MESSAGE_TRANSACTIONAL}
# Test classification
Scan File ${MESSAGE_SPAM} &{kwargs}
@@ -59,9 +62,6 @@ Multiclass Basic Learn Test
Scan File ${MESSAGE_NEWSLETTER} &{kwargs}
Expect Symbol BAYES_NEWSLETTER
- Scan File ${MESSAGE_TRANSACTIONAL} &{kwargs}
- Expect Symbol BAYES_TRANSACTIONAL
-
Set Suite Variable ${RSPAMD_STATS_LEARNTEST} 1
Multiclass Legacy Compatibility Test
@@ -111,12 +111,12 @@ Multiclass Cross-Learn Test
Set To Dictionary ${kwargs} Deliver-To=${user}
END
- # Learn newsletter message as transactional
- Learn Multiclass ${user} transactional ${MESSAGE_NEWSLETTER}
+ # Learn newsletter message as ham to test cross-class learning
+ Learn Multiclass ${user} ham ${MESSAGE_NEWSLETTER}
- # Should classify as transactional, not newsletter
+ # Should classify as ham, not newsletter (since we trained it as ham)
Scan File ${MESSAGE_NEWSLETTER} &{kwargs}
- Expect Symbol BAYES_TRANSACTIONAL
+ Expect Symbol BAYES_HAM
Do Not Expect Symbol BAYES_NEWSLETTER
Multiclass Unlearn Test
@@ -148,13 +148,13 @@ Check Multiclass Results
Multiclass Stats Test
# Check that rspamc stat shows learning counts for all classes
${result} = Run Rspamc -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} stat
- Check Rspamc ${result}
+ # Don't use Check Rspamc for stat command as it expects JSON success format
+ Should Be Equal As Integers ${result.rc} 0
# Should show statistics for all classes
- Should Contain ${result.stdout} spam
- Should Contain ${result.stdout} ham
- Should Contain ${result.stdout} newsletter
- Should Contain ${result.stdout} transactional
+ Should Contain ${result.stdout} BAYES_SPAM
+ Should Contain ${result.stdout} BAYES_HAM
+ Should Contain ${result.stdout} BAYES_NEWSLETTER
Multiclass Configuration Migration Test
# Test that old binary config can be automatically migrated
@@ -167,28 +167,3 @@ Multiclass Configuration Migration Test
# Should show deprecation warning but work
Should Contain ${result.stderr} deprecated ignore_case=True
-Multiclass Performance Test
- [Arguments] ${num_messages}=100
- # Test classification performance with multiple classes
- ${start_time} = Get Time epoch
-
- FOR ${i} IN RANGE ${num_messages}
- Scan File ${MESSAGE_SPAM}
- Scan File ${MESSAGE_HAM}
- Scan File ${MESSAGE_NEWSLETTER}
- Scan File ${MESSAGE_TRANSACTIONAL}
- END
-
- ${end_time} = Get Time epoch
- ${duration} = Evaluate ${end_time} - ${start_time}
-
- # Should complete in reasonable time (adjust threshold as needed)
- Should Be True ${duration} < 30 msg=Performance test took ${duration}s, expected < 30s
-
-Multiclass Memory Test
- # Test that memory usage is reasonable for multiclass classification
- ${result} = Run Rspamc -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} stat
- Check Rspamc ${result}
-
- # Extract memory usage if available in stats
- # This is a placeholder - actual implementation would parse memory stats
diff --git a/test/functional/configs/multiclass_bayes.conf b/test/functional/configs/multiclass_bayes.conf
index e58a39056..278aeeee9 100644
--- a/test/functional/configs/multiclass_bayes.conf
+++ b/test/functional/configs/multiclass_bayes.conf
@@ -76,18 +76,12 @@ classifier {
symbol = BAYES_NEWSLETTER;
server = {= env.REDIS_SERVER =}
}
- statfile {
- class = "transactional";
- symbol = BAYES_TRANSACTIONAL;
- server = {= env.REDIS_SERVER =}
- }
# Backend class labels for Redis
class_labels = {
"spam" = "S";
"ham" = "H";
"newsletter" = "N";
- "transactional" = "T";
}
cache {
@@ -106,13 +100,9 @@ classifier {
verdict_mapping = { ham = true };
};
newsletter = {
- symbols = ["NEWSLETTER_HEADER", "BULK_MAIL"];
+ symbols = ["NEWSLETTER_HEADER", "BULK_MAIL", "UNSUBSCRIBE_LINK"];
threshold = 8.0;
};
- transactional = {
- symbols = ["TRANSACTIONAL_MAIL", "PASSWORD_RESET"];
- threshold = 5.0;
- };
};
check_balance = true;
@@ -122,6 +112,7 @@ classifier {
min_learns = 1;
min_tokens = 1;
+ min_token_hits = 1;
min_prob_strength = 0.05;
{% if env.STATS_PER_USER ~= '' %}
diff --git a/test/functional/messages/newsletter.eml b/test/functional/messages/newsletter.eml
index 52e8988b8..93c996956 100644
--- a/test/functional/messages/newsletter.eml
+++ b/test/functional/messages/newsletter.eml
@@ -1,16 +1,50 @@
-From: newsletter@example.com
+From: "Marketing Team" <newsletter@example.com>
To: user@example.org
-Subject: Monthly Newsletter - Special Offers Inside
+Subject: 🎉 Monthly Newsletter - Exclusive Deals & Product Updates!
Date: Thu, 21 Jul 2023 10:00:00 +0000
Message-ID: <newsletter-123@example.com>
MIME-Version: 1.0
-Content-Type: text/plain
+Content-Type: text/html; charset=utf-8
+List-Unsubscribe: <https://example.com/unsubscribe?id=123>
+Precedence: bulk
+X-Mailer: MailChimp/Pro 12.345
-Dear Subscriber,
-
-This is our monthly newsletter with special offers and updates.
-
-Best regards,
-Newsletter Team
-
-Unsubscribe: https://example.com/unsubscribe?id=123 \ No newline at end of file
+<!DOCTYPE html>
+<html>
+<head>
+ <meta charset="utf-8">
+ <title>Monthly Newsletter</title>
+</head>
+<body>
+ <h1>🎉 Exclusive Monthly Offers!</h1>
+
+ <p>Dear Valued Subscriber,</p>
+
+ <p>This month we're excited to bring you our <strong>BIGGEST SALE</strong> of the year!</p>
+
+ <h2>🔥 Hot Deals This Month:</h2>
+ <ul>
+ <li>50% OFF all premium products</li>
+ <li>FREE shipping on orders over $50</li>
+ <li>Buy 2 Get 1 FREE on selected items</li>
+ </ul>
+
+ <p><a href="https://example.com/shop?utm_source=newsletter&utm_campaign=monthly">SHOP NOW</a></p>
+
+ <h2>📱 New Product Launch</h2>
+ <p>Check out our revolutionary new gadget that everyone is talking about!</p>
+
+ <h2>🎁 Refer a Friend</h2>
+ <p>Share this newsletter and both you and your friend get $10 credit!</p>
+
+ <hr>
+
+ <p><small>
+ You're receiving this because you subscribed to our newsletter.<br>
+ <a href="https://example.com/unsubscribe?id=123">Unsubscribe here</a> |
+ <a href="https://example.com/preferences">Update preferences</a><br>
+ Marketing Team, Example Corp<br>
+ 123 Business St, City, State 12345
+ </small></p>
+</body>
+</html> \ No newline at end of file