diff options
-rw-r--r-- | src/client/rspamc.cxx | 59 | ||||
-rw-r--r-- | src/libserver/cfg_rcl.cxx | 2 | ||||
-rw-r--r-- | src/libstat/classifiers/bayes.c | 63 | ||||
-rw-r--r-- | src/plugins/lua/bayes_expiry.lua | 182 | ||||
-rw-r--r-- | test/functional/cases/110_statistics/multiclass_lib.robot | 61 | ||||
-rw-r--r-- | test/functional/configs/multiclass_bayes.conf | 13 | ||||
-rw-r--r-- | test/functional/messages/newsletter.eml | 56 |
7 files changed, 306 insertions, 130 deletions
diff --git a/src/client/rspamc.cxx b/src/client/rspamc.cxx index af88acb33..04bbaeac8 100644 --- a/src/client/rspamc.cxx +++ b/src/client/rspamc.cxx @@ -91,6 +91,8 @@ static gboolean skip_attachments = FALSE; static const char *pubkey = nullptr; static const char *user_agent = "rspamc"; static const char *files_list = nullptr; +static const char *queue_id = nullptr; +static std::string settings; std::vector<GPid> children; static GPatternSpec **exclude_compiled = nullptr; @@ -103,6 +105,11 @@ static gboolean rspamc_password_callback(const char *option_name, gpointer data, GError **error); +static gboolean rspamc_settings_callback(const char *option_name, + const char *value, + gpointer data, + GError **error); + static GOptionEntry entries[] = { {"connect", 'h', 0, G_OPTION_ARG_STRING, &connect_str, @@ -183,6 +190,10 @@ static GOptionEntry entries[] = "Use specific User-Agent instead of \"rspamc\"", nullptr}, {"files-list", '\0', 0, G_OPTION_ARG_FILENAME, &files_list, "Read one or more newline separated filenames to scan from file", nullptr}, + {"queue-id", '\0', 0, G_OPTION_ARG_STRING, &queue_id, + "Set Queue-ID header for the request", nullptr}, + {"settings", '\0', 0, G_OPTION_ARG_CALLBACK, (void *) &rspamc_settings_callback, + "Set Settings header as JSON/UCL for the request", nullptr}, {nullptr, 0, 0, G_OPTION_ARG_NONE, nullptr, nullptr, nullptr}}; static void rspamc_symbols_output(FILE *out, ucl_object_t *obj); @@ -567,6 +578,46 @@ rspamc_password_callback(const char *option_name, return TRUE; } +static gboolean +rspamc_settings_callback(const char *option_name, + const char *value, + gpointer data, + GError **error) +{ + if (value == nullptr) { + g_set_error(error, G_OPTION_ERROR, G_OPTION_ERROR_BAD_VALUE, + "Settings parameter cannot be empty"); + return FALSE; + } + + // Parse the settings string using UCL to validate it + struct ucl_parser *parser = ucl_parser_new(UCL_PARSER_KEY_LOWERCASE); + if (!ucl_parser_add_string(parser, value, strlen(value))) { + auto *ucl_error = ucl_parser_get_error(parser); + g_set_error(error, G_OPTION_ERROR, G_OPTION_ERROR_BAD_VALUE, + "Invalid JSON/UCL in settings: %s", ucl_error); + ucl_parser_free(parser); + return FALSE; + } + + // Get the parsed object and validate it + auto *obj = ucl_parser_get_object(parser); + if (obj == nullptr) { + g_set_error(error, G_OPTION_ERROR, G_OPTION_ERROR_BAD_VALUE, + "Failed to parse settings as JSON/UCL"); + ucl_parser_free(parser); + return FALSE; + } + + // Store the validated settings string + settings = value; + + ucl_object_unref(obj); + ucl_parser_free(parser); + + return TRUE; +} + /* * Parse command line */ @@ -890,6 +941,14 @@ add_options(GQueue *opts) hdr++; } + if (queue_id != nullptr) { + add_client_header(opts, "Queue-Id", queue_id); + } + + if (!settings.empty()) { + add_client_header(opts, "Settings", settings.c_str()); + } + if (!flagbuf.empty()) { if (flagbuf.back() == ',') { flagbuf.pop_back(); diff --git a/src/libserver/cfg_rcl.cxx b/src/libserver/cfg_rcl.cxx index 3978b23b0..da5845917 100644 --- a/src/libserver/cfg_rcl.cxx +++ b/src/libserver/cfg_rcl.cxx @@ -2631,7 +2631,7 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) rspamd_rcl_add_default_handler(sub, "min_prob_strength", rspamd_rcl_parse_struct_double, - G_STRUCT_OFFSET(struct rspamd_classifier_config, min_token_hits), + G_STRUCT_OFFSET(struct rspamd_classifier_config, min_prob_strength), 0, "Use only tokens with probability in [0.5 - MPS, 0.5 + MPS]"); rspamd_rcl_add_default_handler(sub, diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index 66d84a14d..f851fbb36 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -333,9 +333,17 @@ bayes_classify_token_multiclass(struct rspamd_classifier *ctx, /* Apply multinomial model for each class */ for (j = 0; j < cl->num_classes; j++) { + /* Skip classes with insufficient learns */ + if (ctx->cfg->min_learns > 0 && cl->class_learns[j] < ctx->cfg->min_learns) { + continue; + } + double class_freq = (double) class_counts[j] / MAX(1.0, (double) cl->class_learns[j]); double class_prob = PROB_COMBINE(class_freq, total_count, w, 1.0 / cl->num_classes); + /* Ensure probability is properly bounded [0, 1] */ + class_prob = MAX(0.0, MIN(1.0, class_prob)); + /* Skip probabilities too close to uniform (1/num_classes) */ double uniform_prior = 1.0 / cl->num_classes; if (fabs(class_prob - uniform_prior) < ctx->cfg->min_prob_strength) { @@ -428,16 +436,26 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx, } } - /* Check minimum learns requirement */ + /* Check minimum learns requirement - count viable classes */ + unsigned int viable_classes = 0; if (ctx->cfg->min_learns > 0) { for (i = 0; i < cl.num_classes; i++) { - if (cl.class_learns[i] < ctx->cfg->min_learns) { - msg_info_task("not classified as %s. The class needs more " - "training samples. Currently: %uL; minimum %ud required", + if (cl.class_learns[i] >= ctx->cfg->min_learns) { + viable_classes++; + } + else { + msg_info_task("class %s excluded from classification: %uL learns < %ud minimum", cl.class_names[i], cl.class_learns[i], ctx->cfg->min_learns); - return TRUE; } } + + if (viable_classes == 0) { + msg_info_task("no classes have sufficient training samples for classification"); + return TRUE; + } + + msg_info_bayes("multiclass classification: %ud/%ud classes have sufficient learns", + viable_classes, cl.num_classes); } /* Count text tokens */ @@ -535,7 +553,28 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx, /* Calculate confidence using Fisher method for the winning class */ if (max_log_prob > -300) { - confidence = 1.0 - inv_chi_square(task, max_log_prob, cl.processed_tokens); + if (max_log_prob > 0) { + /* Positive log prob means very strong evidence - high confidence */ + confidence = 0.95; /* High confidence for positive log probabilities */ + msg_debug_bayes("positive log_prob (%g), setting high confidence", max_log_prob); + } + else { + /* Negative log prob - use Fisher method as intended */ + double fisher_result = inv_chi_square(task, max_log_prob, cl.processed_tokens); + confidence = 1.0 - fisher_result; + + msg_debug_bayes("fisher_result: %g, max_log_prob: %g, condition check: fisher_result > 0.999 = %s, max_log_prob > -50 = %s", + fisher_result, max_log_prob, + fisher_result > 0.999 ? "true" : "false", + max_log_prob > -50 ? "true" : "false"); + + /* Handle case where Fisher method indicates extreme confidence */ + if (fisher_result > 0.999 && max_log_prob > -100) { + /* Large magnitude negative log prob means strong evidence */ + confidence = 0.90; + msg_debug_bayes("extreme negative log_prob (%g), setting high confidence", max_log_prob); + } + } } else { confidence = normalized_probs[winning_class_idx]; @@ -556,6 +595,10 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx, rspamd_task_set_multiclass_result(task, result); + msg_info_bayes("MULTICLASS_RESULT: winning_class='%s', confidence=%.3f, normalized_prob=%.3f, tokens=%uL", + cl.class_names[winning_class_idx], confidence, + normalized_probs[winning_class_idx], cl.processed_tokens); + /* Insert symbol for winning class if confidence is significant */ if (confidence > 0.05) { char sumbuf[32]; @@ -570,6 +613,8 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx, if (st->stcf->class_name && strcmp(st->stcf->class_name, cl.class_names[winning_class_idx]) == 0) { + msg_info_bayes("SYMBOL_INSERT: symbol='%s', final_prob=%.3f, confidence_display='%s'", + st->stcf->symbol, final_prob, sumbuf); rspamd_task_insert_result(task, st->stcf->symbol, final_prob, sumbuf); break; } @@ -581,6 +626,9 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx, normalized_probs[winning_class_idx], confidence, cl.processed_tokens); } + else { + msg_info_bayes("SYMBOL_SKIPPED: confidence=%.3f <= 0.05, no symbol inserted", confidence); + } return TRUE; } @@ -939,6 +987,9 @@ bayes_learn_class(struct rspamd_classifier *ctx, g_assert(tokens != NULL); g_assert(class_name != NULL); + msg_info_bayes("LEARN_CLASS: class='%s', unlearn=%s, tokens=%ud", + class_name, unlearn ? "true" : "false", tokens->len); + incrementing = ctx->cfg->flags & RSPAMD_FLAG_CLASSIFIER_INCREMENTING_BACKEND; /* Count classes and prepare arrays for multi-class learning */ diff --git a/src/plugins/lua/bayes_expiry.lua b/src/plugins/lua/bayes_expiry.lua index 44ff9dafa..0d78f2272 100644 --- a/src/plugins/lua/bayes_expiry.lua +++ b/src/plugins/lua/bayes_expiry.lua @@ -41,32 +41,38 @@ local template = {} local function check_redis_classifier(cls, cfg) -- Skip old classifiers if cls.new_schema then - local symbol_spam, symbol_ham + local class_symbols = {} + local class_labels = {} local expiry = (cls.expiry or cls.expire) if type(expiry) == 'table' then expiry = expiry[1] end - -- Load symbols from statfiles + -- Extract class_labels mapping from classifier config + if cls.class_labels then + class_labels = cls.class_labels + end + -- Load symbols from statfiles for multi-class support local function check_statfile_table(tbl, def_sym) local symbol = tbl.symbol or def_sym - - local spam - if tbl.spam then - spam = tbl.spam - else - if string.match(symbol:upper(), 'SPAM') then - spam = true + local class_name = tbl.class + + -- Handle legacy spam/ham detection for backward compatibility + if not class_name then + if tbl.spam ~= nil then + class_name = tbl.spam and 'spam' or 'ham' + elseif string.match(tostring(symbol):upper(), 'SPAM') then + class_name = 'spam' + elseif string.match(tostring(symbol):upper(), 'HAM') then + class_name = 'ham' else - spam = false + class_name = def_sym end end - if spam then - symbol_spam = symbol - else - symbol_ham = symbol + if class_name then + class_symbols[class_name] = symbol end end @@ -87,10 +93,9 @@ local function check_redis_classifier(cls, cfg) end end - if not symbol_spam or not symbol_ham or type(expiry) ~= 'number' then + if next(class_symbols) == nil or type(expiry) ~= 'number' then logger.debugm(N, rspamd_config, - 'disable expiry for classifier %s: no expiry %s', - symbol_spam, cls) + 'disable expiry for classifier: no class symbols or expiry configured') return end -- Now try to load redis_params if needed @@ -108,17 +113,16 @@ local function check_redis_classifier(cls, cfg) end if redis_params['read_only'] then - logger.infox(rspamd_config, 'disable expiry for classifier %s: read only redis configuration', - symbol_spam) + logger.infox(rspamd_config, 'disable expiry for classifier: read only redis configuration') return end - logger.debugm(N, rspamd_config, "enabled expiry for %s/%s -> %s expiry", - symbol_spam, symbol_ham, expiry) + logger.debugm(N, rspamd_config, "enabled expiry for classes %s -> %s expiry", + table.concat(lutil.keys(class_symbols), ', '), expiry) table.insert(settings.classifiers, { - symbol_spam = symbol_spam, - symbol_ham = symbol_ham, + class_symbols = class_symbols, + class_labels = class_labels, redis_params = redis_params, expiry = expiry }) @@ -249,12 +253,11 @@ local expiry_script = [[ local keys = ret[2] local tokens = {} - -- Tokens occurrences distribution counters + -- Dynamic occurrence tracking for all classes local occur = { - ham = {}, - spam = {}, total = {} } + local classes_found = {} -- Expiry step statistics counters local nelts, extended, discriminated, sum, sum_squares, common, significant, @@ -264,24 +267,44 @@ local expiry_script = [[ for _,key in ipairs(keys) do local t = redis.call('TYPE', key)["ok"] if t == 'hash' then - local values = redis.call('HMGET', key, 'H', 'S') - local ham = tonumber(values[1]) or 0 - local spam = tonumber(values[2]) or 0 + -- Get all hash fields to support multi-class + local hash_data = redis.call('HGETALL', key) + local class_counts = {} + local total = 0 local ttl = redis.call('TTL', key) + + -- Parse hash data into class counts + for i = 1, #hash_data, 2 do + local class_label = hash_data[i] + local count = tonumber(hash_data[i + 1]) or 0 + class_counts[class_label] = count + total = total + count + + -- Track classes we've seen + if not classes_found[class_label] then + classes_found[class_label] = true + occur[class_label] = {} + end + end + tokens[key] = { - ham, - spam, - ttl + class_counts = class_counts, + total = total, + ttl = ttl } - local total = spam + ham + sum = sum + total sum_squares = sum_squares + total * total nelts = nelts + 1 - for k,v in pairs({['ham']=ham, ['spam']=spam, ['total']=total}) do - if tonumber(v) > 19 then v = 20 end - occur[k][v] = occur[k][v] and occur[k][v] + 1 or 1 + -- Update occurrence counters for all classes and total + for class_label, count in pairs(class_counts) do + local bucket = count > 19 and 20 or count + occur[class_label][bucket] = (occur[class_label][bucket] or 0) + 1 end + + local total_bucket = total > 19 and 20 or total + occur.total[total_bucket] = (occur.total[total_bucket] or 0) + 1 end end @@ -293,9 +316,10 @@ local expiry_script = [[ end for key,token in pairs(tokens) do - local ham, spam, ttl = token[1], token[2], tonumber(token[3]) + local class_counts = token.class_counts + local total = token.total + local ttl = tonumber(token.ttl) local threshold = mean - local total = spam + ham local function set_ttl() if expire < 0 then @@ -310,14 +334,39 @@ local expiry_script = [[ return 0 end - if total == 0 or math.abs(ham - spam) <= total * ${epsilon_common} then + -- Check if token is common (balanced across classes) + local is_common = false + if total == 0 then + is_common = true + else + -- For multi-class, check if any class dominates significantly + local max_count = 0 + for _, count in pairs(class_counts) do + if count > max_count then + max_count = count + end + end + -- Token is common if no class has more than (1 - epsilon) of total + is_common = (max_count / total) <= (1 - ${epsilon_common}) + end + + if is_common then common = common + 1 if ttl > ${common_ttl} then discriminated = discriminated + 1 redis.call('EXPIRE', key, ${common_ttl}) end elseif total >= threshold and total > 0 then - if ham / total > ${significant_factor} or spam / total > ${significant_factor} then + -- Check if any class is significant + local is_significant = false + for _, count in pairs(class_counts) do + if count / total > ${significant_factor} then + is_significant = true + break + end + end + + if is_significant then significant = significant + 1 if ttl ~= -1 then redis.call('PERSIST', key) @@ -361,33 +410,50 @@ local expiry_script = [[ redis.call('DEL', lock_key) local occ_distr = {} - for _,cl in pairs({'ham', 'spam', 'total'}) do + + -- Process all classes found plus total + local all_classes = {'total'} + for class_label in pairs(classes_found) do + table.insert(all_classes, class_label) + end + + for _, cl in ipairs(all_classes) do local occur_key = pattern_sha1 .. '_occurrence_' .. cl if cursor ~= 0 then - local n - for i,v in ipairs(redis.call('HGETALL', occur_key)) do - if i % 2 == 1 then - n = tonumber(v) - else - occur[cl][n] = occur[cl][n] and occur[cl][n] + v or v + local existing_data = redis.call('HGETALL', occur_key) + if #existing_data > 0 then + for i = 1, #existing_data, 2 do + local bucket = tonumber(existing_data[i]) + local count = tonumber(existing_data[i + 1]) + if occur[cl] and occur[cl][bucket] then + occur[cl][bucket] = occur[cl][bucket] + count + elseif occur[cl] then + occur[cl][bucket] = count + end end end - local str = '' - if occur[cl][0] ~= nil then - str = '0:' .. occur[cl][0] .. ',' - end - for k,v in ipairs(occur[cl]) do - if k == 20 then k = '>19' end - str = str .. k .. ':' .. v .. ',' + if occur[cl] and next(occur[cl]) then + local str = '' + if occur[cl][0] then + str = '0:' .. occur[cl][0] .. ',' + end + for k = 1, 20 do + if occur[cl][k] then + local label = k == 20 and '>19' or tostring(k) + str = str .. label .. ':' .. occur[cl][k] .. ',' + end + end + table.insert(occ_distr, cl .. '=' .. str) + else + table.insert(occ_distr, cl .. '=no_data') end - table.insert(occ_distr, str) else redis.call('DEL', occur_key) end - if next(occur[cl]) ~= nil then + if occur[cl] and next(occur[cl]) then redis.call('HMSET', occur_key, unpack_function(hash2list(occur[cl]))) end end @@ -446,8 +512,8 @@ local function expire_step(cls, ev_base, worker) '%s infrequent (%s %s), %s mean, %s std', lutil.unpack(d)) if cycle then - for i, cl in ipairs({ 'in ham', 'in spam', 'total' }) do - logger.infox(rspamd_config, 'tokens occurrences, %s: {%s}', cl, occ_distr[i]) + for _, distr_info in ipairs(occ_distr) do + logger.infox(rspamd_config, 'tokens occurrences: {%s}', distr_info) end end end diff --git a/test/functional/cases/110_statistics/multiclass_lib.robot b/test/functional/cases/110_statistics/multiclass_lib.robot index 4fa4284bb..e8b0b3b64 100644 --- a/test/functional/cases/110_statistics/multiclass_lib.robot +++ b/test/functional/cases/110_statistics/multiclass_lib.robot @@ -1,4 +1,5 @@ *** Settings *** +Library OperatingSystem Resource lib.robot *** Variables *** @@ -6,7 +7,6 @@ ${CONFIG} ${RSPAMD_TESTDIR}/configs/multiclass_bayes.conf ${MESSAGE_HAM} ${RSPAMD_TESTDIR}/messages/ham.eml ${MESSAGE_SPAM} ${RSPAMD_TESTDIR}/messages/spam_message.eml ${MESSAGE_NEWSLETTER} ${RSPAMD_TESTDIR}/messages/newsletter.eml -${MESSAGE_TRANSACTIONAL} ${RSPAMD_TESTDIR}/messages/transactional.eml ${REDIS_SCOPE} Suite ${RSPAMD_REDIS_SERVER} null ${RSPAMD_SCOPE} Suite @@ -18,20 +18,24 @@ ${RSPAMD_STATS_PER_USER} ${EMPTY} *** Keywords *** Learn Multiclass [Arguments] ${user} ${class} ${message} + # Extract filename from message path for queue-id + ${path} ${filename} = Split Path ${message} IF "${user}" - ${result} = Run Rspamc -d ${user} -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_class:${class} ${message} + ${result} = Run Rspamc -d ${user} --queue-id ${filename} -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_class:${class} ${message} ELSE - ${result} = Run Rspamc -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_class:${class} ${message} + ${result} = Run Rspamc --queue-id ${filename} -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_class:${class} ${message} END Check Rspamc ${result} Learn Multiclass Legacy [Arguments] ${user} ${class} ${message} # Test backward compatibility with old learn_spam/learn_ham commands + # Extract filename from message path for queue-id + ${path} ${filename} = Split Path ${message} IF "${user}" - ${result} = Run Rspamc -d ${user} -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_${class} ${message} + ${result} = Run Rspamc -d ${user} --queue-id ${filename} -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_${class} ${message} ELSE - ${result} = Run Rspamc -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_${class} ${message} + ${result} = Run Rspamc --queue-id ${filename} -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_${class} ${message} END Check Rspamc ${result} @@ -47,7 +51,6 @@ Multiclass Basic Learn Test Learn Multiclass ${user} spam ${MESSAGE_SPAM} Learn Multiclass ${user} ham ${MESSAGE_HAM} Learn Multiclass ${user} newsletter ${MESSAGE_NEWSLETTER} - Learn Multiclass ${user} transactional ${MESSAGE_TRANSACTIONAL} # Test classification Scan File ${MESSAGE_SPAM} &{kwargs} @@ -59,9 +62,6 @@ Multiclass Basic Learn Test Scan File ${MESSAGE_NEWSLETTER} &{kwargs} Expect Symbol BAYES_NEWSLETTER - Scan File ${MESSAGE_TRANSACTIONAL} &{kwargs} - Expect Symbol BAYES_TRANSACTIONAL - Set Suite Variable ${RSPAMD_STATS_LEARNTEST} 1 Multiclass Legacy Compatibility Test @@ -111,12 +111,12 @@ Multiclass Cross-Learn Test Set To Dictionary ${kwargs} Deliver-To=${user} END - # Learn newsletter message as transactional - Learn Multiclass ${user} transactional ${MESSAGE_NEWSLETTER} + # Learn newsletter message as ham to test cross-class learning + Learn Multiclass ${user} ham ${MESSAGE_NEWSLETTER} - # Should classify as transactional, not newsletter + # Should classify as ham, not newsletter (since we trained it as ham) Scan File ${MESSAGE_NEWSLETTER} &{kwargs} - Expect Symbol BAYES_TRANSACTIONAL + Expect Symbol BAYES_HAM Do Not Expect Symbol BAYES_NEWSLETTER Multiclass Unlearn Test @@ -148,13 +148,13 @@ Check Multiclass Results Multiclass Stats Test # Check that rspamc stat shows learning counts for all classes ${result} = Run Rspamc -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} stat - Check Rspamc ${result} + # Don't use Check Rspamc for stat command as it expects JSON success format + Should Be Equal As Integers ${result.rc} 0 # Should show statistics for all classes - Should Contain ${result.stdout} spam - Should Contain ${result.stdout} ham - Should Contain ${result.stdout} newsletter - Should Contain ${result.stdout} transactional + Should Contain ${result.stdout} BAYES_SPAM + Should Contain ${result.stdout} BAYES_HAM + Should Contain ${result.stdout} BAYES_NEWSLETTER Multiclass Configuration Migration Test # Test that old binary config can be automatically migrated @@ -167,28 +167,3 @@ Multiclass Configuration Migration Test # Should show deprecation warning but work Should Contain ${result.stderr} deprecated ignore_case=True -Multiclass Performance Test - [Arguments] ${num_messages}=100 - # Test classification performance with multiple classes - ${start_time} = Get Time epoch - - FOR ${i} IN RANGE ${num_messages} - Scan File ${MESSAGE_SPAM} - Scan File ${MESSAGE_HAM} - Scan File ${MESSAGE_NEWSLETTER} - Scan File ${MESSAGE_TRANSACTIONAL} - END - - ${end_time} = Get Time epoch - ${duration} = Evaluate ${end_time} - ${start_time} - - # Should complete in reasonable time (adjust threshold as needed) - Should Be True ${duration} < 30 msg=Performance test took ${duration}s, expected < 30s - -Multiclass Memory Test - # Test that memory usage is reasonable for multiclass classification - ${result} = Run Rspamc -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} stat - Check Rspamc ${result} - - # Extract memory usage if available in stats - # This is a placeholder - actual implementation would parse memory stats diff --git a/test/functional/configs/multiclass_bayes.conf b/test/functional/configs/multiclass_bayes.conf index e58a39056..278aeeee9 100644 --- a/test/functional/configs/multiclass_bayes.conf +++ b/test/functional/configs/multiclass_bayes.conf @@ -76,18 +76,12 @@ classifier { symbol = BAYES_NEWSLETTER; server = {= env.REDIS_SERVER =} } - statfile { - class = "transactional"; - symbol = BAYES_TRANSACTIONAL; - server = {= env.REDIS_SERVER =} - } # Backend class labels for Redis class_labels = { "spam" = "S"; "ham" = "H"; "newsletter" = "N"; - "transactional" = "T"; } cache { @@ -106,13 +100,9 @@ classifier { verdict_mapping = { ham = true }; }; newsletter = { - symbols = ["NEWSLETTER_HEADER", "BULK_MAIL"]; + symbols = ["NEWSLETTER_HEADER", "BULK_MAIL", "UNSUBSCRIBE_LINK"]; threshold = 8.0; }; - transactional = { - symbols = ["TRANSACTIONAL_MAIL", "PASSWORD_RESET"]; - threshold = 5.0; - }; }; check_balance = true; @@ -122,6 +112,7 @@ classifier { min_learns = 1; min_tokens = 1; + min_token_hits = 1; min_prob_strength = 0.05; {% if env.STATS_PER_USER ~= '' %} diff --git a/test/functional/messages/newsletter.eml b/test/functional/messages/newsletter.eml index 52e8988b8..93c996956 100644 --- a/test/functional/messages/newsletter.eml +++ b/test/functional/messages/newsletter.eml @@ -1,16 +1,50 @@ -From: newsletter@example.com +From: "Marketing Team" <newsletter@example.com> To: user@example.org -Subject: Monthly Newsletter - Special Offers Inside +Subject: 🎉 Monthly Newsletter - Exclusive Deals & Product Updates! Date: Thu, 21 Jul 2023 10:00:00 +0000 Message-ID: <newsletter-123@example.com> MIME-Version: 1.0 -Content-Type: text/plain +Content-Type: text/html; charset=utf-8 +List-Unsubscribe: <https://example.com/unsubscribe?id=123> +Precedence: bulk +X-Mailer: MailChimp/Pro 12.345 -Dear Subscriber, - -This is our monthly newsletter with special offers and updates. - -Best regards, -Newsletter Team - -Unsubscribe: https://example.com/unsubscribe?id=123
\ No newline at end of file +<!DOCTYPE html> +<html> +<head> + <meta charset="utf-8"> + <title>Monthly Newsletter</title> +</head> +<body> + <h1>🎉 Exclusive Monthly Offers!</h1> + + <p>Dear Valued Subscriber,</p> + + <p>This month we're excited to bring you our <strong>BIGGEST SALE</strong> of the year!</p> + + <h2>🔥 Hot Deals This Month:</h2> + <ul> + <li>50% OFF all premium products</li> + <li>FREE shipping on orders over $50</li> + <li>Buy 2 Get 1 FREE on selected items</li> + </ul> + + <p><a href="https://example.com/shop?utm_source=newsletter&utm_campaign=monthly">SHOP NOW</a></p> + + <h2>📱 New Product Launch</h2> + <p>Check out our revolutionary new gadget that everyone is talking about!</p> + + <h2>🎁 Refer a Friend</h2> + <p>Share this newsletter and both you and your friend get $10 credit!</p> + + <hr> + + <p><small> + You're receiving this because you subscribed to our newsletter.<br> + <a href="https://example.com/unsubscribe?id=123">Unsubscribe here</a> | + <a href="https://example.com/preferences">Update preferences</a><br> + Marketing Team, Example Corp<br> + 123 Business St, City, State 12345 + </small></p> +</body> +</html>
\ No newline at end of file |