diff options
author | Vsevolod Stakhov <vsevolod@rspamd.com> | 2025-07-27 16:04:29 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rspamd.com> | 2025-07-27 16:04:29 +0100 |
commit | 3428e63e3a914023cd0ef9ab3257b1299919e74f (patch) | |
tree | 285df6473ef3e0c8b0aa4b7a7d454975342bd8e8 | |
parent | 16985fecee9179f8afa63a4ffb3b9f8b417c1f34 (diff) | |
download | rspamd-vstakhov-multi-class-bayes.tar.gz rspamd-vstakhov-multi-class-bayes.zip |
[Minor] Further adjustments (branch: vstakhov-multi-class-bayes)
-rw-r--r-- | src/libstat/classifiers/bayes.c | 21 | ||||
-rw-r--r-- | test/functional/cases/110_statistics/multiclass_lib.robot | 15 | ||||
-rw-r--r-- | test/functional/configs/multiclass_bayes.conf | 12 | ||||
-rw-r--r-- | test/functional/messages/newsletter.eml | 56 |
4 files changed, 70 insertions, 34 deletions
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index 66d84a14d..d995de91f 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -336,6 +336,9 @@ bayes_classify_token_multiclass(struct rspamd_classifier *ctx, double class_freq = (double) class_counts[j] / MAX(1.0, (double) cl->class_learns[j]); double class_prob = PROB_COMBINE(class_freq, total_count, w, 1.0 / cl->num_classes); + /* Ensure probability is properly bounded [0, 1] */ + class_prob = MAX(0.0, MIN(1.0, class_prob)); + /* Skip probabilities too close to uniform (1/num_classes) */ double uniform_prior = 1.0 / cl->num_classes; if (fabs(class_prob - uniform_prior) < ctx->cfg->min_prob_strength) { @@ -535,7 +538,23 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx, /* Calculate confidence using Fisher method for the winning class */ if (max_log_prob > -300) { - confidence = 1.0 - inv_chi_square(task, max_log_prob, cl.processed_tokens); + if (max_log_prob > 0) { + /* Positive log prob means very strong evidence - high confidence */ + confidence = 0.95; /* High confidence for positive log probabilities */ + msg_debug_bayes("positive log_prob (%g), setting high confidence", max_log_prob); + } + else { + /* Negative log prob - use Fisher method as intended */ + double fisher_result = inv_chi_square(task, max_log_prob, cl.processed_tokens); + confidence = 1.0 - fisher_result; + + /* Handle case where Fisher method indicates extreme confidence */ + if (fisher_result >= 1.0 && max_log_prob > -50) { + /* Large magnitude negative log prob means strong evidence */ + confidence = 0.90; + msg_debug_bayes("extreme negative log_prob (%g), setting high confidence", max_log_prob); + } + } } else { confidence = normalized_probs[winning_class_idx]; diff --git a/test/functional/cases/110_statistics/multiclass_lib.robot b/test/functional/cases/110_statistics/multiclass_lib.robot index 4fa4284bb..b2e7c10e3 100644 --- 
a/test/functional/cases/110_statistics/multiclass_lib.robot +++ b/test/functional/cases/110_statistics/multiclass_lib.robot @@ -6,7 +6,6 @@ ${CONFIG} ${RSPAMD_TESTDIR}/configs/multiclass_bayes.conf ${MESSAGE_HAM} ${RSPAMD_TESTDIR}/messages/ham.eml ${MESSAGE_SPAM} ${RSPAMD_TESTDIR}/messages/spam_message.eml ${MESSAGE_NEWSLETTER} ${RSPAMD_TESTDIR}/messages/newsletter.eml -${MESSAGE_TRANSACTIONAL} ${RSPAMD_TESTDIR}/messages/transactional.eml ${REDIS_SCOPE} Suite ${RSPAMD_REDIS_SERVER} null ${RSPAMD_SCOPE} Suite @@ -47,7 +46,6 @@ Multiclass Basic Learn Test Learn Multiclass ${user} spam ${MESSAGE_SPAM} Learn Multiclass ${user} ham ${MESSAGE_HAM} Learn Multiclass ${user} newsletter ${MESSAGE_NEWSLETTER} - Learn Multiclass ${user} transactional ${MESSAGE_TRANSACTIONAL} # Test classification Scan File ${MESSAGE_SPAM} &{kwargs} @@ -59,9 +57,6 @@ Multiclass Basic Learn Test Scan File ${MESSAGE_NEWSLETTER} &{kwargs} Expect Symbol BAYES_NEWSLETTER - Scan File ${MESSAGE_TRANSACTIONAL} &{kwargs} - Expect Symbol BAYES_TRANSACTIONAL - Set Suite Variable ${RSPAMD_STATS_LEARNTEST} 1 Multiclass Legacy Compatibility Test @@ -111,12 +106,12 @@ Multiclass Cross-Learn Test Set To Dictionary ${kwargs} Deliver-To=${user} END - # Learn newsletter message as transactional - Learn Multiclass ${user} transactional ${MESSAGE_NEWSLETTER} + # Learn newsletter message as ham to test cross-class learning + Learn Multiclass ${user} ham ${MESSAGE_NEWSLETTER} - # Should classify as transactional, not newsletter + # Should classify as ham, not newsletter (since we trained it as ham) Scan File ${MESSAGE_NEWSLETTER} &{kwargs} - Expect Symbol BAYES_TRANSACTIONAL + Expect Symbol BAYES_HAM Do Not Expect Symbol BAYES_NEWSLETTER Multiclass Unlearn Test @@ -154,7 +149,6 @@ Multiclass Stats Test Should Contain ${result.stdout} spam Should Contain ${result.stdout} ham Should Contain ${result.stdout} newsletter - Should Contain ${result.stdout} transactional Multiclass Configuration Migration Test # Test that 
old binary config can be automatically migrated @@ -176,7 +170,6 @@ Multiclass Performance Test Scan File ${MESSAGE_SPAM} Scan File ${MESSAGE_HAM} Scan File ${MESSAGE_NEWSLETTER} - Scan File ${MESSAGE_TRANSACTIONAL} END ${end_time} = Get Time epoch diff --git a/test/functional/configs/multiclass_bayes.conf b/test/functional/configs/multiclass_bayes.conf index e58a39056..6651f94a1 100644 --- a/test/functional/configs/multiclass_bayes.conf +++ b/test/functional/configs/multiclass_bayes.conf @@ -76,18 +76,12 @@ classifier { symbol = BAYES_NEWSLETTER; server = {= env.REDIS_SERVER =} } - statfile { - class = "transactional"; - symbol = BAYES_TRANSACTIONAL; - server = {= env.REDIS_SERVER =} - } # Backend class labels for Redis class_labels = { "spam" = "S"; "ham" = "H"; "newsletter" = "N"; - "transactional" = "T"; } cache { @@ -106,13 +100,9 @@ classifier { verdict_mapping = { ham = true }; }; newsletter = { - symbols = ["NEWSLETTER_HEADER", "BULK_MAIL"]; + symbols = ["NEWSLETTER_HEADER", "BULK_MAIL", "UNSUBSCRIBE_LINK"]; threshold = 8.0; }; - transactional = { - symbols = ["TRANSACTIONAL_MAIL", "PASSWORD_RESET"]; - threshold = 5.0; - }; }; check_balance = true; diff --git a/test/functional/messages/newsletter.eml b/test/functional/messages/newsletter.eml index 52e8988b8..93c996956 100644 --- a/test/functional/messages/newsletter.eml +++ b/test/functional/messages/newsletter.eml @@ -1,16 +1,50 @@ -From: newsletter@example.com +From: "Marketing Team" <newsletter@example.com> To: user@example.org -Subject: Monthly Newsletter - Special Offers Inside +Subject: 🎉 Monthly Newsletter - Exclusive Deals & Product Updates! Date: Thu, 21 Jul 2023 10:00:00 +0000 Message-ID: <newsletter-123@example.com> MIME-Version: 1.0 -Content-Type: text/plain +Content-Type: text/html; charset=utf-8 +List-Unsubscribe: <https://example.com/unsubscribe?id=123> +Precedence: bulk +X-Mailer: MailChimp/Pro 12.345 -Dear Subscriber, - -This is our monthly newsletter with special offers and updates. 
- -Best regards, -Newsletter Team - -Unsubscribe: https://example.com/unsubscribe?id=123
\ No newline at end of file +<!DOCTYPE html> +<html> +<head> + <meta charset="utf-8"> + <title>Monthly Newsletter</title> +</head> +<body> + <h1>🎉 Exclusive Monthly Offers!</h1> + + <p>Dear Valued Subscriber,</p> + + <p>This month we're excited to bring you our <strong>BIGGEST SALE</strong> of the year!</p> + + <h2>🔥 Hot Deals This Month:</h2> + <ul> + <li>50% OFF all premium products</li> + <li>FREE shipping on orders over $50</li> + <li>Buy 2 Get 1 FREE on selected items</li> + </ul> + + <p><a href="https://example.com/shop?utm_source=newsletter&utm_campaign=monthly">SHOP NOW</a></p> + + <h2>📱 New Product Launch</h2> + <p>Check out our revolutionary new gadget that everyone is talking about!</p> + + <h2>🎁 Refer a Friend</h2> + <p>Share this newsletter and both you and your friend get $10 credit!</p> + + <hr> + + <p><small> + You're receiving this because you subscribed to our newsletter.<br> + <a href="https://example.com/unsubscribe?id=123">Unsubscribe here</a> | + <a href="https://example.com/preferences">Update preferences</a><br> + Marketing Team, Example Corp<br> + 123 Business St, City, State 12345 + </small></p> +</body> +</html>
\ No newline at end of file |