about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- src/libstat/classifiers/bayes.c 21
-rw-r--r-- test/functional/cases/110_statistics/multiclass_lib.robot 15
-rw-r--r-- test/functional/configs/multiclass_bayes.conf 12
-rw-r--r-- test/functional/messages/newsletter.eml 56
4 files changed, 70 insertions, 34 deletions
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 66d84a14d..d995de91f 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -336,6 +336,9 @@ bayes_classify_token_multiclass(struct rspamd_classifier *ctx,
double class_freq = (double) class_counts[j] / MAX(1.0, (double) cl->class_learns[j]);
double class_prob = PROB_COMBINE(class_freq, total_count, w, 1.0 / cl->num_classes);
+ /* Ensure probability is properly bounded [0, 1] */
+ class_prob = MAX(0.0, MIN(1.0, class_prob));
+
/* Skip probabilities too close to uniform (1/num_classes) */
double uniform_prior = 1.0 / cl->num_classes;
if (fabs(class_prob - uniform_prior) < ctx->cfg->min_prob_strength) {
@@ -535,7 +538,23 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx,
/* Calculate confidence using Fisher method for the winning class */
if (max_log_prob > -300) {
- confidence = 1.0 - inv_chi_square(task, max_log_prob, cl.processed_tokens);
+ if (max_log_prob > 0) {
+ /* Positive log prob means very strong evidence - high confidence */
+ confidence = 0.95; /* High confidence for positive log probabilities */
+ msg_debug_bayes("positive log_prob (%g), setting high confidence", max_log_prob);
+ }
+ else {
+ /* Negative log prob - use Fisher method as intended */
+ double fisher_result = inv_chi_square(task, max_log_prob, cl.processed_tokens);
+ confidence = 1.0 - fisher_result;
+
+ /* Handle case where the Fisher result saturates at 1.0, which would otherwise drive confidence to zero */
+ if (fisher_result >= 1.0 && max_log_prob > -50) {
+ /* Log prob is non-positive but still close to zero (> -50): treat it as strong evidence */
+ confidence = 0.90;
+ msg_debug_bayes("extreme negative log_prob (%g), setting high confidence", max_log_prob);
+ }
+ }
}
else {
confidence = normalized_probs[winning_class_idx];
diff --git a/test/functional/cases/110_statistics/multiclass_lib.robot b/test/functional/cases/110_statistics/multiclass_lib.robot
index 4fa4284bb..b2e7c10e3 100644
--- a/test/functional/cases/110_statistics/multiclass_lib.robot
+++ b/test/functional/cases/110_statistics/multiclass_lib.robot
@@ -6,7 +6,6 @@ ${CONFIG} ${RSPAMD_TESTDIR}/configs/multiclass_bayes.conf
${MESSAGE_HAM} ${RSPAMD_TESTDIR}/messages/ham.eml
${MESSAGE_SPAM} ${RSPAMD_TESTDIR}/messages/spam_message.eml
${MESSAGE_NEWSLETTER} ${RSPAMD_TESTDIR}/messages/newsletter.eml
-${MESSAGE_TRANSACTIONAL} ${RSPAMD_TESTDIR}/messages/transactional.eml
${REDIS_SCOPE} Suite
${RSPAMD_REDIS_SERVER} null
${RSPAMD_SCOPE} Suite
@@ -47,7 +46,6 @@ Multiclass Basic Learn Test
Learn Multiclass ${user} spam ${MESSAGE_SPAM}
Learn Multiclass ${user} ham ${MESSAGE_HAM}
Learn Multiclass ${user} newsletter ${MESSAGE_NEWSLETTER}
- Learn Multiclass ${user} transactional ${MESSAGE_TRANSACTIONAL}
# Test classification
Scan File ${MESSAGE_SPAM} &{kwargs}
@@ -59,9 +57,6 @@ Multiclass Basic Learn Test
Scan File ${MESSAGE_NEWSLETTER} &{kwargs}
Expect Symbol BAYES_NEWSLETTER
- Scan File ${MESSAGE_TRANSACTIONAL} &{kwargs}
- Expect Symbol BAYES_TRANSACTIONAL
-
Set Suite Variable ${RSPAMD_STATS_LEARNTEST} 1
Multiclass Legacy Compatibility Test
@@ -111,12 +106,12 @@ Multiclass Cross-Learn Test
Set To Dictionary ${kwargs} Deliver-To=${user}
END
- # Learn newsletter message as transactional
- Learn Multiclass ${user} transactional ${MESSAGE_NEWSLETTER}
+ # Learn newsletter message as ham to test cross-class learning
+ Learn Multiclass ${user} ham ${MESSAGE_NEWSLETTER}
- # Should classify as transactional, not newsletter
+ # Should classify as ham, not newsletter (since we trained it as ham)
Scan File ${MESSAGE_NEWSLETTER} &{kwargs}
- Expect Symbol BAYES_TRANSACTIONAL
+ Expect Symbol BAYES_HAM
Do Not Expect Symbol BAYES_NEWSLETTER
Multiclass Unlearn Test
@@ -154,7 +149,6 @@ Multiclass Stats Test
Should Contain ${result.stdout} spam
Should Contain ${result.stdout} ham
Should Contain ${result.stdout} newsletter
- Should Contain ${result.stdout} transactional
Multiclass Configuration Migration Test
# Test that old binary config can be automatically migrated
@@ -176,7 +170,6 @@ Multiclass Performance Test
Scan File ${MESSAGE_SPAM}
Scan File ${MESSAGE_HAM}
Scan File ${MESSAGE_NEWSLETTER}
- Scan File ${MESSAGE_TRANSACTIONAL}
END
${end_time} = Get Time epoch
diff --git a/test/functional/configs/multiclass_bayes.conf b/test/functional/configs/multiclass_bayes.conf
index e58a39056..6651f94a1 100644
--- a/test/functional/configs/multiclass_bayes.conf
+++ b/test/functional/configs/multiclass_bayes.conf
@@ -76,18 +76,12 @@ classifier {
symbol = BAYES_NEWSLETTER;
server = {= env.REDIS_SERVER =}
}
- statfile {
- class = "transactional";
- symbol = BAYES_TRANSACTIONAL;
- server = {= env.REDIS_SERVER =}
- }
# Backend class labels for Redis
class_labels = {
"spam" = "S";
"ham" = "H";
"newsletter" = "N";
- "transactional" = "T";
}
cache {
@@ -106,13 +100,9 @@ classifier {
verdict_mapping = { ham = true };
};
newsletter = {
- symbols = ["NEWSLETTER_HEADER", "BULK_MAIL"];
+ symbols = ["NEWSLETTER_HEADER", "BULK_MAIL", "UNSUBSCRIBE_LINK"];
threshold = 8.0;
};
- transactional = {
- symbols = ["TRANSACTIONAL_MAIL", "PASSWORD_RESET"];
- threshold = 5.0;
- };
};
check_balance = true;
diff --git a/test/functional/messages/newsletter.eml b/test/functional/messages/newsletter.eml
index 52e8988b8..93c996956 100644
--- a/test/functional/messages/newsletter.eml
+++ b/test/functional/messages/newsletter.eml
@@ -1,16 +1,50 @@
-From: newsletter@example.com
+From: "Marketing Team" <newsletter@example.com>
To: user@example.org
-Subject: Monthly Newsletter - Special Offers Inside
+Subject: 🎉 Monthly Newsletter - Exclusive Deals & Product Updates!
Date: Thu, 21 Jul 2023 10:00:00 +0000
Message-ID: <newsletter-123@example.com>
MIME-Version: 1.0
-Content-Type: text/plain
+Content-Type: text/html; charset=utf-8
+List-Unsubscribe: <https://example.com/unsubscribe?id=123>
+Precedence: bulk
+X-Mailer: MailChimp/Pro 12.345
-Dear Subscriber,
-
-This is our monthly newsletter with special offers and updates.
-
-Best regards,
-Newsletter Team
-
-Unsubscribe: https://example.com/unsubscribe?id=123
\ No newline at end of file
+<!DOCTYPE html>
+<html>
+<head>
+ <meta charset="utf-8">
+ <title>Monthly Newsletter</title>
+</head>
+<body>
+ <h1>🎉 Exclusive Monthly Offers!</h1>
+
+ <p>Dear Valued Subscriber,</p>
+
+ <p>This month we're excited to bring you our <strong>BIGGEST SALE</strong> of the year!</p>
+
+ <h2>🔥 Hot Deals This Month:</h2>
+ <ul>
+ <li>50% OFF all premium products</li>
+ <li>FREE shipping on orders over $50</li>
+ <li>Buy 2 Get 1 FREE on selected items</li>
+ </ul>
+
+ <p><a href="https://example.com/shop?utm_source=newsletter&utm_campaign=monthly">SHOP NOW</a></p>
+
+ <h2>📱 New Product Launch</h2>
+ <p>Check out our revolutionary new gadget that everyone is talking about!</p>
+
+ <h2>🎁 Refer a Friend</h2>
+ <p>Share this newsletter and both you and your friend get $10 credit!</p>
+
+ <hr>
+
+ <p><small>
+ You're receiving this because you subscribed to our newsletter.<br>
+ <a href="https://example.com/unsubscribe?id=123">Unsubscribe here</a> |
+ <a href="https://example.com/preferences">Update preferences</a><br>
+ Marketing Team, Example Corp<br>
+ 123 Business St, City, State 12345
+ </small></p>
+</body>
+</html>
\ No newline at end of file