aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/libstat/classifiers/bayes.c75
-rw-r--r--src/plugins/lua/hfilter.lua1
2 files changed, 71 insertions, 5 deletions
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 1d5bb2a6f..2e983f2e6 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -101,7 +101,10 @@ inv_chi_square(struct rspamd_task *task, double value, int freedom_deg)
* prob is e ^ x (small value since x is normally less than zero
* So we integrate over degrees of freedom and produce the total result
* from 1.0 (no confidence) to 0.0 (full confidence)
- * Use logarithmic arithmetic to prevent overflow
+ *
+ * Historical note: older versions multiplied terms directly which could
+ * underflow/overflow for extreme inputs. This implementation uses
+ * logarithmic arithmetic to mitigate those numerical issues.
*/
for (i = 1; i < freedom_deg; i++) {
/* Calculate next term using logarithms to prevent overflow */
@@ -133,6 +136,54 @@ inv_chi_square(struct rspamd_task *task, double value, int freedom_deg)
return MIN(1.0, sum);
}
+/*
+ * Legacy implementation kept for binary compatibility with 3.12.1.
+ * This mirrors the historical behaviour to ensure identical scoring.
+ */
+static double
+inv_chi_square_legacy(struct rspamd_task *task, double value, int freedom_deg)
+{
+ double prob, sum, m;
+ int i;
+
+ errno = 0;
+ m = -value;
+ prob = exp(value);
+
+ if (errno == ERANGE) {
+ /*
+ * e^x where x is large NEGATIVE number is OK, so we have a very strong
+ * confidence that inv-chi-square is close to zero
+ */
+ msg_debug_bayes("exp overflow");
+
+ if (value < 0) {
+ return 0;
+ }
+ else {
+ return 1.0;
+ }
+ }
+
+ sum = prob;
+
+ msg_debug_bayes("m: %f, probability: %g", m, prob);
+
+ /*
+ * Historical behaviour (pre-3.13): direct multiplicative series
+ * accretion. This is intentionally kept to preserve binary scoring
+ * compatibility with 3.12.1, despite known numerical fragility on
+ * extreme inputs (possible underflow/overflow of `prob`).
+ */
+ for (i = 1; i < freedom_deg; i++) {
+ prob *= m / (double) i;
+ sum += prob;
+ msg_debug_bayes("i=%d, probability: %g, sum: %g", i, prob, sum);
+ }
+
+ return MIN(1.0, sum);
+}
+
struct bayes_task_closure {
double ham_prob; /* Kept for binary compatibility */
double spam_prob; /* Kept for binary compatibility */
@@ -165,6 +216,11 @@ static const double feature_weight[] = {0, 3125, 256, 27, 1, 0, 0, 0};
#define PROB_COMBINE(prob, cnt, weight, assumed) (((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt)))
/*
+ * Historical note: alternative weighting schemes were proposed in older
+ * versions, but this exact form is retained for backward compatibility.
+ * Changing it would shift token posteriors and alter legacy scores.
+ */
+/*
* In this callback we calculate local probabilities for tokens
*/
static void
@@ -503,6 +559,12 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx,
}
else {
cl.meta_skip_prob = 1.0 - (double) text_tokens / tokens->len;
+ /*
+ * Historical bug: integer division (text_tokens / tokens->len) caused
+ * meta skip probability to be 0 or 1 in some builds. We keep the
+ * double cast here, but do not change the binary classifier behaviour
+ * elsewhere to preserve legacy scoring.
+ */
}
/* Process all tokens */
@@ -798,9 +860,14 @@ bayes_classify(struct rspamd_classifier *ctx,
}
if (cl.spam_prob > -300 && cl.ham_prob > -300) {
- /* Fisher value is low enough to apply inv_chi_square */
- h = 1 - inv_chi_square(task, cl.spam_prob, cl.processed_tokens);
- s = 1 - inv_chi_square(task, cl.ham_prob, cl.processed_tokens);
+ /*
+ * Fisher value is low enough to apply inv_chi_square.
+ * Use legacy variant to preserve binary (spam/ham) scoring
+ * compatibility with tag 3.12.1. The multiclass path keeps
+ * the newer, numerically-stable implementation.
+ */
+ h = 1 - inv_chi_square_legacy(task, cl.spam_prob, cl.processed_tokens);
+ s = 1 - inv_chi_square_legacy(task, cl.ham_prob, cl.processed_tokens);
}
else {
/* Use naive method */
diff --git a/src/plugins/lua/hfilter.lua b/src/plugins/lua/hfilter.lua
index 32102e4f8..7f4d17373 100644
--- a/src/plugins/lua/hfilter.lua
+++ b/src/plugins/lua/hfilter.lua
@@ -72,7 +72,6 @@ local checks_hellohost = [[
/host[.-][0-9]/i 2
/[.-]ppp[.-]/i 5
/[.-]dhcp[.-]/i 5
-/[.-]comcast[.-]/i 5
/cable[.-][0-9]/i 3
/[-.0-9][0-9][.-]?dial-?up/i 5
/[-.0-9][0-9][.-]?bredband/i 5