[Project] Optimise classify script

author Vsevolod Stakhov <vsevolod@rspamd.com>

Sat, 2 Dec 2023 15:54:04 +0000 (15:54 +0000)

committer Vsevolod Stakhov <vsevolod@rspamd.com>

Sat, 2 Dec 2023 15:54:04 +0000 (15:54 +0000)
author Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 2 Dec 2023 15:54:04 +0000 (15:54 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 2 Dec 2023 15:54:04 +0000 (15:54 +0000)
diff --git a/lualib/redis_scripts/bayes_classify.lua b/lualib/redis_scripts/bayes_classify.lua

index c2654e476c338e3018c255da3fa93a516a62cbcc..76e88a6f328117f817c4dfea378cd2673e90a0ef 100644 (file)
--- a/lualib/redis_scripts/bayes_classify.lua
+++ b/lualib/redis_scripts/bayes_classify.lua
@@ -8,19 +8,31 @@ local input_tokens = cmsgpack.unpack(KEYS[2])
  local output_spam = {}
  local output_ham = {}
  
-for i, token in ipairs(input_tokens) do
-  local token_data = redis.call('HMGET', prefix .. tostring(token), 'H', 'S')
-
-  if token_data then
-    local ham_count = tonumber(token_data[1]) or 0
-    local spam_count = tonumber(token_data[2]) or 0
-
-    output_ham[i] = ham_count
-    output_spam[i] = spam_count
-  else
-    output_ham[i] = 0
-    output_spam[i] = 0
+-- HGET replies are strings (or false when the field is missing): convert before comparing
+local learned_ham = tonumber(redis.call('HGET', prefix, 'learned_ham')) or 0
+local learned_spam = tonumber(redis.call('HGET', prefix, 'learned_spam')) or 0
+local prefix_underscore = prefix .. '_'
+
+-- Output is a set of pairs (token_index, token_count), tokens that are not
+-- found are not filled.
+-- This optimisation saves a lot of space for sparse tokens, and in Bayes that assumption normally holds
+
+if learned_ham > 0 and learned_spam > 0 then
+  for i, token in ipairs(input_tokens) do
+    local token_data = redis.call('HMGET', prefix_underscore .. tostring(token), 'H', 'S')
+
+    if token_data then
+      -- Missing hash fields come back as false: tonumber yields nil, so they are skipped below
+      local ham_count = tonumber(token_data[1])
+      local spam_count = tonumber(token_data[2])
+      if ham_count then
+        table.insert(output_ham, { i, ham_count })
+      end
+      if spam_count then
+        table.insert(output_spam, { i, spam_count })
+      end
+    end
    end
  end
  
-return cmsgpack.pack({ output_ham, output_spam })
-\ No newline at end of file
+return { learned_ham, learned_spam, output_ham, output_spam }
+\ No newline at end of file
author	Vsevolod Stakhov <vsevolod@rspamd.com>
	Sat, 2 Dec 2023 15:54:04 +0000 (15:54 +0000)
committer	Vsevolod Stakhov <vsevolod@rspamd.com>
	Sat, 2 Dec 2023 15:54:04 +0000 (15:54 +0000)