aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt2
-rw-r--r--ChangeLog56
-rw-r--r--cmake/AddDependencySubdirectories.cmake3
-rw-r--r--conf/composites.conf6
-rw-r--r--conf/maps.d/spf_dkim_whitelist.inc1
-rw-r--r--config.h.in1
-rw-r--r--contrib/libev/ev.c19
-rw-r--r--contrib/libucl/lua_ucl.c12
-rw-r--r--contrib/libucl/ucl.h20
-rw-r--r--contrib/libucl/ucl_parser.c10
-rw-r--r--contrib/libucl/ucl_util.c32
-rw-r--r--contrib/snowball/compiler/generator.c18
-rw-r--r--contrib/snowball/compiler/space.c335
-rw-r--r--contrib/xxhash/CMakeLists.txt4
-rwxr-xr-xdebian/rules2
-rw-r--r--doc/rspamc.13
-rw-r--r--doc/rspamc.1.md2
-rw-r--r--eslint.config.mjs3
-rw-r--r--interface/css/rspamd.css18
-rw-r--r--interface/index.html19
-rw-r--r--interface/js/app/common.js130
-rw-r--r--interface/js/app/rspamd.js4
-rw-r--r--interface/js/app/selectors.js10
-rw-r--r--interface/js/app/symbols.js1
-rw-r--r--interface/js/app/upload.js182
-rw-r--r--lualib/lua_cfg_transform.lua2
-rw-r--r--lualib/lua_magic/patterns.lua17
-rw-r--r--lualib/lua_magic/types.lua7
-rw-r--r--lualib/lua_redis.lua28
-rw-r--r--lualib/redis_scripts/bayes_cache_learn.lua2
-rw-r--r--lualib/rspamadm/mime.lua401
-rw-r--r--rules/controller/fuzzy.lua22
-rw-r--r--src/client/rspamdclient.c4
-rw-r--r--src/controller.c54
-rw-r--r--src/fuzzy_storage.c16
-rw-r--r--src/libmime/lang_detection.c41
-rw-r--r--src/libmime/lang_detection_fasttext.cxx21
-rw-r--r--src/libmime/lang_detection_fasttext.h3
-rw-r--r--src/libmime/message.c35
-rw-r--r--src/libmime/message.h3
-rw-r--r--src/libserver/cfg_file.h12
-rw-r--r--src/libserver/cfg_rcl.cxx22
-rw-r--r--src/libserver/cfg_utils.cxx80
-rw-r--r--src/libserver/dynamic_cfg.c4
-rw-r--r--src/libserver/http/http_connection.c90
-rw-r--r--src/libserver/http/http_connection.h12
-rw-r--r--src/libserver/http/http_message.c7
-rw-r--r--src/libserver/logger/logger.c141
-rw-r--r--src/libserver/logger/logger_private.h12
-rw-r--r--src/libserver/maps/map.c279
-rw-r--r--src/libserver/maps/map_private.h21
-rw-r--r--src/libserver/milter.c4
-rw-r--r--src/libserver/re_cache.c41
-rw-r--r--src/libserver/roll_history.c8
-rw-r--r--src/libserver/rspamd_control.c4
-rw-r--r--src/libserver/symcache/symcache_impl.cxx2
-rw-r--r--src/libserver/task.c6
-rw-r--r--src/libserver/task.h3
-rw-r--r--src/libserver/word.h88
-rw-r--r--src/libserver/worker_util.c2
-rw-r--r--src/libstat/CMakeLists.txt33
-rw-r--r--src/libstat/stat_api.h27
-rw-r--r--src/libstat/stat_process.c34
-rw-r--r--src/libstat/tokenizers/custom_tokenizer.h177
-rw-r--r--src/libstat/tokenizers/osb.c9
-rw-r--r--src/libstat/tokenizers/rspamd_tokenizer_types.h89
-rw-r--r--src/libstat/tokenizers/tokenizer_manager.c500
-rw-r--r--src/libstat/tokenizers/tokenizers.c202
-rw-r--r--src/libstat/tokenizers/tokenizers.h33
-rw-r--r--src/libutil/fstring.h14
-rw-r--r--src/libutil/mem_pool.c7
-rw-r--r--src/libutil/mem_pool.h2
-rw-r--r--src/libutil/radix.c37
-rw-r--r--src/libutil/radix.h18
-rw-r--r--src/libutil/shingles.c27
-rw-r--r--src/libutil/shingles.h5
-rw-r--r--src/lua/lua_common.c52
-rw-r--r--src/lua/lua_common.h7
-rw-r--r--src/lua/lua_config.c88
-rw-r--r--src/lua/lua_cryptobox.c2
-rw-r--r--src/lua/lua_http.c123
-rw-r--r--src/lua/lua_logger.c102
-rw-r--r--src/lua/lua_mimepart.c39
-rw-r--r--src/lua/lua_parsers.c18
-rw-r--r--src/lua/lua_task.c82
-rw-r--r--src/lua/lua_util.c164
-rw-r--r--src/plugins/chartable.cxx24
-rw-r--r--src/plugins/fuzzy_check.c14
-rw-r--r--src/plugins/lua/arc.lua4
-rw-r--r--src/plugins/lua/fuzzy_collect.lua2
-rw-r--r--src/plugins/lua/gpt.lua73
-rw-r--r--src/plugins/lua/hfilter.lua1
-rw-r--r--src/plugins/lua/history_redis.lua6
-rw-r--r--src/plugins/lua/known_senders.lua62
-rw-r--r--src/plugins/lua/milter_headers.lua2
-rw-r--r--src/plugins/lua/mime_types.lua6
-rw-r--r--src/plugins/lua/multimap.lua11
-rw-r--r--src/plugins/lua/ratelimit.lua4
-rw-r--r--src/plugins/lua/rbl.lua2
-rw-r--r--src/plugins/lua/replies.lua26
-rw-r--r--src/plugins/lua/reputation.lua2
-rw-r--r--src/plugins/lua/settings.lua6
-rw-r--r--src/plugins/lua/spamassassin.lua65
-rw-r--r--src/plugins/lua/trie.lua12
-rw-r--r--src/rspamadm/configdump.c115
-rw-r--r--src/rspamadm/control.c8
-rw-r--r--src/rspamadm/lua_repl.c23
-rw-r--r--src/rspamadm/signtool.c4
-rw-r--r--src/rspamd_proxy.c142
-rw-r--r--test/functional/cases/001_merged/350_magic.robot1
-rw-r--r--test/functional/cases/400_known_senders.robot22
-rw-r--r--test/functional/cases/410_replies.robot37
-rw-r--r--test/functional/cases/411_logging/000_console/000_systemd_logger.robot (renamed from test/functional/cases/410_logging/000_console/000_systemd_logger.robot)0
-rw-r--r--test/functional/cases/411_logging/000_console/001_timestamps.robot (renamed from test/functional/cases/410_logging/000_console/001_timestamps.robot)0
-rw-r--r--test/functional/cases/411_logging/001_file/000_json.robot (renamed from test/functional/cases/410_logging/001_file/000_json.robot)0
-rw-r--r--test/functional/cases/550_milter_headers.robot11
-rw-r--r--test/functional/configs/milter_headers.conf12
-rw-r--r--test/functional/lib/rspamd.robot9
-rw-r--r--test/functional/messages/gargantua.eml54
-rw-r--r--test/lua/unit/logger.lua95
-rw-r--r--test/lua/unit/rspamd_resolver.lua62
-rw-r--r--test/rspamd_cryptobox_test.c6
-rw-r--r--test/rspamd_cxx_unit_cryptobox.hxx221
-rw-r--r--test/rspamd_dns_test.c19
-rw-r--r--test/rspamd_shingles_test.c80
125 files changed, 4264 insertions, 1257 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7735884c3..116d68888 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,7 +9,7 @@ cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
# Define version variables
set(RSPAMD_VERSION_MAJOR 3)
-set(RSPAMD_VERSION_MINOR 11)
+set(RSPAMD_VERSION_MINOR 12)
set(RSPAMD_VERSION_PATCH 2)
# Keep two digits all the time
diff --git a/ChangeLog b/ChangeLog
index 6b0e486f5..0ae980c9b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,59 @@
+3.12.1: 17 Jun 2025
+ * [Feature] Add /bayes/classifiers HTTP endpoint
+ * [Feature] Further improvements in scheduling next checks
+ * [Fix] Another fix for maps concurrent load
+ * [Fix] Do not add log tag header in milter logic
+ * [Fix] Do not explicitly add Connection header if it's there
+ * [Fix] Fix proxy headers duplication
+ * [Fix] Fix several issues with the lua_logger
+ * [Fix] Make logger more graceful when dealing with format arguments
+ * [Fix] Try to avoid incomplete writes
+ * [Rework] Eliminate maps locking
+
+3.12.0: 09 Jun 2025
+ * [CritFix] In lua-ucl disable macros and file variables by default
+ * [Feature] Add keep-alive support
+ * [Feature] Add some convenience methods
+ * [Feature] Add support for separate read and write servers in fuzzy check
+ * [Feature] Allow CDB files as external maps
+ * [Feature] Allow to specify Redis version
+ * [Feature] Allow to specify extra headers in Rspamd proxy
+ * [Feature] Allow to specify log tag in proxy
+ * [Feature] Allow to specify max log tag length for all log messages
+ * [Feature] Allow to use HTTPS when connection to backends in proxy
+ * [Feature] Output content for all maps
+ * [Feature] Plugin to integrate with Contextal platform
+ * [Feature] Show all maps status
+ * [Fix] Add fail check for cfg transform for some corner cases
+ * [Fix] Add header with reason everytime (not only for ham) and use correct value for header
+ * [Fix] Add null check for master_conn->up in proxy backend error handler
+ * [Fix] Allow 'Hash' in Access-Control-Allow-Headers
+ * [Fix] Arc: Use tonumber when comparing
+ * [Fix] As we have replxx library, always use it
+ * [Fix] Backport some issues from libucl
+ * [Fix] Filter invalid domains in fuzzy extra data
+ * [Fix] Fix maps ids
+ * [Fix] Fix race condition in maps loading by unlocking backend on switch
+ * [Fix] Fix static maps description passing
+ * [Fix] Fix variable propagation (no functional change)
+ * [Fix] Fix various issues
+ * [Fix] Greylist: Improve body hash calculations
+ * [Fix] Known senders: More recipients test logic
+ * [Fix] Known senders: Use the same logic as in the replies module
+ * [Fix] Prevent crashes when accessing upstream address in self-scan mode
+ * [Fix] Really fix local objects filtering, sigh...
+ * [Fix] Update default URL for openphish
+ * [Fix] Use bundled libfmt everywhere
+ * [Fix] Use safe parsers everywhere except configuration
+ * [Fix] correct logic error in milter_headers.lua: skip_wanted()
+ * [Fix] initialize ollama result table
+ * [Fix] libmime: declare comparators const for doctest 2.4.12 compatibility
+ * [Project] Modernize cmake
+ * [Project] Rework OSDep
+ * [Rework] Replies: consider all recipients and use smtp ones
+ * [Rework] Store shared maps data separately
+ * [Rework] Use locks/loaded per backend for all maps
+
3.11.1: 08 Mar 2025
* [Feature] Add 'noop' redis backend for scripts running
* [Feature] Add Redis caching framework
diff --git a/cmake/AddDependencySubdirectories.cmake b/cmake/AddDependencySubdirectories.cmake
index 56932e9c6..866204ff5 100644
--- a/cmake/AddDependencySubdirectories.cmake
+++ b/cmake/AddDependencySubdirectories.cmake
@@ -76,7 +76,6 @@ function(AddDependencySubdirectories)
# Lua REPL support
add_subdirectory(contrib/replxx)
- set(WITH_LUA_REPL 1 PARENT_SCOPE)
list(APPEND RSPAMD_REQUIRED_LIBRARIES rspamd-replxx)
# Update the required libraries list based on dependencies
@@ -105,6 +104,4 @@ function(AddDependencySubdirectories)
# Propagate variables to parent scope
set(RSPAMD_REQUIRED_LIBRARIES ${RSPAMD_REQUIRED_LIBRARIES} PARENT_SCOPE)
- set(WITH_SNOWBALL ${WITH_SNOWBALL} PARENT_SCOPE)
- set(WITH_LUA_REPL ${WITH_LUA_REPL} PARENT_SCOPE)
endfunction()
diff --git a/conf/composites.conf b/conf/composites.conf
index 63385cbf3..635022222 100644
--- a/conf/composites.conf
+++ b/conf/composites.conf
@@ -171,6 +171,12 @@ composites {
description = "Message exhibits strong characteristics of advance fee fraud (AFF a/k/a '419' spam) involving freemail addresses";
group = "scams";
}
+ FREEMAIL_REPLYTO_NEQ_FROM {
+ expression = "FREEMAIL_REPLYTO & !REPLYTO_EQ_FROM & !REPLYTO_ADDR_EQ_FROM & !FREEMAIL_REPLYTO_NEQ_FROM_DOM";
+ score = 2.0;
+ policy = "leave";
+ description = "Reply-To is a Freemail address and it not match From header or SMTP From, also From is not another Freemail";
+ }
SUSPICIOUS_MDN {
expression = "(FREEMAIL_MDN | DISPOSABLE_MDN) & !(FREEMAIL_FROM | FREEMAIL_ENVFROM)";
score = 2.0;
diff --git a/conf/maps.d/spf_dkim_whitelist.inc b/conf/maps.d/spf_dkim_whitelist.inc
index fe0ddbbaa..09ceba89b 100644
--- a/conf/maps.d/spf_dkim_whitelist.inc
+++ b/conf/maps.d/spf_dkim_whitelist.inc
@@ -33,7 +33,6 @@ asana.com
att.com
autohome.com.cn
avg.com
-aweber.com
badoo.com
bankofamerica.com
basecamp.com
diff --git a/config.h.in b/config.h.in
index 0ed2cd6b2..f9d910d68 100644
--- a/config.h.in
+++ b/config.h.in
@@ -112,7 +112,6 @@
#cmakedefine WITH_SNOWBALL 1
#cmakedefine WITH_SQLITE 1
#cmakedefine WITH_LUA_TRACE 1
-#cmakedefine WITH_LUA_REPL 1
#cmakedefine WITH_FASTTEXT 1
#cmakedefine BACKWARD_ENABLE 1
#cmakedefine HAVE_BUILTIN_CPU_SUPPORTS 1
diff --git a/contrib/libev/ev.c b/contrib/libev/ev.c
index 230445d2a..0dec50bcd 100644
--- a/contrib/libev/ev.c
+++ b/contrib/libev/ev.c
@@ -1,4 +1,20 @@
/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
* libev event processing core, watcher management
*
* Copyright (c) 2007-2019 Marc Alexander Lehmann <libev@schmorp.de>
@@ -2148,7 +2164,10 @@ typedef struct
#include "ev_wrap.h"
static struct ev_loop default_loop_struct;
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wextern-initializer"
EV_API_DECL struct ev_loop *ev_default_loop_ptr = 0; /* needs to be initialised to make it a definition despite extern */
+#pragma clang diagnostic pop
#else
diff --git a/contrib/libucl/lua_ucl.c b/contrib/libucl/lua_ucl.c
index 473aefe0c..13306b942 100644
--- a/contrib/libucl/lua_ucl.c
+++ b/contrib/libucl/lua_ucl.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -694,7 +694,13 @@ static int
lua_ucl_parser_init(lua_State *L)
{
struct ucl_parser *parser, **pparser;
- int flags = UCL_PARSER_NO_FILEVARS;
+ /*
+ * We disable file variables and macros by default, as
+ * the most use cases are parsing of JSON and not of the real
+ * files. Macros in the parser are very dangerous and should be used
+ * for trusted data only.
+ */
+ int flags = UCL_PARSER_SAFE_FLAGS;
if (lua_gettop(L) >= 1) {
flags = lua_tonumber(L, 1);
@@ -1091,7 +1097,7 @@ lua_ucl_parser_validate(lua_State *L)
}
}
else if (lua_type(L, 2) == LUA_TSTRING) {
- schema_parser = ucl_parser_new(0);
+ schema_parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
schema_file = luaL_checkstring(L, 2);
if (!ucl_parser_add_file(schema_parser, schema_file)) {
diff --git a/contrib/libucl/ucl.h b/contrib/libucl/ucl.h
index b6b9f44c0..8c2ac59a4 100644
--- a/contrib/libucl/ucl.h
+++ b/contrib/libucl/ucl.h
@@ -1,3 +1,19 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
/* Copyright (c) 2013-2015, Vsevolod Stakhov
* All rights reserved.
*
@@ -159,6 +175,10 @@ typedef enum ucl_parser_flags {
UCL_PARSER_NO_FILEVARS = (1 << 6) /** Do not set file vars */
} ucl_parser_flags_t;
+#define UCL_PARSER_SAFE_FLAGS (UCL_PARSER_NO_TIME | \
+ UCL_PARSER_NO_IMPLICIT_ARRAYS | \
+ UCL_PARSER_DISABLE_MACRO | \
+ UCL_PARSER_NO_FILEVARS)
/**
* String conversion flags, that are used in #ucl_object_fromstring_common function.
*/
diff --git a/contrib/libucl/ucl_parser.c b/contrib/libucl/ucl_parser.c
index 6be16d121..e56a010ff 100644
--- a/contrib/libucl/ucl_parser.c
+++ b/contrib/libucl/ucl_parser.c
@@ -1246,10 +1246,12 @@ ucl_parser_process_object_element (struct ucl_parser *parser, ucl_object_t *nobj
container = parser->stack->obj->value.ov;
DL_FOREACH (parser->stack->obj, cur) {
- tobj = __DECONST (ucl_object_t *, ucl_hash_search_obj (cur->value.ov, nobj));
+ if (cur->type == UCL_OBJECT) {
+ tobj = __DECONST (ucl_object_t *, ucl_hash_search_obj (cur->value.ov, nobj));
- if (tobj != NULL) {
- break;
+ if (tobj != NULL) {
+ break;
+ }
}
}
@@ -3165,7 +3167,7 @@ ucl_parser_add_string (struct ucl_parser *parser, const char *data,
bool
ucl_set_include_path (struct ucl_parser *parser, ucl_object_t *paths)
{
- if (parser == NULL || paths == NULL) {
+ if (parser == NULL || paths == NULL || paths->type != UCL_ARRAY) {
return false;
}
diff --git a/contrib/libucl/ucl_util.c b/contrib/libucl/ucl_util.c
index b00f2779e..d5b84f6a5 100644
--- a/contrib/libucl/ucl_util.c
+++ b/contrib/libucl/ucl_util.c
@@ -3148,6 +3148,10 @@ ucl_object_frombool (bool bv)
bool
ucl_array_append (ucl_object_t *top, ucl_object_t *elt)
{
+ if (top->type != UCL_ARRAY) {
+ return false;
+ }
+
UCL_ARRAY_GET (vec, top);
if (elt == NULL || top == NULL) {
@@ -3177,6 +3181,10 @@ e0:
bool
ucl_array_prepend (ucl_object_t *top, ucl_object_t *elt)
{
+ if (top->type != UCL_ARRAY) {
+ return false;
+ }
+
UCL_ARRAY_GET (vec, top);
if (elt == NULL || top == NULL) {
@@ -3242,6 +3250,10 @@ e0:
ucl_object_t *
ucl_array_delete (ucl_object_t *top, ucl_object_t *elt)
{
+ if (top->type != UCL_ARRAY) {
+ return NULL;
+ }
+
UCL_ARRAY_GET (vec, top);
ucl_object_t *ret = NULL;
unsigned i;
@@ -3290,6 +3302,10 @@ ucl_array_tail (const ucl_object_t *top)
ucl_object_t *
ucl_array_pop_last (ucl_object_t *top)
{
+ if (top->type != UCL_ARRAY) {
+ return NULL;
+ }
+
UCL_ARRAY_GET (vec, top);
ucl_object_t **obj, *ret = NULL;
@@ -3306,6 +3322,10 @@ ucl_array_pop_last (ucl_object_t *top)
ucl_object_t *
ucl_array_pop_first (ucl_object_t *top)
{
+ if (top->type != UCL_ARRAY) {
+ return NULL;
+ }
+
UCL_ARRAY_GET (vec, top);
ucl_object_t **obj, *ret = NULL;
@@ -3338,6 +3358,10 @@ ucl_array_size (const ucl_object_t *top)
const ucl_object_t *
ucl_array_find_index (const ucl_object_t *top, unsigned int index)
{
+ if (top->type != UCL_ARRAY) {
+ return NULL;
+ }
+
UCL_ARRAY_GET (vec, top);
if (vec != NULL && vec->n > 0 && index < vec->n) {
@@ -3350,6 +3374,10 @@ ucl_array_find_index (const ucl_object_t *top, unsigned int index)
unsigned int
ucl_array_index_of (ucl_object_t *top, ucl_object_t *elt)
{
+ if (top->type != UCL_ARRAY) {
+ return (unsigned int)(-1);
+ }
+
UCL_ARRAY_GET (vec, top);
unsigned i;
@@ -3370,6 +3398,10 @@ ucl_object_t *
ucl_array_replace_index (ucl_object_t *top, ucl_object_t *elt,
unsigned int index)
{
+ if (top->type != UCL_ARRAY) {
+ return NULL;
+ }
+
UCL_ARRAY_GET (vec, top);
ucl_object_t *ret = NULL;
diff --git a/contrib/snowball/compiler/generator.c b/contrib/snowball/compiler/generator.c
index eed86c117..44fbdcea4 100644
--- a/contrib/snowball/compiler/generator.c
+++ b/contrib/snowball/compiler/generator.c
@@ -1,4 +1,20 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
#include <limits.h> /* for INT_MAX */
#include <stdio.h> /* for fprintf etc */
#include <stdlib.h> /* for free etc */
@@ -1241,7 +1257,7 @@ static void generate_substring(struct generator * g, struct node * p) {
g->I[3] = bitmap;
g->I[4] = shortest_size - 1;
if (p->mode == m_forward) {
- sprintf(buf, "z->p[z->c + %d]", shortest_size - 1);
+ snprintf(buf, sizeof(buf), "z->p[z->c + %d]", shortest_size - 1);
g->S[1] = buf;
if (shortest_size == 1) {
writef(g, "~Mif (z->c >= z->l", p);
diff --git a/contrib/snowball/compiler/space.c b/contrib/snowball/compiler/space.c
index 5b058763a..46c32e079 100644
--- a/contrib/snowball/compiler/space.c
+++ b/contrib/snowball/compiler/space.c
@@ -1,11 +1,27 @@
-#include <stdio.h> /* for printf */
-#include <stdlib.h> /* malloc, free */
-#include <string.h> /* memmove */
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h> /* for printf */
+#include <stdlib.h> /* malloc, free */
+#include <string.h> /* memmove */
#include "header.h"
-#define HEAD 2*sizeof(int)
+#define HEAD 2 * sizeof(int)
#define EXTENDER 40
@@ -50,108 +66,125 @@
overwriting.
*/
-extern symbol * create_b(int n) {
- symbol * p = (symbol *) (HEAD + (char *) MALLOC(HEAD + (n + 1) * sizeof(symbol)));
- CAPACITY(p) = n;
- SIZE(p) = 0;
- return p;
+extern symbol *create_b(int n)
+{
+ symbol *p = (symbol *) (HEAD + (char *) MALLOC(HEAD + (n + 1) * sizeof(symbol)));
+ CAPACITY(p) = n;
+ SIZE(p) = 0;
+ return p;
}
-extern void report_b(FILE * out, const symbol * p) {
- int i;
- for (i = 0; i < SIZE(p); i++) {
- if (p[i] > 255) {
- printf("In report_b, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]);
- exit(1);
- }
- putc(p[i], out);
- }
+extern void report_b(FILE *out, const symbol *p)
+{
+ int i;
+ for (i = 0; i < SIZE(p); i++) {
+ if (p[i] > 255) {
+ printf("In report_b, can't convert p[%d] to char because it's 0x%02x\n", i, (int) p[i]);
+ exit(1);
+ }
+ putc(p[i], out);
+ }
}
-extern void output_str(FILE * outfile, struct str * str) {
- report_b(outfile, str_data(str));
+extern void output_str(FILE *outfile, struct str *str)
+{
+ report_b(outfile, str_data(str));
}
-extern void lose_b(symbol * p) {
- if (p == 0) return;
- FREE((char *) p - HEAD);
+extern void lose_b(symbol *p)
+{
+ if (p == 0) return;
+ FREE((char *) p - HEAD);
}
-extern symbol * increase_capacity(symbol * p, int n) {
- symbol * q = create_b(CAPACITY(p) + n + EXTENDER);
- memmove(q, p, CAPACITY(p) * sizeof(symbol));
- SIZE(q) = SIZE(p);
- lose_b(p); return q;
+extern symbol *increase_capacity(symbol *p, int n)
+{
+ symbol *q = create_b(CAPACITY(p) + n + EXTENDER);
+ memmove(q, p, CAPACITY(p) * sizeof(symbol));
+ SIZE(q) = SIZE(p);
+ lose_b(p);
+ return q;
}
-extern symbol * move_to_b(symbol * p, int n, const symbol * q) {
- int x = n - CAPACITY(p);
- if (x > 0) p = increase_capacity(p, x);
- memmove(p, q, n * sizeof(symbol)); SIZE(p) = n; return p;
+extern symbol *move_to_b(symbol *p, int n, const symbol *q)
+{
+ int x = n - CAPACITY(p);
+ if (x > 0) p = increase_capacity(p, x);
+ memmove(p, q, n * sizeof(symbol));
+ SIZE(p) = n;
+ return p;
}
-extern symbol * add_to_b(symbol * p, int n, const symbol * q) {
- int x = SIZE(p) + n - CAPACITY(p);
- if (x > 0) p = increase_capacity(p, x);
- memmove(p + SIZE(p), q, n * sizeof(symbol)); SIZE(p) += n; return p;
+extern symbol *add_to_b(symbol *p, int n, const symbol *q)
+{
+ int x = SIZE(p) + n - CAPACITY(p);
+ if (x > 0) p = increase_capacity(p, x);
+ memmove(p + SIZE(p), q, n * sizeof(symbol));
+ SIZE(p) += n;
+ return p;
}
-extern symbol * copy_b(const symbol * p) {
- int n = SIZE(p);
- symbol * q = create_b(n);
- move_to_b(q, n, p);
- return q;
+extern symbol *copy_b(const symbol *p)
+{
+ int n = SIZE(p);
+ symbol *q = create_b(n);
+ move_to_b(q, n, p);
+ return q;
}
int space_count = 0;
-extern void * check_malloc(int n) {
- space_count++;
- return malloc(n);
+extern void *check_malloc(int n)
+{
+ space_count++;
+ return malloc(n);
}
-extern void check_free(void * p) {
- space_count--;
- free(p);
+extern void check_free(void *p)
+{
+ space_count--;
+ free(p);
}
/* To convert a block to a zero terminated string: */
-extern char * b_to_s(const symbol * p) {
- int n = SIZE(p);
- char * s = (char *)malloc(n + 1);
- {
- int i;
- for (i = 0; i < n; i++) {
- if (p[i] > 255) {
- printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]);
- exit(1);
- }
- s[i] = (char)p[i];
- }
- }
- s[n] = 0;
- return s;
+extern char *b_to_s(const symbol *p)
+{
+ int n = SIZE(p);
+ char *s = (char *) malloc(n + 1);
+ {
+ int i;
+ for (i = 0; i < n; i++) {
+ if (p[i] > 255) {
+ printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int) p[i]);
+ exit(1);
+ }
+ s[i] = (char) p[i];
+ }
+ }
+ s[n] = 0;
+ return s;
}
/* To add a zero terminated string to a block. If p = 0 the
block is created. */
-extern symbol * add_s_to_b(symbol * p, const char * s) {
- int n = strlen(s);
- int k;
- if (p == 0) p = create_b(n);
- k = SIZE(p);
- {
- int x = k + n - CAPACITY(p);
- if (x > 0) p = increase_capacity(p, x);
- }
- {
- int i;
- for (i = 0; i < n; i++) p[i + k] = s[i];
- }
- SIZE(p) += n;
- return p;
+extern symbol *add_s_to_b(symbol *p, const char *s)
+{
+ int n = strlen(s);
+ int k;
+ if (p == 0) p = create_b(n);
+ k = SIZE(p);
+ {
+ int x = k + n - CAPACITY(p);
+ if (x > 0) p = increase_capacity(p, x);
+ }
+ {
+ int i;
+ for (i = 0; i < n; i++) p[i + k] = s[i];
+ }
+ SIZE(p) += n;
+ return p;
}
/* The next section defines string handling capabilities in terms
@@ -159,129 +192,151 @@ extern symbol * add_s_to_b(symbol * p, const char * s) {
/* -------------------------------------------------------------*/
struct str {
- symbol * data;
+ symbol *data;
};
/* Create a new string. */
-extern struct str * str_new(void) {
+extern struct str *str_new(void)
+{
- struct str * output = (struct str *) malloc(sizeof(struct str));
- output->data = create_b(0);
- return output;
+ struct str *output = (struct str *) malloc(sizeof(struct str));
+ output->data = create_b(0);
+ return output;
}
/* Delete a string. */
-extern void str_delete(struct str * str) {
+extern void str_delete(struct str *str)
+{
- lose_b(str->data);
- free(str);
+ lose_b(str->data);
+ free(str);
}
/* Append a str to this str. */
-extern void str_append(struct str * str, const struct str * add) {
+extern void str_append(struct str *str, const struct str *add)
+{
- symbol * q = add->data;
- str->data = add_to_b(str->data, SIZE(q), q);
+ symbol *q = add->data;
+ str->data = add_to_b(str->data, SIZE(q), q);
}
/* Append a character to this str. */
-extern void str_append_ch(struct str * str, char add) {
+extern void str_append_ch(struct str *str, char add)
+{
- symbol q[1];
- q[0] = add;
- str->data = add_to_b(str->data, 1, q);
+ symbol q[1];
+ q[0] = add;
+ str->data = add_to_b(str->data, 1, q);
}
/* Append a low level block to a str. */
-extern void str_append_b(struct str * str, const symbol * q) {
+extern void str_append_b(struct str *str, const symbol *q)
+{
- str->data = add_to_b(str->data, SIZE(q), q);
+ str->data = add_to_b(str->data, SIZE(q), q);
}
/* Append the tail of a low level block to a str. */
-extern void str_append_b_tail(struct str * str, const symbol * q, int skip) {
- if (skip < 0 || skip >= SIZE(q)) return;
+extern void str_append_b_tail(struct str *str, const symbol *q, int skip)
+{
+ if (skip < 0 || skip >= SIZE(q)) return;
- str->data = add_to_b(str->data, SIZE(q) - skip, q + skip);
+ str->data = add_to_b(str->data, SIZE(q) - skip, q + skip);
}
/* Append a (char *, null terminated) string to a str. */
-extern void str_append_string(struct str * str, const char * s) {
+extern void str_append_string(struct str *str, const char *s)
+{
- str->data = add_s_to_b(str->data, s);
+ str->data = add_s_to_b(str->data, s);
}
/* Append an integer to a str. */
-extern void str_append_int(struct str * str, int i) {
+extern void str_append_int(struct str *str, int i)
+{
- char s[30];
- sprintf(s, "%d", i);
- str_append_string(str, s);
+ char s[30];
+ snprintf(s, sizeof(s), "%d", i);
+ str_append_string(str, s);
}
/* Clear a string */
-extern void str_clear(struct str * str) {
+extern void str_clear(struct str *str)
+{
- SIZE(str->data) = 0;
+ SIZE(str->data) = 0;
}
/* Set a string */
-extern void str_assign(struct str * str, const char * s) {
+extern void str_assign(struct str *str, const char *s)
+{
- str_clear(str);
- str_append_string(str, s);
+ str_clear(str);
+ str_append_string(str, s);
}
/* Copy a string. */
-extern struct str * str_copy(const struct str * old) {
+extern struct str *str_copy(const struct str *old)
+{
- struct str * newstr = str_new();
- str_append(newstr, old);
- return newstr;
+ struct str *newstr = str_new();
+ str_append(newstr, old);
+ return newstr;
}
/* Get the data stored in this str. */
-extern symbol * str_data(const struct str * str) {
+extern symbol *str_data(const struct str *str)
+{
- return str->data;
+ return str->data;
}
/* Get the length of the str. */
-extern int str_len(const struct str * str) {
+extern int str_len(const struct str *str)
+{
- return SIZE(str->data);
+ return SIZE(str->data);
}
/* Get the last character of the str.
*
* Or -1 if the string is empty.
*/
-extern int str_back(const struct str *str) {
- return SIZE(str->data) ? str->data[SIZE(str->data) - 1] : -1;
+extern int str_back(const struct str *str)
+{
+ return SIZE(str->data) ? str->data[SIZE(str->data) - 1] : -1;
}
-extern int get_utf8(const symbol * p, int * slot) {
- int b0, b1;
- b0 = *p++;
- if (b0 < 0xC0) { /* 1100 0000 */
- * slot = b0; return 1;
- }
- b1 = *p++;
- if (b0 < 0xE0) { /* 1110 0000 */
- * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
- }
- * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (*p & 0x3F); return 3;
+extern int get_utf8(const symbol *p, int *slot)
+{
+ int b0, b1;
+ b0 = *p++;
+ if (b0 < 0xC0) { /* 1100 0000 */
+ *slot = b0;
+ return 1;
+ }
+ b1 = *p++;
+ if (b0 < 0xE0) { /* 1110 0000 */
+ *slot = (b0 & 0x1F) << 6 | (b1 & 0x3F);
+ return 2;
+ }
+ *slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (*p & 0x3F);
+ return 3;
}
-extern int put_utf8(int ch, symbol * p) {
- if (ch < 0x80) {
- p[0] = ch; return 1;
- }
- if (ch < 0x800) {
- p[0] = (ch >> 6) | 0xC0;
- p[1] = (ch & 0x3F) | 0x80; return 2;
- }
- p[0] = (ch >> 12) | 0xE0;
- p[1] = ((ch >> 6) & 0x3F) | 0x80;
- p[2] = (ch & 0x3F) | 0x80; return 3;
+extern int put_utf8(int ch, symbol *p)
+{
+ if (ch < 0x80) {
+ p[0] = ch;
+ return 1;
+ }
+ if (ch < 0x800) {
+ p[0] = (ch >> 6) | 0xC0;
+ p[1] = (ch & 0x3F) | 0x80;
+ return 2;
+ }
+ p[0] = (ch >> 12) | 0xE0;
+ p[1] = ((ch >> 6) & 0x3F) | 0x80;
+ p[2] = (ch & 0x3F) | 0x80;
+ return 3;
}
diff --git a/contrib/xxhash/CMakeLists.txt b/contrib/xxhash/CMakeLists.txt
index 5091d261d..41985724f 100644
--- a/contrib/xxhash/CMakeLists.txt
+++ b/contrib/xxhash/CMakeLists.txt
@@ -6,8 +6,6 @@ IF (ENABLE_FULL_DEBUG MATCHES "OFF")
if ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
SET_TARGET_PROPERTIES(rspamd-xxhash PROPERTIES COMPILE_FLAGS "-O3")
endif ()
-else ()
- ADD_DEFINITIONS(-DXXH_NO_INLINE_HINTS=1)
ENDIF ()
-
+ADD_DEFINITIONS(-DXXH_NO_INLINE_HINTS=1)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DXXH_NO_INLINE_HINTS=1" PARENT_SCOPE)
diff --git a/debian/rules b/debian/rules
index be06aa6d6..8c3ed647b 100755
--- a/debian/rules
+++ b/debian/rules
@@ -7,7 +7,7 @@ export ASAN_OPTIONS=detect_leaks=0
DEB_HOST_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)
builddir = debian/build/flavor-
FLAVORS = release asan
-CONFIG_asan = -DCMAKE_BUILD_TYPE=Debug -DSANITIZE=address -DENABLE_LTO=OFF
+CONFIG_asan = -DCMAKE_BUILD_TYPE=Debug -DSANITIZE=address -DENABLE_LTO=OFF -DENABLE_FULL_DEBUG=ON
CONFIG_release = -DCMAKE_BUILD_TYPE=RelWithDebInfo -DENABLE_LTO=ON
ENABLE_LUAJIT := -DENABLE_LUAJIT=ON -DLUA_ROOT=/luajit-build
diff --git a/doc/rspamc.1 b/doc/rspamc.1
index de022d542..51a4504cc 100644
--- a/doc/rspamc.1
+++ b/doc/rspamc.1
@@ -68,7 +68,8 @@ fuzzy* commands requires input.
Specify host and port
.TP
\-P \f[I]password\f[R], \-\-password=\f[I]password\f[R]
-Specify control password
+Specify control password. Can be an absolute or relative path, in which
+case the password will be read from that file.
.TP
\-c \f[I]name\f[R], \-\-classifier=\f[I]name\f[R]
Classifier to learn spam or ham (bayes is used by default)
diff --git a/doc/rspamc.1.md b/doc/rspamc.1.md
index 335c22513..4389d25f4 100644
--- a/doc/rspamc.1.md
+++ b/doc/rspamc.1.md
@@ -42,7 +42,7 @@ requires input.
: Specify host and port
-P *password*, \--password=*password*
-: Specify control password
+: Specify control password. Can be an absolute or relative path, in which case the password will be read from that file.
-c *name*, \--classifier=*name*
: Classifier to learn spam or ham (bayes is used by default)
diff --git a/eslint.config.mjs b/eslint.config.mjs
index 2f256afcb..bdd6ede48 100644
--- a/eslint.config.mjs
+++ b/eslint.config.mjs
@@ -13,6 +13,9 @@ export default [
...globals.browser,
define: false,
},
+ parserOptions: {
+ ecmaVersion: 2020,
+ },
sourceType: "script",
},
plugins: {
diff --git a/interface/css/rspamd.css b/interface/css/rspamd.css
index 896f92008..a4335c07b 100644
--- a/interface/css/rspamd.css
+++ b/interface/css/rspamd.css
@@ -95,6 +95,15 @@ fieldset[disabled] .btn {
pointer-events: auto;
cursor: not-allowed;
}
+.card.disabled,
+.input-group.disabled {
+ cursor: not-allowed;
+ opacity: 0.65;
+}
+.card.disabled *,
+.input-group.disabled * {
+ pointer-events: none;
+}
.w-1 {
width: 1%;
}
@@ -418,13 +427,18 @@ table#symbolsTable input[type="number"] {
.outline-dashed-primary { outline: 2px dashed var(--bs-primary); }
-#scanMsgSource:placeholder-shown {
+#scanMsgSource:placeholder-shown,
+#selectorsMsgArea:placeholder-shown {
background-image: url("../img/drop-area.svg");
background-repeat: no-repeat;
background-position: center;
opacity: 0.8;
}
-#scanMsgSource:not(:placeholder-shown) { background-image: none;}
+
+#scanMsgSource:not(:placeholder-shown),
+#selectorsMsgArea:not(:placeholder-shown) {
+ background-image: none;
+}
.scorebar-spam {
background-color: rgba(240 0 0 / 0.1) !important;
diff --git a/interface/index.html b/interface/index.html
index cf2187601..30181e788 100644
--- a/interface/index.html
+++ b/interface/index.html
@@ -453,7 +453,8 @@
</div>
<div class="input-group d-inline-flex w-auto my-1">
<label for="fuzzy-flag" class="input-group-text">Flag</label>
- <input id="fuzzy-flag" class="form-control" value="1" min="1" type="number">
+ <select id="fuzzy-flag-picker" class="form-select"></select>
+ <input id="fuzzy-flag" class="form-control flex-grow-0" value="1" min="1" type="number">
<button class="btn btn-warning d-flex align-items-center" data-upload="compute-fuzzy"><i class="fas fa-hashtag me-2"></i>Compute fuzzy hashes</button>
</div>
<div class="float-end my-1">
@@ -479,9 +480,12 @@
<div class="card bg-light shadow card-body card p-2">
<p>Learn Bayesian classifier:</p>
<form>
- <div class="btn-group">
- <button class="btn btn-success d-flex align-items-center" type="button" data-upload="learnham" disabled><i class="fas fa-thumbs-up me-2"></i>Upload HAM</button>
- <button class="btn btn-danger d-flex align-items-center" type="button" data-upload="learnspam" disabled><i class="fas fa-thumbs-down me-2"></i>Upload SPAM</button>
+ <div class="d-flex flex-wrap flex-lg-column align-items-start align-items-lg-stretch gap-2">
+ <select id="classifier" class="form-select w-auto"></select>
+ <div class="btn-group">
+ <button class="btn btn-success d-flex align-items-center" type="button" data-upload="learnham" disabled><i class="fas fa-thumbs-up me-2"></i>Upload HAM</button>
+ <button class="btn btn-danger d-flex align-items-center" type="button" data-upload="learnspam" disabled><i class="fas fa-thumbs-down me-2"></i>Upload SPAM</button>
+ </div>
</div>
</form>
</div>
@@ -492,6 +496,7 @@
<div class="row g-2 align-items-center">
<div class="col-auto d-flex align-items-center me-1">
<label for="fuzzyFlagText" class="me-1">Flag:</label>
+ <select id="fuzzyFlagText-picker" class="form-select"></select>
<input id="fuzzyFlagText" class="form-control" type="number" value="1"/>
</div>
<div class="col-auto d-flex align-items-center me-2">
@@ -580,6 +585,10 @@
<div class="card-header text-secondary py-2 d-flex align-items-center">
<span class="icon me-3"><i class="fas fa-envelope"></i></span>
<span class="h6 fw-bolder my-auto">Test Rspamd selectors</span>
+ <div class="d-flex input-group-sm align-items-center ms-auto">
+ <label for="formFile" class="col-auto col-form-label-sm me-1">Choose a file:</label>
+ <input class="form-control form-control-sm btn btn-secondary" id="selectorsFile" type="file">
+ </div>
</div>
<div class="card-body p-0">
<div class="row h-100 m-0" id="row-main">
@@ -608,7 +617,7 @@
<div class="col">
<div class="form-group">
<label class="form-label" for="selectorsMsgArea">Message source:</label>
- <textarea class="form-control" id="selectorsMsgArea" rows="9" placeholder="Paste raw message source"></textarea>
+ <textarea class="form-control" id="selectorsMsgArea" rows="9" placeholder='Paste raw message source, drag and drop files here or use "Browse..." button.'></textarea>
</div>
<button class="btn btn-secondary d-flex align-items-center float-end" id="selectorsMsgClean"><i class="fas fa-trash-alt me-2"></i>Clean form</button>
</div>
diff --git a/interface/js/app/common.js b/interface/js/app/common.js
index ace4bbba1..6f37d739e 100644
--- a/interface/js/app/common.js
+++ b/interface/js/app/common.js
@@ -57,6 +57,20 @@ define(["jquery", "nprogress"],
}, 5000);
}
+ /**
+ * Perform a request to a single Rspamd neighbour server.
+ *
+ * @param {Array.<Object>} neighbours_status
+ * Array of neighbour status objects.
+ * @param {number} ind
+ * Index of this neighbour in the `neighbours_status` array.
+ * @param {string} req_url
+ * Relative controller endpoint with optional query string.
+ * @param {Object} o
+ * The same `options` object passed into `ui.query`.
+ *
+ * @returns {void}
+ */
function queryServer(neighbours_status, ind, req_url, o) {
neighbours_status[ind].checked = false;
neighbours_status[ind].data = {};
@@ -152,23 +166,51 @@ define(["jquery", "nprogress"],
};
/**
- * @param {string} url - A string containing the URL to which the request is sent
- * @param {Object} [options] - A set of key/value pairs that configure the Ajax request. All settings are optional.
+ * Perform an HTTP request to one or all Rspamd neighbours.
*
- * @param {Function} [options.complete] - A function to be called when the requests to all neighbours complete.
- * @param {Object|string|Array} [options.data] - Data to be sent to the server.
- * @param {Function} [options.error] - A function to be called if the request fails.
- * @param {string} [options.errorMessage] - Text to display in the alert message if the request fails.
- * @param {string} [options.errorOnceId] - A prefix of the alert ID to be added to the session storage. If the
- * parameter is set, the error for each server will be displayed only once per session.
- * @param {Object} [options.headers] - An object of additional header key/value pairs to send along with requests
- * using the XMLHttpRequest transport.
- * @param {string} [options.method] - The HTTP method to use for the request.
- * @param {Object} [options.params] - An object of additional jQuery.ajax() settings key/value pairs.
- * @param {string} [options.server] - A server to which send the request.
- * @param {Function} [options.success] - A function to be called if the request succeeds.
+ * @param {string} url
+ * Relative URL, including with optional query string (e.g. "plugins/selectors/check_selector?selector=from").
+ * @param {Object} [options]
+ * Ajax request configuration options.
+ * @param {Object|string|Array} [options.data]
+ * Request body for POST endpoints.
+ * @param {Object} [options.headers]
+ * Additional HTTP headers.
+ * @param {"GET"|"POST"} [options.method]
+ * HTTP method (defaults to "GET").
+ * @param {string} [options.server]
+ * Name or base-URL of the target server (defaults to the currently selected Rspamd neighbour).
+ * @param {Object} [options.params]
+ * Extra jQuery.ajax() settings (e.g. timeout, dataType).
+ * @param {string} [options.errorMessage]
+ * Text to show inside a Bootstrap alert on generic errors (e.g. network failure).
+ * @param {string} [options.errorOnceId]
+ * Prefix for an alert ID stored in session storage to ensure
+ * `errorMessage` is shown only once per server each session.
+ * @param {function(Array.<Object>, Object)} [options.success]
+ * Called on HTTP success. Receives:
+ * 1. results: Array of per-server status objects:
+ * {
+ * name: string,
+ * host: string,
+ * url: string, // full URL base for this neighbour
+ * checked: boolean, // whether this server was attempted
+ * status: boolean, // HTTP success (<400)
+ * data: any, // parsed JSON or raw text
+ * percentComplete: number
+ * }
+ * 2. jqXHR: jQuery XHR object with properties
+ * { readyState, status, statusText, responseText, responseJSON, … }
+ * @param {function(Object, Object, string, string)} [options.error]
+ * Called on HTTP error or network failure. Receives:
+ * 1. result: a per-server status object (status:false, data:{}).
+ * 2. jqXHR: jQuery XHR object (responseText, responseJSON, status, statusText).
+ * 3. textStatus: string describing error type ("error", "timeout", etc.).
+ * 4. errorThrown: exception message or HTTP statusText.
+ * @param {function()} [options.complete]
+ * Called once all servers have been tried; takes no arguments.
*
- * @returns {undefined}
+ * @returns {void}
*/
ui.query = function (url, options) {
// Force options to be an object
@@ -261,5 +303,63 @@ define(["jquery", "nprogress"],
).appendTo(ftFilter.$dropdown);
};
+ ui.fileUtils = {
+ readFile(files, callback, index = 0) {
+ const file = files[index];
+ const reader = new FileReader();
+ reader.onerror = () => alertMessage("alert-error", `Error reading file: ${file.name}`);
+ reader.onloadend = () => callback(reader.result);
+ reader.readAsText(file);
+ },
+
+ setFileInputFiles(fileInput, files, i) {
+ const dt = new DataTransfer();
+ if (arguments.length > 2) dt.items.add(files[i]);
+ $(fileInput).prop("files", dt.files);
+ },
+
+ setupFileHandling(textArea, fileInput, fileSet, enable_btn_cb, multiple_files_cb) {
+ const dragoverClassList = "outline-dashed-primary bg-primary-subtle";
+ const {readFile, setFileInputFiles} = ui.fileUtils;
+
+ function handleFileInput(fileSource) {
+ fileSet.files = fileSource.files;
+ fileSet.index = 0;
+ const {files} = fileSet;
+
+ if (files.length === 1) {
+ setFileInputFiles(fileInput, files, 0);
+ enable_btn_cb();
+ readFile(files, (result) => {
+ $(textArea).val(result);
+ enable_btn_cb();
+ });
+ } else if (multiple_files_cb) {
+ multiple_files_cb(files);
+ } else {
+ alertMessage("alert-warning", "Multiple files processing is not supported.");
+ }
+ }
+
+ $(textArea)
+ .on("dragenter dragover dragleave drop", (e) => {
+ e.preventDefault();
+ e.stopPropagation();
+ })
+ .on("dragenter dragover", () => $(textArea).addClass(dragoverClassList))
+ .on("dragleave drop", () => $(textArea).removeClass(dragoverClassList))
+ .on("drop", (e) => handleFileInput(e.originalEvent.dataTransfer))
+ .on("input", () => {
+ enable_btn_cb();
+ if (fileSet.files) {
+ fileSet.files = null;
+ setFileInputFiles(fileInput, fileSet.files);
+ }
+ });
+
+ $(fileInput).on("change", (e) => handleFileInput(e.target));
+ }
+ };
+
return ui;
});
diff --git a/interface/js/app/rspamd.js b/interface/js/app/rspamd.js
index 6d047d6f6..ceba6864b 100644
--- a/interface/js/app/rspamd.js
+++ b/interface/js/app/rspamd.js
@@ -176,7 +176,7 @@ define(["jquery", "app/common", "stickytabs", "visibility",
require(["app/symbols"], (module) => module.getSymbols());
break;
case "#scan_nav":
- require(["app/upload"]);
+ require(["app/upload"], (module) => module.getFuzzyStorages());
break;
case "#selectors_nav":
require(["app/selectors"], (module) => module.displayUI());
@@ -236,6 +236,8 @@ define(["jquery", "app/common", "stickytabs", "visibility",
complete: function () {
ajaxSetup(localStorage.getItem("ajax_timeout"));
+ if (require.defined("app/upload")) require(["app/upload"], (module) => module.getClassifiers());
+
if (common.read_only) {
$(".ro-disable").attr("disabled", true);
$(".ro-hide").hide();
diff --git a/interface/js/app/selectors.js b/interface/js/app/selectors.js
index c2b8b27e5..4a1c6d0d0 100644
--- a/interface/js/app/selectors.js
+++ b/interface/js/app/selectors.js
@@ -2,6 +2,7 @@ define(["jquery", "app/common"],
($, common) => {
"use strict";
const ui = {};
+ const fileSet = {files: null, index: null};
function enable_disable_check_btn() {
$("#selectorsChkMsgBtn").prop("disabled", (
@@ -129,12 +130,15 @@ define(["jquery", "app/common"],
return false;
});
- $("#selectorsMsgArea").on("input", () => {
- enable_disable_check_btn();
- });
$("#selectorsSelArea").on("input", () => {
checkSelectors();
});
+ $("#selectorsMsgClean").on("click", () => {
+ $("#selectorsMsgArea").val("");
+ $("#selectorsFile").val("");
+ });
+
+ common.fileUtils.setupFileHandling("#selectorsMsgArea", "#selectorsFile", fileSet, enable_disable_check_btn);
return ui;
});
diff --git a/interface/js/app/symbols.js b/interface/js/app/symbols.js
index 3ff5d5a4b..21711a1e5 100644
--- a/interface/js/app/symbols.js
+++ b/interface/js/app/symbols.js
@@ -135,6 +135,7 @@ define(["jquery", "app/common", "footable"],
construct: function (instance) {
this._super(instance);
[,this.groups] = items;
+ this.groups.sort((a, b) => a.toLowerCase().localeCompare(b.toLowerCase()));
this.def = "Any group";
this.$group = null;
},
diff --git a/interface/js/app/upload.js b/interface/js/app/upload.js
index d5283cdb1..c82bb2306 100644
--- a/interface/js/app/upload.js
+++ b/interface/js/app/upload.js
@@ -28,8 +28,7 @@ define(["jquery", "app/common", "app/libft"],
($, common, libft) => {
"use strict";
const ui = {};
- let files = null;
- let filesIdx = null;
+ const fileSet = {files: null, index: null};
let scanTextHeaders = {};
function uploadText(data, url, headers, method = "POST") {
@@ -67,19 +66,7 @@ define(["jquery", "app/common", "app/libft"],
function enable_disable_scan_btn(disable) {
$("#scan button:not(#cleanScanHistory, #deleteHashesBtn, #scanOptionsToggle, .ft-columns-btn)")
- .prop("disabled", (disable || $.trim($("textarea").val()).length === 0));
- }
-
- function setFileInputFiles(i) {
- const dt = new DataTransfer();
- if (arguments.length) dt.items.add(files[i]);
- $("#formFile").prop("files", dt.files);
- }
-
- function readFile(callback, i) {
- const reader = new FileReader();
- reader.readAsText(files[(arguments.length === 1) ? 0 : i]);
- reader.onload = () => callback(reader.result);
+ .prop("disabled", (disable || $.trim($("#scanMsgSource").val()).length === 0));
}
function scanText(data) {
@@ -100,7 +87,7 @@ define(["jquery", "app/common", "app/libft"],
const {items} = o;
common.symbols.scan.push(o.symbols[0]);
- if (files) items[0].file = files[filesIdx].name;
+ if (fileSet.files) items[0].file = fileSet.files[fileSet.index].name;
if (Object.prototype.hasOwnProperty.call(common.tables, "scan")) {
common.tables.scan.rows.load(items, true);
@@ -108,14 +95,16 @@ define(["jquery", "app/common", "app/libft"],
require(["footable"], () => {
libft.initHistoryTable(data, items, "scan", libft.columns_v2("scan"), true,
() => {
- if (files && filesIdx < files.length - 1) {
- readFile((result) => {
- if (filesIdx === files.length - 1) {
+ const {files} = fileSet;
+ if (files && fileSet.index < files.length - 1) {
+ common.fileUtils.readFile(files, (result) => {
+ const {index} = fileSet;
+ if (index === files.length - 1) {
$("#scanMsgSource").val(result);
- setFileInputFiles(filesIdx);
+ common.fileUtils.setFileInputFiles("#formFile", files, index);
}
scanText(result);
- }, ++filesIdx);
+ }, ++fileSet.index);
} else {
enable_disable_scan_btn();
$("#cleanScanHistory, #scan .ft-columns-dropdown .btn-dropdown-apply")
@@ -196,13 +185,6 @@ define(["jquery", "app/common", "app/libft"],
});
enable_disable_scan_btn();
- $("textarea").on("input", () => {
- enable_disable_scan_btn();
- if (files) {
- files = null;
- setFileInputFiles();
- }
- });
$("#scanClean").on("click", () => {
enable_disable_scan_btn(true);
@@ -235,7 +217,10 @@ define(["jquery", "app/common", "app/libft"],
getFuzzyHashes(data);
} else {
let headers = {};
- if (source === "fuzzyadd") {
+ if (source === "learnham" || source === "learnspam") {
+ const classifier = $("#classifier").val();
+ if (classifier) headers = {classifier: classifier};
+ } else if (source === "fuzzyadd") {
headers = {
flag: $("#fuzzyFlagText").val(),
weight: $("#fuzzyWeightText").val()
@@ -304,39 +289,122 @@ define(["jquery", "app/common", "app/libft"],
});
- function fileInputHandler(obj) {
- ({files} = obj);
- filesIdx = 0;
-
- if (files.length === 1) {
- setFileInputFiles(0);
- enable_disable_scan_btn();
- readFile((result) => {
- $("#scanMsgSource").val(result);
- enable_disable_scan_btn();
- });
+ function multiple_files_cb(files) {
// eslint-disable-next-line no-alert
- } else if (files.length < 10 || confirm("Are you sure you want to scan " + files.length + " files?")) {
+ if (files.length < 10 || confirm("Are you sure you want to scan " + files.length + " files?")) {
getScanTextHeaders();
- readFile((result) => scanText(result));
+ common.fileUtils.readFile(files, (result) => scanText(result));
}
}
- const dragoverClassList = "outline-dashed-primary bg-primary-subtle";
- $("#scanMsgSource")
- .on("dragenter dragover dragleave drop", (e) => {
- e.preventDefault();
- e.stopPropagation();
- })
- .on("dragenter dragover", () => {
- $("#scanMsgSource").addClass(dragoverClassList);
- })
- .on("dragleave drop", () => {
- $("#scanMsgSource").removeClass(dragoverClassList);
- })
- .on("drop", (e) => fileInputHandler(e.originalEvent.dataTransfer));
-
- $("#formFile").on("change", (e) => fileInputHandler(e.target));
+ common.fileUtils.setupFileHandling("#scanMsgSource", "#formFile", fileSet, enable_disable_scan_btn, multiple_files_cb);
+
+ ui.getClassifiers = function () {
+ if (!common.read_only) {
+ const sel = $("#classifier").empty().append($("<option>", {value: "", text: "All classifiers"}));
+ common.query("/bayes/classifiers", {
+ success: function (data) {
+ data[0].data.forEach((c) => sel.append($("<option>", {value: c, text: c})));
+ },
+ server: common.getServer()
+ });
+ }
+ };
+ ui.getClassifiers();
+
+
+ const fuzzyWidgets = [
+ {
+ picker: "#fuzzy-flag-picker",
+ input: "#fuzzy-flag",
+ container: ($picker) => $picker.parent()
+ },
+ {
+ picker: "#fuzzyFlagText-picker",
+ input: "#fuzzyFlagText",
+ container: ($picker) => $picker.closest("div.card")
+ }
+ ];
+
+ function toggleWidgets(showPicker, showInput) {
+ fuzzyWidgets.forEach(({picker, input}) => {
+ $(picker)[showPicker ? "show" : "hide"]();
+ $(input)[showInput ? "show" : "hide"]();
+ });
+ }
+
+ function setWidgetsDisabled(disable) {
+ fuzzyWidgets.forEach(({picker, container}) => {
+ container($(picker))[disable ? "addClass" : "removeClass"]("disabled");
+ });
+ }
+
+ let lastFuzzyStoragesReq = {config_id: null, server: null};
+
+ ui.getFuzzyStorages = function () {
+ const server = common.getServer();
+
+ const servers = JSON.parse(sessionStorage.getItem("Credentials") || "{}");
+ const config_id = servers[server]?.data?.config_id;
+
+ if ((config_id && config_id === lastFuzzyStoragesReq.config_id) ||
+ (!config_id && server === lastFuzzyStoragesReq.server)) {
+ return;
+ }
+ lastFuzzyStoragesReq = {config_id: config_id, server: server};
+
+ fuzzyWidgets.forEach(({picker, container}) => container($(picker)).removeAttr("title"));
+
+ common.query("plugins/fuzzy/storages", {
+ success: function (data) {
+ const storages = data[0].data.storages || {};
+ const hasWritableStorages = Object.keys(storages).some((name) => !storages[name].read_only);
+
+ toggleWidgets(true, false);
+ setWidgetsDisabled(!hasWritableStorages);
+
+ fuzzyWidgets.forEach(({picker, input}) => {
+ const $sel = $(picker);
+
+ $sel.empty();
+
+ if (hasWritableStorages) {
+ Object.entries(storages).forEach(([name, info]) => {
+ if (!info.read_only) {
+ Object.entries(info.flags).forEach(([symbol, val]) => {
+ $sel.append($("<option>", {value: val, text: `${name}:${symbol} (${val})`}));
+ });
+ }
+ });
+ $(input).val($sel.val());
+ $sel.off("change").on("change", () => $(input).val($sel.val()));
+ } else {
+ $sel.append($("<option>", {value: "", text: "No writable storages"}));
+ }
+ });
+ },
+ error: function (_result, _jqXHR, _textStatus, errorThrown) {
+ if (errorThrown === "fuzzy_check is not enabled") {
+ toggleWidgets(true, false);
+ setWidgetsDisabled(true);
+
+ fuzzyWidgets.forEach(({picker, container}) => {
+ const $picker = $(picker);
+ $picker
+ .empty()
+ .append($("<option>", {value: "", text: "fuzzy_check disabled"}))
+ .show();
+ container($picker)
+ .attr("title", "fuzzy_check module is not enabled in server configuration.");
+ });
+ } else {
+ toggleWidgets(false, true);
+ setWidgetsDisabled(false);
+ }
+ },
+ server: server
+ });
+ };
return ui;
});
diff --git a/lualib/lua_cfg_transform.lua b/lualib/lua_cfg_transform.lua
index aaa0c7390..ec11ef299 100644
--- a/lualib/lua_cfg_transform.lua
+++ b/lualib/lua_cfg_transform.lua
@@ -376,7 +376,7 @@ return function(cfg)
local next_act = actions_order[j]
if actions:at(next_act) and actions:at(next_act):type() == 'number' then
local next_score = actions:at(next_act):unwrap()
- if next_score <= score then
+ if type(score) == 'number' and type(next_score) == 'number' and next_score <= score then
logger.errx(rspamd_config, 'invalid actions thresholds order: action %s (%s) must have lower ' ..
'score than action %s (%s)', act, score, next_act, next_score)
ret = false
diff --git a/lualib/lua_magic/patterns.lua b/lualib/lua_magic/patterns.lua
index 971ddd95f..4a5abd8ce 100644
--- a/lualib/lua_magic/patterns.lua
+++ b/lualib/lua_magic/patterns.lua
@@ -466,6 +466,23 @@ local patterns = {
},
}
},
+ heic = {
+ matches = {
+ {
+ -- HEIC/HEIF file format signature
+ -- Starts with ftyp followed by specific brand identifiers
+ string = "^....ftyphe[im][cs]",
+ position = 12,
+ weight = 60,
+ },
+ {
+ -- Alternative signature for HEIC/HEIF
+ string = [[^....ftypmif1]],
+ position = 12,
+ weight = 60,
+ },
+ }
+ },
}
return patterns
diff --git a/lualib/lua_magic/types.lua b/lualib/lua_magic/types.lua
index 3dce2e1f8..ad4ae4349 100644
--- a/lualib/lua_magic/types.lua
+++ b/lualib/lua_magic/types.lua
@@ -279,6 +279,11 @@ local types = {
ct = 'image/bmp',
av_check = false,
},
+ heic = {
+ type = 'image',
+ ct = 'image/heic',
+ av_check = false,
+ },
dwg = {
type = 'image',
ct = 'image/vnd.dwg',
@@ -324,4 +329,4 @@ local types = {
},
}
-return types \ No newline at end of file
+return types
diff --git a/lualib/lua_redis.lua b/lualib/lua_redis.lua
index a21b97f89..195b7759f 100644
--- a/lualib/lua_redis.lua
+++ b/lualib/lua_redis.lua
@@ -1129,9 +1129,9 @@ local function redis_make_request_taskless(ev_base, cfg, redis_params, key,
end
--[[[
--- @function lua_redis.redis_make_request_taskless(ev_base, redis_params, key, is_write, callback, command, args)
+-- @function lua_redis.redis_make_request_taskless(ev_base, cfg, redis_params, key, is_write, callback, command, args)
-- Sends a request to Redis in context where `task` is not available for some specific use-cases
--- Identical to redis_make_request() except in that first parameter is an `event base` object
+-- Identical to redis_make_request() except in that first parameter is an `event base` object and the second one is the 'config' object
--]]
exports.rspamd_redis_make_request_taskless = redis_make_request_taskless
@@ -1207,15 +1207,13 @@ local function prepare_redis_call(script)
return options
end
-local function is_all_servers_ready(script)
+local function is_any_server_ready(script)
for _, s in ipairs(script.servers_ready) do
- if s == "unsent" or s == "tempfail" then
- return false
+ if s == "done" then
+ return true
end
end
-
- -- We assume that permanent errors are not recoverable, so we will just skip those servers
- return true
+ return false
end
local function is_all_servers_failed(script)
@@ -1269,7 +1267,7 @@ local function load_script_task(script, task, is_write)
script.sha = data -- We assume that sha is the same on all servers
script.servers_ready[idx] = "done"
end
- if is_all_servers_ready(script) then
+ if is_any_server_ready(script) then
script_set_loaded(script)
elseif is_all_servers_failed(script) then
script.pending_upload = false
@@ -1287,7 +1285,7 @@ local function load_script_task(script, task, is_write)
end
end
- if is_all_servers_ready(script) then
+ if is_any_server_ready(script) then
script_set_loaded(script)
elseif is_all_servers_failed(script) then
script.pending_upload = false
@@ -1314,7 +1312,6 @@ local function load_script_taskless(script, cfg, ev_base, is_write)
err, script.caller.short_src, script.caller.currentline)
opt.upstream:fail()
script.servers_ready[idx] = "failed"
- return
else
-- Assume temporary error
logger.infox(cfg, 'temporary error uploading script %s to %s: %s; registered from: %s:%s',
@@ -1322,7 +1319,6 @@ local function load_script_taskless(script, cfg, ev_base, is_write)
opt.upstream:get_addr():to_string(true),
err, script.caller.short_src, script.caller.currentline)
script.servers_ready[idx] = "tempfail"
- return
end
else
opt.upstream:ok()
@@ -1335,7 +1331,7 @@ local function load_script_taskless(script, cfg, ev_base, is_write)
script.servers_ready[idx] = "done"
end
- if is_all_servers_ready(script) then
+ if is_any_server_ready(script) then
script_set_loaded(script)
elseif is_all_servers_failed(script) then
script.pending_upload = false
@@ -1353,7 +1349,7 @@ local function load_script_taskless(script, cfg, ev_base, is_write)
end
end
- if is_all_servers_ready(script) then
+ if is_any_server_ready(script) then
script_set_loaded(script)
elseif is_all_servers_failed(script) then
script.pending_upload = false
@@ -1482,6 +1478,10 @@ local function exec_redis_script(id, params, callback, keys, args)
script.sha = nil
script.loaded = nil
script.pending_upload = true
+ -- We must initialize all servers as we don't know here which one failed
+ for i, _ in ipairs(script.servers_ready) do
+ script.servers_ready[i] = "unsent"
+ end
-- Reload scripts if this has not been initiated yet
if params.task then
load_script_task(script, params.task)
diff --git a/lualib/redis_scripts/bayes_cache_learn.lua b/lualib/redis_scripts/bayes_cache_learn.lua
index d8a2d878e..7d44a73ef 100644
--- a/lualib/redis_scripts/bayes_cache_learn.lua
+++ b/lualib/redis_scripts/bayes_cache_learn.lua
@@ -1,7 +1,7 @@
-- Lua script to perform cache checking for bayes classification
-- This script accepts the following parameters:
-- key1 - cache id
--- key3 - is spam (1 or 0)
+-- key2 - is spam (1 or 0)
-- key3 - configuration table in message pack
local cache_id = KEYS[1]
diff --git a/lualib/rspamadm/mime.lua b/lualib/rspamadm/mime.lua
index e0b23e16c..a20e47e23 100644
--- a/lualib/rspamadm/mime.lua
+++ b/lualib/rspamadm/mime.lua
@@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-]]--
+]] --
local argparse = require "argparse"
local ansicolors = require "ansicolors"
@@ -35,94 +35,94 @@ local parser = argparse()
:require_command(true)
parser:option "-c --config"
- :description "Path to config file"
- :argname("<cfg>")
- :default(rspamd_paths["CONFDIR"] .. "/" .. "rspamd.conf")
+ :description "Path to config file"
+ :argname("<cfg>")
+ :default(rspamd_paths["CONFDIR"] .. "/" .. "rspamd.conf")
parser:mutex(
- parser:flag "-j --json"
- :description "JSON output",
- parser:flag "-U --ucl"
- :description "UCL output",
- parser:flag "-M --messagepack"
- :description "MessagePack output"
+ parser:flag "-j --json"
+ :description "JSON output",
+ parser:flag "-U --ucl"
+ :description "UCL output",
+ parser:flag "-M --messagepack"
+ :description "MessagePack output"
)
parser:flag "-C --compact"
- :description "Use compact format"
+ :description "Use compact format"
parser:flag "--no-file"
- :description "Do not print filename"
+ :description "Do not print filename"
-- Extract subcommand
local extract = parser:command "extract ex e"
- :description "Extracts data from MIME messages"
+ :description "Extracts data from MIME messages"
extract:argument "file"
- :description "File to process"
- :argname "<file>"
- :args "+"
+ :description "File to process"
+ :argname "<file>"
+ :args "+"
extract:flag "-t --text"
- :description "Extracts plain text data from a message"
+ :description "Extracts plain text data from a message"
extract:flag "-H --html"
- :description "Extracts htm data from a message"
+ :description "Extracts htm data from a message"
extract:option "-o --output"
- :description "Output format ('raw', 'content', 'oneline', 'decoded', 'decoded_utf')"
- :argname("<type>")
- :convert {
- raw = "raw",
- content = "content",
- oneline = "content_oneline",
- decoded = "raw_parsed",
- decoded_utf = "raw_utf"
-}
- :default "content"
+ :description "Output format ('raw', 'content', 'oneline', 'decoded', 'decoded_utf')"
+ :argname("<type>")
+ :convert {
+ raw = "raw",
+ content = "content",
+ oneline = "content_oneline",
+ decoded = "raw_parsed",
+ decoded_utf = "raw_utf"
+ }
+ :default "content"
extract:flag "-w --words"
- :description "Extracts words"
+ :description "Extracts words"
extract:flag "-p --part"
- :description "Show part info"
+ :description "Show part info"
extract:flag "-s --structure"
- :description "Show structure info (e.g. HTML tags)"
+ :description "Show structure info (e.g. HTML tags)"
extract:flag "-i --invisible"
- :description "Show invisible content for HTML parts"
+ :description "Show invisible content for HTML parts"
extract:option "-F --words-format"
- :description "Words format ('stem', 'norm', 'raw', 'full')"
- :argname("<type>")
- :convert {
- stem = "stem",
- norm = "norm",
- raw = "raw",
- full = "full",
-}
- :default "stem"
+ :description "Words format ('stem', 'norm', 'raw', 'full')"
+ :argname("<type>")
+ :convert {
+ stem = "stem",
+ norm = "norm",
+ raw = "raw",
+ full = "full",
+ }
+ :default "stem"
local stat = parser:command "stat st s"
- :description "Extracts statistical data from MIME messages"
+ :description "Extracts statistical data from MIME messages"
stat:argument "file"
:description "File to process"
:argname "<file>"
:args "+"
stat:mutex(
- stat:flag "-m --meta"
- :description "Lua metatokens",
- stat:flag "-b --bayes"
- :description "Bayes tokens",
- stat:flag "-F --fuzzy"
- :description "Fuzzy hashes"
+ stat:flag "-m --meta"
+ :description "Lua metatokens",
+ stat:flag "-b --bayes"
+ :description "Bayes tokens",
+ stat:flag "-F --fuzzy"
+ :description "Fuzzy hashes"
)
stat:flag "-s --shingles"
:description "Show shingles for fuzzy hashes"
local urls = parser:command "urls url u"
- :description "Extracts URLs from MIME messages"
+ :description "Extracts URLs from MIME messages"
urls:argument "file"
:description "File to process"
:argname "<file>"
:args "+"
urls:mutex(
- urls:flag "-t --tld"
- :description "Get TLDs only",
- urls:flag "-H --host"
- :description "Get hosts only",
- urls:flag "-f --full"
- :description "Show piecewise urls as processed by Rspamd"
+ urls:flag "-t --tld"
+ :description "Get TLDs only",
+ urls:flag "-H --host"
+ :description "Get hosts only",
+ urls:flag "-f --full"
+ :description "Show piecewise urls as processed by Rspamd"
)
urls:flag "-u --unique"
@@ -135,75 +135,75 @@ urls:flag "-r --reverse"
:description "Reverse sort order"
local modify = parser:command "modify mod m"
- :description "Modifies MIME message"
+ :description "Modifies MIME message"
modify:argument "file"
- :description "File to process"
- :argname "<file>"
- :args "+"
+ :description "File to process"
+ :argname "<file>"
+ :args "+"
modify:option "-a --add-header"
- :description "Adds specific header"
- :argname "<header=value>"
- :count "*"
+ :description "Adds specific header"
+ :argname "<header=value>"
+ :count "*"
modify:option "-r --remove-header"
- :description "Removes specific header (all occurrences)"
- :argname "<header>"
- :count "*"
+ :description "Removes specific header (all occurrences)"
+ :argname "<header>"
+ :count "*"
modify:option "-R --rewrite-header"
- :description "Rewrites specific header, uses Lua string.format pattern"
- :argname "<header=pattern>"
- :count "*"
+ :description "Rewrites specific header, uses Lua string.format pattern"
+ :argname "<header=pattern>"
+ :count "*"
modify:option "-t --text-footer"
- :description "Adds footer to text/plain parts from a specific file"
- :argname "<file>"
+ :description "Adds footer to text/plain parts from a specific file"
+ :argname "<file>"
modify:option "-H --html-footer"
- :description "Adds footer to text/html parts from a specific file"
- :argname "<file>"
+ :description "Adds footer to text/html parts from a specific file"
+ :argname "<file>"
local strip = parser:command "strip"
- :description "Strip attachments from a message"
+ :description "Strip attachments from a message"
strip:argument "file"
- :description "File to process"
- :argname "<file>"
- :args "+"
+ :description "File to process"
+ :argname "<file>"
+ :args "+"
strip:flag "-i --keep-images"
- :description "Keep images"
+ :description "Keep images"
strip:option "--min-text-size"
- :description "Minimal text size to keep"
- :argname "<size>"
- :convert(tonumber)
- :default(0)
+ :description "Minimal text size to keep"
+ :argname "<size>"
+ :convert(tonumber)
+ :default(0)
strip:option "--max-text-size"
- :description "Max text size to keep"
- :argname "<size>"
- :convert(tonumber)
- :default(math.huge)
+ :description "Max text size to keep"
+ :argname "<size>"
+ :convert(tonumber)
+ :default(math.huge)
local anonymize = parser:command "anonymize"
- :description "Try to remove sensitive information from a message"
+ :description "Try to remove sensitive information from a message"
anonymize:argument "file"
- :description "File to process"
- :argname "<file>"
- :args "+"
+ :description "File to process"
+ :argname "<file>"
+ :args "+"
anonymize:option "--exclude-header -X"
- :description "Exclude specific headers from anonymization"
- :argname "<header>"
- :count "*"
+ :description "Exclude specific headers from anonymization"
+ :argname "<header>"
+ :count "*"
anonymize:option "--include-header -I"
- :description "Include specific headers from anonymization"
- :argname "<header>"
- :count "*"
+ :description "Include specific headers from anonymization"
+ :argname "<header>"
+ :count "*"
anonymize:flag "--gpt"
- :description "Use LLM model for anonymization (requires GPT plugin to be configured)"
+ :description "Use LLM model for anonymization (requires GPT plugin to be configured)"
anonymize:option "--model"
- :description "Model to use for anonymization"
- :argname "<model>"
+ :description "Model to use for anonymization"
+ :argname "<model>"
anonymize:option "--prompt"
- :description "Prompt to use for anonymization"
- :argname "<prompt>"
+ :description "Prompt to use for anonymization"
+ :argname "<prompt>"
local sign = parser:command "sign"
- :description "Performs DKIM signing"
+ :description "Performs DKIM signing"
sign:argument "file"
:description "File to process"
:argname "<file>"
@@ -225,33 +225,33 @@ sign:option "-t --type"
:description "ARC or DKIM signing"
:argname("<arc|dkim>")
:convert {
- ['arc'] = 'arc',
- ['dkim'] = 'dkim',
-}
+ ['arc'] = 'arc',
+ ['dkim'] = 'dkim',
+ }
:default 'dkim'
sign:option "-o --output"
:description "Output format"
:argname("<message|signature>")
:convert {
- ['message'] = 'message',
- ['signature'] = 'signature',
-}
+ ['message'] = 'message',
+ ['signature'] = 'signature',
+ }
:default 'message'
local dump = parser:command "dump"
- :description "Dumps a raw message in different formats"
+ :description "Dumps a raw message in different formats"
dump:argument "file"
:description "File to process"
:argname "<file>"
:args "+"
-- Duplicate format for convenience
dump:mutex(
- parser:flag "-j --json"
- :description "JSON output",
- parser:flag "-U --ucl"
- :description "UCL output",
- parser:flag "-M --messagepack"
- :description "MessagePack output"
+ parser:flag "-j --json"
+ :description "JSON output",
+ parser:flag "-U --ucl"
+ :description "UCL output",
+ parser:flag "-M --messagepack"
+ :description "MessagePack output"
)
dump:flag "-s --split"
:description "Split the output file contents such that no content is embedded"
@@ -260,7 +260,7 @@ dump:option "-o --outdir"
:description "Output directory"
:argname("<directory>")
-local function load_config(opts)
+local function load_config(opts, load_tokenizers)
local _r, err = rspamd_config:load_ucl(opts['config'])
if not _r then
@@ -273,6 +273,23 @@ local function load_config(opts)
rspamd_logger.errx('cannot process %s: %s', opts['config'], err)
os.exit(1)
end
+
+ -- Load custom tokenizers if requested
+ if load_tokenizers then
+ local success, tokenizer_err = rspamd_config:load_custom_tokenizers()
+ if not success then
+ rspamd_logger.errx('cannot load custom tokenizers: %s', tokenizer_err or 'unknown error')
+ -- Don't exit here as custom tokenizers are optional
+ rspamd_logger.warnx('proceeding without custom tokenizers')
+ end
+ end
+end
+
+-- Helper function to ensure proper cleanup of tokenizers
+local function cleanup_tokenizers()
+ if rspamd_config then
+ rspamd_config:unload_custom_tokenizers()
+ end
end
local function load_task(_, fname)
@@ -288,13 +305,13 @@ local function load_task(_, fname)
if not res then
parser:error(string.format('cannot read message from %s: %s', fname,
- task))
+ task))
return nil
end
if not task:process_message() then
parser:error(string.format('cannot read message from %s: %s', fname,
- 'failed to parse'))
+ 'failed to parse'))
return nil
end
@@ -335,7 +352,6 @@ local function print_elts(elts, opts, func)
io.write(ucl.to_format(elts, output_fmt(opts)))
else
fun.each(function(fname, elt)
-
if not opts.json and not opts.ucl then
if func then
elt = fun.map(func, elt)
@@ -357,7 +373,7 @@ local function extract_handler(opts)
if opts.words then
-- Enable stemming and urls detection
- load_config(opts)
+ load_config(opts, true) -- Load with custom tokenizers
rspamd_url.init(rspamd_config:get_tld_path())
rspamd_config:init_subsystem('langdet')
end
@@ -372,39 +388,38 @@ local function extract_handler(opts)
if not opts.json and not opts.ucl then
table.insert(out,
- rspamd_logger.slog('Part: %s: %s, language: %s, size: %s (%s raw), words: %s',
- part:get_mimepart():get_digest():sub(1, 8),
- t,
- part:get_language(),
- part:get_length(), part:get_raw_length(),
- part:get_words_count()))
+ rspamd_logger.slog('Part: %s: %s, language: %s, size: %s (%s raw), words: %s',
+ part:get_mimepart():get_digest():sub(1, 8),
+ t,
+ part:get_language(),
+ part:get_length(), part:get_raw_length(),
+ part:get_words_count()))
table.insert(out,
- rspamd_logger.slog('Stats: %s',
- fun.foldl(function(acc, k, v)
- if acc ~= '' then
- return string.format('%s, %s:%s', acc, k, v)
- else
- return string.format('%s:%s', k, v)
- end
- end, '', part:get_stats())))
+ rspamd_logger.slog('Stats: %s',
+ fun.foldl(function(acc, k, v)
+ if acc ~= '' then
+ return string.format('%s, %s:%s', acc, k, v)
+ else
+ return string.format('%s:%s', k, v)
+ end
+ end, '', part:get_stats())))
end
end
end
local function maybe_print_mime_part_info(part, out)
if opts.part then
-
if not opts.json and not opts.ucl then
local mtype, msubtype = part:get_type()
local det_mtype, det_msubtype = part:get_detected_type()
table.insert(out,
- rspamd_logger.slog('Mime Part: %s: %s/%s (%s/%s detected), filename: %s (%s detected ext), size: %s',
- part:get_digest():sub(1, 8),
- mtype, msubtype,
- det_mtype, det_msubtype,
- part:get_filename(),
- part:get_detected_ext(),
- part:get_length()))
+ rspamd_logger.slog('Mime Part: %s: %s/%s (%s/%s detected), filename: %s (%s detected ext), size: %s',
+ part:get_digest():sub(1, 8),
+ mtype, msubtype,
+ det_mtype, det_msubtype,
+ part:get_filename(),
+ part:get_detected_ext(),
+ part:get_length()))
end
end
end
@@ -416,17 +431,17 @@ local function extract_handler(opts)
return table.concat(words, ' ')
else
return table.concat(
- fun.totable(
- fun.map(function(w)
- -- [1] - stemmed word
- -- [2] - normalised word
- -- [3] - raw word
- -- [4] - flags (table of strings)
- return string.format('%s|%s|%s(%s)',
- w[3], w[2], w[1], table.concat(w[4], ','))
- end, words)
- ),
- ' '
+ fun.totable(
+ fun.map(function(w)
+ -- [1] - stemmed word
+ -- [2] - normalised word
+ -- [3] - raw word
+ -- [4] - flags (table of strings)
+ return string.format('%s|%s|%s(%s)',
+ w[3], w[2], w[1], table.concat(w[4], ','))
+ end, words)
+ ),
+ ' '
)
end
end
@@ -443,7 +458,7 @@ local function extract_handler(opts)
if opts.words then
local how_words = opts['words_format'] or 'stem'
table.insert(out_elts[fname], 'meta_words: ' ..
- print_words(task:get_meta_words(how_words), how_words == 'full'))
+ print_words(task:get_meta_words(how_words), how_words == 'full'))
end
if opts.text or opts.html then
@@ -466,7 +481,7 @@ local function extract_handler(opts)
if opts.words then
local how_words = opts['words_format'] or 'stem'
table.insert(out_elts[fname], print_words(part:get_words(how_words),
- how_words == 'full'))
+ how_words == 'full'))
else
table.insert(out_elts[fname], tostring(part:get_content(how)))
end
@@ -480,7 +495,7 @@ local function extract_handler(opts)
if opts.words then
local how_words = opts['words_format'] or 'stem'
table.insert(out_elts[fname], print_words(part:get_words(how_words),
- how_words == 'full'))
+ how_words == 'full'))
else
if opts.structure then
local hc = part:get_html()
@@ -489,11 +504,11 @@ local function extract_handler(opts)
local fun = require "fun"
if type(elt) == 'table' then
return table.concat(fun.totable(
- fun.map(
- function(t)
- return rspamd_logger.slog("%s", t)
- end,
- elt)), '\n')
+ fun.map(
+ function(t)
+ return rspamd_logger.slog("%s", t)
+ end,
+ elt)), '\n')
else
return rspamd_logger.slog("%s", elt)
end
@@ -524,7 +539,7 @@ local function extract_handler(opts)
if opts.invisible then
local hc = part:get_html()
table.insert(out_elts[fname], string.format('invisible content: %s',
- tostring(hc:get_invisible())))
+ tostring(hc:get_invisible())))
end
end
end
@@ -544,13 +559,18 @@ local function extract_handler(opts)
for _, task in ipairs(tasks) do
task:destroy()
end
+
+ -- Cleanup custom tokenizers if they were loaded
+ if opts.words then
+ cleanup_tokenizers()
+ end
end
local function stat_handler(opts)
local fun = require "fun"
local out_elts = {}
- load_config(opts)
+ load_config(opts, true) -- Load with custom tokenizers for stat generation
rspamd_url.init(rspamd_config:get_tld_path())
rspamd_config:init_subsystem('langdet,stat') -- Needed to gen stat tokens
@@ -571,10 +591,10 @@ local function stat_handler(opts)
out_elts[fname] = bt
process_func = function(e)
return string.format('%s (%d): "%s"+"%s", [%s]', e.data, e.win, e.t1 or "",
- e.t2 or "", table.concat(fun.totable(
- fun.map(function(k)
- return k
- end, e.flags)), ","))
+ e.t2 or "", table.concat(fun.totable(
+ fun.map(function(k)
+ return k
+ end, e.flags)), ","))
end
elseif opts.fuzzy then
local parts = task:get_parts() or {}
@@ -601,16 +621,16 @@ local function stat_handler(opts)
digest = digest,
shingles = shingles,
type = string.format('%s/%s',
- ({ part:get_type() })[1],
- ({ part:get_type() })[2])
+ ({ part:get_type() })[1],
+ ({ part:get_type() })[2])
})
else
table.insert(out_elts[fname], {
digest = part:get_digest(),
file = part:get_filename(),
type = string.format('%s/%s',
- ({ part:get_type() })[1],
- ({ part:get_type() })[2])
+ ({ part:get_type() })[1],
+ ({ part:get_type() })[2])
})
end
end
@@ -621,10 +641,13 @@ local function stat_handler(opts)
end
print_elts(out_elts, opts, process_func)
+
+ -- Cleanup custom tokenizers
+ cleanup_tokenizers()
end
local function urls_handler(opts)
- load_config(opts)
+ load_config(opts, false) -- URLs don't need custom tokenizers
rspamd_url.init(rspamd_config:get_tld_path())
local out_elts = {}
@@ -764,7 +787,7 @@ local function newline(task)
end
local function modify_handler(opts)
- load_config(opts)
+ load_config(opts, false) -- Modification doesn't need custom tokenizers
rspamd_url.init(rspamd_config:get_tld_path())
local function read_file(file)
@@ -804,10 +827,10 @@ local function modify_handler(opts)
if hname == name then
local new_value = string.format(hpattern, hdr.decoded)
new_value = string.format('%s:%s%s',
- name, hdr.separator,
- rspamd_util.fold_header(name,
- rspamd_util.mime_header_encode(new_value),
- task:get_newlines_type()))
+ name, hdr.separator,
+ rspamd_util.fold_header(name,
+ rspamd_util.mime_header_encode(new_value),
+ task:get_newlines_type()))
out[#out + 1] = new_value
return
end
@@ -816,12 +839,12 @@ local function modify_handler(opts)
if rewrite.need_rewrite_ct then
if name:lower() == 'content-type' then
local nct = string.format('%s: %s/%s; charset=utf-8',
- 'Content-Type', rewrite.new_ct.type, rewrite.new_ct.subtype)
+ 'Content-Type', rewrite.new_ct.type, rewrite.new_ct.subtype)
out[#out + 1] = nct
return
elseif name:lower() == 'content-transfer-encoding' then
out[#out + 1] = string.format('%s: %s',
- 'Content-Transfer-Encoding', rewrite.new_cte or 'quoted-printable')
+ 'Content-Transfer-Encoding', rewrite.new_cte or 'quoted-printable')
seen_cte = true
return
end
@@ -837,13 +860,13 @@ local function modify_handler(opts)
if hname and hvalue then
out[#out + 1] = string.format('%s: %s', hname,
- rspamd_util.fold_header(hname, hvalue, task:get_newlines_type()))
+ rspamd_util.fold_header(hname, hvalue, task:get_newlines_type()))
end
end
if not seen_cte and rewrite.need_rewrite_ct then
out[#out + 1] = string.format('%s: %s',
- 'Content-Transfer-Encoding', rewrite.new_cte or 'quoted-printable')
+ 'Content-Transfer-Encoding', rewrite.new_cte or 'quoted-printable')
end
-- End of headers
@@ -883,7 +906,7 @@ local function modify_handler(opts)
end
local function sign_handler(opts)
- load_config(opts)
+ load_config(opts, false) -- Signing doesn't need custom tokenizers
rspamd_url.init(rspamd_config:get_tld_path())
local lua_dkim = require("lua_ffi").dkim
@@ -927,11 +950,11 @@ local function sign_handler(opts)
io.flush()
else
local dkim_hdr = string.format('%s: %s%s',
- 'DKIM-Signature',
- rspamd_util.fold_header('DKIM-Signature',
- rspamd_util.mime_header_encode(sig),
- task:get_newlines_type()),
- newline(task))
+ 'DKIM-Signature',
+ rspamd_util.fold_header('DKIM-Signature',
+ rspamd_util.mime_header_encode(sig),
+ task:get_newlines_type()),
+ newline(task))
io.write(dkim_hdr)
io.flush()
task:get_content():save_in_file(1)
@@ -942,7 +965,7 @@ local function sign_handler(opts)
end
local function strip_handler(opts)
- load_config(opts)
+ load_config(opts, false) -- Stripping doesn't need custom tokenizers
rspamd_url.init(rspamd_config:get_tld_path())
for _, fname in ipairs(opts.file) do
@@ -998,7 +1021,7 @@ local function strip_handler(opts)
end
local function anonymize_handler(opts)
- load_config(opts)
+ load_config(opts, false) -- Anonymization doesn't need custom tokenizers
rspamd_url.init(rspamd_config:get_tld_path())
for _, fname in ipairs(opts.file) do
@@ -1103,7 +1126,7 @@ local function get_dump_content(task, opts, fname)
end
local function dump_handler(opts)
- load_config(opts)
+ load_config(opts, false) -- Dumping doesn't need custom tokenizers
rspamd_url.init(rspamd_config:get_tld_path())
for _, fname in ipairs(opts.file) do
diff --git a/rules/controller/fuzzy.lua b/rules/controller/fuzzy.lua
index 193e6fd4c..06f5d43d9 100644
--- a/rules/controller/fuzzy.lua
+++ b/rules/controller/fuzzy.lua
@@ -37,10 +37,30 @@ local function handle_gen_fuzzy(task, conn, req_params)
end
end
+local function handle_fuzzy_storages(_task, conn)
+ if type(rspamd_plugins.fuzzy_check) == 'table'
+ and type(rspamd_plugins.fuzzy_check.list_storages) == 'function' then
+ local ok, result = pcall(rspamd_plugins.fuzzy_check.list_storages, rspamd_config)
+
+ if ok then
+ conn:send_ucl({ success = true, storages = result })
+ else
+ conn:send_error(500, 'cannot list fuzzy storages')
+ end
+ else
+ conn:send_error(404, 'fuzzy_check is not enabled')
+ end
+end
+
return {
hashes = {
handler = handle_gen_fuzzy,
need_task = true,
enable = false
},
-} \ No newline at end of file
+ storages = {
+ handler = handle_fuzzy_storages,
+ need_task = false,
+ enable = false
+ },
+}
diff --git a/src/client/rspamdclient.c b/src/client/rspamdclient.c
index d07b24332..4d79590c5 100644
--- a/src/client/rspamdclient.c
+++ b/src/client/rspamdclient.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -231,7 +231,7 @@ rspamd_client_finish_handler(struct rspamd_http_connection *conn,
}
}
- parser = ucl_parser_new(0);
+ parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (!ucl_parser_add_chunk_full(parser, start, len,
ucl_parser_get_default_priority(parser),
UCL_DUPLICATE_APPEND, UCL_PARSE_AUTO)) {
diff --git a/src/controller.c b/src/controller.c
index c64835038..0550ba6b8 100644
--- a/src/controller.c
+++ b/src/controller.c
@@ -68,6 +68,7 @@
#define PATH_NEIGHBOURS "/neighbours"
#define PATH_PLUGINS "/plugins"
#define PATH_PING "/ping"
+#define PATH_BAYES_CLASSIFIERS "/bayes/classifiers"
#define msg_err_session(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \
session->pool->tag.tagname, session->pool->tag.uid, \
@@ -992,9 +993,9 @@ rspamd_controller_handle_maps(struct rspamd_http_connection_entry *conn_ent,
"type", 0, false);
ucl_object_insert_key(obj, ucl_object_frombool(editable),
"editable", 0, false);
- ucl_object_insert_key(obj, ucl_object_frombool(bk->shared->loaded),
+ ucl_object_insert_key(obj, ucl_object_frombool(map->shared->loaded),
"loaded", 0, false);
- ucl_object_insert_key(obj, ucl_object_frombool(bk->shared->cached),
+ ucl_object_insert_key(obj, ucl_object_frombool(map->shared->cached),
"cached", 0, false);
ucl_array_append(top, obj);
}
@@ -1012,9 +1013,9 @@ rspamd_controller_handle_maps(struct rspamd_http_connection_entry *conn_ent,
"type", 0, false);
ucl_object_insert_key(obj, ucl_object_frombool(false),
"editable", 0, false);
- ucl_object_insert_key(obj, ucl_object_frombool(bk->shared->loaded),
+ ucl_object_insert_key(obj, ucl_object_frombool(map->shared->loaded),
"loaded", 0, false);
- ucl_object_insert_key(obj, ucl_object_frombool(bk->shared->cached),
+ ucl_object_insert_key(obj, ucl_object_frombool(map->shared->cached),
"cached", 0, false);
ucl_array_append(top, obj);
}
@@ -1141,7 +1142,7 @@ rspamd_controller_handle_get_map(struct rspamd_http_connection_entry *conn_ent,
rspamd_map_traverse(bk->map, rspamd_controller_map_traverse_callback, &map_body, FALSE);
rspamd_http_message_set_body_from_fstring_steal(reply, map_body);
}
- else if (bk->shared->loaded) {
+ else if (map->shared->loaded) {
reply = rspamd_http_new_message(HTTP_RESPONSE);
reply->code = 200;
rspamd_fstring_t *map_body = rspamd_fstring_new();
@@ -2311,7 +2312,7 @@ rspamd_controller_handle_saveactions(
return 0;
}
- parser = ucl_parser_new(0);
+ parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (!ucl_parser_add_chunk(parser, msg->body_buf.begin, msg->body_buf.len)) {
if ((error = ucl_parser_get_error(parser)) != NULL) {
msg_err_session("cannot parse input: %s", error);
@@ -2434,7 +2435,7 @@ rspamd_controller_handle_savesymbols(
return 0;
}
- parser = ucl_parser_new(0);
+ parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (!ucl_parser_add_chunk(parser, msg->body_buf.begin, msg->body_buf.len)) {
if ((error = ucl_parser_get_error(parser)) != NULL) {
msg_err_session("cannot parse input: %s", error);
@@ -3291,7 +3292,7 @@ rspamd_controller_handle_unknown(struct rspamd_http_connection_entry *conn_ent,
rspamd_http_message_add_header(rep, "Access-Control-Allow-Methods",
"POST, GET, OPTIONS");
rspamd_http_message_add_header(rep, "Access-Control-Allow-Headers",
- "Content-Type,Password,Map,Weight,Flag,Hash");
+ "Classifier,Content-Type,Password,Map,Weight,Flag,Hash");
rspamd_http_connection_reset(conn_ent->conn);
rspamd_http_router_insert_headers(conn_ent->rt, rep);
rspamd_http_connection_write_message(conn_ent->conn,
@@ -3446,6 +3447,40 @@ rspamd_controller_handle_lua_plugin(struct rspamd_http_connection_entry *conn_en
return 0;
}
+/*
+ * Bayes classifier list command handler:
+ * request: /bayes/classifiers
+ * headers: Password
+ * reply: JSON array of Bayes classifier names
+ * Note: list is in reverse of declaration order (GList prepend).
+ */
+static int
+rspamd_controller_handle_bayes_classifiers(struct rspamd_http_connection_entry *conn_ent,
+ struct rspamd_http_message *msg)
+{
+ struct rspamd_controller_session *session = conn_ent->ud;
+ struct rspamd_controller_worker_ctx *ctx = session->ctx;
+ ucl_object_t *arr;
+ struct rspamd_classifier_config *clc;
+ GList *cur;
+
+ if (!rspamd_controller_check_password(conn_ent, session, msg, FALSE)) {
+ return 0;
+ }
+
+ arr = ucl_object_typed_new(UCL_ARRAY);
+ cur = g_list_last(ctx->cfg->classifiers);
+ while (cur) {
+ clc = cur->data;
+ ucl_array_append(arr, ucl_object_fromstring(clc->name));
+ cur = g_list_previous(cur);
+ }
+
+ rspamd_controller_send_ucl(conn_ent, arr);
+ ucl_object_unref(arr);
+ return 0;
+}
+
static void
rspamd_controller_error_handler(struct rspamd_http_connection_entry *conn_ent,
@@ -4055,6 +4090,9 @@ start_controller_worker(struct rspamd_worker *worker)
rspamd_http_router_add_path(ctx->http,
PATH_PING,
rspamd_controller_handle_ping);
+ rspamd_http_router_add_path(ctx->http,
+ PATH_BAYES_CLASSIFIERS,
+ rspamd_controller_handle_bayes_classifiers);
rspamd_controller_register_plugins_paths(ctx);
#if 0
diff --git a/src/fuzzy_storage.c b/src/fuzzy_storage.c
index 919ea2118..58d123712 100644
--- a/src/fuzzy_storage.c
+++ b/src/fuzzy_storage.c
@@ -342,7 +342,7 @@ ucl_keymap_fin_cb(struct map_cb_data *data, void **target)
return;
}
- parser = ucl_parser_new(UCL_PARSER_NO_FILEVARS);
+ parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (!ucl_parser_add_chunk(parser, jb->buf->str, jb->buf->len)) {
msg_err_config("cannot load ucl data: parse error %s",
@@ -1305,7 +1305,7 @@ rspamd_fuzzy_check_callback(struct rspamd_fuzzy_reply *result, void *ud)
{
/* Start lua post handler */
lua_State *L = session->ctx->cfg->lua_state;
- int err_idx, ret, nargs = 9;
+ int err_idx, ret, nargs = 10;
lua_pushcfunction(L, &rspamd_lua_traceback);
err_idx = lua_gettop(L);
@@ -1339,7 +1339,9 @@ rspamd_fuzzy_check_callback(struct rspamd_fuzzy_reply *result, void *ud)
/* We push shingles merely for commands that modify content to avoid extra work */
if (is_shingle && cmd->cmd != FUZZY_CHECK) {
lua_newshingle(L, &session->cmd.sgl);
- nargs++;
+ }
+ else {
+ lua_pushnil(L);
}
if ((ret = lua_pcall(L, nargs, LUA_MULTRET, err_idx)) != 0) {
@@ -1505,7 +1507,7 @@ rspamd_fuzzy_process_command(struct fuzzy_session *session)
{
/* Start lua pre handler */
lua_State *L = session->ctx->cfg->lua_state;
- int err_idx, ret, nargs = 7;
+ int err_idx, ret, nargs = 8;
lua_pushcfunction(L, &rspamd_lua_traceback);
err_idx = lua_gettop(L);
@@ -1527,7 +1529,9 @@ rspamd_fuzzy_process_command(struct fuzzy_session *session)
/* We push shingles merely for commands that modify content to avoid extra work */
if (is_shingle && cmd->cmd != FUZZY_CHECK) {
lua_newshingle(L, &session->cmd.sgl);
- nargs++;
+ }
+ else {
+ lua_pushnil(L);
}
/* Flag and value */
@@ -2661,7 +2665,7 @@ rspamd_fuzzy_maybe_load_ratelimits(struct rspamd_fuzzy_storage_ctx *ctx)
RSPAMD_DBDIR);
if (access(path, R_OK) != -1) {
- struct ucl_parser *parser = ucl_parser_new(UCL_PARSER_NO_IMPLICIT_ARRAYS | UCL_PARSER_DISABLE_MACRO);
+ struct ucl_parser *parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (ucl_parser_add_file(parser, path)) {
ucl_object_t *obj = ucl_parser_get_object(parser);
int loaded = 0;
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 6e180ea66..b783b8325 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -363,7 +363,7 @@ rspamd_language_detector_read_file(struct rspamd_config *cfg,
double mean = 0, std = 0, delta = 0, delta2 = 0, m2 = 0;
enum rspamd_language_category cat = RSPAMD_LANGUAGE_MAX;
- parser = ucl_parser_new(UCL_PARSER_NO_FILEVARS);
+ parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (!ucl_parser_add_file(parser, path)) {
msg_warn_config("cannot parse file %s: %s", path,
ucl_parser_get_error(parser));
@@ -825,7 +825,7 @@ rspamd_language_detector_init(struct rspamd_config *cfg)
languages_pattern = g_string_sized_new(PATH_MAX);
rspamd_printf_gstring(languages_pattern, "%s/stop_words", languages_path);
- parser = ucl_parser_new(UCL_PARSER_DEFAULT);
+ parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (ucl_parser_add_file(parser, languages_pattern->str)) {
stop_words = ucl_parser_get_object(parser);
@@ -936,7 +936,7 @@ end:
}
static void
-rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords,
+rspamd_language_detector_random_select(rspamd_words_t *ucs_tokens, unsigned int nwords,
goffset *offsets_out,
uint64_t *seed)
{
@@ -946,7 +946,7 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords,
g_assert(nwords != 0);
g_assert(offsets_out != NULL);
- g_assert(ucs_tokens->len >= nwords);
+ g_assert(kv_size(*ucs_tokens) >= nwords);
/*
* We split input array into `nwords` parts. For each part we randomly select
* an element from this particular split. Here is an example:
@@ -963,22 +963,22 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords,
* their splits. It is not uniform distribution but it seems to be better
* to include words from different text parts
*/
- step_len = ucs_tokens->len / nwords;
- remainder = ucs_tokens->len % nwords;
+ step_len = kv_size(*ucs_tokens) / nwords;
+ remainder = kv_size(*ucs_tokens) % nwords;
out_idx = 0;
coin = rspamd_random_uint64_fast_seed(seed);
sel = coin % (step_len + remainder);
offsets_out[out_idx] = sel;
- for (i = step_len + remainder; i < ucs_tokens->len;
+ for (i = step_len + remainder; i < kv_size(*ucs_tokens);
i += step_len, out_idx++) {
unsigned int ntries = 0;
coin = rspamd_random_uint64_fast_seed(seed);
sel = (coin % step_len) + i;
for (;;) {
- tok = &g_array_index(ucs_tokens, rspamd_stat_token_t, sel);
+ tok = &kv_A(*ucs_tokens, sel);
/* Filter bad tokens */
if (tok->unicode.len >= 2 &&
@@ -995,8 +995,8 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords,
if (ntries < step_len) {
sel = (coin % step_len) + i;
}
- else if (ntries < ucs_tokens->len) {
- sel = coin % ucs_tokens->len;
+ else if (ntries < kv_size(*ucs_tokens)) {
+ sel = coin % kv_size(*ucs_tokens);
}
else {
offsets_out[out_idx] = sel;
@@ -1223,12 +1223,12 @@ static void
rspamd_language_detector_detect_type(struct rspamd_task *task,
unsigned int nwords,
struct rspamd_lang_detector *d,
- GArray *words,
+ rspamd_words_t *words,
enum rspamd_language_category cat,
khash_t(rspamd_candidates_hash) * candidates,
struct rspamd_mime_text_part *part)
{
- unsigned int nparts = MIN(words->len, nwords);
+ unsigned int nparts = MIN(kv_size(*words), nwords);
goffset *selected_words;
rspamd_stat_token_t *tok;
unsigned int i;
@@ -1241,8 +1241,7 @@ rspamd_language_detector_detect_type(struct rspamd_task *task,
msg_debug_lang_det("randomly selected %d words", nparts);
for (i = 0; i < nparts; i++) {
- tok = &g_array_index(words, rspamd_stat_token_t,
- selected_words[i]);
+ tok = &kv_A(*words, selected_words[i]);
if (tok->unicode.len >= 3) {
rspamd_language_detector_detect_word(task, d, tok, candidates,
@@ -1282,7 +1281,7 @@ static enum rspamd_language_detected_type
rspamd_language_detector_try_ngramm(struct rspamd_task *task,
unsigned int nwords,
struct rspamd_lang_detector *d,
- GArray *ucs_tokens,
+ rspamd_words_t *ucs_tokens,
enum rspamd_language_category cat,
khash_t(rspamd_candidates_hash) * candidates,
struct rspamd_mime_text_part *part)
@@ -1863,7 +1862,7 @@ rspamd_language_detector_detect(struct rspamd_task *task,
if (rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) {
rspamd_fasttext_predict_result_t fasttext_predict_result =
rspamd_lang_detection_fasttext_detect(d->fasttext_detector, task,
- part->utf_words, 4);
+ &part->utf_words, 4);
ndetected = rspamd_lang_detection_fasttext_get_nlangs(fasttext_predict_result);
@@ -1930,11 +1929,11 @@ rspamd_language_detector_detect(struct rspamd_task *task,
if (!ret) {
/* Apply trigramms detection */
candidates = kh_init(rspamd_candidates_hash);
- if (part->utf_words->len < default_short_text_limit) {
+ if (kv_size(part->utf_words) < default_short_text_limit) {
r = rs_detect_none;
msg_debug_lang_det("text is too short for trigrams detection: "
"%d words; at least %d words required",
- (int) part->utf_words->len,
+ (int) kv_size(part->utf_words),
(int) default_short_text_limit);
switch (cat) {
case RSPAMD_LANGUAGE_CYRILLIC:
@@ -1960,7 +1959,7 @@ rspamd_language_detector_detect(struct rspamd_task *task,
r = rspamd_language_detector_try_ngramm(task,
default_words,
d,
- part->utf_words,
+ &part->utf_words,
cat,
candidates,
part);
@@ -2123,4 +2122,4 @@ int rspamd_language_detector_elt_flags(const struct rspamd_language_elt *elt)
}
return 0;
-} \ No newline at end of file
+}
diff --git a/src/libmime/lang_detection_fasttext.cxx b/src/libmime/lang_detection_fasttext.cxx
index 8ea2706e6..983ff78de 100644
--- a/src/libmime/lang_detection_fasttext.cxx
+++ b/src/libmime/lang_detection_fasttext.cxx
@@ -22,6 +22,7 @@
#include "libserver/logger.h"
#include "contrib/fmt/include/fmt/base.h"
#include "stat_api.h"
+#include "libserver/word.h"
#include <exception>
#include <string_view>
#include <vector>
@@ -180,26 +181,32 @@ bool rspamd_lang_detection_fasttext_is_enabled(void *ud)
rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
struct rspamd_task *task,
- GArray *utf_words,
+ rspamd_words_t *utf_words,
int k)
{
#ifndef WITH_FASTTEXT
return nullptr;
#else
/* Avoid too long inputs */
- static const unsigned int max_fasttext_input_len = 1024 * 1024;
+ static const size_t max_fasttext_input_len = 1024 * 1024;
auto *real_model = FASTTEXT_MODEL_TO_C_API(ud);
std::vector<std::int32_t> words_vec;
- words_vec.reserve(utf_words->len);
- for (auto i = 0; i < std::min(utf_words->len, max_fasttext_input_len); i++) {
- const auto *w = &g_array_index(utf_words, rspamd_stat_token_t, i);
+ if (!utf_words || !utf_words->a) {
+ return nullptr;
+ }
+
+ auto words_count = kv_size(*utf_words);
+ words_vec.reserve(words_count);
+
+ for (auto i = 0; i < std::min(words_count, max_fasttext_input_len); i++) {
+ const auto *w = &kv_A(*utf_words, i);
if (w->original.len > 0) {
real_model->word2vec(w->original.begin, w->original.len, words_vec);
}
}
- msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), utf_words->len);
+ msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), words_count);
auto *res = real_model->detect_language(words_vec, k);
@@ -266,4 +273,4 @@ void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res
#endif
}
-G_END_DECLS \ No newline at end of file
+G_END_DECLS
diff --git a/src/libmime/lang_detection_fasttext.h b/src/libmime/lang_detection_fasttext.h
index 2a2756968..e2b67181a 100644
--- a/src/libmime/lang_detection_fasttext.h
+++ b/src/libmime/lang_detection_fasttext.h
@@ -17,6 +17,7 @@
#define RSPAMD_LANG_DETECTION_FASTTEXT_H
#include "config.h"
+#include "libserver/word.h"
G_BEGIN_DECLS
struct rspamd_config;
@@ -53,7 +54,7 @@ typedef void *rspamd_fasttext_predict_result_t;
* @return TRUE if language is detected
*/
rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
- struct rspamd_task *task, GArray *utf_words, int k);
+ struct rspamd_task *task, rspamd_words_t *utf_words, int k);
/**
* Get number of languages detected
diff --git a/src/libmime/message.c b/src/libmime/message.c
index f2cabf399..8442c80ac 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -40,6 +40,8 @@
#include "contrib/uthash/utlist.h"
#include "contrib/t1ha/t1ha.h"
#include "received.h"
+#define RSPAMD_TOKENIZER_INTERNAL
+#include "libstat/tokenizers/custom_tokenizer.h"
#define GTUBE_SYMBOL "GTUBE"
@@ -71,14 +73,14 @@ rspamd_mime_part_extract_words(struct rspamd_task *task,
rspamd_stat_token_t *w;
unsigned int i, total_len = 0, short_len = 0;
- if (part->utf_words) {
- rspamd_stem_words(part->utf_words, task->task_pool, part->language,
+ if (part->utf_words.a) {
+ rspamd_stem_words(&part->utf_words, task->task_pool, part->language,
task->lang_det);
- for (i = 0; i < part->utf_words->len; i++) {
+ for (i = 0; i < kv_size(part->utf_words); i++) {
uint64_t h;
- w = &g_array_index(part->utf_words, rspamd_stat_token_t, i);
+ w = &kv_A(part->utf_words, i);
if (w->stemmed.len > 0) {
/*
@@ -108,7 +110,7 @@ rspamd_mime_part_extract_words(struct rspamd_task *task,
}
}
- if (part->utf_words->len) {
+ if (kv_size(part->utf_words)) {
double *avg_len_p, *short_len_p;
avg_len_p = rspamd_mempool_get_variable(task->task_pool,
@@ -185,21 +187,24 @@ rspamd_mime_part_create_words(struct rspamd_task *task,
tok_type = RSPAMD_TOKENIZE_RAW;
}
- part->utf_words = rspamd_tokenize_text(
+ /* Initialize kvec for words */
+ kv_init(part->utf_words);
+
+ rspamd_tokenize_text(
part->utf_stripped_content->data,
part->utf_stripped_content->len,
&part->utf_stripped_text,
tok_type, task->cfg,
part->exceptions,
NULL,
- NULL,
+ &part->utf_words,
task->task_pool);
- if (part->utf_words) {
+ if (part->utf_words.a) {
part->normalized_hashes = g_array_sized_new(FALSE, FALSE,
- sizeof(uint64_t), part->utf_words->len);
- rspamd_normalize_words(part->utf_words, task->task_pool);
+ sizeof(uint64_t), kv_size(part->utf_words));
+ rspamd_normalize_words(&part->utf_words, task->task_pool);
}
}
@@ -209,7 +214,7 @@ rspamd_mime_part_detect_language(struct rspamd_task *task,
{
struct rspamd_lang_detector_res *lang;
- if (!IS_TEXT_PART_EMPTY(part) && part->utf_words && part->utf_words->len > 0 &&
+ if (!IS_TEXT_PART_EMPTY(part) && part->utf_words.a && kv_size(part->utf_words) > 0 &&
task->lang_det) {
if (rspamd_language_detector_detect(task, task->lang_det, part)) {
lang = g_ptr_array_index(part->languages, 0);
@@ -1106,8 +1111,8 @@ rspamd_message_dtor(struct rspamd_message *msg)
PTR_ARRAY_FOREACH(msg->text_parts, i, tp)
{
- if (tp->utf_words) {
- g_array_free(tp->utf_words, TRUE);
+ if (tp->utf_words.a) {
+ kv_destroy(tp->utf_words);
}
if (tp->normalized_hashes) {
g_array_free(tp->normalized_hashes, TRUE);
@@ -1583,7 +1588,7 @@ void rspamd_message_process(struct rspamd_task *task)
rspamd_mime_part_extract_words(task, text_part);
- if (text_part->utf_words) {
+ if (text_part->utf_words.a) {
total_words += text_part->nwords;
}
}
diff --git a/src/libmime/message.h b/src/libmime/message.h
index cb695773e..e6b454362 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -16,6 +16,7 @@
#include "libserver/url.h"
#include "libutil/ref.h"
#include "libutil/str_util.h"
+#include "libserver/word.h"
#include <unicode/uchar.h>
#include <unicode/utext.h>
@@ -139,7 +140,7 @@ struct rspamd_mime_text_part {
GByteArray *utf_raw_content; /* utf raw content */
GByteArray *utf_stripped_content; /* utf content with no newlines */
GArray *normalized_hashes; /* Array of uint64_t */
- GArray *utf_words; /* Array of rspamd_stat_token_t */
+ rspamd_words_t utf_words; /* kvec of rspamd_word_t */
UText utf_stripped_text; /* Used by libicu to represent the utf8 content */
GPtrArray *newlines; /**< positions of newlines in text, relative to content*/
diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h
index f59c6ff89..36941da7a 100644
--- a/src/libserver/cfg_file.h
+++ b/src/libserver/cfg_file.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -48,6 +48,7 @@ struct worker_s;
struct rspamd_external_libs_ctx;
struct rspamd_cryptobox_pubkey;
struct rspamd_dns_resolver;
+struct rspamd_tokenizer_manager;
/**
* Logging type
@@ -395,6 +396,8 @@ struct rspamd_config {
unsigned int log_error_elts; /**< number of elements in error logbuf */
unsigned int log_error_elt_maxlen; /**< maximum size of error log element */
unsigned int log_task_max_elts; /**< maximum number of elements in task logging */
+ unsigned int log_max_tag_len; /**< maximum length of log tag */
+ char *log_tag_strip_policy_str; /**< log tag strip policy string */
struct rspamd_worker_log_pipe *log_pipes;
gboolean compat_messages; /**< use old messages in the protocol (array) */
@@ -495,9 +498,10 @@ struct rspamd_config {
char *zstd_output_dictionary; /**< path to zstd output dictionary */
ucl_object_t *neighbours; /**< other servers in the cluster */
- struct rspamd_config_settings_elt *setting_ids; /**< preprocessed settings ids */
- struct rspamd_lang_detector *lang_det; /**< language detector */
- struct rspamd_worker *cur_worker; /**< set dynamically by each worker */
+ struct rspamd_config_settings_elt *setting_ids; /**< preprocessed settings ids */
+ struct rspamd_lang_detector *lang_det; /**< language detector */
+ struct rspamd_tokenizer_manager *tokenizer_manager; /**< custom tokenizer manager */
+ struct rspamd_worker *cur_worker; /**< set dynamically by each worker */
ref_entry_t ref; /**< reference counter */
};
diff --git a/src/libserver/cfg_rcl.cxx b/src/libserver/cfg_rcl.cxx
index f38366908..0a48e8a4f 100644
--- a/src/libserver/cfg_rcl.cxx
+++ b/src/libserver/cfg_rcl.cxx
@@ -299,6 +299,14 @@ rspamd_rcl_logging_handler(rspamd_mempool_t *pool, const ucl_object_t *obj,
cfg->log_flags |= RSPAMD_LOG_FLAG_USEC;
}
+ /* Set default values for new log tag options */
+ if (cfg->log_max_tag_len == 0) {
+ cfg->log_max_tag_len = RSPAMD_LOG_ID_LEN; /* Default to new max size */
+ }
+ if (cfg->log_tag_strip_policy_str == NULL) {
+ cfg->log_tag_strip_policy_str = rspamd_mempool_strdup(cfg->cfg_pool, "right");
+ }
+
return rspamd_rcl_section_parse_defaults(cfg, *section, cfg->cfg_pool, obj,
(void *) cfg, err);
}
@@ -1700,6 +1708,18 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections)
G_STRUCT_OFFSET(struct rspamd_config, log_task_max_elts),
RSPAMD_CL_FLAG_UINT,
"Maximum number of elements in task log entry (7 by default)");
+ rspamd_rcl_add_default_handler(sub,
+ "max_tag_len",
+ rspamd_rcl_parse_struct_integer,
+ G_STRUCT_OFFSET(struct rspamd_config, log_max_tag_len),
+ RSPAMD_CL_FLAG_UINT,
+ "Maximum length of log tag cannot exceed 32 (" G_STRINGIFY(RSPAMD_LOG_ID_LEN) ") by default)");
+ rspamd_rcl_add_default_handler(sub,
+ "tag_strip_policy",
+ rspamd_rcl_parse_struct_string,
+ G_STRUCT_OFFSET(struct rspamd_config, log_tag_strip_policy_str),
+ 0,
+ "Log tag strip policy when tag exceeds max length: 'right', 'left', 'middle' (right by default)");
/* Documentation only options, handled in log_handler to map flags */
rspamd_rcl_add_doc_by_path(cfg,
@@ -3640,7 +3660,7 @@ rspamd_config_parse_ucl(struct rspamd_config *cfg,
/* Try to load keyfile if available */
auto keyfile_name = fmt::format("{}.key", filename);
rspamd::util::raii_file::open(keyfile_name, O_RDONLY).map([&](const auto &keyfile) {
- auto *kp_parser = ucl_parser_new(0);
+ auto *kp_parser = ucl_parser_new(UCL_PARSER_DEFAULT);
if (ucl_parser_add_fd(kp_parser, keyfile.get_fd())) {
auto *kp_obj = ucl_parser_get_object(kp_parser);
diff --git a/src/libserver/cfg_utils.cxx b/src/libserver/cfg_utils.cxx
index dfbdc6bee..c7bb20210 100644
--- a/src/libserver/cfg_utils.cxx
+++ b/src/libserver/cfg_utils.cxx
@@ -72,6 +72,11 @@
#include "contrib/expected/expected.hpp"
#include "contrib/ankerl/unordered_dense.h"
+#include "libserver/task.h"
+#include "libserver/url.h"
+#define RSPAMD_TOKENIZER_INTERNAL// We need to use internal tokenizer API
+#include "libstat/tokenizers/custom_tokenizer.h"
+
#define DEFAULT_SCORE 10.0
#define DEFAULT_RLIMIT_NOFILE 2048
@@ -821,6 +826,65 @@ rspamd_adjust_clocks_resolution(struct rspamd_config *cfg)
#endif
}
+extern "C" {
+
+gboolean
+rspamd_config_load_custom_tokenizers(struct rspamd_config *cfg, GError **err)
+{
+ /* Load custom tokenizers */
+ const ucl_object_t *custom_tokenizers = ucl_object_lookup_path(cfg->cfg_ucl_obj,
+ "options.custom_tokenizers");
+ if (custom_tokenizers != NULL) {
+ msg_info_config("loading custom tokenizers");
+
+ if (!cfg->tokenizer_manager) {
+ cfg->tokenizer_manager = rspamd_tokenizer_manager_new(cfg->cfg_pool);
+ }
+
+ ucl_object_iter_t it = ucl_object_iterate_new(custom_tokenizers);
+ const ucl_object_t *tok_obj;
+ const char *tok_name;
+
+ while ((tok_obj = ucl_object_iterate_safe(it, true)) != NULL) {
+ tok_name = ucl_object_key(tok_obj);
+ GError *local_err = NULL;
+
+ if (!rspamd_tokenizer_manager_load_tokenizer(cfg->tokenizer_manager,
+ tok_name, tok_obj, &local_err)) {
+ msg_err_config("failed to load custom tokenizer '%s': %s",
+ tok_name, local_err ? local_err->message : "unknown error");
+
+ if (err && !*err) {
+ *err = g_error_copy(local_err);
+ }
+
+ if (local_err) {
+ g_error_free(local_err);
+ }
+
+ ucl_object_iterate_free(it);
+ return FALSE;
+ }
+ }
+ ucl_object_iterate_free(it);
+
+ msg_info_config("loaded custom tokenizers successfully");
+ }
+
+ return TRUE;
+}
+
+void rspamd_config_unload_custom_tokenizers(struct rspamd_config *cfg)
+{
+ if (cfg->tokenizer_manager) {
+ msg_info_config("unloading custom tokenizers");
+ rspamd_tokenizer_manager_destroy(cfg->tokenizer_manager);
+ cfg->tokenizer_manager = NULL;
+ }
+}
+
+}// extern "C"
+
/*
* Perform post load actions
*/
@@ -940,6 +1004,20 @@ rspamd_config_post_load(struct rspamd_config *cfg,
msg_err_config("cannot configure libraries, fatal error");
return FALSE;
}
+
+ /* Load custom tokenizers using the new function */
+ GError *tokenizer_err = NULL;
+ if (!rspamd_config_load_custom_tokenizers(cfg, &tokenizer_err)) {
+ msg_err_config("failed to load custom tokenizers: %s",
+ tokenizer_err ? tokenizer_err->message : "unknown error");
+ if (tokenizer_err) {
+ g_error_free(tokenizer_err);
+ }
+
+ if (opts & RSPAMD_CONFIG_INIT_VALIDATE) {
+ ret = tl::make_unexpected(std::string{"failed to load custom tokenizers"});
+ }
+ }
}
/* Validate cache */
@@ -1363,7 +1441,7 @@ rspamd_ucl_fin_cb(struct map_cb_data *data, void **target)
}
/* New data available */
- auto *parser = ucl_parser_new(0);
+ auto *parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (!ucl_parser_add_chunk(parser, (unsigned char *) cbdata->buf.data(),
cbdata->buf.size())) {
msg_err_config("cannot parse map %s: %s",
diff --git a/src/libserver/dynamic_cfg.c b/src/libserver/dynamic_cfg.c
index 984517074..6d648d745 100644
--- a/src/libserver/dynamic_cfg.c
+++ b/src/libserver/dynamic_cfg.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -195,7 +195,7 @@ json_config_fin_cb(struct map_cb_data *data, void **target)
return;
}
- parser = ucl_parser_new(0);
+ parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (!ucl_parser_add_chunk(parser, jb->buf->str, jb->buf->len)) {
msg_err("cannot load json data: parse error %s",
diff --git a/src/libserver/http/http_connection.c b/src/libserver/http/http_connection.c
index d94f9835e..b5d70fc1c 100644
--- a/src/libserver/http/http_connection.c
+++ b/src/libserver/http/http_connection.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -1670,7 +1670,22 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted,
{
char datebuf[64];
int meth_len = 0;
- const char *conn_type = "close";
+ const char *server_conn_header, *client_conn_header;
+
+ /* Set up connection header strings based on flags and connection type */
+ if (msg->flags & RSPAMD_HTTP_FLAG_HAS_CONNECTION_HEADER) {
+ server_conn_header = "";
+ client_conn_header = "";
+ }
+ else {
+ server_conn_header = "Connection: close\r\n";
+ if (conn->opts & RSPAMD_HTTP_CLIENT_KEEP_ALIVE) {
+ client_conn_header = "Connection: keep-alive\r\n";
+ }
+ else {
+ client_conn_header = "Connection: close\r\n";
+ }
+ }
if (conn->type == RSPAMD_HTTP_SERVER) {
/* Format reply */
@@ -1712,12 +1727,14 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted,
meth_len =
rspamd_snprintf(repbuf, replen,
"HTTP/1.1 %d %T\r\n"
- "Connection: close\r\n"
+ "%s"
"Server: %s\r\n"
"Date: %s\r\n"
"Content-Length: %z\r\n"
"Content-Type: %s", /* NO \r\n at the end ! */
- msg->code, &status, priv->ctx->config.server_hdr,
+ msg->code, &status,
+ server_conn_header,
+ priv->ctx->config.server_hdr,
datebuf,
bodylen, mime_type);
}
@@ -1725,11 +1742,13 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted,
meth_len =
rspamd_snprintf(repbuf, replen,
"HTTP/1.1 %d %T\r\n"
- "Connection: close\r\n"
+ "%s"
"Server: %s\r\n"
"Date: %s\r\n"
"Content-Length: %z", /* NO \r\n at the end ! */
- msg->code, &status, priv->ctx->config.server_hdr,
+ msg->code, &status,
+ server_conn_header,
+ priv->ctx->config.server_hdr,
datebuf,
bodylen);
}
@@ -1737,11 +1756,12 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted,
/* External reply */
rspamd_printf_fstring(buf,
"HTTP/1.1 200 OK\r\n"
- "Connection: close\r\n"
+ "%s"
"Server: %s\r\n"
"Date: %s\r\n"
"Content-Length: %z\r\n"
"Content-Type: application/octet-stream\r\n",
+ server_conn_header,
priv->ctx->config.server_hdr,
datebuf, enclen);
}
@@ -1750,12 +1770,14 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted,
meth_len =
rspamd_printf_fstring(buf,
"HTTP/1.1 %d %T\r\n"
- "Connection: close\r\n"
+ "%s"
"Server: %s\r\n"
"Date: %s\r\n"
"Content-Length: %z\r\n"
"Content-Type: %s\r\n",
- msg->code, &status, priv->ctx->config.server_hdr,
+ msg->code, &status,
+ server_conn_header,
+ priv->ctx->config.server_hdr,
datebuf,
bodylen, mime_type);
}
@@ -1763,11 +1785,13 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted,
meth_len =
rspamd_printf_fstring(buf,
"HTTP/1.1 %d %T\r\n"
- "Connection: close\r\n"
+ "%s"
"Server: %s\r\n"
"Date: %s\r\n"
"Content-Length: %z\r\n",
- msg->code, &status, priv->ctx->config.server_hdr,
+ msg->code, &status,
+ server_conn_header,
+ priv->ctx->config.server_hdr,
datebuf,
bodylen);
}
@@ -1804,10 +1828,6 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted,
else {
/* Client request */
- if (conn->opts & RSPAMD_HTTP_CLIENT_KEEP_ALIVE) {
- conn_type = "keep-alive";
- }
-
/* Format request */
enclen += RSPAMD_FSTRING_LEN(msg->url) +
strlen(http_method_str(msg->method)) + 1;
@@ -1819,21 +1839,21 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted,
"%s %s HTTP/1.0\r\n"
"Content-Length: %z\r\n"
"Content-Type: application/octet-stream\r\n"
- "Connection: %s\r\n",
+ "%s",
"POST",
"/post",
enclen,
- conn_type);
+ client_conn_header);
}
else {
rspamd_printf_fstring(buf,
"%s %V HTTP/1.0\r\n"
"Content-Length: %z\r\n"
- "Connection: %s\r\n",
+ "%s",
http_method_str(msg->method),
msg->url,
bodylen,
- conn_type);
+ client_conn_header);
if (bodylen > 0) {
if (mime_type == NULL) {
@@ -1857,26 +1877,26 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted,
if (rspamd_http_message_is_standard_port(msg)) {
rspamd_printf_fstring(buf,
"%s %s HTTP/1.1\r\n"
- "Connection: %s\r\n"
+ "%s"
"Host: %s\r\n"
"Content-Length: %z\r\n"
"Content-Type: application/octet-stream\r\n",
"POST",
"/post",
- conn_type,
+ client_conn_header,
host,
enclen);
}
else {
rspamd_printf_fstring(buf,
"%s %s HTTP/1.1\r\n"
- "Connection: %s\r\n"
+ "%s"
"Host: %s:%d\r\n"
"Content-Length: %z\r\n"
"Content-Type: application/octet-stream\r\n",
"POST",
"/post",
- conn_type,
+ client_conn_header,
host,
msg->port,
enclen);
@@ -1888,21 +1908,21 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted,
if ((msg->flags & RSPAMD_HTTP_FLAG_HAS_HOST_HEADER)) {
rspamd_printf_fstring(buf,
"%s %s://%s:%d/%V HTTP/1.1\r\n"
- "Connection: %s\r\n"
+ "%s"
"Content-Length: %z\r\n",
http_method_str(msg->method),
(conn->opts & RSPAMD_HTTP_CLIENT_SSL) ? "https" : "http",
host,
msg->port,
msg->url,
- conn_type,
+ client_conn_header,
bodylen);
}
else {
if (rspamd_http_message_is_standard_port(msg)) {
rspamd_printf_fstring(buf,
"%s %s://%s:%d/%V HTTP/1.1\r\n"
- "Connection: %s\r\n"
+ "%s"
"Host: %s\r\n"
"Content-Length: %z\r\n",
http_method_str(msg->method),
@@ -1910,14 +1930,14 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted,
host,
msg->port,
msg->url,
- conn_type,
+ client_conn_header,
host,
bodylen);
}
else {
rspamd_printf_fstring(buf,
"%s %s://%s:%d/%V HTTP/1.1\r\n"
- "Connection: %s\r\n"
+ "%s"
"Host: %s:%d\r\n"
"Content-Length: %z\r\n",
http_method_str(msg->method),
@@ -1925,7 +1945,7 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted,
host,
msg->port,
msg->url,
- conn_type,
+ client_conn_header,
host,
msg->port,
bodylen);
@@ -1937,35 +1957,35 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted,
if ((msg->flags & RSPAMD_HTTP_FLAG_HAS_HOST_HEADER)) {
rspamd_printf_fstring(buf,
"%s %V HTTP/1.1\r\n"
- "Connection: %s\r\n"
+ "%s"
"Content-Length: %z\r\n",
http_method_str(msg->method),
msg->url,
- conn_type,
+ client_conn_header,
bodylen);
}
else {
if (rspamd_http_message_is_standard_port(msg)) {
rspamd_printf_fstring(buf,
"%s %V HTTP/1.1\r\n"
- "Connection: %s\r\n"
+ "%s"
"Host: %s\r\n"
"Content-Length: %z\r\n",
http_method_str(msg->method),
msg->url,
- conn_type,
+ client_conn_header,
host,
bodylen);
}
else {
rspamd_printf_fstring(buf,
"%s %V HTTP/1.1\r\n"
- "Connection: %s\r\n"
+ "%s"
"Host: %s:%d\r\n"
"Content-Length: %z\r\n",
http_method_str(msg->method),
msg->url,
- conn_type,
+ client_conn_header,
host,
msg->port,
bodylen);
diff --git a/src/libserver/http/http_connection.h b/src/libserver/http/http_connection.h
index f6ec03d95..466a3edd9 100644
--- a/src/libserver/http/http_connection.h
+++ b/src/libserver/http/http_connection.h
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2016 Vsevolod Stakhov
+/*
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -80,9 +80,13 @@ struct rspamd_storage_shmem {
*/
#define RSPAMD_HTTP_FLAG_HAS_HOST_HEADER (1 << 7)
/**
+ * Connection header has been set for a message
+ */
+#define RSPAMD_HTTP_FLAG_HAS_CONNECTION_HEADER (1 << 8)
+/**
* Message is intended for SSL connection
*/
-#define RSPAMD_HTTP_FLAG_WANT_SSL (1 << 8)
+#define RSPAMD_HTTP_FLAG_WANT_SSL (1 << 9)
/**
* Options for HTTP connection
*/
diff --git a/src/libserver/http/http_message.c b/src/libserver/http/http_message.c
index 0c9708450..e5e4a0469 100644
--- a/src/libserver/http/http_message.c
+++ b/src/libserver/http/http_message.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -539,6 +539,9 @@ void rspamd_http_message_add_header_len(struct rspamd_http_message *msg,
if (g_ascii_strcasecmp(name, "host") == 0) {
msg->flags |= RSPAMD_HTTP_FLAG_HAS_HOST_HEADER;
}
+ else if (g_ascii_strcasecmp(name, "connection") == 0) {
+ msg->flags |= RSPAMD_HTTP_FLAG_HAS_CONNECTION_HEADER;
+ }
hdr->combined = rspamd_fstring_sized_new(nlen + vlen + 4);
rspamd_printf_fstring(&hdr->combined, "%s: %*s\r\n", name, (int) vlen,
@@ -746,4 +749,4 @@ const char *rspamd_http_message_get_url(struct rspamd_http_message *msg, gsize *
}
return NULL;
-} \ No newline at end of file
+}
diff --git a/src/libserver/logger/logger.c b/src/libserver/logger/logger.c
index dc0a85a05..600b7f1e1 100644
--- a/src/libserver/logger/logger.c
+++ b/src/libserver/logger/logger.c
@@ -22,7 +22,6 @@
#include "unix-std.h"
#include "logger_private.h"
-
static rspamd_logger_t *default_logger = NULL;
static rspamd_logger_t *emergency_logger = NULL;
static struct rspamd_log_modules *log_modules = NULL;
@@ -30,6 +29,61 @@ static struct rspamd_log_modules *log_modules = NULL;
static const char lf_chr = '\n';
unsigned int rspamd_task_log_id = (unsigned int) -1;
+
+/**
+ * Strip log tag according to the configured policy
+ * @param original_tag original log tag
+ * @param original_len length of original tag
+ * @param dest destination buffer
+ * @param max_len maximum length allowed
+ * @param policy stripping policy
+ * @return actual length of stripped tag
+ */
+static gsize
+rspamd_log_strip_tag(const char *original_tag, gsize original_len,
+ char *dest, gsize max_len,
+ enum rspamd_log_tag_strip_policy policy)
+{
+ if (original_len <= max_len) {
+ /* No stripping needed */
+ memcpy(dest, original_tag, original_len);
+ return original_len;
+ }
+
+ switch (policy) {
+ case RSPAMD_LOG_TAG_STRIP_RIGHT:
+ /* Cut right part (current behavior) */
+ memcpy(dest, original_tag, max_len);
+ return max_len;
+
+ case RSPAMD_LOG_TAG_STRIP_LEFT:
+ /* Cut left part (take last elements) */
+ memcpy(dest, original_tag + (original_len - max_len), max_len);
+ return max_len;
+
+ case RSPAMD_LOG_TAG_STRIP_MIDDLE:
+ /* Half from start and half from end */
+ if (max_len >= 2) {
+ gsize first_half = max_len / 2;
+ gsize second_half = max_len - first_half;
+
+ memcpy(dest, original_tag, first_half);
+ memcpy(dest + first_half,
+ original_tag + (original_len - second_half),
+ second_half);
+ }
+ else if (max_len == 1) {
+ /* Just take first character */
+ dest[0] = original_tag[0];
+ }
+ return max_len;
+
+ default:
+ /* Fallback to right stripping */
+ memcpy(dest, original_tag, max_len);
+ return max_len;
+ }
+}
RSPAMD_CONSTRUCTOR(rspamd_task_log_init)
{
rspamd_task_log_id = rspamd_logger_add_debug_module("task");
@@ -160,6 +214,10 @@ rspamd_log_open_emergency(rspamd_mempool_t *pool, int flags)
logger->process_type = "main";
logger->pid = getpid();
+ /* Initialize log tag configuration with defaults */
+ logger->max_log_tag_len = RSPAMD_LOG_ID_LEN; /* Keep backward compatibility default */
+ logger->log_tag_strip_policy = RSPAMD_LOG_TAG_STRIP_RIGHT;
+
const struct rspamd_logger_funcs *funcs = &console_log_funcs;
memcpy(&logger->ops, funcs, sizeof(*funcs));
@@ -258,6 +316,28 @@ rspamd_log_open_specific(rspamd_mempool_t *pool,
logger->process_type = ptype;
logger->enabled = TRUE;
+ /* Initialize log tag configuration with defaults */
+ if (cfg && cfg->log_max_tag_len > 0) {
+ logger->max_log_tag_len = MIN(MEMPOOL_UID_LEN, cfg->log_max_tag_len);
+ }
+ else {
+ logger->max_log_tag_len = RSPAMD_LOG_ID_LEN; /* Keep backward compatibility default */
+ }
+
+ logger->log_tag_strip_policy = RSPAMD_LOG_TAG_STRIP_RIGHT;
+
+ if (cfg && cfg->log_tag_strip_policy_str) {
+ if (g_ascii_strcasecmp(cfg->log_tag_strip_policy_str, "left") == 0) {
+ logger->log_tag_strip_policy = RSPAMD_LOG_TAG_STRIP_LEFT;
+ }
+ else if (g_ascii_strcasecmp(cfg->log_tag_strip_policy_str, "middle") == 0) {
+ logger->log_tag_strip_policy = RSPAMD_LOG_TAG_STRIP_MIDDLE;
+ }
+ else {
+ logger->log_tag_strip_policy = RSPAMD_LOG_TAG_STRIP_RIGHT; /* Default */
+ }
+ }
+
/* Set up conditional logging */
if (cfg) {
if (cfg->debug_ip_map != NULL) {
@@ -1026,16 +1106,34 @@ log_time(double now, rspamd_logger_t *rspamd_log, char *timebuf,
}
}
+/**
+ * Process log ID with stripping policy and return the effective length
+ * @param logger logger instance with configuration
+ * @param id original log ID
+ * @param processed_id buffer to store processed ID (should be at least max_log_tag_len + 1)
+ * @return effective length of processed ID
+ */
static inline int
-rspamd_log_id_strlen(const char *id)
+rspamd_log_process_id(rspamd_logger_t *logger, const char *id, char *processed_id)
{
- for (int i = 0; i < RSPAMD_LOG_ID_LEN; i++) {
- if (G_UNLIKELY(id[i] == '\0')) {
- return i;
- }
+ if (id == NULL) {
+ return 0;
+ }
+
+ gsize original_len = strlen(id);
+ gsize max_len = MIN(MEMPOOL_UID_LEN, logger->max_log_tag_len);
+
+ if (original_len <= max_len) {
+ /* No processing needed */
+ memcpy(processed_id, id, original_len);
+ return original_len;
}
- return RSPAMD_LOG_ID_LEN;
+ /* Apply stripping policy */
+ gsize processed_len = rspamd_log_strip_tag(id, original_len, processed_id, max_len,
+ logger->log_tag_strip_policy);
+
+ return processed_len;
}
void rspamd_log_fill_iov(struct rspamd_logger_iov_ctx *iov_ctx,
@@ -1071,8 +1169,17 @@ void rspamd_log_fill_iov(struct rspamd_logger_iov_ctx *iov_ctx,
if (G_UNLIKELY(log_json)) {
/* Perform JSON logging */
- unsigned int slen = id ? strlen(id) : strlen("(NULL)");
- slen = MIN(RSPAMD_LOG_ID_LEN, slen);
+ char processed_id[MEMPOOL_UID_LEN];
+ int processed_len = 0;
+
+ if (id) {
+ processed_len = rspamd_log_process_id(logger, id, processed_id);
+ }
+ else {
+ strcpy(processed_id, "(NULL)");
+ processed_len = strlen(processed_id);
+ }
+
r = rspamd_snprintf(tmpbuf, sizeof(tmpbuf), "{\"ts\": %f, "
"\"pid\": %P, "
"\"severity\": \"%s\", "
@@ -1085,7 +1192,7 @@ void rspamd_log_fill_iov(struct rspamd_logger_iov_ctx *iov_ctx,
logger->pid,
rspamd_get_log_severity_string(level_flags),
logger->process_type,
- slen, id,
+ processed_len, processed_id,
module,
function);
iov_ctx->iov[0].iov_base = tmpbuf;
@@ -1241,14 +1348,17 @@ void rspamd_log_fill_iov(struct rspamd_logger_iov_ctx *iov_ctx,
glong mremain, mr;
char *m;
+ char processed_id[MEMPOOL_UID_LEN];
+ int processed_len = 0;
modulebuf[0] = '\0';
mremain = sizeof(modulebuf);
m = modulebuf;
if (id != NULL) {
- mr = rspamd_snprintf(m, mremain, "<%*.s>; ", rspamd_log_id_strlen(id),
- id);
+ processed_len = rspamd_log_process_id(logger, id, processed_id);
+ mr = rspamd_snprintf(m, mremain, "<%*.s>; ", processed_len,
+ processed_id);
m += mr;
mremain -= mr;
}
@@ -1300,10 +1410,13 @@ void rspamd_log_fill_iov(struct rspamd_logger_iov_ctx *iov_ctx,
iov_ctx->iov[niov].iov_base = (void *) timebuf;
iov_ctx->iov[niov++].iov_len = strlen(timebuf);
if (id != NULL) {
+ char processed_id[MEMPOOL_UID_LEN];
+ int processed_len = rspamd_log_process_id(logger, id, processed_id);
+
iov_ctx->iov[niov].iov_base = (void *) "; ";
iov_ctx->iov[niov++].iov_len = 2;
- iov_ctx->iov[niov].iov_base = (void *) id;
- iov_ctx->iov[niov++].iov_len = rspamd_log_id_strlen(id);
+ iov_ctx->iov[niov].iov_base = (void *) processed_id;
+ iov_ctx->iov[niov++].iov_len = processed_len;
iov_ctx->iov[niov].iov_base = (void *) ";";
iov_ctx->iov[niov++].iov_len = 1;
}
diff --git a/src/libserver/logger/logger_private.h b/src/libserver/logger/logger_private.h
index 80178ad32..387d8639b 100644
--- a/src/libserver/logger/logger_private.h
+++ b/src/libserver/logger/logger_private.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -23,6 +23,12 @@
#define REPEATS_MAX 300
#define LOGBUF_LEN 8192
+enum rspamd_log_tag_strip_policy {
+ RSPAMD_LOG_TAG_STRIP_RIGHT = 0, /* Cut right part (current behavior) */
+ RSPAMD_LOG_TAG_STRIP_LEFT, /* Cut left part (take last elements) */
+ RSPAMD_LOG_TAG_STRIP_MIDDLE, /* Half from start and half from end */
+};
+
struct rspamd_log_module {
char *mname;
unsigned int id;
@@ -73,6 +79,10 @@ struct rspamd_logger_s {
gboolean is_debug;
gboolean no_lock;
+ /* Log tag configuration */
+ unsigned int max_log_tag_len;
+ enum rspamd_log_tag_strip_policy log_tag_strip_policy;
+
pid_t pid;
const char *process_type;
struct rspamd_radix_map_helper *debug_ip;
diff --git a/src/libserver/maps/map.c b/src/libserver/maps/map.c
index 51390f24b..ac82d39bb 100644
--- a/src/libserver/maps/map.c
+++ b/src/libserver/maps/map.c
@@ -84,7 +84,8 @@ RSPAMD_CONSTRUCTOR(rspamd_map_log_init)
}
/**
- * Write HTTP request
+ * Write HTTP request with proper cache validation headers
+ * Uses ETags (If-None-Match) and Last-Modified (If-Modified-Since) for conditional requests
*/
static void
write_http_request(struct http_callback_data *cbd)
@@ -109,7 +110,8 @@ write_http_request(struct http_callback_data *cbd)
}
if (cbd->data->etag) {
rspamd_http_message_add_header_len(msg, "If-None-Match",
- cbd->data->etag->str, cbd->data->etag->len);
+ cbd->data->etag->str,
+ cbd->data->etag->len);
}
}
@@ -295,23 +297,101 @@ rspamd_map_cache_cb(struct ev_loop *loop, ev_timer *w, int revents)
}
}
+/**
+ * Calculate next check time with proper priority for different cache validation mechanisms
+ * Priority: ETags > Last-Modified > Cache expiration headers
+ * @param now current time
+ * @param expires time from cache expiration header
+ * @param map_check_interval base polling interval
+ * @param has_etag whether we have ETag for conditional requests
+ * @param has_last_modified whether we have Last-Modified for conditional requests
+ * @return next check time
+ */
static inline time_t
-rspamd_http_map_process_next_check(time_t now, time_t expires, time_t map_check_interval)
+rspamd_http_map_process_next_check(struct rspamd_map *map,
+ struct rspamd_map_backend *bk,
+ time_t now,
+ time_t expires,
+ time_t map_check_interval,
+ gboolean has_etag,
+ gboolean has_last_modified)
{
- static const time_t interval_mult = 16;
- /* By default use expires header */
- time_t next_check = expires;
+ static const time_t interval_mult = 4; /* Reduced from 16 to be more responsive */
+ static const time_t min_respectful_interval = 5;
+ time_t next_check;
+ time_t effective_interval = map_check_interval;
+
+ /*
+ * Priority order for cache validation:
+ * 1. ETags (most reliable)
+ * 2. Last-Modified dates
+ * 3. Cache expiration headers (least reliable)
+ */
- if (expires < now) {
- return now;
+ if (has_etag || has_last_modified) {
+ /*
+ * If we have ETags or Last-Modified, we can use conditional requests
+ * to avoid unnecessary downloads. However, we still need to be respectful
+ * to servers and not DoS them with overly aggressive polling.
+ */
+ if (map_check_interval < min_respectful_interval) {
+ /*
+ * User configured very aggressive polling, but server provides cache validation.
+ * Enforce minimum respectful interval to avoid DoS'ing the server.
+ */
+ effective_interval = min_respectful_interval * interval_mult;
+ msg_info_map("map polling interval %d too aggressive with server cache support for %s, "
+ "using %d seconds minimum",
+ (int) map_check_interval, bk->uri, (int) effective_interval);
+ }
+
+ if (expires > now && (expires - now) <= effective_interval * interval_mult) {
+ /* Use expires header if it's reasonable (within interval_mult x poll interval) */
+ next_check = expires;
+ }
+ else {
+ /* Use effective interval, don't extend too much */
+ next_check = now + effective_interval;
+ }
}
- else if (expires - now > map_check_interval * interval_mult) {
- next_check = now + map_check_interval * interval_mult;
+ else if (expires > now) {
+ /*
+ * No ETags or Last-Modified available, rely on cache expiration.
+ * But still cap the interval to avoid too long delays.
+ * No need for respectful interval protection here since no conditional requests.
+ */
+ if (expires - now > map_check_interval * interval_mult) {
+ next_check = now + map_check_interval * interval_mult;
+ }
+ else {
+ next_check = expires;
+ }
+ }
+ else {
+ /* No valid cache information, check immediately */
+ next_check = now;
}
return next_check;
}
+/**
+ * Calculate respectful polling interval to avoid DoS'ing servers with cache validation
+ * @param map_check_interval user configured interval
+ * @return effective interval that respects server resources
+ */
+static inline time_t
+rspamd_map_get_respectful_interval(time_t map_check_interval)
+{
+ static const time_t min_respectful_interval = 5; /* Minimum 5 seconds to be respectful */
+ static const time_t interval_mult = 4; /* Multiplier for respectful minimum */
+
+ if (map_check_interval < min_respectful_interval) {
+ return min_respectful_interval * interval_mult;
+ }
+ return map_check_interval;
+}
+
static int
http_map_finish(struct rspamd_http_connection *conn,
struct rspamd_http_message *msg)
@@ -333,13 +413,15 @@ http_map_finish(struct rspamd_http_connection *conn,
if (msg->code == 200) {
if (cbd->check) {
- msg_info_map("need to reread map from %s", cbd->bk->uri);
+ msg_info_map("need to reread map from %s (reply code 200); "
+ "date timestamp: %z, last modified: %z",
+ cbd->bk->uri, (size_t) msg->date, (size_t) msg->last_modified);
cbd->periodic->need_modify = TRUE;
/* Reset the whole chain */
cbd->periodic->cur_backend = 0;
/* Reset cache, old cached data will be cleaned on timeout */
g_atomic_int_set(&data->cache->available, 0);
- g_atomic_int_set(&bk->shared->loaded, 0);
+ g_atomic_int_set(&map->shared->loaded, 0);
data->cur_cache_cbd = NULL;
rspamd_map_process_periodic(cbd->periodic);
@@ -348,6 +430,7 @@ http_map_finish(struct rspamd_http_connection *conn,
return 0;
}
+ /* This code is executed when we are actually reading a map */
cbd->data->last_checked = msg->date;
if (msg->last_modified) {
@@ -378,10 +461,11 @@ http_map_finish(struct rspamd_http_connection *conn,
goto err;
}
- /* Check for expires */
+ /* Check for expires + etag */
double cached_timeout = map->poll_timeout * 2;
expires_hdr = rspamd_http_message_find_header(msg, "Expires");
+ etag_hdr = rspamd_http_message_find_header(msg, "ETag");
if (expires_hdr) {
time_t hdate;
@@ -389,8 +473,10 @@ http_map_finish(struct rspamd_http_connection *conn,
hdate = rspamd_http_parse_date(expires_hdr->begin, expires_hdr->len);
if (hdate != (time_t) -1 && hdate > msg->date) {
- map->next_check = rspamd_http_map_process_next_check(msg->date, hdate,
- (time_t) map->poll_timeout);
+ map->next_check = rspamd_http_map_process_next_check(map, bk, msg->date, hdate,
+ (time_t) map->poll_timeout,
+ etag_hdr != NULL,
+ msg->last_modified != 0);
cached_timeout = map->next_check - msg->date;
}
else {
@@ -398,9 +484,16 @@ http_map_finish(struct rspamd_http_connection *conn,
map->next_check = 0;
}
}
-
- /* Check for etag */
- etag_hdr = rspamd_http_message_find_header(msg, "ETag");
+ else if (etag_hdr != NULL || msg->last_modified != 0) {
+ /* No expires header, but we have ETag or Last-Modified - use respectful interval */
+ time_t effective_interval = rspamd_map_get_respectful_interval(map->poll_timeout);
+ if (effective_interval != map->poll_timeout) {
+ msg_info_map("map polling interval %d too aggressive with server cache support, "
+ "using %d seconds minimum",
+ (int) map->poll_timeout, (int) effective_interval);
+ }
+ map->next_check = msg->date + effective_interval;
+ }
if (etag_hdr) {
if (cbd->data->etag) {
@@ -421,12 +514,7 @@ http_map_finish(struct rspamd_http_connection *conn,
MAP_RETAIN(cbd->shmem_data, "shmem_data");
cbd->data->gen++;
- /*
- * We know that a map is in the locked state
- */
- g_atomic_int_set(&data->cache->available, 1);
- g_atomic_int_set(&bk->shared->loaded, 1);
- g_atomic_int_set(&bk->shared->cached, 0);
+
/* Store cached data */
rspamd_strlcpy(data->cache->shmem_name, cbd->shmem_data->shm_name,
sizeof(data->cache->shmem_name));
@@ -528,6 +616,12 @@ http_map_finish(struct rspamd_http_connection *conn,
cbd->periodic->cur_backend++;
munmap(in, dlen);
+
+ /* Announce for other processes */
+ g_atomic_int_set(&data->cache->available, 1);
+ g_atomic_int_set(&map->shared->loaded, 1);
+ g_atomic_int_set(&map->shared->cached, 1);
+
rspamd_map_process_periodic(cbd->periodic);
}
else if (msg->code == 304 && cbd->check) {
@@ -541,20 +635,34 @@ http_map_finish(struct rspamd_http_connection *conn,
}
expires_hdr = rspamd_http_message_find_header(msg, "Expires");
+ bool has_expires = (expires_hdr != NULL);
if (expires_hdr) {
time_t hdate;
hdate = rspamd_http_parse_date(expires_hdr->begin, expires_hdr->len);
if (hdate != (time_t) -1 && hdate > msg->date) {
- map->next_check = rspamd_http_map_process_next_check(msg->date, hdate,
- (time_t) map->poll_timeout);
+ map->next_check = rspamd_http_map_process_next_check(map, bk, msg->date, hdate,
+ (time_t) map->poll_timeout,
+ cbd->data->etag != NULL,
+ msg->last_modified != 0);
}
else {
msg_info_map("invalid expires header: %T, ignore it", expires_hdr);
map->next_check = 0;
+ has_expires = false;
}
}
+ else if (cbd->data->etag != NULL || msg->last_modified != 0) {
+ /* No expires header, but we have ETag or Last-Modified - use respectful interval */
+ time_t effective_interval = rspamd_map_get_respectful_interval(map->poll_timeout);
+ if (effective_interval != map->poll_timeout) {
+ msg_info_map("map polling interval %d too aggressive with server cache support, "
+ "using %d seconds minimum",
+ (int) map->poll_timeout, (int) effective_interval);
+ }
+ map->next_check = msg->date + effective_interval;
+ }
etag_hdr = rspamd_http_message_find_header(msg, "ETag");
@@ -567,19 +675,24 @@ http_map_finish(struct rspamd_http_connection *conn,
}
}
- if (map->next_check) {
+ if (has_expires) {
rspamd_http_date_format(next_check_date, sizeof(next_check_date),
map->next_check);
- msg_info_map("data is not modified for server %s, next check at %s "
+ msg_info_map("data is not modified for server %s (%s), next check at %s "
"(http cache based: %T)",
- cbd->data->host, next_check_date, expires_hdr);
+ cbd->data->host,
+ bk->uri,
+ next_check_date,
+ expires_hdr);
}
else {
rspamd_http_date_format(next_check_date, sizeof(next_check_date),
- rspamd_get_calendar_ticks() + map->poll_timeout);
- msg_info_map("data is not modified for server %s, next check at %s "
+ map->next_check);
+ msg_info_map("data is not modified for server %s (%s), next check at %s "
"(timer based)",
- cbd->data->host, next_check_date);
+ cbd->data->host,
+ bk->uri,
+ next_check_date);
}
rspamd_map_update_http_cached_file(map, bk, cbd->data);
@@ -922,7 +1035,7 @@ read_map_file(struct rspamd_map *map, struct file_map_data *data,
map->read_callback(NULL, 0, &periodic->cbdata, TRUE);
}
- g_atomic_int_set(&bk->shared->loaded, 1);
+ g_atomic_int_set(&map->shared->loaded, 1);
return TRUE;
}
@@ -1008,7 +1121,7 @@ read_map_static(struct rspamd_map *map, struct static_map_data *data,
}
data->processed = TRUE;
- g_atomic_int_set(&bk->shared->loaded, 1);
+ g_atomic_int_set(&map->shared->loaded, 1);
return TRUE;
}
@@ -1016,10 +1129,7 @@ read_map_static(struct rspamd_map *map, struct static_map_data *data,
static void
rspamd_map_periodic_dtor(struct map_periodic_cbdata *periodic)
{
- struct rspamd_map *map;
- struct rspamd_map_backend *bk;
-
- map = periodic->map;
+ struct rspamd_map *map = periodic->map;
msg_debug_map("periodic dtor %p; need_modify=%d", periodic, periodic->need_modify);
if (periodic->need_modify || periodic->cbdata.errored) {
@@ -1034,21 +1144,13 @@ rspamd_map_periodic_dtor(struct map_periodic_cbdata *periodic)
/* Not modified */
}
- if (periodic->locked) {
- if (periodic->cur_backend < map->backends->len) {
- bk = (struct rspamd_map_backend *) g_ptr_array_index(map->backends, periodic->cur_backend);
- g_atomic_int_set(&bk->shared->locked, 0);
- msg_debug_map("unlocked map %s", map->name);
- }
-
- if (periodic->map->wrk->state == rspamd_worker_state_running) {
- rspamd_map_schedule_periodic(periodic->map,
- RSPAMD_SYMBOL_RESULT_NORMAL);
- }
- else {
- msg_debug_map("stop scheduling periodics for %s; terminating state",
- periodic->map->name);
- }
+ if (periodic->map->wrk->state == rspamd_worker_state_running) {
+ rspamd_map_schedule_periodic(periodic->map,
+ RSPAMD_MAP_SCHEDULE_NORMAL);
+ }
+ else {
+ msg_debug_map("stop scheduling periodics for %s; terminating state",
+ periodic->map->name);
}
g_free(periodic);
@@ -1448,9 +1550,6 @@ rspamd_map_read_cached(struct rspamd_map *map, struct rspamd_map_backend *bk,
map->read_callback(in, len, &periodic->cbdata, TRUE);
}
- g_atomic_int_set(&bk->shared->loaded, 1);
- g_atomic_int_set(&bk->shared->cached, 1);
-
munmap(in, mmap_len);
return TRUE;
@@ -1488,7 +1587,7 @@ rspamd_map_save_http_cached_file(struct rspamd_map *map,
const unsigned char *data,
gsize len)
{
- char path[PATH_MAX];
+ char path[PATH_MAX], temp_path[PATH_MAX];
unsigned char digest[rspamd_cryptobox_HASHBYTES];
struct rspamd_config *cfg = map->cfg;
int fd;
@@ -1501,8 +1600,10 @@ rspamd_map_save_http_cached_file(struct rspamd_map *map,
rspamd_cryptobox_hash(digest, bk->uri, strlen(bk->uri), NULL, 0);
rspamd_snprintf(path, sizeof(path), "%s%c%*xs.map", cfg->maps_cache_dir,
G_DIR_SEPARATOR, 20, digest);
+ rspamd_snprintf(temp_path, sizeof(temp_path), "%s.tmp.%d.%d", path,
+ (int) getpid(), (int) rspamd_get_calendar_ticks());
- fd = rspamd_file_xopen(path, O_WRONLY | O_TRUNC | O_CREAT,
+ fd = rspamd_file_xopen(temp_path, O_WRONLY | O_TRUNC | O_CREAT,
00600, FALSE);
if (fd == -1) {
@@ -1510,8 +1611,9 @@ rspamd_map_save_http_cached_file(struct rspamd_map *map,
}
if (!rspamd_file_lock(fd, FALSE)) {
- msg_err_map("cannot lock file %s: %s", path, strerror(errno));
+ msg_err_map("cannot lock file %s: %s", temp_path, strerror(errno));
close(fd);
+ unlink(temp_path);
return FALSE;
}
@@ -1530,9 +1632,10 @@ rspamd_map_save_http_cached_file(struct rspamd_map *map,
}
if (write(fd, &header, sizeof(header)) != sizeof(header)) {
- msg_err_map("cannot write file %s (header stage): %s", path, strerror(errno));
+ msg_err_map("cannot write file %s (header stage): %s", temp_path, strerror(errno));
rspamd_file_unlock(fd, FALSE);
close(fd);
+ unlink(temp_path);
return FALSE;
}
@@ -1540,9 +1643,10 @@ rspamd_map_save_http_cached_file(struct rspamd_map *map,
if (header.etag_len > 0) {
if (write(fd, RSPAMD_FSTRING_DATA(htdata->etag), header.etag_len) !=
header.etag_len) {
- msg_err_map("cannot write file %s (etag stage): %s", path, strerror(errno));
+ msg_err_map("cannot write file %s (etag stage): %s", temp_path, strerror(errno));
rspamd_file_unlock(fd, FALSE);
close(fd);
+ unlink(temp_path);
return FALSE;
}
@@ -1550,9 +1654,10 @@ rspamd_map_save_http_cached_file(struct rspamd_map *map,
/* Now write the rest */
if (write(fd, data, len) != len) {
- msg_err_map("cannot write file %s (data stage): %s", path, strerror(errno));
+ msg_err_map("cannot write file %s (data stage): %s", temp_path, strerror(errno));
rspamd_file_unlock(fd, FALSE);
close(fd);
+ unlink(temp_path);
return FALSE;
}
@@ -1560,6 +1665,13 @@ rspamd_map_save_http_cached_file(struct rspamd_map *map,
rspamd_file_unlock(fd, FALSE);
close(fd);
+ /* Atomically move temp file to final location */
+ if (rename(temp_path, path) != 0) {
+ msg_err_map("cannot rename %s to %s: %s", temp_path, path, strerror(errno));
+ unlink(temp_path);
+ return FALSE;
+ }
+
msg_info_map("saved data from %s in %s, %uz bytes", bk->uri, path, len + sizeof(header) + header.etag_len);
return TRUE;
@@ -1693,7 +1805,11 @@ rspamd_map_read_http_cached_file(struct rspamd_map *map,
double now = rspamd_get_calendar_ticks();
if (header.next_check > now) {
- map->next_check = rspamd_http_map_process_next_check(now, header.next_check, map->poll_timeout);
+ /* We assume that we have this data inside the cached file */
+ map->next_check = rspamd_http_map_process_next_check(map, bk, now, header.next_check,
+ map->poll_timeout,
+ header.etag_len > 0,
+ true);
}
else {
map->next_check = now;
@@ -1740,8 +1856,8 @@ rspamd_map_read_http_cached_file(struct rspamd_map *map,
struct tm tm;
char ncheck_buf[32], lm_buf[32];
- g_atomic_int_set(&bk->shared->loaded, 1);
- g_atomic_int_set(&bk->shared->cached, 1);
+ g_atomic_int_set(&map->shared->loaded, 1);
+ g_atomic_int_set(&map->shared->cached, 1);
rspamd_localtime(map->next_check, &tm);
strftime(ncheck_buf, sizeof(ncheck_buf) - 1, "%Y-%m-%d %H:%M:%S", &tm);
rspamd_localtime(htdata->last_modified, &tm);
@@ -1784,7 +1900,6 @@ rspamd_map_common_http_callback(struct rspamd_map *map,
(int) data->last_modified,
(int) data->cache->last_modified);
periodic->need_modify = TRUE;
- /* Reset the whole chain */
periodic->cur_backend = 0;
rspamd_map_process_periodic(periodic);
}
@@ -2054,33 +2169,10 @@ rspamd_map_process_periodic(struct map_periodic_cbdata *cbd)
bk = g_ptr_array_index(map->backends, cbd->cur_backend);
- if (!map->file_only && !cbd->locked) {
- if (!g_atomic_int_compare_and_exchange(&bk->shared->locked,
- 0, 1)) {
- msg_debug_map(
- "don't try to reread map %s as it is locked by other process, "
- "will reread it later",
- cbd->map->name);
- rspamd_map_schedule_periodic(map, RSPAMD_MAP_SCHEDULE_LOCKED);
- MAP_RELEASE(cbd, "periodic");
-
- return;
- }
- else {
- msg_debug_map("locked map %s", map->name);
- cbd->locked = TRUE;
- }
- }
-
if (cbd->errored) {
/* We should not check other backends if some backend has failed*/
rspamd_map_schedule_periodic(cbd->map, RSPAMD_MAP_SCHEDULE_ERROR);
- if (cbd->locked) {
- g_atomic_int_set(&bk->shared->locked, 0);
- cbd->locked = FALSE;
- }
-
/* Also set error flag for the map consumer */
cbd->cbdata.errored = true;
@@ -2796,9 +2888,6 @@ rspamd_map_parse_backend(struct rspamd_config *cfg, const char *map_line)
bk->data.sd = sdata;
}
- bk->shared = rspamd_mempool_alloc0_shared(cfg->cfg_pool,
- sizeof(struct rspamd_map_shared_backend_data));
-
return bk;
err:
@@ -2929,6 +3018,8 @@ rspamd_map_add(struct rspamd_config *cfg,
map->user_data = user_data;
map->cfg = cfg;
map->id = rspamd_random_uint64_fast();
+ map->shared =
+ rspamd_mempool_alloc0_shared(cfg->cfg_pool, sizeof(struct rspamd_map_shared_data));
map->backends = g_ptr_array_sized_new(1);
map->wrk = worker;
rspamd_mempool_add_destructor(cfg->cfg_pool, rspamd_ptr_array_free_hard,
@@ -3027,6 +3118,8 @@ rspamd_map_add_from_ucl(struct rspamd_config *cfg,
map->user_data = user_data;
map->cfg = cfg;
map->id = rspamd_random_uint64_fast();
+ map->shared =
+ rspamd_mempool_alloc0_shared(cfg->cfg_pool, sizeof(struct rspamd_map_shared_data));
map->backends = g_ptr_array_new();
map->wrk = worker;
map->no_file_read = (flags & RSPAMD_MAP_FILE_NO_READ);
@@ -3208,7 +3301,7 @@ rspamd_map_add_from_ucl(struct rspamd_config *cfg,
if (all_loaded) {
/* Static map */
- g_atomic_int_set(&bk->shared->loaded, 1);
+ g_atomic_int_set(&map->shared->loaded, 1);
}
rspamd_map_calculate_hash(map);
diff --git a/src/libserver/maps/map_private.h b/src/libserver/maps/map_private.h
index 66949f926..65df8d7f5 100644
--- a/src/libserver/maps/map_private.h
+++ b/src/libserver/maps/map_private.h
@@ -134,20 +134,12 @@ union rspamd_map_backend_data {
struct rspamd_map;
-/*
- * Shared between workers
- */
-struct rspamd_map_shared_backend_data {
- int locked;
- int loaded;
- int cached;
-};
+
struct rspamd_map_backend {
enum fetch_proto protocol;
gboolean is_signed;
gboolean is_compressed;
gboolean is_fallback;
- struct rspamd_map_shared_backend_data *shared;
struct rspamd_map *map;
struct ev_loop *event_loop;
uint64_t id;
@@ -159,6 +151,14 @@ struct rspamd_map_backend {
struct map_periodic_cbdata;
+/*
+ * Shared between workers
+ */
+struct rspamd_map_shared_data {
+ int loaded;
+ int cached;
+};
+
struct rspamd_map {
struct rspamd_dns_resolver *r;
struct rspamd_config *cfg;
@@ -193,6 +193,8 @@ struct rspamd_map {
bool static_only; /* No need to check */
bool no_file_read; /* Do not read files */
bool seen; /* This map has already been watched or pre-loaded */
+ /* Shared lock for temporary disabling of map reading (e.g. when this map is written by UI) */
+ struct rspamd_map_shared_data *shared;
char tag[MEMPOOL_UID_LEN];
};
@@ -209,7 +211,6 @@ struct map_periodic_cbdata {
ev_timer ev;
gboolean need_modify;
gboolean errored;
- gboolean locked;
unsigned int cur_backend;
ref_entry_t ref;
};
diff --git a/src/libserver/milter.c b/src/libserver/milter.c
index 94b0d6cc1..09ddddaba 100644
--- a/src/libserver/milter.c
+++ b/src/libserver/milter.c
@@ -1473,8 +1473,6 @@ rspamd_milter_macro_http(struct rspamd_milter_session *session,
{
rspamd_http_message_add_header_len(msg, QUEUE_ID_HEADER,
found->begin, found->len);
- rspamd_http_message_add_header_len(msg, LOG_TAG_HEADER,
- found->begin, found->len);
}
else
{
@@ -1482,8 +1480,6 @@ rspamd_milter_macro_http(struct rspamd_milter_session *session,
{
rspamd_http_message_add_header_len(msg, QUEUE_ID_HEADER,
found->begin, found->len);
- rspamd_http_message_add_header_len(msg, LOG_TAG_HEADER,
- found->begin, found->len);
}
}
diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c
index 06e9f3328..50b155ae0 100644
--- a/src/libserver/re_cache.c
+++ b/src/libserver/re_cache.c
@@ -998,20 +998,21 @@ rspamd_re_cache_process_selector(struct rspamd_task *task,
return result;
}
+
static inline unsigned int
-rspamd_process_words_vector(GArray *words,
- const unsigned char **scvec,
- unsigned int *lenvec,
- struct rspamd_re_class *re_class,
- unsigned int cnt,
- gboolean *raw)
+rspamd_process_words_vector_kvec(rspamd_words_t *words,
+ const unsigned char **scvec,
+ unsigned int *lenvec,
+ struct rspamd_re_class *re_class,
+ unsigned int cnt,
+ gboolean *raw)
{
unsigned int j;
- rspamd_stat_token_t *tok;
+ rspamd_word_t *tok;
- if (words) {
- for (j = 0; j < words->len; j++) {
- tok = &g_array_index(words, rspamd_stat_token_t, j);
+ if (words && words->a) {
+ for (j = 0; j < kv_size(*words); j++) {
+ tok = &kv_A(*words, j);
if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
@@ -1432,13 +1433,13 @@ rspamd_re_cache_exec_re(struct rspamd_task *task,
PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
{
- if (text_part->utf_words) {
- cnt += text_part->utf_words->len;
+ if (text_part->utf_words.a) {
+ cnt += kv_size(text_part->utf_words);
}
}
- if (task->meta_words && task->meta_words->len > 0) {
- cnt += task->meta_words->len;
+ if (task->meta_words.a && kv_size(task->meta_words) > 0) {
+ cnt += kv_size(task->meta_words);
}
if (cnt > 0) {
@@ -1449,15 +1450,15 @@ rspamd_re_cache_exec_re(struct rspamd_task *task,
PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
{
- if (text_part->utf_words) {
- cnt = rspamd_process_words_vector(text_part->utf_words,
- scvec, lenvec, re_class, cnt, &raw);
+ if (text_part->utf_words.a) {
+ cnt = rspamd_process_words_vector_kvec(&text_part->utf_words,
+ scvec, lenvec, re_class, cnt, &raw);
}
}
- if (task->meta_words) {
- cnt = rspamd_process_words_vector(task->meta_words,
- scvec, lenvec, re_class, cnt, &raw);
+ if (task->meta_words.a) {
+ cnt = rspamd_process_words_vector_kvec(&task->meta_words,
+ scvec, lenvec, re_class, cnt, &raw);
}
ret = rspamd_re_cache_process_regexp_data(rt, re,
diff --git a/src/libserver/roll_history.c b/src/libserver/roll_history.c
index 66a53a597..d0f145d8f 100644
--- a/src/libserver/roll_history.c
+++ b/src/libserver/roll_history.c
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2016 Vsevolod Stakhov
+/*
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -231,7 +231,7 @@ rspamd_roll_history_load(struct roll_history *history, const char *filename)
return FALSE;
}
- parser = ucl_parser_new(0);
+ parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (!ucl_parser_add_fd(parser, fd)) {
msg_warn("cannot parse history file %s: %s", filename,
diff --git a/src/libserver/rspamd_control.c b/src/libserver/rspamd_control.c
index 1bff2ff12..9e35cb575 100644
--- a/src/libserver/rspamd_control.c
+++ b/src/libserver/rspamd_control.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -214,7 +214,7 @@ rspamd_control_write_reply(struct rspamd_control_session *session)
case RSPAMD_CONTROL_FUZZY_STAT:
if (elt->attached_fd != -1) {
/* We have some data to parse */
- parser = ucl_parser_new(0);
+ parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
ucl_object_insert_key(cur,
ucl_object_fromint(
elt->reply.reply.fuzzy_stat.status),
diff --git a/src/libserver/symcache/symcache_impl.cxx b/src/libserver/symcache/symcache_impl.cxx
index c0278cfc1..c1ca2a6ed 100644
--- a/src/libserver/symcache/symcache_impl.cxx
+++ b/src/libserver/symcache/symcache_impl.cxx
@@ -274,7 +274,7 @@ auto symcache::load_items() -> bool
return false;
}
- auto *parser = ucl_parser_new(0);
+ auto *parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
const auto *p = (const std::uint8_t *) (hdr + 1);
if (!ucl_parser_add_chunk(parser, p, cached_map->get_size() - sizeof(*hdr))) {
diff --git a/src/libserver/task.c b/src/libserver/task.c
index bd1e07549..9f5b1f00a 100644
--- a/src/libserver/task.c
+++ b/src/libserver/task.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -196,8 +196,8 @@ void rspamd_task_free(struct rspamd_task *task)
rspamd_email_address_free(task->from_envelope_orig);
}
- if (task->meta_words) {
- g_array_free(task->meta_words, TRUE);
+ if (task->meta_words.a) {
+ kv_destroy(task->meta_words);
}
ucl_object_unref(task->messages);
diff --git a/src/libserver/task.h b/src/libserver/task.h
index 6be350098..1c1778fee 100644
--- a/src/libserver/task.h
+++ b/src/libserver/task.h
@@ -24,6 +24,7 @@
#include "dns.h"
#include "re_cache.h"
#include "khash.h"
+#include "libserver/word.h"
#ifdef __cplusplus
extern "C" {
@@ -187,7 +188,7 @@ struct rspamd_task {
struct rspamd_scan_result *result; /**< Metric result */
khash_t(rspamd_task_lua_cache) lua_cache; /**< cache of lua objects */
GPtrArray *tokens; /**< statistics tokens */
- GArray *meta_words; /**< rspamd_stat_token_t produced from meta headers
+ rspamd_words_t meta_words; /**< rspamd_word_t produced from meta headers
(e.g. Subject) */
GPtrArray *rcpt_envelope; /**< array of rspamd_email_address */
diff --git a/src/libserver/word.h b/src/libserver/word.h
new file mode 100644
index 000000000..7698bf327
--- /dev/null
+++ b/src/libserver/word.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_WORD_H
+#define RSPAMD_WORD_H
+
+#include "config.h"
+#include "fstring.h"
+#include "contrib/libucl/kvec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @file word.h
+ * Word processing structures and definitions
+ */
+
+/* Word flags */
+#define RSPAMD_WORD_FLAG_TEXT (1u << 0)
+#define RSPAMD_WORD_FLAG_META (1u << 1)
+#define RSPAMD_WORD_FLAG_LUA_META (1u << 2)
+#define RSPAMD_WORD_FLAG_EXCEPTION (1u << 3)
+#define RSPAMD_WORD_FLAG_HEADER (1u << 4)
+#define RSPAMD_WORD_FLAG_UNIGRAM (1u << 5)
+#define RSPAMD_WORD_FLAG_UTF (1u << 6)
+#define RSPAMD_WORD_FLAG_NORMALISED (1u << 7)
+#define RSPAMD_WORD_FLAG_STEMMED (1u << 8)
+#define RSPAMD_WORD_FLAG_BROKEN_UNICODE (1u << 9)
+#define RSPAMD_WORD_FLAG_STOP_WORD (1u << 10)
+#define RSPAMD_WORD_FLAG_SKIPPED (1u << 11)
+#define RSPAMD_WORD_FLAG_INVISIBLE_SPACES (1u << 12)
+#define RSPAMD_WORD_FLAG_EMOJI (1u << 13)
+
+/**
+ * Word structure representing tokenized text
+ */
+typedef struct rspamd_word_s {
+ rspamd_ftok_t original; /* utf8 raw */
+ rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized, lowercased */
+ rspamd_ftok_t normalized; /* normalized and lowercased utf8 */
+ rspamd_ftok_t stemmed; /* stemmed utf8 */
+ unsigned int flags;
+} rspamd_word_t;
+
+/**
+ * Vector of words using kvec
+ */
+typedef kvec_t(rspamd_word_t) rspamd_words_t;
+
+/* Legacy typedefs for backward compatibility */
+typedef rspamd_word_t rspamd_stat_token_t;
+
+/* Legacy flag aliases for backward compatibility */
+#define RSPAMD_STAT_TOKEN_FLAG_TEXT RSPAMD_WORD_FLAG_TEXT
+#define RSPAMD_STAT_TOKEN_FLAG_META RSPAMD_WORD_FLAG_META
+#define RSPAMD_STAT_TOKEN_FLAG_LUA_META RSPAMD_WORD_FLAG_LUA_META
+#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION RSPAMD_WORD_FLAG_EXCEPTION
+#define RSPAMD_STAT_TOKEN_FLAG_HEADER RSPAMD_WORD_FLAG_HEADER
+#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM RSPAMD_WORD_FLAG_UNIGRAM
+#define RSPAMD_STAT_TOKEN_FLAG_UTF RSPAMD_WORD_FLAG_UTF
+#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED RSPAMD_WORD_FLAG_NORMALISED
+#define RSPAMD_STAT_TOKEN_FLAG_STEMMED RSPAMD_WORD_FLAG_STEMMED
+#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE RSPAMD_WORD_FLAG_BROKEN_UNICODE
+#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD RSPAMD_WORD_FLAG_STOP_WORD
+#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED RSPAMD_WORD_FLAG_SKIPPED
+#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES RSPAMD_WORD_FLAG_INVISIBLE_SPACES
+#define RSPAMD_STAT_TOKEN_FLAG_EMOJI RSPAMD_WORD_FLAG_EMOJI
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RSPAMD_WORD_H */
diff --git a/src/libserver/worker_util.c b/src/libserver/worker_util.c
index d0ac8d8d3..685ee9cd2 100644
--- a/src/libserver/worker_util.c
+++ b/src/libserver/worker_util.c
@@ -2138,7 +2138,7 @@ rspamd_controller_load_saved_stats(struct rspamd_main *rspamd_main,
return;
}
- parser = ucl_parser_new(0);
+ parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (!ucl_parser_add_file(parser, cfg->stats_file)) {
msg_err_config("cannot parse controller stats from %s: %s",
diff --git a/src/libstat/CMakeLists.txt b/src/libstat/CMakeLists.txt
index 64d572a57..eddf64e49 100644
--- a/src/libstat/CMakeLists.txt
+++ b/src/libstat/CMakeLists.txt
@@ -1,25 +1,26 @@
# Librspamdserver
-SET(LIBSTATSRC ${CMAKE_CURRENT_SOURCE_DIR}/stat_config.c
- ${CMAKE_CURRENT_SOURCE_DIR}/stat_process.c)
+SET(LIBSTATSRC ${CMAKE_CURRENT_SOURCE_DIR}/stat_config.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/stat_process.c)
-SET(TOKENIZERSSRC ${CMAKE_CURRENT_SOURCE_DIR}/tokenizers/tokenizers.c
- ${CMAKE_CURRENT_SOURCE_DIR}/tokenizers/osb.c)
+SET(TOKENIZERSSRC ${CMAKE_CURRENT_SOURCE_DIR}/tokenizers/tokenizers.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/tokenizers/tokenizer_manager.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/tokenizers/osb.c)
-SET(CLASSIFIERSSRC ${CMAKE_CURRENT_SOURCE_DIR}/classifiers/bayes.c
- ${CMAKE_CURRENT_SOURCE_DIR}/classifiers/lua_classifier.c)
+SET(CLASSIFIERSSRC ${CMAKE_CURRENT_SOURCE_DIR}/classifiers/bayes.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/classifiers/lua_classifier.c)
-SET(BACKENDSSRC ${CMAKE_CURRENT_SOURCE_DIR}/backends/mmaped_file.c
- ${CMAKE_CURRENT_SOURCE_DIR}/backends/sqlite3_backend.c
- ${CMAKE_CURRENT_SOURCE_DIR}/backends/cdb_backend.cxx
- ${CMAKE_CURRENT_SOURCE_DIR}/backends/http_backend.cxx
- ${CMAKE_CURRENT_SOURCE_DIR}/backends/redis_backend.cxx)
+SET(BACKENDSSRC ${CMAKE_CURRENT_SOURCE_DIR}/backends/mmaped_file.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/backends/sqlite3_backend.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/backends/cdb_backend.cxx
+ ${CMAKE_CURRENT_SOURCE_DIR}/backends/http_backend.cxx
+ ${CMAKE_CURRENT_SOURCE_DIR}/backends/redis_backend.cxx)
-SET(CACHESSRC ${CMAKE_CURRENT_SOURCE_DIR}/learn_cache/sqlite3_cache.c
+SET(CACHESSRC ${CMAKE_CURRENT_SOURCE_DIR}/learn_cache/sqlite3_cache.c
${CMAKE_CURRENT_SOURCE_DIR}/learn_cache/redis_cache.cxx)
SET(RSPAMD_STAT ${LIBSTATSRC}
- ${TOKENIZERSSRC}
- ${CLASSIFIERSSRC}
- ${BACKENDSSRC}
- ${CACHESSRC} PARENT_SCOPE)
+ ${TOKENIZERSSRC}
+ ${CLASSIFIERSSRC}
+ ${BACKENDSSRC}
+ ${CACHESSRC} PARENT_SCOPE)
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index f28922588..811566ad3 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -20,6 +20,7 @@
#include "task.h"
#include "lua/lua_common.h"
#include "contrib/libev/ev.h"
+#include "libserver/word.h"
#ifdef __cplusplus
extern "C" {
@@ -30,36 +31,14 @@ extern "C" {
* High level statistics API
*/
-#define RSPAMD_STAT_TOKEN_FLAG_TEXT (1u << 0)
-#define RSPAMD_STAT_TOKEN_FLAG_META (1u << 1)
-#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1u << 2)
-#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1u << 3)
-#define RSPAMD_STAT_TOKEN_FLAG_HEADER (1u << 4)
-#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1u << 5)
-#define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6)
-#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7)
-#define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8)
-#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
-#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10)
-#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 11)
-#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 12)
-#define RSPAMD_STAT_TOKEN_FLAG_EMOJI (1u << 13)
-
-typedef struct rspamd_stat_token_s {
- rspamd_ftok_t original; /* utf8 raw */
- rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized, lowercased */
- rspamd_ftok_t normalized; /* normalized and lowercased utf8 */
- rspamd_ftok_t stemmed; /* stemmed utf8 */
- unsigned int flags;
-} rspamd_stat_token_t;
#define RSPAMD_TOKEN_VALUE_TYPE float
typedef struct token_node_s {
uint64_t data;
unsigned int window_idx;
unsigned int flags;
- rspamd_stat_token_t *t1;
- rspamd_stat_token_t *t2;
+ rspamd_word_t *t1;
+ rspamd_word_t *t2;
RSPAMD_TOKEN_VALUE_TYPE values[0];
} rspamd_token_t;
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 17caf4cc6..176064087 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -36,12 +36,13 @@ static void
rspamd_stat_tokenize_parts_metadata(struct rspamd_stat_ctx *st_ctx,
struct rspamd_task *task)
{
- GArray *ar;
- rspamd_stat_token_t elt;
+ rspamd_words_t *words;
+ rspamd_word_t elt;
unsigned int i;
lua_State *L = task->cfg->lua_state;
- ar = g_array_sized_new(FALSE, FALSE, sizeof(elt), 16);
+ words = rspamd_mempool_alloc(task->task_pool, sizeof(*words));
+ kv_init(*words);
memset(&elt, 0, sizeof(elt));
elt.flags = RSPAMD_STAT_TOKEN_FLAG_META;
@@ -87,7 +88,7 @@ rspamd_stat_tokenize_parts_metadata(struct rspamd_stat_ctx *st_ctx,
elt.normalized.begin = elt.original.begin;
elt.normalized.len = elt.original.len;
- g_array_append_val(ar, elt);
+ kv_push_safe(rspamd_word_t, *words, elt, meta_words_error);
}
lua_pop(L, 1);
@@ -99,17 +100,20 @@ rspamd_stat_tokenize_parts_metadata(struct rspamd_stat_ctx *st_ctx,
}
- if (ar->len > 0) {
+ if (kv_size(*words) > 0) {
st_ctx->tokenizer->tokenize_func(st_ctx,
task,
- ar,
+ words,
TRUE,
"M",
task->tokens);
}
- rspamd_mempool_add_destructor(task->task_pool,
- rspamd_array_free_hard, ar);
+ return;
+meta_words_error:
+
+ msg_err("cannot process meta words for task"
+ "memory allocation error, skipping the remaining");
}
/*
@@ -134,8 +138,8 @@ void rspamd_stat_process_tokenize(struct rspamd_stat_ctx *st_ctx,
PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part)
{
- if (!IS_TEXT_PART_EMPTY(part) && part->utf_words != NULL) {
- reserved_len += part->utf_words->len;
+ if (!IS_TEXT_PART_EMPTY(part) && part->utf_words.a) {
+ reserved_len += kv_size(part->utf_words);
}
/* XXX: normal window size */
reserved_len += 5;
@@ -149,9 +153,9 @@ void rspamd_stat_process_tokenize(struct rspamd_stat_ctx *st_ctx,
PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part)
{
- if (!IS_TEXT_PART_EMPTY(part) && part->utf_words != NULL) {
+ if (!IS_TEXT_PART_EMPTY(part) && part->utf_words.a) {
st_ctx->tokenizer->tokenize_func(st_ctx, task,
- part->utf_words, IS_TEXT_PART_UTF(part),
+ &part->utf_words, IS_TEXT_PART_UTF(part),
NULL, task->tokens);
}
@@ -163,10 +167,10 @@ void rspamd_stat_process_tokenize(struct rspamd_stat_ctx *st_ctx,
}
}
- if (task->meta_words != NULL) {
+ if (task->meta_words.a) {
st_ctx->tokenizer->tokenize_func(st_ctx,
task,
- task->meta_words,
+ &task->meta_words,
TRUE,
"SUBJECT",
task->tokens);
diff --git a/src/libstat/tokenizers/custom_tokenizer.h b/src/libstat/tokenizers/custom_tokenizer.h
new file mode 100644
index 000000000..bc173a1da
--- /dev/null
+++ b/src/libstat/tokenizers/custom_tokenizer.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_CUSTOM_TOKENIZER_H
+#define RSPAMD_CUSTOM_TOKENIZER_H
+
+/* Check if we're being included by internal Rspamd code or external plugins */
+#ifdef RSPAMD_TOKENIZER_INTERNAL
+/* Internal Rspamd usage - use the full headers */
+#include "config.h"
+#include "ucl.h"
+#include "libserver/word.h"
+#else
+/* External plugin usage - use standalone types */
+#include "rspamd_tokenizer_types.h"
+/* Forward declaration for UCL object - plugins should include ucl.h if needed */
+typedef struct ucl_object_s ucl_object_t;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RSPAMD_CUSTOM_TOKENIZER_API_VERSION 1
+
+/**
+ * Tokenization result - compatible with both internal and external usage
+ */
+typedef rspamd_words_t rspamd_tokenizer_result_t;
+
+/**
+ * Custom tokenizer API that must be implemented by language-specific tokenizer plugins
+ * All functions use only plain C types to ensure clean boundaries
+ */
+typedef struct rspamd_custom_tokenizer_api {
+ /* API version for compatibility checking */
+ unsigned int api_version;
+
+ /* Name of the tokenizer (e.g., "japanese_mecab") */
+ const char *name;
+
+ /**
+ * Global initialization function called once when the tokenizer is loaded
+ * @param config UCL configuration object for this tokenizer (may be NULL)
+ * @param error_buf Buffer for error message (at least 256 bytes)
+ * @return 0 on success, non-zero on failure
+ */
+ int (*init)(const ucl_object_t *config, char *error_buf, size_t error_buf_size);
+
+ /**
+ * Global cleanup function called when the tokenizer is unloaded
+ */
+ void (*deinit)(void);
+
+ /**
+ * Quick language detection to check if this tokenizer can handle the text
+ * @param text UTF-8 text to analyze
+ * @param len Length of the text in bytes
+ * @return Confidence score 0.0-1.0, or -1.0 if cannot handle
+ */
+ double (*detect_language)(const char *text, size_t len);
+
+ /**
+ * Main tokenization function
+ * @param text UTF-8 text to tokenize
+ * @param len Length of the text in bytes
+ * @param result Output kvec to fill with rspamd_word_t elements
+ * @return 0 on success, non-zero on failure
+ *
+ * The tokenizer should allocate result->a using its own allocator
+ * Rspamd will call cleanup_result() to free it after processing
+ */
+ int (*tokenize)(const char *text, size_t len,
+ rspamd_tokenizer_result_t *result);
+
+ /**
+ * Cleanup the result from tokenize()
+ * @param result Result kvec returned by tokenize()
+ *
+ * This function should free result->a using the same allocator
+ * that was used in tokenize() and reset the kvec fields.
+ * This ensures proper memory management across DLL boundaries.
+ * Note: This does NOT free the result structure itself, only its contents.
+ */
+ void (*cleanup_result)(rspamd_tokenizer_result_t *result);
+
+ /**
+ * Optional: Get language hint for better language detection
+ * @return Language code (e.g., "ja", "zh") or NULL
+ */
+ const char *(*get_language_hint)(void);
+
+ /**
+ * Optional: Get minimum confidence threshold for this tokenizer
+ * @return Minimum confidence (0.0-1.0) or -1.0 to use default
+ */
+ double (*get_min_confidence)(void);
+
+} rspamd_custom_tokenizer_api_t;
+
+/**
+ * Entry point function that plugins must export
+ * Must be named "rspamd_tokenizer_get_api"
+ */
+typedef const rspamd_custom_tokenizer_api_t *(*rspamd_tokenizer_get_api_func)(void);
+
+/* Internal Rspamd structures - not exposed to plugins */
+#ifdef RSPAMD_TOKENIZER_INTERNAL
+
+/**
+ * Custom tokenizer instance
+ */
+struct rspamd_custom_tokenizer {
+ char *name; /* Tokenizer name from config */
+ char *path; /* Path to .so file */
+ void *handle; /* dlopen handle */
+ const rspamd_custom_tokenizer_api_t *api; /* API functions */
+ double priority; /* Detection priority */
+ double min_confidence; /* Minimum confidence threshold */
+ gboolean enabled; /* Is tokenizer enabled */
+ ucl_object_t *config; /* Tokenizer-specific config */
+};
+
+/**
+ * Tokenizer manager structure
+ */
+struct rspamd_tokenizer_manager {
+ GHashTable *tokenizers; /* name -> rspamd_custom_tokenizer */
+ GArray *detection_order; /* Ordered by priority */
+ rspamd_mempool_t *pool;
+ double default_threshold; /* Default confidence threshold */
+};
+
+/* Manager functions */
+struct rspamd_tokenizer_manager *rspamd_tokenizer_manager_new(rspamd_mempool_t *pool);
+void rspamd_tokenizer_manager_destroy(struct rspamd_tokenizer_manager *mgr);
+
+gboolean rspamd_tokenizer_manager_load_tokenizer(struct rspamd_tokenizer_manager *mgr,
+ const char *name,
+ const ucl_object_t *config,
+ GError **err);
+
+struct rspamd_custom_tokenizer *rspamd_tokenizer_manager_detect(
+ struct rspamd_tokenizer_manager *mgr,
+ const char *text, size_t len,
+ double *confidence,
+ const char *lang_hint,
+ const char **detected_lang_hint);
+
+/* Helper function to tokenize with exceptions handling */
+rspamd_tokenizer_result_t *rspamd_custom_tokenizer_tokenize_with_exceptions(
+ struct rspamd_custom_tokenizer *tokenizer,
+ const char *text,
+ gsize len,
+ GList *exceptions,
+ rspamd_mempool_t *pool);
+
+#endif /* RSPAMD_TOKENIZER_INTERNAL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RSPAMD_CUSTOM_TOKENIZER_H */
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index 0bc3414a5..360c71d36 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -21,6 +21,7 @@
#include "tokenizers.h"
#include "stat_internal.h"
#include "libmime/lang_detection.h"
+#include "libserver/word.h"
/* Size for features pipe */
#define DEFAULT_FEATURE_WINDOW_SIZE 2
@@ -268,7 +269,7 @@ struct token_pipe_entry {
int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
struct rspamd_task *task,
- GArray *words,
+ rspamd_words_t *words,
gboolean is_utf,
const char *prefix,
GPtrArray *result)
@@ -282,7 +283,7 @@ int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
gsize token_size;
unsigned int processed = 0, i, w, window_size, token_flags = 0;
- if (words == NULL) {
+ if (words == NULL || !words->a) {
return FALSE;
}
@@ -306,8 +307,8 @@ int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
sizeof(RSPAMD_TOKEN_VALUE_TYPE) * ctx->statfiles->len;
g_assert(token_size > 0);
- for (w = 0; w < words->len; w++) {
- token = &g_array_index(words, rspamd_stat_token_t, w);
+ for (w = 0; w < kv_size(*words); w++) {
+ token = &kv_A(*words, w);
token_flags = token->flags;
const char *begin;
gsize len;
diff --git a/src/libstat/tokenizers/rspamd_tokenizer_types.h b/src/libstat/tokenizers/rspamd_tokenizer_types.h
new file mode 100644
index 000000000..eb8518290
--- /dev/null
+++ b/src/libstat/tokenizers/rspamd_tokenizer_types.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_TOKENIZER_TYPES_H
+#define RSPAMD_TOKENIZER_TYPES_H
+
+/*
+ * Standalone type definitions for custom tokenizers
+ * This header is completely self-contained and does not depend on any external libraries.
+ * Custom tokenizers should include only this header to get access to all necessary types.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Basic string token structure
+ */
+typedef struct rspamd_ftok {
+ size_t len;
+ const char *begin;
+} rspamd_ftok_t;
+
+/**
+ * Unicode string token structure
+ */
+typedef struct rspamd_ftok_unicode {
+ size_t len;
+ const uint32_t *begin;
+} rspamd_ftok_unicode_t;
+
+/* Word flags */
+#define RSPAMD_WORD_FLAG_TEXT (1u << 0u)
+#define RSPAMD_WORD_FLAG_META (1u << 1u)
+#define RSPAMD_WORD_FLAG_LUA_META (1u << 2u)
+#define RSPAMD_WORD_FLAG_EXCEPTION (1u << 3u)
+#define RSPAMD_WORD_FLAG_HEADER (1u << 4u)
+#define RSPAMD_WORD_FLAG_UNIGRAM (1u << 5u)
+#define RSPAMD_WORD_FLAG_UTF (1u << 6u)
+#define RSPAMD_WORD_FLAG_NORMALISED (1u << 7u)
+#define RSPAMD_WORD_FLAG_STEMMED (1u << 8u)
+#define RSPAMD_WORD_FLAG_BROKEN_UNICODE (1u << 9u)
+#define RSPAMD_WORD_FLAG_STOP_WORD (1u << 10u)
+#define RSPAMD_WORD_FLAG_SKIPPED (1u << 11u)
+#define RSPAMD_WORD_FLAG_INVISIBLE_SPACES (1u << 12u)
+#define RSPAMD_WORD_FLAG_EMOJI (1u << 13u)
+
+/**
+ * Word structure
+ */
+typedef struct rspamd_word {
+ rspamd_ftok_t original;
+ rspamd_ftok_unicode_t unicode;
+ rspamd_ftok_t normalized;
+ rspamd_ftok_t stemmed;
+ unsigned int flags;
+} rspamd_word_t;
+
+/**
+ * Array of words
+ */
+typedef struct rspamd_words {
+ rspamd_word_t *a;
+ size_t n;
+ size_t m;
+} rspamd_words_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RSPAMD_TOKENIZER_TYPES_H */
diff --git a/src/libstat/tokenizers/tokenizer_manager.c b/src/libstat/tokenizers/tokenizer_manager.c
new file mode 100644
index 000000000..e6fb5e8d8
--- /dev/null
+++ b/src/libstat/tokenizers/tokenizer_manager.c
@@ -0,0 +1,500 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#include "tokenizers.h"
+#define RSPAMD_TOKENIZER_INTERNAL
+#include "custom_tokenizer.h"
+#include "libutil/util.h"
+#include "libserver/logger.h"
+#include <dlfcn.h>
+
+#define msg_err_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \
+ "tokenizer", "", \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+#define msg_warn_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \
+ "tokenizer", "", \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+#define msg_info_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \
+ "tokenizer", "", \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+#define msg_debug_tokenizer(...) rspamd_conditional_debug_fast(NULL, NULL, \
+ rspamd_tokenizer_log_id, "tokenizer", "", \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+
+INIT_LOG_MODULE(tokenizer)
+
+static void
+rspamd_custom_tokenizer_dtor(gpointer p)
+{
+ struct rspamd_custom_tokenizer *tok = p;
+
+ if (tok) {
+ if (tok->api && tok->api->deinit) {
+ tok->api->deinit();
+ }
+
+ if (tok->handle) {
+ dlclose(tok->handle);
+ }
+
+ if (tok->config) {
+ ucl_object_unref(tok->config);
+ }
+
+ g_free(tok->name);
+ g_free(tok->path);
+ g_free(tok);
+ }
+}
+
+static int
+rspamd_custom_tokenizer_priority_cmp(gconstpointer a, gconstpointer b)
+{
+ const struct rspamd_custom_tokenizer *t1 = *(const struct rspamd_custom_tokenizer **) a;
+ const struct rspamd_custom_tokenizer *t2 = *(const struct rspamd_custom_tokenizer **) b;
+
+ /* Higher priority first */
+ if (t1->priority > t2->priority) {
+ return -1;
+ }
+ else if (t1->priority < t2->priority) {
+ return 1;
+ }
+
+ return 0;
+}
+
+struct rspamd_tokenizer_manager *
+rspamd_tokenizer_manager_new(rspamd_mempool_t *pool)
+{
+ struct rspamd_tokenizer_manager *mgr;
+
+ mgr = rspamd_mempool_alloc0(pool, sizeof(*mgr));
+ mgr->pool = pool;
+ mgr->tokenizers = g_hash_table_new_full(rspamd_strcase_hash,
+ rspamd_strcase_equal,
+ NULL,
+ rspamd_custom_tokenizer_dtor);
+ mgr->detection_order = g_array_new(FALSE, FALSE, sizeof(struct rspamd_custom_tokenizer *));
+ mgr->default_threshold = 0.7; /* Default confidence threshold */
+
+ rspamd_mempool_add_destructor(pool,
+ (rspamd_mempool_destruct_t) g_hash_table_unref,
+ mgr->tokenizers);
+ rspamd_mempool_add_destructor(pool,
+ (rspamd_mempool_destruct_t) rspamd_array_free_hard,
+ mgr->detection_order);
+
+ msg_info_tokenizer("created custom tokenizer manager with default confidence threshold %.3f",
+ mgr->default_threshold);
+
+ return mgr;
+}
+
+void rspamd_tokenizer_manager_destroy(struct rspamd_tokenizer_manager *mgr)
+{
+ /* Cleanup is handled by memory pool destructors */
+}
+
+gboolean
+rspamd_tokenizer_manager_load_tokenizer(struct rspamd_tokenizer_manager *mgr,
+ const char *name,
+ const ucl_object_t *config,
+ GError **err)
+{
+ struct rspamd_custom_tokenizer *tok;
+ const ucl_object_t *elt;
+ rspamd_tokenizer_get_api_func get_api;
+ const rspamd_custom_tokenizer_api_t *api;
+ void *handle;
+ const char *path;
+ gboolean enabled = TRUE;
+ double priority = 50.0;
+ char error_buf[256];
+
+ g_assert(mgr != NULL);
+ g_assert(name != NULL);
+ g_assert(config != NULL);
+
+ msg_info_tokenizer("starting to load custom tokenizer '%s'", name);
+
+ /* Check if enabled */
+ elt = ucl_object_lookup(config, "enabled");
+ if (elt && ucl_object_type(elt) == UCL_BOOLEAN) {
+ enabled = ucl_object_toboolean(elt);
+ }
+
+ if (!enabled) {
+ msg_info_tokenizer("custom tokenizer '%s' is disabled", name);
+ return TRUE;
+ }
+
+ /* Get path */
+ elt = ucl_object_lookup(config, "path");
+ if (!elt || ucl_object_type(elt) != UCL_STRING) {
+ g_set_error(err, g_quark_from_static_string("tokenizer"),
+ EINVAL, "missing 'path' for tokenizer %s", name);
+ return FALSE;
+ }
+ path = ucl_object_tostring(elt);
+ msg_info_tokenizer("custom tokenizer '%s' will be loaded from path: %s", name, path);
+
+ /* Get priority */
+ elt = ucl_object_lookup(config, "priority");
+ if (elt) {
+ priority = ucl_object_todouble(elt);
+ }
+ msg_info_tokenizer("custom tokenizer '%s' priority set to %.1f", name, priority);
+
+ /* Load the shared library */
+ msg_info_tokenizer("loading shared library for custom tokenizer '%s'", name);
+ handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
+ if (!handle) {
+ g_set_error(err, g_quark_from_static_string("tokenizer"),
+ EINVAL, "cannot load tokenizer %s from %s: %s",
+ name, path, dlerror());
+ return FALSE;
+ }
+ msg_info_tokenizer("successfully loaded shared library for custom tokenizer '%s'", name);
+
+ /* Get the API entry point */
+ msg_info_tokenizer("looking up API entry point for custom tokenizer '%s'", name);
+ get_api = (rspamd_tokenizer_get_api_func) dlsym(handle, "rspamd_tokenizer_get_api");
+ if (!get_api) {
+ dlclose(handle);
+ g_set_error(err, g_quark_from_static_string("tokenizer"),
+ EINVAL, "cannot find entry point in %s: %s",
+ path, dlerror());
+ return FALSE;
+ }
+
+ /* Get the API */
+ msg_info_tokenizer("calling API entry point for custom tokenizer '%s'", name);
+ api = get_api();
+ if (!api) {
+ dlclose(handle);
+ g_set_error(err, g_quark_from_static_string("tokenizer"),
+ EINVAL, "tokenizer %s returned NULL API", name);
+ return FALSE;
+ }
+ msg_info_tokenizer("successfully obtained API from custom tokenizer '%s'", name);
+
+ /* Check API version */
+ msg_info_tokenizer("checking API version for custom tokenizer '%s' (got %u, expected %u)",
+ name, api->api_version, RSPAMD_CUSTOM_TOKENIZER_API_VERSION);
+ if (api->api_version != RSPAMD_CUSTOM_TOKENIZER_API_VERSION) {
+ dlclose(handle);
+ g_set_error(err, g_quark_from_static_string("tokenizer"),
+ EINVAL, "tokenizer %s has incompatible API version %u (expected %u)",
+ name, api->api_version, RSPAMD_CUSTOM_TOKENIZER_API_VERSION);
+ return FALSE;
+ }
+
+ /* Create tokenizer instance */
+ tok = g_malloc0(sizeof(*tok));
+ tok->name = g_strdup(name);
+ tok->path = g_strdup(path);
+ tok->handle = handle;
+ tok->api = api;
+ tok->priority = priority;
+ tok->enabled = enabled;
+
+ /* Get tokenizer config */
+ elt = ucl_object_lookup(config, "config");
+ if (elt) {
+ tok->config = ucl_object_ref(elt);
+ }
+
+ /* Get minimum confidence */
+ if (api->get_min_confidence) {
+ tok->min_confidence = api->get_min_confidence();
+ msg_info_tokenizer("custom tokenizer '%s' provides minimum confidence threshold: %.3f",
+ name, tok->min_confidence);
+ }
+ else {
+ tok->min_confidence = mgr->default_threshold;
+ msg_info_tokenizer("custom tokenizer '%s' using default confidence threshold: %.3f",
+ name, tok->min_confidence);
+ }
+
+ /* Initialize the tokenizer */
+ if (api->init) {
+ msg_info_tokenizer("initializing custom tokenizer '%s'", name);
+ error_buf[0] = '\0';
+ if (api->init(tok->config, error_buf, sizeof(error_buf)) != 0) {
+ g_set_error(err, g_quark_from_static_string("tokenizer"),
+ EINVAL, "failed to initialize tokenizer %s: %s",
+ name, error_buf[0] ? error_buf : "unknown error");
+ rspamd_custom_tokenizer_dtor(tok);
+ return FALSE;
+ }
+ msg_info_tokenizer("successfully initialized custom tokenizer '%s'", name);
+ }
+ else {
+ msg_info_tokenizer("custom tokenizer '%s' does not require initialization", name);
+ }
+
+ /* Add to manager */
+ g_hash_table_insert(mgr->tokenizers, tok->name, tok);
+ g_array_append_val(mgr->detection_order, tok);
+
+ /* Re-sort by priority */
+ g_array_sort(mgr->detection_order, rspamd_custom_tokenizer_priority_cmp);
+ msg_info_tokenizer("custom tokenizer '%s' registered and sorted by priority (total tokenizers: %u)",
+ name, mgr->detection_order->len);
+
+ msg_info_tokenizer("successfully loaded custom tokenizer '%s' (priority %.1f) from %s",
+ name, priority, path);
+
+ return TRUE;
+}
+
+struct rspamd_custom_tokenizer *
+rspamd_tokenizer_manager_detect(struct rspamd_tokenizer_manager *mgr,
+ const char *text, size_t len,
+ double *confidence,
+ const char *lang_hint,
+ const char **detected_lang_hint)
+{
+ struct rspamd_custom_tokenizer *tok, *best_tok = NULL;
+ double conf, best_conf = 0.0;
+ unsigned int i;
+
+ g_assert(mgr != NULL);
+ g_assert(text != NULL);
+
+ msg_debug_tokenizer("starting tokenizer detection for text of length %zu", len);
+
+ if (confidence) {
+ *confidence = 0.0;
+ }
+
+ if (detected_lang_hint) {
+ *detected_lang_hint = NULL;
+ }
+
+ /* If we have a language hint, try to find a tokenizer for that language first */
+ if (lang_hint) {
+ msg_info_tokenizer("trying to find tokenizer for language hint: %s", lang_hint);
+ for (i = 0; i < mgr->detection_order->len; i++) {
+ tok = g_array_index(mgr->detection_order, struct rspamd_custom_tokenizer *, i);
+
+ if (!tok->enabled || !tok->api->get_language_hint) {
+ continue;
+ }
+
+ /* Check if this tokenizer handles the hinted language */
+ const char *tok_lang = tok->api->get_language_hint();
+ if (tok_lang && g_ascii_strcasecmp(tok_lang, lang_hint) == 0) {
+ msg_info_tokenizer("found tokenizer '%s' for language hint '%s'", tok->name, lang_hint);
+ /* Found a tokenizer for this language, check if it actually detects it */
+ if (tok->api->detect_language) {
+ conf = tok->api->detect_language(text, len);
+ msg_info_tokenizer("tokenizer '%s' confidence for hinted language: %.3f (threshold: %.3f)",
+ tok->name, conf, tok->min_confidence);
+ if (conf >= tok->min_confidence) {
+ /* Use this tokenizer */
+ msg_info_tokenizer("using tokenizer '%s' for language hint '%s' with confidence %.3f",
+ tok->name, lang_hint, conf);
+ if (confidence) {
+ *confidence = conf;
+ }
+ if (detected_lang_hint) {
+ *detected_lang_hint = tok_lang;
+ }
+ return tok;
+ }
+ }
+ }
+ }
+ msg_info_tokenizer("no suitable tokenizer found for language hint '%s', falling back to general detection", lang_hint);
+ }
+
+ /* Try each tokenizer in priority order */
+ msg_info_tokenizer("trying %u tokenizers for general detection", mgr->detection_order->len);
+ for (i = 0; i < mgr->detection_order->len; i++) {
+ tok = g_array_index(mgr->detection_order, struct rspamd_custom_tokenizer *, i);
+
+ if (!tok->enabled || !tok->api->detect_language) {
+ msg_debug_tokenizer("skipping tokenizer '%s' (enabled: %s, has detect_language: %s)",
+ tok->name, tok->enabled ? "yes" : "no",
+ tok->api->detect_language ? "yes" : "no");
+ continue;
+ }
+
+ conf = tok->api->detect_language(text, len);
+ msg_info_tokenizer("tokenizer '%s' detection confidence: %.3f (threshold: %.3f, current best: %.3f)",
+ tok->name, conf, tok->min_confidence, best_conf);
+
+ if (conf > best_conf && conf >= tok->min_confidence) {
+ best_conf = conf;
+ best_tok = tok;
+ msg_info_tokenizer("tokenizer '%s' is new best with confidence %.3f", tok->name, best_conf);
+
+ /* Early exit if very confident */
+ if (conf >= 0.95) {
+ msg_info_tokenizer("very high confidence (%.3f >= 0.95), using tokenizer '%s' immediately",
+ conf, tok->name);
+ break;
+ }
+ }
+ }
+
+ if (best_tok) {
+ msg_info_tokenizer("selected tokenizer '%s' with confidence %.3f", best_tok->name, best_conf);
+ if (confidence) {
+ *confidence = best_conf;
+ }
+
+ if (detected_lang_hint && best_tok->api->get_language_hint) {
+ *detected_lang_hint = best_tok->api->get_language_hint();
+ msg_info_tokenizer("detected language hint: %s", *detected_lang_hint);
+ }
+ }
+ else {
+ msg_info_tokenizer("no suitable tokenizer found during detection");
+ }
+
+ return best_tok;
+}
+
+/* Helper function to tokenize with a custom tokenizer handling exceptions */
+rspamd_tokenizer_result_t *
+rspamd_custom_tokenizer_tokenize_with_exceptions(
+ struct rspamd_custom_tokenizer *tokenizer,
+ const char *text,
+ gsize len,
+ GList *exceptions,
+ rspamd_mempool_t *pool)
+{
+ rspamd_tokenizer_result_t *words;
+ rspamd_tokenizer_result_t result;
+ struct rspamd_process_exception *ex;
+ GList *cur_ex = exceptions;
+ gsize pos = 0;
+ unsigned int i;
+ int ret;
+
+ /* Allocate result kvec in pool */
+ words = rspamd_mempool_alloc(pool, sizeof(*words));
+ kv_init(*words);
+
+ /* If no exceptions, tokenize the whole text */
+ if (!exceptions) {
+ kv_init(result);
+
+ ret = tokenizer->api->tokenize(text, len, &result);
+ if (ret == 0 && result.a) {
+ /* Copy tokens from result to output */
+ for (i = 0; i < kv_size(result); i++) {
+ rspamd_word_t tok = kv_A(result, i);
+ kv_push(rspamd_word_t, *words, tok);
+ }
+
+ /* Use tokenizer's cleanup function */
+ if (tokenizer->api->cleanup_result) {
+ tokenizer->api->cleanup_result(&result);
+ }
+ }
+
+ return words;
+ }
+
+ /* Process text with exceptions */
+ while (pos < len && cur_ex) {
+ ex = (struct rspamd_process_exception *) cur_ex->data;
+
+ /* Tokenize text before exception */
+ if (ex->pos > pos) {
+ gsize segment_len = ex->pos - pos;
+ kv_init(result);
+
+ ret = tokenizer->api->tokenize(text + pos, segment_len, &result);
+ if (ret == 0 && result.a) {
+ /* Copy tokens from result, adjusting positions for segment offset */
+ for (i = 0; i < kv_size(result); i++) {
+ rspamd_word_t tok = kv_A(result, i);
+
+ /* Adjust pointers to point to the original text */
+ gsize offset_in_segment = tok.original.begin - (text + pos);
+ if (offset_in_segment < segment_len) {
+ tok.original.begin = text + pos + offset_in_segment;
+ /* Ensure we don't go past the exception boundary */
+ if (tok.original.begin + tok.original.len <= text + ex->pos) {
+ kv_push(rspamd_word_t, *words, tok);
+ }
+ }
+ }
+
+ /* Use tokenizer's cleanup function */
+ if (tokenizer->api->cleanup_result) {
+ tokenizer->api->cleanup_result(&result);
+ }
+ }
+ }
+
+ /* Add exception as a special token */
+ rspamd_word_t ex_tok;
+ memset(&ex_tok, 0, sizeof(ex_tok));
+
+ if (ex->type == RSPAMD_EXCEPTION_URL) {
+ ex_tok.original.begin = "!!EX!!";
+ ex_tok.original.len = 6;
+ }
+ else {
+ ex_tok.original.begin = text + ex->pos;
+ ex_tok.original.len = ex->len;
+ }
+ ex_tok.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+ kv_push(rspamd_word_t, *words, ex_tok);
+
+ /* Move past exception */
+ pos = ex->pos + ex->len;
+ cur_ex = g_list_next(cur_ex);
+ }
+
+ /* Process remaining text after last exception */
+ if (pos < len) {
+ kv_init(result);
+
+ ret = tokenizer->api->tokenize(text + pos, len - pos, &result);
+ if (ret == 0 && result.a) {
+ /* Copy tokens from result, adjusting positions for segment offset */
+ for (i = 0; i < kv_size(result); i++) {
+ rspamd_word_t tok = kv_A(result, i);
+
+ /* Adjust pointers to point to the original text */
+ gsize offset_in_segment = tok.original.begin - (text + pos);
+ if (offset_in_segment < (len - pos)) {
+ tok.original.begin = text + pos + offset_in_segment;
+ kv_push(rspamd_word_t, *words, tok);
+ }
+ }
+
+ /* Use tokenizer's cleanup function */
+ if (tokenizer->api->cleanup_result) {
+ tokenizer->api->cleanup_result(&result);
+ }
+ }
+ }
+
+ return words;
+}
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 0ea1bcfc6..8a9f42992 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -23,6 +23,8 @@
#include "contrib/mumhash/mum.h"
#include "libmime/lang_detection.h"
#include "libstemmer.h"
+#define RSPAMD_TOKENIZER_INTERNAL
+#include "custom_tokenizer.h"
#include <unicode/utf8.h>
#include <unicode/uchar.h>
@@ -35,8 +37,8 @@
#include <math.h>
-typedef gboolean (*token_get_function)(rspamd_stat_token_t *buf, char const **pos,
- rspamd_stat_token_t *token,
+typedef gboolean (*token_get_function)(rspamd_word_t *buf, char const **pos,
+ rspamd_word_t *token,
GList **exceptions, gsize *rl, gboolean check_signature);
const char t_delimiters[256] = {
@@ -69,8 +71,8 @@ const char t_delimiters[256] = {
/* Get next word from specified f_str_t buf */
static gboolean
-rspamd_tokenizer_get_word_raw(rspamd_stat_token_t *buf,
- char const **cur, rspamd_stat_token_t *token,
+rspamd_tokenizer_get_word_raw(rspamd_word_t *buf,
+ char const **cur, rspamd_word_t *token,
GList **exceptions, gsize *rl, gboolean unused)
{
gsize remain, pos;
@@ -164,7 +166,7 @@ rspamd_tokenize_check_limit(gboolean decay,
unsigned int nwords,
uint64_t *hv,
uint64_t *prob,
- const rspamd_stat_token_t *token,
+ const rspamd_word_t *token,
gssize remain,
gssize total)
{
@@ -242,9 +244,9 @@ rspamd_utf_word_valid(const unsigned char *text, const unsigned char *end,
} while (0)
static inline void
-rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res)
+rspamd_tokenize_exception(struct rspamd_process_exception *ex, rspamd_words_t *res)
{
- rspamd_stat_token_t token;
+ rspamd_word_t token;
memset(&token, 0, sizeof(token));
@@ -253,7 +255,7 @@ rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res)
token.original.len = sizeof("!!EX!!") - 1;
token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
- g_array_append_val(res, token);
+ kv_push_safe(rspamd_word_t, *res, token, exception_error);
token.flags = 0;
}
else if (ex->type == RSPAMD_EXCEPTION_URL) {
@@ -271,28 +273,33 @@ rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res)
}
token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
- g_array_append_val(res, token);
+ kv_push_safe(rspamd_word_t, *res, token, exception_error);
token.flags = 0;
}
+ return;
+
+exception_error:
+ /* On error, just skip this exception token */
+ return;
}
-GArray *
+rspamd_words_t *
rspamd_tokenize_text(const char *text, gsize len,
const UText *utxt,
enum rspamd_tokenize_type how,
struct rspamd_config *cfg,
GList *exceptions,
uint64_t *hash,
- GArray *cur_words,
+ rspamd_words_t *output_kvec,
rspamd_mempool_t *pool)
{
- rspamd_stat_token_t token, buf;
+ rspamd_word_t token, buf;
const char *pos = NULL;
gsize l = 0;
- GArray *res;
+ rspamd_words_t *res;
GList *cur = exceptions;
- unsigned int min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
+ unsigned int min_len = 0, max_len = 0, word_decay = 0;
uint64_t hv = 0;
gboolean decay = FALSE, long_text_mode = FALSE;
uint64_t prob = 0;
@@ -300,9 +307,12 @@ rspamd_tokenize_text(const char *text, gsize len,
static const gsize long_text_limit = 1 * 1024 * 1024;
static const ev_tstamp max_exec_time = 0.2; /* 200 ms */
ev_tstamp start;
+ struct rspamd_custom_tokenizer *custom_tok = NULL;
+ double custom_confidence = 0.0;
+ const char *detected_lang = NULL;
if (text == NULL) {
- return cur_words;
+ return output_kvec;
}
if (len > long_text_limit) {
@@ -323,15 +333,59 @@ rspamd_tokenize_text(const char *text, gsize len,
min_len = cfg->min_word_len;
max_len = cfg->max_word_len;
word_decay = cfg->words_decay;
- initial_size = word_decay * 2;
}
- if (!cur_words) {
- res = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t),
- initial_size);
+ if (!output_kvec) {
+ res = pool ? rspamd_mempool_alloc0(pool, sizeof(*res)) : g_malloc0(sizeof(*res));
+ ;
}
else {
- res = cur_words;
+ res = output_kvec;
+ }
+
+ /* Try custom tokenizers first if we're in UTF mode */
+ if (cfg && cfg->tokenizer_manager && how == RSPAMD_TOKENIZE_UTF && utxt != NULL) {
+ custom_tok = rspamd_tokenizer_manager_detect(
+ cfg->tokenizer_manager,
+ text, len,
+ &custom_confidence,
+ NULL, /* no input language hint */
+ &detected_lang);
+
+ if (custom_tok && custom_confidence >= custom_tok->min_confidence) {
+ /* Use custom tokenizer with exception handling */
+ rspamd_tokenizer_result_t *custom_res = rspamd_custom_tokenizer_tokenize_with_exceptions(
+ custom_tok, text, len, exceptions, pool);
+
+ if (custom_res) {
+ msg_debug_pool("using custom tokenizer %s (confidence: %.2f) for text tokenization",
+ custom_tok->name, custom_confidence);
+
+ /* Copy custom tokenizer results to output kvec */
+ for (unsigned int i = 0; i < kv_size(*custom_res); i++) {
+ kv_push_safe(rspamd_word_t, *res, kv_A(*custom_res, i), custom_tokenizer_error);
+ }
+
+ /* Calculate hash if needed */
+ if (hash && kv_size(*res) > 0) {
+ for (unsigned int i = 0; i < kv_size(*res); i++) {
+ rspamd_word_t *t = &kv_A(*res, i);
+ if (t->original.len >= sizeof(uint64_t)) {
+ uint64_t tmp;
+ memcpy(&tmp, t->original.begin, sizeof(tmp));
+ hv = mum_hash_step(hv, tmp);
+ }
+ }
+ *hash = mum_hash_finish(hv);
+ }
+
+ return res;
+ }
+ else {
+ msg_warn_pool("custom tokenizer %s failed to tokenize text, falling back to default",
+ custom_tok->name);
+ }
+ }
}
if (G_UNLIKELY(how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
@@ -343,7 +397,7 @@ rspamd_tokenize_text(const char *text, gsize len,
}
if (token.original.len > 0 &&
- rspamd_tokenize_check_limit(decay, word_decay, res->len,
+ rspamd_tokenize_check_limit(decay, word_decay, kv_size(*res),
&hv, &prob, &token, pos - text, len)) {
if (!decay) {
decay = TRUE;
@@ -355,28 +409,28 @@ rspamd_tokenize_text(const char *text, gsize len,
}
if (long_text_mode) {
- if ((res->len + 1) % 16 == 0) {
+ if ((kv_size(*res) + 1) % 16 == 0) {
ev_tstamp now = ev_time();
if (now - start > max_exec_time) {
msg_warn_pool_check(
"too long time has been spent on tokenization:"
- " %.1f ms, limit is %.1f ms; %d words added so far",
+ " %.1f ms, limit is %.1f ms; %z words added so far",
(now - start) * 1e3, max_exec_time * 1e3,
- res->len);
+ kv_size(*res));
goto end;
}
}
}
- g_array_append_val(res, token);
+ kv_push_safe(rspamd_word_t, *res, token, tokenize_error);
- if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) {
+ if (kv_size(*res) * sizeof(token) > (0x1ull << 30u)) {
/* Due to bug in glib ! */
msg_err_pool_check(
- "too many words found: %d, stop tokenization to avoid DoS",
- res->len);
+ "too many words found: %z, stop tokenization to avoid DoS",
+ kv_size(*res));
goto end;
}
@@ -523,7 +577,7 @@ rspamd_tokenize_text(const char *text, gsize len,
}
if (token.original.len > 0 &&
- rspamd_tokenize_check_limit(decay, word_decay, res->len,
+ rspamd_tokenize_check_limit(decay, word_decay, kv_size(*res),
&hv, &prob, &token, p, len)) {
if (!decay) {
decay = TRUE;
@@ -536,15 +590,15 @@ rspamd_tokenize_text(const char *text, gsize len,
if (token.original.len > 0) {
/* Additional check for number of words */
- if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) {
+ if (kv_size(*res) * sizeof(token) > (0x1ull << 30u)) {
/* Due to bug in glib ! */
- msg_err("too many words found: %d, stop tokenization to avoid DoS",
- res->len);
+ msg_err("too many words found: %z, stop tokenization to avoid DoS",
+ kv_size(*res));
goto end;
}
- g_array_append_val(res, token);
+ kv_push_safe(rspamd_word_t, *res, token, tokenize_error);
}
/* Also check for long text mode */
@@ -552,15 +606,15 @@ rspamd_tokenize_text(const char *text, gsize len,
/* Check time each 128 words added */
const int words_check_mask = 0x7F;
- if ((res->len & words_check_mask) == words_check_mask) {
+ if ((kv_size(*res) & words_check_mask) == words_check_mask) {
ev_tstamp now = ev_time();
if (now - start > max_exec_time) {
msg_warn_pool_check(
"too long time has been spent on tokenization:"
- " %.1f ms, limit is %.1f ms; %d words added so far",
+ " %.1f ms, limit is %.1f ms; %z words added so far",
(now - start) * 1e3, max_exec_time * 1e3,
- res->len);
+ kv_size(*res));
goto end;
}
@@ -590,8 +644,14 @@ end:
}
return res;
+
+tokenize_error:
+custom_tokenizer_error:
+ msg_err_pool("failed to allocate memory for tokenization");
+ return res;
}
+
#undef SHIFT_EX
static void
@@ -625,32 +685,38 @@ rspamd_add_metawords_from_str(const char *beg, gsize len,
#endif
}
+ /* Initialize meta_words kvec if not already done */
+ if (!task->meta_words.a) {
+ kv_init(task->meta_words);
+ }
+
if (valid_utf) {
utext_openUTF8(&utxt,
beg,
len,
&uc_err);
- task->meta_words = rspamd_tokenize_text(beg, len,
- &utxt, RSPAMD_TOKENIZE_UTF,
- task->cfg, NULL, NULL,
- task->meta_words,
- task->task_pool);
+ rspamd_tokenize_text(beg, len,
+ &utxt, RSPAMD_TOKENIZE_UTF,
+ task->cfg, NULL, NULL,
+ &task->meta_words,
+ task->task_pool);
utext_close(&utxt);
}
else {
- task->meta_words = rspamd_tokenize_text(beg, len,
- NULL, RSPAMD_TOKENIZE_RAW,
- task->cfg, NULL, NULL, task->meta_words,
- task->task_pool);
+ rspamd_tokenize_text(beg, len,
+ NULL, RSPAMD_TOKENIZE_RAW,
+ task->cfg, NULL, NULL,
+ &task->meta_words,
+ task->task_pool);
}
}
void rspamd_tokenize_meta_words(struct rspamd_task *task)
{
unsigned int i = 0;
- rspamd_stat_token_t *tok;
+ rspamd_word_t *tok;
if (MESSAGE_FIELD(task, subject)) {
rspamd_add_metawords_from_str(MESSAGE_FIELD(task, subject),
@@ -667,7 +733,7 @@ void rspamd_tokenize_meta_words(struct rspamd_task *task)
}
}
- if (task->meta_words != NULL) {
+ if (task->meta_words.a) {
const char *language = NULL;
if (MESSAGE_FIELD(task, text_parts) &&
@@ -680,12 +746,12 @@ void rspamd_tokenize_meta_words(struct rspamd_task *task)
}
}
- rspamd_normalize_words(task->meta_words, task->task_pool);
- rspamd_stem_words(task->meta_words, task->task_pool, language,
+ rspamd_normalize_words(&task->meta_words, task->task_pool);
+ rspamd_stem_words(&task->meta_words, task->task_pool, language,
task->lang_det);
- for (i = 0; i < task->meta_words->len; i++) {
- tok = &g_array_index(task->meta_words, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(task->meta_words); i++) {
+ tok = &kv_A(task->meta_words, i);
tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER;
}
}
@@ -759,7 +825,7 @@ rspamd_ucs32_to_normalised(rspamd_stat_token_t *tok,
tok->normalized.begin = dest;
}
-void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool)
+void rspamd_normalize_single_word(rspamd_word_t *tok, rspamd_mempool_t *pool)
{
UErrorCode uc_err = U_ZERO_ERROR;
UConverter *utf8_converter;
@@ -858,25 +924,27 @@ void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *po
}
}
-void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool)
+
+void rspamd_normalize_words(rspamd_words_t *words, rspamd_mempool_t *pool)
{
- rspamd_stat_token_t *tok;
+ rspamd_word_t *tok;
unsigned int i;
- for (i = 0; i < words->len; i++) {
- tok = &g_array_index(words, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(*words); i++) {
+ tok = &kv_A(*words, i);
rspamd_normalize_single_word(tok, pool);
}
}
-void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
+
+void rspamd_stem_words(rspamd_words_t *words, rspamd_mempool_t *pool,
const char *language,
struct rspamd_lang_detector *lang_detector)
{
static GHashTable *stemmers = NULL;
struct sb_stemmer *stem = NULL;
unsigned int i;
- rspamd_stat_token_t *tok;
+ rspamd_word_t *tok;
char *dest;
gsize dlen;
@@ -909,8 +977,18 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
stem = NULL;
}
}
- for (i = 0; i < words->len; i++) {
- tok = &g_array_index(words, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(*words); i++) {
+ tok = &kv_A(*words, i);
+
+ /* Skip stemming if token has already been stemmed by custom tokenizer */
+ if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_STEMMED) {
+ /* Already stemmed, just check for stop words */
+ if (tok->stemmed.len > 0 && lang_detector != NULL &&
+ rspamd_language_detector_is_stop_word(lang_detector, tok->stemmed.begin, tok->stemmed.len)) {
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD;
+ }
+ continue;
+ }
if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
if (stem) {
@@ -952,4 +1030,4 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
}
}
}
-} \ No newline at end of file
+}
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index d4a8824a8..bb0bb54e2 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
#include "fstring.h"
#include "rspamd.h"
#include "stat_api.h"
+#include "libserver/word.h"
#include <unicode/utext.h>
@@ -43,7 +44,7 @@ struct rspamd_stat_tokenizer {
int (*tokenize_func)(struct rspamd_stat_ctx *ctx,
struct rspamd_task *task,
- GArray *words,
+ rspamd_words_t *words,
gboolean is_utf,
const char *prefix,
GPtrArray *result);
@@ -59,20 +60,20 @@ enum rspamd_tokenize_type {
int token_node_compare_func(gconstpointer a, gconstpointer b);
-/* Tokenize text into array of words (rspamd_stat_token_t type) */
-GArray *rspamd_tokenize_text(const char *text, gsize len,
- const UText *utxt,
- enum rspamd_tokenize_type how,
- struct rspamd_config *cfg,
- GList *exceptions,
- uint64_t *hash,
- GArray *cur_words,
- rspamd_mempool_t *pool);
+/* Tokenize text into kvec of words (rspamd_word_t type) */
+rspamd_words_t *rspamd_tokenize_text(const char *text, gsize len,
+ const UText *utxt,
+ enum rspamd_tokenize_type how,
+ struct rspamd_config *cfg,
+ GList *exceptions,
+ uint64_t *hash,
+ rspamd_words_t *output_kvec,
+ rspamd_mempool_t *pool);
/* OSB tokenize function */
int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
struct rspamd_task *task,
- GArray *words,
+ rspamd_words_t *words,
gboolean is_utf,
const char *prefix,
GPtrArray *result);
@@ -83,11 +84,11 @@ gpointer rspamd_tokenizer_osb_get_config(rspamd_mempool_t *pool,
struct rspamd_lang_detector;
-void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool);
+void rspamd_normalize_single_word(rspamd_word_t *tok, rspamd_mempool_t *pool);
-void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool);
-
-void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
+/* Word processing functions */
+void rspamd_normalize_words(rspamd_words_t *words, rspamd_mempool_t *pool);
+void rspamd_stem_words(rspamd_words_t *words, rspamd_mempool_t *pool,
const char *language,
struct rspamd_lang_detector *lang_detector);
diff --git a/src/libutil/fstring.h b/src/libutil/fstring.h
index 0792ab9fa..ca9f689c8 100644
--- a/src/libutil/fstring.h
+++ b/src/libutil/fstring.h
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2016 Vsevolod Stakhov
+/*
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -30,8 +30,8 @@ extern "C" {
*/
typedef struct f_str_s {
- gsize len;
- gsize allocated;
+ size_t len;
+ size_t allocated;
char str[];
} rspamd_fstring_t;
@@ -40,12 +40,12 @@ typedef struct f_str_s {
#define RSPAMD_FSTRING_LIT(lit) rspamd_fstring_new_init((lit), sizeof(lit) - 1)
typedef struct f_str_tok {
- gsize len;
+ size_t len;
const char *begin;
} rspamd_ftok_t;
typedef struct f_str_unicode_tok {
- gsize len; /* in UChar32 */
+ size_t len; /* in UChar32 */
const UChar32 *begin;
} rspamd_ftok_unicode_t;
diff --git a/src/libutil/mem_pool.c b/src/libutil/mem_pool.c
index 3dc67bc5f..575b4e497 100644
--- a/src/libutil/mem_pool.c
+++ b/src/libutil/mem_pool.c
@@ -403,9 +403,10 @@ rspamd_mempool_new_(gsize size, const char *tag, int flags, const char *loc)
/* Generate new uid */
uint64_t uid = rspamd_random_uint64_fast();
- rspamd_encode_hex_buf((unsigned char *) &uid, sizeof(uid),
- new_pool->tag.uid, sizeof(new_pool->tag.uid) - 1);
- new_pool->tag.uid[sizeof(new_pool->tag.uid) - 1] = '\0';
+ G_STATIC_ASSERT(sizeof(new_pool->tag.uid) >= sizeof(uid) * 2 + 1);
+ int enc_len = rspamd_encode_hex_buf((unsigned char *) &uid, sizeof(uid),
+ new_pool->tag.uid, sizeof(new_pool->tag.uid) - 1);
+ new_pool->tag.uid[enc_len] = '\0';
mem_pool_stat->pools_allocated++;
diff --git a/src/libutil/mem_pool.h b/src/libutil/mem_pool.h
index 651b44661..00d1a2067 100644
--- a/src/libutil/mem_pool.h
+++ b/src/libutil/mem_pool.h
@@ -71,7 +71,7 @@ struct f_str_s;
#endif
#define MEMPOOL_TAG_LEN 16
-#define MEMPOOL_UID_LEN 16
+#define MEMPOOL_UID_LEN 32
/* All pointers are aligned as this variable */
#define MIN_MEM_ALIGNMENT G_MEM_ALIGN
diff --git a/src/libutil/radix.c b/src/libutil/radix.c
index 2cae8e34a..bdd722b49 100644
--- a/src/libutil/radix.c
+++ b/src/libutil/radix.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -66,7 +66,7 @@ radix_find_compressed(radix_compressed_t *tree, const uint8_t *key, gsize keylen
uintptr_t
radix_insert_compressed(radix_compressed_t *tree,
- uint8_t *key, gsize keylen,
+ const uint8_t *key, gsize keylen,
gsize masklen,
uintptr_t value)
{
@@ -128,6 +128,39 @@ radix_insert_compressed(radix_compressed_t *tree,
return old;
}
+uintptr_t
+radix_insert_compressed_addr(radix_compressed_t *tree,
+ const rspamd_inet_addr_t *addr,
+ uintptr_t value)
+{
+ const unsigned char *key;
+ unsigned int klen = 0;
+ unsigned char buf[16];
+
+ if (addr == NULL) {
+ return RADIX_NO_VALUE;
+ }
+
+ key = rspamd_inet_address_get_hash_key(addr, &klen);
+
+ if (key && klen) {
+ if (klen == 4) {
+ /* Map to ipv6 */
+ memset(buf, 0, 10);
+ buf[10] = 0xffu;
+ buf[11] = 0xffu;
+ memcpy(buf + 12, key, klen);
+
+ key = buf;
+ klen = sizeof(buf);
+ }
+
+ return radix_insert_compressed(tree, key, klen, 0, value);
+ }
+
+ return RADIX_NO_VALUE;
+}
+
radix_compressed_t *
radix_create_compressed(const char *tree_name)
diff --git a/src/libutil/radix.h b/src/libutil/radix.h
index c4fe96441..8c1224707 100644
--- a/src/libutil/radix.h
+++ b/src/libutil/radix.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@
#include "mem_pool.h"
#include "util.h"
-#define RADIX_NO_VALUE (uintptr_t) - 1
+#define RADIX_NO_VALUE (uintptr_t) -1
#ifdef __cplusplus
extern "C" {
@@ -39,11 +39,23 @@ typedef struct radix_tree_compressed radix_compressed_t;
*/
uintptr_t
radix_insert_compressed(radix_compressed_t *tree,
- uint8_t *key, gsize keylen,
+ const uint8_t *key, gsize keylen,
gsize masklen,
uintptr_t value);
/**
+ * Insert new address to the radix trie (works for IPv4 or IPv6 addresses)
+ * @param tree radix trie
+ * @param addr address to insert
+ * @param value opaque value pointer
+ * @return previous value of the key or `RADIX_NO_VALUE`
+ */
+uintptr_t
+radix_insert_compressed_addr(radix_compressed_t *tree,
+ const rspamd_inet_addr_t *addr,
+ uintptr_t value);
+
+/**
* Find a key in a radix trie
* @param tree radix trie
* @param key key to find (bitstring)
diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c
index 5fe110eb8..c69c42292 100644
--- a/src/libutil/shingles.c
+++ b/src/libutil/shingles.c
@@ -18,6 +18,7 @@
#include "cryptobox.h"
#include "images.h"
#include "libstat/stat_api.h"
+#include "libserver/word.h"
#define SHINGLES_WINDOW 3
#define SHINGLES_KEY_SIZE rspamd_cryptobox_SIPKEYBYTES
@@ -112,7 +113,7 @@ rspamd_shingles_get_keys_cached(const unsigned char key[SHINGLES_KEY_SIZE])
}
struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
- rspamd_shingles_from_text(GArray *input,
+ rspamd_shingles_from_text(rspamd_words_t *input,
const unsigned char key[16],
rspamd_mempool_t *pool,
rspamd_shingles_filter filter,
@@ -123,12 +124,16 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
uint64_t **hashes;
unsigned char **keys;
rspamd_fstring_t *row;
- rspamd_stat_token_t *word;
+ rspamd_word_t *word;
uint64_t val;
int i, j, k;
gsize hlen, ilen = 0, beg = 0, widx = 0;
enum rspamd_cryptobox_fast_hash_type ht;
+ if (!input || !input->a) {
+ return NULL;
+ }
+
if (pool != NULL) {
res = rspamd_mempool_alloc(pool, sizeof(*res));
}
@@ -138,10 +143,10 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
row = rspamd_fstring_sized_new(256);
- for (i = 0; i < input->len; i++) {
- word = &g_array_index(input, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(*input); i++) {
+ word = &kv_A(*input, i);
- if (!((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0)) {
+ if (!((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0)) {
ilen++;
}
}
@@ -162,10 +167,10 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
for (j = beg; j < i; j++) {
word = NULL;
- while (widx < input->len) {
- word = &g_array_index(input, rspamd_stat_token_t, widx);
+ while (widx < kv_size(*input)) {
+ word = &kv_A(*input, widx);
- if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0) {
+ if ((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0) {
widx++;
}
else {
@@ -237,10 +242,10 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
word = NULL;
- while (widx < input->len) {
- word = &g_array_index(input, rspamd_stat_token_t, widx);
+ while (widx < kv_size(*input)) {
+ word = &kv_A(*input, widx);
- if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0) {
+ if ((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0) {
widx++;
}
else {
diff --git a/src/libutil/shingles.h b/src/libutil/shingles.h
index fe6f16cf8..1ab2c6842 100644
--- a/src/libutil/shingles.h
+++ b/src/libutil/shingles.h
@@ -18,6 +18,7 @@
#include "config.h"
#include "mem_pool.h"
+#include "libserver/word.h"
#define RSPAMD_SHINGLE_SIZE 32
@@ -48,14 +49,14 @@ typedef uint64_t (*rspamd_shingles_filter)(uint64_t *input, gsize count,
/**
* Generate shingles from the input of fixed size strings using lemmatizer
* if needed
- * @param input array of `rspamd_fstring_t`
+ * @param input kvec of `rspamd_word_t`
* @param key secret key used to generate shingles
* @param pool pool to allocate shingles array
* @param filter hashes filtering function
* @param filterd opaque data for filtering function
* @return shingles array
*/
-struct rspamd_shingle *rspamd_shingles_from_text(GArray *input,
+struct rspamd_shingle *rspamd_shingles_from_text(rspamd_words_t *input,
const unsigned char key[16],
rspamd_mempool_t *pool,
rspamd_shingles_filter filter,
diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c
index 3a0f1a06c..f36228680 100644
--- a/src/lua/lua_common.c
+++ b/src/lua/lua_common.c
@@ -2401,7 +2401,7 @@ rspamd_lua_try_load_redis(lua_State *L, const ucl_object_t *obj,
return FALSE;
}
-void rspamd_lua_push_full_word(lua_State *L, rspamd_stat_token_t *w)
+void rspamd_lua_push_full_word(lua_State *L, rspamd_word_t *w)
{
int fl_cnt;
@@ -2521,6 +2521,54 @@ int rspamd_lua_push_words(lua_State *L, GArray *words,
return 1;
}
+int rspamd_lua_push_words_kvec(lua_State *L, rspamd_words_t *words,
+ enum rspamd_lua_words_type how)
+{
+ rspamd_word_t *w;
+ unsigned int i, cnt;
+
+ if (!words || !words->a) {
+ lua_createtable(L, 0, 0);
+ return 1;
+ }
+
+ lua_createtable(L, kv_size(*words), 0);
+
+ for (i = 0, cnt = 1; i < kv_size(*words); i++) {
+ w = &kv_A(*words, i);
+
+ switch (how) {
+ case RSPAMD_LUA_WORDS_STEM:
+ if (w->stemmed.len > 0) {
+ lua_pushlstring(L, w->stemmed.begin, w->stemmed.len);
+ lua_rawseti(L, -2, cnt++);
+ }
+ break;
+ case RSPAMD_LUA_WORDS_NORM:
+ if (w->normalized.len > 0) {
+ lua_pushlstring(L, w->normalized.begin, w->normalized.len);
+ lua_rawseti(L, -2, cnt++);
+ }
+ break;
+ case RSPAMD_LUA_WORDS_RAW:
+ if (w->original.len > 0) {
+ lua_pushlstring(L, w->original.begin, w->original.len);
+ lua_rawseti(L, -2, cnt++);
+ }
+ break;
+ case RSPAMD_LUA_WORDS_FULL:
+ rspamd_lua_push_full_word(L, w);
+ /* Push to the resulting vector */
+ lua_rawseti(L, -2, cnt++);
+ break;
+ default:
+ break;
+ }
+ }
+
+ return 1;
+}
+
char *
rspamd_lua_get_module_name(lua_State *L)
{
@@ -2658,4 +2706,4 @@ int rspamd_lua_geti(lua_State *L, int pos, int i)
return lua_type(L, -1);
}
-#endif \ No newline at end of file
+#endif
diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h
index 5819da8cb..d494f0923 100644
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -539,7 +539,7 @@ enum lua_logger_escape_type {
* @return
*/
gsize lua_logger_out(lua_State *L, int pos, char *outbuf, gsize len,
- enum lua_logger_escape_type esc_type);
+ enum lua_logger_escape_type esc_type);
/**
* Safely checks userdata to match specified class
@@ -632,7 +632,7 @@ struct rspamd_stat_token_s;
* @param L
* @param word
*/
-void rspamd_lua_push_full_word(lua_State *L, struct rspamd_stat_token_s *word);
+void rspamd_lua_push_full_word(lua_State *L, rspamd_word_t *word);
enum rspamd_lua_words_type {
RSPAMD_LUA_WORDS_STEM = 0,
@@ -651,6 +651,9 @@ enum rspamd_lua_words_type {
int rspamd_lua_push_words(lua_State *L, GArray *words,
enum rspamd_lua_words_type how);
+int rspamd_lua_push_words_kvec(lua_State *L, rspamd_words_t *words,
+ enum rspamd_lua_words_type how);
+
/**
* Returns newly allocated name for caller module name
* @param L
diff --git a/src/lua/lua_config.c b/src/lua/lua_config.c
index f52eae44f..7b3a156cd 100644
--- a/src/lua/lua_config.c
+++ b/src/lua/lua_config.c
@@ -24,6 +24,10 @@
#include "utlist.h"
#include <math.h>
+/* Forward declarations for custom tokenizer functions */
+gboolean rspamd_config_load_custom_tokenizers(struct rspamd_config *cfg, GError **err);
+void rspamd_config_unload_custom_tokenizers(struct rspamd_config *cfg);
+
/***
* This module is used to configure rspamd and is normally available as global
* variable named `rspamd_config`. Unlike other modules, it is not necessary to
@@ -862,6 +866,19 @@ LUA_FUNCTION_DEF(config, get_dns_max_requests);
*/
LUA_FUNCTION_DEF(config, get_dns_timeout);
+/***
+ * @method rspamd_config:load_custom_tokenizers()
+ * Loads custom tokenizers from configuration
+ * @return {boolean} true if successful
+ */
+LUA_FUNCTION_DEF(config, load_custom_tokenizers);
+
+/***
+ * @method rspamd_config:unload_custom_tokenizers()
+ * Unloads custom tokenizers and frees memory
+ */
+LUA_FUNCTION_DEF(config, unload_custom_tokenizers);
+
static const struct luaL_reg configlib_m[] = {
LUA_INTERFACE_DEF(config, get_module_opt),
LUA_INTERFACE_DEF(config, get_mempool),
@@ -937,6 +954,8 @@ static const struct luaL_reg configlib_m[] = {
LUA_INTERFACE_DEF(config, get_tld_path),
LUA_INTERFACE_DEF(config, get_dns_max_requests),
LUA_INTERFACE_DEF(config, get_dns_timeout),
+ LUA_INTERFACE_DEF(config, load_custom_tokenizers),
+ LUA_INTERFACE_DEF(config, unload_custom_tokenizers),
{"__tostring", rspamd_lua_class_tostring},
{"__newindex", lua_config_newindex},
{NULL, NULL}};
@@ -4485,11 +4504,14 @@ lua_config_init_subsystem(lua_State *L)
nparts = g_strv_length(parts);
for (i = 0; i < nparts; i++) {
- if (strcmp(parts[i], "filters") == 0) {
+ const char *str = parts[i];
+
+ /* TODO: total shit, rework some day */
+ if (strcmp(str, "filters") == 0) {
rspamd_lua_post_load_config(cfg);
rspamd_init_filters(cfg, false, false);
}
- else if (strcmp(parts[i], "langdet") == 0) {
+ else if (strcmp(str, "langdet") == 0) {
if (!cfg->lang_det) {
cfg->lang_det = rspamd_language_detector_init(cfg);
rspamd_mempool_add_destructor(cfg->cfg_pool,
@@ -4497,10 +4519,10 @@ lua_config_init_subsystem(lua_State *L)
cfg->lang_det);
}
}
- else if (strcmp(parts[i], "stat") == 0) {
+ else if (strcmp(str, "stat") == 0) {
rspamd_stat_init(cfg, NULL);
}
- else if (strcmp(parts[i], "dns") == 0) {
+ else if (strcmp(str, "dns") == 0) {
struct ev_loop *ev_base = lua_check_ev_base(L, 3);
if (ev_base) {
@@ -4514,11 +4536,25 @@ lua_config_init_subsystem(lua_State *L)
return luaL_error(L, "no event base specified");
}
}
- else if (strcmp(parts[i], "symcache") == 0) {
+ else if (strcmp(str, "symcache") == 0) {
rspamd_symcache_init(cfg->cache);
}
+ else if (strcmp(str, "tokenizers") == 0 || strcmp(str, "custom_tokenizers") == 0) {
+ GError *err = NULL;
+ if (!rspamd_config_load_custom_tokenizers(cfg, &err)) {
+ g_strfreev(parts);
+ if (err) {
+ int ret = luaL_error(L, "failed to load custom tokenizers: %s", err->message);
+ g_error_free(err);
+ return ret;
+ }
+ else {
+ return luaL_error(L, "failed to load custom tokenizers");
+ }
+ }
+ }
else {
- int ret = luaL_error(L, "invalid param: %s", parts[i]);
+ int ret = luaL_error(L, "invalid param: %s", str);
g_strfreev(parts);
return ret;
@@ -4772,3 +4808,43 @@ void lua_call_finish_script(struct rspamd_config_cfg_lua_script *sc,
lua_thread_call(thread, 1);
}
+
+static int
+lua_config_load_custom_tokenizers(lua_State *L)
+{
+ LUA_TRACE_POINT;
+ struct rspamd_config *cfg = lua_check_config(L, 1);
+
+ if (cfg != NULL) {
+ GError *err = NULL;
+ gboolean ret = rspamd_config_load_custom_tokenizers(cfg, &err);
+
+ if (!ret && err) {
+ lua_pushboolean(L, FALSE);
+ lua_pushstring(L, err->message);
+ g_error_free(err);
+ return 2;
+ }
+
+ lua_pushboolean(L, ret);
+ return 1;
+ }
+ else {
+ return luaL_error(L, "invalid arguments");
+ }
+}
+
+static int
+lua_config_unload_custom_tokenizers(lua_State *L)
+{
+ LUA_TRACE_POINT;
+ struct rspamd_config *cfg = lua_check_config(L, 1);
+
+ if (cfg != NULL) {
+ rspamd_config_unload_custom_tokenizers(cfg);
+ return 0;
+ }
+ else {
+ return luaL_error(L, "invalid arguments");
+ }
+}
diff --git a/src/lua/lua_cryptobox.c b/src/lua/lua_cryptobox.c
index 721d71256..2c2254920 100644
--- a/src/lua/lua_cryptobox.c
+++ b/src/lua/lua_cryptobox.c
@@ -404,7 +404,7 @@ lua_cryptobox_keypair_load(lua_State *L)
if (lua_type(L, 1) == LUA_TSTRING) {
buf = luaL_checklstring(L, 1, &len);
if (buf != NULL) {
- parser = ucl_parser_new(0);
+ parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (!ucl_parser_add_chunk(parser, buf, len)) {
msg_err("cannot open keypair from data: %s",
diff --git a/src/lua/lua_http.c b/src/lua/lua_http.c
index 7e9e7b1df..731b8b057 100644
--- a/src/lua/lua_http.c
+++ b/src/lua/lua_http.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -29,22 +29,123 @@
* This module hides all complexity: DNS resolving, sessions management, zero-copy
* text transfers and so on under the hood.
* @example
+-- Basic GET request with callback
local rspamd_http = require "rspamd_http"
local function symbol_callback(task)
local function http_callback(err_message, code, body, headers)
task:insert_result('SYMBOL', 1) -- task is available via closure
+
+ if err_message then
+ -- Handle error
+ return
+ end
+
+ -- Process response
+ if code == 200 then
+ -- Process body and headers
+ for name, value in pairs(headers) do
+ -- Headers are lowercase
+ end
+ end
end
- rspamd_http.request({
- task=task,
- url='http://example.com/data',
- body=task:get_content(),
- callback=http_callback,
- headers={Header='Value', OtherHeader='Value'},
- mime_type='text/plain',
- })
- end
+ rspamd_http.request({
+ task=task,
+ url='http://example.com/data',
+ body=task:get_content(),
+ callback=http_callback,
+ headers={Header='Value', OtherHeader='Value', DuplicatedHeader={'Multiple', 'Values'}},
+ mime_type='text/plain',
+ })
+end
+
+-- POST request with JSON body
+local function post_json_example(task)
+ local ucl = require "ucl"
+ local data = {
+ id = task:get_queue_id(),
+ sender = task:get_from()[1].addr
+ }
+
+ local json_data = ucl.to_json(data)
+
+ rspamd_http.request({
+ task = task,
+ url = "http://example.com/api/submit",
+ method = "POST",
+ body = json_data,
+ headers = {['Content-Type'] = 'application/json'},
+ callback = function(err, code, body, headers)
+ if not err and code == 200 then
+ -- Success
+ end
+ end
+ })
+end
+
+-- Synchronous HTTP request (using coroutines)
+local function sync_http_example(task)
+ -- No callback makes this a synchronous call
+ local err, response = rspamd_http.request({
+ task = task,
+ url = "http://example.com/api/data",
+ method = "GET",
+ timeout = 10.0
+ })
+
+ if not err then
+ -- Response is a table with code, content, and headers
+ if response.code == 200 then
+ -- Process response.content
+ return true
+ end
+ end
+ return false
+end
+
+-- Using authentication
+local function auth_example(task)
+ rspamd_http.request({
+ task = task,
+ url = "https://example.com/api/protected",
+ method = "GET",
+ user = "username",
+ password = "secret",
+ callback = function(err, code, body, headers)
+ -- Process authenticated response
+ end
+ })
+end
+
+-- Using HTTPS with SSL options
+local function https_example(task)
+ rspamd_http.request({
+ task = task,
+ url = "https://example.com/api/secure",
+ method = "GET",
+ no_ssl_verify = false, -- Verify SSL (default)
+ callback = function(err, code, body, headers)
+ -- Process secure response
+ end
+ })
+end
+
+-- Using keep-alive and gzip
+local function advanced_example(task)
+ rspamd_http.request({
+ task = task,
+ url = "http://example.com/api/data",
+ method = "POST",
+ body = task:get_content(),
+ gzip = true, -- Compress request body
+ keepalive = true, -- Use keep-alive connection
+ max_size = 1024 * 1024, -- Limit response to 1MB
+ callback = function(err, code, body, headers)
+ -- Process response
+ end
+ })
+end
*/
#define MAX_HEADERS_SIZE 8192
@@ -602,7 +703,7 @@ lua_http_push_headers(lua_State *L, struct rspamd_http_message *msg)
* @param {string} url specifies URL for a request in the standard URI form (e.g. 'http://example.com/path')
* @param {function} callback specifies callback function in format `function (err_message, code, body, headers)` that is called on HTTP request completion. if this parameter is missing, the function performs "pseudo-synchronous" call (see [Synchronous and Asynchronous API overview](/doc/developers/sync_async.html#API-example-http-module)
* @param {task} task if called from symbol handler it is generally a good idea to use the common task objects: event base, DNS resolver and events session
- * @param {table} headers optional headers in form `[name='value', name='value']`
+ * @param {table} headers optional headers in form `[name='value']` or `[name=['value1', 'value2']]` to duplicate a header with multiple values
* @param {string} mime_type MIME type of the HTTP content (for example, `text/html`)
* @param {string/text} body full body content, can be opaque `rspamd{text}` to avoid data copying
* @param {number} timeout floating point request timeout value in seconds (default is 5.0 seconds)
diff --git a/src/lua/lua_logger.c b/src/lua/lua_logger.c
index 8f2aa5be1..04ff81b6d 100644
--- a/src/lua/lua_logger.c
+++ b/src/lua/lua_logger.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -176,8 +176,8 @@ static const struct luaL_reg loggerlib_f[] = {
static gsize
lua_logger_out_type(lua_State *L, int pos, char *outbuf,
- gsize len, struct lua_logger_trace *trace,
- enum lua_logger_escape_type esc_type);
+ gsize len, struct lua_logger_trace *trace,
+ enum lua_logger_escape_type esc_type);
static void
lua_common_log_line(GLogLevelFlags level,
@@ -215,12 +215,12 @@ lua_common_log_line(GLogLevelFlags level,
}
rspamd_common_log_function(NULL,
- level,
- module,
- uid,
- p,
- "%s",
- msg);
+ level,
+ module,
+ uid,
+ p,
+ "%s",
+ msg);
}
/*** Logger interface ***/
@@ -280,19 +280,22 @@ lua_logger_char_safe(int t, unsigned int esc_type)
return true;
}
-/* Could return negative value in case of wrong argument number */
+#define LUA_MAX_ARGS 32
+/* Gracefully handles argument mismatches by substituting missing args and noting extra args */
static glong
lua_logger_log_format_str(lua_State *L, int offset, char *logbuf, gsize remain,
- const char *fmt,
- enum lua_logger_escape_type esc_type)
+ const char *fmt,
+ enum lua_logger_escape_type esc_type)
{
const char *c;
gsize r;
int digit;
-
char *d = logbuf;
unsigned int arg_num, cur_arg = 0, arg_max = lua_gettop(L) - offset;
+ gboolean args_used[LUA_MAX_ARGS];
+ unsigned int used_args_count = 0;
+ memset(args_used, 0, sizeof(args_used));
while (remain > 1 && *fmt) {
if (*fmt == '%') {
++fmt;
@@ -300,12 +303,13 @@ lua_logger_log_format_str(lua_State *L, int offset, char *logbuf, gsize remain,
if (*fmt == 's') {
++fmt;
++cur_arg;
- } else {
+ }
+ else {
arg_num = 0;
while ((digit = g_ascii_digit_value(*fmt)) >= 0) {
++fmt;
arg_num = arg_num * 10 + digit;
- if (arg_num >= 100) {
+ if (arg_num >= LUA_MAX_ARGS) {
/* Avoid ridiculously large numbers */
fmt = c;
break;
@@ -320,11 +324,19 @@ lua_logger_log_format_str(lua_State *L, int offset, char *logbuf, gsize remain,
if (fmt > c) {
if (cur_arg < 1 || cur_arg > arg_max) {
- *d = 0;
- return -((glong) cur_arg + 1); /* wrong argument number */
+ /* Missing argument - substitute placeholder */
+ r = rspamd_snprintf(d, remain, "<MISSING ARGUMENT>");
+ }
+ else {
+ /* Valid argument - output it */
+ r = lua_logger_out(L, offset + cur_arg, d, remain, esc_type);
+ /* Track which arguments are used */
+ if (cur_arg <= LUA_MAX_ARGS && !args_used[cur_arg - 1]) {
+ args_used[cur_arg - 1] = TRUE;
+ used_args_count++;
+ }
}
- r = lua_logger_out(L, offset + cur_arg, d, remain, esc_type);
g_assert(r < remain);
remain -= r;
d += r;
@@ -339,11 +351,21 @@ lua_logger_log_format_str(lua_State *L, int offset, char *logbuf, gsize remain,
--remain;
}
+ /* Check for extra arguments and append warning if any */
+ if (used_args_count > 0 && used_args_count < arg_max && remain > 1) {
+ unsigned int extra_args = arg_max - used_args_count;
+ r = rspamd_snprintf(d, remain, " <EXTRA %d ARGUMENTS>", (int) extra_args);
+ remain -= r;
+ d += r;
+ }
+
*d = 0;
return d - logbuf;
}
+#undef LUA_MAX_ARGS
+
static gsize
lua_logger_out_str(lua_State *L, int pos,
char *outbuf, gsize len,
@@ -486,12 +508,12 @@ lua_logger_out_userdata(lua_State *L, int pos, char *outbuf, gsize len)
return r;
}
-#define MOVE_BUF(d, remain, r) \
- (d) += (r); \
- (remain) -= (r); \
- if ((remain) <= 1) { \
- lua_settop(L, top); \
- goto table_oob; \
+#define MOVE_BUF(d, remain, r) \
+ (d) += (r); \
+ (remain) -= (r); \
+ if ((remain) <= 1) { \
+ lua_settop(L, top); \
+ goto table_oob; \
}
static gsize
@@ -545,9 +567,10 @@ lua_logger_out_table(lua_State *L, int pos, char *outbuf, gsize len,
if (first) {
first = FALSE;
- str = "[%d] = ";
- } else {
- str = ", [%d] = ";
+ str = "[%d] = ";
+ }
+ else {
+ str = ", [%d] = ";
}
r = rspamd_snprintf(d, remain, str, i);
MOVE_BUF(d, remain, r);
@@ -579,14 +602,12 @@ lua_logger_out_table(lua_State *L, int pos, char *outbuf, gsize len,
if (first) {
first = FALSE;
str = "[%2] = %1";
- } else {
+ }
+ else {
str = ", [%2] = %1";
}
r = lua_logger_log_format_str(L, top + 1, d, remain, str, esc_type);
- if (r < 0) {
- /* should not happen */
- goto table_oob;
- }
+ /* lua_logger_log_format_str now handles errors gracefully */
MOVE_BUF(d, remain, r);
/* Remove key */
@@ -606,9 +627,9 @@ table_oob:
static gsize
lua_logger_out_type(lua_State *L, int pos,
- char *outbuf, gsize len,
- struct lua_logger_trace *trace,
- enum lua_logger_escape_type esc_type)
+ char *outbuf, gsize len,
+ struct lua_logger_trace *trace,
+ enum lua_logger_escape_type esc_type)
{
if (len == 0) {
return 0;
@@ -640,8 +661,8 @@ lua_logger_out_type(lua_State *L, int pos,
}
gsize lua_logger_out(lua_State *L, int pos,
- char *outbuf, gsize len,
- enum lua_logger_escape_type esc_type)
+ char *outbuf, gsize len,
+ enum lua_logger_escape_type esc_type)
{
struct lua_logger_trace tr;
memset(&tr, 0, sizeof(tr));
@@ -747,11 +768,8 @@ lua_logger_log_format(lua_State *L, int fmt_pos, gboolean is_string,
return FALSE;
}
- glong ret = lua_logger_log_format_str(L, fmt_pos, logbuf, remain, fmt, is_string ? LUA_ESCAPE_UNPRINTABLE : LUA_ESCAPE_LOG);
- if (ret < 0) {
- msg_err("wrong argument number: %ud", -((int) ret + 1));
- return FALSE;
- }
+ /* lua_logger_log_format_str now handles argument mismatches gracefully */
+ lua_logger_log_format_str(L, fmt_pos, logbuf, remain, fmt, is_string ? LUA_ESCAPE_UNPRINTABLE : LUA_ESCAPE_LOG);
return TRUE;
}
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c
index 07dba9c93..982b10d90 100644
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -901,7 +901,7 @@ lua_textpart_get_words_count(lua_State *L)
return 1;
}
- if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) {
+ if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) {
lua_pushinteger(L, 0);
}
else {
@@ -943,7 +943,7 @@ lua_textpart_get_words(lua_State *L)
return luaL_error(L, "invalid arguments");
}
- if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) {
+ if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) {
lua_createtable(L, 0, 0);
}
else {
@@ -957,7 +957,7 @@ lua_textpart_get_words(lua_State *L)
}
}
- return rspamd_lua_push_words(L, part->utf_words, how);
+ return rspamd_lua_push_words_kvec(L, &part->utf_words, how);
}
return 1;
@@ -976,7 +976,7 @@ lua_textpart_filter_words(lua_State *L)
return luaL_error(L, "invalid arguments");
}
- if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) {
+ if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) {
lua_createtable(L, 0, 0);
}
else {
@@ -998,9 +998,8 @@ lua_textpart_filter_words(lua_State *L)
lua_createtable(L, 8, 0);
- for (i = 0, cnt = 1; i < part->utf_words->len; i++) {
- rspamd_stat_token_t *w = &g_array_index(part->utf_words,
- rspamd_stat_token_t, i);
+ for (i = 0, cnt = 1; i < kv_size(part->utf_words); i++) {
+ rspamd_word_t *w = &kv_A(part->utf_words, i);
switch (how) {
case RSPAMD_LUA_WORDS_STEM:
@@ -1194,13 +1193,13 @@ struct lua_shingle_filter_cbdata {
rspamd_mempool_t *pool;
};
-#define STORE_TOKEN(i, t) \
- do { \
- if ((i) < part->utf_words->len) { \
- word = &g_array_index(part->utf_words, rspamd_stat_token_t, (i)); \
- sd->t.begin = word->stemmed.begin; \
- sd->t.len = word->stemmed.len; \
- } \
+#define STORE_TOKEN(i, t) \
+ do { \
+ if ((i) < kv_size(part->utf_words)) { \
+ word = &kv_A(part->utf_words, (i)); \
+ sd->t.begin = word->stemmed.begin; \
+ sd->t.len = word->stemmed.len; \
+ } \
} while (0)
static uint64_t
@@ -1210,7 +1209,7 @@ lua_shingles_filter(uint64_t *input, gsize count,
uint64_t minimal = G_MAXUINT64;
gsize i, min_idx = 0;
struct lua_shingle_data *sd;
- rspamd_stat_token_t *word;
+ rspamd_word_t *word;
struct lua_shingle_filter_cbdata *cbd = (struct lua_shingle_filter_cbdata *) ud;
struct rspamd_mime_text_part *part;
@@ -1248,7 +1247,7 @@ lua_textpart_get_fuzzy_hashes(lua_State *L)
unsigned int i;
struct lua_shingle_data *sd;
rspamd_cryptobox_hash_state_t st;
- rspamd_stat_token_t *word;
+ rspamd_word_t *word;
struct lua_shingle_filter_cbdata cbd;
@@ -1256,7 +1255,7 @@ lua_textpart_get_fuzzy_hashes(lua_State *L)
return luaL_error(L, "invalid arguments");
}
- if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) {
+ if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) {
lua_pushnil(L);
lua_pushnil(L);
}
@@ -1269,8 +1268,8 @@ lua_textpart_get_fuzzy_hashes(lua_State *L)
/* Calculate direct hash */
rspamd_cryptobox_hash_init(&st, key, rspamd_cryptobox_HASHKEYBYTES);
- for (i = 0; i < part->utf_words->len; i++) {
- word = &g_array_index(part->utf_words, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(part->utf_words); i++) {
+ word = &kv_A(part->utf_words, i);
rspamd_cryptobox_hash_update(&st,
word->stemmed.begin, word->stemmed.len);
}
@@ -1283,7 +1282,7 @@ lua_textpart_get_fuzzy_hashes(lua_State *L)
cbd.pool = pool;
cbd.part = part;
- sgl = rspamd_shingles_from_text(part->utf_words, key,
+ sgl = rspamd_shingles_from_text(&part->utf_words, key,
pool, lua_shingles_filter, &cbd, RSPAMD_SHINGLES_MUMHASH);
if (sgl == NULL) {
diff --git a/src/lua/lua_parsers.c b/src/lua/lua_parsers.c
index f77b36952..39e1b0317 100644
--- a/src/lua/lua_parsers.c
+++ b/src/lua/lua_parsers.c
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2020 Vsevolod Stakhov
+/*
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -108,8 +108,8 @@ int lua_parsers_tokenize_text(lua_State *L)
struct rspamd_lua_text *t;
struct rspamd_process_exception *ex;
UText utxt = UTEXT_INITIALIZER;
- GArray *res;
- rspamd_stat_token_t *w;
+ rspamd_words_t *res;
+ rspamd_word_t *w;
if (lua_type(L, 1) == LUA_TSTRING) {
in = luaL_checklstring(L, 1, &len);
@@ -175,13 +175,15 @@ int lua_parsers_tokenize_text(lua_State *L)
lua_pushnil(L);
}
else {
- lua_createtable(L, res->len, 0);
+ lua_createtable(L, kv_size(*res), 0);
- for (i = 0; i < res->len; i++) {
- w = &g_array_index(res, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(*res); i++) {
+ w = &kv_A(*res, i);
lua_pushlstring(L, w->original.begin, w->original.len);
lua_rawseti(L, -2, i + 1);
}
+ kv_destroy(*res);
+ g_free(res);
}
cur = exceptions;
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 97f9c496e..0b1473b61 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -6943,7 +6943,7 @@ lua_task_get_meta_words(lua_State *L)
return luaL_error(L, "invalid arguments");
}
- if (task->meta_words == NULL) {
+ if (!task->meta_words.a) {
lua_createtable(L, 0, 0);
}
else {
@@ -6967,7 +6967,7 @@ lua_task_get_meta_words(lua_State *L)
}
}
- return rspamd_lua_push_words(L, task->meta_words, how);
+ return rspamd_lua_push_words_kvec(L, &task->meta_words, how);
}
return 1;
@@ -7039,6 +7039,76 @@ lua_lookup_words_array(lua_State *L,
return nmatched;
}
+static unsigned int
+lua_lookup_words_kvec(lua_State *L,
+ int cbpos,
+ struct rspamd_task *task,
+ struct rspamd_lua_map *map,
+ rspamd_words_t *words)
+{
+ rspamd_word_t *tok;
+ unsigned int i, nmatched = 0;
+ int err_idx;
+ gboolean matched;
+ const char *key;
+ gsize keylen;
+
+ if (!words || !words->a) {
+ return 0;
+ }
+
+ for (i = 0; i < kv_size(*words); i++) {
+ tok = &kv_A(*words, i);
+
+ matched = FALSE;
+
+ if (tok->normalized.len == 0) {
+ continue;
+ }
+
+ key = tok->normalized.begin;
+ keylen = tok->normalized.len;
+
+ switch (map->type) {
+ case RSPAMD_LUA_MAP_SET:
+ case RSPAMD_LUA_MAP_HASH:
+ /* We know that tok->normalized is zero terminated in fact */
+ if (rspamd_match_hash_map(map->data.hash, key, keylen)) {
+ matched = TRUE;
+ }
+ break;
+ case RSPAMD_LUA_MAP_REGEXP:
+ case RSPAMD_LUA_MAP_REGEXP_MULTIPLE:
+ if (rspamd_match_regexp_map_single(map->data.re_map, key,
+ keylen)) {
+ matched = TRUE;
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ break;
+ }
+
+ if (matched) {
+ nmatched++;
+
+ lua_pushcfunction(L, &rspamd_lua_traceback);
+ err_idx = lua_gettop(L);
+ lua_pushvalue(L, cbpos); /* Function */
+ rspamd_lua_push_full_word(L, tok);
+
+ if (lua_pcall(L, 1, 0, err_idx) != 0) {
+ msg_err_task("cannot call callback function for lookup words: %s",
+ lua_tostring(L, -1));
+ }
+
+ lua_settop(L, err_idx - 1);
+ }
+ }
+
+ return nmatched;
+}
+
static int
lua_task_lookup_words(lua_State *L)
{
@@ -7062,13 +7132,13 @@ lua_task_lookup_words(lua_State *L)
PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, tp)
{
- if (tp->utf_words) {
- matches += lua_lookup_words_array(L, 3, task, map, tp->utf_words);
+ if (tp->utf_words.a) {
+ matches += lua_lookup_words_kvec(L, 3, task, map, &tp->utf_words);
}
}
- if (task->meta_words) {
- matches += lua_lookup_words_array(L, 3, task, map, task->meta_words);
+ if (task->meta_words.a) {
+ matches += lua_lookup_words_kvec(L, 3, task, map, &task->meta_words);
}
lua_pushinteger(L, matches);
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 9fe862757..f2e9b8fa9 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -23,12 +23,21 @@
#include "lua_parsers.h"
-#ifdef WITH_LUA_REPL
-#include "replxx.h"
-#endif
+#include "replxx.h"
#include <math.h>
#include <glob.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+#include <sys/sysctl.h>
+#ifdef __FreeBSD__
+#include <sys/user.h>
+#endif
+#endif
+#ifdef __APPLE__
+#include <mach/mach.h>
+#endif
#include "unicode/uspoof.h"
#include "unicode/uscript.h"
@@ -629,6 +638,27 @@ LUA_FUNCTION_DEF(util, caseless_hash_fast);
LUA_FUNCTION_DEF(util, get_hostname);
/***
+ * @function util.get_uptime()
+ * Returns system uptime in seconds
+ * @return {number} uptime in seconds
+ */
+LUA_FUNCTION_DEF(util, get_uptime);
+
+/***
+ * @function util.get_pid()
+ * Returns current process PID
+ * @return {number} process ID
+ */
+LUA_FUNCTION_DEF(util, get_pid);
+
+/***
+ * @function util.get_memory_usage()
+ * Returns memory usage information for current process
+ * @return {table} memory usage info with 'rss' and 'vsize' fields in bytes
+ */
+LUA_FUNCTION_DEF(util, get_memory_usage);
+
+/***
* @function util.parse_content_type(ct_string, mempool)
* Parses content-type string to a table:
* - `type`
@@ -730,6 +760,9 @@ static const struct luaL_reg utillib_f[] = {
LUA_INTERFACE_DEF(util, umask),
LUA_INTERFACE_DEF(util, isatty),
LUA_INTERFACE_DEF(util, get_hostname),
+ LUA_INTERFACE_DEF(util, get_uptime),
+ LUA_INTERFACE_DEF(util, get_pid),
+ LUA_INTERFACE_DEF(util, get_memory_usage),
LUA_INTERFACE_DEF(util, parse_content_type),
LUA_INTERFACE_DEF(util, mime_header_encode),
LUA_INTERFACE_DEF(util, pack),
@@ -2416,6 +2449,107 @@ lua_util_get_hostname(lua_State *L)
}
static int
+lua_util_get_uptime(lua_State *L)
+{
+ LUA_TRACE_POINT;
+ double uptime = 0.0;
+
+#ifdef __linux__
+ FILE *f = fopen("/proc/uptime", "r");
+ if (f) {
+ if (fscanf(f, "%lf", &uptime) != 1) {
+ uptime = 0.0;
+ }
+ fclose(f);
+ }
+#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+ struct timeval boottime;
+ size_t len = sizeof(boottime);
+ int mib[2] = {CTL_KERN, KERN_BOOTTIME};
+
+ if (sysctl(mib, 2, &boottime, &len, NULL, 0) == 0) {
+ struct timeval now;
+ gettimeofday(&now, NULL);
+ uptime = (now.tv_sec - boottime.tv_sec) +
+ (now.tv_usec - boottime.tv_usec) / 1000000.0;
+ }
+#endif
+
+ lua_pushnumber(L, uptime);
+ return 1;
+}
+
+static int
+lua_util_get_pid(lua_State *L)
+{
+ LUA_TRACE_POINT;
+ lua_pushinteger(L, getpid());
+ return 1;
+}
+
+static int
+lua_util_get_memory_usage(lua_State *L)
+{
+ LUA_TRACE_POINT;
+ lua_createtable(L, 0, 2);
+
+#ifdef __linux__
+ FILE *f = fopen("/proc/self/status", "r");
+ if (f) {
+ char line[256];
+ long rss = 0, vsize = 0;
+
+ while (fgets(line, sizeof(line), f)) {
+ if (sscanf(line, "VmRSS: %ld kB", &rss) == 1) {
+ rss *= 1024; /* Convert to bytes */
+ }
+ else if (sscanf(line, "VmSize: %ld kB", &vsize) == 1) {
+ vsize *= 1024; /* Convert to bytes */
+ }
+ }
+ fclose(f);
+
+ lua_pushstring(L, "rss");
+ lua_pushinteger(L, rss);
+ lua_settable(L, -3);
+
+ lua_pushstring(L, "vsize");
+ lua_pushinteger(L, vsize);
+ lua_settable(L, -3);
+ }
+#elif defined(__APPLE__)
+ struct task_basic_info info;
+ mach_msg_type_number_t count = TASK_BASIC_INFO_COUNT;
+
+ if (task_info(mach_task_self(), TASK_BASIC_INFO, (task_info_t) &info, &count) == KERN_SUCCESS) {
+ lua_pushstring(L, "rss");
+ lua_pushinteger(L, info.resident_size);
+ lua_settable(L, -3);
+
+ lua_pushstring(L, "vsize");
+ lua_pushinteger(L, info.virtual_size);
+ lua_settable(L, -3);
+ }
+#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+ struct kinfo_proc kp;
+ size_t len = sizeof(kp);
+ int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PID, getpid()};
+
+ if (sysctl(mib, 4, &kp, &len, NULL, 0) == 0) {
+ lua_pushstring(L, "rss");
+ lua_pushinteger(L, kp.ki_rssize * getpagesize());
+ lua_settable(L, -3);
+
+ lua_pushstring(L, "vsize");
+ lua_pushinteger(L, kp.ki_size);
+ lua_settable(L, -3);
+ }
+#endif
+
+ return 1;
+}
+
+static int
lua_util_parse_content_type(lua_State *L)
{
return lua_parsers_parse_content_type(L);
@@ -2510,7 +2644,7 @@ lua_util_readline(lua_State *L)
if (lua_type(L, 1) == LUA_TSTRING) {
prompt = lua_tostring(L, 1);
}
-#ifdef WITH_LUA_REPL
+
static Replxx *rx_instance = NULL;
if (rx_instance == NULL) {
@@ -2527,26 +2661,6 @@ lua_util_readline(lua_State *L)
else {
lua_pushnil(L);
}
-#else
- size_t linecap = 0;
- ssize_t linelen;
-
- fprintf(stdout, "%s ", prompt);
-
- linelen = getline(&input, &linecap, stdin);
-
- if (linelen > 0) {
- if (input[linelen - 1] == '\n') {
- linelen--;
- }
-
- lua_pushlstring(L, input, linelen);
- free(input);
- }
- else {
- lua_pushnil(L);
- }
-#endif
return 1;
}
@@ -3721,4 +3835,4 @@ lua_ev_base_add_timer(lua_State *L)
ev_timer_start(ev_base, &cbdata->ev);
return 0;
-} \ No newline at end of file
+}
diff --git a/src/plugins/chartable.cxx b/src/plugins/chartable.cxx
index a5c7cb899..c82748862 100644
--- a/src/plugins/chartable.cxx
+++ b/src/plugins/chartable.cxx
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -1696,7 +1696,7 @@ rspamd_can_alias_latin(int ch)
static double
rspamd_chartable_process_word_utf(struct rspamd_task *task,
- rspamd_stat_token_t *w,
+ rspamd_word_t *w,
gboolean is_url,
unsigned int *ncap,
struct chartable_ctx *chartable_module_ctx,
@@ -1842,7 +1842,7 @@ rspamd_chartable_process_word_utf(struct rspamd_task *task,
static double
rspamd_chartable_process_word_ascii(struct rspamd_task *task,
- rspamd_stat_token_t *w,
+ rspamd_word_t *w,
gboolean is_url,
struct chartable_ctx *chartable_module_ctx)
{
@@ -1931,17 +1931,17 @@ rspamd_chartable_process_part(struct rspamd_task *task,
struct chartable_ctx *chartable_module_ctx,
gboolean ignore_diacritics)
{
- rspamd_stat_token_t *w;
+ rspamd_word_t *w;
unsigned int i, ncap = 0;
double cur_score = 0.0;
- if (part == nullptr || part->utf_words == nullptr ||
- part->utf_words->len == 0 || part->nwords == 0) {
+ if (part == nullptr || part->utf_words.a == nullptr ||
+ kv_size(part->utf_words) == 0 || part->nwords == 0) {
return FALSE;
}
- for (i = 0; i < part->utf_words->len; i++) {
- w = &g_array_index(part->utf_words, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(part->utf_words); i++) {
+ w = &kv_A(part->utf_words, i);
if ((w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
@@ -2015,13 +2015,13 @@ chartable_symbol_callback(struct rspamd_task *task,
ignore_diacritics = TRUE;
}
- if (task->meta_words != nullptr && task->meta_words->len > 0) {
- rspamd_stat_token_t *w;
+ if (task->meta_words.a && kv_size(task->meta_words) > 0) {
+ rspamd_word_t *w;
double cur_score = 0;
- gsize arlen = task->meta_words->len;
+ gsize arlen = kv_size(task->meta_words);
for (i = 0; i < arlen; i++) {
- w = &g_array_index(task->meta_words, rspamd_stat_token_t, i);
+ w = &kv_A(task->meta_words, i);
cur_score += rspamd_chartable_process_word_utf(task, w, FALSE,
nullptr, chartable_module_ctx, ignore_diacritics);
}
diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c
index 85ea3b00c..7dd5162ac 100644
--- a/src/plugins/fuzzy_check.c
+++ b/src/plugins/fuzzy_check.c
@@ -1431,10 +1431,10 @@ fuzzy_io_fin(void *ud)
close(session->fd);
}
-static GArray *
+static rspamd_words_t *
fuzzy_preprocess_words(struct rspamd_mime_text_part *part, rspamd_mempool_t *pool)
{
- return part->utf_words;
+ return &part->utf_words;
}
static void
@@ -1861,7 +1861,7 @@ fuzzy_cmd_from_text_part(struct rspamd_task *task,
unsigned int i;
rspamd_cryptobox_hash_state_t st;
rspamd_stat_token_t *word;
- GArray *words;
+ rspamd_words_t *words;
struct fuzzy_cmd_io *io;
unsigned int additional_length;
unsigned char *additional_data;
@@ -1970,10 +1970,10 @@ fuzzy_cmd_from_text_part(struct rspamd_task *task,
rspamd_cryptobox_hash_init(&st, rule->hash_key->str, rule->hash_key->len);
words = fuzzy_preprocess_words(part, task->task_pool);
- for (i = 0; i < words->len; i++) {
- word = &g_array_index(words, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(*words); i++) {
+ word = &kv_A(*words, i);
- if (!((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0)) {
+ if (!((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0)) {
rspamd_cryptobox_hash_update(&st, word->stemmed.begin,
word->stemmed.len);
}
@@ -2684,7 +2684,7 @@ fuzzy_insert_metric_results(struct rspamd_task *task, struct fuzzy_rule *rule,
if (task->message) {
PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, tp)
{
- if (!IS_TEXT_PART_EMPTY(tp) && tp->utf_words != NULL && tp->utf_words->len > 0) {
+ if (!IS_TEXT_PART_EMPTY(tp) && kv_size(tp->utf_words) > 0) {
seen_text_part = TRUE;
if (tp->utf_stripped_text.magic == UTEXT_MAGIC) {
diff --git a/src/plugins/lua/arc.lua b/src/plugins/lua/arc.lua
index 07fb5c63f..45da1f5a2 100644
--- a/src/plugins/lua/arc.lua
+++ b/src/plugins/lua/arc.lua
@@ -695,11 +695,11 @@ local function do_sign(task, sign_params)
sign_params.pubkey = results[1]
sign_params.strict_pubkey_check = not settings.allow_pubkey_mismatch
elseif not settings.allow_pubkey_mismatch then
- rspamd_logger.errx('public key for domain %s/%s is not found: %s, skip signing',
+ rspamd_logger.errx(task, 'public key for domain %s/%s is not found: %s, skip signing',
sign_params.domain, sign_params.selector, err)
return
else
- rspamd_logger.infox('public key for domain %s/%s is not found: %s',
+ rspamd_logger.infox(task, 'public key for domain %s/%s is not found: %s',
sign_params.domain, sign_params.selector, err)
end
diff --git a/src/plugins/lua/fuzzy_collect.lua b/src/plugins/lua/fuzzy_collect.lua
index 132ace90c..060cc2fc2 100644
--- a/src/plugins/lua/fuzzy_collect.lua
+++ b/src/plugins/lua/fuzzy_collect.lua
@@ -34,7 +34,7 @@ local settings = {
local function send_data_mirror(m, cfg, ev_base, body)
local function store_callback(err, _, _, _)
if err then
- rspamd_logger.errx(cfg, 'cannot save data on %(%s): %s', m.server, m.name, err)
+ rspamd_logger.errx(cfg, 'cannot save data on %s(%s): %s', m.server, m.name, err)
else
rspamd_logger.infox(cfg, 'saved data on %s(%s)', m.server, m.name)
end
diff --git a/src/plugins/lua/gpt.lua b/src/plugins/lua/gpt.lua
index 5d1cf5e06..5776791a1 100644
--- a/src/plugins/lua/gpt.lua
+++ b/src/plugins/lua/gpt.lua
@@ -20,9 +20,9 @@ local E = {}
if confighelp then
rspamd_config:add_example(nil, 'gpt',
- "Performs postfiltering using GPT model",
- [[
-gpt {
+ "Performs postfiltering using GPT model",
+ [[
+ gpt {
# Supported types: openai, ollama
type = "openai";
# Your key to access the API
@@ -53,7 +53,7 @@ gpt {
reason_header = "X-GPT-Reason";
# Use JSON format for response
json = false;
-}
+ }
]])
return
end
@@ -162,7 +162,7 @@ local function default_condition(task)
end
end
lua_util.debugm(N, task, 'symbol %s has weight %s, but required %s', s,
- sym.weight, required_weight)
+ sym.weight, required_weight)
else
return false, 'skip as "' .. s .. '" is found'
end
@@ -182,7 +182,7 @@ local function default_condition(task)
end
end
lua_util.debugm(N, task, 'symbol %s has weight %s, but required %s', s,
- sym.weight, required_weight)
+ sym.weight, required_weight)
end
else
return false, 'skip as "' .. s .. '" is not found'
@@ -301,7 +301,7 @@ local function default_openai_json_conversion(task, input)
elseif reply.probability == "low" then
spam_score = 0.1
else
- rspamd_logger.infox("cannot convert to spam probability: %s", reply.probability)
+ rspamd_logger.infox(task, "cannot convert to spam probability: %s", reply.probability)
end
end
@@ -355,14 +355,27 @@ local function default_openai_plain_conversion(task, input)
local reason = clean_reply_line(lines[2])
local categories = lua_util.str_split(clean_reply_line(lines[3]), ',')
+ if type(reply.usage) == 'table' then
+ rspamd_logger.infox(task, 'usage: %s tokens', reply.usage.total_tokens)
+ end
+
if spam_score then
return spam_score, reason, categories
end
- rspamd_logger.errx(task, 'cannot parse plain gpt reply: %s (all: %s)', lines[1])
+ rspamd_logger.errx(task, 'cannot parse plain gpt reply: %s (all: %s)', lines[1], first_message)
return
end
+-- Helper function to remove <think>...</think> and trim leading newlines
+local function clean_gpt_response(text)
+ -- Remove <think>...</think> including multiline
+ text = text:gsub("<think>.-</think>", "")
+ -- Trim leading whitespace and newlines
+ text = text:gsub("^%s*\n*", "")
+ return text
+end
+
local function default_ollama_plain_conversion(task, input)
local parser = ucl.parser()
local res, err = parser:parse_string(input)
@@ -387,6 +400,10 @@ local function default_ollama_plain_conversion(task, input)
rspamd_logger.errx(task, 'no content in the first message')
return
end
+
+ -- Clean message
+ first_message = clean_gpt_response(first_message)
+
local lines = lua_util.str_split(first_message, '\n')
local first_line = clean_reply_line(lines[1])
local spam_score = tonumber(first_line)
@@ -397,7 +414,7 @@ local function default_ollama_plain_conversion(task, input)
return spam_score, reason, categories
end
- rspamd_logger.errx(task, 'cannot parse plain gpt reply: %s', lines[1])
+ rspamd_logger.errx(task, 'cannot parse plain gpt reply: %s (all: %s)', lines[1], first_message)
return
end
@@ -449,7 +466,7 @@ local function default_ollama_json_conversion(task, input)
elseif reply.probability == "low" then
spam_score = 0.1
else
- rspamd_logger.infox("cannot convert to spam probability: %s", reply.probability)
+ rspamd_logger.infox(task, "cannot convert to spam probability: %s", reply.probability)
end
end
@@ -477,7 +494,7 @@ local function redis_cache_key(sel_part)
env_digest = digest:hex():sub(1, 4)
end
return string.format('%s_%s', env_digest,
- sel_part:get_mimepart():get_digest():sub(1, 24))
+ sel_part:get_mimepart():get_digest():sub(1, 24))
end
local function process_categories(task, categories)
@@ -514,9 +531,9 @@ local function insert_results(task, result, sel_part)
end
end
if result.reason and settings.reason_header then
- lua_mime.modify_headers(task,
- { add = { [settings.reason_header] = { value = tostring(result.reason), order = 1 } } })
- end
+ lua_mime.modify_headers(task,
+ { add = { [settings.reason_header] = { value = tostring(result.reason), order = 1 } } })
+ end
if cache_context then
lua_cache.cache_set(task, redis_cache_key(sel_part), result, cache_context)
@@ -540,12 +557,12 @@ local function check_consensus_and_insert_results(task, results, sel_part)
nspam = nspam + 1
max_spam_prob = math.max(max_spam_prob, result.probability)
lua_util.debugm(N, task, "model: %s; spam: %s; reason: '%s'",
- result.model, result.probability, result.reason)
+ result.model, result.probability, result.reason)
else
nham = nham + 1
max_ham_prob = math.min(max_ham_prob, result.probability)
lua_util.debugm(N, task, "model: %s; ham: %s; reason: '%s'",
- result.model, result.probability, result.reason)
+ result.model, result.probability, result.reason)
end
if result.reason then
@@ -559,23 +576,22 @@ local function check_consensus_and_insert_results(task, results, sel_part)
if nspam > nham and max_spam_prob > 0.75 then
insert_results(task, {
- probability = max_spam_prob,
- reason = reason.reason,
- categories = reason.categories,
- },
- sel_part)
+ probability = max_spam_prob,
+ reason = reason.reason,
+ categories = reason.categories,
+ },
+ sel_part)
elseif nham > nspam and max_ham_prob < 0.25 then
insert_results(task, {
- probability = max_ham_prob,
- reason = reason.reason,
- categories = reason.categories,
- },
- sel_part)
+ probability = max_ham_prob,
+ reason = reason.reason,
+ categories = reason.categories,
+ },
+ sel_part)
else
-- No consensus
lua_util.debugm(N, task, "no consensus")
end
-
end
local function get_meta_llm_content(task)
@@ -674,7 +690,7 @@ local function openai_check(task, content, sel_part)
},
{
role = 'user',
- content = 'Subject: ' .. task:get_subject() or '',
+ content = 'Subject: ' .. (task:get_subject() or ''),
},
{
role = 'user',
@@ -726,7 +742,6 @@ local function openai_check(task, content, sel_part)
if not rspamd_http.request(http_params) then
results[idx].checked = true
end
-
end
end
diff --git a/src/plugins/lua/hfilter.lua b/src/plugins/lua/hfilter.lua
index a783565ab..32102e4f8 100644
--- a/src/plugins/lua/hfilter.lua
+++ b/src/plugins/lua/hfilter.lua
@@ -131,6 +131,7 @@ local checks_hellohost = [[
/modem[.-][0-9]/i 5
/[0-9][.-]?dhcp/i 5
/wifi[.-][0-9]/i 5
+/[.-]vps[.-]/i 1
]]
local checks_hellohost_map
diff --git a/src/plugins/lua/history_redis.lua b/src/plugins/lua/history_redis.lua
index a3fdb0ec4..44eb40ad9 100644
--- a/src/plugins/lua/history_redis.lua
+++ b/src/plugins/lua/history_redis.lua
@@ -138,7 +138,7 @@ end
local function history_save(task)
local function redis_llen_cb(err, _)
if err then
- rspamd_logger.errx(task, 'got error %s when writing history row: %s',
+ rspamd_logger.errx(task, 'got error %s when writing history row',
err)
end
end
@@ -188,7 +188,7 @@ local function handle_history_request(task, conn, from, to, reset)
if reset then
local function redis_ltrim_cb(err, _)
if err then
- rspamd_logger.errx(task, 'got error %s when resetting history: %s',
+ rspamd_logger.errx(task, 'got error %s when resetting history',
err)
conn:send_error(504, '{"error": "' .. err .. '"}')
else
@@ -258,7 +258,7 @@ local function handle_history_request(task, conn, from, to, reset)
(rspamd_util:get_ticks() - t1) * 1000.0)
collectgarbage()
else
- rspamd_logger.errx(task, 'got error %s when getting history: %s',
+ rspamd_logger.errx(task, 'got error %s when getting history',
err)
conn:send_error(504, '{"error": "' .. err .. '"}')
end
diff --git a/src/plugins/lua/known_senders.lua b/src/plugins/lua/known_senders.lua
index 5cb2ddcf5..0cbf3cdcf 100644
--- a/src/plugins/lua/known_senders.lua
+++ b/src/plugins/lua/known_senders.lua
@@ -106,21 +106,26 @@ local function configure_scripts(_, _, _)
-- script checks if given recipients are in the local replies set of the sender
local redis_zscore_script = [[
local replies_recipients_addrs = ARGV
- if replies_recipients_addrs then
+ if replies_recipients_addrs and #replies_recipients_addrs > 0 then
+ local found = false
for _, rcpt in ipairs(replies_recipients_addrs) do
local score = redis.call('ZSCORE', KEYS[1], rcpt)
- -- check if score is nil (for some reason redis script does not see if score is a nil value)
- if type(score) == 'boolean' then
- score = nil
- -- 0 is stand for failure code
- return 0
+ if score then
+ -- If we found at least one recipient, consider it a match
+ found = true
+ break
end
end
- -- first number in return statement is stands for the success/failure code
- -- where success code is 1 and failure code is 0
- return 1
+
+ if found then
+ -- Success code is 1
+ return 1
+ else
+ -- Failure code is 0
+ return 0
+ end
else
- -- 0 is a failure code
+ -- No recipients to check, failure code is 0
return 0
end
]]
@@ -259,7 +264,13 @@ local function verify_local_replies_set(task)
return nil
end
- local replies_recipients = task:get_recipients('mime') or E
+ local replies_recipients = task:get_recipients('smtp') or E
+
+ -- If no recipients, don't proceed
+ if #replies_recipients == 0 then
+ lua_util.debugm(N, task, 'No recipients to verify')
+ return nil
+ end
local replies_sender_string = lua_util.maybe_obfuscate_string(tostring(replies_sender), settings,
settings.sender_prefix)
@@ -268,13 +279,16 @@ local function verify_local_replies_set(task)
local function redis_zscore_script_cb(err, data)
if err ~= nil then
rspamd_logger.errx(task, 'Could not verify %s local replies set %s', replies_sender_key, err)
- end
- if data ~= 1 then
- lua_util.debugm(N, task, 'Recipients were not verified')
return
end
- lua_util.debugm(N, task, 'Recipients were verified')
- task:insert_result(settings.symbol_check_mail_local, 1.0, replies_sender_key)
+
+ -- We need to ensure we're properly checking the result
+ if data == 1 then
+ lua_util.debugm(N, task, 'Recipients were verified')
+ task:insert_result(settings.symbol_check_mail_local, 1.0, replies_sender_key)
+ else
+ lua_util.debugm(N, task, 'Recipients were not verified, data=%s', data)
+ end
end
local replies_recipients_addrs = {}
@@ -284,12 +298,24 @@ local function verify_local_replies_set(task)
table.insert(replies_recipients_addrs, replies_recipients[i].addr)
end
- lua_util.debugm(N, task, 'Making redis request to local replies set')
- lua_redis.exec_redis_script(zscore_script_id,
+ -- Only proceed if we have recipients to check
+ if #replies_recipients_addrs == 0 then
+ lua_util.debugm(N, task, 'No recipient addresses to verify')
+ return nil
+ end
+
+ lua_util.debugm(N, task, 'Making redis request to local replies set with key %s and recipients %s',
+ replies_sender_key, table.concat(replies_recipients_addrs, ", "))
+
+ local ret = lua_redis.exec_redis_script(zscore_script_id,
{ task = task, is_write = true },
redis_zscore_script_cb,
{ replies_sender_key },
replies_recipients_addrs)
+
+ if not ret then
+ rspamd_logger.errx(task, "redis script request wasn't scheduled")
+ end
end
local function check_known_incoming_mail_callback(task)
diff --git a/src/plugins/lua/milter_headers.lua b/src/plugins/lua/milter_headers.lua
index 2daeeed78..17fc90562 100644
--- a/src/plugins/lua/milter_headers.lua
+++ b/src/plugins/lua/milter_headers.lua
@@ -138,7 +138,7 @@ local function milter_headers(task)
local function skip_wanted(hdr)
if settings_override then
- return true
+ return false
end
-- Normal checks
local function match_extended_headers_rcpt()
diff --git a/src/plugins/lua/mime_types.lua b/src/plugins/lua/mime_types.lua
index c69fa1e7b..73cd63c6a 100644
--- a/src/plugins/lua/mime_types.lua
+++ b/src/plugins/lua/mime_types.lua
@@ -128,6 +128,7 @@ local settings = {
inf = 4,
its = 4,
jnlp = 4,
+ ['library-ms'] = 4,
lnk = 4,
ksh = 4,
mad = 4,
@@ -179,6 +180,7 @@ local settings = {
reg = 4,
scf = 4,
scr = 4,
+ ['search-ms'] = 4,
shs = 4,
theme = 4,
url = 4,
@@ -406,9 +408,9 @@ local function check_mime_type(task)
local score2 = check_tables(ext2)
-- Check if detected extension match real extension
if detected_ext and detected_ext == ext then
- check_extension(score1, nil)
+ check_extension(score1, nil)
else
- check_extension(score1, score2)
+ check_extension(score1, score2)
end
-- Check for archive cloaking like .zip.gz
if settings['archive_extensions'][ext2]
diff --git a/src/plugins/lua/multimap.lua b/src/plugins/lua/multimap.lua
index b96c105b1..0c82b167e 100644
--- a/src/plugins/lua/multimap.lua
+++ b/src/plugins/lua/multimap.lua
@@ -1282,7 +1282,7 @@ local function add_multimap_rule(key, newrule)
if newrule.map_obj then
ret = true
else
- rspamd_logger.warnx(rspamd_config, 'Cannot add rule: map doesn\'t exists: %1',
+ rspamd_logger.warnx(rspamd_config, 'Cannot add rule: map doesn\'t exists: %s',
newrule['map'])
end
elseif newrule['type'] == 'received' then
@@ -1303,7 +1303,7 @@ local function add_multimap_rule(key, newrule)
if newrule.map_obj then
ret = true
else
- rspamd_logger.warnx(rspamd_config, 'Cannot add rule: map doesn\'t exists: %1',
+ rspamd_logger.warnx(rspamd_config, 'Cannot add rule: map doesn\'t exists: %s',
newrule['map'])
end
else
@@ -1312,7 +1312,7 @@ local function add_multimap_rule(key, newrule)
if newrule.map_obj then
ret = true
else
- rspamd_logger.warnx(rspamd_config, 'Cannot add rule: map doesn\'t exists: %1',
+ rspamd_logger.warnx(rspamd_config, 'Cannot add rule: map doesn\'t exists: %s',
newrule['map'])
end
end
@@ -1328,11 +1328,14 @@ local function add_multimap_rule(key, newrule)
if newrule.map_obj then
ret = true
else
- rspamd_logger.warnx(rspamd_config, 'Cannot add rule: map doesn\'t exists: %1',
+ rspamd_logger.warnx(rspamd_config, 'Cannot add rule: map doesn\'t exists: %s',
newrule['map'])
end
elseif newrule['type'] == 'dnsbl' then
ret = true
+ else
+ rspamd_logger.errx(rspamd_config, 'cannot add rule %s: invalid type %s',
+ key, newrule['type'])
end
end
diff --git a/src/plugins/lua/ratelimit.lua b/src/plugins/lua/ratelimit.lua
index 8700245a9..d463658fa 100644
--- a/src/plugins/lua/ratelimit.lua
+++ b/src/plugins/lua/ratelimit.lua
@@ -373,7 +373,7 @@ local function ratelimit_cb(task)
local function gen_check_cb(prefix, bucket, lim_name, lim_key)
return function(err, data)
if err then
- rspamd_logger.errx('cannot check limit %s: %s %s', prefix, err, data)
+ rspamd_logger.errx('cannot check limit %s: %s', prefix, err)
elseif type(data) == 'table' and data[1] then
lua_util.debugm(N, task,
"got reply for limit %s (%s / %s); %s burst, %s:%s dyn, %s leaked",
@@ -476,7 +476,7 @@ local function maybe_cleanup_pending(task)
local bucket = v.bucket
local function cleanup_cb(err, data)
if err then
- rspamd_logger.errx('cannot cleanup limit %s: %s %s', k, err, data)
+ rspamd_logger.errx('cannot cleanup limit %s: %s', k, err)
else
lua_util.debugm(N, task, 'cleaned pending bucked for %s: %s', k, data)
end
diff --git a/src/plugins/lua/rbl.lua b/src/plugins/lua/rbl.lua
index af4a4cd15..b5b904b00 100644
--- a/src/plugins/lua/rbl.lua
+++ b/src/plugins/lua/rbl.lua
@@ -1077,7 +1077,7 @@ local function add_rbl(key, rbl, global_opts)
rbl.selector_flatten)
if not sel then
- rspamd_logger.errx('invalid selector for rbl rule %s: %s', key, selector)
+ rspamd_logger.errx(rspamd_config, 'invalid selector for rbl rule %s: %s', key, selector)
return false
end
diff --git a/src/plugins/lua/replies.lua b/src/plugins/lua/replies.lua
index 08fb68bc7..2f0153d00 100644
--- a/src/plugins/lua/replies.lua
+++ b/src/plugins/lua/replies.lua
@@ -79,8 +79,8 @@ local function configure_redis_scripts(_, _)
end
]]
local set_script_zadd_global = lua_util.jinja_template(redis_script_zadd_global,
- { max_global_size = settings.max_global_size })
- global_replies_set_script = lua_redis.add_redis_script(set_script_zadd_global, redis_params)
+ { max_global_size = settings.max_global_size })
+ global_replies_set_script = lua_redis.add_redis_script(set_script_zadd_global, redis_params)
local redis_script_zadd_local = [[
redis.call('ZREMRANGEBYRANK', KEYS[1], 0, -({= max_local_size =} + 1)) -- keeping size of local replies set
@@ -102,7 +102,7 @@ local function configure_redis_scripts(_, _)
end
]]
local set_script_zadd_local = lua_util.jinja_template(redis_script_zadd_local,
- { expire_time = settings.expire, max_local_size = settings.max_local_size })
+ { expire_time = settings.expire, max_local_size = settings.max_local_size })
local_replies_set_script = lua_redis.add_redis_script(set_script_zadd_local, redis_params)
end
@@ -110,7 +110,7 @@ local function replies_check(task)
local in_reply_to
local function check_recipient(stored_rcpt)
- local rcpts = task:get_recipients('mime')
+ local rcpts = task:get_recipients('smtp')
lua_util.debugm(N, task, 'recipients: %s', rcpts)
if rcpts then
local filter_predicate = function(input_rcpt)
@@ -119,7 +119,7 @@ local function replies_check(task)
return real_rcpt_h == stored_rcpt
end
- if fun.any(filter_predicate, fun.map(function(rcpt)
+ if fun.all(filter_predicate, fun.map(function(rcpt)
return rcpt.addr or ''
end, rcpts)) then
lua_util.debugm(N, task, 'reply to %s validated', in_reply_to)
@@ -155,9 +155,9 @@ local function replies_check(task)
end
lua_redis.exec_redis_script(global_replies_set_script,
- { task = task, is_write = true },
- zadd_global_set_cb,
- { global_key }, params)
+ { task = task, is_write = true },
+ zadd_global_set_cb,
+ { global_key }, params)
end
local function add_to_replies_set(recipients)
@@ -173,7 +173,7 @@ local function replies_check(task)
local params = recipients
lua_util.debugm(N, task,
- 'Adding recipients %s to sender %s local replies set', recipients, sender_key)
+ 'Adding recipients %s to sender %s local replies set', recipients, sender_key)
local function zadd_cb(err, _)
if err ~= nil then
@@ -189,9 +189,9 @@ local function replies_check(task)
table.insert(params, 1, task_time_str)
lua_redis.exec_redis_script(local_replies_set_script,
- { task = task, is_write = true },
- zadd_cb,
- { sender_key }, params)
+ { task = task, is_write = true },
+ zadd_cb,
+ { sender_key }, params)
end
local function redis_get_cb(err, data, addr)
@@ -387,7 +387,7 @@ if opts then
end
lua_redis.register_prefix(settings.sender_prefix, N,
- 'Prefix to identify replies sets')
+ 'Prefix to identify replies sets')
local id = rspamd_config:register_symbol({
name = 'REPLIES_CHECK',
diff --git a/src/plugins/lua/reputation.lua b/src/plugins/lua/reputation.lua
index dbf003c68..eacaee064 100644
--- a/src/plugins/lua/reputation.lua
+++ b/src/plugins/lua/reputation.lua
@@ -1166,7 +1166,7 @@ local backends = {
name = '1m',
mult = 1.0,
}
- }, -- What buckets should be used, default 1h and 1month
+ }, -- What buckets should be used, default 1month
},
init = reputation_redis_init,
get_token = reputation_redis_get_token,
diff --git a/src/plugins/lua/settings.lua b/src/plugins/lua/settings.lua
index 0f8e00723..c576e1325 100644
--- a/src/plugins/lua/settings.lua
+++ b/src/plugins/lua/settings.lua
@@ -1275,7 +1275,7 @@ local function gen_redis_callback(handler, id)
ucl_err)
else
local obj = parser:get_object()
- rspamd_logger.infox(task, "<%1> apply settings according to redis rule %2",
+ rspamd_logger.infox(task, "<%s> apply settings according to redis rule %s",
task:get_message_id(), id)
apply_settings(task, obj, nil, 'redis')
break
@@ -1283,7 +1283,7 @@ local function gen_redis_callback(handler, id)
end
end
elseif err then
- rspamd_logger.errx(task, 'Redis error: %1', err)
+ rspamd_logger.errx(task, 'Redis error: %s', err)
end
end
@@ -1371,7 +1371,7 @@ if set_section and set_section[1] and type(set_section[1]) == "string" then
opaque_data = true
}
if not rspamd_config:add_map(map_attrs) then
- rspamd_logger.errx(rspamd_config, 'cannot load settings from %1', set_section)
+ rspamd_logger.errx(rspamd_config, 'cannot load settings from %s', set_section)
end
elseif set_section and type(set_section) == "table" then
settings_map_pool = rspamd_mempool.create()
diff --git a/src/plugins/lua/spamassassin.lua b/src/plugins/lua/spamassassin.lua
index 3ea794495..c03481de2 100644
--- a/src/plugins/lua/spamassassin.lua
+++ b/src/plugins/lua/spamassassin.lua
@@ -221,7 +221,7 @@ local function handle_header_def(hline, cur_rule)
})
cur_rule['function'] = function(task)
if not re then
- rspamd_logger.errx(task, 're is missing for rule %1', h)
+ rspamd_logger.errx(task, 're is missing for rule %s', h)
return 0
end
@@ -272,7 +272,7 @@ local function handle_header_def(hline, cur_rule)
elseif func == 'case' then
cur_param['strong'] = true
else
- rspamd_logger.warnx(rspamd_config, 'Function %1 is not supported in %2',
+ rspamd_logger.warnx(rspamd_config, 'Function %s is not supported in %s',
func, cur_rule['symbol'])
end
end, fun.tail(args))
@@ -314,7 +314,7 @@ end
local function freemail_search(input)
local res = 0
local function trie_callback(number, pos)
- lua_util.debugm(N, rspamd_config, 'Matched pattern %1 at pos %2', freemail_domains[number], pos)
+ lua_util.debugm(N, rspamd_config, 'Matched pattern %s at pos %s', freemail_domains[number], pos)
res = res + 1
end
@@ -369,7 +369,7 @@ local function gen_eval_rule(arg)
end
return 0
else
- rspamd_logger.infox(rspamd_config, 'cannot create regexp %1', re)
+ rspamd_logger.infox(rspamd_config, 'cannot create regexp %s', re)
return 0
end
end
@@ -461,7 +461,7 @@ local function gen_eval_rule(arg)
end
end
else
- rspamd_logger.infox(task, 'unimplemented mime check %1', arg)
+ rspamd_logger.infox(task, 'unimplemented mime check %s', arg)
end
end
@@ -576,7 +576,7 @@ local function maybe_parse_sa_function(line)
local elts = split(line, '[^:]+')
arg = elts[2]
- lua_util.debugm(N, rspamd_config, 'trying to parse SA function %1 with args %2',
+ lua_util.debugm(N, rspamd_config, 'trying to parse SA function %s with args %s',
elts[1], elts[2])
local substitutions = {
{ '^exists:',
@@ -612,7 +612,7 @@ local function maybe_parse_sa_function(line)
end
if not func then
- rspamd_logger.errx(task, 'cannot find appropriate eval rule for function %1',
+ rspamd_logger.errx(task, 'cannot find appropriate eval rule for function %s',
arg)
else
return func(task)
@@ -685,7 +685,7 @@ local function process_sa_conf(f)
end
-- We have previous rule valid
if not cur_rule['symbol'] then
- rspamd_logger.errx(rspamd_config, 'bad rule definition: %1', cur_rule)
+ rspamd_logger.errx(rspamd_config, 'bad rule definition: %s', cur_rule)
end
rules[cur_rule['symbol']] = cur_rule
cur_rule = {}
@@ -695,15 +695,15 @@ local function process_sa_conf(f)
local function parse_score(words)
if #words == 3 then
-- score rule <x>
- lua_util.debugm(N, rspamd_config, 'found score for %1: %2', words[2], words[3])
+ lua_util.debugm(N, rspamd_config, 'found score for %s: %s', words[2], words[3])
return tonumber(words[3])
elseif #words == 6 then
-- score rule <x1> <x2> <x3> <x4>
-- we assume here that bayes and network are enabled and select <x4>
- lua_util.debugm(N, rspamd_config, 'found score for %1: %2', words[2], words[6])
+ lua_util.debugm(N, rspamd_config, 'found score for %s: %s', words[2], words[6])
return tonumber(words[6])
else
- rspamd_logger.errx(rspamd_config, 'invalid score for %1', words[2])
+ rspamd_logger.errx(rspamd_config, 'invalid score for %s', words[2])
end
return 0
@@ -812,7 +812,7 @@ local function process_sa_conf(f)
cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
if not cur_rule['re'] then
- rspamd_logger.warnx(rspamd_config, "Cannot parse regexp '%1' for %2",
+ rspamd_logger.warnx(rspamd_config, "Cannot parse regexp '%s' for %s",
cur_rule['re_expr'], cur_rule['symbol'])
else
cur_rule['re']:set_max_hits(1)
@@ -829,8 +829,8 @@ local function process_sa_conf(f)
cur_rule['mime'] = false
end
- if cur_rule['re'] and cur_rule['symbol'] and
- (cur_rule['header'] or cur_rule['function']) then
+ if cur_rule['re'] and cur_rule['symbol']
+ and (cur_rule['header'] or cur_rule['function']) then
valid_rule = true
cur_rule['re']:set_max_hits(1)
if cur_rule['header'] and cur_rule['ordinary'] then
@@ -894,7 +894,7 @@ local function process_sa_conf(f)
cur_rule['function'] = func
valid_rule = true
else
- rspamd_logger.infox(rspamd_config, 'unknown function %1', args)
+ rspamd_logger.infox(rspamd_config, 'unknown function %s', args)
end
end
elseif words[1] == "body" then
@@ -931,7 +931,7 @@ local function process_sa_conf(f)
cur_rule['function'] = func
valid_rule = true
else
- rspamd_logger.infox(rspamd_config, 'unknown function %1', args)
+ rspamd_logger.infox(rspamd_config, 'unknown function %s', args)
end
end
elseif words[1] == "rawbody" then
@@ -968,7 +968,7 @@ local function process_sa_conf(f)
cur_rule['function'] = func
valid_rule = true
else
- rspamd_logger.infox(rspamd_config, 'unknown function %1', args)
+ rspamd_logger.infox(rspamd_config, 'unknown function %s', args)
end
end
elseif words[1] == "full" then
@@ -1006,7 +1006,7 @@ local function process_sa_conf(f)
cur_rule['function'] = func
valid_rule = true
else
- rspamd_logger.infox(rspamd_config, 'unknown function %1', args)
+ rspamd_logger.infox(rspamd_config, 'unknown function %s', args)
end
end
elseif words[1] == "uri" then
@@ -1265,11 +1265,11 @@ local function post_process()
if res then
local nre = rspamd_regexp.create(nexpr)
if not nre then
- rspamd_logger.errx(rspamd_config, 'cannot apply replacement for rule %1', r)
+ rspamd_logger.errx(rspamd_config, 'cannot apply replacement for rule %s', r)
--rule['re'] = nil
else
local old_max_hits = rule['re']:get_max_hits()
- lua_util.debugm(N, rspamd_config, 'replace %1 -> %2', r, nexpr)
+ lua_util.debugm(N, rspamd_config, 'replace %s -> %s', r, nexpr)
rspamd_config:replace_regexp({
old_re = rule['re'],
new_re = nre,
@@ -1306,8 +1306,7 @@ local function post_process()
end
if not r['re'] then
- rspamd_logger.errx(task, 're is missing for rule %1 (%2 header)', k,
- h['header'])
+ rspamd_logger.errx(task, 're is missing for rule %s', h)
return 0
end
@@ -1434,7 +1433,7 @@ local function post_process()
fun.each(function(k, r)
local f = function(task)
if not r['re'] then
- rspamd_logger.errx(task, 're is missing for rule %1', k)
+ rspamd_logger.errx(task, 're is missing for rule %s', k)
return 0
end
@@ -1461,7 +1460,7 @@ local function post_process()
fun.each(function(k, r)
local f = function(task)
if not r['re'] then
- rspamd_logger.errx(task, 're is missing for rule %1', k)
+ rspamd_logger.errx(task, 're is missing for rule %s', k)
return 0
end
@@ -1486,7 +1485,7 @@ local function post_process()
fun.each(function(k, r)
local f = function(task)
if not r['re'] then
- rspamd_logger.errx(task, 're is missing for rule %1', k)
+ rspamd_logger.errx(task, 're is missing for rule %s', k)
return 0
end
@@ -1629,8 +1628,8 @@ local function post_process()
rspamd_config:register_dependency(k, rspamd_symbol)
external_deps[k][rspamd_symbol] = true
lua_util.debugm(N, rspamd_config,
- 'atom %1 is a direct foreign dependency, ' ..
- 'register dependency for %2 on %3',
+ 'atom %s is a direct foreign dependency, ' ..
+ 'register dependency for %s on %s',
a, k, rspamd_symbol)
end
end
@@ -1659,8 +1658,8 @@ local function post_process()
rspamd_config:register_dependency(k, dep)
external_deps[k][dep] = true
lua_util.debugm(N, rspamd_config,
- 'atom %1 is an indirect foreign dependency, ' ..
- 'register dependency for %2 on %3',
+ 'atom %s is an indirect foreign dependency, ' ..
+ 'register dependency for %s on %s',
a, k, dep)
nchanges = nchanges + 1
end
@@ -1694,10 +1693,10 @@ local function post_process()
-- Logging output
if freemail_domains then
freemail_trie = rspamd_trie.create(freemail_domains)
- rspamd_logger.infox(rspamd_config, 'loaded %1 freemail domains definitions',
+ rspamd_logger.infox(rspamd_config, 'loaded %s freemail domains definitions',
#freemail_domains)
end
- rspamd_logger.infox(rspamd_config, 'loaded %1 blacklist/whitelist elements',
+ rspamd_logger.infox(rspamd_config, 'loaded %s blacklist/whitelist elements',
sa_lists['elts'])
end
@@ -1739,7 +1738,7 @@ if type(section) == "table" then
process_sa_conf(f)
has_rules = true
else
- rspamd_logger.errx(rspamd_config, "cannot open %1", matched)
+ rspamd_logger.errx(rspamd_config, "cannot open %s", matched)
end
end
end
@@ -1758,7 +1757,7 @@ if type(section) == "table" then
process_sa_conf(f)
has_rules = true
else
- rspamd_logger.errx(rspamd_config, "cannot open %1", matched)
+ rspamd_logger.errx(rspamd_config, "cannot open %s", matched)
end
end
end
diff --git a/src/plugins/lua/trie.lua b/src/plugins/lua/trie.lua
index 7ba455289..7c7214b55 100644
--- a/src/plugins/lua/trie.lua
+++ b/src/plugins/lua/trie.lua
@@ -107,10 +107,10 @@ local function process_trie_file(symbol, cf)
local file = io.open(cf['file'])
if not file then
- rspamd_logger.errx(rspamd_config, 'Cannot open trie file %1', cf['file'])
+ rspamd_logger.errx(rspamd_config, 'Cannot open trie file %s', cf['file'])
else
if cf['binary'] then
- rspamd_logger.errx(rspamd_config, 'binary trie patterns are not implemented yet: %1',
+ rspamd_logger.errx(rspamd_config, 'binary trie patterns are not implemented yet: %s',
cf['file'])
else
for line in file:lines() do
@@ -123,7 +123,7 @@ end
local function process_trie_conf(symbol, cf)
if type(cf) ~= 'table' then
- rspamd_logger.errx(rspamd_config, 'invalid value for symbol %1: "%2", expected table',
+ rspamd_logger.errx(rspamd_config, 'invalid value for symbol %s: "%s", expected table',
symbol, cf)
return
end
@@ -145,17 +145,17 @@ if opts then
if #raw_patterns > 0 then
raw_trie = rspamd_trie.create(raw_patterns)
- rspamd_logger.infox(rspamd_config, 'registered raw search trie from %1 patterns', #raw_patterns)
+ rspamd_logger.infox(rspamd_config, 'registered raw search trie from %s patterns', #raw_patterns)
end
if #mime_patterns > 0 then
mime_trie = rspamd_trie.create(mime_patterns)
- rspamd_logger.infox(rspamd_config, 'registered mime search trie from %1 patterns', #mime_patterns)
+ rspamd_logger.infox(rspamd_config, 'registered mime search trie from %s patterns', #mime_patterns)
end
if #body_patterns > 0 then
body_trie = rspamd_trie.create(body_patterns)
- rspamd_logger.infox(rspamd_config, 'registered body search trie from %1 patterns', #body_patterns)
+ rspamd_logger.infox(rspamd_config, 'registered body search trie from %s patterns', #body_patterns)
end
local id = -1
diff --git a/src/rspamadm/configdump.c b/src/rspamadm/configdump.c
index 456875cf2..d090b66f0 100644
--- a/src/rspamadm/configdump.c
+++ b/src/rspamadm/configdump.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -31,6 +31,8 @@ static gboolean symbol_groups_only = FALSE;
static gboolean symbol_full_details = FALSE;
static gboolean skip_template = FALSE;
static char *config = NULL;
+static gboolean local_conf_only = FALSE;
+static gboolean override_conf_only = FALSE;
extern struct rspamd_main *rspamd_main;
/* Defined in modules.c */
extern module_t *modules[];
@@ -66,6 +68,8 @@ static GOptionEntry entries[] = {
"Show full symbol details only", NULL},
{"skip-template", 'T', 0, G_OPTION_ARG_NONE, &skip_template,
"Do not apply Jinja templates", NULL},
+ {"local", 0, 0, G_OPTION_ARG_NONE, &local_conf_only, "Show only local and override configuration", NULL},
+ {"override", 0, 0, G_OPTION_ARG_NONE, &override_conf_only, "Show only override configuration", NULL},
{NULL, 0, 0, G_OPTION_ARG_NONE, NULL, NULL, NULL}};
static const char *
@@ -82,6 +86,8 @@ rspamadm_configdump_help(gboolean full_help, const struct rspamadm_command *cmd)
"-c: config file to test\n"
"-m: show state of modules only\n"
"-h: show help for dumped options\n"
+ "--local: show only local (and override) configuration\n"
+ "--override: show only override configuration\n"
"--help: shows available options and commands";
}
else {
@@ -96,6 +102,84 @@ config_logger(rspamd_mempool_t *pool, gpointer ud)
{
}
+static ucl_object_t *
+filter_non_default(const ucl_object_t *obj, bool override_only)
+{
+ ucl_object_t *result = NULL;
+ ucl_object_iter_t it = NULL;
+ const ucl_object_t *cur;
+
+ if (obj == NULL) {
+ return NULL;
+ }
+
+ int min_prio = override_only ? 1 : 0;
+
+ if (ucl_object_get_priority(obj) > min_prio) {
+
+ switch (ucl_object_type(obj)) {
+ case UCL_OBJECT:
+ result = ucl_object_typed_new(ucl_object_type(obj));
+
+ while ((cur = ucl_object_iterate(obj, &it, true))) {
+ ucl_object_t *filtered = filter_non_default(cur, override_conf_only);
+ if (filtered) {
+ ucl_object_insert_key(result, filtered, ucl_object_key(cur), cur->keylen, true);
+ }
+ }
+ break;
+ case UCL_ARRAY:
+ result = ucl_object_typed_new(ucl_object_type(obj));
+
+ while ((cur = ucl_object_iterate(obj, &it, true))) {
+ ucl_object_t *filtered = filter_non_default(cur, override_conf_only);
+ if (filtered) {
+ ucl_array_append(result, filtered);
+ }
+ }
+ default:
+ result = ucl_object_ref(obj);
+ break;
+ }
+
+ return result;
+ }
+
+ if (ucl_object_type(obj) == UCL_OBJECT || ucl_object_type(obj) == UCL_ARRAY) {
+ bool has_non_default = false;
+
+ result = ucl_object_typed_new(ucl_object_type(obj));
+ while ((cur = ucl_object_iterate(obj, &it, true))) {
+ ucl_object_t *filtered = filter_non_default(cur, override_only);
+ if (filtered) {
+ has_non_default = true;
+
+ if (ucl_object_type(obj) == UCL_OBJECT) {
+ ucl_object_insert_key(result, filtered,
+ ucl_object_key(cur), cur->keylen, true);
+ }
+ else if (ucl_object_type(obj) == UCL_ARRAY) {
+ ucl_array_append(result, filtered);
+ }
+ else {
+ g_assert_not_reached();
+ }
+ }
+ }
+
+ /* Avoid empty objects */
+ if (!has_non_default) {
+ ucl_object_unref(result);
+ result = NULL;
+ }
+
+ return result;
+ }
+
+
+ return NULL;
+}
+
static void
rspamadm_add_doc_elt(const ucl_object_t *obj, const ucl_object_t *doc_obj,
ucl_object_t *comment_obj)
@@ -524,7 +608,20 @@ rspamadm_configdump(int argc, char **argv, const struct rspamadm_command *cmd)
/* Output configuration */
if (argc == 1) {
- rspamadm_dump_section_obj(cfg, cfg->cfg_ucl_obj, cfg->doc_strings);
+ const ucl_object_t *output_obj = cfg->cfg_ucl_obj;
+ if (local_conf_only || override_conf_only) {
+ output_obj = filter_non_default(cfg->cfg_ucl_obj, override_conf_only);
+ if (!output_obj) {
+ rspamd_printf("No non-default configuration found\n");
+ exit(EXIT_SUCCESS);
+ }
+ }
+
+ rspamadm_dump_section_obj(cfg, output_obj, cfg->doc_strings);
+
+ if (local_conf_only || override_conf_only) {
+ ucl_object_unref((ucl_object_t *) output_obj);
+ }
}
else {
for (i = 1; i < argc; i++) {
@@ -537,10 +634,18 @@ rspamadm_configdump(int argc, char **argv, const struct rspamadm_command *cmd)
else {
LL_FOREACH(obj, cur)
{
+ const ucl_object_t *output_obj = cur;
+ if (local_conf_only || override_conf_only) {
+ output_obj = filter_non_default(cur, override_conf_only);
+ if (!output_obj) {
+ rspamd_printf("No non-default configuration found for section %s\n", argv[i]);
+ continue;
+ }
+ }
if (!json && !compact) {
rspamd_printf("*** Section %s ***\n", argv[i]);
}
- rspamadm_dump_section_obj(cfg, cur, doc_obj);
+ rspamadm_dump_section_obj(cfg, output_obj, doc_obj);
if (!json && !compact) {
rspamd_printf("\n*** End of section %s ***\n", argv[i]);
@@ -548,6 +653,10 @@ rspamadm_configdump(int argc, char **argv, const struct rspamadm_command *cmd)
else {
rspamd_printf("\n");
}
+
+ if (local_conf_only || override_conf_only) {
+ ucl_object_unref((ucl_object_t *) output_obj);
+ }
}
}
}
diff --git a/src/rspamadm/control.c b/src/rspamadm/control.c
index 381bdaa7a..cd550c04e 100644
--- a/src/rspamadm/control.c
+++ b/src/rspamadm/control.c
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2016 Vsevolod Stakhov
+/*
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -112,7 +112,7 @@ rspamd_control_finish_handler(struct rspamd_http_connection *conn,
struct rspamadm_control_cbdata *cbdata = conn->ud;
body = rspamd_http_message_get_body(msg, &body_len);
- parser = ucl_parser_new(0);
+ parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (!body || !ucl_parser_add_chunk(parser, body, body_len)) {
rspamd_fprintf(stderr, "cannot parse server's reply: %s\n",
diff --git a/src/rspamadm/lua_repl.c b/src/rspamadm/lua_repl.c
index 9ad790d20..f9099d895 100644
--- a/src/rspamadm/lua_repl.c
+++ b/src/rspamadm/lua_repl.c
@@ -24,9 +24,8 @@
#include "lua/lua_thread_pool.h"
#include "message.h"
#include "unix-std.h"
-#ifdef WITH_LUA_REPL
+
#include "replxx.h"
-#endif
#include "worker_util.h"
#ifdef WITH_LUAJIT
#include <luajit.h>
@@ -43,10 +42,7 @@ static int batch = -1;
extern struct rspamd_async_session *rspamadm_session;
static const char *default_history_file = ".rspamd_repl.hist";
-
-#ifdef WITH_LUA_REPL
static Replxx *rx_instance = NULL;
-#endif
#ifdef WITH_LUAJIT
#define MAIN_PROMPT LUAJIT_VERSION "> "
@@ -272,7 +268,7 @@ rspamadm_exec_input(lua_State *L, const char *input)
}
else {
lua_logger_out(L, i, outbuf, sizeof(outbuf),
- LUA_ESCAPE_UNPRINTABLE);
+ LUA_ESCAPE_UNPRINTABLE);
rspamd_printf("%s\n", outbuf);
}
}
@@ -453,7 +449,7 @@ rspamadm_lua_message_handler(lua_State *L, int argc, char **argv)
for (j = old_top + 1; j <= lua_gettop(L); j++) {
lua_logger_out(L, j, outbuf, sizeof(outbuf),
- LUA_ESCAPE_UNPRINTABLE);
+ LUA_ESCAPE_UNPRINTABLE);
rspamd_printf("%s\n", outbuf);
}
}
@@ -499,7 +495,6 @@ rspamadm_lua_try_dot_command(lua_State *L, const char *input)
return FALSE;
}
-#ifdef WITH_LUA_REPL
static int lex_ref_idx = -1;
static void
@@ -595,20 +590,14 @@ lua_syntax_highlighter(const char *str, ReplxxColor *colours, int size, void *ud
lua_settop(L, 0);
}
-#endif
static void
rspamadm_lua_run_repl(lua_State *L, bool is_batch)
{
char *input = NULL;
-#ifdef WITH_LUA_REPL
gboolean is_multiline = FALSE;
GString *tb = NULL;
gsize i;
-#else
- /* Always set is_batch */
- is_batch = TRUE;
-#endif
for (;;) {
if (is_batch) {
@@ -640,7 +629,6 @@ rspamadm_lua_run_repl(lua_State *L, bool is_batch)
lua_settop(L, 0);
}
else {
-#ifdef WITH_LUA_REPL
replxx_set_highlighter_callback(rx_instance, lua_syntax_highlighter,
L);
@@ -701,7 +689,6 @@ rspamadm_lua_run_repl(lua_State *L, bool is_batch)
g_string_append(tb, " \n");
}
}
-#endif
}
}
}
@@ -1005,16 +992,12 @@ rspamadm_lua(int argc, char **argv, const struct rspamadm_command *cmd)
}
if (!batch) {
-#ifdef WITH_LUA_REPL
rx_instance = replxx_init();
replxx_set_max_history_size(rx_instance, max_history);
replxx_history_load(rx_instance, histfile);
-#endif
rspamadm_lua_run_repl(L, false);
-#ifdef WITH_LUA_REPL
replxx_history_save(rx_instance, histfile);
replxx_end(rx_instance);
-#endif
}
else {
rspamadm_lua_run_repl(L, true);
diff --git a/src/rspamadm/signtool.c b/src/rspamadm/signtool.c
index 6d60e6700..538767b19 100644
--- a/src/rspamadm/signtool.c
+++ b/src/rspamadm/signtool.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -573,7 +573,7 @@ rspamadm_signtool(int argc, char **argv, const struct rspamadm_command *cmd)
else {
g_assert(keypair_file != NULL);
- parser = ucl_parser_new(0);
+ parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (!ucl_parser_add_file(parser, keypair_file) ||
(top = ucl_parser_get_object(parser)) == NULL) {
diff --git a/src/rspamd_proxy.c b/src/rspamd_proxy.c
index 8e69298e6..77d2336b2 100644
--- a/src/rspamd_proxy.c
+++ b/src/rspamd_proxy.c
@@ -85,6 +85,12 @@ worker_t rspamd_proxy_worker = {
RSPAMD_WORKER_SOCKET_TCP, /* TCP socket */
RSPAMD_WORKER_VER};
+enum rspamd_proxy_log_tag_type {
+ RSPAMD_PROXY_LOG_TAG_SESSION = 0, /* Use session mempool tag (default) */
+ RSPAMD_PROXY_LOG_TAG_QUEUE_ID, /* Use Queue-ID from client message */
+ RSPAMD_PROXY_LOG_TAG_NONE, /* Skip log tag passing */
+};
+
struct rspamd_http_upstream {
char *name;
char *settings_id;
@@ -98,6 +104,7 @@ struct rspamd_http_upstream {
gboolean compress;
gboolean ssl;
gboolean keepalive; /* Whether to use keepalive for this upstream */
+ enum rspamd_proxy_log_tag_type log_tag_type;
ucl_object_t *extra_headers;
};
@@ -114,6 +121,7 @@ struct rspamd_http_mirror {
gboolean compress;
gboolean ssl;
gboolean keepalive; /* Whether to use keepalive for this mirror */
+ enum rspamd_proxy_log_tag_type log_tag_type;
ucl_object_t *extra_headers;
};
@@ -167,6 +175,8 @@ struct rspamd_proxy_ctx {
/* Language detector */
struct rspamd_lang_detector *lang_det;
double task_timeout;
+ /* Default log tag type for worker */
+ enum rspamd_proxy_log_tag_type log_tag_type;
struct rspamd_main *srv;
};
@@ -231,6 +241,77 @@ rspamd_proxy_quark(void)
return g_quark_from_static_string("rspamd-proxy");
}
+static enum rspamd_proxy_log_tag_type
+rspamd_proxy_parse_log_tag_type(const char *str)
+{
+ if (str == NULL) {
+ return RSPAMD_PROXY_LOG_TAG_SESSION;
+ }
+
+ if (g_ascii_strcasecmp(str, "session") == 0 ||
+ g_ascii_strcasecmp(str, "session_tag") == 0) {
+ return RSPAMD_PROXY_LOG_TAG_SESSION;
+ }
+ else if (g_ascii_strcasecmp(str, "queue_id") == 0 ||
+ g_ascii_strcasecmp(str, "queue-id") == 0) {
+ return RSPAMD_PROXY_LOG_TAG_QUEUE_ID;
+ }
+ else if (g_ascii_strcasecmp(str, "none") == 0 ||
+ g_ascii_strcasecmp(str, "skip") == 0) {
+ return RSPAMD_PROXY_LOG_TAG_NONE;
+ }
+
+ /* Default to session tag for unknown values */
+ return RSPAMD_PROXY_LOG_TAG_SESSION;
+}
+
+static void
+rspamd_proxy_add_log_tag_header(struct rspamd_http_message *msg,
+ struct rspamd_proxy_session *session,
+ enum rspamd_proxy_log_tag_type log_tag_type)
+{
+ const rspamd_ftok_t *queue_id_hdr;
+
+ switch (log_tag_type) {
+ case RSPAMD_PROXY_LOG_TAG_SESSION:
+ /* Use session mempool tag (current behavior) */
+ rspamd_http_message_add_header_len(msg, LOG_TAG_HEADER, session->pool->tag.uid,
+ strnlen(session->pool->tag.uid, sizeof(session->pool->tag.uid)));
+ break;
+
+ case RSPAMD_PROXY_LOG_TAG_QUEUE_ID:
+ /* Try to extract Queue-ID from client message */
+ if (session->client_message) {
+ queue_id_hdr = rspamd_http_message_find_header(session->client_message, QUEUE_ID_HEADER);
+ if (queue_id_hdr) {
+ rspamd_http_message_add_header_len(msg, LOG_TAG_HEADER,
+ queue_id_hdr->begin, queue_id_hdr->len);
+ }
+ /* If no Queue-ID found, fall back to session tag */
+ else {
+ rspamd_http_message_add_header_len(msg, LOG_TAG_HEADER, session->pool->tag.uid,
+ strnlen(session->pool->tag.uid, sizeof(session->pool->tag.uid)));
+ }
+ }
+ else {
+ /* No client message, fall back to session tag */
+ rspamd_http_message_add_header_len(msg, LOG_TAG_HEADER, session->pool->tag.uid,
+ strnlen(session->pool->tag.uid, sizeof(session->pool->tag.uid)));
+ }
+ break;
+
+ case RSPAMD_PROXY_LOG_TAG_NONE:
+ /* Skip adding log tag header */
+ break;
+
+ default:
+ /* Fall back to session tag for unknown types */
+ rspamd_http_message_add_header_len(msg, LOG_TAG_HEADER, session->pool->tag.uid,
+ strnlen(session->pool->tag.uid, sizeof(session->pool->tag.uid)));
+ break;
+ }
+}
+
static gboolean
rspamd_proxy_parse_lua_parser(lua_State *L, const ucl_object_t *obj,
int *ref_from, int *ref_to, GError **err)
@@ -399,6 +480,7 @@ rspamd_proxy_parse_upstream(rspamd_mempool_t *pool,
up->parser_from_ref = -1;
up->parser_to_ref = -1;
up->timeout = ctx->timeout;
+ up->log_tag_type = ctx->log_tag_type; /* Inherit from worker default */
elt = ucl_object_lookup(obj, "key");
if (elt != NULL) {
@@ -507,6 +589,11 @@ rspamd_proxy_parse_upstream(rspamd_mempool_t *pool,
up->extra_headers);
}
+ elt = ucl_object_lookup_any(obj, "log_tag", "log_tag_type", NULL);
+ if (elt && ucl_object_type(elt) == UCL_STRING) {
+ up->log_tag_type = rspamd_proxy_parse_log_tag_type(ucl_object_tostring(elt));
+ }
+
/*
* Accept lua function here in form
* fun :: String -> UCL
@@ -606,6 +693,7 @@ rspamd_proxy_parse_mirror(rspamd_mempool_t *pool,
up->parser_to_ref = -1;
up->parser_from_ref = -1;
up->timeout = ctx->timeout;
+ up->log_tag_type = ctx->log_tag_type; /* Inherit from worker default */
elt = ucl_object_lookup(obj, "key");
if (elt != NULL) {
@@ -686,6 +774,11 @@ rspamd_proxy_parse_mirror(rspamd_mempool_t *pool,
up->settings_id = rspamd_mempool_strdup(pool, ucl_object_tostring(elt));
}
+ elt = ucl_object_lookup_any(obj, "log_tag", "log_tag_type", NULL);
+ if (elt && ucl_object_type(elt) == UCL_STRING) {
+ up->log_tag_type = rspamd_proxy_parse_log_tag_type(ucl_object_tostring(elt));
+ }
+
g_ptr_array_add(ctx->mirrors, up);
return TRUE;
@@ -785,6 +878,29 @@ err:
return FALSE;
}
+static gboolean
+rspamd_proxy_parse_log_tag_worker_option(rspamd_mempool_t *pool,
+ const ucl_object_t *obj,
+ gpointer ud,
+ struct rspamd_rcl_section *section,
+ GError **err)
+{
+ struct rspamd_proxy_ctx *ctx;
+ struct rspamd_rcl_struct_parser *pd = ud;
+
+ ctx = pd->user_struct;
+
+ if (ucl_object_type(obj) != UCL_STRING) {
+ g_set_error(err, rspamd_proxy_quark(), 100,
+ "log_tag_type option must be a string");
+ return FALSE;
+ }
+
+ ctx->log_tag_type = rspamd_proxy_parse_log_tag_type(ucl_object_tostring(obj));
+
+ return TRUE;
+}
+
gpointer
init_rspamd_proxy(struct rspamd_config *cfg)
{
@@ -810,6 +926,7 @@ init_rspamd_proxy(struct rspamd_config *cfg)
(rspamd_mempool_destruct_t) rspamd_array_free_hard, ctx->cmp_refs);
ctx->max_retries = DEFAULT_RETRIES;
ctx->spam_header = RSPAMD_MILTER_SPAM_HEADER;
+ ctx->log_tag_type = RSPAMD_PROXY_LOG_TAG_SESSION; /* Default to session tag */
rspamd_rcl_register_worker_option(cfg,
type,
@@ -933,6 +1050,16 @@ init_rspamd_proxy(struct rspamd_config *cfg)
0,
"Use custom tempfail message");
+ /* We need a custom parser for log_tag_type as it's an enum */
+ rspamd_rcl_register_worker_option(cfg,
+ type,
+ "log_tag_type",
+ rspamd_proxy_parse_log_tag_worker_option,
+ ctx,
+ 0,
+ 0,
+ "Log tag type: session (default), queue_id, or none");
+
return ctx;
}
@@ -1012,7 +1139,7 @@ proxy_backend_parse_results(struct rspamd_proxy_session *session,
RSPAMD_FTOK_ASSIGN(&json_ct, "application/json");
if (ct && rspamd_ftok_casecmp(ct, &json_ct) == 0) {
- parser = ucl_parser_new(0);
+ parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (!ucl_parser_add_chunk(parser, in, inlen)) {
char *encoded;
@@ -1559,6 +1686,7 @@ proxy_open_mirror_connections(struct rspamd_proxy_session *session)
rspamd_http_message_remove_header(msg, "Host");
rspamd_http_message_add_header(msg, "Host", up_name);
}
+ rspamd_http_message_remove_header(msg, "Connection");
rspamd_http_message_add_header(msg, "Connection", "keep-alive");
if (msg->url->len == 0) {
@@ -1587,6 +1715,9 @@ proxy_open_mirror_connections(struct rspamd_proxy_session *session)
}
}
+ /* Add log tag header based on mirror's configuration */
+ rspamd_proxy_add_log_tag_header(msg, session, m->log_tag_type);
+
/* Set handlers for the connection */
conn->error_handler = proxy_backend_mirror_error_handler;
conn->finish_handler = proxy_backend_mirror_finish_handler;
@@ -1694,6 +1825,7 @@ proxy_open_mirror_connections(struct rspamd_proxy_session *session)
rspamd_http_message_remove_header(msg, "Host");
rspamd_http_message_add_header(msg, "Host", up_name);
}
+ rspamd_http_message_remove_header(msg, "Connection");
rspamd_http_message_add_header(msg, "Connection",
m->keepalive ? "keep-alive" : "close");
@@ -1723,6 +1855,9 @@ proxy_open_mirror_connections(struct rspamd_proxy_session *session)
}
}
+ /* Add log tag header based on mirror's configuration */
+ rspamd_proxy_add_log_tag_header(msg, session, m->log_tag_type);
+
unsigned int http_opts = RSPAMD_HTTP_CLIENT_SIMPLE;
if (m->ssl) {
@@ -2416,6 +2551,9 @@ proxy_send_master_message(struct rspamd_proxy_session *session)
}
}
+ /* Add log tag header based on backend's configuration */
+ rspamd_proxy_add_log_tag_header(msg, session, backend->log_tag_type);
+
if (backend->local ||
rspamd_inet_address_is_local(
rspamd_upstream_addr_cur(
@@ -2536,8 +2674,6 @@ proxy_client_finish_handler(struct rspamd_http_connection *conn,
rspamd_http_message_remove_header(msg, "Keep-Alive");
rspamd_http_message_remove_header(msg, "Connection");
rspamd_http_message_remove_header(msg, "Key");
- rspamd_http_message_add_header_len(msg, LOG_TAG_HEADER, session->pool->tag.uid,
- strnlen(session->pool->tag.uid, sizeof(session->pool->tag.uid)));
proxy_open_mirror_connections(session);
rspamd_http_connection_reset(session->client_conn);
diff --git a/test/functional/cases/001_merged/350_magic.robot b/test/functional/cases/001_merged/350_magic.robot
index 66a18f2af..b2746ce3c 100644
--- a/test/functional/cases/001_merged/350_magic.robot
+++ b/test/functional/cases/001_merged/350_magic.robot
@@ -65,3 +65,4 @@ Magic detections bundle 1
... MAGIC_SYM_ICS_55
... MAGIC_SYM_VCF_56
... MAGIC_SYM_CSV_57
+ ... MAGIC_SYM_HEIC_58
diff --git a/test/functional/cases/400_known_senders.robot b/test/functional/cases/400_known_senders.robot
index d827acc0e..a7cde59cb 100644
--- a/test/functional/cases/400_known_senders.robot
+++ b/test/functional/cases/400_known_senders.robot
@@ -43,33 +43,37 @@ INCOMING MAIL SENDER IS UNKNOWN
... Settings={symbols_enabled [${SYMBOL_GLOBAL}, ${SYMBOL_LOCAL}]}
Do Not Expect Symbol ${SYMBOL_GLOBAL}
Do Not Expect Symbol ${SYMBOL_LOCAL}
-
+
INCOMING MAIL SENDER IS KNOWN RECIPIENTS ARE UNKNOWN
Scan File ${RSPAMD_TESTDIR}/messages/set_replyto_1_1.eml
- ... IP=8.8.8.8 User=user@emailbl.com
+ ... IP=8.8.8.8
+ ... User=xxx@abrakadabra.com
+ ... From=xxx@abrakadabra.com
... Settings=${SETTINGS_REPLIES}
Scan File ${RSPAMD_TESTDIR}/messages/replyto_1_1.eml
- ... IP=8.8.8.8 User=user@emailbl.com
+ ... IP=8.8.8.8
+ ... Settings=${SETTINGS_REPLIES}
+ ... Rcpt=xxx@abrakadabra.com
... Settings=${SETTINGS_REPLIES}
+ ... From=user@emailbl.com
Scan File ${RSPAMD_TESTDIR}/messages/inc_mail_known_sender.eml
- ... IP=8.8.8.8 User=user@emailbl.com
+ ... IP=8.8.8.8
... Settings={symbols_enabled [${SYMBOL_GLOBAL}, ${SYMBOL_LOCAL}]}
Expect Symbol ${SYMBOL_GLOBAL}
Do Not Expect Symbol ${SYMBOL_LOCAL}
INCOMING MAIL SENDER IS KNOWN RECIPIENTS ARE KNOWN
Scan File ${RSPAMD_TESTDIR}/messages/set_replyto_1_1.eml
- ... IP=8.8.8.8 User=user@emailbl.com
+ ... IP=8.8.8.8 User=user@emailbl.com From=user@emailbl.com
... Settings=${SETTINGS_REPLIES}
Scan File ${RSPAMD_TESTDIR}/messages/replyto_1_1.eml
- ... IP=8.8.8.8 User=user@emailbl.com
+ ... IP=8.8.8.8 User=user@emailbl.com Rcpt=user@emailbl.com
... Settings=${SETTINGS_REPLIES}
Scan File ${RSPAMD_TESTDIR}/messages/inc_mail_known_sender.eml
- ... IP=8.8.8.8 User=user@emailbl.com
+ ... IP=8.8.8.8 User=user@emailbl.com Rcpt=user@emailbl.com
... Settings=${SETTINGS_REPLIES}
Scan File ${RSPAMD_TESTDIR}/messages/inc_mail_known_sender.eml
- ... IP=8.8.8.8 User=user@emailbl.com
+ ... IP=8.8.8.8 User=user@emailbl.com Rcpt=user@emailbl.com
... Settings={symbols_enabled [${SYMBOL_GLOBAL}, ${SYMBOL_LOCAL}]}
Expect Symbol ${SYMBOL_GLOBAL}
Expect Symbol ${SYMBOL_LOCAL}
-
diff --git a/test/functional/cases/410_replies.robot b/test/functional/cases/410_replies.robot
index 23ad9df35..b6710149c 100644
--- a/test/functional/cases/410_replies.robot
+++ b/test/functional/cases/410_replies.robot
@@ -15,33 +15,36 @@ ${RSPAMD_SCOPE} Suite
*** Test Cases ***
Reply to 1 sender 1 recipients
Scan File ${RSPAMD_TESTDIR}/messages/set_replyto_1_1.eml
- ... IP=8.8.8.8 User=user@emailbl.com
+ ... IP=8.8.8.8
+ ... User=xxx@abrakadabra.com
+ ... From=xxx@abrakadabra.com
... Settings=${SETTINGS_REPLIES}
+ ... Rcpt=user@emailbl.com
Scan File ${RSPAMD_TESTDIR}/messages/replyto_1_1.eml
- ... IP=8.8.8.8 User=user@emailbl.com
+ ... IP=8.8.8.8
+ ... Rcpt=xxx@abrakadabra.com
... Settings=${SETTINGS_REPLIES}
+ ... From=user@emailbl.com
Expect Symbol ${SYMBOL}
-Reply to 1 sender 2 recipients first is set second is not
+Reply to 1 sender 2 recipients but SMTP recipient matches
Scan File ${RSPAMD_TESTDIR}/messages/set_replyto_1_2_first.eml
- ... IP=8.8.8.8 User=user@emailbl.com
+ ... IP=8.8.8.8
+ ... User=xxxx@emailbl.com
... Settings=${SETTINGS_REPLIES}
Scan File ${RSPAMD_TESTDIR}/messages/replyto_1_2.eml
- ... IP=8.8.8.8 User=user@emailbl.com
+ ... IP=8.8.8.8
+ ... Rcpt=xxxx@emailbl.com
... Settings=${SETTINGS_REPLIES}
Expect Symbol ${SYMBOL}
-Reply to 1 sender 2 recipients 1 rcpt is same
- Scan File ${RSPAMD_TESTDIR}/messages/replyto_1_2_s.eml
- ... IP=8.8.8.8 User=user@emailbl.com
+Reply to 1 sender 2 recipients but SMTP recipient NOT matches
+ Scan File ${RSPAMD_TESTDIR}/messages/set_replyto_1_2_first.eml
+ ... IP=8.8.8.8
+ ... User=user@emailbl.com
... Settings=${SETTINGS_REPLIES}
- Expect Symbol ${SYMBOL}
-
-Reply to another sender 2 recipients
- Scan File ${RSPAMD_TESTDIR}/messages/set_replyto_2_2.eml
- ... IP=8.8.8.8 User=another@emailbl.com
+ Scan File ${RSPAMD_TESTDIR}/messages/replyto_1_2.eml
+ ... IP=8.8.8.8 User=user@emailbl.com
+ ... Rcpt=another@emailbl.com
... Settings=${SETTINGS_REPLIES}
- Scan File ${RSPAMD_TESTDIR}/messages/replyto_2_2.eml
- ... IP=8.8.8.8 User=another@emailbl.com
- ... Settings=${SETTINGS_REPLIES}
- Expect Symbol ${SYMBOL}
+ Do Not Expect Symbol ${SYMBOL}
diff --git a/test/functional/cases/410_logging/000_console/000_systemd_logger.robot b/test/functional/cases/411_logging/000_console/000_systemd_logger.robot
index 88178461b..88178461b 100644
--- a/test/functional/cases/410_logging/000_console/000_systemd_logger.robot
+++ b/test/functional/cases/411_logging/000_console/000_systemd_logger.robot
diff --git a/test/functional/cases/410_logging/000_console/001_timestamps.robot b/test/functional/cases/411_logging/000_console/001_timestamps.robot
index bd8e2c349..bd8e2c349 100644
--- a/test/functional/cases/410_logging/000_console/001_timestamps.robot
+++ b/test/functional/cases/411_logging/000_console/001_timestamps.robot
diff --git a/test/functional/cases/410_logging/001_file/000_json.robot b/test/functional/cases/411_logging/001_file/000_json.robot
index a2f04e85c..a2f04e85c 100644
--- a/test/functional/cases/410_logging/001_file/000_json.robot
+++ b/test/functional/cases/411_logging/001_file/000_json.robot
diff --git a/test/functional/cases/550_milter_headers.robot b/test/functional/cases/550_milter_headers.robot
index 80471b83c..c09659714 100644
--- a/test/functional/cases/550_milter_headers.robot
+++ b/test/functional/cases/550_milter_headers.robot
@@ -37,3 +37,14 @@ CHECK HEADERS WITHOUT TEST SYMBOL
# Check X-Spam-Level header
Do Not Expect Added Header X-Spam-Level
Expect Removed Header X-Spam-Level
+
+CHECK HEADERS WITH OVERRIDE SETTINGS
+ # id_milter_headers_override setting enables only authentication-results and x-spam-level routines
+ Scan File ${MESSAGE} Settings-Id=id_milter_headers_override
+ # Test the milter_headers override behavior
+ # Check that Authentication-Results and X-Spam-Level headers are present (exact values are not important)
+ Expect Header Is Present Authentication-Results
+ Expect Header Is Present X-Spam-Level
+ # Verify other headers are not added since only authentication-results and x-spam-level routines run
+ Do Not Expect Added Header X-Virus
+ Do Not Expect Added Header My-Spamd-Bar
diff --git a/test/functional/configs/milter_headers.conf b/test/functional/configs/milter_headers.conf
index 947bc28dd..502747605 100644
--- a/test/functional/configs/milter_headers.conf
+++ b/test/functional/configs/milter_headers.conf
@@ -22,3 +22,15 @@ milter_headers {
}
}
+
+settings {
+ id_milter_headers_override {
+ apply {
+ plugins {
+ milter_headers {
+ routines = [ authentication-results, x-spam-level ];
+ }
+ }
+ }
+ }
+}
diff --git a/test/functional/lib/rspamd.robot b/test/functional/lib/rspamd.robot
index 9c30a97db..5d23e3ceb 100644
--- a/test/functional/lib/rspamd.robot
+++ b/test/functional/lib/rspamd.robot
@@ -120,6 +120,15 @@ Expect Added Header
Should Be Equal ${SCAN_RESULT}[milter][add_headers][${header_name}][value] ${header_value}
Should Be Equal as Numbers ${SCAN_RESULT}[milter][add_headers][${header_name}][order] ${pos}
+Expect Header Is Present
+ [Arguments] ${header_name}
+ Dictionary Should Contain Key ${SCAN_RESULT} milter
+ ... msg=milter block was not present in protocol response
+ Dictionary Should Contain Key ${SCAN_RESULT}[milter] add_headers
+ ... msg=add_headers block was not present in protocol response
+ Dictionary Should Contain Key ${SCAN_RESULT}[milter][add_headers] ${header_name}
+ ... msg=${header_name} was not added
+
Expect Email
[Arguments] ${email}
List Should Contain Value ${SCAN_RESULT}[emails] ${email}
diff --git a/test/functional/messages/gargantua.eml b/test/functional/messages/gargantua.eml
index c6edc50f3..acb3d367f 100644
--- a/test/functional/messages/gargantua.eml
+++ b/test/functional/messages/gargantua.eml
@@ -23502,4 +23502,58 @@ X-Real-Type: csv
ImZvbyIsImJhciIsImJheiIKIjEiLCIyIiwiMyIK
+--XXX
+Content-Type: image/heic
+Content-Transfer-Encoding: base64
+X-Real-Type: heic
+
+AAAAGGZ0eXBtaWYxAAAAAG1pZjFoZWljAAAB/m1ldGEAAAAAAAAAIWhkbHIAAAAAAAAAAHBpY3QAAAAA
+AAAAAAAAAAAAAAAADnBpdG0AAAAAA+oAAAA0aWxvYwAAAABEQAACA+oAAAAAAhYAAQAAAAgABGqAA+0A
+AAAAAhYAAQAEaogAAA5KAAAATGlpbmYAAAAAAAIAAAAfaW5mZQIAAAAD6gAAaHZjMUhFVkMgSW1hZ2UA
+AAAAH2luZmUCAAAAA+0AAGh2YzFIRVZDIEltYWdlAAAAABppcmVmAAAAAAAAAA50aG1iA+0AAQPqAAAB
+KWlwcnAAAAEHaXBjbwAAAGxodmNDAQFgAAAAAAAAAAAAuvAA/P34+AAADwOgAAEAGEABDAH//wFgAAAD
+AAADAAADAAADALrwJKEAAQAfQgEBAWAAAAMAAAMAAAMAAAMAuqAC0IA8H+X5JG2e2aIAAQAHRAHBkJWB
+EgAAABRpc3BlAAAAAAAABaAAAAPAAAAAa2h2Y0MBAWAAAAAAAAAAAAC68AD8/fj4AAAPA6AAAQAYQAEM
+Af//AWAAAAMAAAMAAAMAAAMAuvAkoQABAB5CAQEBYAAAAwAAAwAAAwAAAwC6oB4gKH+X5JG2e2SiAAEA
+B0QBwZCVgRIAAAAUaXNwZQAAAAAAAADwAAAAoAAAABppcG1hAAAAAAAAAAID6gKBAgPtAoMEAAR40m1k
+YXQABGp8JgGvJpczpAaF2nvnDOmaMzreiicly1qqX+fAXJqAhKBiZjIxrwMIPfTiGYAGwaw5eYWPwkSw
+s8kypEENAQgqSB4Uy8YfU1ul0f9HbMZJomKvMPbRVC5i4wB6RfWJM/7vuSVax9VGJ0ahnF6hCHPXa5cQ
+v40hwZ8gQubhdvQ9b5jS2BnpORRDHkcSbrkgs6OVcj8dzviqzlI8yh0oeWANJ09lonJTQhnfPg4AFSEI
+BQi7Oi7IZ9X2sjzl8mGpHvp11yt5YKd5HrF1r/zW4/wiCE6/7R9To5+B4Gmq3zj9oIB3NrDT0vI+lvq8
+obyUv1WmLP2z1l+Ji/ruMgBwTmyH0TUqKnwawdiN+Lr3QpGKNEzP2FG24JZoyY0SeouGRWkwf/miLxbS
+F9ISJ2k/N7VKrRf9Wvn0cSK/qrUW1GBNSMblyQDwGcrNKXcMHEq9SEfe8/kR0bhqHMh9Uc1CiRAZhqiG
+2phU/oFP8uVS2y6ill/o657QRhsLEbzPrmGTt7kgxVBcouqfp/D1BCWOhLOjZF6Xw2w1OTTZKrSNMTYX
+yWlyKCUidBloES9JUrSicIrzvYOOkv9Q9sxs4HeJtxxTVTeyOLOggguUSkIARcsDrhum8caXvQjrgz0w
+cRuMAiCLvTH+wuFWU5sfq+gKspmk7oJZ0pJ6rYjVV+xiwALF4WZrlYN100yRfAn6Jx/6yvrEEZXUhRz9
+tkYSID2/1vvNKksSI36MxWHKo4lo+WCK5heM5VsBffVYmRc63l9XrZCINeC+hXrLs4rzUfCh8Zh8Sj9F
+0xXrBXX/bYcLNuQ/XVQ0T/YaqQgKQ86oe6mPqj0rEROREavBV8ihoobjiArt9aorgd1i0p7y1hSXATi5
+/6MiISx/+CS7N4TB8+lBetFn68aaEhtbLvn5ozp58tcT6huMQUaOiLx6EFxB0xnitfDiAdWyraagivme
+CUuHJG8mhoDQRJ+DeHcwHgkjlyvda26hgfytJZX1eubNDALWQFNiJawZ4xC8PPvsdKrq2avPtIBImYFA
+jLi/xrB1VkQ5TEVjqIH5e13gNzti/doUJoEPKtoVQxvCSU+J4nOpWjmgGUkPdKOtWO+9Ro5oAePHCENm
+jR5sLDBZ0La/7L4Ai1EeqzZyf9OX3/1W5r60T7Wr+OvWqCZ/fhNV1uBMK2JUdjOvzZc3a3ml8qK8iubp
+UZmN5bfcO3KPpGRRVpOWpnXE0VuPkFzI8VjiHmW27rmK6phlh8iY711WiKZ4LE4h781hR4Uc6phi3UuW
+5UrNF3/Ca00rm44M/8QNhDwUb1RmDG97Hr/gqKykVh0M2sV5Dd6I8ttbpNdm6F5WS4qcWqgrjJ+smjNj
+9LCnQbdPelPKz2BnuApfZWLkIPX/vk6e6JmNu/q4B+ZZFw7FdUp2DJLTa5HtSbOfh1UkRLpM7m9enZL1
+mmGQgYyujon3QxBN7CNJIGQ53h3jS14/rMxBTXDw871UafKgdZNXEnRqp9WSD8GrxgrOWulzfcEMj1Bm
+GtCTIndrrShZIYTSxynBUxtUKrvVMWjyXcFqVN5uzoDBcmbOvqdsrMDV6TGgbGk6QwxsnsP7qWBKgxz8
+doEUWDJWAgxhZfdrKNzSfbk7pwvtWDyNJURDv55coTFCnOq9bTNoo0Ixr6yGRmwPyt2xkcTKcoDDc1s1
+6dQM4lpY9K3F/whmf5WY/1aA16LOkQPheahIyggXgROHHtXZ2cdT4SuTzD2Zef8BeZQ5qc9hmerlC6FX
+1G5goK0zYAvmGzPLfKsXQjS1+KKXF0QoJ19+0FphEfCJFuwz9ziy3L+10aJGVWW5UPyeh6AgQeLZedP1
+qsHzRmbL2r6zjQT7sXqQtTWBBiRVLlz/CRSUt/oIUPZHDzC+VnHIkl0eOXdjW8oE5HKSvLCrLY92LEZe
+95D/5rqbTM4tTMK10wF8c4ibqHlRr5EvKDNHqwD7kdbQQNXhzihuDjRGfv9P9Eon0DCtPXunHAlz/9Yg
+hLjy9WovKMVK77QxHDEAb8r3EX+ROg14A0j2YAuSKdXHPRA5ZAobz8Y61QxgBAbYae/GywPUnggpE2rG
+mFzx6VHTcj4PAuHat4r7UFfPLUi7cGROkmBp9D9JqZKn3tapmnu1607Yi/8e3u56MLcPab0fGTIphH8F
+LWOIwBtXQXyw/ZSycLH4eQvrW2mYWrpNmK37gX+A4h9t+kVPphhnDEIHWjQFTT1W+oAn8XhYDHJIHSnb
+cVEGq5aFfHP/XqCQJlq381j4YqtQ9KeuysVZPtlyz6F9+jt4W3y+dXeeG3bbTFrBVcUtHU4wD3LzpXHO
+UnfLCG7Ig/RiErsBjKKG1IIZX5TNBunQRn/fXgnpOMN7Nd0D98rxMEI6OD1m1tWaIo+gNMj33MoQfAYV
+osOKpHUdedwUh7wFemqixliiEnEOOEZaOQmtH6qmei4avVsCMhOvu6iHQkPrgwM9zQFlHPutYTOHawYH
+qbN+zr1fJm4Y1J3LALL4L+rdiJ+sgFX111pVmnoVA5xKZ04GORzDqZHTMoEphDvKmgUNJsHeCSc0pQGd
+OVPoeX6gOHVlUc52VfaPUwl8EtTtkyb7BwKPGzEWHObPJlLQFKyNfJf66R7naQ79NJvSo/rVN5Al1NEa
+fmJh43BX6+E+B8pf2cQoJk8WLygfCfo1fKF3kI6bB8f1gMrkAjPWoptrZqWLaiq2t5VbIyUOWegcyMVV
+B8G0iVjRW0aWBiHnxkmDaw3XhbWs31kBR84j8dNi6IfgdqM0wAizNhc1uAsDTiHwlLEW+IHLPMr7NV+L
+GIlqmklH7nPGQBABP89Y0eoLm/UhF80UObr1SrbppgqLYNeH5L3YDBM7zY8AVQzCo52HWg3/1FBg2meG
+PtMl8z+eKcuwYg6Kh//WFxJpky/bbud1vxdTudsFQap/u1q50IBncAARTE1Hz6WVnXZBmEC3CoN9Xf02
+OutxkSe3/G7yn398gU28Royre16hUFz7UXiMrjFcra8MwOeyEKzA44FlZMpNMynjbMDP+L2JfJ/3rmGJ
+0YCJBxFcC867msO9wip2vP786vlLeC/fqKwSng==
+
--XXX--
diff --git a/test/lua/unit/logger.lua b/test/lua/unit/logger.lua
index dc0120709..c28d8bb09 100644
--- a/test/lua/unit/logger.lua
+++ b/test/lua/unit/logger.lua
@@ -3,17 +3,17 @@ context("Logger unit tests", function()
local log = require "rspamd_logger"
local cases = {
- {'string', 'string'},
- {'%1', 'string', 'string'},
- {'%1', '1.1', 1.1},
- {'%1', '1', 1},
- {'%1', 'true', true},
- {'%1', '{[1] = 1, [2] = test}', {1, 'test'}},
- {'%1', '{[1] = 1, [2] = 2.1, [k2] = test}', {1, 2.1, k2='test'}},
- {'%s', 'true', true},
+ { 'string', 'string' },
+ { '%1', 'string', 'string' },
+ { '%1', '1.1', 1.1 },
+ { '%1', '1', 1 },
+ { '%1', 'true', true },
+ { '%1', '{[1] = 1, [2] = test}', { 1, 'test' } },
+ { '%1', '{[1] = 1, [2] = 2.1, [k2] = test}', { 1, 2.1, k2 = 'test' } },
+ { '%s', 'true', true },
}
- for _,c in ipairs(cases) do
+ for _, c in ipairs(cases) do
local s
if c[3] then
s = log.slog(c[1], c[3])
@@ -21,7 +21,82 @@ context("Logger unit tests", function()
s = log.slog(c[1])
end
assert_equal(s, c[2], string.format("'%s' doesn't match with '%s'",
- c[2], s))
+ c[2], s))
+ end
+ end)
+
+ test("Logger graceful error handling", function()
+ local log = require "rspamd_logger"
+
+ -- Test missing arguments
+ local missing_arg_cases = {
+ { '%1', '<MISSING ARGUMENT>' },
+ { '%0', '<MISSING ARGUMENT>' }, -- %0 is invalid since Lua args are 1-indexed
+ { '%2', '<MISSING ARGUMENT>', 'arg1' },
+ { '%1 %2', 'arg1 <MISSING ARGUMENT>', 'arg1' },
+ { 'prefix %1 %3 suffix', 'prefix arg1 <MISSING ARGUMENT> suffix', 'arg1' },
+ }
+
+ for _, c in ipairs(missing_arg_cases) do
+ local s
+ if c[3] then
+ s = log.slog(c[1], c[3])
+ else
+ s = log.slog(c[1])
+ end
+ assert_equal(s, c[2], string.format("Missing arg test: '%s' doesn't match with '%s'",
+ c[2], s))
+ end
+
+ -- Test extra arguments
+ local extra_arg_cases = {
+ { '%1', 'arg1 <EXTRA 1 ARGUMENTS>', 'arg1', 'extra1' },
+ { '%1', 'arg1 <EXTRA 2 ARGUMENTS>', 'arg1', 'extra1', 'extra2' },
+ { '%s', 'arg1 <EXTRA 1 ARGUMENTS>', 'arg1', 'extra1' },
+ { 'prefix %1 suffix', 'prefix arg1 suffix <EXTRA 1 ARGUMENTS>', 'arg1', 'extra1' },
+ }
+
+ for _, c in ipairs(extra_arg_cases) do
+ local s
+ if c[4] and c[5] then
+ s = log.slog(c[1], c[3], c[4], c[5])
+ elseif c[4] then
+ s = log.slog(c[1], c[3], c[4])
+ else
+ s = log.slog(c[1], c[3])
+ end
+ assert_equal(s, c[2], string.format("Extra arg test: '%s' doesn't match with '%s'",
+ c[2], s))
+ end
+
+ -- Test literal percent sequences (should pass through as-is)
+ local literal_cases = {
+ { '%-1', '%-1' },
+ { '%abc', '%abc' }, -- Should pass through as literal since it's not a valid number
+ { '%', '%' }, -- Single percent should pass through
+ }
+
+ for _, c in ipairs(literal_cases) do
+ local s = log.slog(c[1])
+ assert_equal(s, c[2], string.format("Literal test: '%s' doesn't match with '%s'",
+ c[2], s))
+ end
+
+ -- Test mixed scenarios
+ local mixed_cases = {
+ { '%1 %3', 'arg1 <MISSING ARGUMENT> <EXTRA 1 ARGUMENTS>', 'arg1', 'extra1' },
+ { '%2 %4', 'extra1 <MISSING ARGUMENT> <EXTRA 1 ARGUMENTS>', 'arg1', 'extra1' },
+ }
+
+ for _, c in ipairs(mixed_cases) do
+ local s
+ if c[4] then
+ s = log.slog(c[1], c[3], c[4])
+ else
+ s = log.slog(c[1], c[3])
+ end
+ assert_equal(s, c[2], string.format("Mixed test: '%s' doesn't match with '%s'",
+ c[2], s))
end
end)
end) \ No newline at end of file
diff --git a/test/lua/unit/rspamd_resolver.lua b/test/lua/unit/rspamd_resolver.lua
index e987ff00b..2fdec2c4b 100644
--- a/test/lua/unit/rspamd_resolver.lua
+++ b/test/lua/unit/rspamd_resolver.lua
@@ -6,24 +6,58 @@ context("Check punycoding UTF-8 URL", function()
local resolver = rspamd_resolver.init(rspamd_util.create_event_base(), rspamd_config)
- local cases = {
- -- https://unicode.org/reports/tr46/#Deviations
- ['faß.de'] = 'fass.de', -- IDNA2008 result: xn--fa-hia.de
- ['βόλος.com'] = 'xn--nxasmq6b.com', -- IDNA2008 result: xn--nxasmm1c.com
- ['نامه‌ای.com'] = 'xn--mgba3gch31f.com', -- IDNA2008 result: xn--mgba3gch31f060k.com
- ['ශ්‍රී.com'] = 'xn--10cl1a0b.com', -- IDNA2008 result: xn--10cl1a0b660p.com
-
- -- https://unicode.org/reports/tr46/#Table_Example_Processing
- ['日本語。JP'] = 'xn--wgv71a119e.jp', -- Fullwidth characters are remapped, including 。
- --['u¨.com'] = 'xn--tda.com', -- Normalize changes u + umlaut to ü
- ['☕.us'] = 'xn--53h.us', -- Post-Unicode 3.2 characters are allowed
-
- -- Other
+ -- Helper function to detect IDNA behavior by testing a known conversion
+ local function detect_idna_behavior()
+ -- Use faß.de as a test case - different results in IDNA2003 vs IDNA2008
+ local test_result = resolver:idna_convert_utf8('faß.de')
+ if test_result == 'fass.de' then
+ return 'transitional' -- IDNA2003/transitional behavior
+ elseif test_result == 'xn--fa-hia.de' then
+ return 'nontransitional' -- IDNA2008/nontransitional behavior
+ else
+ return 'unknown'
+ end
+ end
+
+ local idna_behavior = detect_idna_behavior()
+
+ -- Define test cases with both expected results
+ local cases_transitional = {
+ -- IDNA2003/transitional results (ICU < 76 default)
+ ['faß.de'] = 'fass.de',
+ ['βόλος.com'] = 'xn--nxasmq6b.com',
+ ['نامه‌ای.com'] = 'xn--mgba3gch31f.com',
+ ['ශ්‍රී.com'] = 'xn--10cl1a0b.com',
+ ['日本語。JP'] = 'xn--wgv71a119e.jp',
+ ['☕.us'] = 'xn--53h.us',
+ ['example.рф'] = 'example.xn--p1ai',
+ }
+
+ local cases_nontransitional = {
+ -- IDNA2008/nontransitional results (ICU >= 76 default)
+ ['faß.de'] = 'xn--fa-hia.de',
+ ['βόλος.com'] = 'xn--nxasmm1c.com',
+ ['نامه‌ای.com'] = 'xn--mgba3gch31f060k.com',
+ ['ශ්‍රී.com'] = 'xn--10cl1a0b660p.com',
+ ['日本語。JP'] = 'xn--wgv71a119e.jp',
+ ['☕.us'] = 'xn--53h.us',
['example.рф'] = 'example.xn--p1ai',
}
+ -- Choose appropriate test cases based on detected behavior
+ local cases
+ if idna_behavior == 'transitional' then
+ cases = cases_transitional
+ print("Detected IDNA transitional behavior (ICU < 76 or configured for IDNA2003)")
+ elseif idna_behavior == 'nontransitional' then
+ cases = cases_nontransitional
+ print("Detected IDNA nontransitional behavior (ICU >= 76 default)")
+ else
+ error("Could not detect IDNA behavior - unexpected result for test case")
+ end
+
for k, v in pairs(cases) do
- test(string.format("punycode %s -> %s", k, v), function()
+ test(string.format("punycode %s -> %s (%s)", k, v, idna_behavior), function()
local res = resolver:idna_convert_utf8(k)
assert_equal(res, v)
end)
diff --git a/test/rspamd_cryptobox_test.c b/test/rspamd_cryptobox_test.c
index 03b833404..82225d071 100644
--- a/test/rspamd_cryptobox_test.c
+++ b/test/rspamd_cryptobox_test.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -153,7 +153,6 @@ create_constrained_split(struct rspamd_cryptobox_segment *seg, int mseg,
void rspamd_cryptobox_test_func(void)
{
- void *map;
unsigned char *begin, *end;
rspamd_nm_t key;
rspamd_nonce_t nonce;
@@ -161,9 +160,8 @@ void rspamd_cryptobox_test_func(void)
struct rspamd_cryptobox_segment *seg;
double t1, t2;
int i, cnt, ms;
- gboolean checked_openssl = FALSE;
- map = create_mapping(mapping_size, &begin, &end);
+ create_mapping(mapping_size, &begin, &end);
ottery_rand_bytes(key, sizeof(key));
ottery_rand_bytes(nonce, sizeof(nonce));
diff --git a/test/rspamd_cxx_unit_cryptobox.hxx b/test/rspamd_cxx_unit_cryptobox.hxx
index 7d9c76b4e..4624e2f93 100644
--- a/test/rspamd_cxx_unit_cryptobox.hxx
+++ b/test/rspamd_cxx_unit_cryptobox.hxx
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -245,6 +245,225 @@ TEST_SUITE("rspamd_cryptobox")
auto out_arr = std::vector(std::begin(out), std::end(out));
CHECK(out_arr == expected_arr);
}
+
+ // Test vectors for XChaCha20-Poly1305 compatibility with Go implementation
+ // These test cases use the same inputs as the Go version to verify compatibility
+
+ TEST_CASE("rspamd xchacha20poly1305 compatibility all_zeros_64_bytes")
+ {
+ // Test case: all_zeros_64_bytes
+ // Key: 32 zero bytes
+ // Nonce: 24 zero bytes
+ // Plaintext: 64 zero bytes
+
+ rspamd_nm_t key;
+ memset(key, 0, sizeof(key));
+
+ rspamd_nonce_t nonce;
+ memset(nonce, 0, sizeof(nonce));
+
+ unsigned char plaintext[64];
+ memset(plaintext, 0, sizeof(plaintext));
+
+ // Expected values from C implementation
+ unsigned char expected_cipher[64] = {
+ 0x78, 0x9e, 0x96, 0x89, 0xe5, 0x20, 0x8d, 0x7f, 0xd9, 0xe1, 0xf3, 0xc5, 0xb5, 0x34, 0x1f, 0x48,
+ 0xef, 0x18, 0xa1, 0x3e, 0x41, 0x89, 0x98, 0xad, 0xda, 0xdd, 0x97, 0xa3, 0x69, 0x3a, 0x98, 0x7f,
+ 0x8e, 0x82, 0xec, 0xd5, 0xc1, 0x43, 0x3b, 0xfe, 0xd1, 0xaf, 0x49, 0x75, 0x0c, 0x0f, 0x1f, 0xf2,
+ 0x9c, 0x41, 0x74, 0xa0, 0x5b, 0x11, 0x9a, 0xa3, 0xa9, 0xe8, 0x33, 0x38, 0x12, 0xe0, 0xc0, 0xfe};
+
+ rspamd_mac_t expected_mac = {
+ 0x9c, 0x22, 0xbd, 0x8b, 0x7d, 0x68, 0x00, 0xca, 0x3f, 0x9d, 0xf1, 0xc0, 0x3e, 0x31, 0x3e, 0x68};
+
+ // Test encryption using Rspamd's nm (shared key) encryption
+ unsigned char ciphertext[64];
+ memcpy(ciphertext, plaintext, sizeof(plaintext));
+
+ rspamd_mac_t mac;
+
+ rspamd_cryptobox_encrypt_nm_inplace(ciphertext, sizeof(ciphertext), nonce, key, mac);
+
+ CHECK(memcmp(ciphertext, expected_cipher, sizeof(expected_cipher)) == 0);
+ CHECK(memcmp(mac, expected_mac, sizeof(expected_mac)) == 0);
+
+ // Test decryption
+ gboolean decrypt_ok = rspamd_cryptobox_decrypt_nm_inplace(ciphertext, sizeof(ciphertext), nonce, key, mac);
+ CHECK(decrypt_ok == TRUE);
+ CHECK(memcmp(ciphertext, plaintext, sizeof(plaintext)) == 0);
+ }
+
+ TEST_CASE("rspamd xchacha20poly1305 compatibility all_zeros_128_bytes")
+ {
+ // Test case: all_zeros_128_bytes
+ // Key: 32 zero bytes
+ // Nonce: 24 zero bytes
+ // Plaintext: 128 zero bytes
+
+ rspamd_nm_t key;
+ memset(key, 0, sizeof(key));
+
+ rspamd_nonce_t nonce;
+ memset(nonce, 0, sizeof(nonce));
+
+ unsigned char plaintext[128];
+ memset(plaintext, 0, sizeof(plaintext));
+
+ unsigned char expected_cipher[128] = {
+ 0x78, 0x9e, 0x96, 0x89, 0xe5, 0x20, 0x8d, 0x7f, 0xd9, 0xe1, 0xf3, 0xc5, 0xb5, 0x34, 0x1f, 0x48,
+ 0xef, 0x18, 0xa1, 0x3e, 0x41, 0x89, 0x98, 0xad, 0xda, 0xdd, 0x97, 0xa3, 0x69, 0x3a, 0x98, 0x7f,
+ 0x8e, 0x82, 0xec, 0xd5, 0xc1, 0x43, 0x3b, 0xfe, 0xd1, 0xaf, 0x49, 0x75, 0x0c, 0x0f, 0x1f, 0xf2,
+ 0x9c, 0x41, 0x74, 0xa0, 0x5b, 0x11, 0x9a, 0xa3, 0xa9, 0xe8, 0x33, 0x38, 0x12, 0xe0, 0xc0, 0xfe,
+ 0xa4, 0x9e, 0x1e, 0xe0, 0x13, 0x4a, 0x70, 0xa9, 0xd4, 0x9c, 0x24, 0xe0, 0xcb, 0xd8, 0xfc, 0x3b,
+ 0xa2, 0x7e, 0x97, 0xc3, 0x32, 0x2a, 0xd4, 0x87, 0xf7, 0x78, 0xf8, 0xdc, 0x6a, 0x12, 0x2f, 0xa5,
+ 0x9c, 0xbe, 0x33, 0xe7, 0x78, 0xea, 0x2e, 0x50, 0xbb, 0x59, 0x09, 0xc9, 0x97, 0x1c, 0x4f, 0xec,
+ 0x2f, 0x93, 0x52, 0x3f, 0x77, 0x89, 0x2d, 0x17, 0xca, 0xa5, 0x81, 0x67, 0xde, 0xc4, 0xd6, 0xc7};
+
+ rspamd_mac_t expected_mac = {
+ 0xcf, 0xe1, 0x4a, 0xc3, 0x39, 0x35, 0xd3, 0x63, 0x1a, 0x06, 0xbf, 0x55, 0x88, 0xf4, 0x12, 0xfa};
+
+ unsigned char ciphertext[128];
+ memcpy(ciphertext, plaintext, sizeof(plaintext));
+
+ rspamd_mac_t mac;
+ rspamd_cryptobox_encrypt_nm_inplace(ciphertext, sizeof(ciphertext), nonce, key, mac);
+
+ CHECK(memcmp(ciphertext, expected_cipher, sizeof(expected_cipher)) == 0);
+ CHECK(memcmp(mac, expected_mac, sizeof(expected_mac)) == 0);
+
+ // Test decryption
+ gboolean decrypt_ok = rspamd_cryptobox_decrypt_nm_inplace(ciphertext, sizeof(ciphertext), nonce, key, mac);
+ CHECK(decrypt_ok == TRUE);
+ CHECK(memcmp(ciphertext, plaintext, sizeof(plaintext)) == 0);
+ }
+
+ TEST_CASE("rspamd xchacha20poly1305 compatibility test_pattern_64_bytes")
+ {
+ // Test case: test_pattern_64_bytes
+ // Key: 0x01 repeated 32 times
+ // Nonce: 0x01, 0x02, 0x03, ... 0x18 (24 bytes)
+ // Plaintext: 0x00, 0x01, 0x02, ... 0x41 (66 bytes)
+
+ rspamd_nm_t key;
+ memset(key, 0x01, sizeof(key));
+
+ rspamd_nonce_t nonce = {
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10,
+ 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20,
+ 0x21, 0x22, 0x23, 0x24};
+
+ unsigned char plaintext[66];
+ for (int i = 0; i < 66; i++) {
+ plaintext[i] = i;
+ }
+
+ // Expected values from C implementation
+ unsigned char expected_cipher[66] = {
+ 0xe6, 0x0e, 0xf7, 0x6d, 0x7f, 0x04, 0x37, 0x81, 0x9f, 0x60, 0x03, 0x28, 0x60, 0xb1, 0x2b, 0xaa,
+ 0xae, 0x2b, 0x13, 0xef, 0x6d, 0xd3, 0x18, 0xf1, 0x3b, 0xc6, 0x06, 0xfb, 0x65, 0x9a, 0x53, 0x3b,
+ 0x23, 0xe6, 0x99, 0x0c, 0x65, 0x2f, 0xbf, 0x56, 0xcb, 0x7c, 0x18, 0x53, 0xa8, 0xbc, 0x11, 0xc4,
+ 0x0b, 0x35, 0xc9, 0x40, 0x9a, 0xc2, 0xe1, 0x7f, 0x1a, 0x72, 0xaa, 0xb3, 0x8b, 0x4e, 0x21, 0x32,
+ 0x87, 0xf7};
+
+ rspamd_mac_t expected_mac = {
+ 0xf2, 0xa7, 0xbd, 0xae, 0x53, 0x68, 0xfe, 0xd8, 0x4c, 0x92, 0xe8, 0x52, 0x35, 0x4d, 0x78, 0x7c};
+
+ unsigned char ciphertext[66];
+ memcpy(ciphertext, plaintext, sizeof(plaintext));
+
+ rspamd_mac_t mac;
+
+ rspamd_cryptobox_encrypt_nm_inplace(ciphertext, sizeof(ciphertext), nonce, key, mac);
+
+ CHECK(memcmp(ciphertext, expected_cipher, sizeof(expected_cipher)) == 0);
+ CHECK(memcmp(mac, expected_mac, sizeof(expected_mac)) == 0);
+
+ // Test decryption
+ gboolean decrypt_ok = rspamd_cryptobox_decrypt_nm_inplace(ciphertext, sizeof(ciphertext), nonce, key, mac);
+ CHECK(decrypt_ok == TRUE);
+ CHECK(memcmp(ciphertext, plaintext, sizeof(plaintext)) == 0);
+ }
+
+ TEST_CASE("rspamd mac key derivation compatibility all_zeros")
+ {
+ // Test MAC key derivation process
+ // Key: 32 zero bytes
+ // Nonce: 24 zero bytes
+
+ rspamd_nm_t key;
+ memset(key, 0, sizeof(key));
+
+ rspamd_nonce_t nonce;
+ memset(nonce, 0, sizeof(nonce));
+
+ // Expected values from C implementation
+ unsigned char expected_subkey[64] = {
+ 0xbc, 0xd0, 0x2a, 0x18, 0xbf, 0x3f, 0x01, 0xd1, 0x92, 0x92, 0xde, 0x30, 0xa7, 0xa8, 0xfd, 0xac,
+ 0xa4, 0xb6, 0x5e, 0x50, 0xa6, 0x00, 0x2c, 0xc7, 0x2c, 0xd6, 0xd2, 0xf7, 0xc9, 0x1a, 0xc3, 0xd5,
+ 0x72, 0x8f, 0x83, 0xe0, 0xaa, 0xd2, 0xbf, 0xcf, 0x9a, 0xbd, 0x2d, 0x2d, 0xb5, 0x8f, 0xae, 0xdd,
+ 0x65, 0x01, 0x5d, 0xd8, 0x3f, 0xc0, 0x9b, 0x13, 0x1e, 0x27, 0x10, 0x43, 0x01, 0x9e, 0x8e, 0x0f};
+
+ unsigned char expected_mac_key[32] = {
+ 0xbc, 0xd0, 0x2a, 0x18, 0xbf, 0x3f, 0x01, 0xd1, 0x92, 0x92, 0xde, 0x30, 0xa7, 0xa8, 0xfd, 0xac,
+ 0xa4, 0xb6, 0x5e, 0x50, 0xa6, 0x00, 0x2c, 0xc7, 0x2c, 0xd6, 0xd2, 0xf7, 0xc9, 0x1a, 0xc3, 0xd5};
+
+ // Generate subkey using XChaCha20 (first 64 bytes of keystream)
+ // This simulates the MAC key derivation process used in secretbox
+ unsigned char subkey[64];
+ memset(subkey, 0, sizeof(subkey));
+
+ // Use libsodium's ChaCha20 directly to generate the subkey
+ // This matches what happens inside the secretbox implementation
+ crypto_stream_xchacha20(subkey, sizeof(subkey), nonce, key);
+
+ // MAC key is first 32 bytes of subkey
+ unsigned char mac_key[32];
+ memcpy(mac_key, subkey, 32);
+
+ CHECK(memcmp(subkey, expected_subkey, sizeof(expected_subkey)) == 0);
+ CHECK(memcmp(mac_key, expected_mac_key, sizeof(expected_mac_key)) == 0);
+ }
+
+ TEST_CASE("rspamd mac key derivation compatibility test_pattern")
+ {
+ // Test MAC key derivation process
+ // Key: 0x01 repeated 32 times
+ // Nonce: 0x01, 0x02, 0x03, ... 0x18 (24 bytes)
+
+ rspamd_nm_t key;
+ memset(key, 0x01, sizeof(key));
+
+ rspamd_nonce_t nonce = {
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10,
+ 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20,
+ 0x21, 0x22, 0x23, 0x24};
+
+ // Expected values from C implementation
+ unsigned char expected_subkey[64] = {
+ 0x47, 0xa6, 0xe3, 0xb5, 0x0f, 0xd4, 0x7f, 0x08, 0xb5, 0x35, 0x80, 0xfc, 0x93, 0x66, 0x1a, 0x7f,
+ 0x9c, 0xf5, 0x8c, 0x93, 0xae, 0x4e, 0x3f, 0xcf, 0x86, 0xb7, 0xdf, 0x34, 0x48, 0x73, 0x33, 0xdb,
+ 0x71, 0x31, 0x0f, 0xe1, 0xcc, 0xd9, 0x0c, 0x0a, 0x1a, 0x19, 0x54, 0x30, 0xdf, 0xe3, 0xda, 0xee,
+ 0x70, 0x29, 0xd9, 0xae, 0xf6, 0x4d, 0x78, 0xe3, 0xe8, 0x43, 0x98, 0xea, 0xaa, 0xd8, 0x85, 0x79};
+
+ unsigned char expected_mac_key[32] = {
+ 0x47, 0xa6, 0xe3, 0xb5, 0x0f, 0xd4, 0x7f, 0x08, 0xb5, 0x35, 0x80, 0xfc, 0x93, 0x66, 0x1a, 0x7f,
+ 0x9c, 0xf5, 0x8c, 0x93, 0xae, 0x4e, 0x3f, 0xcf, 0x86, 0xb7, 0xdf, 0x34, 0x48, 0x73, 0x33, 0xdb};
+
+ // Generate subkey using XChaCha20 (first 64 bytes of keystream)
+ // This simulates the MAC key derivation process used in secretbox
+ unsigned char subkey[64];
+ memset(subkey, 0, sizeof(subkey));
+
+ // Use libsodium's ChaCha20 directly to generate the subkey
+ // This matches what happens inside the secretbox implementation
+ crypto_stream_xchacha20(subkey, sizeof(subkey), nonce, key);
+
+ // MAC key is first 32 bytes of subkey
+ unsigned char mac_key[32];
+ memcpy(mac_key, subkey, 32);
+
+ CHECK(memcmp(subkey, expected_subkey, sizeof(expected_subkey)) == 0);
+ CHECK(memcmp(mac_key, expected_mac_key, sizeof(expected_mac_key)) == 0);
+ }
}
#endif
diff --git a/test/rspamd_dns_test.c b/test/rspamd_dns_test.c
index d041351df..ccdc47820 100644
--- a/test/rspamd_dns_test.c
+++ b/test/rspamd_dns_test.c
@@ -1,4 +1,20 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
#include "config.h"
#include "tests.h"
#include "dns.h"
@@ -51,6 +67,9 @@ test_dns_cb(struct rdns_reply *reply, gpointer arg)
case RDNS_REQUEST_MX:
msg_debug("got mx %s:%d", cur->content.mx.name, cur->content.mx.priority);
break;
+ default:
+ msg_debug("got unknown type %d", cur->type);
+ break;
}
cur = cur->next;
}
diff --git a/test/rspamd_shingles_test.c b/test/rspamd_shingles_test.c
index d1a10de84..5b88f4b2d 100644
--- a/test/rspamd_shingles_test.c
+++ b/test/rspamd_shingles_test.c
@@ -17,6 +17,7 @@
#include "rspamd.h"
#include "shingles.h"
#include "ottery.h"
+#include "libserver/word.h"
#include <math.h>
static const char *
@@ -52,63 +53,76 @@ generate_random_string(char *begin, size_t len)
}
}
-static GArray *
+static rspamd_words_t *
generate_fuzzy_words(gsize cnt, gsize max_len)
{
- GArray *res;
+ rspamd_words_t *res;
gsize i, wlen;
- rspamd_ftok_t w;
+ rspamd_word_t word;
char *t;
- res = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_ftok_t), cnt);
+ res = g_malloc(sizeof(*res));
+ kv_init(*res);
for (i = 0; i < cnt; i++) {
wlen = ottery_rand_range(max_len) + 1;
/* wlen = max_len; */
- w.len = wlen;
t = g_malloc(wlen);
generate_random_string(t, wlen);
- w.begin = t;
- g_array_append_val(res, w);
+
+ memset(&word, 0, sizeof(word));
+ word.stemmed.begin = t;
+ word.stemmed.len = wlen;
+ word.original.begin = t;
+ word.original.len = wlen;
+ word.flags = 0; /* No flags set, so it won't be skipped */
+
+ kv_push(rspamd_word_t, *res, word);
}
return res;
}
static void
-permute_vector(GArray *in, double prob)
+permute_vector(rspamd_words_t *in, double prob)
{
gsize i, total = 0;
- rspamd_ftok_t *w;
+ rspamd_word_t *w;
- for (i = 0; i < in->len; i++) {
+ for (i = 0; i < kv_size(*in); i++) {
if (ottery_rand_unsigned() <= G_MAXUINT * prob) {
- w = &g_array_index(in, rspamd_ftok_t, i);
- generate_random_string((char *) w->begin, w->len);
+ w = &kv_A(*in, i);
+ generate_random_string((char *) w->stemmed.begin, w->stemmed.len);
+ /* Also update original since they point to same memory */
+ w->original.begin = w->stemmed.begin;
+ w->original.len = w->stemmed.len;
total++;
}
}
- msg_debug("generated %z permutations of %ud words", total, in->len);
+ msg_debug("generated %z permutations of %ud words", total, (unsigned int) kv_size(*in));
}
static void
-free_fuzzy_words(GArray *ar)
+free_fuzzy_words(rspamd_words_t *ar)
{
gsize i;
- rspamd_ftok_t *w;
+ rspamd_word_t *w;
- for (i = 0; i < ar->len; i++) {
- w = &g_array_index(ar, rspamd_ftok_t, i);
- g_free((gpointer) w->begin);
+ for (i = 0; i < kv_size(*ar); i++) {
+ w = &kv_A(*ar, i);
+ g_free((gpointer) w->stemmed.begin);
}
+
+ kv_destroy(*ar);
+ g_free(ar);
}
static void
test_case(gsize cnt, gsize max_len, double perm_factor,
enum rspamd_shingle_alg alg)
{
- GArray *input;
+ rspamd_words_t *input;
struct rspamd_shingle *sgl, *sgl_permuted;
double res;
unsigned char key[16];
@@ -281,51 +295,59 @@ void rspamd_shingles_test_func(void)
enum rspamd_shingle_alg alg = RSPAMD_SHINGLES_OLD;
struct rspamd_shingle *sgl;
unsigned char key[16];
- GArray *input;
- rspamd_ftok_t tok;
+ rspamd_words_t input;
+ rspamd_word_t word;
int i;
memset(key, 0, sizeof(key));
- input = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_ftok_t), 5);
+ kv_init(input);
for (i = 0; i < 5; i++) {
char *b = g_alloca(8);
memset(b, 0, 8);
memcpy(b + 1, "test", 4);
b[0] = 'a' + i;
- tok.begin = b;
- tok.len = 5 + ((i + 1) % 4);
- g_array_append_val(input, tok);
+
+ memset(&word, 0, sizeof(word));
+ word.stemmed.begin = b;
+ word.stemmed.len = 5 + ((i + 1) % 4);
+ word.original.begin = b;
+ word.original.len = word.stemmed.len;
+ word.flags = 0; /* No flags set, so it won't be skipped */
+
+ kv_push(rspamd_word_t, input, word);
}
- sgl = rspamd_shingles_from_text(input, key, NULL,
+ sgl = rspamd_shingles_from_text(&input, key, NULL,
rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_OLD);
for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
g_assert(sgl->hashes[i] == expected_old[i]);
}
g_free(sgl);
- sgl = rspamd_shingles_from_text(input, key, NULL,
+ sgl = rspamd_shingles_from_text(&input, key, NULL,
rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_XXHASH);
for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
g_assert(sgl->hashes[i] == expected_xxhash[i]);
}
g_free(sgl);
- sgl = rspamd_shingles_from_text(input, key, NULL,
+ sgl = rspamd_shingles_from_text(&input, key, NULL,
rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_MUMHASH);
for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
g_assert(sgl->hashes[i] == expected_mumhash[i]);
}
g_free(sgl);
- sgl = rspamd_shingles_from_text(input, key, NULL,
+ sgl = rspamd_shingles_from_text(&input, key, NULL,
rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_FAST);
for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
g_assert(sgl->hashes[i] == expected_fasthash[i]);
}
g_free(sgl);
+ kv_destroy(input);
+
for (alg = RSPAMD_SHINGLES_OLD; alg <= RSPAMD_SHINGLES_FAST; alg++) {
test_case(200, 10, 0.1, alg);
test_case(500, 20, 0.01, alg);