aboutsummaryrefslogtreecommitdiffstats
path: root/src/libserver/cfg_utils.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'src/libserver/cfg_utils.cxx')
-rw-r--r--src/libserver/cfg_utils.cxx266
1 files changed, 265 insertions, 1 deletions
diff --git a/src/libserver/cfg_utils.cxx b/src/libserver/cfg_utils.cxx
index dfbdc6bee..c22a9b877 100644
--- a/src/libserver/cfg_utils.cxx
+++ b/src/libserver/cfg_utils.cxx
@@ -72,6 +72,11 @@
#include "contrib/expected/expected.hpp"
#include "contrib/ankerl/unordered_dense.h"
+#include "libserver/task.h"
+#include "libserver/url.h"
+#define RSPAMD_TOKENIZER_INTERNAL// We need to use internal tokenizer API
+#include "libstat/tokenizers/custom_tokenizer.h"
+
#define DEFAULT_SCORE 10.0
#define DEFAULT_RLIMIT_NOFILE 2048
@@ -821,6 +826,65 @@ rspamd_adjust_clocks_resolution(struct rspamd_config *cfg)
#endif
}
+extern "C" {
+
+gboolean
+rspamd_config_load_custom_tokenizers(struct rspamd_config *cfg, GError **err)
+{
+ /* Load custom tokenizers */
+ const ucl_object_t *custom_tokenizers = ucl_object_lookup_path(cfg->cfg_ucl_obj,
+ "options.custom_tokenizers");
+ if (custom_tokenizers != NULL) {
+ msg_info_config("loading custom tokenizers");
+
+ if (!cfg->tokenizer_manager) {
+ cfg->tokenizer_manager = rspamd_tokenizer_manager_new(cfg->cfg_pool);
+ }
+
+ ucl_object_iter_t it = ucl_object_iterate_new(custom_tokenizers);
+ const ucl_object_t *tok_obj;
+ const char *tok_name;
+
+ while ((tok_obj = ucl_object_iterate_safe(it, true)) != NULL) {
+ tok_name = ucl_object_key(tok_obj);
+ GError *local_err = NULL;
+
+ if (!rspamd_tokenizer_manager_load_tokenizer(cfg->tokenizer_manager,
+ tok_name, tok_obj, &local_err)) {
+ msg_err_config("failed to load custom tokenizer '%s': %s",
+ tok_name, local_err ? local_err->message : "unknown error");
+
+ if (err && !*err) {
+ *err = g_error_copy(local_err);
+ }
+
+ if (local_err) {
+ g_error_free(local_err);
+ }
+
+ ucl_object_iterate_free(it);
+ return FALSE;
+ }
+ }
+ ucl_object_iterate_free(it);
+
+ msg_info_config("loaded custom tokenizers successfully");
+ }
+
+ return TRUE;
+}
+
+void rspamd_config_unload_custom_tokenizers(struct rspamd_config *cfg)
+{
+ if (cfg->tokenizer_manager) {
+ msg_info_config("unloading custom tokenizers");
+ rspamd_tokenizer_manager_destroy(cfg->tokenizer_manager);
+ cfg->tokenizer_manager = NULL;
+ }
+}
+
+}// extern "C"
+
/*
* Perform post load actions
*/
@@ -940,6 +1004,20 @@ rspamd_config_post_load(struct rspamd_config *cfg,
msg_err_config("cannot configure libraries, fatal error");
return FALSE;
}
+
+ /* Load custom tokenizers using the new function */
+ GError *tokenizer_err = NULL;
+ if (!rspamd_config_load_custom_tokenizers(cfg, &tokenizer_err)) {
+ msg_err_config("failed to load custom tokenizers: %s",
+ tokenizer_err ? tokenizer_err->message : "unknown error");
+ if (tokenizer_err) {
+ g_error_free(tokenizer_err);
+ }
+
+ if (opts & RSPAMD_CONFIG_INIT_VALIDATE) {
+ ret = tl::make_unexpected(std::string{"failed to load custom tokenizers"});
+ }
+ }
}
/* Validate cache */
@@ -1363,7 +1441,7 @@ rspamd_ucl_fin_cb(struct map_cb_data *data, void **target)
}
/* New data available */
- auto *parser = ucl_parser_new(0);
+ auto *parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS);
if (!ucl_parser_add_chunk(parser, (unsigned char *) cbdata->buf.data(),
cbdata->buf.size())) {
msg_err_config("cannot parse map %s: %s",
@@ -2964,3 +3042,189 @@ rspamd_ip_is_local_cfg(struct rspamd_config *cfg,
return FALSE;
}
+
+gboolean
+rspamd_config_parse_class_labels(const ucl_object_t *obj, GHashTable **class_labels)
+{
+ const ucl_object_t *cur;
+ ucl_object_iter_t it = nullptr;
+
+ if (!obj || ucl_object_type(obj) != UCL_OBJECT) {
+ return FALSE;
+ }
+
+ if (*class_labels == nullptr) {
+ *class_labels = g_hash_table_new_full(g_str_hash, g_str_equal,
+ g_free, g_free);
+ }
+
+ while ((cur = ucl_object_iterate(obj, &it, true)) != nullptr) {
+ const char *class_name = ucl_object_key(cur);
+ const char *label = ucl_object_tostring(cur);
+
+ if (class_name && label) {
+ /* Validate class name: alphanumeric + underscore, max 32 chars */
+ if (strlen(class_name) > 32) {
+ msg_err("class name '%s' is too long (max 32 characters)", class_name);
+ g_hash_table_destroy(*class_labels);
+ *class_labels = nullptr;
+ return FALSE;
+ }
+
+ for (const char *p = class_name; *p; p++) {
+ if (!g_ascii_isalnum(*p) && *p != '_') {
+ msg_err("class name '%s' contains invalid character '%c'", class_name, *p);
+ g_hash_table_destroy(*class_labels);
+ *class_labels = nullptr;
+ return FALSE;
+ }
+ }
+
+ /* Validate label uniqueness */
+ if (g_hash_table_lookup(*class_labels, label)) {
+ msg_err("backend label '%s' is used by multiple classes", label);
+ g_hash_table_destroy(*class_labels);
+ *class_labels = nullptr;
+ return FALSE;
+ }
+ }
+
+ g_hash_table_insert(*class_labels, g_strdup(class_name), g_strdup(label));
+ }
+
+ return g_hash_table_size(*class_labels) > 0;
+}
+
+gboolean
+rspamd_config_migrate_binary_config(struct rspamd_statfile_config *stcf)
+{
+ if (stcf->class_name != nullptr) {
+ /* Already migrated or using new format */
+ return TRUE;
+ }
+
+ if (stcf->is_spam) {
+ stcf->class_name = g_strdup("spam");
+ msg_info("migrated statfile '%s' from is_spam=true to class='spam'",
+ stcf->symbol ? stcf->symbol : "unknown");
+ }
+ else {
+ stcf->class_name = g_strdup("ham");
+ msg_info("migrated statfile '%s' from is_spam=false to class='ham'",
+ stcf->symbol ? stcf->symbol : "unknown");
+ }
+
+ return TRUE;
+}
+
+gboolean
+rspamd_config_validate_class_config(struct rspamd_classifier_config *ccf, GError **err)
+{
+ GList *cur;
+ GHashTable *seen_classes = nullptr;
+ struct rspamd_statfile_config *stcf;
+ unsigned int class_count = 0;
+
+ if (!ccf || !ccf->statfiles) {
+ g_set_error(err, g_quark_from_static_string("config"), 1,
+ "classifier has no statfiles defined");
+ return FALSE;
+ }
+
+ seen_classes = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, nullptr);
+
+ /* Iterate through statfiles and collect classes */
+ cur = ccf->statfiles;
+ while (cur) {
+ stcf = (struct rspamd_statfile_config *) cur->data;
+
+ /* Migrate binary config if needed */
+ if (!rspamd_config_migrate_binary_config(stcf)) {
+ g_set_error(err, g_quark_from_static_string("config"), 1,
+ "failed to migrate binary config for statfile '%s'",
+ stcf->symbol ? stcf->symbol : "unknown");
+ g_hash_table_destroy(seen_classes);
+ return FALSE;
+ }
+
+ /* Check class name */
+ if (!stcf->class_name || strlen(stcf->class_name) == 0) {
+ g_set_error(err, g_quark_from_static_string("config"), 1,
+ "statfile '%s' has no class defined",
+ stcf->symbol ? stcf->symbol : "unknown");
+ g_hash_table_destroy(seen_classes);
+ return FALSE;
+ }
+
+ /* Track unique classes */
+ if (!g_hash_table_contains(seen_classes, stcf->class_name)) {
+ g_hash_table_insert(seen_classes, g_strdup(stcf->class_name), GINT_TO_POINTER(1));
+ class_count++;
+ }
+
+ cur = g_list_next(cur);
+ }
+
+ /* Validate class count */
+ if (class_count < 2) {
+ g_set_error(err, g_quark_from_static_string("config"), 1,
+ "classifier must have at least 2 classes, found %ud", class_count);
+ g_hash_table_destroy(seen_classes);
+ return FALSE;
+ }
+
+ if (class_count > 20) {
+ msg_warn("classifier has %ud classes, performance may be degraded above 20 classes",
+ class_count);
+ }
+
+ /* Initialize classifier class tracking - only for explicit multiclass configurations */
+ gboolean has_explicit_classes = FALSE;
+
+ /* Check if any statfile uses explicit class declaration (not converted from is_spam) */
+ cur = ccf->statfiles;
+ while (cur) {
+ stcf = (struct rspamd_statfile_config *) cur->data;
+ if (stcf->class_name && !stcf->is_spam_converted) {
+ has_explicit_classes = TRUE;
+ break;
+ }
+ cur = g_list_next(cur);
+ }
+
+ /* Only populate class_names for explicit multiclass configurations */
+ if (has_explicit_classes) {
+ if (ccf->class_names) {
+ g_ptr_array_unref(ccf->class_names);
+ }
+ ccf->class_names = g_ptr_array_new_with_free_func(g_free);
+
+ /* Populate class names array */
+ GHashTableIter iter;
+ gpointer key, value;
+ g_hash_table_iter_init(&iter, seen_classes);
+ while (g_hash_table_iter_next(&iter, &key, &value)) {
+ g_ptr_array_add(ccf->class_names, g_strdup((const char *) key));
+ }
+ }
+ else {
+ /* Binary configuration - ensure class_names is NULL */
+ if (ccf->class_names) {
+ g_ptr_array_unref(ccf->class_names);
+ ccf->class_names = nullptr;
+ }
+ }
+
+ g_hash_table_destroy(seen_classes);
+ return TRUE;
+}
+
+const char *
+rspamd_config_get_class_label(struct rspamd_classifier_config *ccf, const char *class_name)
+{
+ if (!ccf || !ccf->class_labels || !class_name) {
+ return nullptr;
+ }
+
+ return (const char *) g_hash_table_lookup(ccf->class_labels, class_name);
+}