diff options
Diffstat (limited to 'src/libserver/cfg_rcl.cxx')
-rw-r--r-- | src/libserver/cfg_rcl.cxx | 223 |
1 files changed, 208 insertions, 15 deletions
diff --git a/src/libserver/cfg_rcl.cxx b/src/libserver/cfg_rcl.cxx index 79509e12e..da5845917 100644 --- a/src/libserver/cfg_rcl.cxx +++ b/src/libserver/cfg_rcl.cxx @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ #include <algorithm>// for std::transform #include <memory> #include "contrib/ankerl/unordered_dense.h" -#include "fmt/base.h" +#include "contrib/fmt/include/fmt/base.h" #include "libutil/cxx/util.hxx" #include "libutil/cxx/file_util.hxx" #include "frozen/unordered_set.h" @@ -299,6 +299,14 @@ rspamd_rcl_logging_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, cfg->log_flags |= RSPAMD_LOG_FLAG_USEC; } + /* Set default values for new log tag options */ + if (cfg->log_max_tag_len == 0) { + cfg->log_max_tag_len = RSPAMD_LOG_ID_LEN; /* Default to new max size */ + } + if (cfg->log_tag_strip_policy_str == NULL) { + cfg->log_tag_strip_policy_str = rspamd_mempool_strdup(cfg->cfg_pool, "right"); + } + return rspamd_rcl_section_parse_defaults(cfg, *section, cfg->cfg_pool, obj, (void *) cfg, err); } @@ -1189,31 +1197,73 @@ rspamd_rcl_statfile_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, st->opts = (ucl_object_t *) obj; st->clcf = ccf; - const auto *val = ucl_object_lookup(obj, "spam"); - if (val == nullptr) { + /* Handle migration from old 'spam' field to new 'class' field */ + const auto *class_val = ucl_object_lookup(obj, "class"); + const auto *spam_val = ucl_object_lookup(obj, "spam"); + + if (class_val != nullptr && spam_val != nullptr) { + msg_warn_config("statfile %s has both 'class' and 'spam' fields, using 'class' field", + st->symbol); + } + + if (class_val == nullptr && spam_val == nullptr) { + /* Neither field present, try to guess by symbol name */ msg_info_config( - "statfile %s has no explicit 'spam' setting, trying to guess by symbol", + "statfile %s has no explicit 'class' or 'spam' setting, trying to guess by symbol", st->symbol); if (rspamd_substring_search_caseless(st->symbol, strlen(st->symbol), "spam", 4) != -1) { st->is_spam = TRUE; + st->class_name = rspamd_mempool_strdup(pool, "spam"); + st->is_spam_converted = TRUE; } else if (rspamd_substring_search_caseless(st->symbol, strlen(st->symbol), "ham", 3) != -1) { st->is_spam = FALSE; + st->class_name = rspamd_mempool_strdup(pool, "ham"); + st->is_spam_converted = TRUE; } else { g_set_error(err, CFG_RCL_ERROR, EINVAL, - "cannot guess spam setting from %s", + "cannot guess class setting from %s, please specify 'class' field", st->symbol); return FALSE; } - msg_info_config("guessed that statfile with symbol %s is %s", - st->symbol, - st->is_spam ? "spam" : "ham"); + msg_info_config("guessed that statfile with symbol %s has class '%s'", + st->symbol, st->class_name); } + else if (class_val == nullptr && spam_val != nullptr) { + /* Only spam field present - migrate to class */ + msg_warn_config("statfile %s uses deprecated 'spam' field, please use 'class' instead", + st->symbol); + if (st->is_spam) { + st->class_name = rspamd_mempool_strdup(pool, "spam"); + } + else { + st->class_name = rspamd_mempool_strdup(pool, "ham"); + } + st->is_spam_converted = TRUE; + } + else if (class_val != nullptr && spam_val == nullptr) { + /* Only class field present - set is_spam for backward compatibility */ + if (st->class_name != nullptr) { + if (strcmp(st->class_name, "spam") == 0) { + st->is_spam = TRUE; + } + else if (strcmp(st->class_name, "ham") == 0) { + st->is_spam = FALSE; + } + else { + /* For non-binary classes, default to not spam */ + st->is_spam = FALSE; + } + msg_debug_config("statfile %s with class '%s' set is_spam=%s for compatibility", + st->symbol, st->class_name, st->is_spam ? "true" : "false"); + } + } + /* If both fields are present, class takes precedence and was already parsed by the default parser */ return TRUE; } @@ -1221,6 +1271,31 @@ rspamd_rcl_statfile_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, } static gboolean +rspamd_rcl_class_labels_handler(rspamd_mempool_t *pool, + const ucl_object_t *obj, + const char *key, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + auto *ccf = static_cast<rspamd_classifier_config *>(ud); + + if (obj->type != UCL_OBJECT) { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "class_labels must be an object"); + return FALSE; + } + + if (!rspamd_config_parse_class_labels(obj, &ccf->class_labels)) { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "invalid class_labels configuration"); + return FALSE; + } + + return TRUE; +} + +static gboolean rspamd_rcl_classifier_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, const char *key, @@ -1293,6 +1368,22 @@ rspamd_rcl_classifier_handler(rspamd_mempool_t *pool, } } } + else if (g_ascii_strcasecmp(st_key, "class_labels") == 0) { + /* Parse class_labels configuration directly */ + if (ucl_object_type(val) != UCL_OBJECT) { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "class_labels must be an object"); + ucl_object_iterate_free(it); + return FALSE; + } + + if (!rspamd_config_parse_class_labels(val, &ccf->class_labels)) { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "invalid class_labels configuration"); + ucl_object_iterate_free(it); + return FALSE; + } + } } } @@ -1367,8 +1458,80 @@ rspamd_rcl_classifier_handler(rspamd_mempool_t *pool, } ccf->opts = (ucl_object_t *) obj; + + /* Validate multi-class configuration */ + GError *validation_err = nullptr; + if (!rspamd_config_validate_class_config(ccf, &validation_err)) { + if (validation_err) { + g_propagate_error(err, validation_err); + } + else { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "multi-class configuration validation failed for classifier '%s'", + ccf->name ? ccf->name : "unknown"); + } + return FALSE; + } + cfg->classifiers = g_list_prepend(cfg->classifiers, ccf); + /* Populate class_names array from statfiles - only for explicit multiclass configs */ + if (ccf->statfiles) { + GList *cur = ccf->statfiles; + gboolean has_explicit_classes = FALSE; + + /* Check if any statfile uses explicit class declaration (not converted from is_spam) */ + cur = ccf->statfiles; + while (cur) { + struct rspamd_statfile_config *stcf = (struct rspamd_statfile_config *) cur->data; + msg_debug("checking statfile %s: class_name=%s, is_spam_converted=%s", + stcf->symbol, stcf->class_name ? stcf->class_name : "NULL", + stcf->is_spam_converted ? "true" : "false"); + if (stcf->class_name && !stcf->is_spam_converted) { + has_explicit_classes = TRUE; + break; + } + cur = g_list_next(cur); + } + + msg_debug("has_explicit_classes = %s", has_explicit_classes ? "true" : "false"); + + /* Only populate class_names for explicit multiclass configurations */ + if (has_explicit_classes) { + msg_debug("populating class_names for multiclass configuration"); + } + else { + msg_debug("skipping class_names population for binary configuration"); + } + + if (has_explicit_classes) { + ccf->class_names = g_ptr_array_new(); + + cur = ccf->statfiles; + while (cur) { + struct rspamd_statfile_config *stcf = (struct rspamd_statfile_config *) cur->data; + if (stcf->class_name) { + /* Check if class already exists */ + bool found = false; + for (unsigned int i = 0; i < ccf->class_names->len; i++) { + if (strcmp((char *) g_ptr_array_index(ccf->class_names, i), stcf->class_name) == 0) { + stcf->class_index = i; /* Store the index for O(1) lookup */ + found = true; + break; + } + } + + if (!found) { + /* Add new class */ + stcf->class_index = ccf->class_names->len; + g_ptr_array_add(ccf->class_names, g_strdup(stcf->class_name)); + } + } + cur = g_list_next(cur); + } + } + } + return TRUE; } @@ -1700,6 +1863,18 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) G_STRUCT_OFFSET(struct rspamd_config, log_task_max_elts), RSPAMD_CL_FLAG_UINT, "Maximum number of elements in task log entry (7 by default)"); + rspamd_rcl_add_default_handler(sub, + "max_tag_len", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, log_max_tag_len), + RSPAMD_CL_FLAG_UINT, + "Maximum length of log tag cannot exceed 32 (" G_STRINGIFY(RSPAMD_LOG_ID_LEN) ") by default)"); + rspamd_rcl_add_default_handler(sub, + "tag_strip_policy", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, log_tag_strip_policy_str), + 0, + "Log tag strip policy when tag exceeds max length: 'right', 'left', 'middle' (right by default)"); /* Documentation only options, handled in log_handler to map flags */ rspamd_rcl_add_doc_by_path(cfg, @@ -2210,7 +2385,7 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) rspamd_rcl_add_doc_by_path(cfg, "options", - "Swtich mode of gtube patterns: disable, reject, all", + "Switch mode of gtube patterns: disable, reject, all", "gtube_patterns", UCL_STRING, nullptr, @@ -2308,7 +2483,7 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) rspamd_rcl_parse_struct_time, G_STRUCT_OFFSET(struct rspamd_config, upstream_resolve_min_interval), RSPAMD_CL_FLAG_TIME_FLOAT, - "Minumum interval to perform resolving (60 seconds by default)"); + "Minimum interval to perform resolving (60 seconds by default)"); } if (!(skip_sections && g_hash_table_lookup(skip_sections, "actions"))) { @@ -2437,7 +2612,7 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) FALSE, TRUE, cfg->doc_strings, - "CLassifier options"); + "Classifier options"); /* Default classifier is 'bayes' for now */ sub->default_key = "bayes"; @@ -2456,7 +2631,7 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) rspamd_rcl_add_default_handler(sub, "min_prob_strength", rspamd_rcl_parse_struct_double, - G_STRUCT_OFFSET(struct rspamd_classifier_config, min_token_hits), + G_STRUCT_OFFSET(struct rspamd_classifier_config, min_prob_strength), 0, "Use only tokens with probability in [0.5 - MPS, 0.5 + MPS]"); rspamd_rcl_add_default_handler(sub, @@ -2485,6 +2660,18 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) "Name of classifier"); /* + * Multi-class configuration + */ + rspamd_rcl_add_section_doc(&top, sub, + "class_labels", nullptr, + rspamd_rcl_class_labels_handler, + UCL_OBJECT, + FALSE, + TRUE, + sub->doc_ref, + "Class to backend label mapping for multi-class classification"); + + /* * Statfile defaults */ auto *ssub = rspamd_rcl_add_section_doc(&top, sub, @@ -2502,11 +2689,17 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) 0, "Statfile unique label"); rspamd_rcl_add_default_handler(ssub, + "class", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_statfile_config, class_name), + 0, + "Class name for multi-class classification"); + rspamd_rcl_add_default_handler(ssub, "spam", rspamd_rcl_parse_struct_boolean, G_STRUCT_OFFSET(struct rspamd_statfile_config, is_spam), 0, - "Sets if this statfile contains spam samples"); + "DEPRECATED: Sets if this statfile contains spam samples (use 'class' instead)"); } if (!(skip_sections && g_hash_table_lookup(skip_sections, "composite"))) { @@ -3640,7 +3833,7 @@ rspamd_config_parse_ucl(struct rspamd_config *cfg, /* Try to load keyfile if available */ auto keyfile_name = fmt::format("{}.key", filename); rspamd::util::raii_file::open(keyfile_name, O_RDONLY).map([&](const auto &keyfile) { - auto *kp_parser = ucl_parser_new(0); + auto *kp_parser = ucl_parser_new(UCL_PARSER_DEFAULT); if (ucl_parser_add_fd(kp_parser, keyfile.get_fd())) { auto *kp_obj = ucl_parser_get_object(kp_parser); |