diff options
Diffstat (limited to 'src/libserver/cfg_rcl.cxx')
-rw-r--r-- | src/libserver/cfg_rcl.cxx | 193 |
1 files changed, 183 insertions, 10 deletions
diff --git a/src/libserver/cfg_rcl.cxx b/src/libserver/cfg_rcl.cxx index 0a48e8a4f..da5845917 100644 --- a/src/libserver/cfg_rcl.cxx +++ b/src/libserver/cfg_rcl.cxx @@ -1197,31 +1197,73 @@ rspamd_rcl_statfile_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, st->opts = (ucl_object_t *) obj; st->clcf = ccf; - const auto *val = ucl_object_lookup(obj, "spam"); - if (val == nullptr) { + /* Handle migration from old 'spam' field to new 'class' field */ + const auto *class_val = ucl_object_lookup(obj, "class"); + const auto *spam_val = ucl_object_lookup(obj, "spam"); + + if (class_val != nullptr && spam_val != nullptr) { + msg_warn_config("statfile %s has both 'class' and 'spam' fields, using 'class' field", + st->symbol); + } + + if (class_val == nullptr && spam_val == nullptr) { + /* Neither field present, try to guess by symbol name */ msg_info_config( - "statfile %s has no explicit 'spam' setting, trying to guess by symbol", + "statfile %s has no explicit 'class' or 'spam' setting, trying to guess by symbol", st->symbol); if (rspamd_substring_search_caseless(st->symbol, strlen(st->symbol), "spam", 4) != -1) { st->is_spam = TRUE; + st->class_name = rspamd_mempool_strdup(pool, "spam"); + st->is_spam_converted = TRUE; } else if (rspamd_substring_search_caseless(st->symbol, strlen(st->symbol), "ham", 3) != -1) { st->is_spam = FALSE; + st->class_name = rspamd_mempool_strdup(pool, "ham"); + st->is_spam_converted = TRUE; } else { g_set_error(err, CFG_RCL_ERROR, EINVAL, - "cannot guess spam setting from %s", + "cannot guess class setting from %s, please specify 'class' field", st->symbol); return FALSE; } - msg_info_config("guessed that statfile with symbol %s is %s", - st->symbol, - st->is_spam ? "spam" : "ham"); + msg_info_config("guessed that statfile with symbol %s has class '%s'", + st->symbol, st->class_name); + } + else if (class_val == nullptr && spam_val != nullptr) { + /* Only spam field present - migrate to class */ + msg_warn_config("statfile %s uses deprecated 'spam' field, please use 'class' instead", + st->symbol); + if (st->is_spam) { + st->class_name = rspamd_mempool_strdup(pool, "spam"); + } + else { + st->class_name = rspamd_mempool_strdup(pool, "ham"); + } + st->is_spam_converted = TRUE; } + else if (class_val != nullptr && spam_val == nullptr) { + /* Only class field present - set is_spam for backward compatibility */ + if (st->class_name != nullptr) { + if (strcmp(st->class_name, "spam") == 0) { + st->is_spam = TRUE; + } + else if (strcmp(st->class_name, "ham") == 0) { + st->is_spam = FALSE; + } + else { + /* For non-binary classes, default to not spam */ + st->is_spam = FALSE; + } + msg_debug_config("statfile %s with class '%s' set is_spam=%s for compatibility", + st->symbol, st->class_name, st->is_spam ? "true" : "false"); + } + } + /* If both fields are present, class takes precedence and was already parsed by the default parser */ return TRUE; } @@ -1229,6 +1271,31 @@ rspamd_rcl_statfile_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, } static gboolean +rspamd_rcl_class_labels_handler(rspamd_mempool_t *pool, + const ucl_object_t *obj, + const char *key, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + auto *ccf = static_cast<rspamd_classifier_config *>(ud); + + if (obj->type != UCL_OBJECT) { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "class_labels must be an object"); + return FALSE; + } + + if (!rspamd_config_parse_class_labels(obj, &ccf->class_labels)) { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "invalid class_labels configuration"); + return FALSE; + } + + return TRUE; +} + +static gboolean rspamd_rcl_classifier_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, const char *key, @@ -1301,6 +1368,22 @@ rspamd_rcl_classifier_handler(rspamd_mempool_t *pool, } } } + else if (g_ascii_strcasecmp(st_key, "class_labels") == 0) { + /* Parse class_labels configuration directly */ + if (ucl_object_type(val) != UCL_OBJECT) { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "class_labels must be an object"); + ucl_object_iterate_free(it); + return FALSE; + } + + if (!rspamd_config_parse_class_labels(val, &ccf->class_labels)) { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "invalid class_labels configuration"); + ucl_object_iterate_free(it); + return FALSE; + } + } } } @@ -1375,8 +1458,80 @@ rspamd_rcl_classifier_handler(rspamd_mempool_t *pool, } ccf->opts = (ucl_object_t *) obj; + + /* Validate multi-class configuration */ + GError *validation_err = nullptr; + if (!rspamd_config_validate_class_config(ccf, &validation_err)) { + if (validation_err) { + g_propagate_error(err, validation_err); + } + else { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "multi-class configuration validation failed for classifier '%s'", + ccf->name ? ccf->name : "unknown"); + } + return FALSE; + } + cfg->classifiers = g_list_prepend(cfg->classifiers, ccf); + /* Populate class_names array from statfiles - only for explicit multiclass configs */ + if (ccf->statfiles) { + GList *cur = ccf->statfiles; + gboolean has_explicit_classes = FALSE; + + /* Check if any statfile uses explicit class declaration (not converted from is_spam) */ + cur = ccf->statfiles; + while (cur) { + struct rspamd_statfile_config *stcf = (struct rspamd_statfile_config *) cur->data; + msg_debug("checking statfile %s: class_name=%s, is_spam_converted=%s", + stcf->symbol, stcf->class_name ? stcf->class_name : "NULL", + stcf->is_spam_converted ? "true" : "false"); + if (stcf->class_name && !stcf->is_spam_converted) { + has_explicit_classes = TRUE; + break; + } + cur = g_list_next(cur); + } + + msg_debug("has_explicit_classes = %s", has_explicit_classes ? "true" : "false"); + + /* Only populate class_names for explicit multiclass configurations */ + if (has_explicit_classes) { + msg_debug("populating class_names for multiclass configuration"); + } + else { + msg_debug("skipping class_names population for binary configuration"); + } + + if (has_explicit_classes) { + ccf->class_names = g_ptr_array_new(); + + cur = ccf->statfiles; + while (cur) { + struct rspamd_statfile_config *stcf = (struct rspamd_statfile_config *) cur->data; + if (stcf->class_name) { + /* Check if class already exists */ + bool found = false; + for (unsigned int i = 0; i < ccf->class_names->len; i++) { + if (strcmp((char *) g_ptr_array_index(ccf->class_names, i), stcf->class_name) == 0) { + stcf->class_index = i; /* Store the index for O(1) lookup */ + found = true; + break; + } + } + + if (!found) { + /* Add new class */ + stcf->class_index = ccf->class_names->len; + g_ptr_array_add(ccf->class_names, g_strdup(stcf->class_name)); + } + } + cur = g_list_next(cur); + } + } + } + return TRUE; } @@ -2457,7 +2612,7 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) FALSE, TRUE, cfg->doc_strings, - "CLassifier options"); + "Classifier options"); /* Default classifier is 'bayes' for now */ sub->default_key = "bayes"; @@ -2476,7 +2631,7 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) rspamd_rcl_add_default_handler(sub, "min_prob_strength", rspamd_rcl_parse_struct_double, - G_STRUCT_OFFSET(struct rspamd_classifier_config, min_token_hits), + G_STRUCT_OFFSET(struct rspamd_classifier_config, min_prob_strength), 0, "Use only tokens with probability in [0.5 - MPS, 0.5 + MPS]"); rspamd_rcl_add_default_handler(sub, @@ -2505,6 +2660,18 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) "Name of classifier"); /* + * Multi-class configuration + */ + rspamd_rcl_add_section_doc(&top, sub, + "class_labels", nullptr, + rspamd_rcl_class_labels_handler, + UCL_OBJECT, + FALSE, + TRUE, + sub->doc_ref, + "Class to backend label mapping for multi-class classification"); + + /* * Statfile defaults */ auto *ssub = rspamd_rcl_add_section_doc(&top, sub, @@ -2522,11 +2689,17 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) 0, "Statfile unique label"); rspamd_rcl_add_default_handler(ssub, + "class", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_statfile_config, class_name), + 0, + "Class name for multi-class classification"); + rspamd_rcl_add_default_handler(ssub, "spam", rspamd_rcl_parse_struct_boolean, G_STRUCT_OFFSET(struct rspamd_statfile_config, is_spam), 0, - "Sets if this statfile contains spam samples"); + "DEPRECATED: Sets if this statfile contains spam samples (use 'class' instead)"); } if (!(skip_sections && g_hash_table_lookup(skip_sections, "composite"))) { |