diff options
Diffstat (limited to 'src/libserver')
31 files changed, 4565 insertions, 671 deletions
diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h index f59c6ff89..355046cac 100644 --- a/src/libserver/cfg_file.h +++ b/src/libserver/cfg_file.h @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,6 +48,7 @@ struct worker_s; struct rspamd_external_libs_ctx; struct rspamd_cryptobox_pubkey; struct rspamd_dns_resolver; +struct rspamd_tokenizer_manager; /** * Logging type @@ -138,7 +139,10 @@ struct rspamd_statfile_config { char *symbol; /**< symbol of statfile */ char *label; /**< label of this statfile */ ucl_object_t *opts; /**< other options */ - gboolean is_spam; /**< spam flag */ + char *class_name; /**< class name for multi-class classification */ + unsigned int class_index; /**< class index for O(1) lookup during classification */ + gboolean is_spam; /**< DEPRECATED: spam flag - use class_name instead */ + gboolean is_spam_converted; /**< TRUE if class_name was converted from is_spam flag */ struct rspamd_classifier_config *clcf; /**< parent pointer of classifier configuration */ gpointer data; /**< opaque data */ }; @@ -181,6 +185,8 @@ struct rspamd_classifier_config { double min_prob_strength; /**< use only tokens with probability in [0.5 - MPS, 0.5 + MPS] */ unsigned int min_learns; /**< minimum number of learns for each statfile */ unsigned int flags; + GHashTable *class_labels; /**< class_name -> backend_symbol mapping for multi-class */ + GPtrArray *class_names; /**< ordered list of class names */ }; struct rspamd_worker_bind_conf { @@ -395,6 +401,8 @@ struct rspamd_config { unsigned int log_error_elts; /**< number of elements in error logbuf */ unsigned int log_error_elt_maxlen; /**< maximum size of error log element */ unsigned int log_task_max_elts; /**< maximum number of elements in task logging */ + unsigned int log_max_tag_len; /**< maximum length of log 
tag */ + char *log_tag_strip_policy_str; /**< log tag strip policy string */ struct rspamd_worker_log_pipe *log_pipes; gboolean compat_messages; /**< use old messages in the protocol (array) */ @@ -495,9 +503,10 @@ struct rspamd_config { char *zstd_output_dictionary; /**< path to zstd output dictionary */ ucl_object_t *neighbours; /**< other servers in the cluster */ - struct rspamd_config_settings_elt *setting_ids; /**< preprocessed settings ids */ - struct rspamd_lang_detector *lang_det; /**< language detector */ - struct rspamd_worker *cur_worker; /**< set dynamically by each worker */ + struct rspamd_config_settings_elt *setting_ids; /**< preprocessed settings ids */ + struct rspamd_lang_detector *lang_det; /**< language detector */ + struct rspamd_tokenizer_manager *tokenizer_manager; /**< custom tokenizer manager */ + struct rspamd_worker *cur_worker; /**< set dynamically by each worker */ ref_entry_t ref; /**< reference counter */ }; @@ -617,12 +626,25 @@ void rspamd_config_insert_classify_symbols(struct rspamd_config *cfg); */ gboolean rspamd_config_check_statfiles(struct rspamd_classifier_config *cf); -/* - * Find classifier config by name +/** + * Multi-class configuration helpers + */ +gboolean rspamd_config_parse_class_labels(const ucl_object_t *obj, + GHashTable **class_labels); + +gboolean rspamd_config_migrate_binary_config(struct rspamd_statfile_config *stcf); + +gboolean rspamd_config_validate_class_config(struct rspamd_classifier_config *ccf, + GError **err); + +const char *rspamd_config_get_class_label(struct rspamd_classifier_config *ccf, + const char *class_name); + +/** + * Find classifier by name */ struct rspamd_classifier_config *rspamd_config_find_classifier( - struct rspamd_config *cfg, - const char *name); + struct rspamd_config *cfg, const char *name); void rspamd_ucl_add_conf_macros(struct ucl_parser *parser, struct rspamd_config *cfg); diff --git a/src/libserver/cfg_rcl.cxx b/src/libserver/cfg_rcl.cxx index f38366908..da5845917 100644 
--- a/src/libserver/cfg_rcl.cxx +++ b/src/libserver/cfg_rcl.cxx @@ -299,6 +299,14 @@ rspamd_rcl_logging_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, cfg->log_flags |= RSPAMD_LOG_FLAG_USEC; } + /* Set default values for new log tag options */ + if (cfg->log_max_tag_len == 0) { + cfg->log_max_tag_len = RSPAMD_LOG_ID_LEN; /* Default to new max size */ + } + if (cfg->log_tag_strip_policy_str == NULL) { + cfg->log_tag_strip_policy_str = rspamd_mempool_strdup(cfg->cfg_pool, "right"); + } + return rspamd_rcl_section_parse_defaults(cfg, *section, cfg->cfg_pool, obj, (void *) cfg, err); } @@ -1189,31 +1197,73 @@ rspamd_rcl_statfile_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, st->opts = (ucl_object_t *) obj; st->clcf = ccf; - const auto *val = ucl_object_lookup(obj, "spam"); - if (val == nullptr) { + /* Handle migration from old 'spam' field to new 'class' field */ + const auto *class_val = ucl_object_lookup(obj, "class"); + const auto *spam_val = ucl_object_lookup(obj, "spam"); + + if (class_val != nullptr && spam_val != nullptr) { + msg_warn_config("statfile %s has both 'class' and 'spam' fields, using 'class' field", + st->symbol); + } + + if (class_val == nullptr && spam_val == nullptr) { + /* Neither field present, try to guess by symbol name */ msg_info_config( - "statfile %s has no explicit 'spam' setting, trying to guess by symbol", + "statfile %s has no explicit 'class' or 'spam' setting, trying to guess by symbol", st->symbol); if (rspamd_substring_search_caseless(st->symbol, strlen(st->symbol), "spam", 4) != -1) { st->is_spam = TRUE; + st->class_name = rspamd_mempool_strdup(pool, "spam"); + st->is_spam_converted = TRUE; } else if (rspamd_substring_search_caseless(st->symbol, strlen(st->symbol), "ham", 3) != -1) { st->is_spam = FALSE; + st->class_name = rspamd_mempool_strdup(pool, "ham"); + st->is_spam_converted = TRUE; } else { g_set_error(err, CFG_RCL_ERROR, EINVAL, - "cannot guess spam setting from %s", + "cannot guess class setting from 
%s, please specify 'class' field", st->symbol); return FALSE; } - msg_info_config("guessed that statfile with symbol %s is %s", - st->symbol, - st->is_spam ? "spam" : "ham"); + msg_info_config("guessed that statfile with symbol %s has class '%s'", + st->symbol, st->class_name); } + else if (class_val == nullptr && spam_val != nullptr) { + /* Only spam field present - migrate to class */ + msg_warn_config("statfile %s uses deprecated 'spam' field, please use 'class' instead", + st->symbol); + if (st->is_spam) { + st->class_name = rspamd_mempool_strdup(pool, "spam"); + } + else { + st->class_name = rspamd_mempool_strdup(pool, "ham"); + } + st->is_spam_converted = TRUE; + } + else if (class_val != nullptr && spam_val == nullptr) { + /* Only class field present - set is_spam for backward compatibility */ + if (st->class_name != nullptr) { + if (strcmp(st->class_name, "spam") == 0) { + st->is_spam = TRUE; + } + else if (strcmp(st->class_name, "ham") == 0) { + st->is_spam = FALSE; + } + else { + /* For non-binary classes, default to not spam */ + st->is_spam = FALSE; + } + msg_debug_config("statfile %s with class '%s' set is_spam=%s for compatibility", + st->symbol, st->class_name, st->is_spam ? 
"true" : "false"); + } + } + /* If both fields are present, class takes precedence and was already parsed by the default parser */ return TRUE; } @@ -1221,6 +1271,31 @@ rspamd_rcl_statfile_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, } static gboolean +rspamd_rcl_class_labels_handler(rspamd_mempool_t *pool, + const ucl_object_t *obj, + const char *key, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + auto *ccf = static_cast<rspamd_classifier_config *>(ud); + + if (obj->type != UCL_OBJECT) { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "class_labels must be an object"); + return FALSE; + } + + if (!rspamd_config_parse_class_labels(obj, &ccf->class_labels)) { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "invalid class_labels configuration"); + return FALSE; + } + + return TRUE; +} + +static gboolean rspamd_rcl_classifier_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, const char *key, @@ -1293,6 +1368,22 @@ rspamd_rcl_classifier_handler(rspamd_mempool_t *pool, } } } + else if (g_ascii_strcasecmp(st_key, "class_labels") == 0) { + /* Parse class_labels configuration directly */ + if (ucl_object_type(val) != UCL_OBJECT) { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "class_labels must be an object"); + ucl_object_iterate_free(it); + return FALSE; + } + + if (!rspamd_config_parse_class_labels(val, &ccf->class_labels)) { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "invalid class_labels configuration"); + ucl_object_iterate_free(it); + return FALSE; + } + } } } @@ -1367,8 +1458,80 @@ rspamd_rcl_classifier_handler(rspamd_mempool_t *pool, } ccf->opts = (ucl_object_t *) obj; + + /* Validate multi-class configuration */ + GError *validation_err = nullptr; + if (!rspamd_config_validate_class_config(ccf, &validation_err)) { + if (validation_err) { + g_propagate_error(err, validation_err); + } + else { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "multi-class configuration validation failed for classifier '%s'", + ccf->name ? 
ccf->name : "unknown"); + } + return FALSE; + } + cfg->classifiers = g_list_prepend(cfg->classifiers, ccf); + /* Populate class_names array from statfiles - only for explicit multiclass configs */ + if (ccf->statfiles) { + GList *cur = ccf->statfiles; + gboolean has_explicit_classes = FALSE; + + /* Check if any statfile uses explicit class declaration (not converted from is_spam) */ + cur = ccf->statfiles; + while (cur) { + struct rspamd_statfile_config *stcf = (struct rspamd_statfile_config *) cur->data; + msg_debug("checking statfile %s: class_name=%s, is_spam_converted=%s", + stcf->symbol, stcf->class_name ? stcf->class_name : "NULL", + stcf->is_spam_converted ? "true" : "false"); + if (stcf->class_name && !stcf->is_spam_converted) { + has_explicit_classes = TRUE; + break; + } + cur = g_list_next(cur); + } + + msg_debug("has_explicit_classes = %s", has_explicit_classes ? "true" : "false"); + + /* Only populate class_names for explicit multiclass configurations */ + if (has_explicit_classes) { + msg_debug("populating class_names for multiclass configuration"); + } + else { + msg_debug("skipping class_names population for binary configuration"); + } + + if (has_explicit_classes) { + ccf->class_names = g_ptr_array_new(); + + cur = ccf->statfiles; + while (cur) { + struct rspamd_statfile_config *stcf = (struct rspamd_statfile_config *) cur->data; + if (stcf->class_name) { + /* Check if class already exists */ + bool found = false; + for (unsigned int i = 0; i < ccf->class_names->len; i++) { + if (strcmp((char *) g_ptr_array_index(ccf->class_names, i), stcf->class_name) == 0) { + stcf->class_index = i; /* Store the index for O(1) lookup */ + found = true; + break; + } + } + + if (!found) { + /* Add new class */ + stcf->class_index = ccf->class_names->len; + g_ptr_array_add(ccf->class_names, g_strdup(stcf->class_name)); + } + } + cur = g_list_next(cur); + } + } + } + return TRUE; } @@ -1700,6 +1863,18 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable 
*skip_sections) G_STRUCT_OFFSET(struct rspamd_config, log_task_max_elts), RSPAMD_CL_FLAG_UINT, "Maximum number of elements in task log entry (7 by default)"); + rspamd_rcl_add_default_handler(sub, + "max_tag_len", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, log_max_tag_len), + RSPAMD_CL_FLAG_UINT, + "Maximum length of log tag cannot exceed 32 (" G_STRINGIFY(RSPAMD_LOG_ID_LEN) ") by default)"); + rspamd_rcl_add_default_handler(sub, + "tag_strip_policy", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, log_tag_strip_policy_str), + 0, + "Log tag strip policy when tag exceeds max length: 'right', 'left', 'middle' (right by default)"); /* Documentation only options, handled in log_handler to map flags */ rspamd_rcl_add_doc_by_path(cfg, @@ -2437,7 +2612,7 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) FALSE, TRUE, cfg->doc_strings, - "CLassifier options"); + "Classifier options"); /* Default classifier is 'bayes' for now */ sub->default_key = "bayes"; @@ -2456,7 +2631,7 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) rspamd_rcl_add_default_handler(sub, "min_prob_strength", rspamd_rcl_parse_struct_double, - G_STRUCT_OFFSET(struct rspamd_classifier_config, min_token_hits), + G_STRUCT_OFFSET(struct rspamd_classifier_config, min_prob_strength), 0, "Use only tokens with probability in [0.5 - MPS, 0.5 + MPS]"); rspamd_rcl_add_default_handler(sub, @@ -2485,6 +2660,18 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) "Name of classifier"); /* + * Multi-class configuration + */ + rspamd_rcl_add_section_doc(&top, sub, + "class_labels", nullptr, + rspamd_rcl_class_labels_handler, + UCL_OBJECT, + FALSE, + TRUE, + sub->doc_ref, + "Class to backend label mapping for multi-class classification"); + + /* * Statfile defaults */ auto *ssub = rspamd_rcl_add_section_doc(&top, sub, @@ -2502,11 +2689,17 @@ rspamd_rcl_config_init(struct 
rspamd_config *cfg, GHashTable *skip_sections) 0, "Statfile unique label"); rspamd_rcl_add_default_handler(ssub, + "class", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_statfile_config, class_name), + 0, + "Class name for multi-class classification"); + rspamd_rcl_add_default_handler(ssub, "spam", rspamd_rcl_parse_struct_boolean, G_STRUCT_OFFSET(struct rspamd_statfile_config, is_spam), 0, - "Sets if this statfile contains spam samples"); + "DEPRECATED: Sets if this statfile contains spam samples (use 'class' instead)"); } if (!(skip_sections && g_hash_table_lookup(skip_sections, "composite"))) { @@ -3640,7 +3833,7 @@ rspamd_config_parse_ucl(struct rspamd_config *cfg, /* Try to load keyfile if available */ auto keyfile_name = fmt::format("{}.key", filename); rspamd::util::raii_file::open(keyfile_name, O_RDONLY).map([&](const auto &keyfile) { - auto *kp_parser = ucl_parser_new(0); + auto *kp_parser = ucl_parser_new(UCL_PARSER_DEFAULT); if (ucl_parser_add_fd(kp_parser, keyfile.get_fd())) { auto *kp_obj = ucl_parser_get_object(kp_parser); diff --git a/src/libserver/cfg_utils.cxx b/src/libserver/cfg_utils.cxx index dfbdc6bee..c22a9b877 100644 --- a/src/libserver/cfg_utils.cxx +++ b/src/libserver/cfg_utils.cxx @@ -72,6 +72,11 @@ #include "contrib/expected/expected.hpp" #include "contrib/ankerl/unordered_dense.h" +#include "libserver/task.h" +#include "libserver/url.h" +#define RSPAMD_TOKENIZER_INTERNAL// We need to use internal tokenizer API +#include "libstat/tokenizers/custom_tokenizer.h" + #define DEFAULT_SCORE 10.0 #define DEFAULT_RLIMIT_NOFILE 2048 @@ -821,6 +826,65 @@ rspamd_adjust_clocks_resolution(struct rspamd_config *cfg) #endif } +extern "C" { + +gboolean +rspamd_config_load_custom_tokenizers(struct rspamd_config *cfg, GError **err) +{ + /* Load custom tokenizers */ + const ucl_object_t *custom_tokenizers = ucl_object_lookup_path(cfg->cfg_ucl_obj, + "options.custom_tokenizers"); + if (custom_tokenizers != NULL) { + msg_info_config("loading 
custom tokenizers"); + + if (!cfg->tokenizer_manager) { + cfg->tokenizer_manager = rspamd_tokenizer_manager_new(cfg->cfg_pool); + } + + ucl_object_iter_t it = ucl_object_iterate_new(custom_tokenizers); + const ucl_object_t *tok_obj; + const char *tok_name; + + while ((tok_obj = ucl_object_iterate_safe(it, true)) != NULL) { + tok_name = ucl_object_key(tok_obj); + GError *local_err = NULL; + + if (!rspamd_tokenizer_manager_load_tokenizer(cfg->tokenizer_manager, + tok_name, tok_obj, &local_err)) { + msg_err_config("failed to load custom tokenizer '%s': %s", + tok_name, local_err ? local_err->message : "unknown error"); + + if (err && !*err) { + *err = g_error_copy(local_err); + } + + if (local_err) { + g_error_free(local_err); + } + + ucl_object_iterate_free(it); + return FALSE; + } + } + ucl_object_iterate_free(it); + + msg_info_config("loaded custom tokenizers successfully"); + } + + return TRUE; +} + +void rspamd_config_unload_custom_tokenizers(struct rspamd_config *cfg) +{ + if (cfg->tokenizer_manager) { + msg_info_config("unloading custom tokenizers"); + rspamd_tokenizer_manager_destroy(cfg->tokenizer_manager); + cfg->tokenizer_manager = NULL; + } +} + +}// extern "C" + /* * Perform post load actions */ @@ -940,6 +1004,20 @@ rspamd_config_post_load(struct rspamd_config *cfg, msg_err_config("cannot configure libraries, fatal error"); return FALSE; } + + /* Load custom tokenizers using the new function */ + GError *tokenizer_err = NULL; + if (!rspamd_config_load_custom_tokenizers(cfg, &tokenizer_err)) { + msg_err_config("failed to load custom tokenizers: %s", + tokenizer_err ? 
tokenizer_err->message : "unknown error"); + if (tokenizer_err) { + g_error_free(tokenizer_err); + } + + if (opts & RSPAMD_CONFIG_INIT_VALIDATE) { + ret = tl::make_unexpected(std::string{"failed to load custom tokenizers"}); + } + } } /* Validate cache */ @@ -1363,7 +1441,7 @@ rspamd_ucl_fin_cb(struct map_cb_data *data, void **target) } /* New data available */ - auto *parser = ucl_parser_new(0); + auto *parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS); if (!ucl_parser_add_chunk(parser, (unsigned char *) cbdata->buf.data(), cbdata->buf.size())) { msg_err_config("cannot parse map %s: %s", @@ -2964,3 +3042,189 @@ rspamd_ip_is_local_cfg(struct rspamd_config *cfg, return FALSE; } + +gboolean +rspamd_config_parse_class_labels(const ucl_object_t *obj, GHashTable **class_labels) +{ + const ucl_object_t *cur; + ucl_object_iter_t it = nullptr; + + if (!obj || ucl_object_type(obj) != UCL_OBJECT) { + return FALSE; + } + + if (*class_labels == nullptr) { + *class_labels = g_hash_table_new_full(g_str_hash, g_str_equal, + g_free, g_free); + } + + while ((cur = ucl_object_iterate(obj, &it, true)) != nullptr) { + const char *class_name = ucl_object_key(cur); + const char *label = ucl_object_tostring(cur); + + if (class_name && label) { + /* Validate class name: alphanumeric + underscore, max 32 chars */ + if (strlen(class_name) > 32) { + msg_err("class name '%s' is too long (max 32 characters)", class_name); + g_hash_table_destroy(*class_labels); + *class_labels = nullptr; + return FALSE; + } + + for (const char *p = class_name; *p; p++) { + if (!g_ascii_isalnum(*p) && *p != '_') { + msg_err("class name '%s' contains invalid character '%c'", class_name, *p); + g_hash_table_destroy(*class_labels); + *class_labels = nullptr; + return FALSE; + } + } + + /* Validate label uniqueness */ + if (g_hash_table_lookup(*class_labels, label)) { + msg_err("backend label '%s' is used by multiple classes", label); + g_hash_table_destroy(*class_labels); + *class_labels = nullptr; + return FALSE; + } 
+ } + + g_hash_table_insert(*class_labels, g_strdup(class_name), g_strdup(label)); + } + + return g_hash_table_size(*class_labels) > 0; +} + +gboolean +rspamd_config_migrate_binary_config(struct rspamd_statfile_config *stcf) +{ + if (stcf->class_name != nullptr) { + /* Already migrated or using new format */ + return TRUE; + } + + if (stcf->is_spam) { + stcf->class_name = g_strdup("spam"); + msg_info("migrated statfile '%s' from is_spam=true to class='spam'", + stcf->symbol ? stcf->symbol : "unknown"); + } + else { + stcf->class_name = g_strdup("ham"); + msg_info("migrated statfile '%s' from is_spam=false to class='ham'", + stcf->symbol ? stcf->symbol : "unknown"); + } + + return TRUE; +} + +gboolean +rspamd_config_validate_class_config(struct rspamd_classifier_config *ccf, GError **err) +{ + GList *cur; + GHashTable *seen_classes = nullptr; + struct rspamd_statfile_config *stcf; + unsigned int class_count = 0; + + if (!ccf || !ccf->statfiles) { + g_set_error(err, g_quark_from_static_string("config"), 1, + "classifier has no statfiles defined"); + return FALSE; + } + + seen_classes = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, nullptr); + + /* Iterate through statfiles and collect classes */ + cur = ccf->statfiles; + while (cur) { + stcf = (struct rspamd_statfile_config *) cur->data; + + /* Migrate binary config if needed */ + if (!rspamd_config_migrate_binary_config(stcf)) { + g_set_error(err, g_quark_from_static_string("config"), 1, + "failed to migrate binary config for statfile '%s'", + stcf->symbol ? stcf->symbol : "unknown"); + g_hash_table_destroy(seen_classes); + return FALSE; + } + + /* Check class name */ + if (!stcf->class_name || strlen(stcf->class_name) == 0) { + g_set_error(err, g_quark_from_static_string("config"), 1, + "statfile '%s' has no class defined", + stcf->symbol ? 
stcf->symbol : "unknown"); + g_hash_table_destroy(seen_classes); + return FALSE; + } + + /* Track unique classes */ + if (!g_hash_table_contains(seen_classes, stcf->class_name)) { + g_hash_table_insert(seen_classes, g_strdup(stcf->class_name), GINT_TO_POINTER(1)); + class_count++; + } + + cur = g_list_next(cur); + } + + /* Validate class count */ + if (class_count < 2) { + g_set_error(err, g_quark_from_static_string("config"), 1, + "classifier must have at least 2 classes, found %ud", class_count); + g_hash_table_destroy(seen_classes); + return FALSE; + } + + if (class_count > 20) { + msg_warn("classifier has %ud classes, performance may be degraded above 20 classes", + class_count); + } + + /* Initialize classifier class tracking - only for explicit multiclass configurations */ + gboolean has_explicit_classes = FALSE; + + /* Check if any statfile uses explicit class declaration (not converted from is_spam) */ + cur = ccf->statfiles; + while (cur) { + stcf = (struct rspamd_statfile_config *) cur->data; + if (stcf->class_name && !stcf->is_spam_converted) { + has_explicit_classes = TRUE; + break; + } + cur = g_list_next(cur); + } + + /* Only populate class_names for explicit multiclass configurations */ + if (has_explicit_classes) { + if (ccf->class_names) { + g_ptr_array_unref(ccf->class_names); + } + ccf->class_names = g_ptr_array_new_with_free_func(g_free); + + /* Populate class names array */ + GHashTableIter iter; + gpointer key, value; + g_hash_table_iter_init(&iter, seen_classes); + while (g_hash_table_iter_next(&iter, &key, &value)) { + g_ptr_array_add(ccf->class_names, g_strdup((const char *) key)); + } + } + else { + /* Binary configuration - ensure class_names is NULL */ + if (ccf->class_names) { + g_ptr_array_unref(ccf->class_names); + ccf->class_names = nullptr; + } + } + + g_hash_table_destroy(seen_classes); + return TRUE; +} + +const char * +rspamd_config_get_class_label(struct rspamd_classifier_config *ccf, const char *class_name) +{ + if (!ccf || 
!ccf->class_labels || !class_name) { + return nullptr; + } + + return (const char *) g_hash_table_lookup(ccf->class_labels, class_name); +} diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css.cxx index 1b369ed17..c53e3c05e 100644 --- a/src/libserver/css/css.cxx +++ b/src/libserver/css/css.cxx @@ -1,11 +1,11 @@ -/*- - * Copyright 2021 Vsevolod Stakhov +/* + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -107,7 +107,6 @@ auto css_style_sheet::add_selector_rule(std::unique_ptr<css_selector> &&selector auto css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> rspamd::html::html_block * { - std::optional<std::string_view> id_comp, class_comp; rspamd::html::html_block *res = nullptr; if (!tag) { @@ -115,14 +114,8 @@ auto css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> rspa } /* First, find id in a tag and a class */ - for (const auto &param: tag->components) { - if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_ID) { - id_comp = param.value; - } - else if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_CLASS) { - class_comp = param.value; - } - } + auto id_comp = tag->find_id(); + auto class_comp = tag->find_class(); /* ID part */ if (id_comp && !pimpl->id_selectors.empty()) { @@ -224,4 +217,4 @@ auto css_parse_style(rspamd_mempool_t *pool, return std::make_pair(nullptr, parse_res.error()); } -}// namespace rspamd::css
\ No newline at end of file +}// namespace rspamd::css diff --git a/src/libserver/dynamic_cfg.c b/src/libserver/dynamic_cfg.c index 984517074..6d648d745 100644 --- a/src/libserver/dynamic_cfg.c +++ b/src/libserver/dynamic_cfg.c @@ -1,5 +1,5 @@ /* - * Copyright 2023 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -195,7 +195,7 @@ json_config_fin_cb(struct map_cb_data *data, void **target) return; } - parser = ucl_parser_new(0); + parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS); if (!ucl_parser_add_chunk(parser, jb->buf->str, jb->buf->len)) { msg_err("cannot load json data: parse error %s", diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 93d1fdf91..78a6a975c 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -39,6 +39,7 @@ #include "contrib/frozen/include/frozen/string.h" #include "contrib/fmt/include/fmt/core.h" +#include <functional> #include <unicode/uversion.h> namespace rspamd::html { @@ -47,23 +48,88 @@ static const unsigned int max_tags = 8192; /* Ignore tags if this maximum is rea static const html_tags_storage html_tags_defs; -auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_type>( +auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_enum_type>( { - {"name", html_component_type::RSPAMD_HTML_COMPONENT_NAME}, - {"href", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, - {"src", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, - {"action", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, - {"color", html_component_type::RSPAMD_HTML_COMPONENT_COLOR}, - {"bgcolor", html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR}, - {"style", html_component_type::RSPAMD_HTML_COMPONENT_STYLE}, - {"class", html_component_type::RSPAMD_HTML_COMPONENT_CLASS}, - {"width", 
html_component_type::RSPAMD_HTML_COMPONENT_WIDTH}, - {"height", html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT}, - {"size", html_component_type::RSPAMD_HTML_COMPONENT_SIZE}, - {"rel", html_component_type::RSPAMD_HTML_COMPONENT_REL}, - {"alt", html_component_type::RSPAMD_HTML_COMPONENT_ALT}, - {"id", html_component_type::RSPAMD_HTML_COMPONENT_ID}, - {"hidden", html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN}, + {"name", html_component_enum_type::RSPAMD_HTML_COMPONENT_NAME}, + {"href", html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF}, + {"src", html_component_enum_type::RSPAMD_HTML_COMPONENT_SRC}, + {"action", html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF}, + {"color", html_component_enum_type::RSPAMD_HTML_COMPONENT_COLOR}, + {"bgcolor", html_component_enum_type::RSPAMD_HTML_COMPONENT_BGCOLOR}, + {"style", html_component_enum_type::RSPAMD_HTML_COMPONENT_STYLE}, + {"class", html_component_enum_type::RSPAMD_HTML_COMPONENT_CLASS}, + {"width", html_component_enum_type::RSPAMD_HTML_COMPONENT_WIDTH}, + {"height", html_component_enum_type::RSPAMD_HTML_COMPONENT_HEIGHT}, + {"size", html_component_enum_type::RSPAMD_HTML_COMPONENT_SIZE}, + {"rel", html_component_enum_type::RSPAMD_HTML_COMPONENT_REL}, + {"alt", html_component_enum_type::RSPAMD_HTML_COMPONENT_ALT}, + {"id", html_component_enum_type::RSPAMD_HTML_COMPONENT_ID}, + {"hidden", html_component_enum_type::RSPAMD_HTML_COMPONENT_HIDDEN}, + // Typography + {"font-family", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_FAMILY}, + {"font-size", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_SIZE}, + {"font-weight", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_WEIGHT}, + {"font-style", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_STYLE}, + {"text-align", html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_ALIGN}, + {"text-decoration", html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_DECORATION}, + {"line-height", html_component_enum_type::RSPAMD_HTML_COMPONENT_LINE_HEIGHT}, + 
// Layout & positioning + {"margin", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN}, + {"margin-top", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_TOP}, + {"margin-bottom", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_BOTTOM}, + {"margin-left", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_LEFT}, + {"margin-right", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_RIGHT}, + {"padding", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING}, + {"padding-top", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_TOP}, + {"padding-bottom", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_BOTTOM}, + {"padding-left", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_LEFT}, + {"padding-right", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_RIGHT}, + {"border", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER}, + {"border-color", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_COLOR}, + {"border-width", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_WIDTH}, + {"border-style", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_STYLE}, + // Display & visibility + {"display", html_component_enum_type::RSPAMD_HTML_COMPONENT_DISPLAY}, + {"visibility", html_component_enum_type::RSPAMD_HTML_COMPONENT_VISIBILITY}, + {"opacity", html_component_enum_type::RSPAMD_HTML_COMPONENT_OPACITY}, + // Dimensions + {"min-width", html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_WIDTH}, + {"max-width", html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_WIDTH}, + {"min-height", html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_HEIGHT}, + {"max-height", html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_HEIGHT}, + // Table attributes + {"cellpadding", html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLPADDING}, + {"cellspacing", html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLSPACING}, + {"valign", html_component_enum_type::RSPAMD_HTML_COMPONENT_VALIGN}, + {"align", 
html_component_enum_type::RSPAMD_HTML_COMPONENT_ALIGN}, + // Form attributes + {"type", html_component_enum_type::RSPAMD_HTML_COMPONENT_TYPE}, + {"value", html_component_enum_type::RSPAMD_HTML_COMPONENT_VALUE}, + {"placeholder", html_component_enum_type::RSPAMD_HTML_COMPONENT_PLACEHOLDER}, + {"disabled", html_component_enum_type::RSPAMD_HTML_COMPONENT_DISABLED}, + {"readonly", html_component_enum_type::RSPAMD_HTML_COMPONENT_READONLY}, + {"checked", html_component_enum_type::RSPAMD_HTML_COMPONENT_CHECKED}, + {"selected", html_component_enum_type::RSPAMD_HTML_COMPONENT_SELECTED}, + // Link & media + {"target", html_component_enum_type::RSPAMD_HTML_COMPONENT_TARGET}, + {"title", html_component_enum_type::RSPAMD_HTML_COMPONENT_TITLE}, + // Meta & document + {"charset", html_component_enum_type::RSPAMD_HTML_COMPONENT_CHARSET}, + {"content", html_component_enum_type::RSPAMD_HTML_COMPONENT_CONTENT}, + {"http-equiv", html_component_enum_type::RSPAMD_HTML_COMPONENT_HTTP_EQUIV}, + // Accessibility + {"role", html_component_enum_type::RSPAMD_HTML_COMPONENT_ROLE}, + {"tabindex", html_component_enum_type::RSPAMD_HTML_COMPONENT_TABINDEX}, + // Background + {"background", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND}, + {"background-image", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_IMAGE}, + {"background-color", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_COLOR}, + {"background-repeat", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_REPEAT}, + {"background-position", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_POSITION}, + // Email-specific tracking + {"data-track", html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_TRACK}, + {"data-id", html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_ID}, + {"data-url", html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_URL}, }); #define msg_debug_html(...) 
rspamd_conditional_debug_fast(NULL, NULL, \ @@ -199,18 +265,608 @@ html_check_balance(struct html_content *hc, return nullptr; } -auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type> +auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component { - auto known_component_it = html_components_map.find(st); + auto known_component_it = html_components_map.find(name); if (known_component_it != html_components_map.end()) { - return known_component_it->second; + switch (known_component_it->second) { + case html_component_enum_type::RSPAMD_HTML_COMPONENT_NAME: + return html_component_name{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF: + return html_component_href{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_COLOR: + return html_component_color{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BGCOLOR: + return html_component_bgcolor{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_STYLE: + return html_component_style{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CLASS: + return html_component_class{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_WIDTH: + return html_component_width{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_HEIGHT: + return html_component_height{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_SIZE: + return html_component_size{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_REL: + return html_component_rel{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_ALT: + return html_component_alt{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_ID: + return html_component_id{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_HIDDEN: + return html_component_hidden{}; + // Typography + case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_FAMILY: + return html_component_font_family{value}; + case 
html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_SIZE: + return html_component_font_size{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_WEIGHT: + return html_component_font_weight{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_STYLE: + return html_component_font_style{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_ALIGN: + return html_component_text_align{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_DECORATION: + return html_component_text_decoration{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_LINE_HEIGHT: + return html_component_line_height{value}; + // Layout + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN: + return html_component_margin{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_TOP: + return html_component_margin_top{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_BOTTOM: + return html_component_margin_bottom{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_LEFT: + return html_component_margin_left{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_RIGHT: + return html_component_margin_right{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING: + return html_component_padding{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_TOP: + return html_component_padding_top{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_BOTTOM: + return html_component_padding_bottom{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_LEFT: + return html_component_padding_left{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_RIGHT: + return html_component_padding_right{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER: + return html_component_border{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_COLOR: + return 
html_component_border_color{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_WIDTH: + return html_component_border_width{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_STYLE: + return html_component_border_style{value}; + // Display + case html_component_enum_type::RSPAMD_HTML_COMPONENT_DISPLAY: + return html_component_display{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_VISIBILITY: + return html_component_visibility{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_OPACITY: + return html_component_opacity{value}; + // Dimensions + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_WIDTH: + return html_component_min_width{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_WIDTH: + return html_component_max_width{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_HEIGHT: + return html_component_min_height{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_HEIGHT: + return html_component_max_height{value}; + // Table + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLPADDING: + return html_component_cellpadding{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLSPACING: + return html_component_cellspacing{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_VALIGN: + return html_component_valign{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_ALIGN: + return html_component_align{value}; + // Form + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TYPE: + return html_component_type{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_VALUE: + return html_component_value{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PLACEHOLDER: + return html_component_placeholder{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_DISABLED: + return html_component_disabled{}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_READONLY: + return 
html_component_readonly{}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CHECKED: + return html_component_checked{}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_SELECTED: + return html_component_selected{}; + // Link & media + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TARGET: + return html_component_target{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TITLE: + return html_component_title{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_SRC: + return html_component_src{value}; + // Meta + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CHARSET: + return html_component_charset{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CONTENT: + return html_component_content{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_HTTP_EQUIV: + return html_component_http_equiv{value}; + // Accessibility + case html_component_enum_type::RSPAMD_HTML_COMPONENT_ROLE: + return html_component_role{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TABINDEX: + return html_component_tabindex{value}; + // Background + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND: + return html_component_background{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_IMAGE: + return html_component_background_image{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_COLOR: + return html_component_background_color{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_REPEAT: + return html_component_background_repeat{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_POSITION: + return html_component_background_position{value}; + // Email tracking + case html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_TRACK: + return html_component_data_track{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_ID: + return html_component_data_id{value}; + case 
html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_URL: + return html_component_data_url{value}; + default: + return html_component_unknown{name, value}; + } } else { - return std::nullopt; + return html_component_unknown{name, value}; } } +using component_extractor_func = std::function<std::optional<std::string_view>(const html_tag *)>; +static const auto component_extractors = frozen::make_unordered_map<frozen::string, component_extractor_func>( + { + // Basic components + {"name", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_name>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"href", [](const html_tag *tag) { return tag->find_href(); }}, + {"src", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_src>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"class", [](const html_tag *tag) { return tag->find_class(); }}, + {"id", [](const html_tag *tag) { return tag->find_id(); }}, + {"style", [](const html_tag *tag) { return tag->find_style(); }}, + {"alt", [](const html_tag *tag) { return tag->find_alt(); }}, + {"rel", [](const html_tag *tag) { return tag->find_rel(); }}, + {"color", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_color>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"bgcolor", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_bgcolor>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Numeric components (return string representation) + {"width", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_width>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"height", [](const html_tag *tag) -> 
std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_height>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"size", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_size>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + + // Boolean components + {"hidden", [](const html_tag *tag) -> std::optional<std::string_view> { + return tag->is_hidden() ? std::optional<std::string_view>{"true"} : std::nullopt; + }}, + + // Typography components + {"font-family", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_font_family>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"font-size", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_font_size>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"font-weight", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_font_weight>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"font-style", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_font_style>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"text-align", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_text_align>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"text-decoration", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_text_decoration>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"line-height", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = 
tag->find_component<html_component_line_height>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + + // Layout components + {"margin", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_margin>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"margin-top", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_margin_top>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"margin-bottom", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_margin_bottom>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"margin-left", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_margin_left>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"margin-right", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_margin_right>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"padding", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_padding>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"padding-top", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_padding_top>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"padding-bottom", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_padding_bottom>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"padding-left", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_padding_left>()) 
{ + return comp.value()->value; + } + return std::nullopt; + }}, + {"padding-right", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_padding_right>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"border", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_border>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"border-color", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_border_color>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"border-width", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_border_width>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"border-style", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_border_style>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Display components + {"display", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_display>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"visibility", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_visibility>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"opacity", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_opacity>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + + // Additional dimensions + {"min-width", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_min_width>()) { + return 
comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"max-width", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_max_width>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"min-height", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_min_height>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"max-height", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_max_height>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + + // Table components + {"cellpadding", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_cellpadding>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"cellspacing", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_cellspacing>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"valign", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_valign>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"align", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_align>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Form components + {"type", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_type>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"value", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_value>()) { + return 
comp.value()->value; + } + return std::nullopt; + }}, + {"placeholder", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_placeholder>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"disabled", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_disabled>()) { + return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt; + } + return std::nullopt; + }}, + {"readonly", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_readonly>()) { + return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt; + } + return std::nullopt; + }}, + {"checked", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_checked>()) { + return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt; + } + return std::nullopt; + }}, + {"selected", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_selected>()) { + return comp.value()->is_present() ? 
std::optional<std::string_view>{"true"} : std::nullopt; + } + return std::nullopt; + }}, + + // Link & media components + {"target", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_target>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"title", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_title>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Meta components + {"charset", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_charset>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"content", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_content>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"http-equiv", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_http_equiv>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Accessibility components + {"role", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_role>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"tabindex", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_tabindex>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + + // Background components + {"background", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_background>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"background-image", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = 
tag->find_component<html_component_background_image>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"background-color", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_background_color>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"background-repeat", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_background_repeat>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"background-position", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_background_position>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Email tracking components + {"data-track", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_data_track>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"data-id", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_data_id>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"data-url", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_data_url>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + }); + +auto html_tag::find_component_by_name(std::string_view attr_name) const -> std::optional<std::string_view> +{ + auto it = component_extractors.find(attr_name); + if (it != component_extractors.end()) { + return it->second(this); + } + + // Fallback to unknown components + return find_unknown_component(attr_name); +} + +auto html_tag::get_all_attributes() const -> std::vector<std::pair<std::string_view, std::string_view>> +{ + std::vector<std::pair<std::string_view, std::string_view>> result; + + // First, get all known attributes 
using the component_extractors map + for (const auto &[attr_name, extractor_func]: component_extractors) { + if (auto value = extractor_func(this)) { + // Convert frozen::string to std::string_view for the key + std::string_view name_view{attr_name.data(), attr_name.size()}; + result.emplace_back(name_view, value.value()); + } + } + + // Then add all unknown attributes + auto unknown_attrs = get_unknown_components(); + for (const auto &[name, value]: unknown_attrs) { + result.emplace_back(name, value); + } + + return result; +} + enum tag_parser_state { parse_start = 0, parse_name, @@ -234,13 +890,13 @@ enum tag_parser_state { struct tag_content_parser_state { tag_parser_state cur_state = parse_start; std::string buf; - std::optional<html_component_type> cur_component; + std::string attr_name;// Store current attribute name void reset() { cur_state = parse_start; buf.clear(); - cur_component = std::nullopt; + attr_name.clear(); } }; @@ -254,56 +910,50 @@ html_parse_tag_content(rspamd_mempool_t *pool, auto state = parser_env.cur_state; /* - * Stores tag component if it doesn't exist, performing copy of the - * value + decoding of the entities - * Parser env is set to clear the current html attribute fields (saved_p and - * cur_component) + * Stores tag component creating the appropriate variant type + * Parser env is cleared after storing */ auto store_component_value = [&]() -> void { - if (parser_env.cur_component) { + if (!parser_env.attr_name.empty()) { + std::string_view attr_name_view, value_view; - if (parser_env.buf.empty()) { - tag->components.emplace_back(parser_env.cur_component.value(), - std::string_view{}); + // Store attribute name in persistent memory + if (!parser_env.attr_name.empty()) { + auto *name_storage = rspamd_mempool_alloc_buffer(pool, parser_env.attr_name.size()); + memcpy(name_storage, parser_env.attr_name.data(), parser_env.attr_name.size()); + attr_name_view = {name_storage, parser_env.attr_name.size()}; } - else { - /* We need to copy 
buf to a persistent storage */ - auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size()); - if (parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_ID || - parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) { - /* Lowercase */ - rspamd_str_copy_lc(parser_env.buf.data(), s, parser_env.buf.size()); + // Store value in persistent memory if not empty + if (!parser_env.buf.empty()) { + auto *value_storage = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size()); + + // Lowercase for id and class attributes + if (parser_env.attr_name == "id" || parser_env.attr_name == "class") { + rspamd_str_copy_lc(parser_env.buf.data(), value_storage, parser_env.buf.size()); } else { - memcpy(s, parser_env.buf.data(), parser_env.buf.size()); + memcpy(value_storage, parser_env.buf.data(), parser_env.buf.size()); } - auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size()); - tag->components.emplace_back(parser_env.cur_component.value(), - std::string_view{s, sz}); + auto sz = rspamd_html_decode_entitles_inplace(value_storage, parser_env.buf.size()); + value_view = {value_storage, sz}; } + + // Create the appropriate component variant + auto component = html_component_from_string(attr_name_view, value_view); + tag->components.emplace_back(std::move(component)); } parser_env.buf.clear(); - parser_env.cur_component = std::nullopt; + parser_env.attr_name.clear(); }; auto store_component_name = [&]() -> bool { decode_html_entitles_inplace(parser_env.buf); - auto known_component_it = html_components_map.find(std::string_view{parser_env.buf}); + parser_env.attr_name = parser_env.buf; parser_env.buf.clear(); - - if (known_component_it != html_components_map.end()) { - parser_env.cur_component = known_component_it->second; - - return true; - } - else { - parser_env.cur_component = std::nullopt; - } - - return false; + return true; }; auto store_value_character = [&](bool lc) -> void { @@ -471,6 +1121,7 @@ 
html_parse_tag_content(rspamd_mempool_t *pool, case parse_start_dquote: if (*in == '"') { + store_component_value(); state = spaces_after_param; } else { @@ -481,6 +1132,7 @@ html_parse_tag_content(rspamd_mempool_t *pool, case parse_start_squote: if (*in == '\'') { + store_component_value(); state = spaces_after_param; } else { @@ -620,7 +1272,7 @@ html_process_url_tag(rspamd_mempool_t *pool, struct html_tag *tag, struct html_content *hc) -> std::optional<struct rspamd_url *> { - auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF); + auto found_href_maybe = tag->find_href(); if (found_href_maybe) { /* Check base url */ @@ -816,130 +1468,126 @@ html_process_img_tag(rspamd_mempool_t *pool, img = rspamd_mempool_alloc0_type(pool, struct html_image); img->tag = tag; - for (const auto ¶m: tag->components) { + // Process SRC component (preferred for img tags) or HREF component (fallback) + std::optional<std::string_view> href_value; - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) { - /* Check base url */ - const auto &href_value = param.value; + // Try SRC first (standard for img tags) + if (auto src_comp = tag->find_component<html_component_src>()) { + href_value = src_comp.value()->value; + } + // Fallback to HREF (for backward compatibility or non-standard usage) + else if (auto href_comp = tag->find_href()) { + href_value = href_comp; + } - if (href_value.size() > 0) { - rspamd_ftok_t fstr; - fstr.begin = href_value.data(); - fstr.len = href_value.size(); - img->src = rspamd_mempool_ftokdup(pool, &fstr); + if (href_value && href_value->size() > 0) { + rspamd_ftok_t fstr; + fstr.begin = href_value->data(); + fstr.len = href_value->size(); + img->src = rspamd_mempool_ftokdup(pool, &fstr); - if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(), - "cid:", sizeof("cid:") - 1) == 0) { - /* We have an embedded image */ - img->src += sizeof("cid:") - 1; - img->flags |= 
RSPAMD_HTML_FLAG_IMAGE_EMBEDDED; - } - else { - if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(), - "data:", sizeof("data:") - 1) == 0) { - /* We have an embedded image in HTML tag */ - img->flags |= - (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA); - html_process_data_image(pool, img, href_value); - hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS; - } - else { - img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL; - if (img->src) { - - std::string_view cpy{href_value}; - auto maybe_url = html_process_url(pool, cpy); - - if (maybe_url) { - img->url = maybe_url.value(); - struct rspamd_url *existing; - - img->url->flags |= RSPAMD_URL_FLAG_IMAGE; - existing = rspamd_url_set_add_or_return(url_set, - img->url); - - if (existing && existing != img->url) { - /* - * We have some other URL that could be - * found, e.g. from another part. However, - * we still want to set an image flag on it - */ - existing->flags |= img->url->flags; - existing->count++; - } - else if (part_urls) { - /* New url */ - g_ptr_array_add(part_urls, img->url); - } - } - } - } - } - } + if (href_value->size() > sizeof("cid:") - 1 && memcmp(href_value->data(), + "cid:", sizeof("cid:") - 1) == 0) { + /* We have an embedded image */ + img->src += sizeof("cid:") - 1; + img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED; } + else { + if (href_value->size() > sizeof("data:") - 1 && memcmp(href_value->data(), + "data:", sizeof("data:") - 1) == 0) { + /* We have an embedded image in HTML tag */ + img->flags |= + (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA); + html_process_data_image(pool, img, *href_value); + hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS; + } + else { + img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL; + if (img->src) { + std::string_view cpy{*href_value}; + auto maybe_url = html_process_url(pool, cpy); - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) { - unsigned long val; + if (maybe_url) { + img->url = maybe_url.value(); + 
struct rspamd_url *existing; - rspamd_strtoul(param.value.data(), param.value.size(), &val); - img->height = val; - } + img->url->flags |= RSPAMD_URL_FLAG_IMAGE; + existing = rspamd_url_set_add_or_return(url_set, + img->url); - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) { - unsigned long val; - - rspamd_strtoul(param.value.data(), param.value.size(), &val); - img->width = val; + if (existing && existing != img->url) { + /* + * We have some other URL that could be + * found, e.g. from another part. However, + * we still want to set an image flag on it + */ + existing->flags |= img->url->flags; + existing->count++; + } + else if (part_urls) { + /* New url */ + g_ptr_array_add(part_urls, img->url); + } + } + } + } } + } - /* TODO: rework to css at some time */ - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) { - if (img->height == 0) { - auto style_st = param.value; - auto pos = rspamd_substring_search_caseless(style_st.data(), - style_st.size(), - "height", sizeof("height") - 1); - if (pos != -1) { - auto substr = style_st.substr(pos + sizeof("height") - 1); + // Process numeric dimensions using the new helper methods + if (auto height = tag->find_height()) { + img->height = height.value(); + } - for (auto i = 0; i < substr.size(); i++) { - auto t = substr[i]; - if (g_ascii_isdigit(t)) { - unsigned long val; - rspamd_strtoul(substr.data(), - substr.size(), &val); - img->height = val; - break; - } - else if (!g_ascii_isspace(t) && t != '=' && t != ':') { - /* Fallback */ - break; - } + if (auto width = tag->find_width()) { + img->width = width.value(); + } + + // Process style component for dimensions + if (auto style_value = tag->find_style()) { + if (img->height == 0) { + auto pos = rspamd_substring_search_caseless(style_value->data(), + style_value->size(), + "height", sizeof("height") - 1); + if (pos != -1) { + auto substr = style_value->substr(pos + sizeof("height") - 1); + + for (auto i = 0; i < substr.size(); i++) { 
+ auto t = substr[i]; + if (g_ascii_isdigit(t)) { + unsigned long val; + rspamd_strtoul(substr.data(), + substr.size(), &val); + img->height = val; + break; + } + else if (!g_ascii_isspace(t) && t != '=' && t != ':') { + /* Fallback */ + break; } } } - if (img->width == 0) { - auto style_st = param.value; - auto pos = rspamd_substring_search_caseless(style_st.data(), - style_st.size(), - "width", sizeof("width") - 1); - if (pos != -1) { - auto substr = style_st.substr(pos + sizeof("width") - 1); - - for (auto i = 0; i < substr.size(); i++) { - auto t = substr[i]; - if (g_ascii_isdigit(t)) { - unsigned long val; - rspamd_strtoul(substr.data(), - substr.size(), &val); - img->width = val; - break; - } - else if (!g_ascii_isspace(t) && t != '=' && t != ':') { - /* Fallback */ - break; - } + } + if (img->width == 0) { + auto pos = rspamd_substring_search_caseless(style_value->data(), + style_value->size(), + "width", sizeof("width") - 1); + if (pos != -1) { + auto substr = style_value->substr(pos + sizeof("width") - 1); + + for (auto i = 0; i < substr.size(); i++) { + auto t = substr[i]; + if (g_ascii_isdigit(t)) { + unsigned long val; + rspamd_strtoul(substr.data(), + substr.size(), &val); + img->width = val; + break; + } + else if (!g_ascii_isspace(t) && t != '=' && t != ':') { + /* Fallback */ + break; } } } @@ -968,7 +1616,7 @@ html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag, khash_t(rspamd_url_hash) * url_set, GPtrArray *part_urls) -> void { - auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL); + auto found_rel_maybe = tag->find_rel(); if (found_rel_maybe) { if (found_rel_maybe.value() == "icon") { @@ -984,24 +1632,23 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag, std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor; bool hidden = false; - for (const auto ¶m: tag->components) { - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) { - maybe_fgcolor = 
css::css_value::maybe_color_from_string(param.value); - } - - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) { - maybe_bgcolor = css::css_value::maybe_color_from_string(param.value); - } + // Process color components + if (auto color_comp = tag->find_component<html_component_color>()) { + maybe_fgcolor = css::css_value::maybe_color_from_string(color_comp.value()->value); + } - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) { - tag->block = rspamd::css::parse_css_declaration(pool, param.value); - } + if (auto bgcolor_comp = tag->find_component<html_component_bgcolor>()) { + maybe_bgcolor = css::css_value::maybe_color_from_string(bgcolor_comp.value()->value); + } - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN) { - hidden = true; - } + // Process style component + if (auto style_value = tag->find_style()) { + tag->block = rspamd::css::parse_css_declaration(pool, *style_value); } + // Check if hidden + hidden = tag->is_hidden(); + if (!tag->block) { tag->block = html_block::undefined_html_block_pool(pool); } @@ -1284,7 +1931,7 @@ html_append_tag_content(rspamd_mempool_t *pool, } else if (tag->id == Tag_IMG) { /* Process ALT if presented */ - auto maybe_alt = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_ALT); + auto maybe_alt = tag->find_alt(); if (maybe_alt) { if (!hc->parsed.empty() && !g_ascii_isspace(hc->parsed.back())) { @@ -1384,9 +2031,7 @@ auto html_process_input(struct rspamd_task *task, overflow_input = true; } - auto new_tag = [&](int flags = 0) -> struct html_tag * - { - + auto new_tag = [&](int flags = 0) -> struct html_tag * { if (hc->all_tags.size() > rspamd::html::max_tags) { hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS; @@ -2151,7 +2796,7 @@ auto html_process_input(struct rspamd_task *task, /* Leftover after content */ switch (state) { case tags_limit_overflow: - html_append_parsed(hc, {c, (std::size_t)(end - c)}, + html_append_parsed(hc, {c, (std::size_t) (end - c)}, 
false, end - start, hc->parsed); break; default: @@ -2390,4 +3035,4 @@ gsize rspamd_html_get_tags_count(void *html_content) } return hc->all_tags.size(); -}
\ No newline at end of file +} diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx index 309d76177..6d41f1337 100644 --- a/src/libserver/html/html_tag.hxx +++ b/src/libserver/html/html_tag.hxx @@ -1,11 +1,11 @@ -/*- - * Copyright 2021 Vsevolod Stakhov +/* + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -26,6 +26,7 @@ #include <cstdint> #include "html_tags.h" +#include "libutil/str_util.h" struct rspamd_url; struct html_image; @@ -34,7 +35,8 @@ namespace rspamd::html { struct html_content; /* Forward declaration */ -enum class html_component_type : std::uint8_t { +// Internal enum for mapping (not exposed in public API) +enum class html_component_enum_type : std::uint8_t { RSPAMD_HTML_COMPONENT_NAME = 0, RSPAMD_HTML_COMPONENT_HREF, RSPAMD_HTML_COMPONENT_COLOR, @@ -48,8 +50,1214 @@ enum class html_component_type : std::uint8_t { RSPAMD_HTML_COMPONENT_ALT, RSPAMD_HTML_COMPONENT_ID, RSPAMD_HTML_COMPONENT_HIDDEN, + // Typography + RSPAMD_HTML_COMPONENT_FONT_FAMILY, + RSPAMD_HTML_COMPONENT_FONT_SIZE, + RSPAMD_HTML_COMPONENT_FONT_WEIGHT, + RSPAMD_HTML_COMPONENT_FONT_STYLE, + RSPAMD_HTML_COMPONENT_TEXT_ALIGN, + RSPAMD_HTML_COMPONENT_TEXT_DECORATION, + RSPAMD_HTML_COMPONENT_LINE_HEIGHT, + // Layout & positioning + RSPAMD_HTML_COMPONENT_MARGIN, + RSPAMD_HTML_COMPONENT_MARGIN_TOP, + RSPAMD_HTML_COMPONENT_MARGIN_BOTTOM, + RSPAMD_HTML_COMPONENT_MARGIN_LEFT, + RSPAMD_HTML_COMPONENT_MARGIN_RIGHT, + RSPAMD_HTML_COMPONENT_PADDING, + RSPAMD_HTML_COMPONENT_PADDING_TOP, + RSPAMD_HTML_COMPONENT_PADDING_BOTTOM, + RSPAMD_HTML_COMPONENT_PADDING_LEFT, + 
RSPAMD_HTML_COMPONENT_PADDING_RIGHT, + RSPAMD_HTML_COMPONENT_BORDER, + RSPAMD_HTML_COMPONENT_BORDER_COLOR, + RSPAMD_HTML_COMPONENT_BORDER_WIDTH, + RSPAMD_HTML_COMPONENT_BORDER_STYLE, + // Display & visibility + RSPAMD_HTML_COMPONENT_DISPLAY, + RSPAMD_HTML_COMPONENT_VISIBILITY, + RSPAMD_HTML_COMPONENT_OPACITY, + // Dimensions + RSPAMD_HTML_COMPONENT_MIN_WIDTH, + RSPAMD_HTML_COMPONENT_MAX_WIDTH, + RSPAMD_HTML_COMPONENT_MIN_HEIGHT, + RSPAMD_HTML_COMPONENT_MAX_HEIGHT, + // Table attributes + RSPAMD_HTML_COMPONENT_CELLPADDING, + RSPAMD_HTML_COMPONENT_CELLSPACING, + RSPAMD_HTML_COMPONENT_VALIGN, + RSPAMD_HTML_COMPONENT_ALIGN, + // Form attributes + RSPAMD_HTML_COMPONENT_TYPE, + RSPAMD_HTML_COMPONENT_VALUE, + RSPAMD_HTML_COMPONENT_PLACEHOLDER, + RSPAMD_HTML_COMPONENT_DISABLED, + RSPAMD_HTML_COMPONENT_READONLY, + RSPAMD_HTML_COMPONENT_CHECKED, + RSPAMD_HTML_COMPONENT_SELECTED, + // Link & media + RSPAMD_HTML_COMPONENT_TARGET, + RSPAMD_HTML_COMPONENT_TITLE, + RSPAMD_HTML_COMPONENT_SRC, + // Meta & document + RSPAMD_HTML_COMPONENT_CHARSET, + RSPAMD_HTML_COMPONENT_CONTENT, + RSPAMD_HTML_COMPONENT_HTTP_EQUIV, + // Accessibility + RSPAMD_HTML_COMPONENT_ROLE, + RSPAMD_HTML_COMPONENT_TABINDEX, + // Background + RSPAMD_HTML_COMPONENT_BACKGROUND, + RSPAMD_HTML_COMPONENT_BACKGROUND_IMAGE, + RSPAMD_HTML_COMPONENT_BACKGROUND_COLOR, + RSPAMD_HTML_COMPONENT_BACKGROUND_REPEAT, + RSPAMD_HTML_COMPONENT_BACKGROUND_POSITION, + // Email-specific tracking + RSPAMD_HTML_COMPONENT_DATA_TRACK, + RSPAMD_HTML_COMPONENT_DATA_ID, + RSPAMD_HTML_COMPONENT_DATA_URL, }; +// Forward declarations for component types +struct html_component_name; +struct html_component_href; +struct html_component_color; +struct html_component_bgcolor; +struct html_component_style; +struct html_component_class; +struct html_component_width; +struct html_component_height; +struct html_component_size; +struct html_component_rel; +struct html_component_alt; +struct html_component_id; +struct html_component_hidden; +struct 
html_component_unknown; + +// Base interface for all components +struct html_component_base { + virtual ~html_component_base() = default; + virtual constexpr std::string_view get_string_value() const = 0; +}; + +// String-based components +struct html_component_name : html_component_base { + std::string_view value; + explicit constexpr html_component_name(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_href : html_component_base { + std::string_view value; + explicit constexpr html_component_href(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_style : html_component_base { + std::string_view value; + explicit constexpr html_component_style(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_class : html_component_base { + std::string_view value; + explicit constexpr html_component_class(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_rel : html_component_base { + std::string_view value; + explicit constexpr html_component_rel(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_alt : html_component_base { + std::string_view value; + explicit constexpr html_component_alt(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_id : html_component_base { + std::string_view value; + explicit constexpr html_component_id(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } 
+}; + +// Color components (could be extended to parse actual colors) +struct html_component_color : html_component_base { + std::string_view value; + explicit constexpr html_component_color(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_bgcolor : html_component_base { + std::string_view value; + explicit constexpr html_component_bgcolor(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +// Numeric components +struct html_component_width : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_width(const std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + constexpr std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_height : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_height(const std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + constexpr std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_size : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_size(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + 
+ constexpr std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +// Boolean/flag component +struct html_component_hidden : html_component_base { + bool present; + explicit constexpr html_component_hidden() + : present(true) + { + } + constexpr std::string_view get_string_value() const override + { + return present ? "true" : "false"; + } + constexpr bool is_present() const + { + return present; + } +}; + +// Unknown component with both name and value +struct html_component_unknown : html_component_base { + std::string_view name; + std::string_view value; + + constexpr html_component_unknown(std::string_view n, std::string_view v) + : name(n), value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } + constexpr std::string_view get_name() const + { + return name; + } +}; + +// Typography components +struct html_component_font_family : html_component_base { + std::string_view value; + explicit constexpr html_component_font_family(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_font_size : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_font_size(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + constexpr std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_font_weight : html_component_base { + std::string_view value; + explicit constexpr html_component_font_weight(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() 
const override + { + return value; + } +}; + +struct html_component_font_style : html_component_base { + std::string_view value; + explicit constexpr html_component_font_style(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_text_align : html_component_base { + std::string_view value; + explicit constexpr html_component_text_align(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_text_decoration : html_component_base { + std::string_view value; + explicit constexpr html_component_text_decoration(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_line_height : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_line_height(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +// Layout components (most are string-based for flexibility) +struct html_component_margin : html_component_base { + std::string_view value; + explicit constexpr html_component_margin(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_margin_top : html_component_base { + std::string_view value; + explicit constexpr html_component_margin_top(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_margin_bottom : 
html_component_base { + std::string_view value; + explicit constexpr html_component_margin_bottom(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_margin_left : html_component_base { + std::string_view value; + explicit constexpr html_component_margin_left(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_margin_right : html_component_base { + std::string_view value; + explicit constexpr html_component_margin_right(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_padding : html_component_base { + std::string_view value; + explicit constexpr html_component_padding(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_padding_top : html_component_base { + std::string_view value; + explicit constexpr html_component_padding_top(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_padding_bottom : html_component_base { + std::string_view value; + explicit constexpr html_component_padding_bottom(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_padding_left : html_component_base { + std::string_view value; + explicit constexpr html_component_padding_left(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_padding_right : html_component_base { + std::string_view value; + explicit constexpr html_component_padding_right(std::string_view v) + : 
value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_border : html_component_base { + std::string_view value; + explicit html_component_border(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_border_color : html_component_base { + std::string_view value; + explicit html_component_border_color(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_border_width : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_border_width(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_border_style : html_component_base { + std::string_view value; + explicit html_component_border_style(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Display components +struct html_component_display : html_component_base { + std::string_view value; + explicit html_component_display(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_visibility : html_component_base { + std::string_view value; + explicit html_component_visibility(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_opacity : html_component_base { + std::string_view raw_value; + std::optional<float> 
numeric_value; + + explicit html_component_opacity(std::string_view v) + : raw_value(v) + { + char *endptr; + auto val = std::strtof(v.data(), &endptr); + if (endptr != v.data() && val >= 0.0f && val <= 1.0f) { + numeric_value = val; + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<float> get_numeric_value() const + { + return numeric_value; + } +}; + +// Additional dimension components +struct html_component_min_width : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_min_width(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_max_width : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_max_width(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_min_height : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_min_height(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; 
+ } +}; + +struct html_component_max_height : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_max_height(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +// Table components +struct html_component_cellpadding : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_cellpadding(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_cellspacing : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_cellspacing(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_valign : html_component_base { + std::string_view value; + explicit html_component_valign(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_align : html_component_base { + std::string_view value; + explicit html_component_align(std::string_view v) + : value(v) + { + } + 
std::string_view get_string_value() const override + { + return value; + } +}; + +// Form components +struct html_component_type : html_component_base { + std::string_view value; + explicit html_component_type(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_value : html_component_base { + std::string_view value; + explicit html_component_value(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_placeholder : html_component_base { + std::string_view value; + explicit html_component_placeholder(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Boolean form components +struct html_component_disabled : html_component_base { + bool present; + explicit constexpr html_component_disabled() + : present(true) + { + } + constexpr std::string_view get_string_value() const override + { + return present ? "true" : "false"; + } + constexpr bool is_present() const + { + return present; + } +}; + +struct html_component_readonly : html_component_base { + bool present; + explicit constexpr html_component_readonly() + : present(true) + { + } + constexpr std::string_view get_string_value() const override + { + return present ? "true" : "false"; + } + constexpr bool is_present() const + { + return present; + } +}; + +struct html_component_checked : html_component_base { + bool present; + explicit constexpr html_component_checked() + : present(true) + { + } + constexpr std::string_view get_string_value() const override + { + return present ? 
"true" : "false"; + } + constexpr bool is_present() const + { + return present; + } +}; + +struct html_component_selected : html_component_base { + bool present; + explicit constexpr html_component_selected() + : present(true) + { + } + constexpr std::string_view get_string_value() const override + { + return present ? "true" : "false"; + } + constexpr bool is_present() const + { + return present; + } +}; + +// Link & media components +struct html_component_target : html_component_base { + std::string_view value; + explicit html_component_target(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_title : html_component_base { + std::string_view value; + explicit html_component_title(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_src : html_component_base { + std::string_view value; + explicit html_component_src(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Meta components +struct html_component_charset : html_component_base { + std::string_view value; + explicit html_component_charset(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_content : html_component_base { + std::string_view value; + explicit html_component_content(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_http_equiv : html_component_base { + std::string_view value; + explicit html_component_http_equiv(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Accessibility components +struct html_component_role : html_component_base { + std::string_view value; + 
explicit html_component_role(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_tabindex : html_component_base { + std::string_view raw_value; + std::optional<std::int32_t> numeric_value; + + explicit html_component_tabindex(std::string_view v) + : raw_value(v) + { + long val; + if (rspamd_strtol(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::int32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::int32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +// Background components +struct html_component_background : html_component_base { + std::string_view value; + explicit html_component_background(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_background_image : html_component_base { + std::string_view value; + explicit html_component_background_image(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_background_color : html_component_base { + std::string_view value; + explicit html_component_background_color(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_background_repeat : html_component_base { + std::string_view value; + explicit html_component_background_repeat(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_background_position : html_component_base { + std::string_view value; + explicit html_component_background_position(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Email tracking components 
+struct html_component_data_track : html_component_base { + std::string_view value; + explicit html_component_data_track(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_data_id : html_component_base { + std::string_view value; + explicit html_component_data_id(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_data_url : html_component_base { + std::string_view value; + explicit html_component_data_url(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// The variant type that holds all possible components +using html_tag_component = std::variant< + html_component_name, + html_component_href, + html_component_color, + html_component_bgcolor, + html_component_style, + html_component_class, + html_component_width, + html_component_height, + html_component_size, + html_component_rel, + html_component_alt, + html_component_id, + html_component_hidden, + // Typography + html_component_font_family, + html_component_font_size, + html_component_font_weight, + html_component_font_style, + html_component_text_align, + html_component_text_decoration, + html_component_line_height, + // Layout + html_component_margin, + html_component_margin_top, + html_component_margin_bottom, + html_component_margin_left, + html_component_margin_right, + html_component_padding, + html_component_padding_top, + html_component_padding_bottom, + html_component_padding_left, + html_component_padding_right, + html_component_border, + html_component_border_color, + html_component_border_width, + html_component_border_style, + // Display + html_component_display, + html_component_visibility, + html_component_opacity, + // Dimensions + html_component_min_width, + html_component_max_width, + html_component_min_height, + 
html_component_max_height, + // Table + html_component_cellpadding, + html_component_cellspacing, + html_component_valign, + html_component_align, + // Form + html_component_type, + html_component_value, + html_component_placeholder, + html_component_disabled, + html_component_readonly, + html_component_checked, + html_component_selected, + // Link & media + html_component_target, + html_component_title, + html_component_src, + // Meta + html_component_charset, + html_component_content, + html_component_http_equiv, + // Accessibility + html_component_role, + html_component_tabindex, + // Background + html_component_background, + html_component_background_image, + html_component_background_color, + html_component_background_repeat, + html_component_background_position, + // Email tracking + html_component_data_track, + html_component_data_id, + html_component_data_url, + // Unknown + html_component_unknown>; + +/** + * Returns component variant from a string + * @param name attribute name + * @param value attribute value + * @return variant component + */ +auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component; + /* Public tags flags */ /* XML tag */ #define FL_XML (1u << CM_USER_SHIFT) @@ -62,23 +1270,7 @@ enum class html_component_type : std::uint8_t { #define FL_COMMENT (1 << (CM_USER_SHIFT + 6)) #define FL_VIRTUAL (1 << (CM_USER_SHIFT + 7)) -/** - * Returns component type from a string - * @param st - * @return - */ -auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>; - using html_tag_extra_t = std::variant<std::monostate, struct rspamd_url *, struct html_image *>; -struct html_tag_component { - html_component_type type; - std::string_view value; - - html_tag_component(html_component_type type, std::string_view value) - : type(type), value(value) - { - } -}; /* Pairing closing tag representation */ struct html_closing_tag { @@ -105,26 +1297,128 @@ struct html_tag { 
std::vector<struct html_tag *> children; struct html_tag *parent; - auto find_component(html_component_type what) const -> std::optional<std::string_view> + // Template method to find component by type + template<typename T> + auto find_component() const -> std::optional<const T *> { for (const auto &comp: components) { - if (comp.type == what) { - return comp.value; + if (std::holds_alternative<T>(comp)) { + return &std::get<T>(comp); } } + return std::nullopt; + } + // Helper methods for common component access + auto find_href() const -> std::optional<std::string_view> + { + if (auto comp = find_component<html_component_href>()) { + return comp.value()->value; + } return std::nullopt; } - auto find_component(std::optional<html_component_type> what) const -> std::optional<std::string_view> + auto find_class() const -> std::optional<std::string_view> { - if (what) { - return find_component(what.value()); + if (auto comp = find_component<html_component_class>()) { + return comp.value()->value; } + return std::nullopt; + } + auto find_id() const -> std::optional<std::string_view> + { + if (auto comp = find_component<html_component_id>()) { + return comp.value()->value; + } + return std::nullopt; + } + + auto find_width() const -> std::optional<std::uint32_t> + { + if (auto comp = find_component<html_component_width>()) { + return comp.value()->get_numeric_value(); + } + return std::nullopt; + } + + auto find_height() const -> std::optional<std::uint32_t> + { + if (auto comp = find_component<html_component_height>()) { + return comp.value()->get_numeric_value(); + } return std::nullopt; } + auto find_style() const -> std::optional<std::string_view> + { + if (auto comp = find_component<html_component_style>()) { + return comp.value()->value; + } + return std::nullopt; + } + + auto find_alt() const -> std::optional<std::string_view> + { + if (auto comp = find_component<html_component_alt>()) { + return comp.value()->value; + } + return std::nullopt; + } + + auto 
find_rel() const -> std::optional<std::string_view> + { + if (auto comp = find_component<html_component_rel>()) { + return comp.value()->value; + } + return std::nullopt; + } + + auto is_hidden() const -> bool + { + return find_component<html_component_hidden>().has_value(); + } + + auto find_unknown_component(std::string_view attr_name) const -> std::optional<std::string_view> + { + for (const auto &comp: components) { + if (std::holds_alternative<html_component_unknown>(comp)) { + const auto &unknown = std::get<html_component_unknown>(comp); + if (unknown.name == attr_name) { + return unknown.value; + } + } + } + return std::nullopt; + } + + auto get_unknown_components() const -> std::vector<std::pair<std::string_view, std::string_view>> + { + std::vector<std::pair<std::string_view, std::string_view>> unknown_attrs; + for (const auto &comp: components) { + if (std::holds_alternative<html_component_unknown>(comp)) { + const auto &unknown = std::get<html_component_unknown>(comp); + unknown_attrs.emplace_back(unknown.name, unknown.value); + } + } + return unknown_attrs; + } + + // Generic visitor method for processing all components + template<typename Visitor> + auto visit_components(Visitor &&visitor) const + { + for (const auto &comp: components) { + std::visit(std::forward<Visitor>(visitor), comp); + } + } + + // Find any component by attribute name + auto find_component_by_name(std::string_view attr_name) const -> std::optional<std::string_view>; + + // Get all attributes as name-value pairs + auto get_all_attributes() const -> std::vector<std::pair<std::string_view, std::string_view>>; + auto clear(void) -> void { id = Tag_UNKNOWN; @@ -137,7 +1431,7 @@ struct html_tag { closing.clear(); } - constexpr auto get_content_length() const -> std::size_t + auto get_content_length() const -> std::size_t { if (flags & (FL_IGNORE | CM_HEAD)) { return 0; diff --git a/src/libserver/http/http_connection.c b/src/libserver/http/http_connection.c index baf37a385..b5d70fc1c 
100644 --- a/src/libserver/http/http_connection.c +++ b/src/libserver/http/http_connection.c @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1670,7 +1670,22 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted, { char datebuf[64]; int meth_len = 0; - const char *conn_type = "close"; + const char *server_conn_header, *client_conn_header; + + /* Set up connection header strings based on flags and connection type */ + if (msg->flags & RSPAMD_HTTP_FLAG_HAS_CONNECTION_HEADER) { + server_conn_header = ""; + client_conn_header = ""; + } + else { + server_conn_header = "Connection: close\r\n"; + if (conn->opts & RSPAMD_HTTP_CLIENT_KEEP_ALIVE) { + client_conn_header = "Connection: keep-alive\r\n"; + } + else { + client_conn_header = "Connection: close\r\n"; + } + } if (conn->type == RSPAMD_HTTP_SERVER) { /* Format reply */ @@ -1712,12 +1727,14 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted, meth_len = rspamd_snprintf(repbuf, replen, "HTTP/1.1 %d %T\r\n" - "Connection: close\r\n" + "%s" "Server: %s\r\n" "Date: %s\r\n" "Content-Length: %z\r\n" "Content-Type: %s", /* NO \r\n at the end ! */ - msg->code, &status, priv->ctx->config.server_hdr, + msg->code, &status, + server_conn_header, + priv->ctx->config.server_hdr, datebuf, bodylen, mime_type); } @@ -1725,11 +1742,13 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted, meth_len = rspamd_snprintf(repbuf, replen, "HTTP/1.1 %d %T\r\n" - "Connection: close\r\n" + "%s" "Server: %s\r\n" "Date: %s\r\n" "Content-Length: %z", /* NO \r\n at the end ! 
*/ - msg->code, &status, priv->ctx->config.server_hdr, + msg->code, &status, + server_conn_header, + priv->ctx->config.server_hdr, datebuf, bodylen); } @@ -1737,11 +1756,12 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted, /* External reply */ rspamd_printf_fstring(buf, "HTTP/1.1 200 OK\r\n" - "Connection: close\r\n" + "%s" "Server: %s\r\n" "Date: %s\r\n" "Content-Length: %z\r\n" "Content-Type: application/octet-stream\r\n", + server_conn_header, priv->ctx->config.server_hdr, datebuf, enclen); } @@ -1750,12 +1770,14 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted, meth_len = rspamd_printf_fstring(buf, "HTTP/1.1 %d %T\r\n" - "Connection: close\r\n" + "%s" "Server: %s\r\n" "Date: %s\r\n" "Content-Length: %z\r\n" "Content-Type: %s\r\n", - msg->code, &status, priv->ctx->config.server_hdr, + msg->code, &status, + server_conn_header, + priv->ctx->config.server_hdr, datebuf, bodylen, mime_type); } @@ -1763,11 +1785,13 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted, meth_len = rspamd_printf_fstring(buf, "HTTP/1.1 %d %T\r\n" - "Connection: close\r\n" + "%s" "Server: %s\r\n" "Date: %s\r\n" "Content-Length: %z\r\n", - msg->code, &status, priv->ctx->config.server_hdr, + msg->code, &status, + server_conn_header, + priv->ctx->config.server_hdr, datebuf, bodylen); } @@ -1804,10 +1828,6 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted, else { /* Client request */ - if (conn->opts & RSPAMD_HTTP_CLIENT_KEEP_ALIVE) { - conn_type = "keep-alive"; - } - /* Format request */ enclen += RSPAMD_FSTRING_LEN(msg->url) + strlen(http_method_str(msg->method)) + 1; @@ -1819,21 +1839,21 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted, "%s %s HTTP/1.0\r\n" "Content-Length: %z\r\n" "Content-Type: application/octet-stream\r\n" - "Connection: %s\r\n", + "%s", "POST", "/post", enclen, - conn_type); + client_conn_header); } else { 
rspamd_printf_fstring(buf, "%s %V HTTP/1.0\r\n" "Content-Length: %z\r\n" - "Connection: %s\r\n", + "%s", http_method_str(msg->method), msg->url, bodylen, - conn_type); + client_conn_header); if (bodylen > 0) { if (mime_type == NULL) { @@ -1857,26 +1877,26 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted, if (rspamd_http_message_is_standard_port(msg)) { rspamd_printf_fstring(buf, "%s %s HTTP/1.1\r\n" - "Connection: %s\r\n" + "%s" "Host: %s\r\n" "Content-Length: %z\r\n" "Content-Type: application/octet-stream\r\n", "POST", "/post", - conn_type, + client_conn_header, host, enclen); } else { rspamd_printf_fstring(buf, "%s %s HTTP/1.1\r\n" - "Connection: %s\r\n" + "%s" "Host: %s:%d\r\n" "Content-Length: %z\r\n" "Content-Type: application/octet-stream\r\n", "POST", "/post", - conn_type, + client_conn_header, host, msg->port, enclen); @@ -1888,21 +1908,21 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted, if ((msg->flags & RSPAMD_HTTP_FLAG_HAS_HOST_HEADER)) { rspamd_printf_fstring(buf, "%s %s://%s:%d/%V HTTP/1.1\r\n" - "Connection: %s\r\n" + "%s" "Content-Length: %z\r\n", http_method_str(msg->method), (conn->opts & RSPAMD_HTTP_CLIENT_SSL) ? 
"https" : "http", host, msg->port, msg->url, - conn_type, + client_conn_header, bodylen); } else { if (rspamd_http_message_is_standard_port(msg)) { rspamd_printf_fstring(buf, "%s %s://%s:%d/%V HTTP/1.1\r\n" - "Connection: %s\r\n" + "%s" "Host: %s\r\n" "Content-Length: %z\r\n", http_method_str(msg->method), @@ -1910,14 +1930,14 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted, host, msg->port, msg->url, - conn_type, + client_conn_header, host, bodylen); } else { rspamd_printf_fstring(buf, "%s %s://%s:%d/%V HTTP/1.1\r\n" - "Connection: %s\r\n" + "%s" "Host: %s:%d\r\n" "Content-Length: %z\r\n", http_method_str(msg->method), @@ -1925,7 +1945,7 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted, host, msg->port, msg->url, - conn_type, + client_conn_header, host, msg->port, bodylen); @@ -1937,35 +1957,35 @@ int rspamd_http_message_write_header(const char *mime_type, gboolean encrypted, if ((msg->flags & RSPAMD_HTTP_FLAG_HAS_HOST_HEADER)) { rspamd_printf_fstring(buf, "%s %V HTTP/1.1\r\n" - "Connection: %s\r\n" + "%s" "Content-Length: %z\r\n", http_method_str(msg->method), msg->url, - conn_type, + client_conn_header, bodylen); } else { if (rspamd_http_message_is_standard_port(msg)) { rspamd_printf_fstring(buf, "%s %V HTTP/1.1\r\n" - "Connection: %s\r\n" + "%s" "Host: %s\r\n" "Content-Length: %z\r\n", http_method_str(msg->method), msg->url, - conn_type, + client_conn_header, host, bodylen); } else { rspamd_printf_fstring(buf, "%s %V HTTP/1.1\r\n" - "Connection: %s\r\n" + "%s" "Host: %s:%d\r\n" "Content-Length: %z\r\n", http_method_str(msg->method), msg->url, - conn_type, + client_conn_header, host, msg->port, bodylen); @@ -2633,4 +2653,4 @@ void rspamd_http_connection_disable_encryption(struct rspamd_http_connection *co priv->peer_key = NULL; priv->flags &= ~RSPAMD_HTTP_CONN_FLAG_ENCRYPTED; } -}
\ No newline at end of file +} diff --git a/src/libserver/http/http_connection.h b/src/libserver/http/http_connection.h index f6ec03d95..466a3edd9 100644 --- a/src/libserver/http/http_connection.h +++ b/src/libserver/http/http_connection.h @@ -1,11 +1,11 @@ -/*- - * Copyright 2016 Vsevolod Stakhov +/* + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -80,9 +80,13 @@ struct rspamd_storage_shmem { */ #define RSPAMD_HTTP_FLAG_HAS_HOST_HEADER (1 << 7) /** + * Connection header has been set for a message + */ +#define RSPAMD_HTTP_FLAG_HAS_CONNECTION_HEADER (1 << 8) +/** * Message is intended for SSL connection */ -#define RSPAMD_HTTP_FLAG_WANT_SSL (1 << 8) +#define RSPAMD_HTTP_FLAG_WANT_SSL (1 << 9) /** * Options for HTTP connection */ diff --git a/src/libserver/http/http_message.c b/src/libserver/http/http_message.c index 0c9708450..e5e4a0469 100644 --- a/src/libserver/http/http_message.c +++ b/src/libserver/http/http_message.c @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -539,6 +539,9 @@ void rspamd_http_message_add_header_len(struct rspamd_http_message *msg, if (g_ascii_strcasecmp(name, "host") == 0) { msg->flags |= RSPAMD_HTTP_FLAG_HAS_HOST_HEADER; } + else if (g_ascii_strcasecmp(name, "connection") == 0) { + msg->flags |= RSPAMD_HTTP_FLAG_HAS_CONNECTION_HEADER; + } hdr->combined = rspamd_fstring_sized_new(nlen + vlen + 4); rspamd_printf_fstring(&hdr->combined, "%s: %*s\r\n", name, (int) vlen, @@ -746,4 +749,4 @@ const char *rspamd_http_message_get_url(struct rspamd_http_message *msg, gsize * } return NULL; -}
\ No newline at end of file +} diff --git a/src/libserver/http/http_router.c b/src/libserver/http/http_router.c index aca791a27..459401e9e 100644 --- a/src/libserver/http/http_router.c +++ b/src/libserver/http/http_router.c @@ -1,11 +1,11 @@ -/*- - * Copyright 2019 Vsevolod Stakhov +/* + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -56,13 +56,13 @@ static void rspamd_http_entry_free(struct rspamd_http_connection_entry *entry) { if (entry != NULL) { - close(entry->conn->fd); rspamd_http_connection_unref(entry->conn); if (entry->rt->finish_handler) { entry->rt->finish_handler(entry); } DL_DELETE(entry->rt->conns, entry); + close(entry->conn->fd); g_free(entry); } } diff --git a/src/libserver/logger/logger.c b/src/libserver/logger/logger.c index dc0a85a05..600b7f1e1 100644 --- a/src/libserver/logger/logger.c +++ b/src/libserver/logger/logger.c @@ -22,7 +22,6 @@ #include "unix-std.h" #include "logger_private.h" - static rspamd_logger_t *default_logger = NULL; static rspamd_logger_t *emergency_logger = NULL; static struct rspamd_log_modules *log_modules = NULL; @@ -30,6 +29,61 @@ static struct rspamd_log_modules *log_modules = NULL; static const char lf_chr = '\n'; unsigned int rspamd_task_log_id = (unsigned int) -1; + +/** + * Strip log tag according to the configured policy + * @param original_tag original log tag + * @param original_len length of original tag + * @param dest destination buffer + * @param max_len maximum length allowed + * @param policy stripping policy + * @return actual length of stripped tag + */ +static gsize +rspamd_log_strip_tag(const char 
*original_tag, gsize original_len, + char *dest, gsize max_len, + enum rspamd_log_tag_strip_policy policy) +{ + if (original_len <= max_len) { + /* No stripping needed */ + memcpy(dest, original_tag, original_len); + return original_len; + } + + switch (policy) { + case RSPAMD_LOG_TAG_STRIP_RIGHT: + /* Cut right part (current behavior) */ + memcpy(dest, original_tag, max_len); + return max_len; + + case RSPAMD_LOG_TAG_STRIP_LEFT: + /* Cut left part (take last elements) */ + memcpy(dest, original_tag + (original_len - max_len), max_len); + return max_len; + + case RSPAMD_LOG_TAG_STRIP_MIDDLE: + /* Half from start and half from end */ + if (max_len >= 2) { + gsize first_half = max_len / 2; + gsize second_half = max_len - first_half; + + memcpy(dest, original_tag, first_half); + memcpy(dest + first_half, + original_tag + (original_len - second_half), + second_half); + } + else if (max_len == 1) { + /* Just take first character */ + dest[0] = original_tag[0]; + } + return max_len; + + default: + /* Fallback to right stripping */ + memcpy(dest, original_tag, max_len); + return max_len; + } +} RSPAMD_CONSTRUCTOR(rspamd_task_log_init) { rspamd_task_log_id = rspamd_logger_add_debug_module("task"); @@ -160,6 +214,10 @@ rspamd_log_open_emergency(rspamd_mempool_t *pool, int flags) logger->process_type = "main"; logger->pid = getpid(); + /* Initialize log tag configuration with defaults */ + logger->max_log_tag_len = RSPAMD_LOG_ID_LEN; /* Keep backward compatibility default */ + logger->log_tag_strip_policy = RSPAMD_LOG_TAG_STRIP_RIGHT; + const struct rspamd_logger_funcs *funcs = &console_log_funcs; memcpy(&logger->ops, funcs, sizeof(*funcs)); @@ -258,6 +316,28 @@ rspamd_log_open_specific(rspamd_mempool_t *pool, logger->process_type = ptype; logger->enabled = TRUE; + /* Initialize log tag configuration with defaults */ + if (cfg && cfg->log_max_tag_len > 0) { + logger->max_log_tag_len = MIN(MEMPOOL_UID_LEN, cfg->log_max_tag_len); + } + else { + logger->max_log_tag_len = 
RSPAMD_LOG_ID_LEN; /* Keep backward compatibility default */ + } + + logger->log_tag_strip_policy = RSPAMD_LOG_TAG_STRIP_RIGHT; + + if (cfg && cfg->log_tag_strip_policy_str) { + if (g_ascii_strcasecmp(cfg->log_tag_strip_policy_str, "left") == 0) { + logger->log_tag_strip_policy = RSPAMD_LOG_TAG_STRIP_LEFT; + } + else if (g_ascii_strcasecmp(cfg->log_tag_strip_policy_str, "middle") == 0) { + logger->log_tag_strip_policy = RSPAMD_LOG_TAG_STRIP_MIDDLE; + } + else { + logger->log_tag_strip_policy = RSPAMD_LOG_TAG_STRIP_RIGHT; /* Default */ + } + } + /* Set up conditional logging */ if (cfg) { if (cfg->debug_ip_map != NULL) { @@ -1026,16 +1106,34 @@ log_time(double now, rspamd_logger_t *rspamd_log, char *timebuf, } } +/** + * Process log ID with stripping policy and return the effective length + * @param logger logger instance with configuration + * @param id original log ID + * @param processed_id buffer to store processed ID (should be at least max_log_tag_len + 1) + * @return effective length of processed ID + */ static inline int -rspamd_log_id_strlen(const char *id) +rspamd_log_process_id(rspamd_logger_t *logger, const char *id, char *processed_id) { - for (int i = 0; i < RSPAMD_LOG_ID_LEN; i++) { - if (G_UNLIKELY(id[i] == '\0')) { - return i; - } + if (id == NULL) { + return 0; + } + + gsize original_len = strlen(id); + gsize max_len = MIN(MEMPOOL_UID_LEN, logger->max_log_tag_len); + + if (original_len <= max_len) { + /* No processing needed */ + memcpy(processed_id, id, original_len); + return original_len; } - return RSPAMD_LOG_ID_LEN; + /* Apply stripping policy */ + gsize processed_len = rspamd_log_strip_tag(id, original_len, processed_id, max_len, + logger->log_tag_strip_policy); + + return processed_len; } void rspamd_log_fill_iov(struct rspamd_logger_iov_ctx *iov_ctx, @@ -1071,8 +1169,17 @@ void rspamd_log_fill_iov(struct rspamd_logger_iov_ctx *iov_ctx, if (G_UNLIKELY(log_json)) { /* Perform JSON logging */ - unsigned int slen = id ? 
strlen(id) : strlen("(NULL)"); - slen = MIN(RSPAMD_LOG_ID_LEN, slen); + char processed_id[MEMPOOL_UID_LEN]; + int processed_len = 0; + + if (id) { + processed_len = rspamd_log_process_id(logger, id, processed_id); + } + else { + strcpy(processed_id, "(NULL)"); + processed_len = strlen(processed_id); + } + r = rspamd_snprintf(tmpbuf, sizeof(tmpbuf), "{\"ts\": %f, " "\"pid\": %P, " "\"severity\": \"%s\", " @@ -1085,7 +1192,7 @@ void rspamd_log_fill_iov(struct rspamd_logger_iov_ctx *iov_ctx, logger->pid, rspamd_get_log_severity_string(level_flags), logger->process_type, - slen, id, + processed_len, processed_id, module, function); iov_ctx->iov[0].iov_base = tmpbuf; @@ -1241,14 +1348,17 @@ void rspamd_log_fill_iov(struct rspamd_logger_iov_ctx *iov_ctx, glong mremain, mr; char *m; + char processed_id[MEMPOOL_UID_LEN]; + int processed_len = 0; modulebuf[0] = '\0'; mremain = sizeof(modulebuf); m = modulebuf; if (id != NULL) { - mr = rspamd_snprintf(m, mremain, "<%*.s>; ", rspamd_log_id_strlen(id), - id); + processed_len = rspamd_log_process_id(logger, id, processed_id); + mr = rspamd_snprintf(m, mremain, "<%*.s>; ", processed_len, + processed_id); m += mr; mremain -= mr; } @@ -1300,10 +1410,13 @@ void rspamd_log_fill_iov(struct rspamd_logger_iov_ctx *iov_ctx, iov_ctx->iov[niov].iov_base = (void *) timebuf; iov_ctx->iov[niov++].iov_len = strlen(timebuf); if (id != NULL) { + char processed_id[MEMPOOL_UID_LEN]; + int processed_len = rspamd_log_process_id(logger, id, processed_id); + iov_ctx->iov[niov].iov_base = (void *) "; "; iov_ctx->iov[niov++].iov_len = 2; - iov_ctx->iov[niov].iov_base = (void *) id; - iov_ctx->iov[niov++].iov_len = rspamd_log_id_strlen(id); + iov_ctx->iov[niov].iov_base = (void *) processed_id; + iov_ctx->iov[niov++].iov_len = processed_len; iov_ctx->iov[niov].iov_base = (void *) ";"; iov_ctx->iov[niov++].iov_len = 1; } diff --git a/src/libserver/logger/logger_private.h b/src/libserver/logger/logger_private.h index 80178ad32..387d8639b 100644 --- 
a/src/libserver/logger/logger_private.h +++ b/src/libserver/logger/logger_private.h @@ -1,5 +1,5 @@ /* - * Copyright 2023 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,12 @@ #define REPEATS_MAX 300 #define LOGBUF_LEN 8192 +enum rspamd_log_tag_strip_policy { + RSPAMD_LOG_TAG_STRIP_RIGHT = 0, /* Cut right part (current behavior) */ + RSPAMD_LOG_TAG_STRIP_LEFT, /* Cut left part (take last elements) */ + RSPAMD_LOG_TAG_STRIP_MIDDLE, /* Half from start and half from end */ +}; + struct rspamd_log_module { char *mname; unsigned int id; @@ -73,6 +79,10 @@ struct rspamd_logger_s { gboolean is_debug; gboolean no_lock; + /* Log tag configuration */ + unsigned int max_log_tag_len; + enum rspamd_log_tag_strip_policy log_tag_strip_policy; + pid_t pid; const char *process_type; struct rspamd_radix_map_helper *debug_ip; diff --git a/src/libserver/maps/map.c b/src/libserver/maps/map.c index 51390f24b..6de694eb3 100644 --- a/src/libserver/maps/map.c +++ b/src/libserver/maps/map.c @@ -26,6 +26,8 @@ #include "contrib/libev/ev.h" #include "contrib/uthash/utlist.h" +#include <worker_util.h> + #ifdef SYS_ZSTD #include "zstd.h" #else @@ -84,7 +86,8 @@ RSPAMD_CONSTRUCTOR(rspamd_map_log_init) } /** - * Write HTTP request + * Write HTTP request with proper cache validation headers + * Uses ETags (If-None-Match) and Last-Modified (If-Modified-Since) for conditional requests */ static void write_http_request(struct http_callback_data *cbd) @@ -109,7 +112,8 @@ write_http_request(struct http_callback_data *cbd) } if (cbd->data->etag) { rspamd_http_message_add_header_len(msg, "If-None-Match", - cbd->data->etag->str, cbd->data->etag->len); + cbd->data->etag->str, + cbd->data->etag->len); } } @@ -295,23 +299,101 @@ rspamd_map_cache_cb(struct ev_loop *loop, ev_timer *w, int revents) } } +/** + * Calculate next check time with proper priority for 
different cache validation mechanisms + * Priority: ETags > Last-Modified > Cache expiration headers + * @param now current time + * @param expires time from cache expiration header + * @param map_check_interval base polling interval + * @param has_etag whether we have ETag for conditional requests + * @param has_last_modified whether we have Last-Modified for conditional requests + * @return next check time + */ static inline time_t -rspamd_http_map_process_next_check(time_t now, time_t expires, time_t map_check_interval) +rspamd_http_map_process_next_check(struct rspamd_map *map, + struct rspamd_map_backend *bk, + time_t now, + time_t expires, + time_t map_check_interval, + gboolean has_etag, + gboolean has_last_modified) { - static const time_t interval_mult = 16; - /* By default use expires header */ - time_t next_check = expires; + static const time_t interval_mult = 4; /* Reduced from 16 to be more responsive */ + static const time_t min_respectful_interval = 5; + time_t next_check; + time_t effective_interval = map_check_interval; + + /* + * Priority order for cache validation: + * 1. ETags (most reliable) + * 2. Last-Modified dates + * 3. Cache expiration headers (least reliable) + */ + + if (has_etag || has_last_modified) { + /* + * If we have ETags or Last-Modified, we can use conditional requests + * to avoid unnecessary downloads. However, we still need to be respectful + * to servers and not DoS them with overly aggressive polling. + */ + if (map_check_interval < min_respectful_interval) { + /* + * User configured very aggressive polling, but server provides cache validation. + * Enforce minimum respectful interval to avoid DoS'ing the server. 
+ */ + effective_interval = min_respectful_interval * interval_mult; + msg_info_map("map polling interval %d too aggressive with server cache support for %s, " + "using %d seconds minimum", + (int) map_check_interval, bk->uri, (int) effective_interval); + } - if (expires < now) { - return now; + if (expires > now && (expires - now) <= effective_interval * interval_mult) { + /* Use expires header if it's reasonable (within interval_mult x poll interval) */ + next_check = expires; + } + else { + /* Use effective interval, don't extend too much */ + next_check = now + effective_interval; + } + } + else if (expires > now) { + /* + * No ETags or Last-Modified available, rely on cache expiration. + * But still cap the interval to avoid too long delays. + * No need for respectful interval protection here since no conditional requests. + */ + if (expires - now > map_check_interval * interval_mult) { + next_check = now + map_check_interval * interval_mult; + } + else { + next_check = expires; + } } - else if (expires - now > map_check_interval * interval_mult) { - next_check = now + map_check_interval * interval_mult; + else { + /* No valid cache information, check immediately */ + next_check = now; } return next_check; } +/** + * Calculate respectful polling interval to avoid DoS'ing servers with cache validation + * @param map_check_interval user configured interval + * @return effective interval that respects server resources + */ +static inline time_t +rspamd_map_get_respectful_interval(time_t map_check_interval) +{ + static const time_t min_respectful_interval = 5; /* Minimum 5 seconds to be respectful */ + static const time_t interval_mult = 4; /* Multiplier for respectful minimum */ + + if (map_check_interval < min_respectful_interval) { + return min_respectful_interval * interval_mult; + } + return map_check_interval; +} + static int http_map_finish(struct rspamd_http_connection *conn, struct rspamd_http_message *msg) @@ -333,13 +415,15 @@ http_map_finish(struct 
rspamd_http_connection *conn, if (msg->code == 200) { if (cbd->check) { - msg_info_map("need to reread map from %s", cbd->bk->uri); + msg_info_map("need to reread map from %s (reply code 200); " + "date timestamp: %z, last modified: %z", + cbd->bk->uri, (size_t) msg->date, (size_t) msg->last_modified); cbd->periodic->need_modify = TRUE; /* Reset the whole chain */ cbd->periodic->cur_backend = 0; /* Reset cache, old cached data will be cleaned on timeout */ g_atomic_int_set(&data->cache->available, 0); - g_atomic_int_set(&bk->shared->loaded, 0); + g_atomic_int_set(&map->shared->loaded, 0); data->cur_cache_cbd = NULL; rspamd_map_process_periodic(cbd->periodic); @@ -348,6 +432,7 @@ http_map_finish(struct rspamd_http_connection *conn, return 0; } + /* This code is executed when we are actually reading a map */ cbd->data->last_checked = msg->date; if (msg->last_modified) { @@ -378,10 +463,11 @@ http_map_finish(struct rspamd_http_connection *conn, goto err; } - /* Check for expires */ + /* Check for expires + etag */ double cached_timeout = map->poll_timeout * 2; expires_hdr = rspamd_http_message_find_header(msg, "Expires"); + etag_hdr = rspamd_http_message_find_header(msg, "ETag"); if (expires_hdr) { time_t hdate; @@ -389,8 +475,10 @@ http_map_finish(struct rspamd_http_connection *conn, hdate = rspamd_http_parse_date(expires_hdr->begin, expires_hdr->len); if (hdate != (time_t) -1 && hdate > msg->date) { - map->next_check = rspamd_http_map_process_next_check(msg->date, hdate, - (time_t) map->poll_timeout); + map->next_check = rspamd_http_map_process_next_check(map, bk, msg->date, hdate, + (time_t) map->poll_timeout, + etag_hdr != NULL, + msg->last_modified != 0); cached_timeout = map->next_check - msg->date; } else { @@ -398,9 +486,16 @@ http_map_finish(struct rspamd_http_connection *conn, map->next_check = 0; } } - - /* Check for etag */ - etag_hdr = rspamd_http_message_find_header(msg, "ETag"); + else if (etag_hdr != NULL || msg->last_modified != 0) { + /* No expires 
header, but we have ETag or Last-Modified - use respectful interval */ + time_t effective_interval = rspamd_map_get_respectful_interval(map->poll_timeout); + if (effective_interval != map->poll_timeout) { + msg_info_map("map polling interval %d too aggressive with server cache support, " + "using %d seconds minimum", + (int) map->poll_timeout, (int) effective_interval); + } + map->next_check = msg->date + effective_interval; + } if (etag_hdr) { if (cbd->data->etag) { @@ -421,12 +516,7 @@ http_map_finish(struct rspamd_http_connection *conn, MAP_RETAIN(cbd->shmem_data, "shmem_data"); cbd->data->gen++; - /* - * We know that a map is in the locked state - */ - g_atomic_int_set(&data->cache->available, 1); - g_atomic_int_set(&bk->shared->loaded, 1); - g_atomic_int_set(&bk->shared->cached, 0); + /* Store cached data */ rspamd_strlcpy(data->cache->shmem_name, cbd->shmem_data->shm_name, sizeof(data->cache->shmem_name)); @@ -528,6 +618,12 @@ http_map_finish(struct rspamd_http_connection *conn, cbd->periodic->cur_backend++; munmap(in, dlen); + + /* Announce for other processes */ + g_atomic_int_set(&data->cache->available, 1); + g_atomic_int_set(&map->shared->loaded, 1); + g_atomic_int_set(&map->shared->cached, 1); + rspamd_map_process_periodic(cbd->periodic); } else if (msg->code == 304 && cbd->check) { @@ -541,19 +637,33 @@ http_map_finish(struct rspamd_http_connection *conn, } expires_hdr = rspamd_http_message_find_header(msg, "Expires"); + bool has_expires = (expires_hdr != NULL); if (expires_hdr) { time_t hdate; hdate = rspamd_http_parse_date(expires_hdr->begin, expires_hdr->len); if (hdate != (time_t) -1 && hdate > msg->date) { - map->next_check = rspamd_http_map_process_next_check(msg->date, hdate, - (time_t) map->poll_timeout); + map->next_check = rspamd_http_map_process_next_check(map, bk, msg->date, hdate, + (time_t) map->poll_timeout, + cbd->data->etag != NULL, + msg->last_modified != 0); } else { msg_info_map("invalid expires header: %T, ignore it", expires_hdr); 
map->next_check = 0; + has_expires = false; + } + } + else if (cbd->data->etag != NULL || msg->last_modified != 0) { + /* No expires header, but we have ETag or Last-Modified - use respectful interval */ + time_t effective_interval = rspamd_map_get_respectful_interval(map->poll_timeout); + if (effective_interval != map->poll_timeout) { + msg_info_map("map polling interval %d too aggressive with server cache support, " + "using %d seconds minimum", + (int) map->poll_timeout, (int) effective_interval); } + map->next_check = msg->date + effective_interval; } etag_hdr = rspamd_http_message_find_header(msg, "ETag"); @@ -567,19 +677,24 @@ http_map_finish(struct rspamd_http_connection *conn, } } - if (map->next_check) { + if (has_expires) { rspamd_http_date_format(next_check_date, sizeof(next_check_date), map->next_check); - msg_info_map("data is not modified for server %s, next check at %s " + msg_info_map("data is not modified for server %s (%s), next check at %s " "(http cache based: %T)", - cbd->data->host, next_check_date, expires_hdr); + cbd->data->host, + bk->uri, + next_check_date, + expires_hdr); } else { rspamd_http_date_format(next_check_date, sizeof(next_check_date), - rspamd_get_calendar_ticks() + map->poll_timeout); - msg_info_map("data is not modified for server %s, next check at %s " + map->next_check); + msg_info_map("data is not modified for server %s (%s), next check at %s " "(timer based)", - cbd->data->host, next_check_date); + cbd->data->host, + bk->uri, + next_check_date); } rspamd_map_update_http_cached_file(map, bk, cbd->data); @@ -922,7 +1037,7 @@ read_map_file(struct rspamd_map *map, struct file_map_data *data, map->read_callback(NULL, 0, &periodic->cbdata, TRUE); } - g_atomic_int_set(&bk->shared->loaded, 1); + g_atomic_int_set(&map->shared->loaded, 1); return TRUE; } @@ -1008,7 +1123,7 @@ read_map_static(struct rspamd_map *map, struct static_map_data *data, } data->processed = TRUE; - g_atomic_int_set(&bk->shared->loaded, 1); + 
g_atomic_int_set(&map->shared->loaded, 1); return TRUE; } @@ -1016,10 +1131,7 @@ read_map_static(struct rspamd_map *map, struct static_map_data *data, static void rspamd_map_periodic_dtor(struct map_periodic_cbdata *periodic) { - struct rspamd_map *map; - struct rspamd_map_backend *bk; - - map = periodic->map; + struct rspamd_map *map = periodic->map; msg_debug_map("periodic dtor %p; need_modify=%d", periodic, periodic->need_modify); if (periodic->need_modify || periodic->cbdata.errored) { @@ -1034,21 +1146,13 @@ rspamd_map_periodic_dtor(struct map_periodic_cbdata *periodic) /* Not modified */ } - if (periodic->locked) { - if (periodic->cur_backend < map->backends->len) { - bk = (struct rspamd_map_backend *) g_ptr_array_index(map->backends, periodic->cur_backend); - g_atomic_int_set(&bk->shared->locked, 0); - msg_debug_map("unlocked map %s", map->name); - } - - if (periodic->map->wrk->state == rspamd_worker_state_running) { - rspamd_map_schedule_periodic(periodic->map, - RSPAMD_SYMBOL_RESULT_NORMAL); - } - else { - msg_debug_map("stop scheduling periodics for %s; terminating state", - periodic->map->name); - } + if (periodic->map->wrk->state == rspamd_worker_state_running) { + rspamd_map_schedule_periodic(periodic->map, + RSPAMD_MAP_SCHEDULE_NORMAL); + } + else { + msg_debug_map("stop scheduling periodics for %s; terminating state", + periodic->map->name); } g_free(periodic); @@ -1448,9 +1552,6 @@ rspamd_map_read_cached(struct rspamd_map *map, struct rspamd_map_backend *bk, map->read_callback(in, len, &periodic->cbdata, TRUE); } - g_atomic_int_set(&bk->shared->loaded, 1); - g_atomic_int_set(&bk->shared->cached, 1); - munmap(in, mmap_len); return TRUE; @@ -1488,7 +1589,7 @@ rspamd_map_save_http_cached_file(struct rspamd_map *map, const unsigned char *data, gsize len) { - char path[PATH_MAX]; + char path[PATH_MAX], temp_path[PATH_MAX]; unsigned char digest[rspamd_cryptobox_HASHBYTES]; struct rspamd_config *cfg = map->cfg; int fd; @@ -1501,8 +1602,10 @@ 
rspamd_map_save_http_cached_file(struct rspamd_map *map, rspamd_cryptobox_hash(digest, bk->uri, strlen(bk->uri), NULL, 0); rspamd_snprintf(path, sizeof(path), "%s%c%*xs.map", cfg->maps_cache_dir, G_DIR_SEPARATOR, 20, digest); + rspamd_snprintf(temp_path, sizeof(temp_path), "%s.tmp.%d.%d", path, + (int) getpid(), (int) rspamd_get_calendar_ticks()); - fd = rspamd_file_xopen(path, O_WRONLY | O_TRUNC | O_CREAT, + fd = rspamd_file_xopen(temp_path, O_WRONLY | O_TRUNC | O_CREAT, 00600, FALSE); if (fd == -1) { @@ -1510,8 +1613,9 @@ rspamd_map_save_http_cached_file(struct rspamd_map *map, } if (!rspamd_file_lock(fd, FALSE)) { - msg_err_map("cannot lock file %s: %s", path, strerror(errno)); + msg_err_map("cannot lock file %s: %s", temp_path, strerror(errno)); close(fd); + unlink(temp_path); return FALSE; } @@ -1530,9 +1634,10 @@ rspamd_map_save_http_cached_file(struct rspamd_map *map, } if (write(fd, &header, sizeof(header)) != sizeof(header)) { - msg_err_map("cannot write file %s (header stage): %s", path, strerror(errno)); + msg_err_map("cannot write file %s (header stage): %s", temp_path, strerror(errno)); rspamd_file_unlock(fd, FALSE); close(fd); + unlink(temp_path); return FALSE; } @@ -1540,9 +1645,10 @@ rspamd_map_save_http_cached_file(struct rspamd_map *map, if (header.etag_len > 0) { if (write(fd, RSPAMD_FSTRING_DATA(htdata->etag), header.etag_len) != header.etag_len) { - msg_err_map("cannot write file %s (etag stage): %s", path, strerror(errno)); + msg_err_map("cannot write file %s (etag stage): %s", temp_path, strerror(errno)); rspamd_file_unlock(fd, FALSE); close(fd); + unlink(temp_path); return FALSE; } @@ -1550,9 +1656,10 @@ rspamd_map_save_http_cached_file(struct rspamd_map *map, /* Now write the rest */ if (write(fd, data, len) != len) { - msg_err_map("cannot write file %s (data stage): %s", path, strerror(errno)); + msg_err_map("cannot write file %s (data stage): %s", temp_path, strerror(errno)); rspamd_file_unlock(fd, FALSE); close(fd); + unlink(temp_path); 
return FALSE; } @@ -1560,6 +1667,13 @@ rspamd_map_save_http_cached_file(struct rspamd_map *map, rspamd_file_unlock(fd, FALSE); close(fd); + /* Atomically move temp file to final location */ + if (rename(temp_path, path) != 0) { + msg_err_map("cannot rename %s to %s: %s", temp_path, path, strerror(errno)); + unlink(temp_path); + return FALSE; + } + msg_info_map("saved data from %s in %s, %uz bytes", bk->uri, path, len + sizeof(header) + header.etag_len); return TRUE; @@ -1693,7 +1807,11 @@ rspamd_map_read_http_cached_file(struct rspamd_map *map, double now = rspamd_get_calendar_ticks(); if (header.next_check > now) { - map->next_check = rspamd_http_map_process_next_check(now, header.next_check, map->poll_timeout); + /* We assume that we have this data inside the cached file */ + map->next_check = rspamd_http_map_process_next_check(map, bk, now, header.next_check, + map->poll_timeout, + header.etag_len > 0, + true); } else { map->next_check = now; @@ -1740,8 +1858,9 @@ rspamd_map_read_http_cached_file(struct rspamd_map *map, struct tm tm; char ncheck_buf[32], lm_buf[32]; - g_atomic_int_set(&bk->shared->loaded, 1); - g_atomic_int_set(&bk->shared->cached, 1); + g_atomic_int_set(&map->shared->loaded, 1); + g_atomic_int_set(&map->shared->cached, 1); + rspamd_localtime(map->next_check, &tm); strftime(ncheck_buf, sizeof(ncheck_buf) - 1, "%Y-%m-%d %H:%M:%S", &tm); rspamd_localtime(htdata->last_modified, &tm); @@ -1784,7 +1903,6 @@ rspamd_map_common_http_callback(struct rspamd_map *map, (int) data->last_modified, (int) data->cache->last_modified); periodic->need_modify = TRUE; - /* Reset the whole chain */ periodic->cur_backend = 0; rspamd_map_process_periodic(periodic); } @@ -2054,33 +2172,10 @@ rspamd_map_process_periodic(struct map_periodic_cbdata *cbd) bk = g_ptr_array_index(map->backends, cbd->cur_backend); - if (!map->file_only && !cbd->locked) { - if (!g_atomic_int_compare_and_exchange(&bk->shared->locked, - 0, 1)) { - msg_debug_map( - "don't try to reread map %s as 
it is locked by other process, " - "will reread it later", - cbd->map->name); - rspamd_map_schedule_periodic(map, RSPAMD_MAP_SCHEDULE_LOCKED); - MAP_RELEASE(cbd, "periodic"); - - return; - } - else { - msg_debug_map("locked map %s", map->name); - cbd->locked = TRUE; - } - } - if (cbd->errored) { /* We should not check other backends if some backend has failed*/ rspamd_map_schedule_periodic(cbd->map, RSPAMD_MAP_SCHEDULE_ERROR); - if (cbd->locked) { - g_atomic_int_set(&bk->shared->locked, 0); - cbd->locked = FALSE; - } - /* Also set error flag for the map consumer */ cbd->cbdata.errored = true; @@ -2796,9 +2891,6 @@ rspamd_map_parse_backend(struct rspamd_config *cfg, const char *map_line) bk->data.sd = sdata; } - bk->shared = rspamd_mempool_alloc0_shared(cfg->cfg_pool, - sizeof(struct rspamd_map_shared_backend_data)); - return bk; err: @@ -2929,6 +3021,8 @@ rspamd_map_add(struct rspamd_config *cfg, map->user_data = user_data; map->cfg = cfg; map->id = rspamd_random_uint64_fast(); + map->shared = + rspamd_mempool_alloc0_shared(cfg->cfg_pool, sizeof(struct rspamd_map_shared_data)); map->backends = g_ptr_array_sized_new(1); map->wrk = worker; rspamd_mempool_add_destructor(cfg->cfg_pool, rspamd_ptr_array_free_hard, @@ -3027,6 +3121,8 @@ rspamd_map_add_from_ucl(struct rspamd_config *cfg, map->user_data = user_data; map->cfg = cfg; map->id = rspamd_random_uint64_fast(); + map->shared = + rspamd_mempool_alloc0_shared(cfg->cfg_pool, sizeof(struct rspamd_map_shared_data)); map->backends = g_ptr_array_new(); map->wrk = worker; map->no_file_read = (flags & RSPAMD_MAP_FILE_NO_READ); @@ -3208,7 +3304,7 @@ rspamd_map_add_from_ucl(struct rspamd_config *cfg, if (all_loaded) { /* Static map */ - g_atomic_int_set(&bk->shared->loaded, 1); + g_atomic_int_set(&map->shared->loaded, 1); } rspamd_map_calculate_hash(map); @@ -3257,3 +3353,51 @@ void rspamd_map_set_on_load_function(struct rspamd_map *map, rspamd_map_on_load_ map->on_load_ud_dtor = dtor; } } + +void 
rspamd_map_trigger_hyperscan_compilation(struct rspamd_map *map) +{ + /* Only trigger compilation in controller worker */ + if (!map->cfg || !map->cfg->cur_worker) { + return; + } + + struct rspamd_worker *worker = map->wrk; + if (!rspamd_worker_is_primary_controller(worker)) { + return; + } + + /* Check if we have any scopes that need compilation */ + if (!map->cfg->re_cache) { + return; + } + + unsigned int scope_count = rspamd_re_cache_count_scopes(map->cfg->re_cache); + if (scope_count == 0) { + return; + } + + /* Iterate through scopes and compile those that are loaded */ + struct rspamd_re_cache *scope; + + for (scope = rspamd_re_cache_scope_first(map->cfg->re_cache); + scope != NULL; + scope = rspamd_re_cache_scope_next(scope)) { + const char *scope_name = rspamd_re_cache_scope_name(scope); + const char *scope_for_check = (strcmp(scope_name, "default") == 0) ? NULL : scope_name; + + /* Only compile loaded scopes */ + if (rspamd_re_cache_is_loaded(map->cfg->re_cache, scope_for_check)) { + msg_info_map("triggering hyperscan compilation for scope: %s after map update", + scope_name); + + /* Use default settings for compilation */ + rspamd_re_cache_compile_hyperscan_scoped_single(scope, scope_for_check, + map->cfg->hs_cache_dir ? map->cfg->hs_cache_dir : RSPAMD_DBDIR "/", + 1.0, /* max_time */ + FALSE, /* silent */ + worker->ctx ? 
((struct rspamd_abstract_worker_ctx *) worker->ctx)->event_loop : NULL, + NULL, /* callback */ + NULL); /* cbdata */ + } + } +} diff --git a/src/libserver/maps/map.h b/src/libserver/maps/map.h index b2ba53118..27915e4c9 100644 --- a/src/libserver/maps/map.h +++ b/src/libserver/maps/map.h @@ -161,6 +161,12 @@ void rspamd_map_traverse(struct rspamd_map *map, rspamd_map_traverse_cb cb, void rspamd_map_set_on_load_function(struct rspamd_map *map, rspamd_map_on_load_function cb, gpointer cbdata, GDestroyNotify dtor); +/** + * Trigger hyperscan compilation for regexp scopes that may have been updated + * @param map map that was updated + */ +void rspamd_map_trigger_hyperscan_compilation(struct rspamd_map *map); + #ifdef __cplusplus } #endif diff --git a/src/libserver/maps/map_private.h b/src/libserver/maps/map_private.h index 66949f926..65df8d7f5 100644 --- a/src/libserver/maps/map_private.h +++ b/src/libserver/maps/map_private.h @@ -134,20 +134,12 @@ union rspamd_map_backend_data { struct rspamd_map; -/* - * Shared between workers - */ -struct rspamd_map_shared_backend_data { - int locked; - int loaded; - int cached; -}; + struct rspamd_map_backend { enum fetch_proto protocol; gboolean is_signed; gboolean is_compressed; gboolean is_fallback; - struct rspamd_map_shared_backend_data *shared; struct rspamd_map *map; struct ev_loop *event_loop; uint64_t id; @@ -159,6 +151,14 @@ struct rspamd_map_backend { struct map_periodic_cbdata; +/* + * Shared between workers + */ +struct rspamd_map_shared_data { + int loaded; + int cached; +}; + struct rspamd_map { struct rspamd_dns_resolver *r; struct rspamd_config *cfg; @@ -193,6 +193,8 @@ struct rspamd_map { bool static_only; /* No need to check */ bool no_file_read; /* Do not read files */ bool seen; /* This map has already been watched or pre-loaded */ + /* Shared lock for temporary disabling of map reading (e.g. 
when this map is written by UI) */ + struct rspamd_map_shared_data *shared; char tag[MEMPOOL_UID_LEN]; }; @@ -209,7 +211,6 @@ struct map_periodic_cbdata { ev_timer ev; gboolean need_modify; gboolean errored; - gboolean locked; unsigned int cur_backend; ref_entry_t ref; }; diff --git a/src/libserver/milter.c b/src/libserver/milter.c index 94b0d6cc1..09ddddaba 100644 --- a/src/libserver/milter.c +++ b/src/libserver/milter.c @@ -1473,8 +1473,6 @@ rspamd_milter_macro_http(struct rspamd_milter_session *session, { rspamd_http_message_add_header_len(msg, QUEUE_ID_HEADER, found->begin, found->len); - rspamd_http_message_add_header_len(msg, LOG_TAG_HEADER, - found->begin, found->len); } else { @@ -1482,8 +1480,6 @@ rspamd_milter_macro_http(struct rspamd_milter_session *session, { rspamd_http_message_add_header_len(msg, QUEUE_ID_HEADER, found->begin, found->len); - rspamd_http_message_add_header_len(msg, LOG_TAG_HEADER, - found->begin, found->len); } } diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c index b683547a1..b085c69d7 100644 --- a/src/libserver/protocol.c +++ b/src/libserver/protocol.c @@ -1668,8 +1668,21 @@ void rspamd_protocol_http_reply(struct rspamd_http_message *msg, } } - if ((task->protocol_flags & RSPAMD_TASK_PROTOCOL_FLAG_COMPRESSED) && - rspamd_libs_reset_compression(task->cfg->libs_ctx)) { + /* Check if we should compress the response */ + gboolean should_compress = FALSE; + + /* Rule 1: If request had compression, preserve it (existing behavior) */ + if (task->protocol_flags & RSPAMD_TASK_PROTOCOL_FLAG_COMPRESSED) { + should_compress = TRUE; + } + + /* Rule 2: If client supports zstd compression, honor it (takes precedence) */ + const rspamd_ftok_t *accept_encoding = rspamd_task_get_request_header(task, "Accept-Encoding"); + if (accept_encoding && rspamd_substring_search_caseless(accept_encoding->begin, accept_encoding->len, "zstd", 4) != -1) { + should_compress = TRUE; + } + + if (should_compress && 
rspamd_libs_reset_compression(task->cfg->libs_ctx)) { /* We can compress output */ ZSTD_inBuffer zin; ZSTD_outBuffer zout; diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c index 06e9f3328..06ba26528 100644 --- a/src/libserver/re_cache.c +++ b/src/libserver/re_cache.c @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -91,6 +91,7 @@ struct rspamd_re_class { gsize type_len; GHashTable *re; rspamd_cryptobox_hash_state_t *st; + struct rspamd_re_cache *cache; /* Back-reference to owning cache */ char hash[rspamd_cryptobox_HASHBYTES + 1]; @@ -126,6 +127,12 @@ struct rspamd_re_cache { unsigned int max_re_data; char hash[rspamd_cryptobox_HASHBYTES + 1]; lua_State *L; + + /* Intrusive linked list for scoped caches */ + struct rspamd_re_cache *next, *prev; + char *scope; + unsigned int flags; /* Cache flags (loaded state, etc.) 
*/ + #ifdef WITH_HYPERSCAN enum rspamd_hyperscan_status hyperscan_loaded; gboolean disable_hyperscan; @@ -149,6 +156,9 @@ struct rspamd_re_runtime { struct rspamd_re_cache *cache; struct rspamd_re_cache_stat stat; gboolean has_hs; + + /* Linked list for multiple scoped runtimes */ + struct rspamd_re_runtime *next, *prev; }; static GQuark @@ -174,6 +184,63 @@ rspamd_re_cache_class_id(enum rspamd_re_type type, return rspamd_cryptobox_fast_hash_final(&st); } +static struct rspamd_re_cache * +rspamd_re_cache_find_by_scope(struct rspamd_re_cache *cache_head, const char *scope) +{ + struct rspamd_re_cache *cur; + + if (!cache_head) { + return NULL; + } + + DL_FOREACH(cache_head, cur) + { + if (scope == NULL && cur->scope == NULL) { + /* Looking for default scope */ + return cur; + } + else if (scope != NULL && cur->scope != NULL && strcmp(cur->scope, scope) == 0) { + return cur; + } + } + + return NULL; +} + +static struct rspamd_re_cache * +rspamd_re_cache_add_to_scope_list(struct rspamd_re_cache **cache_head, const char *scope) +{ + struct rspamd_re_cache *new_cache, *existing; + + if (!cache_head) { + return NULL; + } + + /* Check if scope already exists */ + existing = rspamd_re_cache_find_by_scope(*cache_head, scope); + if (existing) { + return existing; + } + + /* Create new cache for this scope */ + new_cache = rspamd_re_cache_new(); + if (new_cache->scope) { + g_free(new_cache->scope); + } + new_cache->scope = g_strdup(scope); + new_cache->flags = 0; /* New scopes start as unloaded */ + + /* Add to linked list */ + if (*cache_head) { + DL_APPEND(*cache_head, new_cache); + } + else { + *cache_head = new_cache; + } + + return new_cache; +} + static void rspamd_re_cache_destroy(struct rspamd_re_cache *cache) { @@ -230,6 +297,11 @@ rspamd_re_cache_destroy(struct rspamd_re_cache *cache) g_hash_table_unref(cache->re_classes); g_ptr_array_free(cache->re, TRUE); + + if (cache->scope) { + g_free(cache->scope); + } + g_free(cache); } @@ -252,6 +324,10 @@ 
rspamd_re_cache_new(void) cache->nre = 0; cache->re = g_ptr_array_new_full(256, rspamd_re_cache_elt_dtor); cache->selectors = kh_init(lua_selectors_hash); + cache->next = NULL; + cache->prev = cache; + cache->scope = NULL; /* Default scope */ + cache->flags = RSPAMD_RE_CACHE_FLAG_LOADED; /* Default scope is always loaded */ #ifdef WITH_HYPERSCAN cache->hyperscan_loaded = RSPAMD_HYPERSCAN_UNKNOWN; #endif @@ -295,6 +371,7 @@ rspamd_re_cache_add(struct rspamd_re_cache *cache, re_class->id = class_id; re_class->type_len = datalen; re_class->type = type; + re_class->cache = cache; /* Set back-reference */ re_class->re = g_hash_table_new_full(rspamd_regexp_hash, rspamd_regexp_equal, NULL, (GDestroyNotify) rspamd_regexp_unref); @@ -330,6 +407,26 @@ rspamd_re_cache_add(struct rspamd_re_cache *cache, return nre; } +rspamd_regexp_t * +rspamd_re_cache_add_scoped(struct rspamd_re_cache **cache_head, const char *scope, + rspamd_regexp_t *re, enum rspamd_re_type type, + gconstpointer type_data, gsize datalen, + int lua_cbref) +{ + struct rspamd_re_cache *cache; + + g_assert(cache_head != NULL); + g_assert(re != NULL); + + /* NULL scope is allowed for default scope */ + cache = rspamd_re_cache_add_to_scope_list(cache_head, scope); + if (!cache) { + return NULL; + } + + return rspamd_re_cache_add(cache, re, type, type_data, datalen, lua_cbref); +} + void rspamd_re_cache_replace(struct rspamd_re_cache *cache, rspamd_regexp_t *what, rspamd_regexp_t *with) @@ -371,6 +468,23 @@ void rspamd_re_cache_replace(struct rspamd_re_cache *cache, } } +void rspamd_re_cache_replace_scoped(struct rspamd_re_cache **cache_head, const char *scope, + rspamd_regexp_t *what, + rspamd_regexp_t *with) +{ + struct rspamd_re_cache *cache; + + g_assert(cache_head != NULL); + g_assert(what != NULL); + g_assert(with != NULL); + + /* NULL scope is allowed for default scope */ + cache = rspamd_re_cache_find_by_scope(*cache_head, scope); + if (cache) { + rspamd_re_cache_replace(cache, what, with); + } +} + static 
int rspamd_re_cache_sort_func(gconstpointer a, gconstpointer b) { @@ -515,8 +629,24 @@ void rspamd_re_cache_init(struct rspamd_re_cache *cache, struct rspamd_config *c #endif } -struct rspamd_re_runtime * -rspamd_re_cache_runtime_new(struct rspamd_re_cache *cache) +void rspamd_re_cache_init_scoped(struct rspamd_re_cache *cache_head, + struct rspamd_config *cfg) +{ + struct rspamd_re_cache *cur; + + g_assert(cache_head != NULL); + + DL_FOREACH(cache_head, cur) + { + /* Only initialize loaded scopes */ + if (cur->flags & RSPAMD_RE_CACHE_FLAG_LOADED) { + rspamd_re_cache_init(cur, cfg); + } + } +} + +static struct rspamd_re_runtime * +rspamd_re_cache_runtime_new_single(struct rspamd_re_cache *cache) { struct rspamd_re_runtime *rt; g_assert(cache != NULL); @@ -530,10 +660,73 @@ rspamd_re_cache_runtime_new(struct rspamd_re_cache *cache) #ifdef WITH_HYPERSCAN rt->has_hs = cache->hyperscan_loaded; #endif + /* Initialize the doubly-linked list pointers */ + rt->next = NULL; + rt->prev = NULL; return rt; } +struct rspamd_re_runtime * +rspamd_re_cache_runtime_new(struct rspamd_re_cache *cache) +{ + struct rspamd_re_runtime *rt_head = NULL, *rt; + struct rspamd_re_cache *cur; + + g_assert(cache != NULL); + + /* + * Create runtime for all loaded scopes in the chain. + * This ensures task has runtimes for all available loaded scopes. 
+ */ + DL_FOREACH(cache, cur) + { + /* Skip unloaded scopes */ + if (!(cur->flags & RSPAMD_RE_CACHE_FLAG_LOADED)) { + continue; + } + + rt = rspamd_re_cache_runtime_new_single(cur); + if (rt) { + if (rt_head) { + DL_APPEND(rt_head, rt); + } + else { + rt_head = rt; + /* For doubly-linked list, first element's prev should point to itself */ + rt_head->prev = rt_head; + rt_head->next = NULL; + } + } + } + + return rt_head; +} + +struct rspamd_re_runtime * +rspamd_re_cache_runtime_new_all_scopes(struct rspamd_re_cache *cache_head) +{ + /* This is now the same as the main function since it always creates for all scopes */ + return rspamd_re_cache_runtime_new(cache_head); +} + +struct rspamd_re_runtime * +rspamd_re_cache_runtime_new_scoped(struct rspamd_re_cache *cache_head, const char *scope) +{ + struct rspamd_re_cache *cache; + + if (!cache_head) { + return NULL; + } + + cache = rspamd_re_cache_find_by_scope(cache_head, scope); + if (!cache) { + return NULL; + } + + return rspamd_re_cache_runtime_new_single(cache); +} + const struct rspamd_re_cache_stat * rspamd_re_cache_get_stat(struct rspamd_re_runtime *rt) { @@ -998,20 +1191,21 @@ rspamd_re_cache_process_selector(struct rspamd_task *task, return result; } + static inline unsigned int -rspamd_process_words_vector(GArray *words, - const unsigned char **scvec, - unsigned int *lenvec, - struct rspamd_re_class *re_class, - unsigned int cnt, - gboolean *raw) +rspamd_process_words_vector_kvec(rspamd_words_t *words, + const unsigned char **scvec, + unsigned int *lenvec, + struct rspamd_re_class *re_class, + unsigned int cnt, + gboolean *raw) { unsigned int j; - rspamd_stat_token_t *tok; + rspamd_word_t *tok; - if (words) { - for (j = 0; j < words->len; j++) { - tok = &g_array_index(words, rspamd_stat_token_t, j); + if (words && words->a) { + for (j = 0; j < kv_size(*words); j++) { + tok = &kv_A(*words, j); if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) { @@ -1432,13 
+1626,13 @@ rspamd_re_cache_exec_re(struct rspamd_task *task, PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) { - if (text_part->utf_words) { - cnt += text_part->utf_words->len; + if (text_part->utf_words.a) { + cnt += kv_size(text_part->utf_words); } } - if (task->meta_words && task->meta_words->len > 0) { - cnt += task->meta_words->len; + if (task->meta_words.a && kv_size(task->meta_words) > 0) { + cnt += kv_size(task->meta_words); } if (cnt > 0) { @@ -1449,15 +1643,15 @@ rspamd_re_cache_exec_re(struct rspamd_task *task, PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) { - if (text_part->utf_words) { - cnt = rspamd_process_words_vector(text_part->utf_words, - scvec, lenvec, re_class, cnt, &raw); + if (text_part->utf_words.a) { + cnt = rspamd_process_words_vector_kvec(&text_part->utf_words, + scvec, lenvec, re_class, cnt, &raw); } } - if (task->meta_words) { - cnt = rspamd_process_words_vector(task->meta_words, - scvec, lenvec, re_class, cnt, &raw); + if (task->meta_words.a) { + cnt = rspamd_process_words_vector_kvec(&task->meta_words, + scvec, lenvec, re_class, cnt, &raw); } ret = rspamd_re_cache_process_regexp_data(rt, re, @@ -1502,20 +1696,20 @@ rspamd_re_cache_exec_re(struct rspamd_task *task, return rt->results[re_id]; } -int rspamd_re_cache_process(struct rspamd_task *task, - rspamd_regexp_t *re, - enum rspamd_re_type type, - gconstpointer type_data, - gsize datalen, - gboolean is_strong) +static int +rspamd_re_cache_process_single(struct rspamd_task *task, + struct rspamd_re_runtime *rt, + rspamd_regexp_t *re, + enum rspamd_re_type type, + gconstpointer type_data, + gsize datalen, + gboolean is_strong) { uint64_t re_id; struct rspamd_re_class *re_class; struct rspamd_re_cache *cache; - struct rspamd_re_runtime *rt; g_assert(task != NULL); - rt = task->re_rt; g_assert(rt != NULL); g_assert(re != NULL); @@ -1550,6 +1744,53 @@ int rspamd_re_cache_process(struct rspamd_task *task, return 0; } +int rspamd_re_cache_process(struct 
rspamd_task *task, + rspamd_regexp_t *re, + enum rspamd_re_type type, + gconstpointer type_data, + gsize datalen, + gboolean is_strong) +{ + struct rspamd_re_runtime *rt_list, *rt; + struct rspamd_re_class *re_class; + struct rspamd_re_cache *target_cache; + int result = 0; + + g_assert(task != NULL); + g_assert(re != NULL); + + rt_list = task->re_rt; + if (!rt_list) { + return 0; + } + + /* + * Since each regexp belongs to a class which belongs to a cache, + * we can find the correct cache and corresponding runtime + */ + re_class = rspamd_regexp_get_class(re); + if (!re_class) { + return 0; + } + + target_cache = re_class->cache; + if (!target_cache) { + return 0; + } + + /* Find the runtime that matches the cache */ + DL_FOREACH(rt_list, rt) + { + if (rt->cache == target_cache) { + result = rspamd_re_cache_process_single(task, rt, re, type, + type_data, datalen, is_strong); + break; + } + } + + return result; +} + int rspamd_re_cache_process_ffi(void *ptask, void *pre, int type, @@ -1570,24 +1811,30 @@ int rspamd_re_cache_process_ffi(void *ptask, void rspamd_re_cache_runtime_destroy(struct rspamd_re_runtime *rt) { + struct rspamd_re_runtime *cur, *tmp; + g_assert(rt != NULL); - if (rt->sel_cache) { - struct rspamd_re_selector_result sr; + /* Handle linked list of runtimes */ + DL_FOREACH_SAFE(rt, cur, tmp) + { + if (cur->sel_cache) { + struct rspamd_re_selector_result sr; - kh_foreach_value(rt->sel_cache, sr, { - for (unsigned int i = 0; i < sr.cnt; i++) { - g_free((gpointer) sr.scvec[i]); - } + kh_foreach_value(cur->sel_cache, sr, { + for (unsigned int i = 0; i < sr.cnt; i++) { + g_free((gpointer) sr.scvec[i]); + } - g_free(sr.scvec); - g_free(sr.lenvec); - }); - kh_destroy(selectors_results_hash, rt->sel_cache); - } + g_free(sr.scvec); + g_free(sr.lenvec); + }); + kh_destroy(selectors_results_hash, cur->sel_cache); + } - REF_RELEASE(rt->cache); - g_free(rt); + REF_RELEASE(cur->cache); + g_free(cur); + } } void rspamd_re_cache_unref(struct rspamd_re_cache 
*cache) @@ -1597,6 +1844,21 @@ void rspamd_re_cache_unref(struct rspamd_re_cache *cache) } } +void rspamd_re_cache_unref_scoped(struct rspamd_re_cache *cache_head) +{ + struct rspamd_re_cache *cur, *tmp; + + if (!cache_head) { + return; + } + + DL_FOREACH_SAFE(cache_head, cur, tmp) + { + DL_DELETE(cache_head, cur); + rspamd_re_cache_unref(cur); + } +} + struct rspamd_re_cache * rspamd_re_cache_ref(struct rspamd_re_cache *cache) { @@ -1619,6 +1881,23 @@ unsigned int rspamd_re_cache_set_limit(struct rspamd_re_cache *cache, unsigned i return old; } +unsigned int rspamd_re_cache_set_limit_scoped(struct rspamd_re_cache *cache_head, const char *scope, unsigned int limit) +{ + struct rspamd_re_cache *cache; + unsigned int old = 0; + + if (!cache_head || !scope) { + return old; + } + + cache = rspamd_re_cache_find_by_scope(cache_head, scope); + if (cache) { + old = rspamd_re_cache_set_limit(cache, limit); + } + + return old; +} + const char * rspamd_re_cache_type_to_string(enum rspamd_re_type type) { @@ -1936,21 +2215,27 @@ rspamd_re_cache_compile_timer_cb(EV_P_ ev_timer *w, int revents) if (re_class->type_len > 0) { if (!cbdata->silent) { msg_info_re_cache( - "skip already valid class %s(%*s) to cache %6s, %d regexps", + "skip already valid class %s(%*s) to cache %6s, %d regexps%s%s%s", rspamd_re_cache_type_to_string(re_class->type), (int) re_class->type_len - 1, re_class->type_data, re_class->hash, - n); + n, + cache->scope ? " for scope '" : "", + cache->scope ? cache->scope : "", + cache->scope ? "'" : ""); } } else { if (!cbdata->silent) { msg_info_re_cache( - "skip already valid class %s to cache %6s, %d regexps", + "skip already valid class %s to cache %6s, %d regexps%s%s%s", rspamd_re_cache_type_to_string(re_class->type), re_class->hash, - n); + n, + cache->scope ? " for scope '" : "", + cache->scope ? cache->scope : "", + cache->scope ? 
"'" : ""); } } @@ -2159,21 +2444,27 @@ rspamd_re_cache_compile_timer_cb(EV_P_ ev_timer *w, int revents) if (re_class->type_len > 0) { msg_info_re_cache( - "compiled class %s(%*s) to cache %6s, %d/%d regexps", + "compiled class %s(%*s) to cache %6s, %d/%d regexps%s%s%s", rspamd_re_cache_type_to_string(re_class->type), (int) re_class->type_len - 1, re_class->type_data, re_class->hash, n, - (int) g_hash_table_size(re_class->re)); + (int) g_hash_table_size(re_class->re), + cache->scope ? " for scope '" : "", + cache->scope ? cache->scope : "", + cache->scope ? "'" : ""); } else { msg_info_re_cache( - "compiled class %s to cache %6s, %d/%d regexps", + "compiled class %s to cache %6s, %d/%d regexps%s%s%s", rspamd_re_cache_type_to_string(re_class->type), re_class->hash, n, - (int) g_hash_table_size(re_class->re)); + (int) g_hash_table_size(re_class->re), + cache->scope ? " for scope '" : "", + cache->scope ? cache->scope : "", + cache->scope ? "'" : ""); } cbdata->total += n; @@ -2256,6 +2547,108 @@ int rspamd_re_cache_compile_hyperscan(struct rspamd_re_cache *cache, #endif } +#ifdef WITH_HYPERSCAN +struct rspamd_re_cache_scoped_compile_data { + unsigned int total_scopes; + unsigned int completed_scopes; + unsigned int total_compiled; + GError *first_error; + void (*final_cb)(unsigned int ncompiled, GError *err, void *cbd); + void *final_cbd; +}; + +static void +rspamd_re_cache_compile_scoped_coordination_cb(unsigned int ncompiled, GError *err, void *cbd) +{ + struct rspamd_re_cache_scoped_compile_data *coord_data = + (struct rspamd_re_cache_scoped_compile_data *) cbd; + + coord_data->completed_scopes++; + coord_data->total_compiled += ncompiled; + + /* Store the first error we encounter */ + if (err && !coord_data->first_error) { + coord_data->first_error = g_error_copy(err); + } + + /* Check if all scopes have completed */ + if (coord_data->completed_scopes >= coord_data->total_scopes) { + /* All scopes completed, call the final callback */ + if (coord_data->final_cb) { 
+ coord_data->final_cb(coord_data->total_compiled, coord_data->first_error, coord_data->final_cbd); + } + + /* Cleanup */ + if (coord_data->first_error) { + g_error_free(coord_data->first_error); + } + g_free(coord_data); + } +} +#endif + +int rspamd_re_cache_compile_hyperscan_scoped(struct rspamd_re_cache *cache_head, + const char *cache_dir, + double max_time, + gboolean silent, + struct ev_loop *event_loop, + void (*cb)(unsigned int ncompiled, GError *err, void *cbd), + void *cbd) +{ +#ifndef WITH_HYPERSCAN + return -1; +#else + struct rspamd_re_cache *cur; + struct rspamd_re_cache_scoped_compile_data *coord_data; + unsigned int scope_count = 0; + int result; + + if (!cache_head) { + return -1; + } + + /* Count the number of scopes to compile */ + DL_COUNT(cache_head, cur, scope_count); + + if (scope_count == 0) { + /* No scopes to compile, call callback immediately */ + if (cb) { + cb(0, NULL, cbd); + } + return 0; + } + + /* Create coordination data to track completion of all scopes */ + coord_data = g_malloc0(sizeof(*coord_data)); + coord_data->total_scopes = scope_count; + coord_data->completed_scopes = 0; + coord_data->total_compiled = 0; + coord_data->first_error = NULL; + coord_data->final_cb = cb; + coord_data->final_cbd = cbd; + + /* + * Start async compilation for each scope. Each scope will use timers + * and call our coordination callback when completed. + */ + DL_FOREACH(cache_head, cur) + { + result = rspamd_re_cache_compile_hyperscan(cur, cache_dir, max_time, silent, + event_loop, rspamd_re_cache_compile_scoped_coordination_cb, coord_data); + if (result < 0) { + /* If we failed to start compilation for this scope, treat it as completed with error */ + GError *start_error = g_error_new(rspamd_re_cache_quark(), result, + "Failed to start hyperscan compilation for scope '%s'", + cur->scope ? 
cur->scope : "unknown"); + rspamd_re_cache_compile_scoped_coordination_cb(0, start_error, coord_data); + g_error_free(start_error); + } + } + + return 0; /* Always return 0 for async operation */ +#endif +} + gboolean rspamd_re_cache_is_valid_hyperscan_file(struct rspamd_re_cache *cache, const char *path, gboolean silent, gboolean try_load, GError **err) @@ -2272,6 +2665,7 @@ rspamd_re_cache_is_valid_hyperscan_file(struct rspamd_re_cache *cache, GHashTableIter it; gpointer k, v; struct rspamd_re_class *re_class; + struct rspamd_re_cache *cur; gsize len; const char *hash_pos; hs_platform_info_t test_plt; @@ -2282,7 +2676,7 @@ rspamd_re_cache_is_valid_hyperscan_file(struct rspamd_re_cache *cache, len = strlen(path); - if (len < sizeof(rspamd_cryptobox_HASHBYTES + 3)) { + if (len < (rspamd_cryptobox_HASHBYTES + 3)) { if (!silent) { msg_err_re_cache("cannot open hyperscan cache file %s: too short filename", path); @@ -2304,174 +2698,179 @@ rspamd_re_cache_is_valid_hyperscan_file(struct rspamd_re_cache *cache, } hash_pos = path + len - 3 - (sizeof(re_class->hash) - 1); - g_hash_table_iter_init(&it, cache->re_classes); - while (g_hash_table_iter_next(&it, &k, &v)) { - re_class = v; + /* Iterate through all scopes in the cache chain */ + DL_FOREACH(cache, cur) + { + g_hash_table_iter_init(&it, cur->re_classes); - if (memcmp(hash_pos, re_class->hash, sizeof(re_class->hash) - 1) == 0) { - /* Open file and check magic */ - gssize r; + while (g_hash_table_iter_next(&it, &k, &v)) { + re_class = v; - fd = open(path, O_RDONLY); + if (memcmp(hash_pos, re_class->hash, sizeof(re_class->hash) - 1) == 0) { + /* Open file and check magic */ + gssize r; - if (fd == -1) { - if (errno != ENOENT || !silent) { - msg_err_re_cache("cannot open hyperscan cache file %s: %s", - path, strerror(errno)); - } - g_set_error(err, rspamd_re_cache_quark(), 0, - "%s", - strerror(errno)); - return FALSE; - } + fd = open(path, O_RDONLY); - if ((r = read(fd, magicbuf, sizeof(magicbuf))) != 
sizeof(magicbuf)) { - if (r == -1) { - msg_err_re_cache("cannot read magic from hyperscan " - "cache file %s: %s", - path, strerror(errno)); + if (fd == -1) { + if (errno != ENOENT || !silent) { + msg_err_re_cache("cannot open hyperscan cache file %s: %s", + path, strerror(errno)); + } g_set_error(err, rspamd_re_cache_quark(), 0, - "cannot read magic: %s", + "%s", strerror(errno)); - } - else { - msg_err_re_cache("truncated read magic from hyperscan " - "cache file %s: %z, %z wanted", - path, r, (gsize) sizeof(magicbuf)); - g_set_error(err, rspamd_re_cache_quark(), 0, - "truncated read magic %zd, %zd wanted", - r, (gsize) sizeof(magicbuf)); + return FALSE; } - close(fd); - return FALSE; - } - - mb = rspamd_hs_magic; - - if (memcmp(magicbuf, mb, sizeof(magicbuf)) != 0) { - msg_err_re_cache("cannot open hyperscan cache file %s: " - "bad magic ('%*xs', '%*xs' expected)", - path, (int) RSPAMD_HS_MAGIC_LEN, magicbuf, - (int) RSPAMD_HS_MAGIC_LEN, mb); - - close(fd); - g_set_error(err, rspamd_re_cache_quark(), 0, "invalid magic"); - return FALSE; - } + if ((r = read(fd, magicbuf, sizeof(magicbuf))) != sizeof(magicbuf)) { + if (r == -1) { + msg_err_re_cache("cannot read magic from hyperscan " + "cache file %s: %s", + path, strerror(errno)); + g_set_error(err, rspamd_re_cache_quark(), 0, + "cannot read magic: %s", + strerror(errno)); + } + else { + msg_err_re_cache("truncated read magic from hyperscan " + "cache file %s: %z, %z wanted", + path, r, (gsize) sizeof(magicbuf)); + g_set_error(err, rspamd_re_cache_quark(), 0, + "truncated read magic %zd, %zd wanted", + r, (gsize) sizeof(magicbuf)); + } - if ((r = read(fd, &test_plt, sizeof(test_plt))) != sizeof(test_plt)) { - if (r == -1) { - msg_err_re_cache("cannot read platform data from hyperscan " - "cache file %s: %s", - path, strerror(errno)); - } - else { - msg_err_re_cache("truncated read platform data from hyperscan " - "cache file %s: %z, %z wanted", - path, r, (gsize) sizeof(magicbuf)); + close(fd); + return FALSE; } 
- g_set_error(err, rspamd_re_cache_quark(), 0, - "cannot read platform data: %s", strerror(errno)); - - close(fd); - return FALSE; - } - - if (test_plt.cpu_features != cache->plt.cpu_features) { - msg_err_re_cache("cannot open hyperscan cache file %s: " - "compiled for a different platform", - path); - g_set_error(err, rspamd_re_cache_quark(), 0, - "compiled for a different platform"); - - close(fd); - return FALSE; - } + mb = rspamd_hs_magic; - close(fd); - - if (try_load) { - map = rspamd_file_xmap(path, PROT_READ, &len, TRUE); + if (memcmp(magicbuf, mb, sizeof(magicbuf)) != 0) { + msg_err_re_cache("cannot open hyperscan cache file %s: " + "bad magic ('%*xs', '%*xs' expected)", + path, (int) RSPAMD_HS_MAGIC_LEN, magicbuf, + (int) RSPAMD_HS_MAGIC_LEN, mb); - if (map == NULL) { - msg_err_re_cache("cannot mmap hyperscan cache file %s: " - "%s", - path, strerror(errno)); - g_set_error(err, rspamd_re_cache_quark(), 0, - "mmap error: %s", strerror(errno)); + close(fd); + g_set_error(err, rspamd_re_cache_quark(), 0, "invalid magic"); return FALSE; } - p = map + RSPAMD_HS_MAGIC_LEN + sizeof(test_plt); - end = map + len; - memcpy(&n, p, sizeof(n)); - p += sizeof(int); - - if (n <= 0 || 2 * n * sizeof(int) + /* IDs + flags */ - sizeof(uint64_t) + /* crc */ - RSPAMD_HS_MAGIC_LEN + /* header */ - sizeof(cache->plt) > - len) { - /* Some wrong amount of regexps */ - msg_err_re_cache("bad number of expressions in %s: %d", - path, n); + if ((r = read(fd, &test_plt, sizeof(test_plt))) != sizeof(test_plt)) { + if (r == -1) { + msg_err_re_cache("cannot read platform data from hyperscan " + "cache file %s: %s", + path, strerror(errno)); + } + else { + msg_err_re_cache("truncated read platform data from hyperscan " + "cache file %s: %z, %z wanted", + path, r, (gsize) sizeof(magicbuf)); + } + g_set_error(err, rspamd_re_cache_quark(), 0, - "bad number of expressions: %d", n); - munmap(map, len); + "cannot read platform data: %s", strerror(errno)); + + close(fd); return FALSE; } - /* - 
* Magic - 8 bytes - * Platform - sizeof (platform) - * n - number of regexps - * n * <regexp ids> - * n * <regexp flags> - * crc - 8 bytes checksum - * <hyperscan blob> - */ - - memcpy(&crc, p + n * 2 * sizeof(int), sizeof(crc)); - rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe); - /* IDs */ - rspamd_cryptobox_fast_hash_update(&crc_st, p, n * sizeof(int)); - /* Flags */ - rspamd_cryptobox_fast_hash_update(&crc_st, p + n * sizeof(int), - n * sizeof(int)); - /* HS database */ - p += n * sizeof(int) * 2 + sizeof(uint64_t); - rspamd_cryptobox_fast_hash_update(&crc_st, p, end - p); - valid_crc = rspamd_cryptobox_fast_hash_final(&crc_st); - - if (crc != valid_crc) { - msg_warn_re_cache("outdated or invalid hs database in %s: " - "crc read %xL, crc expected %xL", - path, crc, valid_crc); + if (test_plt.cpu_features != cur->plt.cpu_features) { + msg_err_re_cache("cannot open hyperscan cache file %s: " + "compiled for a different platform", + path); g_set_error(err, rspamd_re_cache_quark(), 0, - "outdated or invalid hs database, crc check failure"); - munmap(map, len); + "compiled for a different platform"); + close(fd); return FALSE; } - if ((ret = hs_deserialize_database(p, end - p, &test_db)) != HS_SUCCESS) { - msg_err_re_cache("bad hs database in %s: %d", path, ret); - g_set_error(err, rspamd_re_cache_quark(), 0, - "deserialize error: %d", ret); - munmap(map, len); + close(fd); - return FALSE; + if (try_load) { + map = rspamd_file_xmap(path, PROT_READ, &len, TRUE); + + if (map == NULL) { + msg_err_re_cache("cannot mmap hyperscan cache file %s: " + "%s", + path, strerror(errno)); + g_set_error(err, rspamd_re_cache_quark(), 0, + "mmap error: %s", strerror(errno)); + return FALSE; + } + + p = map + RSPAMD_HS_MAGIC_LEN + sizeof(test_plt); + end = map + len; + memcpy(&n, p, sizeof(n)); + p += sizeof(int); + + if (n <= 0 || 2 * n * sizeof(int) + /* IDs + flags */ + sizeof(uint64_t) + /* crc */ + RSPAMD_HS_MAGIC_LEN + /* header */ + sizeof(cur->plt) > + len) { + /* Some 
wrong amount of regexps */ + msg_err_re_cache("bad number of expressions in %s: %d", + path, n); + g_set_error(err, rspamd_re_cache_quark(), 0, + "bad number of expressions: %d", n); + munmap(map, len); + return FALSE; + } + + /* + * Magic - 8 bytes + * Platform - sizeof (platform) + * n - number of regexps + * n * <regexp ids> + * n * <regexp flags> + * crc - 8 bytes checksum + * <hyperscan blob> + */ + + memcpy(&crc, p + n * 2 * sizeof(int), sizeof(crc)); + rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe); + /* IDs */ + rspamd_cryptobox_fast_hash_update(&crc_st, p, n * sizeof(int)); + /* Flags */ + rspamd_cryptobox_fast_hash_update(&crc_st, p + n * sizeof(int), + n * sizeof(int)); + /* HS database */ + p += n * sizeof(int) * 2 + sizeof(uint64_t); + rspamd_cryptobox_fast_hash_update(&crc_st, p, end - p); + valid_crc = rspamd_cryptobox_fast_hash_final(&crc_st); + + if (crc != valid_crc) { + msg_warn_re_cache("outdated or invalid hs database in %s: " + "crc read %xL, crc expected %xL", + path, crc, valid_crc); + g_set_error(err, rspamd_re_cache_quark(), 0, + "outdated or invalid hs database, crc check failure"); + munmap(map, len); + + return FALSE; + } + + if ((ret = hs_deserialize_database(p, end - p, &test_db)) != HS_SUCCESS) { + msg_err_re_cache("bad hs database in %s: %d", path, ret); + g_set_error(err, rspamd_re_cache_quark(), 0, + "deserialize error: %d", ret); + munmap(map, len); + + return FALSE; + } + + hs_free_database(test_db); + munmap(map, len); } + /* XXX: add crc check */ - hs_free_database(test_db); - munmap(map, len); + return TRUE; } - /* XXX: add crc check */ - - return TRUE; } } @@ -2672,16 +3071,27 @@ rspamd_re_cache_load_hyperscan(struct rspamd_re_cache *cache, if (has_valid) { if (all_valid) { - msg_info_re_cache("full hyperscan database of %d regexps has been loaded", total); + msg_info_re_cache("full hyperscan database of %d regexps has been loaded%s%s%s", + total, + cache->scope ? " for scope '" : "", + cache->scope ? 
cache->scope : "", + cache->scope ? "'" : ""); cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_FULL; } else { - msg_info_re_cache("partial hyperscan database of %d regexps has been loaded", total); + msg_info_re_cache("partial hyperscan database of %d regexps has been loaded%s%s%s", + total, + cache->scope ? " for scope '" : "", + cache->scope ? cache->scope : "", + cache->scope ? "'" : ""); cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_PARTIAL; } } else { - msg_info_re_cache("hyperscan database has NOT been loaded; no valid expressions"); + msg_info_re_cache("hyperscan database has NOT been loaded; no valid expressions%s%s%s", + cache->scope ? " for scope '" : "", + cache->scope ? cache->scope : "", + cache->scope ? "'" : ""); cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOAD_ERROR; } @@ -2690,6 +3100,48 @@ rspamd_re_cache_load_hyperscan(struct rspamd_re_cache *cache, #endif } +enum rspamd_hyperscan_status rspamd_re_cache_load_hyperscan_scoped( + struct rspamd_re_cache *cache_head, + const char *cache_dir, bool try_load) +{ +#ifndef WITH_HYPERSCAN + return RSPAMD_HYPERSCAN_UNSUPPORTED; +#else + struct rspamd_re_cache *cur; + enum rspamd_hyperscan_status result, overall_status = RSPAMD_HYPERSCAN_UNKNOWN; + gboolean has_loaded = FALSE, all_loaded = TRUE; + + if (!cache_head) { + return RSPAMD_HYPERSCAN_LOAD_ERROR; + } + + DL_FOREACH(cache_head, cur) + { + result = rspamd_re_cache_load_hyperscan(cur, cache_dir, try_load); + + if (result == RSPAMD_HYPERSCAN_LOADED_FULL || + result == RSPAMD_HYPERSCAN_LOADED_PARTIAL) { + has_loaded = TRUE; + if (result == RSPAMD_HYPERSCAN_LOADED_PARTIAL) { + all_loaded = FALSE; + } + } + else { + all_loaded = FALSE; + } + } + + if (has_loaded) { + overall_status = all_loaded ? 
RSPAMD_HYPERSCAN_LOADED_FULL : RSPAMD_HYPERSCAN_LOADED_PARTIAL; + } + else { + overall_status = RSPAMD_HYPERSCAN_LOAD_ERROR; + } + + return overall_status; +#endif +} + void rspamd_re_cache_add_selector(struct rspamd_re_cache *cache, const char *sname, int ref) @@ -2716,3 +3168,324 @@ void rspamd_re_cache_add_selector(struct rspamd_re_cache *cache, kh_value(cache->selectors, k) = ref; } } + +void rspamd_re_cache_add_selector_scoped(struct rspamd_re_cache **cache_head, const char *scope, + const char *sname, int ref) +{ + struct rspamd_re_cache *cache; + + g_assert(cache_head != NULL); + g_assert(sname != NULL); + + /* NULL scope is allowed for default scope */ + cache = rspamd_re_cache_add_to_scope_list(cache_head, scope); + if (cache) { + rspamd_re_cache_add_selector(cache, sname, ref); + } +} + +struct rspamd_re_cache *rspamd_re_cache_find_scope(struct rspamd_re_cache *cache_head, const char *scope) +{ + return rspamd_re_cache_find_by_scope(cache_head, scope); +} + +gboolean rspamd_re_cache_remove_scope(struct rspamd_re_cache **cache_head, const char *scope) +{ + struct rspamd_re_cache *target; + + if (!cache_head || !*cache_head) { + return FALSE; + } + + /* Prevent removal of default scope (NULL) to keep head stable */ + if (!scope) { + return FALSE; + } + + target = rspamd_re_cache_find_by_scope(*cache_head, scope); + if (!target) { + return FALSE; + } + + /* Remove from linked list */ + DL_DELETE(*cache_head, target); + + /* If this was the head and there are no more elements, update head */ + if (target == *cache_head && !*cache_head) { + *cache_head = NULL; + } + + /* Unref the cache */ + rspamd_re_cache_unref(target); + + return TRUE; +} + +unsigned int rspamd_re_cache_count_scopes(struct rspamd_re_cache *cache_head) +{ + struct rspamd_re_cache *cur; + unsigned int count = 0; + + if (!cache_head) { + return 0; + } + + DL_COUNT(cache_head, cur, count); + return count; +} + +struct rspamd_re_cache *rspamd_re_cache_scope_first(struct rspamd_re_cache 
*cache_head) +{ + return cache_head; +} + +struct rspamd_re_cache *rspamd_re_cache_scope_next(struct rspamd_re_cache *current) +{ + return current ? current->next : NULL; +} + +const char *rspamd_re_cache_scope_name(struct rspamd_re_cache *scope) +{ + if (!scope) { + return "unknown"; + } + + return scope->scope ? scope->scope : "default"; +} + +void rspamd_re_cache_scope_set_flags(struct rspamd_re_cache *scope, unsigned int flags) +{ + if (scope) { + scope->flags |= flags; + } +} + +void rspamd_re_cache_scope_clear_flags(struct rspamd_re_cache *scope, unsigned int flags) +{ + if (scope) { + scope->flags &= ~flags; + } +} + +unsigned int rspamd_re_cache_scope_get_flags(struct rspamd_re_cache *scope) +{ + return scope ? scope->flags : 0; +} + +gboolean rspamd_re_cache_scope_is_loaded(struct rspamd_re_cache *scope) +{ + if (!scope) { + return FALSE; + } + + return (scope->flags & RSPAMD_RE_CACHE_FLAG_LOADED) != 0; +} + +void rspamd_re_cache_set_flags(struct rspamd_re_cache *cache_head, const char *scope, unsigned int flags) +{ + struct rspamd_re_cache *target; + + if (!cache_head) { + return; + } + + target = rspamd_re_cache_find_by_scope(cache_head, scope); + if (target) { + target->flags |= flags; + } +} + +void rspamd_re_cache_clear_flags(struct rspamd_re_cache *cache_head, const char *scope, unsigned int flags) +{ + struct rspamd_re_cache *target; + + if (!cache_head) { + return; + } + + target = rspamd_re_cache_find_by_scope(cache_head, scope); + if (target) { + target->flags &= ~flags; + } +} + +unsigned int rspamd_re_cache_get_flags(struct rspamd_re_cache *cache_head, const char *scope) +{ + struct rspamd_re_cache *target; + + if (!cache_head) { + return 0; + } + + target = rspamd_re_cache_find_by_scope(cache_head, scope); + if (target) { + return target->flags; + } + + return 0; +} + +gboolean rspamd_re_cache_is_loaded(struct rspamd_re_cache *cache_head, const char *scope) +{ + unsigned int flags = rspamd_re_cache_get_flags(cache_head, scope); + return (flags 
& RSPAMD_RE_CACHE_FLAG_LOADED) != 0; +} + + +static gboolean +rspamd_re_cache_create_scope_lock(const char *cache_dir, const char *scope, int *lock_fd) +{ + char lock_path[PATH_MAX]; + pid_t myself = getpid(); + + if (!scope) { + scope = "default"; + } + + rspamd_snprintf(lock_path, sizeof(lock_path), "%s%c%s.scope.lock", + cache_dir, G_DIR_SEPARATOR, scope); + + *lock_fd = open(lock_path, O_WRONLY | O_CREAT | O_EXCL, 00600); + + if (*lock_fd == -1) { + if (errno == EEXIST || errno == EBUSY) { + /* Check if the lock is stale */ + int read_fd = open(lock_path, O_RDONLY); + if (read_fd != -1) { + pid_t lock_pid; + gssize r = read(read_fd, &lock_pid, sizeof(lock_pid)); + close(read_fd); + + if (r == sizeof(lock_pid)) { + /* Check if the process is still alive */ + if (lock_pid != myself && (kill(lock_pid, 0) == -1 && errno == ESRCH)) { + /* Stale lock, remove it */ + if (unlink(lock_path) == 0) { + /* Try to create lock again */ + *lock_fd = open(lock_path, O_WRONLY | O_CREAT | O_EXCL, 00600); + if (*lock_fd != -1) { + goto write_pid; + } + } + } + } + else { + /* Invalid lock file, remove it */ + if (unlink(lock_path) == 0) { + *lock_fd = open(lock_path, O_WRONLY | O_CREAT | O_EXCL, 00600); + if (*lock_fd != -1) { + goto write_pid; + } + } + } + } + } + return FALSE; + } + +write_pid: + /* Write our PID to the lock file */ + if (write(*lock_fd, &myself, sizeof(myself)) != sizeof(myself)) { + close(*lock_fd); + unlink(lock_path); + return FALSE; + } + + /* Lock the file */ + if (!rspamd_file_lock(*lock_fd, FALSE)) { + close(*lock_fd); + unlink(lock_path); + return FALSE; + } + + return TRUE; +} + +static void +rspamd_re_cache_remove_scope_lock(const char *cache_dir, const char *scope, int lock_fd) +{ + char lock_path[PATH_MAX]; + + if (!scope) { + scope = "default"; + } + + rspamd_snprintf(lock_path, sizeof(lock_path), "%s%c%s.scope.lock", + cache_dir, G_DIR_SEPARATOR, scope); + + if (lock_fd != -1) { + rspamd_file_unlock(lock_fd, FALSE); + close(lock_fd); + } + 
unlink(lock_path); +} + +#ifdef WITH_HYPERSCAN +struct rspamd_re_cache_hs_compile_scoped_cbdata { + struct rspamd_re_cache *cache; + const char *cache_dir; + const char *scope; + double max_time; + gboolean silent; + int lock_fd; + void (*cb)(const char *scope, unsigned int ncompiled, GError *err, void *cbd); + void *cbd; +}; + +static void +rspamd_re_cache_compile_scoped_cb(unsigned int ncompiled, GError *err, void *cbd) +{ + struct rspamd_re_cache_hs_compile_scoped_cbdata *scoped_cbd = + (struct rspamd_re_cache_hs_compile_scoped_cbdata *) cbd; + + /* Remove lock */ + rspamd_re_cache_remove_scope_lock(scoped_cbd->cache_dir, scoped_cbd->scope, + scoped_cbd->lock_fd); + + /* Call original callback */ + if (scoped_cbd->cb) { + scoped_cbd->cb(scoped_cbd->scope, ncompiled, err, scoped_cbd->cbd); + } + + g_free(scoped_cbd); +} + +int rspamd_re_cache_compile_hyperscan_scoped_single(struct rspamd_re_cache *cache, + const char *scope, + const char *cache_dir, + double max_time, + gboolean silent, + struct ev_loop *event_loop, + void (*cb)(const char *scope, unsigned int ncompiled, GError *err, void *cbd), + void *cbd) +{ + struct rspamd_re_cache_hs_compile_scoped_cbdata *scoped_cbd; + int lock_fd = -1; + + g_assert(cache != NULL); + g_assert(cache_dir != NULL); + + /* Try to acquire lock for this scope */ + if (!rspamd_re_cache_create_scope_lock(cache_dir, scope, &lock_fd)) { + /* Another process is compiling this scope */ + if (cb) { + cb(scope, 0, NULL, cbd); + } + return 0; + } + + /* Create callback data */ + scoped_cbd = g_malloc0(sizeof(*scoped_cbd)); + scoped_cbd->cache = cache; + scoped_cbd->cache_dir = cache_dir; + scoped_cbd->scope = scope; + scoped_cbd->max_time = max_time; + scoped_cbd->silent = silent; + scoped_cbd->lock_fd = lock_fd; + scoped_cbd->cb = cb; + scoped_cbd->cbd = cbd; + + return rspamd_re_cache_compile_hyperscan(cache, cache_dir, max_time, silent, + event_loop, rspamd_re_cache_compile_scoped_cb, scoped_cbd); +} +#endif diff --git 
a/src/libserver/re_cache.h b/src/libserver/re_cache.h index 20b1108e0..c5c8627d8 100644 --- a/src/libserver/re_cache.h +++ b/src/libserver/re_cache.h @@ -28,6 +28,9 @@ struct rspamd_re_runtime; struct rspamd_task; struct rspamd_config; +/* Re cache flags */ +#define RSPAMD_RE_CACHE_FLAG_LOADED (1U << 0) /* Scope is fully loaded and ready for use */ + enum rspamd_re_type { RSPAMD_RE_HEADER, RSPAMD_RE_RAWHEADER, @@ -77,6 +80,22 @@ rspamd_re_cache_add(struct rspamd_re_cache *cache, rspamd_regexp_t *re, int lua_cbref); /** + * Add the existing regexp to the cache with specified scope + * @param cache_head head of cache list + * @param scope scope name + * @param re regexp object + * @param type type of object + * @param type_data associated data with the type (e.g. header name) + * @param datalen associated data length + * @param lua_cbref optional lua callback reference for matching purposes + */ +rspamd_regexp_t * +rspamd_re_cache_add_scoped(struct rspamd_re_cache **cache_head, const char *scope, + rspamd_regexp_t *re, enum rspamd_re_type type, + gconstpointer type_data, gsize datalen, + int lua_cbref); + +/** * Replace regexp in the cache with another regexp * @param cache cache object * @param what re to replace @@ -87,11 +106,28 @@ void rspamd_re_cache_replace(struct rspamd_re_cache *cache, rspamd_regexp_t *with); /** + * Replace regexp in the scoped cache with another regexp + * @param cache_head head of cache list + * @param scope scope name + * @param what re to replace + * @param with regexp object to replace the origin + */ +void rspamd_re_cache_replace_scoped(struct rspamd_re_cache **cache_head, const char *scope, + rspamd_regexp_t *what, + rspamd_regexp_t *with); + +/** * Initialize and optimize re cache structure */ void rspamd_re_cache_init(struct rspamd_re_cache *cache, struct rspamd_config *cfg); +/** + * Initialize and optimize re cache structures for all scopes + */ +void rspamd_re_cache_init_scoped(struct rspamd_re_cache *cache_head, + struct 
rspamd_config *cfg); + enum rspamd_hyperscan_status { RSPAMD_HYPERSCAN_UNKNOWN = 0, RSPAMD_HYPERSCAN_UNSUPPORTED, @@ -108,11 +144,22 @@ enum rspamd_hyperscan_status { enum rspamd_hyperscan_status rspamd_re_cache_is_hs_loaded(struct rspamd_re_cache *cache); /** - * Get runtime data for a cache + * Get runtime data for a cache - automatically creates runtimes for all scopes in the chain + * This is the main function used for task runtime creation */ struct rspamd_re_runtime *rspamd_re_cache_runtime_new(struct rspamd_re_cache *cache); /** + * Get runtime data for all scoped caches (same as rspamd_re_cache_runtime_new) + */ +struct rspamd_re_runtime *rspamd_re_cache_runtime_new_all_scopes(struct rspamd_re_cache *cache_head); + +/** + * Get runtime data for a specific scoped cache only + */ +struct rspamd_re_runtime *rspamd_re_cache_runtime_new_scoped(struct rspamd_re_cache *cache_head, const char *scope); + +/** * Get runtime statistics */ const struct rspamd_re_cache_stat * @@ -152,6 +199,11 @@ void rspamd_re_cache_runtime_destroy(struct rspamd_re_runtime *rt); void rspamd_re_cache_unref(struct rspamd_re_cache *cache); /** + * Unref re cache list (all scopes) + */ +void rspamd_re_cache_unref_scoped(struct rspamd_re_cache *cache_head); + +/** * Retain reference to re cache */ struct rspamd_re_cache *rspamd_re_cache_ref(struct rspamd_re_cache *cache); @@ -162,6 +214,11 @@ struct rspamd_re_cache *rspamd_re_cache_ref(struct rspamd_re_cache *cache); unsigned int rspamd_re_cache_set_limit(struct rspamd_re_cache *cache, unsigned int limit); /** + * Set limit for all regular expressions in the scoped cache, returns previous limit + */ +unsigned int rspamd_re_cache_set_limit_scoped(struct rspamd_re_cache *cache_head, const char *scope, unsigned int limit); + +/** * Convert re type to a human readable string (constant one) */ const char *rspamd_re_cache_type_to_string(enum rspamd_re_type type); @@ -184,6 +241,17 @@ int rspamd_re_cache_compile_hyperscan(struct rspamd_re_cache 
*cache, void *cbd); /** + * Compile expressions to the hyperscan tree and store in the `cache_dir` for all scopes + */ +int rspamd_re_cache_compile_hyperscan_scoped(struct rspamd_re_cache *cache_head, + const char *cache_dir, + double max_time, + gboolean silent, + struct ev_loop *event_loop, + void (*cb)(unsigned int ncompiled, GError *err, void *cbd), + void *cbd); + +/** * Returns TRUE if the specified file is valid hyperscan cache */ gboolean rspamd_re_cache_is_valid_hyperscan_file(struct rspamd_re_cache *cache, @@ -200,11 +268,139 @@ enum rspamd_hyperscan_status rspamd_re_cache_load_hyperscan( const char *cache_dir, bool try_load); /** + * Loads all hyperscan regexps precompiled for all scopes + */ +enum rspamd_hyperscan_status rspamd_re_cache_load_hyperscan_scoped( + struct rspamd_re_cache *cache_head, + const char *cache_dir, bool try_load); + +/** + * Compile expressions to the hyperscan tree for a single scope with locking + */ +int rspamd_re_cache_compile_hyperscan_scoped_single(struct rspamd_re_cache *cache, + const char *scope, + const char *cache_dir, + double max_time, + gboolean silent, + struct ev_loop *event_loop, + void (*cb)(const char *scope, unsigned int ncompiled, GError *err, void *cbd), + void *cbd); + +/** * Registers lua selector in the cache */ void rspamd_re_cache_add_selector(struct rspamd_re_cache *cache, const char *sname, int ref); +/** + * Registers lua selector in the scoped cache + */ +void rspamd_re_cache_add_selector_scoped(struct rspamd_re_cache **cache_head, const char *scope, + const char *sname, int ref); + +/** + * Find a cache by scope name + */ +struct rspamd_re_cache *rspamd_re_cache_find_scope(struct rspamd_re_cache *cache_head, const char *scope); + +/** + * Remove a cache scope from the list + */ +gboolean rspamd_re_cache_remove_scope(struct rspamd_re_cache **cache_head, const char *scope); + +/** + * Get array of scope names from the cache list + * @param cache_head head of cache list + * @return NULL-terminated 
array of scope names (must be freed with g_strfreev), or NULL if no scopes + */ +char **rspamd_re_cache_get_scope_names(struct rspamd_re_cache *cache_head); + +/** + * Count the number of scopes in the cache list + */ +unsigned int rspamd_re_cache_count_scopes(struct rspamd_re_cache *cache_head); + +/** + * Get the first scope in the cache list for iteration + * @param cache_head head of cache list + * @return first scope, or NULL if no scopes + */ +struct rspamd_re_cache *rspamd_re_cache_scope_first(struct rspamd_re_cache *cache_head); + +/** + * Get the next scope in iteration + * @param current current scope + * @return next scope, or NULL if at end + */ +struct rspamd_re_cache *rspamd_re_cache_scope_next(struct rspamd_re_cache *current); + +/** + * Get the scope name (for display/logging purposes) + * @param scope the scope + * @return scope name ("default" for NULL scope name), never returns NULL + */ +const char *rspamd_re_cache_scope_name(struct rspamd_re_cache *scope); + +/** + * Set flags on a scope (efficient version that works directly on scope object) + * @param scope the scope object (from iterator) + * @param flags flags to set + */ +void rspamd_re_cache_scope_set_flags(struct rspamd_re_cache *scope, unsigned int flags); + +/** + * Clear flags on a scope (efficient version that works directly on scope object) + * @param scope the scope object (from iterator) + * @param flags flags to clear + */ +void rspamd_re_cache_scope_clear_flags(struct rspamd_re_cache *scope, unsigned int flags); + +/** + * Get flags from a scope (efficient version that works directly on scope object) + * @param scope the scope object (from iterator) + * @return flags value + */ +unsigned int rspamd_re_cache_scope_get_flags(struct rspamd_re_cache *scope); + +/** + * Check if a scope is loaded (efficient version that works directly on scope object) + * @param scope the scope object (from iterator) + * @return TRUE if scope is loaded + */ +gboolean 
rspamd_re_cache_scope_is_loaded(struct rspamd_re_cache *scope); + +/** + * Set flags for a specific scope (legacy function - less efficient, searches by name) + * @param cache_head head of cache list + * @param scope scope name (NULL for default scope) + * @param flags flags to set + */ +void rspamd_re_cache_set_flags(struct rspamd_re_cache *cache_head, const char *scope, unsigned int flags); + +/** + * Clear flags for a specific scope (legacy function - less efficient, searches by name) + * @param cache_head head of cache list + * @param scope scope name (NULL for default scope) + * @param flags flags to clear + */ +void rspamd_re_cache_clear_flags(struct rspamd_re_cache *cache_head, const char *scope, unsigned int flags); + +/** + * Get flags for a specific scope (legacy function - less efficient, searches by name) + * @param cache_head head of cache list + * @param scope scope name (NULL for default scope) + * @return flags value + */ +unsigned int rspamd_re_cache_get_flags(struct rspamd_re_cache *cache_head, const char *scope); + +/** + * Check if a scope is loaded (legacy function - less efficient, searches by name) + * @param cache_head head of cache list + * @param scope scope name (NULL for default scope) + * @return TRUE if scope is loaded and ready for use + */ +gboolean rspamd_re_cache_is_loaded(struct rspamd_re_cache *cache_head, const char *scope); + #ifdef __cplusplus } #endif diff --git a/src/libserver/roll_history.c b/src/libserver/roll_history.c index 66a53a597..d0f145d8f 100644 --- a/src/libserver/roll_history.c +++ b/src/libserver/roll_history.c @@ -1,11 +1,11 @@ -/*- - * Copyright 2016 Vsevolod Stakhov +/* + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -231,7 +231,7 @@ rspamd_roll_history_load(struct roll_history *history, const char *filename) return FALSE; } - parser = ucl_parser_new(0); + parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS); if (!ucl_parser_add_fd(parser, fd)) { msg_warn("cannot parse history file %s: %s", filename, diff --git a/src/libserver/rspamd_control.c b/src/libserver/rspamd_control.c index 1bff2ff12..e212f7e91 100644 --- a/src/libserver/rspamd_control.c +++ b/src/libserver/rspamd_control.c @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -214,7 +214,7 @@ rspamd_control_write_reply(struct rspamd_control_session *session) case RSPAMD_CONTROL_FUZZY_STAT: if (elt->attached_fd != -1) { /* We have some data to parse */ - parser = ucl_parser_new(0); + parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS); ucl_object_insert_key(cur, ucl_object_fromint( elt->reply.reply.fuzzy_stat.status), @@ -724,6 +724,9 @@ rspamd_control_default_cmd_handler(int fd, case RSPAMD_CONTROL_CHILD_CHANGE: case RSPAMD_CONTROL_FUZZY_BLOCKED: break; + case RSPAMD_CONTROL_WORKERS_SPAWNED: + rep.reply.workers_spawned.status = 0; + break; case RSPAMD_CONTROL_RERESOLVE: if (cd->worker->srv->cfg) { REF_RETAIN(cd->worker->srv->cfg); @@ -1065,30 +1068,58 @@ rspamd_srv_handler(EV_P_ ev_io *w, int revents) case RSPAMD_SRV_HYPERSCAN_LOADED: #ifdef WITH_HYPERSCAN /* Load RE cache to provide it for new forks */ - if (rspamd_re_cache_is_hs_loaded(rspamd_main->cfg->re_cache) != RSPAMD_HYPERSCAN_LOADED_FULL || - cmd.cmd.hs_loaded.forced) { - rspamd_re_cache_load_hyperscan( + if 
(cmd.cmd.hs_loaded.scope[0] != '\0') { + /* Scoped loading */ + const char *scope = cmd.cmd.hs_loaded.scope; + msg_info_main("received scoped hyperscan cache loaded from %s for scope: %s", + cmd.cmd.hs_loaded.cache_dir, scope); + + /* Load specific scope */ + rspamd_re_cache_load_hyperscan_scoped( rspamd_main->cfg->re_cache, cmd.cmd.hs_loaded.cache_dir, false); - } - - /* After getting this notice, we can clean up old hyperscan files */ - - rspamd_hyperscan_notice_loaded(); - msg_info_main("received hyperscan cache loaded from %s", - cmd.cmd.hs_loaded.cache_dir); + /* Broadcast scoped command to all workers */ + memset(&wcmd, 0, sizeof(wcmd)); + wcmd.type = RSPAMD_CONTROL_HYPERSCAN_LOADED; + rspamd_strlcpy(wcmd.cmd.hs_loaded.cache_dir, + cmd.cmd.hs_loaded.cache_dir, + sizeof(wcmd.cmd.hs_loaded.cache_dir)); + rspamd_strlcpy(wcmd.cmd.hs_loaded.scope, + cmd.cmd.hs_loaded.scope, + sizeof(wcmd.cmd.hs_loaded.scope)); + wcmd.cmd.hs_loaded.forced = cmd.cmd.hs_loaded.forced; + rspamd_control_broadcast_cmd(rspamd_main, &wcmd, rfd, + rspamd_control_ignore_io_handler, NULL, worker->pid); + } + else { + /* Legacy full cache loading */ + if (rspamd_re_cache_is_hs_loaded(rspamd_main->cfg->re_cache) != RSPAMD_HYPERSCAN_LOADED_FULL || + cmd.cmd.hs_loaded.forced) { + rspamd_re_cache_load_hyperscan( + rspamd_main->cfg->re_cache, + cmd.cmd.hs_loaded.cache_dir, + false); + } - /* Broadcast command to all workers */ - memset(&wcmd, 0, sizeof(wcmd)); - wcmd.type = RSPAMD_CONTROL_HYPERSCAN_LOADED; - rspamd_strlcpy(wcmd.cmd.hs_loaded.cache_dir, - cmd.cmd.hs_loaded.cache_dir, - sizeof(wcmd.cmd.hs_loaded.cache_dir)); - wcmd.cmd.hs_loaded.forced = cmd.cmd.hs_loaded.forced; - rspamd_control_broadcast_cmd(rspamd_main, &wcmd, rfd, - rspamd_control_ignore_io_handler, NULL, worker->pid); + /* After getting this notice, we can clean up old hyperscan files */ + rspamd_hyperscan_notice_loaded(); + + msg_info_main("received hyperscan cache loaded from %s", + cmd.cmd.hs_loaded.cache_dir); + + /* 
Broadcast command to all workers */ + memset(&wcmd, 0, sizeof(wcmd)); + wcmd.type = RSPAMD_CONTROL_HYPERSCAN_LOADED; + rspamd_strlcpy(wcmd.cmd.hs_loaded.cache_dir, + cmd.cmd.hs_loaded.cache_dir, + sizeof(wcmd.cmd.hs_loaded.cache_dir)); + wcmd.cmd.hs_loaded.forced = cmd.cmd.hs_loaded.forced; + wcmd.cmd.hs_loaded.scope[0] = '\0'; /* Empty scope for legacy */ + rspamd_control_broadcast_cmd(rspamd_main, &wcmd, rfd, + rspamd_control_ignore_io_handler, NULL, worker->pid); + } #endif break; case RSPAMD_SRV_MONITORED_CHANGE: @@ -1137,6 +1168,10 @@ rspamd_srv_handler(EV_P_ ev_io *w, int revents) rspamd_control_broadcast_cmd(rspamd_main, &wcmd, rfd, rspamd_control_ignore_io_handler, NULL, worker->pid); break; + case RSPAMD_SRV_WORKERS_SPAWNED: + /* No need to broadcast, this is just a notification from main to specific workers */ + rdata->rep.reply.workers_spawned.status = 0; + break; default: msg_err_main("unknown command type: %d", cmd.type); break; @@ -1390,6 +1425,9 @@ rspamd_control_command_from_string(const char *str) else if (g_ascii_strcasecmp(str, "child_change") == 0) { ret = RSPAMD_CONTROL_CHILD_CHANGE; } + else if (g_ascii_strcasecmp(str, "workers_spawned") == 0) { + ret = RSPAMD_CONTROL_WORKERS_SPAWNED; + } return ret; } @@ -1430,6 +1468,9 @@ rspamd_control_command_to_string(enum rspamd_control_type cmd) case RSPAMD_CONTROL_CHILD_CHANGE: reply = "child_change"; break; + case RSPAMD_CONTROL_WORKERS_SPAWNED: + reply = "workers_spawned"; + break; default: break; } @@ -1469,6 +1510,9 @@ const char *rspamd_srv_command_to_string(enum rspamd_srv_type cmd) case RSPAMD_SRV_FUZZY_BLOCKED: reply = "fuzzy_blocked"; break; + case RSPAMD_SRV_WORKERS_SPAWNED: + reply = "workers_spawned"; + break; } return reply; diff --git a/src/libserver/rspamd_control.h b/src/libserver/rspamd_control.h index a08ba7948..81603cab2 100644 --- a/src/libserver/rspamd_control.h +++ b/src/libserver/rspamd_control.h @@ -37,6 +37,7 @@ enum rspamd_control_type { RSPAMD_CONTROL_MONITORED_CHANGE, 
RSPAMD_CONTROL_CHILD_CHANGE, RSPAMD_CONTROL_FUZZY_BLOCKED, + RSPAMD_CONTROL_WORKERS_SPAWNED, RSPAMD_CONTROL_MAX }; @@ -49,7 +50,8 @@ enum rspamd_srv_type { RSPAMD_SRV_HEARTBEAT, RSPAMD_SRV_HEALTH, RSPAMD_SRV_NOTICE_HYPERSCAN_CACHE, - RSPAMD_SRV_FUZZY_BLOCKED, /* Used to notify main process about a blocked ip */ + RSPAMD_SRV_FUZZY_BLOCKED, /* Used to notify main process about a blocked ip */ + RSPAMD_SRV_WORKERS_SPAWNED, /* Used to notify workers that all workers have been spawned */ }; enum rspamd_log_pipe_type { @@ -74,6 +76,7 @@ struct rspamd_control_command { struct { gboolean forced; char cache_dir[CONTROL_PATHLEN]; + char scope[64]; /* Scope name, NULL means all scopes */ } hs_loaded; struct { char tag[32]; @@ -106,6 +109,9 @@ struct rspamd_control_command { } addr; sa_family_t af; } fuzzy_blocked; + struct { + unsigned int workers_count; + } workers_spawned; } cmd; }; @@ -147,6 +153,9 @@ struct rspamd_control_reply { struct { unsigned int status; } fuzzy_blocked; + struct { + unsigned int status; + } workers_spawned; } reply; }; @@ -164,6 +173,7 @@ struct rspamd_srv_command { struct { gboolean forced; char cache_dir[CONTROL_PATHLEN]; + char scope[64]; /* Scope name, NULL means all scopes */ } hs_loaded; struct { char tag[32]; @@ -201,6 +211,10 @@ struct rspamd_srv_command { } addr; sa_family_t af; } fuzzy_blocked; + /* Sent when all workers have been spawned */ + struct { + unsigned int workers_count; + } workers_spawned; } cmd; }; @@ -238,6 +252,9 @@ struct rspamd_srv_reply { struct { int unused; } fuzzy_blocked; + struct { + int status; + } workers_spawned; } reply; }; diff --git a/src/libserver/rspamd_symcache.h b/src/libserver/rspamd_symcache.h index 5725a2885..f020b6055 100644 --- a/src/libserver/rspamd_symcache.h +++ b/src/libserver/rspamd_symcache.h @@ -571,6 +571,13 @@ void rspamd_symcache_timeout_result_free(struct rspamd_symcache_timeout_result * * @param task */ void rspamd_symcache_runtime_destroy(struct rspamd_task *task); + +/** + * Promote 
symbols cache resort (typically after dynamic symbol registration) + * @param cache + */ +void rspamd_symcache_promote_resort(struct rspamd_symcache *cache); + #ifdef __cplusplus } #endif diff --git a/src/libserver/symcache/symcache_c.cxx b/src/libserver/symcache/symcache_c.cxx index 047fc1181..6221aa238 100644 --- a/src/libserver/symcache/symcache_c.cxx +++ b/src/libserver/symcache/symcache_c.cxx @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -719,4 +719,11 @@ void rspamd_symcache_runtime_destroy(struct rspamd_task *task) { auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); cache_runtime->savepoint_dtor(task); -}
\ No newline at end of file +} + +void rspamd_symcache_promote_resort(struct rspamd_symcache *cache) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + real_cache->promote_resort(); +} diff --git a/src/libserver/symcache/symcache_impl.cxx b/src/libserver/symcache/symcache_impl.cxx index c0278cfc1..c1ca2a6ed 100644 --- a/src/libserver/symcache/symcache_impl.cxx +++ b/src/libserver/symcache/symcache_impl.cxx @@ -274,7 +274,7 @@ auto symcache::load_items() -> bool return false; } - auto *parser = ucl_parser_new(0); + auto *parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS); const auto *p = (const std::uint8_t *) (hdr + 1); if (!ucl_parser_add_chunk(parser, p, cached_map->get_size() - sizeof(*hdr))) { diff --git a/src/libserver/symcache/symcache_internal.hxx b/src/libserver/symcache/symcache_internal.hxx index c7dda51d1..f715b5bb0 100644 --- a/src/libserver/symcache/symcache_internal.hxx +++ b/src/libserver/symcache/symcache_internal.hxx @@ -644,6 +644,14 @@ public: * @return */ auto get_max_timeout(std::vector<std::pair<double, const cache_item *>> &elts) const -> double; + + /** + * Promote cache resort on next use (after dynamic symbol registration) + */ + auto promote_resort() -> void + { + cur_order_gen++; + } }; diff --git a/src/libserver/task.c b/src/libserver/task.c index bd1e07549..f655ab11b 100644 --- a/src/libserver/task.c +++ b/src/libserver/task.c @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -196,8 +196,8 @@ void rspamd_task_free(struct rspamd_task *task) rspamd_email_address_free(task->from_envelope_orig); } - if (task->meta_words) { - g_array_free(task->meta_words, TRUE); + if (task->meta_words.a) { + kv_destroy(task->meta_words); } ucl_object_unref(task->messages); @@ -730,7 +730,7 @@ rspamd_task_process(struct rspamd_task *task, unsigned int stages) if (all_done && (task->flags & RSPAMD_TASK_FLAG_LEARN_AUTO) && !RSPAMD_TASK_IS_EMPTY(task) && - !(task->flags & (RSPAMD_TASK_FLAG_LEARN_SPAM | RSPAMD_TASK_FLAG_LEARN_HAM))) { + !(task->flags & (RSPAMD_TASK_FLAG_LEARN_SPAM | RSPAMD_TASK_FLAG_LEARN_HAM | RSPAMD_TASK_FLAG_LEARN_CLASS))) { rspamd_stat_check_autolearn(task); } break; @@ -738,12 +738,32 @@ rspamd_task_process(struct rspamd_task *task, unsigned int stages) case RSPAMD_TASK_STAGE_LEARN: case RSPAMD_TASK_STAGE_LEARN_PRE: case RSPAMD_TASK_STAGE_LEARN_POST: - if (task->flags & (RSPAMD_TASK_FLAG_LEARN_SPAM | RSPAMD_TASK_FLAG_LEARN_HAM)) { + if (task->flags & (RSPAMD_TASK_FLAG_LEARN_SPAM | RSPAMD_TASK_FLAG_LEARN_HAM | RSPAMD_TASK_FLAG_LEARN_CLASS)) { if (task->err == NULL) { - if (!rspamd_stat_learn(task, - task->flags & RSPAMD_TASK_FLAG_LEARN_SPAM, - task->cfg->lua_state, task->classifier, - st, &stat_error)) { + gboolean learn_result = FALSE; + + if (task->flags & RSPAMD_TASK_FLAG_LEARN_CLASS) { + /* Multi-class learning */ + const char *autolearn_class = rspamd_task_get_autolearn_class(task); + if (autolearn_class) { + learn_result = rspamd_stat_learn_class(task, autolearn_class, + task->cfg->lua_state, task->classifier, + st, &stat_error); + } + else { + g_set_error(&stat_error, g_quark_from_static_string("stat"), 500, + "No autolearn class specified for multi-class learning"); + } + } + else { + /* Legacy binary learning */ + learn_result = rspamd_stat_learn(task, + task->flags & RSPAMD_TASK_FLAG_LEARN_SPAM, + task->cfg->lua_state, task->classifier, + st, &stat_error); + } + + if (!learn_result) { if (stat_error == NULL) { 
g_set_error(&stat_error, @@ -922,15 +942,14 @@ rspamd_learn_task_spam(struct rspamd_task *task, const char *classifier, GError **err) { + /* Use unified class-based approach internally */ + const char *class_name = is_spam ? "spam" : "ham"; + /* Disable learn auto flag to avoid bad learn codes */ task->flags &= ~RSPAMD_TASK_FLAG_LEARN_AUTO; - if (is_spam) { - task->flags |= RSPAMD_TASK_FLAG_LEARN_SPAM; - } - else { - task->flags |= RSPAMD_TASK_FLAG_LEARN_HAM; - } + /* Use the unified class-based learning approach */ + rspamd_task_set_autolearn_class(task, class_name); task->classifier = classifier; diff --git a/src/libserver/task.h b/src/libserver/task.h index 6be350098..a1742e160 100644 --- a/src/libserver/task.h +++ b/src/libserver/task.h @@ -24,6 +24,7 @@ #include "dns.h" #include "re_cache.h" #include "khash.h" +#include "libserver/word.h" #ifdef __cplusplus extern "C" { @@ -103,9 +104,9 @@ enum rspamd_task_stage { #define RSPAMD_TASK_FLAG_LEARN_SPAM (1u << 12u) #define RSPAMD_TASK_FLAG_LEARN_HAM (1u << 13u) #define RSPAMD_TASK_FLAG_LEARN_AUTO (1u << 14u) +#define RSPAMD_TASK_FLAG_LEARN_CLASS (1u << 25u) #define RSPAMD_TASK_FLAG_BROKEN_HEADERS (1u << 15u) -#define RSPAMD_TASK_FLAG_HAS_SPAM_TOKENS (1u << 16u) -#define RSPAMD_TASK_FLAG_HAS_HAM_TOKENS (1u << 17u) +/* Removed RSPAMD_TASK_FLAG_HAS_SPAM_TOKENS and RSPAMD_TASK_FLAG_HAS_HAM_TOKENS - not needed in multi-class */ #define RSPAMD_TASK_FLAG_EMPTY (1u << 18u) #define RSPAMD_TASK_FLAG_PROFILE (1u << 19u) #define RSPAMD_TASK_FLAG_GREYLISTED (1u << 20u) @@ -113,7 +114,7 @@ enum rspamd_task_stage { #define RSPAMD_TASK_FLAG_SSL (1u << 22u) #define RSPAMD_TASK_FLAG_BAD_UNICODE (1u << 23u) #define RSPAMD_TASK_FLAG_MESSAGE_REWRITE (1u << 24u) -#define RSPAMD_TASK_FLAG_MAX_SHIFT (24u) +#define RSPAMD_TASK_FLAG_MAX_SHIFT (25u) /* Request has been done by a local client */ #define RSPAMD_TASK_PROTOCOL_FLAG_LOCAL_CLIENT (1u << 1u) @@ -187,7 +188,7 @@ struct rspamd_task { struct rspamd_scan_result *result; /**< Metric 
result */ khash_t(rspamd_task_lua_cache) lua_cache; /**< cache of lua objects */ GPtrArray *tokens; /**< statistics tokens */ - GArray *meta_words; /**< rspamd_stat_token_t produced from meta headers + rspamd_words_t meta_words; /**< rspamd_word_t produced from meta headers (e.g. Subject) */ GPtrArray *rcpt_envelope; /**< array of rspamd_email_address */ diff --git a/src/libserver/word.h b/src/libserver/word.h new file mode 100644 index 000000000..7698bf327 --- /dev/null +++ b/src/libserver/word.h @@ -0,0 +1,88 @@ +/* + * Copyright 2025 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef RSPAMD_WORD_H +#define RSPAMD_WORD_H + +#include "config.h" +#include "fstring.h" +#include "contrib/libucl/kvec.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file word.h + * Word processing structures and definitions + */ + +/* Word flags */ +#define RSPAMD_WORD_FLAG_TEXT (1u << 0) +#define RSPAMD_WORD_FLAG_META (1u << 1) +#define RSPAMD_WORD_FLAG_LUA_META (1u << 2) +#define RSPAMD_WORD_FLAG_EXCEPTION (1u << 3) +#define RSPAMD_WORD_FLAG_HEADER (1u << 4) +#define RSPAMD_WORD_FLAG_UNIGRAM (1u << 5) +#define RSPAMD_WORD_FLAG_UTF (1u << 6) +#define RSPAMD_WORD_FLAG_NORMALISED (1u << 7) +#define RSPAMD_WORD_FLAG_STEMMED (1u << 8) +#define RSPAMD_WORD_FLAG_BROKEN_UNICODE (1u << 9) +#define RSPAMD_WORD_FLAG_STOP_WORD (1u << 10) +#define RSPAMD_WORD_FLAG_SKIPPED (1u << 11) +#define RSPAMD_WORD_FLAG_INVISIBLE_SPACES (1u << 12) +#define RSPAMD_WORD_FLAG_EMOJI (1u << 13) + +/** + * Word structure representing tokenized text + */ +typedef struct rspamd_word_s { + rspamd_ftok_t original; /* utf8 raw */ + rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized, lowercased */ + rspamd_ftok_t normalized; /* normalized and lowercased utf8 */ + rspamd_ftok_t stemmed; /* stemmed utf8 */ + unsigned int flags; +} rspamd_word_t; + +/** + * Vector of words using kvec + */ +typedef kvec_t(rspamd_word_t) rspamd_words_t; + +/* Legacy typedefs for backward compatibility */ +typedef rspamd_word_t rspamd_stat_token_t; + +/* Legacy flag aliases for backward compatibility */ +#define RSPAMD_STAT_TOKEN_FLAG_TEXT RSPAMD_WORD_FLAG_TEXT +#define RSPAMD_STAT_TOKEN_FLAG_META RSPAMD_WORD_FLAG_META +#define RSPAMD_STAT_TOKEN_FLAG_LUA_META RSPAMD_WORD_FLAG_LUA_META +#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION RSPAMD_WORD_FLAG_EXCEPTION +#define RSPAMD_STAT_TOKEN_FLAG_HEADER RSPAMD_WORD_FLAG_HEADER +#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM RSPAMD_WORD_FLAG_UNIGRAM +#define RSPAMD_STAT_TOKEN_FLAG_UTF RSPAMD_WORD_FLAG_UTF +#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED 
RSPAMD_WORD_FLAG_NORMALISED +#define RSPAMD_STAT_TOKEN_FLAG_STEMMED RSPAMD_WORD_FLAG_STEMMED +#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE RSPAMD_WORD_FLAG_BROKEN_UNICODE +#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD RSPAMD_WORD_FLAG_STOP_WORD +#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED RSPAMD_WORD_FLAG_SKIPPED +#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES RSPAMD_WORD_FLAG_INVISIBLE_SPACES +#define RSPAMD_STAT_TOKEN_FLAG_EMOJI RSPAMD_WORD_FLAG_EMOJI + +#ifdef __cplusplus +} +#endif + +#endif /* RSPAMD_WORD_H */ diff --git a/src/libserver/worker_util.c b/src/libserver/worker_util.c index d0ac8d8d3..fdcc5a4b3 100644 --- a/src/libserver/worker_util.c +++ b/src/libserver/worker_util.c @@ -1908,14 +1908,27 @@ rspamd_worker_hyperscan_ready(struct rspamd_main *rspamd_main, memset(&rep, 0, sizeof(rep)); rep.type = RSPAMD_CONTROL_HYPERSCAN_LOADED; - if (rspamd_re_cache_is_hs_loaded(cache) != RSPAMD_HYPERSCAN_LOADED_FULL || - cmd->cmd.hs_loaded.forced) { + /* Check if this is a scoped notification */ + if (cmd->cmd.hs_loaded.scope[0] != '\0') { + /* Scoped hyperscan loading */ + const char *scope = cmd->cmd.hs_loaded.scope; - msg_info("loading hyperscan expressions after receiving compilation " - "notice: %s", - (rspamd_re_cache_is_hs_loaded(cache) != RSPAMD_HYPERSCAN_LOADED_FULL) ? "new db" : "forced update"); - rep.reply.hs_loaded.status = rspamd_re_cache_load_hyperscan( - worker->srv->cfg->re_cache, cmd->cmd.hs_loaded.cache_dir, false); + msg_info("loading hyperscan expressions for scope '%s' after receiving compilation notice", scope); + + rep.reply.hs_loaded.status = rspamd_re_cache_load_hyperscan_scoped( + cache, cmd->cmd.hs_loaded.cache_dir, false); + } + else { + /* Legacy/full cache loading */ + if (rspamd_re_cache_is_hs_loaded(cache) != RSPAMD_HYPERSCAN_LOADED_FULL || + cmd->cmd.hs_loaded.forced) { + + msg_info("loading hyperscan expressions after receiving compilation " + "notice: %s", + (rspamd_re_cache_is_hs_loaded(cache) != RSPAMD_HYPERSCAN_LOADED_FULL) ? 
"new db" : "forced update"); + rep.reply.hs_loaded.status = rspamd_re_cache_load_hyperscan( + worker->srv->cfg->re_cache, cmd->cmd.hs_loaded.cache_dir, false); + } } if (write(fd, &rep, sizeof(rep)) != sizeof(rep)) { @@ -2138,7 +2151,7 @@ rspamd_controller_load_saved_stats(struct rspamd_main *rspamd_main, return; } - parser = ucl_parser_new(0); + parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS); if (!ucl_parser_add_file(parser, cfg->stats_file)) { msg_err_config("cannot parse controller stats from %s: %s", @@ -2556,4 +2569,4 @@ rspamd_metrics_to_prometheus_string(const ucl_object_t *top) /* Must be finalized and freed by caller */ return output; -}
\ No newline at end of file +} |