From e6f1e32b07e275379e779e83f62d09c0ed15209f Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 8 Dec 2022 21:36:36 +0000 Subject: [PATCH] [Minor] Chartable: Adjustments to the metatokens handling --- src/plugins/chartable.cxx | 238 +++++++++++++++++++------------------- 1 file changed, 120 insertions(+), 118 deletions(-) diff --git a/src/plugins/chartable.cxx b/src/plugins/chartable.cxx index c5820c606..6e3fd9b10 100644 --- a/src/plugins/chartable.cxx +++ b/src/plugins/chartable.cxx @@ -45,18 +45,20 @@ INIT_LOG_MODULE(chartable) /* Initialization */ -gint chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx); -gint chartable_module_config (struct rspamd_config *cfg, bool validate); -gint chartable_module_reconfig (struct rspamd_config *cfg); +gint chartable_module_init(struct rspamd_config *cfg, struct module_ctx **ctx); + +gint chartable_module_config(struct rspamd_config *cfg, bool validate); + +gint chartable_module_reconfig(struct rspamd_config *cfg); module_t chartable_module = { - "chartable", - chartable_module_init, - chartable_module_config, - chartable_module_reconfig, - nullptr, - RSPAMD_MODULE_VER, - (guint)-1, + "chartable", + chartable_module_init, + chartable_module_config, + chartable_module_reconfig, + nullptr, + RSPAMD_MODULE_VER, + (guint) -1, }; struct chartable_ctx { @@ -68,21 +70,22 @@ struct chartable_ctx { }; static inline struct chartable_ctx * -chartable_get_context (struct rspamd_config *cfg) +chartable_get_context(struct rspamd_config *cfg) { - return (struct chartable_ctx *)g_ptr_array_index (cfg->c_modules, - chartable_module.ctx_offset); + return (struct chartable_ctx *) g_ptr_array_index(cfg->c_modules, + chartable_module.ctx_offset); } -static void chartable_symbol_callback (struct rspamd_task *task, - struct rspamd_symcache_dynamic_item *item, - void *unused); -static void chartable_url_symbol_callback (struct rspamd_task *task, - struct rspamd_symcache_dynamic_item *item, - void *unused); +static void chartable_symbol_callback(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item, + void *unused); + +static void chartable_url_symbol_callback(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item, + void *unused); gint -chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx) +chartable_module_init(struct rspamd_config *cfg, struct module_ctx **ctx) { struct chartable_ctx *chartable_module_ctx; @@ -90,40 +93,40 @@ chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx) struct chartable_ctx); chartable_module_ctx->max_word_len = 10; - *ctx = (struct module_ctx *)chartable_module_ctx; + *ctx = (struct module_ctx *) chartable_module_ctx; return 0; } gint -chartable_module_config (struct rspamd_config *cfg, bool validate) +chartable_module_config(struct rspamd_config *cfg, bool _) { const ucl_object_t *value; gint res = TRUE; - struct chartable_ctx *chartable_module_ctx = chartable_get_context (cfg); + struct chartable_ctx *chartable_module_ctx = chartable_get_context(cfg); - if (!rspamd_config_is_module_enabled (cfg, "chartable")) { + if (!rspamd_config_is_module_enabled(cfg, "chartable")) { return TRUE; } if ((value = - rspamd_config_get_module_opt (cfg, "chartable", "symbol")) != nullptr) { - chartable_module_ctx->symbol = ucl_obj_tostring (value); + rspamd_config_get_module_opt(cfg, "chartable", "symbol")) != nullptr) { + chartable_module_ctx->symbol = ucl_obj_tostring(value); } else { chartable_module_ctx->symbol = DEFAULT_SYMBOL; } if ((value = - rspamd_config_get_module_opt (cfg, "chartable", "url_symbol")) != nullptr) { - chartable_module_ctx->url_symbol = ucl_obj_tostring (value); + rspamd_config_get_module_opt(cfg, "chartable", "url_symbol")) != nullptr) { + chartable_module_ctx->url_symbol = ucl_obj_tostring(value); } else { chartable_module_ctx->url_symbol = DEFAULT_URL_SYMBOL; } if ((value = - rspamd_config_get_module_opt (cfg, "chartable", "threshold")) != nullptr) { - if (!ucl_obj_todouble_safe (value, &chartable_module_ctx->threshold)) { + rspamd_config_get_module_opt(cfg, "chartable", "threshold")) != nullptr) { + if (!ucl_obj_todouble_safe(value, &chartable_module_ctx->threshold)) { msg_warn_config ("invalid numeric value"); chartable_module_ctx->threshold = DEFAULT_THRESHOLD; } @@ -132,37 +135,37 @@ chartable_module_config (struct rspamd_config *cfg, bool validate) chartable_module_ctx->threshold = DEFAULT_THRESHOLD; } if ((value = - rspamd_config_get_module_opt (cfg, "chartable", "max_word_len")) != nullptr) { - chartable_module_ctx->max_word_len = ucl_object_toint (value); + rspamd_config_get_module_opt(cfg, "chartable", "max_word_len")) != nullptr) { + chartable_module_ctx->max_word_len = ucl_object_toint(value); } else { chartable_module_ctx->threshold = DEFAULT_THRESHOLD; } - rspamd_symcache_add_symbol (cfg->cache, - chartable_module_ctx->symbol, - 0, - chartable_symbol_callback, - nullptr, - SYMBOL_TYPE_NORMAL, - -1); - rspamd_symcache_add_symbol (cfg->cache, - chartable_module_ctx->url_symbol, - 0, - chartable_url_symbol_callback, - nullptr, - SYMBOL_TYPE_NORMAL, - -1); - - msg_info_config ("init internal chartable module"); + rspamd_symcache_add_symbol(cfg->cache, + chartable_module_ctx->symbol, + 0, + chartable_symbol_callback, + nullptr, + SYMBOL_TYPE_NORMAL, + -1); + rspamd_symcache_add_symbol(cfg->cache, + chartable_module_ctx->url_symbol, + 0, + chartable_url_symbol_callback, + nullptr, + SYMBOL_TYPE_NORMAL, + -1); + + msg_info_config("init internal chartable module"); return res; } gint -chartable_module_reconfig (struct rspamd_config *cfg) +chartable_module_reconfig(struct rspamd_config *cfg) { - return chartable_module_config (cfg, false); + return chartable_module_config(cfg, false); } static const auto latin_confusable = ankerl::unordered_dense::set{ @@ -321,19 +324,18 @@ static const auto latin_confusable = ankerl::unordered_dense::set{ }; static gboolean -rspamd_can_alias_latin (gint ch) +rspamd_can_alias_latin(gint ch) { return latin_confusable.contains(ch); } static gdouble -rspamd_chartable_process_word_utf (struct rspamd_task *task, - rspamd_stat_token_t *w, - gboolean is_url, - guint *ncap, - struct chartable_ctx *chartable_module_ctx, - const gchar *lang, - gboolean ignore_diacritics) +rspamd_chartable_process_word_utf(struct rspamd_task *task, + rspamd_stat_token_t *w, + gboolean is_url, + guint *ncap, + struct chartable_ctx *chartable_module_ctx, + gboolean ignore_diacritics) { const UChar32 *p, *end; gdouble badness = 0.0; @@ -357,12 +359,12 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task, while (p < end) { uc = *p++; - if (((gint32)uc) < 0) { + if (((gint32) uc) < 0) { break; } - sc = ublock_getCode (uc); - cat = u_charType (uc); + sc = ublock_getCode(uc); + cat = u_charType(uc); if (!ignore_diacritics) { if (cat == U_NON_SPACING_MARK || @@ -375,10 +377,10 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task, } } - if (u_isalpha (uc)) { + if (u_isalpha(uc)) { if (sc <= UBLOCK_COMBINING_DIACRITICAL_MARKS || - sc == UBLOCK_LATIN_EXTENDED_ADDITIONAL) { + sc == UBLOCK_LATIN_EXTENDED_ADDITIONAL) { /* * Assume all latin, IPA, diacritic and space modifiers * characters as basic latin @@ -386,16 +388,16 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task, sc = UBLOCK_BASIC_LATIN; } - if (sc != UBLOCK_BASIC_LATIN && u_isupper (uc)) { + if (sc != UBLOCK_BASIC_LATIN && u_isupper(uc)) { if (ncap) { - (*ncap) ++; + (*ncap)++; } } if (state == got_digit) { /* Penalize digit -> alpha translations */ if (!is_url && sc != UBLOCK_BASIC_LATIN && - prev_state != start_process) { + prev_state != start_process) { badness += 0.25; } } @@ -404,15 +406,15 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task, if (same_script_count > 0) { if (sc != UBLOCK_BASIC_LATIN && last_is_latin) { - if (rspamd_can_alias_latin (uc)) { - badness += 1.0 / (gdouble)same_script_count; + if (rspamd_can_alias_latin(uc)) { + badness += 1.0 / (gdouble) same_script_count; } last_is_latin = 0; same_script_count = 1; } else { - same_script_count ++; + same_script_count++; } } else { @@ -425,7 +427,7 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task, state = got_alpha; } - else if (u_isdigit (uc)) { + else if (u_isdigit(uc)) { if (state != got_digit) { prev_state = state; } @@ -443,7 +445,7 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task, same_script_count = 0; } - nsym ++; + nsym++; } if (nspecial > 0) { @@ -467,17 +469,17 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task, } msg_debug_chartable ("word %*s, badness: %.2f", - (gint)w->normalized.len, w->normalized.begin, - badness); + (gint) w->normalized.len, w->normalized.begin, + badness); return badness; } static gdouble -rspamd_chartable_process_word_ascii (struct rspamd_task *task, - rspamd_stat_token_t *w, - gboolean is_url, - struct chartable_ctx *chartable_module_ctx) +rspamd_chartable_process_word_ascii(struct rspamd_task *task, + rspamd_stat_token_t *w, + gboolean is_url, + struct chartable_ctx *chartable_module_ctx) { gdouble badness = 0.0; enum { @@ -516,12 +518,12 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task, if (same_script_count > 0) { if (sc != last_sc) { - badness += 1.0 / (gdouble)same_script_count; + badness += 1.0 / (gdouble) same_script_count; last_sc = sc; same_script_count = 1; } else { - same_script_count ++; + same_script_count++; } } else { @@ -544,7 +546,7 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task, same_script_count = 0; } - p ++; + p++; } if (badness > 4.0) { @@ -552,24 +554,24 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task, } msg_debug_chartable ("word %*s, badness: %.2f", - (gint)w->normalized.len, w->normalized.begin, - badness); + (gint) w->normalized.len, w->normalized.begin, + badness); return badness; } static gboolean -rspamd_chartable_process_part (struct rspamd_task *task, - struct rspamd_mime_text_part *part, - struct chartable_ctx *chartable_module_ctx, - gboolean ignore_diacritics) +rspamd_chartable_process_part(struct rspamd_task *task, + struct rspamd_mime_text_part *part, + struct chartable_ctx *chartable_module_ctx, + gboolean ignore_diacritics) { rspamd_stat_token_t *w; guint i, ncap = 0; gdouble cur_score = 0.0; if (part == nullptr || part->utf_words == nullptr || - part->utf_words->len == 0 || part->nwords == 0) { + part->utf_words->len == 0 || part->nwords == 0) { return FALSE; } @@ -579,12 +581,12 @@ rspamd_chartable_process_part (struct rspamd_task *task, if ((w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { - cur_score += rspamd_chartable_process_word_utf (task, w, FALSE, - &ncap, chartable_module_ctx, part->language, ignore_diacritics); + cur_score += rspamd_chartable_process_word_utf(task, w, FALSE, + &ncap, chartable_module_ctx, ignore_diacritics); } else { - cur_score += rspamd_chartable_process_word_ascii (task, w, - FALSE, chartable_module_ctx); + cur_score += rspamd_chartable_process_word_ascii(task, w, + FALSE, chartable_module_ctx); } } } @@ -596,7 +598,7 @@ rspamd_chartable_process_part (struct rspamd_task *task, */ part->capital_letters += ncap; - cur_score /= (gdouble)part->nwords; + cur_score /= (gdouble) part->nwords; if (cur_score > 1.0) { cur_score = 1.0; @@ -604,7 +606,7 @@ rspamd_chartable_process_part (struct rspamd_task *task, if (cur_score > chartable_module_ctx->threshold) { rspamd_task_insert_result (task, chartable_module_ctx->symbol, - cur_score, nullptr); + cur_score, nullptr); return TRUE; } @@ -612,37 +614,37 @@ rspamd_chartable_process_part (struct rspamd_task *task, } static void -chartable_symbol_callback (struct rspamd_task *task, - struct rspamd_symcache_dynamic_item *item, - void *unused) +chartable_symbol_callback(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item, + void *_) { guint i; struct rspamd_mime_text_part *part; - struct chartable_ctx *chartable_module_ctx = chartable_get_context (task->cfg); - const gchar *language = nullptr; - gboolean ignore_diacritics = FALSE, seen_violated_part = FALSE; + struct chartable_ctx *chartable_module_ctx = chartable_get_context(task->cfg); + gboolean ignore_diacritics = TRUE, seen_violated_part = FALSE; /* Check if we have parts with diacritic symbols language */ - PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, part) { + PTR_ARRAY_FOREACH (MESSAGE_FIELD(task, text_parts), i, part) { if (part->languages && part->languages->len > 0) { - struct rspamd_lang_detector_res *lang = - (struct rspamd_lang_detector_res *)g_ptr_array_index (part->languages, 0); + auto *lang = (struct rspamd_lang_detector_res *) g_ptr_array_index(part->languages, 0); gint flags; - flags = rspamd_language_detector_elt_flags (lang->elt); + flags = rspamd_language_detector_elt_flags(lang->elt); - if (flags & RS_LANGUAGE_DIACRITICS) { + if ((flags & RS_LANGUAGE_DIACRITICS)) { ignore_diacritics = TRUE; } + else if (lang->prob > 0.75) { + ignore_diacritics = FALSE; + } } - if (rspamd_chartable_process_part (task, part, chartable_module_ctx, - ignore_diacritics)) { + if (rspamd_chartable_process_part(task, part, chartable_module_ctx, ignore_diacritics)) { seen_violated_part = TRUE; } } - if (MESSAGE_FIELD (task, text_parts)->len == 0) { + if (MESSAGE_FIELD(task, text_parts)->len == 0) { /* No text parts, assume that we should ignore diacritics checks for metatokens */ ignore_diacritics = TRUE; } @@ -653,12 +655,12 @@ chartable_symbol_callback (struct rspamd_task *task, gsize arlen = task->meta_words->len; for (i = 0; i < arlen; i++) { - w = &g_array_index (task->meta_words, rspamd_stat_token_t, i); - cur_score += rspamd_chartable_process_word_utf (task, w, FALSE, - nullptr, chartable_module_ctx, language, ignore_diacritics); + w = &g_array_index(task->meta_words, rspamd_stat_token_t, i); + cur_score += rspamd_chartable_process_word_utf(task, w, FALSE, + nullptr, chartable_module_ctx, ignore_diacritics); } - cur_score /= (gdouble)arlen; + cur_score /= (gdouble) (arlen + 1); if (cur_score > 1.0) { cur_score = 1.0; @@ -672,19 +674,19 @@ chartable_symbol_callback (struct rspamd_task *task, } } - rspamd_task_insert_result (task, chartable_module_ctx->symbol, - cur_score, "subject"); + rspamd_task_insert_result(task, chartable_module_ctx->symbol, + cur_score, "subject"); } } - rspamd_symcache_finalize_item (task, item); + rspamd_symcache_finalize_item(task, item); } static void -chartable_url_symbol_callback (struct rspamd_task *task, - struct rspamd_symcache_dynamic_item *item, - void *unused) +chartable_url_symbol_callback(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item, + void *unused) { /* XXX: TODO: unbreak module once URLs unicode project is over */ #if 0 @@ -751,5 +753,5 @@ chartable_url_symbol_callback (struct rspamd_task *task, } #endif - rspamd_symcache_finalize_item (task, item); + rspamd_symcache_finalize_item(task, item); } -- 2.39.5