diff options
Diffstat (limited to 'src/libmime')
-rw-r--r-- | src/libmime/lang_detection.c | 41 | ||||
-rw-r--r-- | src/libmime/lang_detection_fasttext.cxx | 21 | ||||
-rw-r--r-- | src/libmime/lang_detection_fasttext.h | 3 | ||||
-rw-r--r-- | src/libmime/message.c | 35 | ||||
-rw-r--r-- | src/libmime/message.h | 3 | ||||
-rw-r--r-- | src/libmime/mime_string.hxx | 8 |
6 files changed, 62 insertions, 49 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 6e180ea66..b783b8325 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -363,7 +363,7 @@ rspamd_language_detector_read_file(struct rspamd_config *cfg, double mean = 0, std = 0, delta = 0, delta2 = 0, m2 = 0; enum rspamd_language_category cat = RSPAMD_LANGUAGE_MAX; - parser = ucl_parser_new(UCL_PARSER_NO_FILEVARS); + parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS); if (!ucl_parser_add_file(parser, path)) { msg_warn_config("cannot parse file %s: %s", path, ucl_parser_get_error(parser)); @@ -825,7 +825,7 @@ rspamd_language_detector_init(struct rspamd_config *cfg) languages_pattern = g_string_sized_new(PATH_MAX); rspamd_printf_gstring(languages_pattern, "%s/stop_words", languages_path); - parser = ucl_parser_new(UCL_PARSER_DEFAULT); + parser = ucl_parser_new(UCL_PARSER_SAFE_FLAGS); if (ucl_parser_add_file(parser, languages_pattern->str)) { stop_words = ucl_parser_get_object(parser); @@ -936,7 +936,7 @@ end: } static void -rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords, +rspamd_language_detector_random_select(rspamd_words_t *ucs_tokens, unsigned int nwords, goffset *offsets_out, uint64_t *seed) { @@ -946,7 +946,7 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords, g_assert(nwords != 0); g_assert(offsets_out != NULL); - g_assert(ucs_tokens->len >= nwords); + g_assert(kv_size(*ucs_tokens) >= nwords); /* * We split input array into `nwords` parts. For each part we randomly select * an element from this particular split. Here is an example: @@ -963,22 +963,22 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords, * their splits. It is not uniform distribution but it seems to be better * to include words from different text parts */ - step_len = ucs_tokens->len / nwords; - remainder = ucs_tokens->len % nwords; + step_len = kv_size(*ucs_tokens) / nwords; + remainder = kv_size(*ucs_tokens) % nwords; out_idx = 0; coin = rspamd_random_uint64_fast_seed(seed); sel = coin % (step_len + remainder); offsets_out[out_idx] = sel; - for (i = step_len + remainder; i < ucs_tokens->len; + for (i = step_len + remainder; i < kv_size(*ucs_tokens); i += step_len, out_idx++) { unsigned int ntries = 0; coin = rspamd_random_uint64_fast_seed(seed); sel = (coin % step_len) + i; for (;;) { - tok = &g_array_index(ucs_tokens, rspamd_stat_token_t, sel); + tok = &kv_A(*ucs_tokens, sel); /* Filter bad tokens */ if (tok->unicode.len >= 2 && @@ -995,8 +995,8 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords, if (ntries < step_len) { sel = (coin % step_len) + i; } - else if (ntries < ucs_tokens->len) { - sel = coin % ucs_tokens->len; + else if (ntries < kv_size(*ucs_tokens)) { + sel = coin % kv_size(*ucs_tokens); } else { offsets_out[out_idx] = sel; @@ -1223,12 +1223,12 @@ static void rspamd_language_detector_detect_type(struct rspamd_task *task, unsigned int nwords, struct rspamd_lang_detector *d, - GArray *words, + rspamd_words_t *words, enum rspamd_language_category cat, khash_t(rspamd_candidates_hash) * candidates, struct rspamd_mime_text_part *part) { - unsigned int nparts = MIN(words->len, nwords); + unsigned int nparts = MIN(kv_size(*words), nwords); goffset *selected_words; rspamd_stat_token_t *tok; unsigned int i; @@ -1241,8 +1241,7 @@ rspamd_language_detector_detect_type(struct rspamd_task *task, msg_debug_lang_det("randomly selected %d words", nparts); for (i = 0; i < nparts; i++) { - tok = &g_array_index(words, rspamd_stat_token_t, - selected_words[i]); + tok = &kv_A(*words, selected_words[i]); if (tok->unicode.len >= 3) { rspamd_language_detector_detect_word(task, d, tok, candidates, @@ -1282,7 +1281,7 @@ static enum rspamd_language_detected_type rspamd_language_detector_try_ngramm(struct rspamd_task *task, unsigned int nwords, struct rspamd_lang_detector *d, - GArray *ucs_tokens, + rspamd_words_t *ucs_tokens, enum rspamd_language_category cat, khash_t(rspamd_candidates_hash) * candidates, struct rspamd_mime_text_part *part) @@ -1863,7 +1862,7 @@ rspamd_language_detector_detect(struct rspamd_task *task, if (rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) { rspamd_fasttext_predict_result_t fasttext_predict_result = rspamd_lang_detection_fasttext_detect(d->fasttext_detector, task, - part->utf_words, 4); + &part->utf_words, 4); ndetected = rspamd_lang_detection_fasttext_get_nlangs(fasttext_predict_result); @@ -1930,11 +1929,11 @@ rspamd_language_detector_detect(struct rspamd_task *task, if (!ret) { /* Apply trigramms detection */ candidates = kh_init(rspamd_candidates_hash); - if (part->utf_words->len < default_short_text_limit) { + if (kv_size(part->utf_words) < default_short_text_limit) { r = rs_detect_none; msg_debug_lang_det("text is too short for trigrams detection: " "%d words; at least %d words required", - (int) part->utf_words->len, + (int) kv_size(part->utf_words), (int) default_short_text_limit); switch (cat) { case RSPAMD_LANGUAGE_CYRILLIC: @@ -1960,7 +1959,7 @@ rspamd_language_detector_detect(struct rspamd_task *task, r = rspamd_language_detector_try_ngramm(task, default_words, d, - part->utf_words, + &part->utf_words, cat, candidates, part); @@ -2123,4 +2122,4 @@ int rspamd_language_detector_elt_flags(const struct rspamd_language_elt *elt) } return 0; -}
\ No newline at end of file +} diff --git a/src/libmime/lang_detection_fasttext.cxx b/src/libmime/lang_detection_fasttext.cxx index 8ea2706e6..983ff78de 100644 --- a/src/libmime/lang_detection_fasttext.cxx +++ b/src/libmime/lang_detection_fasttext.cxx @@ -22,6 +22,7 @@ #include "libserver/logger.h" #include "contrib/fmt/include/fmt/base.h" #include "stat_api.h" +#include "libserver/word.h" #include <exception> #include <string_view> #include <vector> @@ -180,26 +181,32 @@ bool rspamd_lang_detection_fasttext_is_enabled(void *ud) rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud, struct rspamd_task *task, - GArray *utf_words, + rspamd_words_t *utf_words, int k) { #ifndef WITH_FASTTEXT return nullptr; #else /* Avoid too long inputs */ - static const unsigned int max_fasttext_input_len = 1024 * 1024; + static const size_t max_fasttext_input_len = 1024 * 1024; auto *real_model = FASTTEXT_MODEL_TO_C_API(ud); std::vector<std::int32_t> words_vec; - words_vec.reserve(utf_words->len); - for (auto i = 0; i < std::min(utf_words->len, max_fasttext_input_len); i++) { - const auto *w = &g_array_index(utf_words, rspamd_stat_token_t, i); + if (!utf_words || !utf_words->a) { + return nullptr; + } + + auto words_count = kv_size(*utf_words); + words_vec.reserve(words_count); + + for (auto i = 0; i < std::min(words_count, max_fasttext_input_len); i++) { + const auto *w = &kv_A(*utf_words, i); if (w->original.len > 0) { real_model->word2vec(w->original.begin, w->original.len, words_vec); } } - msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), utf_words->len); + msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), words_count); auto *res = real_model->detect_language(words_vec, k); @@ -266,4 +273,4 @@ void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res #endif } -G_END_DECLS
\ No newline at end of file +G_END_DECLS diff --git a/src/libmime/lang_detection_fasttext.h b/src/libmime/lang_detection_fasttext.h index 2a2756968..e2b67181a 100644 --- a/src/libmime/lang_detection_fasttext.h +++ b/src/libmime/lang_detection_fasttext.h @@ -17,6 +17,7 @@ #define RSPAMD_LANG_DETECTION_FASTTEXT_H #include "config.h" +#include "libserver/word.h" G_BEGIN_DECLS struct rspamd_config; @@ -53,7 +54,7 @@ typedef void *rspamd_fasttext_predict_result_t; * @return TRUE if language is detected */ rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud, - struct rspamd_task *task, GArray *utf_words, int k); + struct rspamd_task *task, rspamd_words_t *utf_words, int k); /** * Get number of languages detected diff --git a/src/libmime/message.c b/src/libmime/message.c index f2cabf399..8442c80ac 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,6 +40,8 @@ #include "contrib/uthash/utlist.h" #include "contrib/t1ha/t1ha.h" #include "received.h" +#define RSPAMD_TOKENIZER_INTERNAL +#include "libstat/tokenizers/custom_tokenizer.h" #define GTUBE_SYMBOL "GTUBE" @@ -71,14 +73,14 @@ rspamd_mime_part_extract_words(struct rspamd_task *task, rspamd_stat_token_t *w; unsigned int i, total_len = 0, short_len = 0; - if (part->utf_words) { - rspamd_stem_words(part->utf_words, task->task_pool, part->language, + if (part->utf_words.a) { + rspamd_stem_words(&part->utf_words, task->task_pool, part->language, task->lang_det); - for (i = 0; i < part->utf_words->len; i++) { + for (i = 0; i < kv_size(part->utf_words); i++) { uint64_t h; - w = &g_array_index(part->utf_words, rspamd_stat_token_t, i); + w = &kv_A(part->utf_words, i); if (w->stemmed.len > 0) { /* @@ -108,7 +110,7 @@ rspamd_mime_part_extract_words(struct rspamd_task *task, } } - if (part->utf_words->len) { + if (kv_size(part->utf_words)) { double *avg_len_p, *short_len_p; avg_len_p = rspamd_mempool_get_variable(task->task_pool, @@ -185,21 +187,24 @@ rspamd_mime_part_create_words(struct rspamd_task *task, tok_type = RSPAMD_TOKENIZE_RAW; } - part->utf_words = rspamd_tokenize_text( + /* Initialize kvec for words */ + kv_init(part->utf_words); + + rspamd_tokenize_text( part->utf_stripped_content->data, part->utf_stripped_content->len, &part->utf_stripped_text, tok_type, task->cfg, part->exceptions, NULL, - NULL, + &part->utf_words, task->task_pool); - if (part->utf_words) { + if (part->utf_words.a) { part->normalized_hashes = g_array_sized_new(FALSE, FALSE, - sizeof(uint64_t), part->utf_words->len); - rspamd_normalize_words(part->utf_words, task->task_pool); + sizeof(uint64_t), kv_size(part->utf_words)); + rspamd_normalize_words(&part->utf_words, task->task_pool); } } @@ -209,7 +214,7 @@ rspamd_mime_part_detect_language(struct rspamd_task *task, { struct rspamd_lang_detector_res *lang; - if (!IS_TEXT_PART_EMPTY(part) && part->utf_words && part->utf_words->len > 0 && + if (!IS_TEXT_PART_EMPTY(part) && part->utf_words.a && kv_size(part->utf_words) > 0 && task->lang_det) { if (rspamd_language_detector_detect(task, task->lang_det, part)) { lang = g_ptr_array_index(part->languages, 0); @@ -1106,8 +1111,8 @@ rspamd_message_dtor(struct rspamd_message *msg) PTR_ARRAY_FOREACH(msg->text_parts, i, tp) { - if (tp->utf_words) { - g_array_free(tp->utf_words, TRUE); + if (tp->utf_words.a) { + kv_destroy(tp->utf_words); } if (tp->normalized_hashes) { g_array_free(tp->normalized_hashes, TRUE); @@ -1583,7 +1588,7 @@ void rspamd_message_process(struct rspamd_task *task) rspamd_mime_part_extract_words(task, text_part); - if (text_part->utf_words) { + if (text_part->utf_words.a) { total_words += text_part->nwords; } } diff --git a/src/libmime/message.h b/src/libmime/message.h index cb695773e..e6b454362 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -16,6 +16,7 @@ #include "libserver/url.h" #include "libutil/ref.h" #include "libutil/str_util.h" +#include "libserver/word.h" #include <unicode/uchar.h> #include <unicode/utext.h> @@ -139,7 +140,7 @@ struct rspamd_mime_text_part { GByteArray *utf_raw_content; /* utf raw content */ GByteArray *utf_stripped_content; /* utf content with no newlines */ GArray *normalized_hashes; /* Array of uint64_t */ - GArray *utf_words; /* Array of rspamd_stat_token_t */ + rspamd_words_t utf_words; /* kvec of rspamd_word_t */ UText utf_stripped_text; /* Used by libicu to represent the utf8 content */ GPtrArray *newlines; /**< positions of newlines in text, relative to content*/ diff --git a/src/libmime/mime_string.hxx b/src/libmime/mime_string.hxx index b181576d3..d6c11d018 100644 --- a/src/libmime/mime_string.hxx +++ b/src/libmime/mime_string.hxx @@ -497,19 +497,19 @@ public: } /* Comparison */ - auto operator==(const basic_mime_string &other) + auto operator==(const basic_mime_string &other) const { return other.storage == storage; } - auto operator==(const storage_type &other) + auto operator==(const storage_type &other) const { return other == storage; } - auto operator==(const view_type &other) + auto operator==(const view_type &other) const { return other == storage; } - auto operator==(const CharT *other) + auto operator==(const CharT *other) const { if (other == NULL) { return false; |