diff options
-rw-r--r-- | src/libmime/lang_detection.c | 35 | ||||
-rw-r--r-- | src/libmime/lang_detection_fasttext.cxx | 21 | ||||
-rw-r--r-- | src/libmime/lang_detection_fasttext.h | 3 | ||||
-rw-r--r-- | src/libmime/message.c | 31 | ||||
-rw-r--r-- | src/libmime/message.h | 3 | ||||
-rw-r--r-- | src/libserver/re_cache.c | 41 | ||||
-rw-r--r-- | src/libserver/task.c | 6 | ||||
-rw-r--r-- | src/libserver/task.h | 3 | ||||
-rw-r--r-- | src/libserver/word.h | 88 | ||||
-rw-r--r-- | src/libstat/stat_api.h | 27 | ||||
-rw-r--r-- | src/libstat/stat_process.c | 32 | ||||
-rw-r--r-- | src/libstat/tokenizers/custom_tokenizer.h | 26 | ||||
-rw-r--r-- | src/libstat/tokenizers/osb.c | 9 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizer_manager.c | 91 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 165 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 31 | ||||
-rw-r--r-- | src/libutil/shingles.c | 27 | ||||
-rw-r--r-- | src/libutil/shingles.h | 5 | ||||
-rw-r--r-- | src/lua/lua_common.c | 52 | ||||
-rw-r--r-- | src/lua/lua_common.h | 7 | ||||
-rw-r--r-- | src/lua/lua_mimepart.c | 39 | ||||
-rw-r--r-- | src/lua/lua_task.c | 82 | ||||
-rw-r--r-- | src/plugins/chartable.cxx | 24 | ||||
-rw-r--r-- | src/plugins/fuzzy_check.c | 14 | ||||
-rw-r--r-- | test/rspamd_shingles_test.c | 80 |
25 files changed, 588 insertions, 354 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 07ecff76d..b783b8325 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -936,7 +936,7 @@ end: } static void -rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords, +rspamd_language_detector_random_select(rspamd_words_t *ucs_tokens, unsigned int nwords, goffset *offsets_out, uint64_t *seed) { @@ -946,7 +946,7 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords, g_assert(nwords != 0); g_assert(offsets_out != NULL); - g_assert(ucs_tokens->len >= nwords); + g_assert(kv_size(*ucs_tokens) >= nwords); /* * We split input array into `nwords` parts. For each part we randomly select * an element from this particular split. Here is an example: @@ -963,22 +963,22 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords, * their splits. It is not uniform distribution but it seems to be better * to include words from different text parts */ - step_len = ucs_tokens->len / nwords; - remainder = ucs_tokens->len % nwords; + step_len = kv_size(*ucs_tokens) / nwords; + remainder = kv_size(*ucs_tokens) % nwords; out_idx = 0; coin = rspamd_random_uint64_fast_seed(seed); sel = coin % (step_len + remainder); offsets_out[out_idx] = sel; - for (i = step_len + remainder; i < ucs_tokens->len; + for (i = step_len + remainder; i < kv_size(*ucs_tokens); i += step_len, out_idx++) { unsigned int ntries = 0; coin = rspamd_random_uint64_fast_seed(seed); sel = (coin % step_len) + i; for (;;) { - tok = &g_array_index(ucs_tokens, rspamd_stat_token_t, sel); + tok = &kv_A(*ucs_tokens, sel); /* Filter bad tokens */ if (tok->unicode.len >= 2 && @@ -995,8 +995,8 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords, if (ntries < step_len) { sel = (coin % step_len) + i; } - else if (ntries < ucs_tokens->len) { - sel = coin % ucs_tokens->len; + else if (ntries < kv_size(*ucs_tokens)) { + sel = coin % kv_size(*ucs_tokens); } else { offsets_out[out_idx] = sel; @@ -1223,12 +1223,12 @@ static void rspamd_language_detector_detect_type(struct rspamd_task *task, unsigned int nwords, struct rspamd_lang_detector *d, - GArray *words, + rspamd_words_t *words, enum rspamd_language_category cat, khash_t(rspamd_candidates_hash) * candidates, struct rspamd_mime_text_part *part) { - unsigned int nparts = MIN(words->len, nwords); + unsigned int nparts = MIN(kv_size(*words), nwords); goffset *selected_words; rspamd_stat_token_t *tok; unsigned int i; @@ -1241,8 +1241,7 @@ rspamd_language_detector_detect_type(struct rspamd_task *task, msg_debug_lang_det("randomly selected %d words", nparts); for (i = 0; i < nparts; i++) { - tok = &g_array_index(words, rspamd_stat_token_t, - selected_words[i]); + tok = &kv_A(*words, selected_words[i]); if (tok->unicode.len >= 3) { rspamd_language_detector_detect_word(task, d, tok, candidates, @@ -1282,7 +1281,7 @@ static enum rspamd_language_detected_type rspamd_language_detector_try_ngramm(struct rspamd_task *task, unsigned int nwords, struct rspamd_lang_detector *d, - GArray *ucs_tokens, + rspamd_words_t *ucs_tokens, enum rspamd_language_category cat, khash_t(rspamd_candidates_hash) * candidates, struct rspamd_mime_text_part *part) @@ -1863,7 +1862,7 @@ rspamd_language_detector_detect(struct rspamd_task *task, if (rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) { rspamd_fasttext_predict_result_t fasttext_predict_result = rspamd_lang_detection_fasttext_detect(d->fasttext_detector, task, - part->utf_words, 4); + &part->utf_words, 4); ndetected = rspamd_lang_detection_fasttext_get_nlangs(fasttext_predict_result); @@ -1930,11 +1929,11 @@ rspamd_language_detector_detect(struct rspamd_task *task, if (!ret) { /* Apply trigramms detection */ candidates = kh_init(rspamd_candidates_hash); - if (part->utf_words->len < default_short_text_limit) { + if (kv_size(part->utf_words) < default_short_text_limit) { r = rs_detect_none; msg_debug_lang_det("text is too short for trigrams detection: " "%d words; at least %d words required", - (int) part->utf_words->len, + (int) kv_size(part->utf_words), (int) default_short_text_limit); switch (cat) { case RSPAMD_LANGUAGE_CYRILLIC: @@ -1960,7 +1959,7 @@ rspamd_language_detector_detect(struct rspamd_task *task, r = rspamd_language_detector_try_ngramm(task, default_words, d, - part->utf_words, + &part->utf_words, cat, candidates, part); @@ -2123,4 +2122,4 @@ int rspamd_language_detector_elt_flags(const struct rspamd_language_elt *elt) } return 0; -}
\ No newline at end of file +} diff --git a/src/libmime/lang_detection_fasttext.cxx b/src/libmime/lang_detection_fasttext.cxx index 8ea2706e6..983ff78de 100644 --- a/src/libmime/lang_detection_fasttext.cxx +++ b/src/libmime/lang_detection_fasttext.cxx @@ -22,6 +22,7 @@ #include "libserver/logger.h" #include "contrib/fmt/include/fmt/base.h" #include "stat_api.h" +#include "libserver/word.h" #include <exception> #include <string_view> #include <vector> @@ -180,26 +181,32 @@ bool rspamd_lang_detection_fasttext_is_enabled(void *ud) rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud, struct rspamd_task *task, - GArray *utf_words, + rspamd_words_t *utf_words, int k) { #ifndef WITH_FASTTEXT return nullptr; #else /* Avoid too long inputs */ - static const unsigned int max_fasttext_input_len = 1024 * 1024; + static const size_t max_fasttext_input_len = 1024 * 1024; auto *real_model = FASTTEXT_MODEL_TO_C_API(ud); std::vector<std::int32_t> words_vec; - words_vec.reserve(utf_words->len); - for (auto i = 0; i < std::min(utf_words->len, max_fasttext_input_len); i++) { - const auto *w = &g_array_index(utf_words, rspamd_stat_token_t, i); + if (!utf_words || !utf_words->a) { + return nullptr; + } + + auto words_count = kv_size(*utf_words); + words_vec.reserve(words_count); + + for (auto i = 0; i < std::min(words_count, max_fasttext_input_len); i++) { + const auto *w = &kv_A(*utf_words, i); if (w->original.len > 0) { real_model->word2vec(w->original.begin, w->original.len, words_vec); } } - msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), utf_words->len); + msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), words_count); auto *res = real_model->detect_language(words_vec, k); @@ -266,4 +273,4 @@ void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res #endif } -G_END_DECLS
\ No newline at end of file +G_END_DECLS diff --git a/src/libmime/lang_detection_fasttext.h b/src/libmime/lang_detection_fasttext.h index 2a2756968..e2b67181a 100644 --- a/src/libmime/lang_detection_fasttext.h +++ b/src/libmime/lang_detection_fasttext.h @@ -17,6 +17,7 @@ #define RSPAMD_LANG_DETECTION_FASTTEXT_H #include "config.h" +#include "libserver/word.h" G_BEGIN_DECLS struct rspamd_config; @@ -53,7 +54,7 @@ typedef void *rspamd_fasttext_predict_result_t; * @return TRUE if language is detected */ rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud, - struct rspamd_task *task, GArray *utf_words, int k); + struct rspamd_task *task, rspamd_words_t *utf_words, int k); /** * Get number of languages detected diff --git a/src/libmime/message.c b/src/libmime/message.c index 60894d879..bac67fb07 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -72,14 +72,14 @@ rspamd_mime_part_extract_words(struct rspamd_task *task, rspamd_stat_token_t *w; unsigned int i, total_len = 0, short_len = 0; - if (part->utf_words) { - rspamd_stem_words(part->utf_words, task->task_pool, part->language, + if (part->utf_words.a) { + rspamd_stem_words(&part->utf_words, task->task_pool, part->language, task->lang_det); - for (i = 0; i < part->utf_words->len; i++) { + for (i = 0; i < kv_size(part->utf_words); i++) { uint64_t h; - w = &g_array_index(part->utf_words, rspamd_stat_token_t, i); + w = &kv_A(part->utf_words, i); if (w->stemmed.len > 0) { /* @@ -109,7 +109,7 @@ rspamd_mime_part_extract_words(struct rspamd_task *task, } } - if (part->utf_words->len) { + if (kv_size(part->utf_words)) { double *avg_len_p, *short_len_p; avg_len_p = rspamd_mempool_get_variable(task->task_pool, @@ -186,21 +186,24 @@ rspamd_mime_part_create_words(struct rspamd_task *task, tok_type = RSPAMD_TOKENIZE_RAW; } - part->utf_words = rspamd_tokenize_text( + /* Initialize kvec for words */ + kv_init(part->utf_words); + + rspamd_tokenize_text( part->utf_stripped_content->data, part->utf_stripped_content->len, &part->utf_stripped_text, tok_type, task->cfg, part->exceptions, NULL, - NULL, + &part->utf_words, task->task_pool); - if (part->utf_words) { + if (part->utf_words.a) { part->normalized_hashes = g_array_sized_new(FALSE, FALSE, - sizeof(uint64_t), part->utf_words->len); - rspamd_normalize_words(part->utf_words, task->task_pool); + sizeof(uint64_t), kv_size(part->utf_words)); + rspamd_normalize_words(&part->utf_words, task->task_pool); } } @@ -210,7 +213,7 @@ rspamd_mime_part_detect_language(struct rspamd_task *task, { struct rspamd_lang_detector_res *lang; - if (!IS_TEXT_PART_EMPTY(part) && part->utf_words && part->utf_words->len > 0 && + if (!IS_TEXT_PART_EMPTY(part) && part->utf_words.a && kv_size(part->utf_words) > 0 && task->lang_det) { if (rspamd_language_detector_detect(task, task->lang_det, part)) { lang = g_ptr_array_index(part->languages, 0); @@ -1107,8 +1110,8 @@ rspamd_message_dtor(struct rspamd_message *msg) PTR_ARRAY_FOREACH(msg->text_parts, i, tp) { - if (tp->utf_words) { - g_array_free(tp->utf_words, TRUE); + if (tp->utf_words.a) { + kv_destroy(tp->utf_words); } if (tp->normalized_hashes) { g_array_free(tp->normalized_hashes, TRUE); @@ -1584,7 +1587,7 @@ void rspamd_message_process(struct rspamd_task *task) rspamd_mime_part_extract_words(task, text_part); - if (text_part->utf_words) { + if (text_part->utf_words.a) { total_words += text_part->nwords; } } diff --git a/src/libmime/message.h b/src/libmime/message.h index cb695773e..e6b454362 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -16,6 +16,7 @@ #include "libserver/url.h" #include "libutil/ref.h" #include "libutil/str_util.h" +#include "libserver/word.h" #include <unicode/uchar.h> #include <unicode/utext.h> @@ -139,7 +140,7 @@ struct rspamd_mime_text_part { GByteArray *utf_raw_content; /* utf raw content */ GByteArray *utf_stripped_content; /* utf content with no newlines */ GArray *normalized_hashes; /* Array of uint64_t */ - GArray *utf_words; /* Array of rspamd_stat_token_t */ + rspamd_words_t utf_words; /* kvec of rspamd_word_t */ UText utf_stripped_text; /* Used by libicu to represent the utf8 content */ GPtrArray *newlines; /**< positions of newlines in text, relative to content*/ diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c index 06e9f3328..50b155ae0 100644 --- a/src/libserver/re_cache.c +++ b/src/libserver/re_cache.c @@ -998,20 +998,21 @@ rspamd_re_cache_process_selector(struct rspamd_task *task, return result; } + static inline unsigned int -rspamd_process_words_vector(GArray *words, - const unsigned char **scvec, - unsigned int *lenvec, - struct rspamd_re_class *re_class, - unsigned int cnt, - gboolean *raw) +rspamd_process_words_vector_kvec(rspamd_words_t *words, + const unsigned char **scvec, + unsigned int *lenvec, + struct rspamd_re_class *re_class, + unsigned int cnt, + gboolean *raw) { unsigned int j; - rspamd_stat_token_t *tok; + rspamd_word_t *tok; - if (words) { - for (j = 0; j < words->len; j++) { - tok = &g_array_index(words, rspamd_stat_token_t, j); + if (words && words->a) { + for (j = 0; j < kv_size(*words); j++) { + tok = &kv_A(*words, j); if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) { @@ -1432,13 +1433,13 @@ rspamd_re_cache_exec_re(struct rspamd_task *task, PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) { - if (text_part->utf_words) { - cnt += text_part->utf_words->len; + if (text_part->utf_words.a) { + cnt += kv_size(text_part->utf_words); } } - if (task->meta_words && task->meta_words->len > 0) { - cnt += task->meta_words->len; + if (task->meta_words.a && kv_size(task->meta_words) > 0) { + cnt += kv_size(task->meta_words); } if (cnt > 0) { @@ -1449,15 +1450,15 @@ rspamd_re_cache_exec_re(struct rspamd_task *task, PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) { - if (text_part->utf_words) { - cnt = rspamd_process_words_vector(text_part->utf_words, - scvec, lenvec, re_class, cnt, &raw); + if (text_part->utf_words.a) { + cnt = rspamd_process_words_vector_kvec(&text_part->utf_words, + scvec, lenvec, re_class, cnt, &raw); } } - if (task->meta_words) { - cnt = rspamd_process_words_vector(task->meta_words, - scvec, lenvec, re_class, cnt, &raw); + if (task->meta_words.a) { + cnt = rspamd_process_words_vector_kvec(&task->meta_words, + scvec, lenvec, re_class, cnt, &raw); } ret = rspamd_re_cache_process_regexp_data(rt, re, diff --git a/src/libserver/task.c b/src/libserver/task.c index bd1e07549..9f5b1f00a 100644 --- a/src/libserver/task.c +++ b/src/libserver/task.c @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -196,8 +196,8 @@ void rspamd_task_free(struct rspamd_task *task) rspamd_email_address_free(task->from_envelope_orig); } - if (task->meta_words) { - g_array_free(task->meta_words, TRUE); + if (task->meta_words.a) { + kv_destroy(task->meta_words); } ucl_object_unref(task->messages); diff --git a/src/libserver/task.h b/src/libserver/task.h index 6be350098..1c1778fee 100644 --- a/src/libserver/task.h +++ b/src/libserver/task.h @@ -24,6 +24,7 @@ #include "dns.h" #include "re_cache.h" #include "khash.h" +#include "libserver/word.h" #ifdef __cplusplus extern "C" { @@ -187,7 +188,7 @@ struct rspamd_task { struct rspamd_scan_result *result; /**< Metric result */ khash_t(rspamd_task_lua_cache) lua_cache; /**< cache of lua objects */ GPtrArray *tokens; /**< statistics tokens */ - GArray *meta_words; /**< rspamd_stat_token_t produced from meta headers + rspamd_words_t meta_words; /**< rspamd_word_t produced from meta headers (e.g. Subject) */ GPtrArray *rcpt_envelope; /**< array of rspamd_email_address */ diff --git a/src/libserver/word.h b/src/libserver/word.h new file mode 100644 index 000000000..7698bf327 --- /dev/null +++ b/src/libserver/word.h @@ -0,0 +1,88 @@ +/* + * Copyright 2025 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_WORD_H +#define RSPAMD_WORD_H + +#include "config.h" +#include "fstring.h" +#include "contrib/libucl/kvec.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file word.h + * Word processing structures and definitions + */ + +/* Word flags */ +#define RSPAMD_WORD_FLAG_TEXT (1u << 0) +#define RSPAMD_WORD_FLAG_META (1u << 1) +#define RSPAMD_WORD_FLAG_LUA_META (1u << 2) +#define RSPAMD_WORD_FLAG_EXCEPTION (1u << 3) +#define RSPAMD_WORD_FLAG_HEADER (1u << 4) +#define RSPAMD_WORD_FLAG_UNIGRAM (1u << 5) +#define RSPAMD_WORD_FLAG_UTF (1u << 6) +#define RSPAMD_WORD_FLAG_NORMALISED (1u << 7) +#define RSPAMD_WORD_FLAG_STEMMED (1u << 8) +#define RSPAMD_WORD_FLAG_BROKEN_UNICODE (1u << 9) +#define RSPAMD_WORD_FLAG_STOP_WORD (1u << 10) +#define RSPAMD_WORD_FLAG_SKIPPED (1u << 11) +#define RSPAMD_WORD_FLAG_INVISIBLE_SPACES (1u << 12) +#define RSPAMD_WORD_FLAG_EMOJI (1u << 13) + +/** + * Word structure representing tokenized text + */ +typedef struct rspamd_word_s { + rspamd_ftok_t original; /* utf8 raw */ + rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized, lowercased */ + rspamd_ftok_t normalized; /* normalized and lowercased utf8 */ + rspamd_ftok_t stemmed; /* stemmed utf8 */ + unsigned int flags; +} rspamd_word_t; + +/** + * Vector of words using kvec + */ +typedef kvec_t(rspamd_word_t) rspamd_words_t; + +/* Legacy typedefs for backward compatibility */ +typedef rspamd_word_t rspamd_stat_token_t; + +/* Legacy flag aliases for backward compatibility */ +#define RSPAMD_STAT_TOKEN_FLAG_TEXT RSPAMD_WORD_FLAG_TEXT +#define RSPAMD_STAT_TOKEN_FLAG_META RSPAMD_WORD_FLAG_META +#define RSPAMD_STAT_TOKEN_FLAG_LUA_META RSPAMD_WORD_FLAG_LUA_META +#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION RSPAMD_WORD_FLAG_EXCEPTION +#define RSPAMD_STAT_TOKEN_FLAG_HEADER RSPAMD_WORD_FLAG_HEADER +#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM RSPAMD_WORD_FLAG_UNIGRAM +#define RSPAMD_STAT_TOKEN_FLAG_UTF RSPAMD_WORD_FLAG_UTF +#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED RSPAMD_WORD_FLAG_NORMALISED +#define RSPAMD_STAT_TOKEN_FLAG_STEMMED RSPAMD_WORD_FLAG_STEMMED +#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE RSPAMD_WORD_FLAG_BROKEN_UNICODE +#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD RSPAMD_WORD_FLAG_STOP_WORD +#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED RSPAMD_WORD_FLAG_SKIPPED +#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES RSPAMD_WORD_FLAG_INVISIBLE_SPACES +#define RSPAMD_STAT_TOKEN_FLAG_EMOJI RSPAMD_WORD_FLAG_EMOJI + +#ifdef __cplusplus +} +#endif + +#endif /* RSPAMD_WORD_H */ diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h index f28922588..811566ad3 100644 --- a/src/libstat/stat_api.h +++ b/src/libstat/stat_api.h @@ -20,6 +20,7 @@ #include "task.h" #include "lua/lua_common.h" #include "contrib/libev/ev.h" +#include "libserver/word.h" #ifdef __cplusplus extern "C" { @@ -30,36 +31,14 @@ extern "C" { * High level statistics API */ -#define RSPAMD_STAT_TOKEN_FLAG_TEXT (1u << 0) -#define RSPAMD_STAT_TOKEN_FLAG_META (1u << 1) -#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1u << 2) -#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1u << 3) -#define RSPAMD_STAT_TOKEN_FLAG_HEADER (1u << 4) -#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1u << 5) -#define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6) -#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7) -#define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8) -#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9) -#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10) -#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 11) -#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 12) -#define RSPAMD_STAT_TOKEN_FLAG_EMOJI (1u << 13) - -typedef struct rspamd_stat_token_s { - rspamd_ftok_t original; /* utf8 raw */ - rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized, lowercased */ - rspamd_ftok_t normalized; /* normalized and lowercased utf8 */ - rspamd_ftok_t stemmed; /* stemmed utf8 */ - unsigned int flags; -} rspamd_stat_token_t; #define RSPAMD_TOKEN_VALUE_TYPE float typedef struct token_node_s { uint64_t data; unsigned int window_idx; unsigned int flags; - rspamd_stat_token_t *t1; - rspamd_stat_token_t *t2; + rspamd_word_t *t1; + rspamd_word_t *t2; RSPAMD_TOKEN_VALUE_TYPE values[0]; } rspamd_token_t; diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 17caf4cc6..0bb658a3a 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -36,12 +36,13 @@ static void rspamd_stat_tokenize_parts_metadata(struct rspamd_stat_ctx *st_ctx, struct rspamd_task *task) { - GArray *ar; - rspamd_stat_token_t elt; + rspamd_words_t *words; + rspamd_word_t elt; unsigned int i; lua_State *L = task->cfg->lua_state; - ar = g_array_sized_new(FALSE, FALSE, sizeof(elt), 16); + words = rspamd_mempool_alloc(task->task_pool, sizeof(*words)); + kv_init(*words); memset(&elt, 0, sizeof(elt)); elt.flags = RSPAMD_STAT_TOKEN_FLAG_META; @@ -87,7 +88,7 @@ rspamd_stat_tokenize_parts_metadata(struct rspamd_stat_ctx *st_ctx, elt.normalized.begin = elt.original.begin; elt.normalized.len = elt.original.len; - g_array_append_val(ar, elt); + kv_push_safe(rspamd_word_t, *words, elt, meta_words_error); } lua_pop(L, 1); @@ -99,17 +100,20 @@ rspamd_stat_tokenize_parts_metadata(struct rspamd_stat_ctx *st_ctx, } - if (ar->len > 0) { + if (kv_size(*words) > 0) { st_ctx->tokenizer->tokenize_func(st_ctx, task, - ar, + words, TRUE, "M", task->tokens); } + goto meta_words_done; - rspamd_mempool_add_destructor(task->task_pool, - rspamd_array_free_hard, ar); +meta_words_error: + /* On error, just continue without the problematic tokens */ +meta_words_done: + /* kvec memory will be freed with task pool */ } /* @@ -134,8 +138,8 @@ void rspamd_stat_process_tokenize(struct rspamd_stat_ctx *st_ctx, PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part) { - if (!IS_TEXT_PART_EMPTY(part) && part->utf_words != NULL) { - reserved_len += part->utf_words->len; + if (!IS_TEXT_PART_EMPTY(part) && part->utf_words.a) { + reserved_len += kv_size(part->utf_words); } /* XXX: normal window size */ reserved_len += 5; @@ -149,9 +153,9 @@ void rspamd_stat_process_tokenize(struct rspamd_stat_ctx *st_ctx, PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part) { - if (!IS_TEXT_PART_EMPTY(part) && part->utf_words != NULL) { + if (!IS_TEXT_PART_EMPTY(part) && part->utf_words.a) { st_ctx->tokenizer->tokenize_func(st_ctx, task, - part->utf_words, IS_TEXT_PART_UTF(part), + &part->utf_words, IS_TEXT_PART_UTF(part), NULL, task->tokens); } @@ -163,10 +167,10 @@ void rspamd_stat_process_tokenize(struct rspamd_stat_ctx *st_ctx, } } - if (task->meta_words != NULL) { + if (task->meta_words.a) { st_ctx->tokenizer->tokenize_func(st_ctx, task, - task->meta_words, + &task->meta_words, TRUE, "SUBJECT", task->tokens); diff --git a/src/libstat/tokenizers/custom_tokenizer.h b/src/libstat/tokenizers/custom_tokenizer.h index b620320f4..addf08920 100644 --- a/src/libstat/tokenizers/custom_tokenizer.h +++ b/src/libstat/tokenizers/custom_tokenizer.h @@ -19,6 +19,7 @@ #include "config.h" #include "ucl.h" +#include "libserver/word.h" #ifdef __cplusplus extern "C" { @@ -27,13 +28,10 @@ extern "C" { #define RSPAMD_CUSTOM_TOKENIZER_API_VERSION 1 /** - * Tokenization result - array of word positions as (start, length) pairs - * The array is terminated by a pair with both values set to 0 + * Tokenization result - kvec of rspamd_word_t + * Uses kvec to avoid exposing GLIB structures to external API */ -struct rspamd_tokenizer_result { - unsigned int *positions; /* Array of (start, length) pairs */ - size_t count; /* Number of words (not array size!) */ -}; +typedef rspamd_words_t rspamd_tokenizer_result_t; /** * Custom tokenizer API that must be implemented by language-specific tokenizer plugins @@ -71,25 +69,25 @@ typedef struct rspamd_custom_tokenizer_api { * Main tokenization function * @param text UTF-8 text to tokenize * @param len Length of the text in bytes - * @param result Output structure to fill with word positions + * @param result Output kvec to fill with rspamd_word_t elements * @return 0 on success, non-zero on failure * - * The tokenizer should allocate result->positions using its own allocator + * The tokenizer should allocate result->a using its own allocator * Rspamd will call cleanup_result() to free it after processing */ int (*tokenize)(const char *text, size_t len, - struct rspamd_tokenizer_result *result); + rspamd_tokenizer_result_t *result); /** * Cleanup the result from tokenize() - * @param result Result structure returned by tokenize() + * @param result Result kvec returned by tokenize() * - * This function should free result->positions using the same allocator - * that was used in tokenize() and reset the structure fields. + * This function should free result->a using the same allocator + * that was used in tokenize() and reset the kvec fields. * This ensures proper memory management across DLL boundaries. * Note: This does NOT free the result structure itself, only its contents. */ - void (*cleanup_result)(struct rspamd_tokenizer_result *result); + void (*cleanup_result)(rspamd_tokenizer_result_t *result); /** * Optional: Get language hint for better language detection @@ -155,7 +153,7 @@ struct rspamd_custom_tokenizer *rspamd_tokenizer_manager_detect( const char **detected_lang_hint); /* Helper function to tokenize with exceptions handling */ -GArray *rspamd_custom_tokenizer_tokenize_with_exceptions( +rspamd_tokenizer_result_t *rspamd_custom_tokenizer_tokenize_with_exceptions( struct rspamd_custom_tokenizer *tokenizer, const char *text, gsize len, diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index 0bc3414a5..360c71d36 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -21,6 +21,7 @@ #include "tokenizers.h" #include "stat_internal.h" #include "libmime/lang_detection.h" +#include "libserver/word.h" /* Size for features pipe */ #define DEFAULT_FEATURE_WINDOW_SIZE 2 @@ -268,7 +269,7 @@ struct token_pipe_entry { int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx, struct rspamd_task *task, - GArray *words, + rspamd_words_t *words, gboolean is_utf, const char *prefix, GPtrArray *result) @@ -282,7 +283,7 @@ int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx, gsize token_size; unsigned int processed = 0, i, w, window_size, token_flags = 0; - if (words == NULL) { + if (words == NULL || !words->a) { return FALSE; } @@ -306,8 +307,8 @@ int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx, sizeof(RSPAMD_TOKEN_VALUE_TYPE) * ctx->statfiles->len; g_assert(token_size > 0); - for (w = 0; w < words->len; w++) { - token = &g_array_index(words, rspamd_stat_token_t, w); + for (w = 0; w < kv_size(*words); w++) { + token = &kv_A(*words, w); token_flags = token->flags; const char *begin; gsize len; diff --git a/src/libstat/tokenizers/tokenizer_manager.c b/src/libstat/tokenizers/tokenizer_manager.c index e2011712a..b9bfe0e6f 100644 --- a/src/libstat/tokenizers/tokenizer_manager.c +++ b/src/libstat/tokenizers/tokenizer_manager.c @@ -327,7 +327,7 @@ rspamd_tokenizer_manager_detect(struct rspamd_tokenizer_manager *mgr, } /* Helper function to tokenize with a custom tokenizer handling exceptions */ -GArray * +rspamd_tokenizer_result_t * rspamd_custom_tokenizer_tokenize_with_exceptions( struct rspamd_custom_tokenizer *tokenizer, const char *text, @@ -335,36 +335,28 @@ rspamd_custom_tokenizer_tokenize_with_exceptions( GList *exceptions, rspamd_mempool_t *pool) { - GArray *words; - struct rspamd_tokenizer_result result; + rspamd_tokenizer_result_t *words; + rspamd_tokenizer_result_t result; struct rspamd_process_exception *ex; GList *cur_ex = exceptions; gsize pos = 0; unsigned int i; int ret; - words = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t), 128); + /* Allocate result kvec in pool */ + words = rspamd_mempool_alloc(pool, sizeof(*words)); + kv_init(*words); /* If no exceptions, tokenize the whole text */ if (!exceptions) { - result.positions = NULL; - result.count = 0; + kv_init(result); ret = tokenizer->api->tokenize(text, len, &result); - if (ret == 0 && result.positions) { - /* Convert positions to tokens */ - for (i = 0; i < result.count; i++) { - rspamd_stat_token_t tok; - unsigned int start = result.positions[i * 2]; - unsigned int length = result.positions[i * 2 + 1]; - - if (start + length <= len) { - memset(&tok, 0, sizeof(tok)); - tok.original.begin = text + start; - tok.original.len = length; - tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF; - g_array_append_val(words, tok); - } + if (ret == 0 && result.a) { + /* Copy tokens from result to output */ + for (i = 0; i < kv_size(result); i++) { + rspamd_word_t tok = kv_A(result, i); + kv_push(rspamd_word_t, *words, tok); } /* Use tokenizer's cleanup function */ @@ -383,23 +375,22 @@ rspamd_custom_tokenizer_tokenize_with_exceptions( /* Tokenize text before exception */ if (ex->pos > pos) { gsize segment_len = ex->pos - pos; - result.positions = NULL; - result.count = 0; + kv_init(result); ret = tokenizer->api->tokenize(text + pos, segment_len, &result); - if (ret == 0 && result.positions) { - /* Convert positions to tokens, adjusting for segment offset */ - for (i = 0; i < result.count; i++) { - rspamd_stat_token_t tok; - unsigned int start = result.positions[i * 2] + pos; - unsigned int length = result.positions[i * 2 + 1]; - - if (start + length <= ex->pos) { - memset(&tok, 0, sizeof(tok)); - tok.original.begin = text + start; - tok.original.len = length; - tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF; - g_array_append_val(words, tok); + if (ret == 0 && result.a) { + /* Copy tokens from result, adjusting positions for segment offset */ + for (i = 0; i < kv_size(result); i++) { + rspamd_word_t tok = kv_A(result, i); + + /* Adjust pointers to point to the original text */ + gsize offset_in_segment = tok.original.begin - (text + pos); + if (offset_in_segment < segment_len) { + tok.original.begin = text + pos + offset_in_segment; + /* Ensure we don't go past the exception boundary */ + if (tok.original.begin + tok.original.len <= text + ex->pos) { + kv_push(rspamd_word_t, *words, tok); + } } } @@ -411,7 +402,7 @@ rspamd_custom_tokenizer_tokenize_with_exceptions( } /* Add exception as a special token */ - rspamd_stat_token_t ex_tok; + rspamd_word_t ex_tok; memset(&ex_tok, 0, sizeof(ex_tok)); if (ex->type == RSPAMD_EXCEPTION_URL) { @@ -423,7 +414,7 @@ rspamd_custom_tokenizer_tokenize_with_exceptions( ex_tok.original.len = ex->len; } ex_tok.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; - g_array_append_val(words, ex_tok); + kv_push(rspamd_word_t, *words, ex_tok); /* Move past exception */ pos = ex->pos + ex->len; @@ -432,23 +423,19 @@ rspamd_custom_tokenizer_tokenize_with_exceptions( /* Process remaining text after last exception */ if (pos < len) { - result.positions = NULL; - result.count = 0; + kv_init(result); ret = tokenizer->api->tokenize(text + pos, len - pos, &result); - if (ret == 0 && result.positions) { - /* Convert positions to tokens, adjusting for segment offset */ - for (i = 0; i < result.count; i++) { - rspamd_stat_token_t tok; - unsigned int start = result.positions[i * 2] + pos; - unsigned int length = result.positions[i * 2 + 1]; - - if (start + length <= len) { - memset(&tok, 0, sizeof(tok)); - tok.original.begin = text + start; - tok.original.len = length; - tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF; - g_array_append_val(words, tok); + if (ret == 0 && result.a) { + /* Copy tokens from result, adjusting positions for segment offset */ + for (i = 0; i < kv_size(result); i++) { + rspamd_word_t tok = kv_A(result, i); + + /* Adjust pointers to point to the original text */ + gsize offset_in_segment = tok.original.begin - (text + pos); + if (offset_in_segment < (len - pos)) { + tok.original.begin = text + pos + offset_in_segment; + kv_push(rspamd_word_t, *words, tok); } } diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 4667976fb..1c5b0a4c8 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -37,8 +37,8 @@ #include <math.h> -typedef gboolean (*token_get_function)(rspamd_stat_token_t *buf, char const **pos, - rspamd_stat_token_t *token, +typedef gboolean (*token_get_function)(rspamd_word_t *buf, char const **pos, + rspamd_word_t *token, GList **exceptions, gsize *rl, gboolean check_signature); const char t_delimiters[256] = { @@ -71,8 +71,8 @@ const char t_delimiters[256] = { /* Get next word from specified f_str_t buf */ static gboolean -rspamd_tokenizer_get_word_raw(rspamd_stat_token_t *buf, - char const **cur, rspamd_stat_token_t *token, +rspamd_tokenizer_get_word_raw(rspamd_word_t *buf, + char const **cur, rspamd_word_t *token, GList **exceptions, gsize *rl, gboolean unused) { gsize remain, pos; @@ -166,7 +166,7 @@ rspamd_tokenize_check_limit(gboolean decay, unsigned int nwords, uint64_t *hv, uint64_t *prob, - const rspamd_stat_token_t *token, + const rspamd_word_t *token, gssize remain, gssize total) { @@ -244,9 +244,9 @@ rspamd_utf_word_valid(const unsigned char *text, const unsigned char *end, } while (0) static inline void -rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res) +rspamd_tokenize_exception(struct rspamd_process_exception *ex, rspamd_words_t *res) { - rspamd_stat_token_t token; + rspamd_word_t token; memset(&token, 0, sizeof(token)); @@ -255,7 +255,7 @@ rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res) token.original.len = sizeof("!!EX!!") - 1; token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; - g_array_append_val(res, token); + kv_push_safe(rspamd_word_t, *res, token, exception_error); token.flags = 0; } else if (ex->type == RSPAMD_EXCEPTION_URL) { @@ -273,28 +273,33 @@ rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res) } token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; - g_array_append_val(res, token); + kv_push_safe(rspamd_word_t, *res, token, exception_error); token.flags = 0; } + return; + +exception_error: + /* On error, just skip this exception token */ + return; } -GArray * +rspamd_words_t * rspamd_tokenize_text(const char *text, gsize len, const UText *utxt, enum rspamd_tokenize_type how, struct rspamd_config *cfg, GList *exceptions, uint64_t *hash, - GArray *cur_words, + rspamd_words_t *output_kvec, rspamd_mempool_t *pool) { - rspamd_stat_token_t token, buf; + rspamd_word_t token, buf; const char *pos = NULL; gsize l = 0; - GArray *res; + rspamd_words_t *res; GList *cur = exceptions; - unsigned int min_len = 0, max_len = 0, word_decay = 0, initial_size = 128; + unsigned int min_len = 0, max_len = 0, word_decay = 0; uint64_t hv = 0; gboolean decay = FALSE, long_text_mode = FALSE; uint64_t prob = 0; @@ -307,7 +312,7 @@ rspamd_tokenize_text(const char *text, gsize len, const char *detected_lang = NULL; if (text == NULL) { - return cur_words; + return output_kvec; } if (len > long_text_limit) { @@ -328,17 +333,16 @@ rspamd_tokenize_text(const char *text, gsize len, min_len = cfg->min_word_len; max_len = cfg->max_word_len; word_decay = cfg->words_decay; - initial_size = word_decay * 2; } - if (!cur_words) { - res = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t), - initial_size); - } - else { - res = cur_words; + if (!output_kvec) { + /* Should not happen in normal usage */ + return NULL; } + res = output_kvec; + kv_init(*res); + /* Try custom tokenizers first if we're in UTF mode */ if (cfg && cfg->tokenizer_manager && how == RSPAMD_TOKENIZE_UTF && utxt != NULL) { custom_tok = rspamd_tokenizer_manager_detect( @@ -350,18 +354,22 @@ rspamd_tokenize_text(const char *text, gsize len, if (custom_tok && custom_confidence >= custom_tok->min_confidence) { /* Use custom tokenizer with exception handling */ - GArray *custom_res = rspamd_custom_tokenizer_tokenize_with_exceptions( + rspamd_tokenizer_result_t *custom_res = rspamd_custom_tokenizer_tokenize_with_exceptions( custom_tok, text, len, exceptions, pool); if (custom_res) { msg_debug_pool("using custom tokenizer %s (confidence: %.2f) for text tokenization", custom_tok->name, custom_confidence); + /* Copy custom tokenizer results to output kvec */ + for (unsigned int i = 0; i < kv_size(*custom_res); i++) { + kv_push_safe(rspamd_word_t, *res, kv_A(*custom_res, i), custom_tokenizer_error); + } + /* Calculate hash if needed */ - if (hash && custom_res->len > 0) { - unsigned int i; - for (i = 0; i < custom_res->len; i++) { - rspamd_stat_token_t *t = &g_array_index(custom_res, rspamd_stat_token_t, i); + if (hash && kv_size(*res) > 0) { + for (unsigned int i = 0; i < kv_size(*res); i++) { + rspamd_word_t *t = &kv_A(*res, i); if (t->original.len >= sizeof(uint64_t)) { uint64_t tmp; memcpy(&tmp, t->original.begin, sizeof(tmp)); @@ -371,14 +379,7 @@ rspamd_tokenize_text(const char *text, gsize len, *hash = mum_hash_finish(hv); } - /* If we had existing words, append to them */ - if (cur_words && custom_res != cur_words) { - g_array_append_vals(cur_words, custom_res->data, custom_res->len); - g_array_free(custom_res, TRUE); - return cur_words; - } - - return custom_res; + return res; } else { msg_warn_pool("custom tokenizer %s failed to tokenize text, falling back to default", @@ -396,7 +397,7 @@ rspamd_tokenize_text(const char *text, gsize len, } if (token.original.len > 0 && - rspamd_tokenize_check_limit(decay, word_decay, res->len, + rspamd_tokenize_check_limit(decay, word_decay, kv_size(*res), &hv, &prob, &token, pos - text, len)) { if (!decay) { decay = TRUE; @@ -408,28 +409,28 @@ rspamd_tokenize_text(const char *text, gsize len, } if (long_text_mode) { - if ((res->len + 1) % 16 == 0) { + if ((kv_size(*res) + 1) % 16 == 0) { ev_tstamp now = ev_time(); if (now - start > max_exec_time) { msg_warn_pool_check( "too long time has been spent on tokenization:" - " %.1f ms, limit is %.1f ms; %d words added so far", + " %.1f ms, limit is %.1f ms; %z words added so far", (now - start) * 1e3, max_exec_time * 1e3, - res->len); + kv_size(*res)); goto end; } } } - g_array_append_val(res, token); + kv_push_safe(rspamd_word_t, *res, token, tokenize_error); - if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) { + if (kv_size(*res) * sizeof(token) > (0x1ull << 30u)) { /* Due to bug in glib ! */ msg_err_pool_check( - "too many words found: %d, stop tokenization to avoid DoS", - res->len); + "too many words found: %z, stop tokenization to avoid DoS", + kv_size(*res)); goto end; } @@ -576,7 +577,7 @@ rspamd_tokenize_text(const char *text, gsize len, } if (token.original.len > 0 && - rspamd_tokenize_check_limit(decay, word_decay, res->len, + rspamd_tokenize_check_limit(decay, word_decay, kv_size(*res), &hv, &prob, &token, p, len)) { if (!decay) { decay = TRUE; @@ -589,15 +590,15 @@ rspamd_tokenize_text(const char *text, gsize len, if (token.original.len > 0) { /* Additional check for number of words */ - if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) { + if (kv_size(*res) * sizeof(token) > (0x1ull << 30u)) { /* Due to bug in glib ! */ - msg_err("too many words found: %d, stop tokenization to avoid DoS", - res->len); + msg_err("too many words found: %z, stop tokenization to avoid DoS", + kv_size(*res)); goto end; } - g_array_append_val(res, token); + kv_push_safe(rspamd_word_t, *res, token, tokenize_error); } /* Also check for long text mode */ @@ -605,15 +606,15 @@ rspamd_tokenize_text(const char *text, gsize len, /* Check time each 128 words added */ const int words_check_mask = 0x7F; - if ((res->len & words_check_mask) == words_check_mask) { + if ((kv_size(*res) & words_check_mask) == words_check_mask) { ev_tstamp now = ev_time(); if (now - start > max_exec_time) { msg_warn_pool_check( "too long time has been spent on tokenization:" - " %.1f ms, limit is %.1f ms; %d words added so far", + " %.1f ms, limit is %.1f ms; %z words added so far", (now - start) * 1e3, max_exec_time * 1e3, - res->len); + kv_size(*res)); goto end; } @@ -643,8 +644,14 @@ end: } return res; + +tokenize_error: +custom_tokenizer_error: + msg_err_pool("failed to allocate memory for tokenization"); + return res; } + #undef SHIFT_EX static void @@ -678,32 +685,38 @@ rspamd_add_metawords_from_str(const char *beg, gsize len, #endif } + /* Initialize meta_words kvec if not already done */ + if (!task->meta_words.a) { + kv_init(task->meta_words); + } + if (valid_utf) { utext_openUTF8(&utxt, beg, len, &uc_err); - task->meta_words = rspamd_tokenize_text(beg, len, - &utxt, RSPAMD_TOKENIZE_UTF, - task->cfg, NULL, NULL, - task->meta_words, - task->task_pool); + rspamd_tokenize_text(beg, len, + &utxt, RSPAMD_TOKENIZE_UTF, + task->cfg, NULL, NULL, + &task->meta_words, + task->task_pool); utext_close(&utxt); } else { - task->meta_words = rspamd_tokenize_text(beg, len, - NULL, RSPAMD_TOKENIZE_RAW, - task->cfg, NULL, NULL, task->meta_words, - task->task_pool); + rspamd_tokenize_text(beg, len, + NULL, RSPAMD_TOKENIZE_RAW, + task->cfg, NULL, NULL, + &task->meta_words, + task->task_pool); } } void rspamd_tokenize_meta_words(struct rspamd_task *task) { unsigned int i = 0; - rspamd_stat_token_t *tok; + rspamd_word_t *tok; if (MESSAGE_FIELD(task, subject)) { rspamd_add_metawords_from_str(MESSAGE_FIELD(task, subject), @@ -720,7 +733,7 @@ void rspamd_tokenize_meta_words(struct rspamd_task *task) } } - if (task->meta_words != NULL) { + if (task->meta_words.a) { const char *language = NULL; if (MESSAGE_FIELD(task, text_parts) && @@ -733,12 +746,12 @@ void rspamd_tokenize_meta_words(struct rspamd_task *task) } } - rspamd_normalize_words(task->meta_words, task->task_pool); - rspamd_stem_words(task->meta_words, task->task_pool, language, + rspamd_normalize_words(&task->meta_words, task->task_pool); + rspamd_stem_words(&task->meta_words, task->task_pool, language, task->lang_det); - for (i = 0; i < task->meta_words->len; i++) { - tok = &g_array_index(task->meta_words, rspamd_stat_token_t, i); + for (i = 0; i < kv_size(task->meta_words); i++) { + tok = &kv_A(task->meta_words, i); tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER; } } @@ -812,7 +825,7 @@ rspamd_ucs32_to_normalised(rspamd_stat_token_t *tok, tok->normalized.begin = dest; } -void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool) +void rspamd_normalize_single_word(rspamd_word_t *tok, rspamd_mempool_t *pool) { UErrorCode uc_err = U_ZERO_ERROR; UConverter *utf8_converter; @@ -911,25 +924,27 @@ void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *po } } -void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool) + +void rspamd_normalize_words(rspamd_words_t *words, rspamd_mempool_t *pool) { - rspamd_stat_token_t *tok; + rspamd_word_t *tok; unsigned int i; - for (i = 0; i < words->len; i++) { - tok = &g_array_index(words, rspamd_stat_token_t, i); + for (i = 0; i < kv_size(*words); i++) { + tok = &kv_A(*words, i); rspamd_normalize_single_word(tok, pool); } } -void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, + +void rspamd_stem_words(rspamd_words_t *words, rspamd_mempool_t *pool, const char *language, struct rspamd_lang_detector *lang_detector) { static GHashTable *stemmers = NULL; struct sb_stemmer *stem = NULL; unsigned int i; - rspamd_stat_token_t *tok; + rspamd_word_t *tok; char *dest; gsize dlen; @@ -962,8 +977,8 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, stem = NULL; } } - for (i = 0; i < words->len; i++) { - tok = &g_array_index(words, rspamd_stat_token_t, i); + for (i = 0; i < kv_size(*words); i++) { + tok = &kv_A(*words, i); if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { if (stem) { diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index f3066b5cf..bb0bb54e2 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -22,6 +22,7 @@ #include "fstring.h" #include "rspamd.h" #include "stat_api.h" +#include "libserver/word.h" #include <unicode/utext.h> @@ -43,7 +44,7 @@ struct rspamd_stat_tokenizer { int (*tokenize_func)(struct rspamd_stat_ctx *ctx, struct rspamd_task *task, - GArray *words, + rspamd_words_t *words, gboolean is_utf, const char *prefix, GPtrArray *result); @@ -59,20 +60,20 @@ enum rspamd_tokenize_type { int token_node_compare_func(gconstpointer a, gconstpointer b); -/* Tokenize text into array of words (rspamd_stat_token_t type) */ -GArray *rspamd_tokenize_text(const char *text, gsize len, - const UText *utxt, - enum rspamd_tokenize_type how, - struct rspamd_config *cfg, - GList *exceptions, - uint64_t *hash, - GArray *cur_words, - rspamd_mempool_t *pool); +/* Tokenize text into kvec of words (rspamd_word_t type) */ +rspamd_words_t *rspamd_tokenize_text(const char *text, gsize len, + const UText *utxt, + enum rspamd_tokenize_type how, + struct rspamd_config *cfg, + GList *exceptions, + uint64_t *hash, + rspamd_words_t *output_kvec, + rspamd_mempool_t *pool); /* OSB tokenize function */ int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx, struct rspamd_task *task, - GArray *words, + rspamd_words_t *words, gboolean is_utf, const char *prefix, GPtrArray *result); @@ -83,11 +84,11 @@ gpointer rspamd_tokenizer_osb_get_config(rspamd_mempool_t *pool, struct rspamd_lang_detector; -void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool); +void rspamd_normalize_single_word(rspamd_word_t *tok, rspamd_mempool_t *pool); -void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool); - -void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, +/* Word processing functions */ +void rspamd_normalize_words(rspamd_words_t *words, rspamd_mempool_t *pool); +void rspamd_stem_words(rspamd_words_t *words, rspamd_mempool_t *pool, const char *language, struct rspamd_lang_detector *lang_detector); diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c index 5fe110eb8..c69c42292 100644 --- a/src/libutil/shingles.c +++ b/src/libutil/shingles.c @@ -18,6 +18,7 @@ #include "cryptobox.h" #include "images.h" #include "libstat/stat_api.h" +#include "libserver/word.h" #define SHINGLES_WINDOW 3 #define SHINGLES_KEY_SIZE rspamd_cryptobox_SIPKEYBYTES @@ -112,7 +113,7 @@ rspamd_shingles_get_keys_cached(const unsigned char key[SHINGLES_KEY_SIZE]) } struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops") - rspamd_shingles_from_text(GArray *input, + rspamd_shingles_from_text(rspamd_words_t *input, const unsigned char key[16], rspamd_mempool_t *pool, rspamd_shingles_filter filter, @@ -123,12 +124,16 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops") uint64_t **hashes; unsigned char **keys; rspamd_fstring_t *row; - rspamd_stat_token_t *word; + rspamd_word_t *word; uint64_t val; int i, j, k; gsize hlen, ilen = 0, beg = 0, widx = 0; enum rspamd_cryptobox_fast_hash_type ht; + if (!input || !input->a) { + return NULL; + } + if (pool != NULL) { res = rspamd_mempool_alloc(pool, sizeof(*res)); } @@ -138,10 +143,10 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops") row = rspamd_fstring_sized_new(256); - for (i = 0; i < input->len; i++) { - word = &g_array_index(input, rspamd_stat_token_t, i); + for (i = 0; i < kv_size(*input); i++) { + word = &kv_A(*input, i); - if (!((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0)) { + if (!((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0)) { ilen++; } } @@ -162,10 +167,10 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops") for (j = beg; j < i; j++) { word = NULL; - while (widx < input->len) { - word = &g_array_index(input, rspamd_stat_token_t, widx); + while (widx < kv_size(*input)) { + word = &kv_A(*input, widx); - if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0) { + if ((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0) { widx++; } else { @@ -237,10 +242,10 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops") word = NULL; - while (widx < input->len) { - word = &g_array_index(input, rspamd_stat_token_t, widx); + while (widx < kv_size(*input)) { + word = &kv_A(*input, widx); - if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0) { + if ((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0) { widx++; } else { diff --git a/src/libutil/shingles.h b/src/libutil/shingles.h index fe6f16cf8..1ab2c6842 100644 --- a/src/libutil/shingles.h +++ b/src/libutil/shingles.h @@ -18,6 +18,7 @@ #include "config.h" #include "mem_pool.h" +#include "libserver/word.h" #define RSPAMD_SHINGLE_SIZE 32 @@ -48,14 +49,14 @@ typedef uint64_t (*rspamd_shingles_filter)(uint64_t *input, gsize count, /** * Generate shingles from the input of fixed size strings using lemmatizer * if needed - * @param input array of `rspamd_fstring_t` + * @param input kvec of `rspamd_word_t` * @param key secret key used to generate shingles * @param pool pool to allocate shingles array * @param filter hashes filtering function * @param filterd opaque data for filtering function * @return shingles array */ -struct rspamd_shingle *rspamd_shingles_from_text(GArray *input, +struct rspamd_shingle *rspamd_shingles_from_text(rspamd_words_t *input, const unsigned char key[16], rspamd_mempool_t *pool, rspamd_shingles_filter filter, diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c index 3a0f1a06c..f36228680 100644 --- a/src/lua/lua_common.c +++ b/src/lua/lua_common.c @@ -2401,7 +2401,7 @@ rspamd_lua_try_load_redis(lua_State *L, const ucl_object_t *obj, return FALSE; } -void rspamd_lua_push_full_word(lua_State *L, rspamd_stat_token_t *w) +void rspamd_lua_push_full_word(lua_State *L, rspamd_word_t *w) { int fl_cnt; @@ -2521,6 +2521,54 @@ int rspamd_lua_push_words(lua_State *L, GArray *words, return 1; } +int rspamd_lua_push_words_kvec(lua_State *L, rspamd_words_t *words, + enum rspamd_lua_words_type how) +{ + rspamd_word_t *w; + unsigned int i, cnt; + + if (!words || !words->a) { + lua_createtable(L, 0, 0); + return 1; + } + + lua_createtable(L, kv_size(*words), 0); + + for (i = 0, cnt = 1; i < kv_size(*words); i++) { + w = &kv_A(*words, i); + + switch (how) { + case RSPAMD_LUA_WORDS_STEM: + if (w->stemmed.len > 0) { + lua_pushlstring(L, w->stemmed.begin, w->stemmed.len); + lua_rawseti(L, -2, cnt++); + } + break; + case RSPAMD_LUA_WORDS_NORM: + if (w->normalized.len > 0) { + lua_pushlstring(L, w->normalized.begin, w->normalized.len); + lua_rawseti(L, -2, cnt++); + } + break; + case RSPAMD_LUA_WORDS_RAW: + if (w->original.len > 0) { + lua_pushlstring(L, w->original.begin, w->original.len); + lua_rawseti(L, -2, cnt++); + } + break; + case RSPAMD_LUA_WORDS_FULL: + rspamd_lua_push_full_word(L, w); + /* Push to the resulting vector */ + lua_rawseti(L, -2, cnt++); + break; + default: + break; + } + } + + return 1; +} + char * rspamd_lua_get_module_name(lua_State *L) { @@ -2658,4 +2706,4 @@ int rspamd_lua_geti(lua_State *L, int pos, int i) return lua_type(L, -1); } -#endif
\ No newline at end of file +#endif diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h index 5819da8cb..d494f0923 100644 --- a/src/lua/lua_common.h +++ b/src/lua/lua_common.h @@ -539,7 +539,7 @@ enum lua_logger_escape_type { * @return */ gsize lua_logger_out(lua_State *L, int pos, char *outbuf, gsize len, - enum lua_logger_escape_type esc_type); + enum lua_logger_escape_type esc_type); /** * Safely checks userdata to match specified class @@ -632,7 +632,7 @@ struct rspamd_stat_token_s; * @param L * @param word */ -void rspamd_lua_push_full_word(lua_State *L, struct rspamd_stat_token_s *word); +void rspamd_lua_push_full_word(lua_State *L, rspamd_word_t *word); enum rspamd_lua_words_type { RSPAMD_LUA_WORDS_STEM = 0, @@ -651,6 +651,9 @@ enum rspamd_lua_words_type { int rspamd_lua_push_words(lua_State *L, GArray *words, enum rspamd_lua_words_type how); +int rspamd_lua_push_words_kvec(lua_State *L, rspamd_words_t *words, + enum rspamd_lua_words_type how); + /** * Returns newly allocated name for caller module name * @param L diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c index 07dba9c93..982b10d90 100644 --- a/src/lua/lua_mimepart.c +++ b/src/lua/lua_mimepart.c @@ -901,7 +901,7 @@ lua_textpart_get_words_count(lua_State *L) return 1; } - if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) { + if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) { lua_pushinteger(L, 0); } else { @@ -943,7 +943,7 @@ lua_textpart_get_words(lua_State *L) return luaL_error(L, "invalid arguments"); } - if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) { + if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) { lua_createtable(L, 0, 0); } else { @@ -957,7 +957,7 @@ lua_textpart_get_words(lua_State *L) } } - return rspamd_lua_push_words(L, part->utf_words, how); + return rspamd_lua_push_words_kvec(L, &part->utf_words, how); } return 1; @@ -976,7 +976,7 @@ lua_textpart_filter_words(lua_State *L) return luaL_error(L, "invalid arguments"); } - if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) { + if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) { lua_createtable(L, 0, 0); } else { @@ -998,9 +998,8 @@ lua_textpart_filter_words(lua_State *L) lua_createtable(L, 8, 0); - for (i = 0, cnt = 1; i < part->utf_words->len; i++) { - rspamd_stat_token_t *w = &g_array_index(part->utf_words, - rspamd_stat_token_t, i); + for (i = 0, cnt = 1; i < kv_size(part->utf_words); i++) { + rspamd_word_t *w = &kv_A(part->utf_words, i); switch (how) { case RSPAMD_LUA_WORDS_STEM: @@ -1194,13 +1193,13 @@ struct lua_shingle_filter_cbdata { rspamd_mempool_t *pool; }; -#define STORE_TOKEN(i, t) \ - do { \ - if ((i) < part->utf_words->len) { \ - word = &g_array_index(part->utf_words, rspamd_stat_token_t, (i)); \ - sd->t.begin = word->stemmed.begin; \ - sd->t.len = word->stemmed.len; \ - } \ +#define STORE_TOKEN(i, t) \ + do { \ + if ((i) < kv_size(part->utf_words)) { \ + word = &kv_A(part->utf_words, (i)); \ + sd->t.begin = word->stemmed.begin; \ + sd->t.len = word->stemmed.len; \ + } \ } while (0) static uint64_t @@ -1210,7 +1209,7 @@ lua_shingles_filter(uint64_t *input, gsize count, uint64_t minimal = G_MAXUINT64; gsize i, min_idx = 0; struct lua_shingle_data *sd; - rspamd_stat_token_t *word; + rspamd_word_t *word; struct lua_shingle_filter_cbdata *cbd = (struct lua_shingle_filter_cbdata *) ud; struct rspamd_mime_text_part *part; @@ -1248,7 +1247,7 @@ lua_textpart_get_fuzzy_hashes(lua_State *L) unsigned int i; struct lua_shingle_data *sd; rspamd_cryptobox_hash_state_t st; - rspamd_stat_token_t *word; + rspamd_word_t *word; struct lua_shingle_filter_cbdata cbd; @@ -1256,7 +1255,7 @@ lua_textpart_get_fuzzy_hashes(lua_State *L) return luaL_error(L, "invalid arguments"); } - if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) { + if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) { lua_pushnil(L); lua_pushnil(L); } @@ -1269,8 +1268,8 @@ lua_textpart_get_fuzzy_hashes(lua_State *L) /* Calculate direct hash */ rspamd_cryptobox_hash_init(&st, key, rspamd_cryptobox_HASHKEYBYTES); - for (i = 0; i < part->utf_words->len; i++) { - word = &g_array_index(part->utf_words, rspamd_stat_token_t, i); + for (i = 0; i < kv_size(part->utf_words); i++) { + word = &kv_A(part->utf_words, i); rspamd_cryptobox_hash_update(&st, word->stemmed.begin, word->stemmed.len); } @@ -1283,7 +1282,7 @@ lua_textpart_get_fuzzy_hashes(lua_State *L) cbd.pool = pool; cbd.part = part; - sgl = rspamd_shingles_from_text(part->utf_words, key, + sgl = rspamd_shingles_from_text(&part->utf_words, key, pool, lua_shingles_filter, &cbd, RSPAMD_SHINGLES_MUMHASH); if (sgl == NULL) { diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index 97f9c496e..0b1473b61 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -6943,7 +6943,7 @@ lua_task_get_meta_words(lua_State *L) return luaL_error(L, "invalid arguments"); } - if (task->meta_words == NULL) { + if (!task->meta_words.a) { lua_createtable(L, 0, 0); } else { @@ -6967,7 +6967,7 @@ lua_task_get_meta_words(lua_State *L) } } - return rspamd_lua_push_words(L, task->meta_words, how); + return rspamd_lua_push_words_kvec(L, &task->meta_words, how); } return 1; @@ -7039,6 +7039,76 @@ lua_lookup_words_array(lua_State *L, return nmatched; } +static unsigned int +lua_lookup_words_kvec(lua_State *L, + int cbpos, + struct rspamd_task *task, + struct rspamd_lua_map *map, + rspamd_words_t *words) +{ + rspamd_word_t *tok; + unsigned int i, nmatched = 0; + int err_idx; + gboolean matched; + const char *key; + gsize keylen; + + if (!words || !words->a) { + return 0; + } + + for (i = 0; i < kv_size(*words); i++) { + tok = &kv_A(*words, i); + + matched = FALSE; + + if (tok->normalized.len == 0) { + continue; + } + + key = tok->normalized.begin; + keylen = tok->normalized.len; + + switch (map->type) { + case RSPAMD_LUA_MAP_SET: + case RSPAMD_LUA_MAP_HASH: + /* We know that tok->normalized is zero terminated in fact */ + if (rspamd_match_hash_map(map->data.hash, key, keylen)) { + matched = TRUE; + } + break; + case RSPAMD_LUA_MAP_REGEXP: + case RSPAMD_LUA_MAP_REGEXP_MULTIPLE: + if (rspamd_match_regexp_map_single(map->data.re_map, key, + keylen)) { + matched = TRUE; + } + break; + default: + g_assert_not_reached(); + break; + } + + if (matched) { + nmatched++; + + lua_pushcfunction(L, &rspamd_lua_traceback); + err_idx = lua_gettop(L); + lua_pushvalue(L, cbpos); /* Function */ + rspamd_lua_push_full_word(L, tok); + + if (lua_pcall(L, 1, 0, err_idx) != 0) { + msg_err_task("cannot call callback function for lookup words: %s", + lua_tostring(L, -1)); + } + + lua_settop(L, err_idx - 1); + } + } + + return nmatched; +} + static int lua_task_lookup_words(lua_State *L) { @@ -7062,13 +7132,13 @@ lua_task_lookup_words(lua_State *L) PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, tp) { - if (tp->utf_words) { - matches += lua_lookup_words_array(L, 3, task, map, tp->utf_words); + if (tp->utf_words.a) { + matches += lua_lookup_words_kvec(L, 3, task, map, &tp->utf_words); } } - if (task->meta_words) { - matches += lua_lookup_words_array(L, 3, task, map, task->meta_words); + if (task->meta_words.a) { + matches += lua_lookup_words_kvec(L, 3, task, map, &task->meta_words); } lua_pushinteger(L, matches); diff --git a/src/plugins/chartable.cxx b/src/plugins/chartable.cxx index a5c7cb899..c82748862 100644 --- a/src/plugins/chartable.cxx +++ b/src/plugins/chartable.cxx @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1696,7 +1696,7 @@ rspamd_can_alias_latin(int ch) static double rspamd_chartable_process_word_utf(struct rspamd_task *task, - rspamd_stat_token_t *w, + rspamd_word_t *w, gboolean is_url, unsigned int *ncap, struct chartable_ctx *chartable_module_ctx, @@ -1842,7 +1842,7 @@ rspamd_chartable_process_word_utf(struct rspamd_task *task, static double rspamd_chartable_process_word_ascii(struct rspamd_task *task, - rspamd_stat_token_t *w, + rspamd_word_t *w, gboolean is_url, struct chartable_ctx *chartable_module_ctx) { @@ -1931,17 +1931,17 @@ rspamd_chartable_process_part(struct rspamd_task *task, struct chartable_ctx *chartable_module_ctx, gboolean ignore_diacritics) { - rspamd_stat_token_t *w; + rspamd_word_t *w; unsigned int i, ncap = 0; double cur_score = 0.0; - if (part == nullptr || part->utf_words == nullptr || - part->utf_words->len == 0 || part->nwords == 0) { + if (part == nullptr || part->utf_words.a == nullptr || + kv_size(part->utf_words) == 0 || part->nwords == 0) { return FALSE; } - for (i = 0; i < part->utf_words->len; i++) { - w = &g_array_index(part->utf_words, rspamd_stat_token_t, i); + for (i = 0; i < kv_size(part->utf_words); i++) { + w = &kv_A(part->utf_words, i); if ((w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { @@ -2015,13 +2015,13 @@ chartable_symbol_callback(struct rspamd_task *task, ignore_diacritics = TRUE; } - if (task->meta_words != nullptr && task->meta_words->len > 0) { - rspamd_stat_token_t *w; + if (task->meta_words.a && kv_size(task->meta_words) > 0) { + rspamd_word_t *w; double cur_score = 0; - gsize arlen = task->meta_words->len; + gsize arlen = kv_size(task->meta_words); for (i = 0; i < arlen; i++) { - w = &g_array_index(task->meta_words, rspamd_stat_token_t, i); + w = &kv_A(task->meta_words, i); cur_score += rspamd_chartable_process_word_utf(task, w, FALSE, nullptr, chartable_module_ctx, ignore_diacritics); } diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index 85ea3b00c..7dd5162ac 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -1431,10 +1431,10 @@ fuzzy_io_fin(void *ud) close(session->fd); } -static GArray * +static rspamd_words_t * fuzzy_preprocess_words(struct rspamd_mime_text_part *part, rspamd_mempool_t *pool) { - return part->utf_words; + return &part->utf_words; } static void @@ -1861,7 +1861,7 @@ fuzzy_cmd_from_text_part(struct rspamd_task *task, unsigned int i; rspamd_cryptobox_hash_state_t st; rspamd_stat_token_t *word; - GArray *words; + rspamd_words_t *words; struct fuzzy_cmd_io *io; unsigned int additional_length; unsigned char *additional_data; @@ -1970,10 +1970,10 @@ fuzzy_cmd_from_text_part(struct rspamd_task *task, rspamd_cryptobox_hash_init(&st, rule->hash_key->str, rule->hash_key->len); words = fuzzy_preprocess_words(part, task->task_pool); - for (i = 0; i < words->len; i++) { - word = &g_array_index(words, rspamd_stat_token_t, i); + for (i = 0; i < kv_size(*words); i++) { + word = &kv_A(*words, i); - if (!((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0)) { + if (!((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0)) { rspamd_cryptobox_hash_update(&st, word->stemmed.begin, word->stemmed.len); } @@ -2684,7 +2684,7 @@ fuzzy_insert_metric_results(struct rspamd_task *task, struct fuzzy_rule *rule, if (task->message) { PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, tp) { - if (!IS_TEXT_PART_EMPTY(tp) && tp->utf_words != NULL && tp->utf_words->len > 0) { + if (!IS_TEXT_PART_EMPTY(tp) && kv_size(tp->utf_words) > 0) { seen_text_part = TRUE; if (tp->utf_stripped_text.magic == UTEXT_MAGIC) { diff --git a/test/rspamd_shingles_test.c b/test/rspamd_shingles_test.c index d1a10de84..5b88f4b2d 100644 --- a/test/rspamd_shingles_test.c +++ b/test/rspamd_shingles_test.c @@ -17,6 +17,7 @@ #include "rspamd.h" #include "shingles.h" #include "ottery.h" +#include "libserver/word.h" #include <math.h> static const char * @@ -52,63 +53,76 @@ generate_random_string(char *begin, size_t len) } } -static GArray * +static rspamd_words_t * generate_fuzzy_words(gsize cnt, gsize max_len) { - GArray *res; + rspamd_words_t *res; gsize i, wlen; - rspamd_ftok_t w; + rspamd_word_t word; char *t; - res = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_ftok_t), cnt); + res = g_malloc(sizeof(*res)); + kv_init(*res); for (i = 0; i < cnt; i++) { wlen = ottery_rand_range(max_len) + 1; /* wlen = max_len; */ - w.len = wlen; t = g_malloc(wlen); generate_random_string(t, wlen); - w.begin = t; - g_array_append_val(res, w); + + memset(&word, 0, sizeof(word)); + word.stemmed.begin = t; + word.stemmed.len = wlen; + word.original.begin = t; + word.original.len = wlen; + word.flags = 0; /* No flags set, so it won't be skipped */ + + kv_push(rspamd_word_t, *res, word); } return res; } static void -permute_vector(GArray *in, double prob) +permute_vector(rspamd_words_t *in, double prob) { gsize i, total = 0; - rspamd_ftok_t *w; + rspamd_word_t *w; - for (i = 0; i < in->len; i++) { + for (i = 0; i < kv_size(*in); i++) { if (ottery_rand_unsigned() <= G_MAXUINT * prob) { - w = &g_array_index(in, rspamd_ftok_t, i); - generate_random_string((char *) w->begin, w->len); + w = &kv_A(*in, i); + generate_random_string((char *) w->stemmed.begin, w->stemmed.len); + /* Also update original since they point to same memory */ + w->original.begin = w->stemmed.begin; + w->original.len = w->stemmed.len; total++; } } - msg_debug("generated %z permutations of %ud words", total, in->len); + msg_debug("generated %z permutations of %ud words", total, (unsigned int) kv_size(*in)); } static void -free_fuzzy_words(GArray *ar) +free_fuzzy_words(rspamd_words_t *ar) { gsize i; - rspamd_ftok_t *w; + rspamd_word_t *w; - for (i = 0; i < ar->len; i++) { - w = &g_array_index(ar, rspamd_ftok_t, i); - g_free((gpointer) w->begin); + for (i = 0; i < kv_size(*ar); i++) { + w = &kv_A(*ar, i); + g_free((gpointer) w->stemmed.begin); } + + kv_destroy(*ar); + g_free(ar); } static void test_case(gsize cnt, gsize max_len, double perm_factor, enum rspamd_shingle_alg alg) { - GArray *input; + rspamd_words_t *input; struct rspamd_shingle *sgl, *sgl_permuted; double res; unsigned char key[16]; @@ -281,51 +295,59 @@ void rspamd_shingles_test_func(void) enum rspamd_shingle_alg alg = RSPAMD_SHINGLES_OLD; struct rspamd_shingle *sgl; unsigned char key[16]; - GArray *input; - rspamd_ftok_t tok; + rspamd_words_t input; + rspamd_word_t word; int i; memset(key, 0, sizeof(key)); - input = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_ftok_t), 5); + kv_init(input); for (i = 0; i < 5; i++) { char *b = g_alloca(8); memset(b, 0, 8); memcpy(b + 1, "test", 4); b[0] = 'a' + i; - tok.begin = b; - tok.len = 5 + ((i + 1) % 4); - g_array_append_val(input, tok); + + memset(&word, 0, sizeof(word)); + word.stemmed.begin = b; + word.stemmed.len = 5 + ((i + 1) % 4); + word.original.begin = b; + word.original.len = word.stemmed.len; + word.flags = 0; /* No flags set, so it won't be skipped */ + + kv_push(rspamd_word_t, input, word); } - sgl = rspamd_shingles_from_text(input, key, NULL, + sgl = rspamd_shingles_from_text(&input, key, NULL, rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_OLD); for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) { g_assert(sgl->hashes[i] == expected_old[i]); } g_free(sgl); - sgl = rspamd_shingles_from_text(input, key, NULL, + sgl = rspamd_shingles_from_text(&input, key, NULL, rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_XXHASH); for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) { g_assert(sgl->hashes[i] == expected_xxhash[i]); } g_free(sgl); - sgl = rspamd_shingles_from_text(input, key, NULL, + sgl = rspamd_shingles_from_text(&input, key, NULL, rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_MUMHASH); for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) { g_assert(sgl->hashes[i] == expected_mumhash[i]); } g_free(sgl); - sgl = rspamd_shingles_from_text(input, key, NULL, + sgl = rspamd_shingles_from_text(&input, key, NULL, rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_FAST); for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) { g_assert(sgl->hashes[i] == expected_fasthash[i]); } g_free(sgl); + kv_destroy(input); + for (alg = RSPAMD_SHINGLES_OLD; alg <= RSPAMD_SHINGLES_FAST; alg++) { test_case(200, 10, 0.1, alg); test_case(500, 20, 0.01, alg); |