aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/libmime/lang_detection.c35
-rw-r--r--src/libmime/lang_detection_fasttext.cxx21
-rw-r--r--src/libmime/lang_detection_fasttext.h3
-rw-r--r--src/libmime/message.c31
-rw-r--r--src/libmime/message.h3
-rw-r--r--src/libserver/re_cache.c41
-rw-r--r--src/libserver/task.c6
-rw-r--r--src/libserver/task.h3
-rw-r--r--src/libserver/word.h88
-rw-r--r--src/libstat/stat_api.h27
-rw-r--r--src/libstat/stat_process.c32
-rw-r--r--src/libstat/tokenizers/custom_tokenizer.h26
-rw-r--r--src/libstat/tokenizers/osb.c9
-rw-r--r--src/libstat/tokenizers/tokenizer_manager.c91
-rw-r--r--src/libstat/tokenizers/tokenizers.c165
-rw-r--r--src/libstat/tokenizers/tokenizers.h31
-rw-r--r--src/libutil/shingles.c27
-rw-r--r--src/libutil/shingles.h5
-rw-r--r--src/lua/lua_common.c52
-rw-r--r--src/lua/lua_common.h7
-rw-r--r--src/lua/lua_mimepart.c39
-rw-r--r--src/lua/lua_task.c82
-rw-r--r--src/plugins/chartable.cxx24
-rw-r--r--src/plugins/fuzzy_check.c14
-rw-r--r--test/rspamd_shingles_test.c80
25 files changed, 588 insertions, 354 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 07ecff76d..b783b8325 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -936,7 +936,7 @@ end:
}
static void
-rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords,
+rspamd_language_detector_random_select(rspamd_words_t *ucs_tokens, unsigned int nwords,
goffset *offsets_out,
uint64_t *seed)
{
@@ -946,7 +946,7 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords,
g_assert(nwords != 0);
g_assert(offsets_out != NULL);
- g_assert(ucs_tokens->len >= nwords);
+ g_assert(kv_size(*ucs_tokens) >= nwords);
/*
* We split input array into `nwords` parts. For each part we randomly select
* an element from this particular split. Here is an example:
@@ -963,22 +963,22 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords,
* their splits. It is not uniform distribution but it seems to be better
* to include words from different text parts
*/
- step_len = ucs_tokens->len / nwords;
- remainder = ucs_tokens->len % nwords;
+ step_len = kv_size(*ucs_tokens) / nwords;
+ remainder = kv_size(*ucs_tokens) % nwords;
out_idx = 0;
coin = rspamd_random_uint64_fast_seed(seed);
sel = coin % (step_len + remainder);
offsets_out[out_idx] = sel;
- for (i = step_len + remainder; i < ucs_tokens->len;
+ for (i = step_len + remainder; i < kv_size(*ucs_tokens);
i += step_len, out_idx++) {
unsigned int ntries = 0;
coin = rspamd_random_uint64_fast_seed(seed);
sel = (coin % step_len) + i;
for (;;) {
- tok = &g_array_index(ucs_tokens, rspamd_stat_token_t, sel);
+ tok = &kv_A(*ucs_tokens, sel);
/* Filter bad tokens */
if (tok->unicode.len >= 2 &&
@@ -995,8 +995,8 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords,
if (ntries < step_len) {
sel = (coin % step_len) + i;
}
- else if (ntries < ucs_tokens->len) {
- sel = coin % ucs_tokens->len;
+ else if (ntries < kv_size(*ucs_tokens)) {
+ sel = coin % kv_size(*ucs_tokens);
}
else {
offsets_out[out_idx] = sel;
@@ -1223,12 +1223,12 @@ static void
rspamd_language_detector_detect_type(struct rspamd_task *task,
unsigned int nwords,
struct rspamd_lang_detector *d,
- GArray *words,
+ rspamd_words_t *words,
enum rspamd_language_category cat,
khash_t(rspamd_candidates_hash) * candidates,
struct rspamd_mime_text_part *part)
{
- unsigned int nparts = MIN(words->len, nwords);
+ unsigned int nparts = MIN(kv_size(*words), nwords);
goffset *selected_words;
rspamd_stat_token_t *tok;
unsigned int i;
@@ -1241,8 +1241,7 @@ rspamd_language_detector_detect_type(struct rspamd_task *task,
msg_debug_lang_det("randomly selected %d words", nparts);
for (i = 0; i < nparts; i++) {
- tok = &g_array_index(words, rspamd_stat_token_t,
- selected_words[i]);
+ tok = &kv_A(*words, selected_words[i]);
if (tok->unicode.len >= 3) {
rspamd_language_detector_detect_word(task, d, tok, candidates,
@@ -1282,7 +1281,7 @@ static enum rspamd_language_detected_type
rspamd_language_detector_try_ngramm(struct rspamd_task *task,
unsigned int nwords,
struct rspamd_lang_detector *d,
- GArray *ucs_tokens,
+ rspamd_words_t *ucs_tokens,
enum rspamd_language_category cat,
khash_t(rspamd_candidates_hash) * candidates,
struct rspamd_mime_text_part *part)
@@ -1863,7 +1862,7 @@ rspamd_language_detector_detect(struct rspamd_task *task,
if (rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) {
rspamd_fasttext_predict_result_t fasttext_predict_result =
rspamd_lang_detection_fasttext_detect(d->fasttext_detector, task,
- part->utf_words, 4);
+ &part->utf_words, 4);
ndetected = rspamd_lang_detection_fasttext_get_nlangs(fasttext_predict_result);
@@ -1930,11 +1929,11 @@ rspamd_language_detector_detect(struct rspamd_task *task,
if (!ret) {
/* Apply trigramms detection */
candidates = kh_init(rspamd_candidates_hash);
- if (part->utf_words->len < default_short_text_limit) {
+ if (kv_size(part->utf_words) < default_short_text_limit) {
r = rs_detect_none;
msg_debug_lang_det("text is too short for trigrams detection: "
"%d words; at least %d words required",
- (int) part->utf_words->len,
+ (int) kv_size(part->utf_words),
(int) default_short_text_limit);
switch (cat) {
case RSPAMD_LANGUAGE_CYRILLIC:
@@ -1960,7 +1959,7 @@ rspamd_language_detector_detect(struct rspamd_task *task,
r = rspamd_language_detector_try_ngramm(task,
default_words,
d,
- part->utf_words,
+ &part->utf_words,
cat,
candidates,
part);
@@ -2123,4 +2122,4 @@ int rspamd_language_detector_elt_flags(const struct rspamd_language_elt *elt)
}
return 0;
-} \ No newline at end of file
+}
diff --git a/src/libmime/lang_detection_fasttext.cxx b/src/libmime/lang_detection_fasttext.cxx
index 8ea2706e6..983ff78de 100644
--- a/src/libmime/lang_detection_fasttext.cxx
+++ b/src/libmime/lang_detection_fasttext.cxx
@@ -22,6 +22,7 @@
#include "libserver/logger.h"
#include "contrib/fmt/include/fmt/base.h"
#include "stat_api.h"
+#include "libserver/word.h"
#include <exception>
#include <string_view>
#include <vector>
@@ -180,26 +181,32 @@ bool rspamd_lang_detection_fasttext_is_enabled(void *ud)
rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
struct rspamd_task *task,
- GArray *utf_words,
+ rspamd_words_t *utf_words,
int k)
{
#ifndef WITH_FASTTEXT
return nullptr;
#else
/* Avoid too long inputs */
- static const unsigned int max_fasttext_input_len = 1024 * 1024;
+ static const size_t max_fasttext_input_len = 1024 * 1024;
auto *real_model = FASTTEXT_MODEL_TO_C_API(ud);
std::vector<std::int32_t> words_vec;
- words_vec.reserve(utf_words->len);
- for (auto i = 0; i < std::min(utf_words->len, max_fasttext_input_len); i++) {
- const auto *w = &g_array_index(utf_words, rspamd_stat_token_t, i);
+ if (!utf_words || !utf_words->a) {
+ return nullptr;
+ }
+
+ auto words_count = kv_size(*utf_words);
+ words_vec.reserve(words_count);
+
+ for (auto i = 0; i < std::min(words_count, max_fasttext_input_len); i++) {
+ const auto *w = &kv_A(*utf_words, i);
if (w->original.len > 0) {
real_model->word2vec(w->original.begin, w->original.len, words_vec);
}
}
- msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), utf_words->len);
+ msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), words_count);
auto *res = real_model->detect_language(words_vec, k);
@@ -266,4 +273,4 @@ void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res
#endif
}
-G_END_DECLS \ No newline at end of file
+G_END_DECLS
diff --git a/src/libmime/lang_detection_fasttext.h b/src/libmime/lang_detection_fasttext.h
index 2a2756968..e2b67181a 100644
--- a/src/libmime/lang_detection_fasttext.h
+++ b/src/libmime/lang_detection_fasttext.h
@@ -17,6 +17,7 @@
#define RSPAMD_LANG_DETECTION_FASTTEXT_H
#include "config.h"
+#include "libserver/word.h"
G_BEGIN_DECLS
struct rspamd_config;
@@ -53,7 +54,7 @@ typedef void *rspamd_fasttext_predict_result_t;
* @return TRUE if language is detected
*/
rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
- struct rspamd_task *task, GArray *utf_words, int k);
+ struct rspamd_task *task, rspamd_words_t *utf_words, int k);
/**
* Get number of languages detected
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 60894d879..bac67fb07 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -72,14 +72,14 @@ rspamd_mime_part_extract_words(struct rspamd_task *task,
rspamd_stat_token_t *w;
unsigned int i, total_len = 0, short_len = 0;
- if (part->utf_words) {
- rspamd_stem_words(part->utf_words, task->task_pool, part->language,
+ if (part->utf_words.a) {
+ rspamd_stem_words(&part->utf_words, task->task_pool, part->language,
task->lang_det);
- for (i = 0; i < part->utf_words->len; i++) {
+ for (i = 0; i < kv_size(part->utf_words); i++) {
uint64_t h;
- w = &g_array_index(part->utf_words, rspamd_stat_token_t, i);
+ w = &kv_A(part->utf_words, i);
if (w->stemmed.len > 0) {
/*
@@ -109,7 +109,7 @@ rspamd_mime_part_extract_words(struct rspamd_task *task,
}
}
- if (part->utf_words->len) {
+ if (kv_size(part->utf_words)) {
double *avg_len_p, *short_len_p;
avg_len_p = rspamd_mempool_get_variable(task->task_pool,
@@ -186,21 +186,24 @@ rspamd_mime_part_create_words(struct rspamd_task *task,
tok_type = RSPAMD_TOKENIZE_RAW;
}
- part->utf_words = rspamd_tokenize_text(
+ /* Initialize kvec for words */
+ kv_init(part->utf_words);
+
+ rspamd_tokenize_text(
part->utf_stripped_content->data,
part->utf_stripped_content->len,
&part->utf_stripped_text,
tok_type, task->cfg,
part->exceptions,
NULL,
- NULL,
+ &part->utf_words,
task->task_pool);
- if (part->utf_words) {
+ if (part->utf_words.a) {
part->normalized_hashes = g_array_sized_new(FALSE, FALSE,
- sizeof(uint64_t), part->utf_words->len);
- rspamd_normalize_words(part->utf_words, task->task_pool);
+ sizeof(uint64_t), kv_size(part->utf_words));
+ rspamd_normalize_words(&part->utf_words, task->task_pool);
}
}
@@ -210,7 +213,7 @@ rspamd_mime_part_detect_language(struct rspamd_task *task,
{
struct rspamd_lang_detector_res *lang;
- if (!IS_TEXT_PART_EMPTY(part) && part->utf_words && part->utf_words->len > 0 &&
+ if (!IS_TEXT_PART_EMPTY(part) && part->utf_words.a && kv_size(part->utf_words) > 0 &&
task->lang_det) {
if (rspamd_language_detector_detect(task, task->lang_det, part)) {
lang = g_ptr_array_index(part->languages, 0);
@@ -1107,8 +1110,8 @@ rspamd_message_dtor(struct rspamd_message *msg)
PTR_ARRAY_FOREACH(msg->text_parts, i, tp)
{
- if (tp->utf_words) {
- g_array_free(tp->utf_words, TRUE);
+ if (tp->utf_words.a) {
+ kv_destroy(tp->utf_words);
}
if (tp->normalized_hashes) {
g_array_free(tp->normalized_hashes, TRUE);
@@ -1584,7 +1587,7 @@ void rspamd_message_process(struct rspamd_task *task)
rspamd_mime_part_extract_words(task, text_part);
- if (text_part->utf_words) {
+ if (text_part->utf_words.a) {
total_words += text_part->nwords;
}
}
diff --git a/src/libmime/message.h b/src/libmime/message.h
index cb695773e..e6b454362 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -16,6 +16,7 @@
#include "libserver/url.h"
#include "libutil/ref.h"
#include "libutil/str_util.h"
+#include "libserver/word.h"
#include <unicode/uchar.h>
#include <unicode/utext.h>
@@ -139,7 +140,7 @@ struct rspamd_mime_text_part {
GByteArray *utf_raw_content; /* utf raw content */
GByteArray *utf_stripped_content; /* utf content with no newlines */
GArray *normalized_hashes; /* Array of uint64_t */
- GArray *utf_words; /* Array of rspamd_stat_token_t */
+ rspamd_words_t utf_words; /* kvec of rspamd_word_t */
UText utf_stripped_text; /* Used by libicu to represent the utf8 content */
GPtrArray *newlines; /**< positions of newlines in text, relative to content*/
diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c
index 06e9f3328..50b155ae0 100644
--- a/src/libserver/re_cache.c
+++ b/src/libserver/re_cache.c
@@ -998,20 +998,21 @@ rspamd_re_cache_process_selector(struct rspamd_task *task,
return result;
}
+
static inline unsigned int
-rspamd_process_words_vector(GArray *words,
- const unsigned char **scvec,
- unsigned int *lenvec,
- struct rspamd_re_class *re_class,
- unsigned int cnt,
- gboolean *raw)
+rspamd_process_words_vector_kvec(rspamd_words_t *words,
+ const unsigned char **scvec,
+ unsigned int *lenvec,
+ struct rspamd_re_class *re_class,
+ unsigned int cnt,
+ gboolean *raw)
{
unsigned int j;
- rspamd_stat_token_t *tok;
+ rspamd_word_t *tok;
- if (words) {
- for (j = 0; j < words->len; j++) {
- tok = &g_array_index(words, rspamd_stat_token_t, j);
+ if (words && words->a) {
+ for (j = 0; j < kv_size(*words); j++) {
+ tok = &kv_A(*words, j);
if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
@@ -1432,13 +1433,13 @@ rspamd_re_cache_exec_re(struct rspamd_task *task,
PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
{
- if (text_part->utf_words) {
- cnt += text_part->utf_words->len;
+ if (text_part->utf_words.a) {
+ cnt += kv_size(text_part->utf_words);
}
}
- if (task->meta_words && task->meta_words->len > 0) {
- cnt += task->meta_words->len;
+ if (task->meta_words.a && kv_size(task->meta_words) > 0) {
+ cnt += kv_size(task->meta_words);
}
if (cnt > 0) {
@@ -1449,15 +1450,15 @@ rspamd_re_cache_exec_re(struct rspamd_task *task,
PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
{
- if (text_part->utf_words) {
- cnt = rspamd_process_words_vector(text_part->utf_words,
- scvec, lenvec, re_class, cnt, &raw);
+ if (text_part->utf_words.a) {
+ cnt = rspamd_process_words_vector_kvec(&text_part->utf_words,
+ scvec, lenvec, re_class, cnt, &raw);
}
}
- if (task->meta_words) {
- cnt = rspamd_process_words_vector(task->meta_words,
- scvec, lenvec, re_class, cnt, &raw);
+ if (task->meta_words.a) {
+ cnt = rspamd_process_words_vector_kvec(&task->meta_words,
+ scvec, lenvec, re_class, cnt, &raw);
}
ret = rspamd_re_cache_process_regexp_data(rt, re,
diff --git a/src/libserver/task.c b/src/libserver/task.c
index bd1e07549..9f5b1f00a 100644
--- a/src/libserver/task.c
+++ b/src/libserver/task.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -196,8 +196,8 @@ void rspamd_task_free(struct rspamd_task *task)
rspamd_email_address_free(task->from_envelope_orig);
}
- if (task->meta_words) {
- g_array_free(task->meta_words, TRUE);
+ if (task->meta_words.a) {
+ kv_destroy(task->meta_words);
}
ucl_object_unref(task->messages);
diff --git a/src/libserver/task.h b/src/libserver/task.h
index 6be350098..1c1778fee 100644
--- a/src/libserver/task.h
+++ b/src/libserver/task.h
@@ -24,6 +24,7 @@
#include "dns.h"
#include "re_cache.h"
#include "khash.h"
+#include "libserver/word.h"
#ifdef __cplusplus
extern "C" {
@@ -187,7 +188,7 @@ struct rspamd_task {
struct rspamd_scan_result *result; /**< Metric result */
khash_t(rspamd_task_lua_cache) lua_cache; /**< cache of lua objects */
GPtrArray *tokens; /**< statistics tokens */
- GArray *meta_words; /**< rspamd_stat_token_t produced from meta headers
+ rspamd_words_t meta_words; /**< rspamd_word_t produced from meta headers
(e.g. Subject) */
GPtrArray *rcpt_envelope; /**< array of rspamd_email_address */
diff --git a/src/libserver/word.h b/src/libserver/word.h
new file mode 100644
index 000000000..7698bf327
--- /dev/null
+++ b/src/libserver/word.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_WORD_H
+#define RSPAMD_WORD_H
+
+#include "config.h"
+#include "fstring.h"
+#include "contrib/libucl/kvec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @file word.h
+ * Word processing structures and definitions
+ */
+
+/* Word flags */
+#define RSPAMD_WORD_FLAG_TEXT (1u << 0)
+#define RSPAMD_WORD_FLAG_META (1u << 1)
+#define RSPAMD_WORD_FLAG_LUA_META (1u << 2)
+#define RSPAMD_WORD_FLAG_EXCEPTION (1u << 3)
+#define RSPAMD_WORD_FLAG_HEADER (1u << 4)
+#define RSPAMD_WORD_FLAG_UNIGRAM (1u << 5)
+#define RSPAMD_WORD_FLAG_UTF (1u << 6)
+#define RSPAMD_WORD_FLAG_NORMALISED (1u << 7)
+#define RSPAMD_WORD_FLAG_STEMMED (1u << 8)
+#define RSPAMD_WORD_FLAG_BROKEN_UNICODE (1u << 9)
+#define RSPAMD_WORD_FLAG_STOP_WORD (1u << 10)
+#define RSPAMD_WORD_FLAG_SKIPPED (1u << 11)
+#define RSPAMD_WORD_FLAG_INVISIBLE_SPACES (1u << 12)
+#define RSPAMD_WORD_FLAG_EMOJI (1u << 13)
+
+/**
+ * Word structure representing tokenized text
+ */
+typedef struct rspamd_word_s {
+ rspamd_ftok_t original; /* utf8 raw */
+ rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized, lowercased */
+ rspamd_ftok_t normalized; /* normalized and lowercased utf8 */
+ rspamd_ftok_t stemmed; /* stemmed utf8 */
+ unsigned int flags;
+} rspamd_word_t;
+
+/**
+ * Vector of words using kvec
+ */
+typedef kvec_t(rspamd_word_t) rspamd_words_t;
+
+/* Legacy typedefs for backward compatibility */
+typedef rspamd_word_t rspamd_stat_token_t;
+
+/* Legacy flag aliases for backward compatibility */
+#define RSPAMD_STAT_TOKEN_FLAG_TEXT RSPAMD_WORD_FLAG_TEXT
+#define RSPAMD_STAT_TOKEN_FLAG_META RSPAMD_WORD_FLAG_META
+#define RSPAMD_STAT_TOKEN_FLAG_LUA_META RSPAMD_WORD_FLAG_LUA_META
+#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION RSPAMD_WORD_FLAG_EXCEPTION
+#define RSPAMD_STAT_TOKEN_FLAG_HEADER RSPAMD_WORD_FLAG_HEADER
+#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM RSPAMD_WORD_FLAG_UNIGRAM
+#define RSPAMD_STAT_TOKEN_FLAG_UTF RSPAMD_WORD_FLAG_UTF
+#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED RSPAMD_WORD_FLAG_NORMALISED
+#define RSPAMD_STAT_TOKEN_FLAG_STEMMED RSPAMD_WORD_FLAG_STEMMED
+#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE RSPAMD_WORD_FLAG_BROKEN_UNICODE
+#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD RSPAMD_WORD_FLAG_STOP_WORD
+#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED RSPAMD_WORD_FLAG_SKIPPED
+#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES RSPAMD_WORD_FLAG_INVISIBLE_SPACES
+#define RSPAMD_STAT_TOKEN_FLAG_EMOJI RSPAMD_WORD_FLAG_EMOJI
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RSPAMD_WORD_H */
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index f28922588..811566ad3 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -20,6 +20,7 @@
#include "task.h"
#include "lua/lua_common.h"
#include "contrib/libev/ev.h"
+#include "libserver/word.h"
#ifdef __cplusplus
extern "C" {
@@ -30,36 +31,14 @@ extern "C" {
* High level statistics API
*/
-#define RSPAMD_STAT_TOKEN_FLAG_TEXT (1u << 0)
-#define RSPAMD_STAT_TOKEN_FLAG_META (1u << 1)
-#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1u << 2)
-#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1u << 3)
-#define RSPAMD_STAT_TOKEN_FLAG_HEADER (1u << 4)
-#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1u << 5)
-#define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6)
-#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7)
-#define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8)
-#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
-#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10)
-#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 11)
-#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 12)
-#define RSPAMD_STAT_TOKEN_FLAG_EMOJI (1u << 13)
-
-typedef struct rspamd_stat_token_s {
- rspamd_ftok_t original; /* utf8 raw */
- rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized, lowercased */
- rspamd_ftok_t normalized; /* normalized and lowercased utf8 */
- rspamd_ftok_t stemmed; /* stemmed utf8 */
- unsigned int flags;
-} rspamd_stat_token_t;
#define RSPAMD_TOKEN_VALUE_TYPE float
typedef struct token_node_s {
uint64_t data;
unsigned int window_idx;
unsigned int flags;
- rspamd_stat_token_t *t1;
- rspamd_stat_token_t *t2;
+ rspamd_word_t *t1;
+ rspamd_word_t *t2;
RSPAMD_TOKEN_VALUE_TYPE values[0];
} rspamd_token_t;
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 17caf4cc6..0bb658a3a 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -36,12 +36,13 @@ static void
rspamd_stat_tokenize_parts_metadata(struct rspamd_stat_ctx *st_ctx,
struct rspamd_task *task)
{
- GArray *ar;
- rspamd_stat_token_t elt;
+ rspamd_words_t *words;
+ rspamd_word_t elt;
unsigned int i;
lua_State *L = task->cfg->lua_state;
- ar = g_array_sized_new(FALSE, FALSE, sizeof(elt), 16);
+ words = rspamd_mempool_alloc(task->task_pool, sizeof(*words));
+ kv_init(*words);
memset(&elt, 0, sizeof(elt));
elt.flags = RSPAMD_STAT_TOKEN_FLAG_META;
@@ -87,7 +88,7 @@ rspamd_stat_tokenize_parts_metadata(struct rspamd_stat_ctx *st_ctx,
elt.normalized.begin = elt.original.begin;
elt.normalized.len = elt.original.len;
- g_array_append_val(ar, elt);
+ kv_push_safe(rspamd_word_t, *words, elt, meta_words_error);
}
lua_pop(L, 1);
@@ -99,17 +100,20 @@ rspamd_stat_tokenize_parts_metadata(struct rspamd_stat_ctx *st_ctx,
}
- if (ar->len > 0) {
+ if (kv_size(*words) > 0) {
st_ctx->tokenizer->tokenize_func(st_ctx,
task,
- ar,
+ words,
TRUE,
"M",
task->tokens);
}
+ goto meta_words_done;
- rspamd_mempool_add_destructor(task->task_pool,
- rspamd_array_free_hard, ar);
+meta_words_error:
+ /* On error, just continue without the problematic tokens */
+meta_words_done:
+ /* kvec memory will be freed with task pool */
}
/*
@@ -134,8 +138,8 @@ void rspamd_stat_process_tokenize(struct rspamd_stat_ctx *st_ctx,
PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part)
{
- if (!IS_TEXT_PART_EMPTY(part) && part->utf_words != NULL) {
- reserved_len += part->utf_words->len;
+ if (!IS_TEXT_PART_EMPTY(part) && part->utf_words.a) {
+ reserved_len += kv_size(part->utf_words);
}
/* XXX: normal window size */
reserved_len += 5;
@@ -149,9 +153,9 @@ void rspamd_stat_process_tokenize(struct rspamd_stat_ctx *st_ctx,
PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part)
{
- if (!IS_TEXT_PART_EMPTY(part) && part->utf_words != NULL) {
+ if (!IS_TEXT_PART_EMPTY(part) && part->utf_words.a) {
st_ctx->tokenizer->tokenize_func(st_ctx, task,
- part->utf_words, IS_TEXT_PART_UTF(part),
+ &part->utf_words, IS_TEXT_PART_UTF(part),
NULL, task->tokens);
}
@@ -163,10 +167,10 @@ void rspamd_stat_process_tokenize(struct rspamd_stat_ctx *st_ctx,
}
}
- if (task->meta_words != NULL) {
+ if (task->meta_words.a) {
st_ctx->tokenizer->tokenize_func(st_ctx,
task,
- task->meta_words,
+ &task->meta_words,
TRUE,
"SUBJECT",
task->tokens);
diff --git a/src/libstat/tokenizers/custom_tokenizer.h b/src/libstat/tokenizers/custom_tokenizer.h
index b620320f4..addf08920 100644
--- a/src/libstat/tokenizers/custom_tokenizer.h
+++ b/src/libstat/tokenizers/custom_tokenizer.h
@@ -19,6 +19,7 @@
#include "config.h"
#include "ucl.h"
+#include "libserver/word.h"
#ifdef __cplusplus
extern "C" {
@@ -27,13 +28,10 @@ extern "C" {
#define RSPAMD_CUSTOM_TOKENIZER_API_VERSION 1
/**
- * Tokenization result - array of word positions as (start, length) pairs
- * The array is terminated by a pair with both values set to 0
+ * Tokenization result - kvec of rspamd_word_t
+ * Uses kvec to avoid exposing GLIB structures to external API
*/
-struct rspamd_tokenizer_result {
- unsigned int *positions; /* Array of (start, length) pairs */
- size_t count; /* Number of words (not array size!) */
-};
+typedef rspamd_words_t rspamd_tokenizer_result_t;
/**
* Custom tokenizer API that must be implemented by language-specific tokenizer plugins
@@ -71,25 +69,25 @@ typedef struct rspamd_custom_tokenizer_api {
* Main tokenization function
* @param text UTF-8 text to tokenize
* @param len Length of the text in bytes
- * @param result Output structure to fill with word positions
+ * @param result Output kvec to fill with rspamd_word_t elements
* @return 0 on success, non-zero on failure
*
- * The tokenizer should allocate result->positions using its own allocator
+ * The tokenizer should allocate result->a using its own allocator
* Rspamd will call cleanup_result() to free it after processing
*/
int (*tokenize)(const char *text, size_t len,
- struct rspamd_tokenizer_result *result);
+ rspamd_tokenizer_result_t *result);
/**
* Cleanup the result from tokenize()
- * @param result Result structure returned by tokenize()
+ * @param result Result kvec returned by tokenize()
*
- * This function should free result->positions using the same allocator
- * that was used in tokenize() and reset the structure fields.
+ * This function should free result->a using the same allocator
+ * that was used in tokenize() and reset the kvec fields.
* This ensures proper memory management across DLL boundaries.
* Note: This does NOT free the result structure itself, only its contents.
*/
- void (*cleanup_result)(struct rspamd_tokenizer_result *result);
+ void (*cleanup_result)(rspamd_tokenizer_result_t *result);
/**
* Optional: Get language hint for better language detection
@@ -155,7 +153,7 @@ struct rspamd_custom_tokenizer *rspamd_tokenizer_manager_detect(
const char **detected_lang_hint);
/* Helper function to tokenize with exceptions handling */
-GArray *rspamd_custom_tokenizer_tokenize_with_exceptions(
+rspamd_tokenizer_result_t *rspamd_custom_tokenizer_tokenize_with_exceptions(
struct rspamd_custom_tokenizer *tokenizer,
const char *text,
gsize len,
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index 0bc3414a5..360c71d36 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -21,6 +21,7 @@
#include "tokenizers.h"
#include "stat_internal.h"
#include "libmime/lang_detection.h"
+#include "libserver/word.h"
/* Size for features pipe */
#define DEFAULT_FEATURE_WINDOW_SIZE 2
@@ -268,7 +269,7 @@ struct token_pipe_entry {
int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
struct rspamd_task *task,
- GArray *words,
+ rspamd_words_t *words,
gboolean is_utf,
const char *prefix,
GPtrArray *result)
@@ -282,7 +283,7 @@ int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
gsize token_size;
unsigned int processed = 0, i, w, window_size, token_flags = 0;
- if (words == NULL) {
+ if (words == NULL || !words->a) {
return FALSE;
}
@@ -306,8 +307,8 @@ int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
sizeof(RSPAMD_TOKEN_VALUE_TYPE) * ctx->statfiles->len;
g_assert(token_size > 0);
- for (w = 0; w < words->len; w++) {
- token = &g_array_index(words, rspamd_stat_token_t, w);
+ for (w = 0; w < kv_size(*words); w++) {
+ token = &kv_A(*words, w);
token_flags = token->flags;
const char *begin;
gsize len;
diff --git a/src/libstat/tokenizers/tokenizer_manager.c b/src/libstat/tokenizers/tokenizer_manager.c
index e2011712a..b9bfe0e6f 100644
--- a/src/libstat/tokenizers/tokenizer_manager.c
+++ b/src/libstat/tokenizers/tokenizer_manager.c
@@ -327,7 +327,7 @@ rspamd_tokenizer_manager_detect(struct rspamd_tokenizer_manager *mgr,
}
/* Helper function to tokenize with a custom tokenizer handling exceptions */
-GArray *
+rspamd_tokenizer_result_t *
rspamd_custom_tokenizer_tokenize_with_exceptions(
struct rspamd_custom_tokenizer *tokenizer,
const char *text,
@@ -335,36 +335,28 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
GList *exceptions,
rspamd_mempool_t *pool)
{
- GArray *words;
- struct rspamd_tokenizer_result result;
+ rspamd_tokenizer_result_t *words;
+ rspamd_tokenizer_result_t result;
struct rspamd_process_exception *ex;
GList *cur_ex = exceptions;
gsize pos = 0;
unsigned int i;
int ret;
- words = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t), 128);
+ /* Allocate result kvec in pool */
+ words = rspamd_mempool_alloc(pool, sizeof(*words));
+ kv_init(*words);
/* If no exceptions, tokenize the whole text */
if (!exceptions) {
- result.positions = NULL;
- result.count = 0;
+ kv_init(result);
ret = tokenizer->api->tokenize(text, len, &result);
- if (ret == 0 && result.positions) {
- /* Convert positions to tokens */
- for (i = 0; i < result.count; i++) {
- rspamd_stat_token_t tok;
- unsigned int start = result.positions[i * 2];
- unsigned int length = result.positions[i * 2 + 1];
-
- if (start + length <= len) {
- memset(&tok, 0, sizeof(tok));
- tok.original.begin = text + start;
- tok.original.len = length;
- tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF;
- g_array_append_val(words, tok);
- }
+ if (ret == 0 && result.a) {
+ /* Copy tokens from result to output */
+ for (i = 0; i < kv_size(result); i++) {
+ rspamd_word_t tok = kv_A(result, i);
+ kv_push(rspamd_word_t, *words, tok);
}
/* Use tokenizer's cleanup function */
@@ -383,23 +375,22 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
/* Tokenize text before exception */
if (ex->pos > pos) {
gsize segment_len = ex->pos - pos;
- result.positions = NULL;
- result.count = 0;
+ kv_init(result);
ret = tokenizer->api->tokenize(text + pos, segment_len, &result);
- if (ret == 0 && result.positions) {
- /* Convert positions to tokens, adjusting for segment offset */
- for (i = 0; i < result.count; i++) {
- rspamd_stat_token_t tok;
- unsigned int start = result.positions[i * 2] + pos;
- unsigned int length = result.positions[i * 2 + 1];
-
- if (start + length <= ex->pos) {
- memset(&tok, 0, sizeof(tok));
- tok.original.begin = text + start;
- tok.original.len = length;
- tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF;
- g_array_append_val(words, tok);
+ if (ret == 0 && result.a) {
+ /* Copy tokens from result, adjusting positions for segment offset */
+ for (i = 0; i < kv_size(result); i++) {
+ rspamd_word_t tok = kv_A(result, i);
+
+ /* Adjust pointers to point to the original text */
+ gsize offset_in_segment = tok.original.begin - (text + pos);
+ if (offset_in_segment < segment_len) {
+ tok.original.begin = text + pos + offset_in_segment;
+ /* Ensure we don't go past the exception boundary */
+ if (tok.original.begin + tok.original.len <= text + ex->pos) {
+ kv_push(rspamd_word_t, *words, tok);
+ }
}
}
@@ -411,7 +402,7 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
}
/* Add exception as a special token */
- rspamd_stat_token_t ex_tok;
+ rspamd_word_t ex_tok;
memset(&ex_tok, 0, sizeof(ex_tok));
if (ex->type == RSPAMD_EXCEPTION_URL) {
@@ -423,7 +414,7 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
ex_tok.original.len = ex->len;
}
ex_tok.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
- g_array_append_val(words, ex_tok);
+ kv_push(rspamd_word_t, *words, ex_tok);
/* Move past exception */
pos = ex->pos + ex->len;
@@ -432,23 +423,19 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
/* Process remaining text after last exception */
if (pos < len) {
- result.positions = NULL;
- result.count = 0;
+ kv_init(result);
ret = tokenizer->api->tokenize(text + pos, len - pos, &result);
- if (ret == 0 && result.positions) {
- /* Convert positions to tokens, adjusting for segment offset */
- for (i = 0; i < result.count; i++) {
- rspamd_stat_token_t tok;
- unsigned int start = result.positions[i * 2] + pos;
- unsigned int length = result.positions[i * 2 + 1];
-
- if (start + length <= len) {
- memset(&tok, 0, sizeof(tok));
- tok.original.begin = text + start;
- tok.original.len = length;
- tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF;
- g_array_append_val(words, tok);
+ if (ret == 0 && result.a) {
+ /* Copy tokens from result, adjusting positions for segment offset */
+ for (i = 0; i < kv_size(result); i++) {
+ rspamd_word_t tok = kv_A(result, i);
+
+ /* Adjust pointers to point to the original text */
+ gsize offset_in_segment = tok.original.begin - (text + pos);
+ if (offset_in_segment < (len - pos)) {
+ tok.original.begin = text + pos + offset_in_segment;
+ kv_push(rspamd_word_t, *words, tok);
}
}
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 4667976fb..1c5b0a4c8 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -37,8 +37,8 @@
#include <math.h>
-typedef gboolean (*token_get_function)(rspamd_stat_token_t *buf, char const **pos,
- rspamd_stat_token_t *token,
+typedef gboolean (*token_get_function)(rspamd_word_t *buf, char const **pos,
+ rspamd_word_t *token,
GList **exceptions, gsize *rl, gboolean check_signature);
const char t_delimiters[256] = {
@@ -71,8 +71,8 @@ const char t_delimiters[256] = {
/* Get next word from specified f_str_t buf */
static gboolean
-rspamd_tokenizer_get_word_raw(rspamd_stat_token_t *buf,
- char const **cur, rspamd_stat_token_t *token,
+rspamd_tokenizer_get_word_raw(rspamd_word_t *buf,
+ char const **cur, rspamd_word_t *token,
GList **exceptions, gsize *rl, gboolean unused)
{
gsize remain, pos;
@@ -166,7 +166,7 @@ rspamd_tokenize_check_limit(gboolean decay,
unsigned int nwords,
uint64_t *hv,
uint64_t *prob,
- const rspamd_stat_token_t *token,
+ const rspamd_word_t *token,
gssize remain,
gssize total)
{
@@ -244,9 +244,9 @@ rspamd_utf_word_valid(const unsigned char *text, const unsigned char *end,
} while (0)
static inline void
-rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res)
+rspamd_tokenize_exception(struct rspamd_process_exception *ex, rspamd_words_t *res)
{
- rspamd_stat_token_t token;
+ rspamd_word_t token;
memset(&token, 0, sizeof(token));
@@ -255,7 +255,7 @@ rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res)
token.original.len = sizeof("!!EX!!") - 1;
token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
- g_array_append_val(res, token);
+ kv_push_safe(rspamd_word_t, *res, token, exception_error);
token.flags = 0;
}
else if (ex->type == RSPAMD_EXCEPTION_URL) {
@@ -273,28 +273,33 @@ rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res)
}
token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
- g_array_append_val(res, token);
+ kv_push_safe(rspamd_word_t, *res, token, exception_error);
token.flags = 0;
}
+ return;
+
+exception_error:
+ /* On error, just skip this exception token */
+ return;
}
-GArray *
+rspamd_words_t *
rspamd_tokenize_text(const char *text, gsize len,
const UText *utxt,
enum rspamd_tokenize_type how,
struct rspamd_config *cfg,
GList *exceptions,
uint64_t *hash,
- GArray *cur_words,
+ rspamd_words_t *output_kvec,
rspamd_mempool_t *pool)
{
- rspamd_stat_token_t token, buf;
+ rspamd_word_t token, buf;
const char *pos = NULL;
gsize l = 0;
- GArray *res;
+ rspamd_words_t *res;
GList *cur = exceptions;
- unsigned int min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
+ unsigned int min_len = 0, max_len = 0, word_decay = 0;
uint64_t hv = 0;
gboolean decay = FALSE, long_text_mode = FALSE;
uint64_t prob = 0;
@@ -307,7 +312,7 @@ rspamd_tokenize_text(const char *text, gsize len,
const char *detected_lang = NULL;
if (text == NULL) {
- return cur_words;
+ return output_kvec;
}
if (len > long_text_limit) {
@@ -328,17 +333,16 @@ rspamd_tokenize_text(const char *text, gsize len,
min_len = cfg->min_word_len;
max_len = cfg->max_word_len;
word_decay = cfg->words_decay;
- initial_size = word_decay * 2;
}
- if (!cur_words) {
- res = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t),
- initial_size);
- }
- else {
- res = cur_words;
+ if (!output_kvec) {
+ /* Should not happen in normal usage */
+ return NULL;
}
+ res = output_kvec;
+ kv_init(*res);
+
/* Try custom tokenizers first if we're in UTF mode */
if (cfg && cfg->tokenizer_manager && how == RSPAMD_TOKENIZE_UTF && utxt != NULL) {
custom_tok = rspamd_tokenizer_manager_detect(
@@ -350,18 +354,22 @@ rspamd_tokenize_text(const char *text, gsize len,
if (custom_tok && custom_confidence >= custom_tok->min_confidence) {
/* Use custom tokenizer with exception handling */
- GArray *custom_res = rspamd_custom_tokenizer_tokenize_with_exceptions(
+ rspamd_tokenizer_result_t *custom_res = rspamd_custom_tokenizer_tokenize_with_exceptions(
custom_tok, text, len, exceptions, pool);
if (custom_res) {
msg_debug_pool("using custom tokenizer %s (confidence: %.2f) for text tokenization",
custom_tok->name, custom_confidence);
+ /* Copy custom tokenizer results to output kvec */
+ for (unsigned int i = 0; i < kv_size(*custom_res); i++) {
+ kv_push_safe(rspamd_word_t, *res, kv_A(*custom_res, i), custom_tokenizer_error);
+ }
+
/* Calculate hash if needed */
- if (hash && custom_res->len > 0) {
- unsigned int i;
- for (i = 0; i < custom_res->len; i++) {
- rspamd_stat_token_t *t = &g_array_index(custom_res, rspamd_stat_token_t, i);
+ if (hash && kv_size(*res) > 0) {
+ for (unsigned int i = 0; i < kv_size(*res); i++) {
+ rspamd_word_t *t = &kv_A(*res, i);
if (t->original.len >= sizeof(uint64_t)) {
uint64_t tmp;
memcpy(&tmp, t->original.begin, sizeof(tmp));
@@ -371,14 +379,7 @@ rspamd_tokenize_text(const char *text, gsize len,
*hash = mum_hash_finish(hv);
}
- /* If we had existing words, append to them */
- if (cur_words && custom_res != cur_words) {
- g_array_append_vals(cur_words, custom_res->data, custom_res->len);
- g_array_free(custom_res, TRUE);
- return cur_words;
- }
-
- return custom_res;
+ return res;
}
else {
msg_warn_pool("custom tokenizer %s failed to tokenize text, falling back to default",
@@ -396,7 +397,7 @@ rspamd_tokenize_text(const char *text, gsize len,
}
if (token.original.len > 0 &&
- rspamd_tokenize_check_limit(decay, word_decay, res->len,
+ rspamd_tokenize_check_limit(decay, word_decay, kv_size(*res),
&hv, &prob, &token, pos - text, len)) {
if (!decay) {
decay = TRUE;
@@ -408,28 +409,28 @@ rspamd_tokenize_text(const char *text, gsize len,
}
if (long_text_mode) {
- if ((res->len + 1) % 16 == 0) {
+ if ((kv_size(*res) + 1) % 16 == 0) {
ev_tstamp now = ev_time();
if (now - start > max_exec_time) {
msg_warn_pool_check(
"too long time has been spent on tokenization:"
- " %.1f ms, limit is %.1f ms; %d words added so far",
+ " %.1f ms, limit is %.1f ms; %z words added so far",
(now - start) * 1e3, max_exec_time * 1e3,
- res->len);
+ kv_size(*res));
goto end;
}
}
}
- g_array_append_val(res, token);
+ kv_push_safe(rspamd_word_t, *res, token, tokenize_error);
- if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) {
+ if (kv_size(*res) * sizeof(token) > (0x1ull << 30u)) {
/* Due to bug in glib ! */
msg_err_pool_check(
- "too many words found: %d, stop tokenization to avoid DoS",
- res->len);
+ "too many words found: %z, stop tokenization to avoid DoS",
+ kv_size(*res));
goto end;
}
@@ -576,7 +577,7 @@ rspamd_tokenize_text(const char *text, gsize len,
}
if (token.original.len > 0 &&
- rspamd_tokenize_check_limit(decay, word_decay, res->len,
+ rspamd_tokenize_check_limit(decay, word_decay, kv_size(*res),
&hv, &prob, &token, p, len)) {
if (!decay) {
decay = TRUE;
@@ -589,15 +590,15 @@ rspamd_tokenize_text(const char *text, gsize len,
if (token.original.len > 0) {
/* Additional check for number of words */
- if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) {
+ if (kv_size(*res) * sizeof(token) > (0x1ull << 30u)) {
/* Due to bug in glib ! */
- msg_err("too many words found: %d, stop tokenization to avoid DoS",
- res->len);
+ msg_err("too many words found: %z, stop tokenization to avoid DoS",
+ kv_size(*res));
goto end;
}
- g_array_append_val(res, token);
+ kv_push_safe(rspamd_word_t, *res, token, tokenize_error);
}
/* Also check for long text mode */
@@ -605,15 +606,15 @@ rspamd_tokenize_text(const char *text, gsize len,
/* Check time each 128 words added */
const int words_check_mask = 0x7F;
- if ((res->len & words_check_mask) == words_check_mask) {
+ if ((kv_size(*res) & words_check_mask) == words_check_mask) {
ev_tstamp now = ev_time();
if (now - start > max_exec_time) {
msg_warn_pool_check(
"too long time has been spent on tokenization:"
- " %.1f ms, limit is %.1f ms; %d words added so far",
+ " %.1f ms, limit is %.1f ms; %z words added so far",
(now - start) * 1e3, max_exec_time * 1e3,
- res->len);
+ kv_size(*res));
goto end;
}
@@ -643,8 +644,14 @@ end:
}
return res;
+
+tokenize_error:
+custom_tokenizer_error:
+ msg_err_pool("failed to allocate memory for tokenization");
+ return res;
}
+
#undef SHIFT_EX
static void
@@ -678,32 +685,38 @@ rspamd_add_metawords_from_str(const char *beg, gsize len,
#endif
}
+ /* Initialize meta_words kvec if not already done */
+ if (!task->meta_words.a) {
+ kv_init(task->meta_words);
+ }
+
if (valid_utf) {
utext_openUTF8(&utxt,
beg,
len,
&uc_err);
- task->meta_words = rspamd_tokenize_text(beg, len,
- &utxt, RSPAMD_TOKENIZE_UTF,
- task->cfg, NULL, NULL,
- task->meta_words,
- task->task_pool);
+ rspamd_tokenize_text(beg, len,
+ &utxt, RSPAMD_TOKENIZE_UTF,
+ task->cfg, NULL, NULL,
+ &task->meta_words,
+ task->task_pool);
utext_close(&utxt);
}
else {
- task->meta_words = rspamd_tokenize_text(beg, len,
- NULL, RSPAMD_TOKENIZE_RAW,
- task->cfg, NULL, NULL, task->meta_words,
- task->task_pool);
+ rspamd_tokenize_text(beg, len,
+ NULL, RSPAMD_TOKENIZE_RAW,
+ task->cfg, NULL, NULL,
+ &task->meta_words,
+ task->task_pool);
}
}
void rspamd_tokenize_meta_words(struct rspamd_task *task)
{
unsigned int i = 0;
- rspamd_stat_token_t *tok;
+ rspamd_word_t *tok;
if (MESSAGE_FIELD(task, subject)) {
rspamd_add_metawords_from_str(MESSAGE_FIELD(task, subject),
@@ -720,7 +733,7 @@ void rspamd_tokenize_meta_words(struct rspamd_task *task)
}
}
- if (task->meta_words != NULL) {
+ if (task->meta_words.a) {
const char *language = NULL;
if (MESSAGE_FIELD(task, text_parts) &&
@@ -733,12 +746,12 @@ void rspamd_tokenize_meta_words(struct rspamd_task *task)
}
}
- rspamd_normalize_words(task->meta_words, task->task_pool);
- rspamd_stem_words(task->meta_words, task->task_pool, language,
+ rspamd_normalize_words(&task->meta_words, task->task_pool);
+ rspamd_stem_words(&task->meta_words, task->task_pool, language,
task->lang_det);
- for (i = 0; i < task->meta_words->len; i++) {
- tok = &g_array_index(task->meta_words, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(task->meta_words); i++) {
+ tok = &kv_A(task->meta_words, i);
tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER;
}
}
@@ -812,7 +825,7 @@ rspamd_ucs32_to_normalised(rspamd_stat_token_t *tok,
tok->normalized.begin = dest;
}
-void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool)
+void rspamd_normalize_single_word(rspamd_word_t *tok, rspamd_mempool_t *pool)
{
UErrorCode uc_err = U_ZERO_ERROR;
UConverter *utf8_converter;
@@ -911,25 +924,27 @@ void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *po
}
}
-void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool)
+
+void rspamd_normalize_words(rspamd_words_t *words, rspamd_mempool_t *pool)
{
- rspamd_stat_token_t *tok;
+ rspamd_word_t *tok;
unsigned int i;
- for (i = 0; i < words->len; i++) {
- tok = &g_array_index(words, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(*words); i++) {
+ tok = &kv_A(*words, i);
rspamd_normalize_single_word(tok, pool);
}
}
-void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
+
+void rspamd_stem_words(rspamd_words_t *words, rspamd_mempool_t *pool,
const char *language,
struct rspamd_lang_detector *lang_detector)
{
static GHashTable *stemmers = NULL;
struct sb_stemmer *stem = NULL;
unsigned int i;
- rspamd_stat_token_t *tok;
+ rspamd_word_t *tok;
char *dest;
gsize dlen;
@@ -962,8 +977,8 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
stem = NULL;
}
}
- for (i = 0; i < words->len; i++) {
- tok = &g_array_index(words, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(*words); i++) {
+ tok = &kv_A(*words, i);
if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
if (stem) {
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index f3066b5cf..bb0bb54e2 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -22,6 +22,7 @@
#include "fstring.h"
#include "rspamd.h"
#include "stat_api.h"
+#include "libserver/word.h"
#include <unicode/utext.h>
@@ -43,7 +44,7 @@ struct rspamd_stat_tokenizer {
int (*tokenize_func)(struct rspamd_stat_ctx *ctx,
struct rspamd_task *task,
- GArray *words,
+ rspamd_words_t *words,
gboolean is_utf,
const char *prefix,
GPtrArray *result);
@@ -59,20 +60,20 @@ enum rspamd_tokenize_type {
int token_node_compare_func(gconstpointer a, gconstpointer b);
-/* Tokenize text into array of words (rspamd_stat_token_t type) */
-GArray *rspamd_tokenize_text(const char *text, gsize len,
- const UText *utxt,
- enum rspamd_tokenize_type how,
- struct rspamd_config *cfg,
- GList *exceptions,
- uint64_t *hash,
- GArray *cur_words,
- rspamd_mempool_t *pool);
+/* Tokenize text into kvec of words (rspamd_word_t type) */
+rspamd_words_t *rspamd_tokenize_text(const char *text, gsize len,
+ const UText *utxt,
+ enum rspamd_tokenize_type how,
+ struct rspamd_config *cfg,
+ GList *exceptions,
+ uint64_t *hash,
+ rspamd_words_t *output_kvec,
+ rspamd_mempool_t *pool);
/* OSB tokenize function */
int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
struct rspamd_task *task,
- GArray *words,
+ rspamd_words_t *words,
gboolean is_utf,
const char *prefix,
GPtrArray *result);
@@ -83,11 +84,11 @@ gpointer rspamd_tokenizer_osb_get_config(rspamd_mempool_t *pool,
struct rspamd_lang_detector;
-void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool);
+void rspamd_normalize_single_word(rspamd_word_t *tok, rspamd_mempool_t *pool);
-void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool);
-
-void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
+/* Word processing functions */
+void rspamd_normalize_words(rspamd_words_t *words, rspamd_mempool_t *pool);
+void rspamd_stem_words(rspamd_words_t *words, rspamd_mempool_t *pool,
const char *language,
struct rspamd_lang_detector *lang_detector);
diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c
index 5fe110eb8..c69c42292 100644
--- a/src/libutil/shingles.c
+++ b/src/libutil/shingles.c
@@ -18,6 +18,7 @@
#include "cryptobox.h"
#include "images.h"
#include "libstat/stat_api.h"
+#include "libserver/word.h"
#define SHINGLES_WINDOW 3
#define SHINGLES_KEY_SIZE rspamd_cryptobox_SIPKEYBYTES
@@ -112,7 +113,7 @@ rspamd_shingles_get_keys_cached(const unsigned char key[SHINGLES_KEY_SIZE])
}
struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
- rspamd_shingles_from_text(GArray *input,
+ rspamd_shingles_from_text(rspamd_words_t *input,
const unsigned char key[16],
rspamd_mempool_t *pool,
rspamd_shingles_filter filter,
@@ -123,12 +124,16 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
uint64_t **hashes;
unsigned char **keys;
rspamd_fstring_t *row;
- rspamd_stat_token_t *word;
+ rspamd_word_t *word;
uint64_t val;
int i, j, k;
gsize hlen, ilen = 0, beg = 0, widx = 0;
enum rspamd_cryptobox_fast_hash_type ht;
+ if (!input || !input->a) {
+ return NULL;
+ }
+
if (pool != NULL) {
res = rspamd_mempool_alloc(pool, sizeof(*res));
}
@@ -138,10 +143,10 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
row = rspamd_fstring_sized_new(256);
- for (i = 0; i < input->len; i++) {
- word = &g_array_index(input, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(*input); i++) {
+ word = &kv_A(*input, i);
- if (!((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0)) {
+ if (!((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0)) {
ilen++;
}
}
@@ -162,10 +167,10 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
for (j = beg; j < i; j++) {
word = NULL;
- while (widx < input->len) {
- word = &g_array_index(input, rspamd_stat_token_t, widx);
+ while (widx < kv_size(*input)) {
+ word = &kv_A(*input, widx);
- if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0) {
+ if ((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0) {
widx++;
}
else {
@@ -237,10 +242,10 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
word = NULL;
- while (widx < input->len) {
- word = &g_array_index(input, rspamd_stat_token_t, widx);
+ while (widx < kv_size(*input)) {
+ word = &kv_A(*input, widx);
- if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0) {
+ if ((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0) {
widx++;
}
else {
diff --git a/src/libutil/shingles.h b/src/libutil/shingles.h
index fe6f16cf8..1ab2c6842 100644
--- a/src/libutil/shingles.h
+++ b/src/libutil/shingles.h
@@ -18,6 +18,7 @@
#include "config.h"
#include "mem_pool.h"
+#include "libserver/word.h"
#define RSPAMD_SHINGLE_SIZE 32
@@ -48,14 +49,14 @@ typedef uint64_t (*rspamd_shingles_filter)(uint64_t *input, gsize count,
/**
* Generate shingles from the input of fixed size strings using lemmatizer
* if needed
- * @param input array of `rspamd_fstring_t`
+ * @param input kvec of `rspamd_word_t`
* @param key secret key used to generate shingles
* @param pool pool to allocate shingles array
* @param filter hashes filtering function
* @param filterd opaque data for filtering function
* @return shingles array
*/
-struct rspamd_shingle *rspamd_shingles_from_text(GArray *input,
+struct rspamd_shingle *rspamd_shingles_from_text(rspamd_words_t *input,
const unsigned char key[16],
rspamd_mempool_t *pool,
rspamd_shingles_filter filter,
diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c
index 3a0f1a06c..f36228680 100644
--- a/src/lua/lua_common.c
+++ b/src/lua/lua_common.c
@@ -2401,7 +2401,7 @@ rspamd_lua_try_load_redis(lua_State *L, const ucl_object_t *obj,
return FALSE;
}
-void rspamd_lua_push_full_word(lua_State *L, rspamd_stat_token_t *w)
+void rspamd_lua_push_full_word(lua_State *L, rspamd_word_t *w)
{
int fl_cnt;
@@ -2521,6 +2521,54 @@ int rspamd_lua_push_words(lua_State *L, GArray *words,
return 1;
}
+int rspamd_lua_push_words_kvec(lua_State *L, rspamd_words_t *words,
+ enum rspamd_lua_words_type how)
+{
+ rspamd_word_t *w;
+ unsigned int i, cnt;
+
+ if (!words || !words->a) {
+ lua_createtable(L, 0, 0);
+ return 1;
+ }
+
+ lua_createtable(L, kv_size(*words), 0);
+
+ for (i = 0, cnt = 1; i < kv_size(*words); i++) {
+ w = &kv_A(*words, i);
+
+ switch (how) {
+ case RSPAMD_LUA_WORDS_STEM:
+ if (w->stemmed.len > 0) {
+ lua_pushlstring(L, w->stemmed.begin, w->stemmed.len);
+ lua_rawseti(L, -2, cnt++);
+ }
+ break;
+ case RSPAMD_LUA_WORDS_NORM:
+ if (w->normalized.len > 0) {
+ lua_pushlstring(L, w->normalized.begin, w->normalized.len);
+ lua_rawseti(L, -2, cnt++);
+ }
+ break;
+ case RSPAMD_LUA_WORDS_RAW:
+ if (w->original.len > 0) {
+ lua_pushlstring(L, w->original.begin, w->original.len);
+ lua_rawseti(L, -2, cnt++);
+ }
+ break;
+ case RSPAMD_LUA_WORDS_FULL:
+ rspamd_lua_push_full_word(L, w);
+ /* Push to the resulting vector */
+ lua_rawseti(L, -2, cnt++);
+ break;
+ default:
+ break;
+ }
+ }
+
+ return 1;
+}
+
char *
rspamd_lua_get_module_name(lua_State *L)
{
@@ -2658,4 +2706,4 @@ int rspamd_lua_geti(lua_State *L, int pos, int i)
return lua_type(L, -1);
}
-#endif \ No newline at end of file
+#endif
diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h
index 5819da8cb..d494f0923 100644
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -539,7 +539,7 @@ enum lua_logger_escape_type {
* @return
*/
gsize lua_logger_out(lua_State *L, int pos, char *outbuf, gsize len,
- enum lua_logger_escape_type esc_type);
+ enum lua_logger_escape_type esc_type);
/**
* Safely checks userdata to match specified class
@@ -632,7 +632,7 @@ struct rspamd_stat_token_s;
* @param L
* @param word
*/
-void rspamd_lua_push_full_word(lua_State *L, struct rspamd_stat_token_s *word);
+void rspamd_lua_push_full_word(lua_State *L, rspamd_word_t *word);
enum rspamd_lua_words_type {
RSPAMD_LUA_WORDS_STEM = 0,
@@ -651,6 +651,9 @@ enum rspamd_lua_words_type {
int rspamd_lua_push_words(lua_State *L, GArray *words,
enum rspamd_lua_words_type how);
+int rspamd_lua_push_words_kvec(lua_State *L, rspamd_words_t *words,
+ enum rspamd_lua_words_type how);
+
/**
* Returns newly allocated name for caller module name
* @param L
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c
index 07dba9c93..982b10d90 100644
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -901,7 +901,7 @@ lua_textpart_get_words_count(lua_State *L)
return 1;
}
- if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) {
+ if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) {
lua_pushinteger(L, 0);
}
else {
@@ -943,7 +943,7 @@ lua_textpart_get_words(lua_State *L)
return luaL_error(L, "invalid arguments");
}
- if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) {
+ if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) {
lua_createtable(L, 0, 0);
}
else {
@@ -957,7 +957,7 @@ lua_textpart_get_words(lua_State *L)
}
}
- return rspamd_lua_push_words(L, part->utf_words, how);
+ return rspamd_lua_push_words_kvec(L, &part->utf_words, how);
}
return 1;
@@ -976,7 +976,7 @@ lua_textpart_filter_words(lua_State *L)
return luaL_error(L, "invalid arguments");
}
- if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) {
+ if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) {
lua_createtable(L, 0, 0);
}
else {
@@ -998,9 +998,8 @@ lua_textpart_filter_words(lua_State *L)
lua_createtable(L, 8, 0);
- for (i = 0, cnt = 1; i < part->utf_words->len; i++) {
- rspamd_stat_token_t *w = &g_array_index(part->utf_words,
- rspamd_stat_token_t, i);
+ for (i = 0, cnt = 1; i < kv_size(part->utf_words); i++) {
+ rspamd_word_t *w = &kv_A(part->utf_words, i);
switch (how) {
case RSPAMD_LUA_WORDS_STEM:
@@ -1194,13 +1193,13 @@ struct lua_shingle_filter_cbdata {
rspamd_mempool_t *pool;
};
-#define STORE_TOKEN(i, t) \
- do { \
- if ((i) < part->utf_words->len) { \
- word = &g_array_index(part->utf_words, rspamd_stat_token_t, (i)); \
- sd->t.begin = word->stemmed.begin; \
- sd->t.len = word->stemmed.len; \
- } \
+#define STORE_TOKEN(i, t) \
+ do { \
+ if ((i) < kv_size(part->utf_words)) { \
+ word = &kv_A(part->utf_words, (i)); \
+ sd->t.begin = word->stemmed.begin; \
+ sd->t.len = word->stemmed.len; \
+ } \
} while (0)
static uint64_t
@@ -1210,7 +1209,7 @@ lua_shingles_filter(uint64_t *input, gsize count,
uint64_t minimal = G_MAXUINT64;
gsize i, min_idx = 0;
struct lua_shingle_data *sd;
- rspamd_stat_token_t *word;
+ rspamd_word_t *word;
struct lua_shingle_filter_cbdata *cbd = (struct lua_shingle_filter_cbdata *) ud;
struct rspamd_mime_text_part *part;
@@ -1248,7 +1247,7 @@ lua_textpart_get_fuzzy_hashes(lua_State *L)
unsigned int i;
struct lua_shingle_data *sd;
rspamd_cryptobox_hash_state_t st;
- rspamd_stat_token_t *word;
+ rspamd_word_t *word;
struct lua_shingle_filter_cbdata cbd;
@@ -1256,7 +1255,7 @@ lua_textpart_get_fuzzy_hashes(lua_State *L)
return luaL_error(L, "invalid arguments");
}
- if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) {
+ if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) {
lua_pushnil(L);
lua_pushnil(L);
}
@@ -1269,8 +1268,8 @@ lua_textpart_get_fuzzy_hashes(lua_State *L)
/* Calculate direct hash */
rspamd_cryptobox_hash_init(&st, key, rspamd_cryptobox_HASHKEYBYTES);
- for (i = 0; i < part->utf_words->len; i++) {
- word = &g_array_index(part->utf_words, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(part->utf_words); i++) {
+ word = &kv_A(part->utf_words, i);
rspamd_cryptobox_hash_update(&st,
word->stemmed.begin, word->stemmed.len);
}
@@ -1283,7 +1282,7 @@ lua_textpart_get_fuzzy_hashes(lua_State *L)
cbd.pool = pool;
cbd.part = part;
- sgl = rspamd_shingles_from_text(part->utf_words, key,
+ sgl = rspamd_shingles_from_text(&part->utf_words, key,
pool, lua_shingles_filter, &cbd, RSPAMD_SHINGLES_MUMHASH);
if (sgl == NULL) {
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 97f9c496e..0b1473b61 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -6943,7 +6943,7 @@ lua_task_get_meta_words(lua_State *L)
return luaL_error(L, "invalid arguments");
}
- if (task->meta_words == NULL) {
+ if (!task->meta_words.a) {
lua_createtable(L, 0, 0);
}
else {
@@ -6967,7 +6967,7 @@ lua_task_get_meta_words(lua_State *L)
}
}
- return rspamd_lua_push_words(L, task->meta_words, how);
+ return rspamd_lua_push_words_kvec(L, &task->meta_words, how);
}
return 1;
@@ -7039,6 +7039,76 @@ lua_lookup_words_array(lua_State *L,
return nmatched;
}
+static unsigned int
+lua_lookup_words_kvec(lua_State *L,
+ int cbpos,
+ struct rspamd_task *task,
+ struct rspamd_lua_map *map,
+ rspamd_words_t *words)
+{
+ rspamd_word_t *tok;
+ unsigned int i, nmatched = 0;
+ int err_idx;
+ gboolean matched;
+ const char *key;
+ gsize keylen;
+
+ if (!words || !words->a) {
+ return 0;
+ }
+
+ for (i = 0; i < kv_size(*words); i++) {
+ tok = &kv_A(*words, i);
+
+ matched = FALSE;
+
+ if (tok->normalized.len == 0) {
+ continue;
+ }
+
+ key = tok->normalized.begin;
+ keylen = tok->normalized.len;
+
+ switch (map->type) {
+ case RSPAMD_LUA_MAP_SET:
+ case RSPAMD_LUA_MAP_HASH:
+ /* We know that tok->normalized is zero terminated in fact */
+ if (rspamd_match_hash_map(map->data.hash, key, keylen)) {
+ matched = TRUE;
+ }
+ break;
+ case RSPAMD_LUA_MAP_REGEXP:
+ case RSPAMD_LUA_MAP_REGEXP_MULTIPLE:
+ if (rspamd_match_regexp_map_single(map->data.re_map, key,
+ keylen)) {
+ matched = TRUE;
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ break;
+ }
+
+ if (matched) {
+ nmatched++;
+
+ lua_pushcfunction(L, &rspamd_lua_traceback);
+ err_idx = lua_gettop(L);
+ lua_pushvalue(L, cbpos); /* Function */
+ rspamd_lua_push_full_word(L, tok);
+
+ if (lua_pcall(L, 1, 0, err_idx) != 0) {
+ msg_err_task("cannot call callback function for lookup words: %s",
+ lua_tostring(L, -1));
+ }
+
+ lua_settop(L, err_idx - 1);
+ }
+ }
+
+ return nmatched;
+}
+
static int
lua_task_lookup_words(lua_State *L)
{
@@ -7062,13 +7132,13 @@ lua_task_lookup_words(lua_State *L)
PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, tp)
{
- if (tp->utf_words) {
- matches += lua_lookup_words_array(L, 3, task, map, tp->utf_words);
+ if (tp->utf_words.a) {
+ matches += lua_lookup_words_kvec(L, 3, task, map, &tp->utf_words);
}
}
- if (task->meta_words) {
- matches += lua_lookup_words_array(L, 3, task, map, task->meta_words);
+ if (task->meta_words.a) {
+ matches += lua_lookup_words_kvec(L, 3, task, map, &task->meta_words);
}
lua_pushinteger(L, matches);
diff --git a/src/plugins/chartable.cxx b/src/plugins/chartable.cxx
index a5c7cb899..c82748862 100644
--- a/src/plugins/chartable.cxx
+++ b/src/plugins/chartable.cxx
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -1696,7 +1696,7 @@ rspamd_can_alias_latin(int ch)
static double
rspamd_chartable_process_word_utf(struct rspamd_task *task,
- rspamd_stat_token_t *w,
+ rspamd_word_t *w,
gboolean is_url,
unsigned int *ncap,
struct chartable_ctx *chartable_module_ctx,
@@ -1842,7 +1842,7 @@ rspamd_chartable_process_word_utf(struct rspamd_task *task,
static double
rspamd_chartable_process_word_ascii(struct rspamd_task *task,
- rspamd_stat_token_t *w,
+ rspamd_word_t *w,
gboolean is_url,
struct chartable_ctx *chartable_module_ctx)
{
@@ -1931,17 +1931,17 @@ rspamd_chartable_process_part(struct rspamd_task *task,
struct chartable_ctx *chartable_module_ctx,
gboolean ignore_diacritics)
{
- rspamd_stat_token_t *w;
+ rspamd_word_t *w;
unsigned int i, ncap = 0;
double cur_score = 0.0;
- if (part == nullptr || part->utf_words == nullptr ||
- part->utf_words->len == 0 || part->nwords == 0) {
+ if (part == nullptr || part->utf_words.a == nullptr ||
+ kv_size(part->utf_words) == 0 || part->nwords == 0) {
return FALSE;
}
- for (i = 0; i < part->utf_words->len; i++) {
- w = &g_array_index(part->utf_words, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(part->utf_words); i++) {
+ w = &kv_A(part->utf_words, i);
if ((w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
@@ -2015,13 +2015,13 @@ chartable_symbol_callback(struct rspamd_task *task,
ignore_diacritics = TRUE;
}
- if (task->meta_words != nullptr && task->meta_words->len > 0) {
- rspamd_stat_token_t *w;
+ if (task->meta_words.a && kv_size(task->meta_words) > 0) {
+ rspamd_word_t *w;
double cur_score = 0;
- gsize arlen = task->meta_words->len;
+ gsize arlen = kv_size(task->meta_words);
for (i = 0; i < arlen; i++) {
- w = &g_array_index(task->meta_words, rspamd_stat_token_t, i);
+ w = &kv_A(task->meta_words, i);
cur_score += rspamd_chartable_process_word_utf(task, w, FALSE,
nullptr, chartable_module_ctx, ignore_diacritics);
}
diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c
index 85ea3b00c..7dd5162ac 100644
--- a/src/plugins/fuzzy_check.c
+++ b/src/plugins/fuzzy_check.c
@@ -1431,10 +1431,10 @@ fuzzy_io_fin(void *ud)
close(session->fd);
}
-static GArray *
+static rspamd_words_t *
fuzzy_preprocess_words(struct rspamd_mime_text_part *part, rspamd_mempool_t *pool)
{
- return part->utf_words;
+ return &part->utf_words;
}
static void
@@ -1861,7 +1861,7 @@ fuzzy_cmd_from_text_part(struct rspamd_task *task,
unsigned int i;
rspamd_cryptobox_hash_state_t st;
rspamd_stat_token_t *word;
- GArray *words;
+ rspamd_words_t *words;
struct fuzzy_cmd_io *io;
unsigned int additional_length;
unsigned char *additional_data;
@@ -1970,10 +1970,10 @@ fuzzy_cmd_from_text_part(struct rspamd_task *task,
rspamd_cryptobox_hash_init(&st, rule->hash_key->str, rule->hash_key->len);
words = fuzzy_preprocess_words(part, task->task_pool);
- for (i = 0; i < words->len; i++) {
- word = &g_array_index(words, rspamd_stat_token_t, i);
+ for (i = 0; i < kv_size(*words); i++) {
+ word = &kv_A(*words, i);
- if (!((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0)) {
+ if (!((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0)) {
rspamd_cryptobox_hash_update(&st, word->stemmed.begin,
word->stemmed.len);
}
@@ -2684,7 +2684,7 @@ fuzzy_insert_metric_results(struct rspamd_task *task, struct fuzzy_rule *rule,
if (task->message) {
PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, tp)
{
- if (!IS_TEXT_PART_EMPTY(tp) && tp->utf_words != NULL && tp->utf_words->len > 0) {
+ if (!IS_TEXT_PART_EMPTY(tp) && kv_size(tp->utf_words) > 0) {
seen_text_part = TRUE;
if (tp->utf_stripped_text.magic == UTEXT_MAGIC) {
diff --git a/test/rspamd_shingles_test.c b/test/rspamd_shingles_test.c
index d1a10de84..5b88f4b2d 100644
--- a/test/rspamd_shingles_test.c
+++ b/test/rspamd_shingles_test.c
@@ -17,6 +17,7 @@
#include "rspamd.h"
#include "shingles.h"
#include "ottery.h"
+#include "libserver/word.h"
#include <math.h>
static const char *
@@ -52,63 +53,76 @@ generate_random_string(char *begin, size_t len)
}
}
-static GArray *
+static rspamd_words_t *
generate_fuzzy_words(gsize cnt, gsize max_len)
{
- GArray *res;
+ rspamd_words_t *res;
gsize i, wlen;
- rspamd_ftok_t w;
+ rspamd_word_t word;
char *t;
- res = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_ftok_t), cnt);
+ res = g_malloc(sizeof(*res));
+ kv_init(*res);
for (i = 0; i < cnt; i++) {
wlen = ottery_rand_range(max_len) + 1;
/* wlen = max_len; */
- w.len = wlen;
t = g_malloc(wlen);
generate_random_string(t, wlen);
- w.begin = t;
- g_array_append_val(res, w);
+
+ memset(&word, 0, sizeof(word));
+ word.stemmed.begin = t;
+ word.stemmed.len = wlen;
+ word.original.begin = t;
+ word.original.len = wlen;
+ word.flags = 0; /* No flags set, so it won't be skipped */
+
+ kv_push(rspamd_word_t, *res, word);
}
return res;
}
static void
-permute_vector(GArray *in, double prob)
+permute_vector(rspamd_words_t *in, double prob)
{
gsize i, total = 0;
- rspamd_ftok_t *w;
+ rspamd_word_t *w;
- for (i = 0; i < in->len; i++) {
+ for (i = 0; i < kv_size(*in); i++) {
if (ottery_rand_unsigned() <= G_MAXUINT * prob) {
- w = &g_array_index(in, rspamd_ftok_t, i);
- generate_random_string((char *) w->begin, w->len);
+ w = &kv_A(*in, i);
+ generate_random_string((char *) w->stemmed.begin, w->stemmed.len);
+ /* Also update original since they point to same memory */
+ w->original.begin = w->stemmed.begin;
+ w->original.len = w->stemmed.len;
total++;
}
}
- msg_debug("generated %z permutations of %ud words", total, in->len);
+ msg_debug("generated %z permutations of %ud words", total, (unsigned int) kv_size(*in));
}
static void
-free_fuzzy_words(GArray *ar)
+free_fuzzy_words(rspamd_words_t *ar)
{
gsize i;
- rspamd_ftok_t *w;
+ rspamd_word_t *w;
- for (i = 0; i < ar->len; i++) {
- w = &g_array_index(ar, rspamd_ftok_t, i);
- g_free((gpointer) w->begin);
+ for (i = 0; i < kv_size(*ar); i++) {
+ w = &kv_A(*ar, i);
+ g_free((gpointer) w->stemmed.begin);
}
+
+ kv_destroy(*ar);
+ g_free(ar);
}
static void
test_case(gsize cnt, gsize max_len, double perm_factor,
enum rspamd_shingle_alg alg)
{
- GArray *input;
+ rspamd_words_t *input;
struct rspamd_shingle *sgl, *sgl_permuted;
double res;
unsigned char key[16];
@@ -281,51 +295,59 @@ void rspamd_shingles_test_func(void)
enum rspamd_shingle_alg alg = RSPAMD_SHINGLES_OLD;
struct rspamd_shingle *sgl;
unsigned char key[16];
- GArray *input;
- rspamd_ftok_t tok;
+ rspamd_words_t input;
+ rspamd_word_t word;
int i;
memset(key, 0, sizeof(key));
- input = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_ftok_t), 5);
+ kv_init(input);
for (i = 0; i < 5; i++) {
char *b = g_alloca(8);
memset(b, 0, 8);
memcpy(b + 1, "test", 4);
b[0] = 'a' + i;
- tok.begin = b;
- tok.len = 5 + ((i + 1) % 4);
- g_array_append_val(input, tok);
+
+ memset(&word, 0, sizeof(word));
+ word.stemmed.begin = b;
+ word.stemmed.len = 5 + ((i + 1) % 4);
+ word.original.begin = b;
+ word.original.len = word.stemmed.len;
+ word.flags = 0; /* No flags set, so it won't be skipped */
+
+ kv_push(rspamd_word_t, input, word);
}
- sgl = rspamd_shingles_from_text(input, key, NULL,
+ sgl = rspamd_shingles_from_text(&input, key, NULL,
rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_OLD);
for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
g_assert(sgl->hashes[i] == expected_old[i]);
}
g_free(sgl);
- sgl = rspamd_shingles_from_text(input, key, NULL,
+ sgl = rspamd_shingles_from_text(&input, key, NULL,
rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_XXHASH);
for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
g_assert(sgl->hashes[i] == expected_xxhash[i]);
}
g_free(sgl);
- sgl = rspamd_shingles_from_text(input, key, NULL,
+ sgl = rspamd_shingles_from_text(&input, key, NULL,
rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_MUMHASH);
for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
g_assert(sgl->hashes[i] == expected_mumhash[i]);
}
g_free(sgl);
- sgl = rspamd_shingles_from_text(input, key, NULL,
+ sgl = rspamd_shingles_from_text(&input, key, NULL,
rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_FAST);
for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
g_assert(sgl->hashes[i] == expected_fasthash[i]);
}
g_free(sgl);
+ kv_destroy(input);
+
for (alg = RSPAMD_SHINGLES_OLD; alg <= RSPAMD_SHINGLES_FAST; alg++) {
test_case(200, 10, 0.1, alg);
test_case(500, 20, 0.01, alg);