Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r--  src/libstat/tokenizers/custom_tokenizer.h        177
-rw-r--r--  src/libstat/tokenizers/osb.c                       9
-rw-r--r--  src/libstat/tokenizers/rspamd_tokenizer_types.h   89
-rw-r--r--  src/libstat/tokenizers/tokenizer_manager.c       500
-rw-r--r--  src/libstat/tokenizers/tokenizers.c              202
-rw-r--r--  src/libstat/tokenizers/tokenizers.h               33
6 files changed, 928 insertions, 82 deletions
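
Editor's note: the new custom_tokenizer.h below defines a plain-C plugin boundary; a shared object only has to export rspamd_tokenizer_get_api() and fill in the function table. The following is a minimal illustrative sketch of such a plugin, not part of this commit: the "latin_simple" name, the my_* helpers and the use of malloc/free as the plugin-side allocator are assumptions made for the example.

/* my_tokenizer.c - hypothetical example plugin, built as a shared object */
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "custom_tokenizer.h" /* pulls in rspamd_tokenizer_types.h for plugins */

static int
my_init(const ucl_object_t *config, char *error_buf, size_t error_buf_size)
{
    (void) config; (void) error_buf; (void) error_buf_size;
    return 0; /* nothing to initialise in this toy plugin */
}

static void
my_deinit(void)
{
    /* nothing to release */
}

static double
my_detect_language(const char *text, size_t len)
{
    size_t ascii = 0;

    if (len == 0) {
        return -1.0; /* cannot handle empty input */
    }
    for (size_t i = 0; i < len; i++) {
        if ((unsigned char) text[i] < 0x80) {
            ascii++;
        }
    }
    /* Confidence is simply the ASCII fraction in this toy heuristic */
    return (double) ascii / (double) len;
}

static int
my_tokenize(const char *text, size_t len, rspamd_tokenizer_result_t *result)
{
    size_t i = 0, cap = 16, n = 0;
    rspamd_word_t *words = malloc(cap * sizeof(*words));

    if (words == NULL) {
        return -1;
    }

    while (i < len) {
        while (i < len && isspace((unsigned char) text[i])) {
            i++;
        }
        size_t start = i;
        while (i < len && !isspace((unsigned char) text[i])) {
            i++;
        }
        if (i > start) {
            if (n == cap) {
                rspamd_word_t *tmp = realloc(words, cap * 2 * sizeof(*words));
                if (tmp == NULL) {
                    free(words);
                    return -1;
                }
                words = tmp;
                cap *= 2;
            }
            memset(&words[n], 0, sizeof(words[n]));
            words[n].original.begin = text + start; /* points into the caller's buffer */
            words[n].original.len = i - start;
            words[n].flags = RSPAMD_WORD_FLAG_TEXT | RSPAMD_WORD_FLAG_UTF;
            n++;
        }
    }

    /* Fill the kvec fields; the backing array is owned by this plugin */
    result->a = words;
    result->n = n;
    result->m = cap;

    return 0;
}

static void
my_cleanup_result(rspamd_tokenizer_result_t *result)
{
    /* Free with the same allocator that my_tokenize() used */
    free(result->a);
    result->a = NULL;
    result->n = result->m = 0;
}

static const rspamd_custom_tokenizer_api_t my_api = {
    .api_version = RSPAMD_CUSTOM_TOKENIZER_API_VERSION,
    .name = "latin_simple",
    .init = my_init,
    .deinit = my_deinit,
    .detect_language = my_detect_language,
    .tokenize = my_tokenize,
    .cleanup_result = my_cleanup_result,
    .get_language_hint = NULL,   /* optional callbacks may be left NULL */
    .get_min_confidence = NULL,
};

const rspamd_custom_tokenizer_api_t *
rspamd_tokenizer_get_api(void)
{
    return &my_api;
}
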
diff --git a/src/libstat/tokenizers/custom_tokenizer.h b/src/libstat/tokenizers/custom_tokenizer.h
new file mode 100644
index 000000000..bc173a1da
--- /dev/null
+++ b/src/libstat/tokenizers/custom_tokenizer.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_CUSTOM_TOKENIZER_H
+#define RSPAMD_CUSTOM_TOKENIZER_H
+
+/* Check if we're being included by internal Rspamd code or external plugins */
+#ifdef RSPAMD_TOKENIZER_INTERNAL
+/* Internal Rspamd usage - use the full headers */
+#include "config.h"
+#include "ucl.h"
+#include "libserver/word.h"
+#else
+/* External plugin usage - use standalone types */
+#include "rspamd_tokenizer_types.h"
+/* Forward declaration for UCL object - plugins should include ucl.h if needed */
+typedef struct ucl_object_s ucl_object_t;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RSPAMD_CUSTOM_TOKENIZER_API_VERSION 1
+
+/**
+ * Tokenization result - compatible with both internal and external usage
+ */
+typedef rspamd_words_t rspamd_tokenizer_result_t;
+
+/**
+ * Custom tokenizer API that must be implemented by language-specific tokenizer plugins
+ * All functions use only plain C types to ensure clean boundaries
+ */
+typedef struct rspamd_custom_tokenizer_api {
+    /* API version for compatibility checking */
+    unsigned int api_version;
+
+    /* Name of the tokenizer (e.g., "japanese_mecab") */
+    const char *name;
+
+    /**
+     * Global initialization function called once when the tokenizer is loaded
+     * @param config UCL configuration object for this tokenizer (may be NULL)
+     * @param error_buf Buffer for error message (at least 256 bytes)
+     * @return 0 on success, non-zero on failure
+     */
+    int (*init)(const ucl_object_t *config, char *error_buf, size_t error_buf_size);
+
+    /**
+     * Global cleanup function called when the tokenizer is unloaded
+     */
+    void (*deinit)(void);
+
+    /**
+     * Quick language detection to check if this tokenizer can handle the text
+     * @param text UTF-8 text to analyze
+     * @param len Length of the text in bytes
+     * @return Confidence score 0.0-1.0, or -1.0 if cannot handle
+     */
+    double (*detect_language)(const char *text, size_t len);
+
+    /**
+     * Main tokenization function
+     * @param text UTF-8 text to tokenize
+     * @param len Length of the text in bytes
+     * @param result Output kvec to fill with rspamd_word_t elements
+     * @return 0 on success, non-zero on failure
+     *
+     * The tokenizer should allocate result->a using its own allocator
+     * Rspamd will call cleanup_result() to free it after processing
+     */
+    int (*tokenize)(const char *text, size_t len,
+                    rspamd_tokenizer_result_t *result);
+
+    /**
+     * Cleanup the result from tokenize()
+     * @param result Result kvec returned by tokenize()
+     *
+     * This function should free result->a using the same allocator
+     * that was used in tokenize() and reset the kvec fields.
+     * This ensures proper memory management across DLL boundaries.
+     * Note: This does NOT free the result structure itself, only its contents.
+     */
+    void (*cleanup_result)(rspamd_tokenizer_result_t *result);
+
+    /**
+     * Optional: Get language hint for better language detection
+     * @return Language code (e.g., "ja", "zh") or NULL
+     */
+    const char *(*get_language_hint)(void);
+
+    /**
+     * Optional: Get minimum confidence threshold for this tokenizer
+     * @return Minimum confidence (0.0-1.0) or -1.0 to use default
+     */
+    double (*get_min_confidence)(void);
+
+} rspamd_custom_tokenizer_api_t;
+
+/**
+ * Entry point function that plugins must export
+ * Must be named "rspamd_tokenizer_get_api"
+ */
+typedef const rspamd_custom_tokenizer_api_t *(*rspamd_tokenizer_get_api_func)(void);
+
+/* Internal Rspamd structures - not exposed to plugins */
+#ifdef RSPAMD_TOKENIZER_INTERNAL
+
+/**
+ * Custom tokenizer instance
+ */
+struct rspamd_custom_tokenizer {
+    char *name;                               /* Tokenizer name from config */
+    char *path;                               /* Path to .so file */
+    void *handle;                             /* dlopen handle */
+    const rspamd_custom_tokenizer_api_t *api; /* API functions */
+    double priority;                          /* Detection priority */
+    double min_confidence;                    /* Minimum confidence threshold */
+    gboolean enabled;                         /* Is tokenizer enabled */
+    ucl_object_t *config;                     /* Tokenizer-specific config */
+};
+
+/**
+ * Tokenizer manager structure
+ */
+struct rspamd_tokenizer_manager {
+    GHashTable *tokenizers;   /* name -> rspamd_custom_tokenizer */
+    GArray *detection_order;  /* Ordered by priority */
+    rspamd_mempool_t *pool;
+    double default_threshold; /* Default confidence threshold */
+};
+
+/* Manager functions */
+struct rspamd_tokenizer_manager *rspamd_tokenizer_manager_new(rspamd_mempool_t *pool);
+void rspamd_tokenizer_manager_destroy(struct rspamd_tokenizer_manager *mgr);
+
+gboolean rspamd_tokenizer_manager_load_tokenizer(struct rspamd_tokenizer_manager *mgr,
+                                                 const char *name,
+                                                 const ucl_object_t *config,
+                                                 GError **err);
+
+struct rspamd_custom_tokenizer *rspamd_tokenizer_manager_detect(
+    struct rspamd_tokenizer_manager *mgr,
+    const char *text, size_t len,
+    double *confidence,
+    const char *lang_hint,
+    const char **detected_lang_hint);
+
+/* Helper function to tokenize with exceptions handling */
+rspamd_tokenizer_result_t *rspamd_custom_tokenizer_tokenize_with_exceptions(
+    struct rspamd_custom_tokenizer *tokenizer,
+    const char *text,
+    gsize len,
+    GList *exceptions,
+    rspamd_mempool_t *pool);
+
+#endif /* RSPAMD_TOKENIZER_INTERNAL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RSPAMD_CUSTOM_TOKENIZER_H */
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index 0bc3414a5..360c71d36 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -21,6 +21,7 @@
 #include "tokenizers.h"
 #include "stat_internal.h"
 #include "libmime/lang_detection.h"
+#include "libserver/word.h"
 
 /* Size for features pipe */
 #define DEFAULT_FEATURE_WINDOW_SIZE 2
@@ -268,7 +269,7 @@ struct token_pipe_entry {
 
 int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
                          struct rspamd_task *task,
-                         GArray *words,
+                         rspamd_words_t *words,
                          gboolean is_utf,
                          const char *prefix,
                          GPtrArray *result)
@@ -282,7 +283,7 @@ int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
     gsize token_size;
     unsigned int processed = 0, i, w, window_size, token_flags = 0;
 
-    if (words == NULL) {
+    if (words == NULL || !words->a) {
         return FALSE;
     }
 
@@ -306,8 +307,8 @@ int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
                  sizeof(RSPAMD_TOKEN_VALUE_TYPE) * ctx->statfiles->len;
     g_assert(token_size > 0);
 
-    for (w = 0; w < words->len; w++) {
-        token = &g_array_index(words, rspamd_stat_token_t, w);
+    for (w = 0; w < kv_size(*words); w++) {
+        token = &kv_A(*words, w);
         token_flags = token->flags;
         const char *begin;
         gsize len;
diff --git a/src/libstat/tokenizers/rspamd_tokenizer_types.h b/src/libstat/tokenizers/rspamd_tokenizer_types.h
new file mode 100644
index 000000000..eb8518290
--- /dev/null
+++ b/src/libstat/tokenizers/rspamd_tokenizer_types.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_TOKENIZER_TYPES_H
+#define RSPAMD_TOKENIZER_TYPES_H
+
+/*
+ * Standalone type definitions for custom tokenizers
+ * This header is completely self-contained and does not depend on any external libraries.
+ * Custom tokenizers should include only this header to get access to all necessary types.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Basic string token structure
+ */
+typedef struct rspamd_ftok {
+    size_t len;
+    const char *begin;
+} rspamd_ftok_t;
+
+/**
+ * Unicode string token structure
+ */
+typedef struct rspamd_ftok_unicode {
+    size_t len;
+    const uint32_t *begin;
+} rspamd_ftok_unicode_t;
+
+/* Word flags */
+#define RSPAMD_WORD_FLAG_TEXT (1u << 0u)
+#define RSPAMD_WORD_FLAG_META (1u << 1u)
+#define RSPAMD_WORD_FLAG_LUA_META (1u << 2u)
+#define RSPAMD_WORD_FLAG_EXCEPTION (1u << 3u)
+#define RSPAMD_WORD_FLAG_HEADER (1u << 4u)
+#define RSPAMD_WORD_FLAG_UNIGRAM (1u << 5u)
+#define RSPAMD_WORD_FLAG_UTF (1u << 6u)
+#define RSPAMD_WORD_FLAG_NORMALISED (1u << 7u)
+#define RSPAMD_WORD_FLAG_STEMMED (1u << 8u)
+#define RSPAMD_WORD_FLAG_BROKEN_UNICODE (1u << 9u)
+#define RSPAMD_WORD_FLAG_STOP_WORD (1u << 10u)
+#define RSPAMD_WORD_FLAG_SKIPPED (1u << 11u)
+#define RSPAMD_WORD_FLAG_INVISIBLE_SPACES (1u << 12u)
+#define RSPAMD_WORD_FLAG_EMOJI (1u << 13u)
+
+/**
+ * Word structure
+ */
+typedef struct rspamd_word {
+    rspamd_ftok_t original;
+    rspamd_ftok_unicode_t unicode;
+    rspamd_ftok_t normalized;
+    rspamd_ftok_t stemmed;
+    unsigned int flags;
+} rspamd_word_t;
+
+/**
+ * Array of words
+ */
+typedef struct rspamd_words {
+    rspamd_word_t *a;
+    size_t n;
+    size_t m;
+} rspamd_words_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RSPAMD_TOKENIZER_TYPES_H */
diff --git a/src/libstat/tokenizers/tokenizer_manager.c b/src/libstat/tokenizers/tokenizer_manager.c
new file mode 100644
index 000000000..e6fb5e8d8
--- /dev/null
+++ b/src/libstat/tokenizers/tokenizer_manager.c
@@ -0,0 +1,500 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "tokenizers.h" +#define RSPAMD_TOKENIZER_INTERNAL +#include "custom_tokenizer.h" +#include "libutil/util.h" +#include "libserver/logger.h" +#include <dlfcn.h> + +#define msg_err_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "tokenizer", "", \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "tokenizer", "", \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "tokenizer", "", \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_tokenizer(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_tokenizer_log_id, "tokenizer", "", \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(tokenizer) + +static void +rspamd_custom_tokenizer_dtor(gpointer p) +{ + struct rspamd_custom_tokenizer *tok = p; + + if (tok) { + if (tok->api && tok->api->deinit) { + tok->api->deinit(); + } + + if (tok->handle) { + dlclose(tok->handle); + } + + if (tok->config) { + ucl_object_unref(tok->config); + } + + g_free(tok->name); + g_free(tok->path); + g_free(tok); + } +} + +static int +rspamd_custom_tokenizer_priority_cmp(gconstpointer a, gconstpointer b) +{ + const struct rspamd_custom_tokenizer *t1 = *(const struct rspamd_custom_tokenizer **) a; + const struct rspamd_custom_tokenizer *t2 = *(const struct rspamd_custom_tokenizer **) b; + + /* Higher priority first */ + if (t1->priority > t2->priority) { + return -1; + } + else if (t1->priority < t2->priority) { + return 1; + } + + return 0; +} + +struct rspamd_tokenizer_manager * +rspamd_tokenizer_manager_new(rspamd_mempool_t *pool) +{ + struct rspamd_tokenizer_manager *mgr; + + mgr = rspamd_mempool_alloc0(pool, sizeof(*mgr)); + mgr->pool = pool; + mgr->tokenizers = g_hash_table_new_full(rspamd_strcase_hash, + rspamd_strcase_equal, + NULL, + rspamd_custom_tokenizer_dtor); + mgr->detection_order = g_array_new(FALSE, FALSE, sizeof(struct rspamd_custom_tokenizer *)); + mgr->default_threshold = 0.7; /* Default confidence threshold */ + + rspamd_mempool_add_destructor(pool, + (rspamd_mempool_destruct_t) g_hash_table_unref, + mgr->tokenizers); + rspamd_mempool_add_destructor(pool, + (rspamd_mempool_destruct_t) rspamd_array_free_hard, + mgr->detection_order); + + msg_info_tokenizer("created custom tokenizer manager with default confidence threshold %.3f", + mgr->default_threshold); + + return mgr; +} + +void rspamd_tokenizer_manager_destroy(struct rspamd_tokenizer_manager *mgr) +{ + /* Cleanup is handled by memory pool destructors */ +} + +gboolean +rspamd_tokenizer_manager_load_tokenizer(struct rspamd_tokenizer_manager *mgr, + const char *name, + const ucl_object_t *config, + GError **err) +{ + struct rspamd_custom_tokenizer *tok; + const ucl_object_t *elt; + rspamd_tokenizer_get_api_func get_api; + const rspamd_custom_tokenizer_api_t *api; + void *handle; + const char *path; + gboolean enabled = TRUE; + double priority = 50.0; + char error_buf[256]; + + g_assert(mgr != NULL); + g_assert(name != NULL); + g_assert(config != NULL); + + msg_info_tokenizer("starting to load custom tokenizer '%s'", name); + + /* Check if enabled */ + elt = ucl_object_lookup(config, "enabled"); + if (elt && ucl_object_type(elt) == UCL_BOOLEAN) { + enabled = ucl_object_toboolean(elt); + } + + if (!enabled) { + msg_info_tokenizer("custom tokenizer '%s' is disabled", name); + return 
TRUE; + } + + /* Get path */ + elt = ucl_object_lookup(config, "path"); + if (!elt || ucl_object_type(elt) != UCL_STRING) { + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "missing 'path' for tokenizer %s", name); + return FALSE; + } + path = ucl_object_tostring(elt); + msg_info_tokenizer("custom tokenizer '%s' will be loaded from path: %s", name, path); + + /* Get priority */ + elt = ucl_object_lookup(config, "priority"); + if (elt) { + priority = ucl_object_todouble(elt); + } + msg_info_tokenizer("custom tokenizer '%s' priority set to %.1f", name, priority); + + /* Load the shared library */ + msg_info_tokenizer("loading shared library for custom tokenizer '%s'", name); + handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); + if (!handle) { + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "cannot load tokenizer %s from %s: %s", + name, path, dlerror()); + return FALSE; + } + msg_info_tokenizer("successfully loaded shared library for custom tokenizer '%s'", name); + + /* Get the API entry point */ + msg_info_tokenizer("looking up API entry point for custom tokenizer '%s'", name); + get_api = (rspamd_tokenizer_get_api_func) dlsym(handle, "rspamd_tokenizer_get_api"); + if (!get_api) { + dlclose(handle); + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "cannot find entry point in %s: %s", + path, dlerror()); + return FALSE; + } + + /* Get the API */ + msg_info_tokenizer("calling API entry point for custom tokenizer '%s'", name); + api = get_api(); + if (!api) { + dlclose(handle); + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "tokenizer %s returned NULL API", name); + return FALSE; + } + msg_info_tokenizer("successfully obtained API from custom tokenizer '%s'", name); + + /* Check API version */ + msg_info_tokenizer("checking API version for custom tokenizer '%s' (got %u, expected %u)", + name, api->api_version, RSPAMD_CUSTOM_TOKENIZER_API_VERSION); + if (api->api_version != RSPAMD_CUSTOM_TOKENIZER_API_VERSION) { + dlclose(handle); + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "tokenizer %s has incompatible API version %u (expected %u)", + name, api->api_version, RSPAMD_CUSTOM_TOKENIZER_API_VERSION); + return FALSE; + } + + /* Create tokenizer instance */ + tok = g_malloc0(sizeof(*tok)); + tok->name = g_strdup(name); + tok->path = g_strdup(path); + tok->handle = handle; + tok->api = api; + tok->priority = priority; + tok->enabled = enabled; + + /* Get tokenizer config */ + elt = ucl_object_lookup(config, "config"); + if (elt) { + tok->config = ucl_object_ref(elt); + } + + /* Get minimum confidence */ + if (api->get_min_confidence) { + tok->min_confidence = api->get_min_confidence(); + msg_info_tokenizer("custom tokenizer '%s' provides minimum confidence threshold: %.3f", + name, tok->min_confidence); + } + else { + tok->min_confidence = mgr->default_threshold; + msg_info_tokenizer("custom tokenizer '%s' using default confidence threshold: %.3f", + name, tok->min_confidence); + } + + /* Initialize the tokenizer */ + if (api->init) { + msg_info_tokenizer("initializing custom tokenizer '%s'", name); + error_buf[0] = '\0'; + if (api->init(tok->config, error_buf, sizeof(error_buf)) != 0) { + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "failed to initialize tokenizer %s: %s", + name, error_buf[0] ? 
error_buf : "unknown error"); + rspamd_custom_tokenizer_dtor(tok); + return FALSE; + } + msg_info_tokenizer("successfully initialized custom tokenizer '%s'", name); + } + else { + msg_info_tokenizer("custom tokenizer '%s' does not require initialization", name); + } + + /* Add to manager */ + g_hash_table_insert(mgr->tokenizers, tok->name, tok); + g_array_append_val(mgr->detection_order, tok); + + /* Re-sort by priority */ + g_array_sort(mgr->detection_order, rspamd_custom_tokenizer_priority_cmp); + msg_info_tokenizer("custom tokenizer '%s' registered and sorted by priority (total tokenizers: %u)", + name, mgr->detection_order->len); + + msg_info_tokenizer("successfully loaded custom tokenizer '%s' (priority %.1f) from %s", + name, priority, path); + + return TRUE; +} + +struct rspamd_custom_tokenizer * +rspamd_tokenizer_manager_detect(struct rspamd_tokenizer_manager *mgr, + const char *text, size_t len, + double *confidence, + const char *lang_hint, + const char **detected_lang_hint) +{ + struct rspamd_custom_tokenizer *tok, *best_tok = NULL; + double conf, best_conf = 0.0; + unsigned int i; + + g_assert(mgr != NULL); + g_assert(text != NULL); + + msg_debug_tokenizer("starting tokenizer detection for text of length %zu", len); + + if (confidence) { + *confidence = 0.0; + } + + if (detected_lang_hint) { + *detected_lang_hint = NULL; + } + + /* If we have a language hint, try to find a tokenizer for that language first */ + if (lang_hint) { + msg_info_tokenizer("trying to find tokenizer for language hint: %s", lang_hint); + for (i = 0; i < mgr->detection_order->len; i++) { + tok = g_array_index(mgr->detection_order, struct rspamd_custom_tokenizer *, i); + + if (!tok->enabled || !tok->api->get_language_hint) { + continue; + } + + /* Check if this tokenizer handles the hinted language */ + const char *tok_lang = tok->api->get_language_hint(); + if (tok_lang && g_ascii_strcasecmp(tok_lang, lang_hint) == 0) { + msg_info_tokenizer("found tokenizer '%s' for language hint '%s'", tok->name, lang_hint); + /* Found a tokenizer for this language, check if it actually detects it */ + if (tok->api->detect_language) { + conf = tok->api->detect_language(text, len); + msg_info_tokenizer("tokenizer '%s' confidence for hinted language: %.3f (threshold: %.3f)", + tok->name, conf, tok->min_confidence); + if (conf >= tok->min_confidence) { + /* Use this tokenizer */ + msg_info_tokenizer("using tokenizer '%s' for language hint '%s' with confidence %.3f", + tok->name, lang_hint, conf); + if (confidence) { + *confidence = conf; + } + if (detected_lang_hint) { + *detected_lang_hint = tok_lang; + } + return tok; + } + } + } + } + msg_info_tokenizer("no suitable tokenizer found for language hint '%s', falling back to general detection", lang_hint); + } + + /* Try each tokenizer in priority order */ + msg_info_tokenizer("trying %u tokenizers for general detection", mgr->detection_order->len); + for (i = 0; i < mgr->detection_order->len; i++) { + tok = g_array_index(mgr->detection_order, struct rspamd_custom_tokenizer *, i); + + if (!tok->enabled || !tok->api->detect_language) { + msg_debug_tokenizer("skipping tokenizer '%s' (enabled: %s, has detect_language: %s)", + tok->name, tok->enabled ? "yes" : "no", + tok->api->detect_language ? 
"yes" : "no"); + continue; + } + + conf = tok->api->detect_language(text, len); + msg_info_tokenizer("tokenizer '%s' detection confidence: %.3f (threshold: %.3f, current best: %.3f)", + tok->name, conf, tok->min_confidence, best_conf); + + if (conf > best_conf && conf >= tok->min_confidence) { + best_conf = conf; + best_tok = tok; + msg_info_tokenizer("tokenizer '%s' is new best with confidence %.3f", tok->name, best_conf); + + /* Early exit if very confident */ + if (conf >= 0.95) { + msg_info_tokenizer("very high confidence (%.3f >= 0.95), using tokenizer '%s' immediately", + conf, tok->name); + break; + } + } + } + + if (best_tok) { + msg_info_tokenizer("selected tokenizer '%s' with confidence %.3f", best_tok->name, best_conf); + if (confidence) { + *confidence = best_conf; + } + + if (detected_lang_hint && best_tok->api->get_language_hint) { + *detected_lang_hint = best_tok->api->get_language_hint(); + msg_info_tokenizer("detected language hint: %s", *detected_lang_hint); + } + } + else { + msg_info_tokenizer("no suitable tokenizer found during detection"); + } + + return best_tok; +} + +/* Helper function to tokenize with a custom tokenizer handling exceptions */ +rspamd_tokenizer_result_t * +rspamd_custom_tokenizer_tokenize_with_exceptions( + struct rspamd_custom_tokenizer *tokenizer, + const char *text, + gsize len, + GList *exceptions, + rspamd_mempool_t *pool) +{ + rspamd_tokenizer_result_t *words; + rspamd_tokenizer_result_t result; + struct rspamd_process_exception *ex; + GList *cur_ex = exceptions; + gsize pos = 0; + unsigned int i; + int ret; + + /* Allocate result kvec in pool */ + words = rspamd_mempool_alloc(pool, sizeof(*words)); + kv_init(*words); + + /* If no exceptions, tokenize the whole text */ + if (!exceptions) { + kv_init(result); + + ret = tokenizer->api->tokenize(text, len, &result); + if (ret == 0 && result.a) { + /* Copy tokens from result to output */ + for (i = 0; i < kv_size(result); i++) { + rspamd_word_t tok = kv_A(result, i); + kv_push(rspamd_word_t, *words, tok); + } + + /* Use tokenizer's cleanup function */ + if (tokenizer->api->cleanup_result) { + tokenizer->api->cleanup_result(&result); + } + } + + return words; + } + + /* Process text with exceptions */ + while (pos < len && cur_ex) { + ex = (struct rspamd_process_exception *) cur_ex->data; + + /* Tokenize text before exception */ + if (ex->pos > pos) { + gsize segment_len = ex->pos - pos; + kv_init(result); + + ret = tokenizer->api->tokenize(text + pos, segment_len, &result); + if (ret == 0 && result.a) { + /* Copy tokens from result, adjusting positions for segment offset */ + for (i = 0; i < kv_size(result); i++) { + rspamd_word_t tok = kv_A(result, i); + + /* Adjust pointers to point to the original text */ + gsize offset_in_segment = tok.original.begin - (text + pos); + if (offset_in_segment < segment_len) { + tok.original.begin = text + pos + offset_in_segment; + /* Ensure we don't go past the exception boundary */ + if (tok.original.begin + tok.original.len <= text + ex->pos) { + kv_push(rspamd_word_t, *words, tok); + } + } + } + + /* Use tokenizer's cleanup function */ + if (tokenizer->api->cleanup_result) { + tokenizer->api->cleanup_result(&result); + } + } + } + + /* Add exception as a special token */ + rspamd_word_t ex_tok; + memset(&ex_tok, 0, sizeof(ex_tok)); + + if (ex->type == RSPAMD_EXCEPTION_URL) { + ex_tok.original.begin = "!!EX!!"; + ex_tok.original.len = 6; + } + else { + ex_tok.original.begin = text + ex->pos; + ex_tok.original.len = ex->len; + } + ex_tok.flags = 
RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; + kv_push(rspamd_word_t, *words, ex_tok); + + /* Move past exception */ + pos = ex->pos + ex->len; + cur_ex = g_list_next(cur_ex); + } + + /* Process remaining text after last exception */ + if (pos < len) { + kv_init(result); + + ret = tokenizer->api->tokenize(text + pos, len - pos, &result); + if (ret == 0 && result.a) { + /* Copy tokens from result, adjusting positions for segment offset */ + for (i = 0; i < kv_size(result); i++) { + rspamd_word_t tok = kv_A(result, i); + + /* Adjust pointers to point to the original text */ + gsize offset_in_segment = tok.original.begin - (text + pos); + if (offset_in_segment < (len - pos)) { + tok.original.begin = text + pos + offset_in_segment; + kv_push(rspamd_word_t, *words, tok); + } + } + + /* Use tokenizer's cleanup function */ + if (tokenizer->api->cleanup_result) { + tokenizer->api->cleanup_result(&result); + } + } + } + + return words; +} diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 0ea1bcfc6..8a9f42992 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,8 @@ #include "contrib/mumhash/mum.h" #include "libmime/lang_detection.h" #include "libstemmer.h" +#define RSPAMD_TOKENIZER_INTERNAL +#include "custom_tokenizer.h" #include <unicode/utf8.h> #include <unicode/uchar.h> @@ -35,8 +37,8 @@ #include <math.h> -typedef gboolean (*token_get_function)(rspamd_stat_token_t *buf, char const **pos, - rspamd_stat_token_t *token, +typedef gboolean (*token_get_function)(rspamd_word_t *buf, char const **pos, + rspamd_word_t *token, GList **exceptions, gsize *rl, gboolean check_signature); const char t_delimiters[256] = { @@ -69,8 +71,8 @@ const char t_delimiters[256] = { /* Get next word from specified f_str_t buf */ static gboolean -rspamd_tokenizer_get_word_raw(rspamd_stat_token_t *buf, - char const **cur, rspamd_stat_token_t *token, +rspamd_tokenizer_get_word_raw(rspamd_word_t *buf, + char const **cur, rspamd_word_t *token, GList **exceptions, gsize *rl, gboolean unused) { gsize remain, pos; @@ -164,7 +166,7 @@ rspamd_tokenize_check_limit(gboolean decay, unsigned int nwords, uint64_t *hv, uint64_t *prob, - const rspamd_stat_token_t *token, + const rspamd_word_t *token, gssize remain, gssize total) { @@ -242,9 +244,9 @@ rspamd_utf_word_valid(const unsigned char *text, const unsigned char *end, } while (0) static inline void -rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res) +rspamd_tokenize_exception(struct rspamd_process_exception *ex, rspamd_words_t *res) { - rspamd_stat_token_t token; + rspamd_word_t token; memset(&token, 0, sizeof(token)); @@ -253,7 +255,7 @@ rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res) token.original.len = sizeof("!!EX!!") - 1; token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; - g_array_append_val(res, token); + kv_push_safe(rspamd_word_t, *res, token, exception_error); token.flags = 0; } else if (ex->type == RSPAMD_EXCEPTION_URL) { @@ -271,28 +273,33 @@ rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res) } token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; - g_array_append_val(res, token); + kv_push_safe(rspamd_word_t, *res, token, exception_error); token.flags = 0; } + return; + +exception_error: + /* On 
error, just skip this exception token */ + return; } -GArray * +rspamd_words_t * rspamd_tokenize_text(const char *text, gsize len, const UText *utxt, enum rspamd_tokenize_type how, struct rspamd_config *cfg, GList *exceptions, uint64_t *hash, - GArray *cur_words, + rspamd_words_t *output_kvec, rspamd_mempool_t *pool) { - rspamd_stat_token_t token, buf; + rspamd_word_t token, buf; const char *pos = NULL; gsize l = 0; - GArray *res; + rspamd_words_t *res; GList *cur = exceptions; - unsigned int min_len = 0, max_len = 0, word_decay = 0, initial_size = 128; + unsigned int min_len = 0, max_len = 0, word_decay = 0; uint64_t hv = 0; gboolean decay = FALSE, long_text_mode = FALSE; uint64_t prob = 0; @@ -300,9 +307,12 @@ rspamd_tokenize_text(const char *text, gsize len, static const gsize long_text_limit = 1 * 1024 * 1024; static const ev_tstamp max_exec_time = 0.2; /* 200 ms */ ev_tstamp start; + struct rspamd_custom_tokenizer *custom_tok = NULL; + double custom_confidence = 0.0; + const char *detected_lang = NULL; if (text == NULL) { - return cur_words; + return output_kvec; } if (len > long_text_limit) { @@ -323,15 +333,59 @@ rspamd_tokenize_text(const char *text, gsize len, min_len = cfg->min_word_len; max_len = cfg->max_word_len; word_decay = cfg->words_decay; - initial_size = word_decay * 2; } - if (!cur_words) { - res = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t), - initial_size); + if (!output_kvec) { + res = pool ? rspamd_mempool_alloc0(pool, sizeof(*res)) : g_malloc0(sizeof(*res)); + ; } else { - res = cur_words; + res = output_kvec; + } + + /* Try custom tokenizers first if we're in UTF mode */ + if (cfg && cfg->tokenizer_manager && how == RSPAMD_TOKENIZE_UTF && utxt != NULL) { + custom_tok = rspamd_tokenizer_manager_detect( + cfg->tokenizer_manager, + text, len, + &custom_confidence, + NULL, /* no input language hint */ + &detected_lang); + + if (custom_tok && custom_confidence >= custom_tok->min_confidence) { + /* Use custom tokenizer with exception handling */ + rspamd_tokenizer_result_t *custom_res = rspamd_custom_tokenizer_tokenize_with_exceptions( + custom_tok, text, len, exceptions, pool); + + if (custom_res) { + msg_debug_pool("using custom tokenizer %s (confidence: %.2f) for text tokenization", + custom_tok->name, custom_confidence); + + /* Copy custom tokenizer results to output kvec */ + for (unsigned int i = 0; i < kv_size(*custom_res); i++) { + kv_push_safe(rspamd_word_t, *res, kv_A(*custom_res, i), custom_tokenizer_error); + } + + /* Calculate hash if needed */ + if (hash && kv_size(*res) > 0) { + for (unsigned int i = 0; i < kv_size(*res); i++) { + rspamd_word_t *t = &kv_A(*res, i); + if (t->original.len >= sizeof(uint64_t)) { + uint64_t tmp; + memcpy(&tmp, t->original.begin, sizeof(tmp)); + hv = mum_hash_step(hv, tmp); + } + } + *hash = mum_hash_finish(hv); + } + + return res; + } + else { + msg_warn_pool("custom tokenizer %s failed to tokenize text, falling back to default", + custom_tok->name); + } + } } if (G_UNLIKELY(how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) { @@ -343,7 +397,7 @@ rspamd_tokenize_text(const char *text, gsize len, } if (token.original.len > 0 && - rspamd_tokenize_check_limit(decay, word_decay, res->len, + rspamd_tokenize_check_limit(decay, word_decay, kv_size(*res), &hv, &prob, &token, pos - text, len)) { if (!decay) { decay = TRUE; @@ -355,28 +409,28 @@ rspamd_tokenize_text(const char *text, gsize len, } if (long_text_mode) { - if ((res->len + 1) % 16 == 0) { + if ((kv_size(*res) + 1) % 16 == 0) { ev_tstamp now = ev_time(); if (now - 
start > max_exec_time) { msg_warn_pool_check( "too long time has been spent on tokenization:" - " %.1f ms, limit is %.1f ms; %d words added so far", + " %.1f ms, limit is %.1f ms; %z words added so far", (now - start) * 1e3, max_exec_time * 1e3, - res->len); + kv_size(*res)); goto end; } } } - g_array_append_val(res, token); + kv_push_safe(rspamd_word_t, *res, token, tokenize_error); - if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) { + if (kv_size(*res) * sizeof(token) > (0x1ull << 30u)) { /* Due to bug in glib ! */ msg_err_pool_check( - "too many words found: %d, stop tokenization to avoid DoS", - res->len); + "too many words found: %z, stop tokenization to avoid DoS", + kv_size(*res)); goto end; } @@ -523,7 +577,7 @@ rspamd_tokenize_text(const char *text, gsize len, } if (token.original.len > 0 && - rspamd_tokenize_check_limit(decay, word_decay, res->len, + rspamd_tokenize_check_limit(decay, word_decay, kv_size(*res), &hv, &prob, &token, p, len)) { if (!decay) { decay = TRUE; @@ -536,15 +590,15 @@ rspamd_tokenize_text(const char *text, gsize len, if (token.original.len > 0) { /* Additional check for number of words */ - if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) { + if (kv_size(*res) * sizeof(token) > (0x1ull << 30u)) { /* Due to bug in glib ! */ - msg_err("too many words found: %d, stop tokenization to avoid DoS", - res->len); + msg_err("too many words found: %z, stop tokenization to avoid DoS", + kv_size(*res)); goto end; } - g_array_append_val(res, token); + kv_push_safe(rspamd_word_t, *res, token, tokenize_error); } /* Also check for long text mode */ @@ -552,15 +606,15 @@ rspamd_tokenize_text(const char *text, gsize len, /* Check time each 128 words added */ const int words_check_mask = 0x7F; - if ((res->len & words_check_mask) == words_check_mask) { + if ((kv_size(*res) & words_check_mask) == words_check_mask) { ev_tstamp now = ev_time(); if (now - start > max_exec_time) { msg_warn_pool_check( "too long time has been spent on tokenization:" - " %.1f ms, limit is %.1f ms; %d words added so far", + " %.1f ms, limit is %.1f ms; %z words added so far", (now - start) * 1e3, max_exec_time * 1e3, - res->len); + kv_size(*res)); goto end; } @@ -590,8 +644,14 @@ end: } return res; + +tokenize_error: +custom_tokenizer_error: + msg_err_pool("failed to allocate memory for tokenization"); + return res; } + #undef SHIFT_EX static void @@ -625,32 +685,38 @@ rspamd_add_metawords_from_str(const char *beg, gsize len, #endif } + /* Initialize meta_words kvec if not already done */ + if (!task->meta_words.a) { + kv_init(task->meta_words); + } + if (valid_utf) { utext_openUTF8(&utxt, beg, len, &uc_err); - task->meta_words = rspamd_tokenize_text(beg, len, - &utxt, RSPAMD_TOKENIZE_UTF, - task->cfg, NULL, NULL, - task->meta_words, - task->task_pool); + rspamd_tokenize_text(beg, len, + &utxt, RSPAMD_TOKENIZE_UTF, + task->cfg, NULL, NULL, + &task->meta_words, + task->task_pool); utext_close(&utxt); } else { - task->meta_words = rspamd_tokenize_text(beg, len, - NULL, RSPAMD_TOKENIZE_RAW, - task->cfg, NULL, NULL, task->meta_words, - task->task_pool); + rspamd_tokenize_text(beg, len, + NULL, RSPAMD_TOKENIZE_RAW, + task->cfg, NULL, NULL, + &task->meta_words, + task->task_pool); } } void rspamd_tokenize_meta_words(struct rspamd_task *task) { unsigned int i = 0; - rspamd_stat_token_t *tok; + rspamd_word_t *tok; if (MESSAGE_FIELD(task, subject)) { rspamd_add_metawords_from_str(MESSAGE_FIELD(task, subject), @@ -667,7 +733,7 @@ void rspamd_tokenize_meta_words(struct rspamd_task *task) } } - 
if (task->meta_words != NULL) { + if (task->meta_words.a) { const char *language = NULL; if (MESSAGE_FIELD(task, text_parts) && @@ -680,12 +746,12 @@ void rspamd_tokenize_meta_words(struct rspamd_task *task) } } - rspamd_normalize_words(task->meta_words, task->task_pool); - rspamd_stem_words(task->meta_words, task->task_pool, language, + rspamd_normalize_words(&task->meta_words, task->task_pool); + rspamd_stem_words(&task->meta_words, task->task_pool, language, task->lang_det); - for (i = 0; i < task->meta_words->len; i++) { - tok = &g_array_index(task->meta_words, rspamd_stat_token_t, i); + for (i = 0; i < kv_size(task->meta_words); i++) { + tok = &kv_A(task->meta_words, i); tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER; } } @@ -759,7 +825,7 @@ rspamd_ucs32_to_normalised(rspamd_stat_token_t *tok, tok->normalized.begin = dest; } -void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool) +void rspamd_normalize_single_word(rspamd_word_t *tok, rspamd_mempool_t *pool) { UErrorCode uc_err = U_ZERO_ERROR; UConverter *utf8_converter; @@ -858,25 +924,27 @@ void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *po } } -void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool) + +void rspamd_normalize_words(rspamd_words_t *words, rspamd_mempool_t *pool) { - rspamd_stat_token_t *tok; + rspamd_word_t *tok; unsigned int i; - for (i = 0; i < words->len; i++) { - tok = &g_array_index(words, rspamd_stat_token_t, i); + for (i = 0; i < kv_size(*words); i++) { + tok = &kv_A(*words, i); rspamd_normalize_single_word(tok, pool); } } -void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, + +void rspamd_stem_words(rspamd_words_t *words, rspamd_mempool_t *pool, const char *language, struct rspamd_lang_detector *lang_detector) { static GHashTable *stemmers = NULL; struct sb_stemmer *stem = NULL; unsigned int i; - rspamd_stat_token_t *tok; + rspamd_word_t *tok; char *dest; gsize dlen; @@ -909,8 +977,18 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, stem = NULL; } } - for (i = 0; i < words->len; i++) { - tok = &g_array_index(words, rspamd_stat_token_t, i); + for (i = 0; i < kv_size(*words); i++) { + tok = &kv_A(*words, i); + + /* Skip stemming if token has already been stemmed by custom tokenizer */ + if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_STEMMED) { + /* Already stemmed, just check for stop words */ + if (tok->stemmed.len > 0 && lang_detector != NULL && + rspamd_language_detector_is_stop_word(lang_detector, tok->stemmed.begin, tok->stemmed.len)) { + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD; + } + continue; + } if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { if (stem) { @@ -952,4 +1030,4 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, } } } -}
\ No newline at end of file
+}
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index d4a8824a8..bb0bb54e2 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include "fstring.h"
 #include "rspamd.h"
 #include "stat_api.h"
+#include "libserver/word.h"
 
 #include <unicode/utext.h>
 
@@ -43,7 +44,7 @@ struct rspamd_stat_tokenizer {
 
     int (*tokenize_func)(struct rspamd_stat_ctx *ctx,
                          struct rspamd_task *task,
-                         GArray *words,
+                         rspamd_words_t *words,
                          gboolean is_utf,
                          const char *prefix,
                          GPtrArray *result);
@@ -59,20 +60,20 @@ enum rspamd_tokenize_type {
 
 int token_node_compare_func(gconstpointer a, gconstpointer b);
 
-/* Tokenize text into array of words (rspamd_stat_token_t type) */
-GArray *rspamd_tokenize_text(const char *text, gsize len,
-                             const UText *utxt,
-                             enum rspamd_tokenize_type how,
-                             struct rspamd_config *cfg,
-                             GList *exceptions,
-                             uint64_t *hash,
-                             GArray *cur_words,
-                             rspamd_mempool_t *pool);
+/* Tokenize text into kvec of words (rspamd_word_t type) */
+rspamd_words_t *rspamd_tokenize_text(const char *text, gsize len,
+                                     const UText *utxt,
+                                     enum rspamd_tokenize_type how,
+                                     struct rspamd_config *cfg,
+                                     GList *exceptions,
+                                     uint64_t *hash,
+                                     rspamd_words_t *output_kvec,
+                                     rspamd_mempool_t *pool);
 
 /* OSB tokenize function */
 int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
                          struct rspamd_task *task,
-                         GArray *words,
+                         rspamd_words_t *words,
                          gboolean is_utf,
                          const char *prefix,
                          GPtrArray *result);
@@ -83,11 +84,11 @@ gpointer rspamd_tokenizer_osb_get_config(rspamd_mempool_t *pool,
 
 struct rspamd_lang_detector;
 
-void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool);
+void rspamd_normalize_single_word(rspamd_word_t *tok, rspamd_mempool_t *pool);
 
-void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool);
-
-void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
+/* Word processing functions */
+void rspamd_normalize_words(rspamd_words_t *words, rspamd_mempool_t *pool);
+void rspamd_stem_words(rspamd_words_t *words, rspamd_mempool_t *pool,
                        const char *language,
                        struct rspamd_lang_detector *lang_detector);
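
Editor's note: rspamd_tokenize_text() now fills an rspamd_words_t kvec instead of a GArray, so callers iterate with kv_size()/kv_A(). A minimal caller-side sketch, not part of this commit, assuming it is compiled inside the rspamd tree and that the standard kvec macro set (kv_init/kv_size/kv_A/kv_destroy) is available through the included headers; dump_words is a hypothetical helper.

/* dump_words.c - hypothetical caller-side example for the kvec-based API */
#include <stdio.h>
#include "tokenizers.h"

static void
dump_words(struct rspamd_config *cfg, rspamd_mempool_t *pool,
           const char *text, gsize len)
{
    rspamd_words_t words;

    kv_init(words);

    /* RAW mode needs no UText; utxt and hash may be NULL, cfg may be NULL too */
    rspamd_tokenize_text(text, len, NULL, RSPAMD_TOKENIZE_RAW,
                         cfg, NULL, NULL, &words, pool);

    for (gsize i = 0; i < kv_size(words); i++) {
        rspamd_word_t *w = &kv_A(words, i);
        printf("%.*s\n", (int) w->original.len, w->original.begin);
    }

    /* Free the kvec backing array owned by the caller */
    kv_destroy(words);
}
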