diff options
author | Vsevolod Stakhov <vsevolod@rspamd.com> | 2025-06-12 10:08:45 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rspamd.com> | 2025-06-12 10:08:45 +0100 |
commit | 79805f3ebe0d2e511b3ce98072332abdb482333f (patch) | |
tree | bfc1a794e1429e8b4017f09081d8afbb9a997196 | |
parent | 664f0e993719d126ded977438146a06e68a6320b (diff) | |
download | rspamd-79805f3ebe0d2e511b3ce98072332abdb482333f.tar.gz rspamd-79805f3ebe0d2e511b3ce98072332abdb482333f.zip |
[Project] Add ability to create custom tokenizers for languages
-rw-r--r-- | src/libmime/message.c | 3 | ||||
-rw-r--r-- | src/libserver/cfg_file.h | 7 | ||||
-rw-r--r-- | src/libserver/cfg_utils.cxx | 35 | ||||
-rw-r--r-- | src/libstat/CMakeLists.txt | 33 | ||||
-rw-r--r-- | src/libstat/tokenizers/custom_tokenizer.h | 172 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizer_manager.c | 462 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 57 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 2 | ||||
-rw-r--r-- | src/lua/lua_parsers.c | 6 |
9 files changed, 751 insertions(+), 26 deletions(-)
diff --git a/src/libmime/message.c b/src/libmime/message.c index f2cabf399..60894d879 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,6 +40,7 @@ #include "contrib/uthash/utlist.h" #include "contrib/t1ha/t1ha.h" #include "received.h" +#include "libstat/tokenizers/custom_tokenizer.h" #define GTUBE_SYMBOL "GTUBE" diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h index 2d0797c98..362ddc0ae 100644 --- a/src/libserver/cfg_file.h +++ b/src/libserver/cfg_file.h @@ -497,9 +497,10 @@ struct rspamd_config { char *zstd_output_dictionary; /**< path to zstd output dictionary */ ucl_object_t *neighbours; /**< other servers in the cluster */ - struct rspamd_config_settings_elt *setting_ids; /**< preprocessed settings ids */ - struct rspamd_lang_detector *lang_det; /**< language detector */ - struct rspamd_worker *cur_worker; /**< set dynamically by each worker */ + struct rspamd_config_settings_elt *setting_ids; /**< preprocessed settings ids */ + struct rspamd_lang_detector *lang_det; /**< language detector */ + struct rspamd_tokenizer_manager *tokenizer_manager; /**< custom tokenizer manager */ + struct rspamd_worker *cur_worker; /**< set dynamically by each worker */ ref_entry_t ref; /**< reference counter */ }; diff --git a/src/libserver/cfg_utils.cxx b/src/libserver/cfg_utils.cxx index b430a5fca..badcf6c54 100644 --- a/src/libserver/cfg_utils.cxx +++ b/src/libserver/cfg_utils.cxx @@ -72,6 +72,10 @@ #include "contrib/expected/expected.hpp" #include "contrib/ankerl/unordered_dense.h" +#include "libserver/task.h" +#include "libserver/url.h" +#include "libstat/tokenizers/custom_tokenizer.h" + #define DEFAULT_SCORE 10.0 #define DEFAULT_RLIMIT_NOFILE 2048 @@ -940,6 +944,37 @@ rspamd_config_post_load(struct rspamd_config 
*cfg, msg_err_config("cannot configure libraries, fatal error"); return FALSE; } + + /* Load custom tokenizers */ + const ucl_object_t *custom_tokenizers = ucl_object_lookup_path(cfg->cfg_ucl_obj, + "options.custom_tokenizers"); + if (custom_tokenizers != NULL) { + msg_info_config("loading custom tokenizers"); + cfg->tokenizer_manager = rspamd_tokenizer_manager_new(cfg->cfg_pool); + + ucl_object_iter_t it = ucl_object_iterate_new(custom_tokenizers); + const ucl_object_t *tok_obj; + const char *tok_name; + + while ((tok_obj = ucl_object_iterate_safe(it, true)) != NULL) { + tok_name = ucl_object_key(tok_obj); + GError *err = NULL; + + if (!rspamd_tokenizer_manager_load_tokenizer(cfg->tokenizer_manager, + tok_name, tok_obj, &err)) { + msg_err_config("failed to load custom tokenizer '%s': %s", + tok_name, err ? err->message : "unknown error"); + if (err) { + g_error_free(err); + } + + if (opts & RSPAMD_CONFIG_INIT_VALIDATE) { + ret = tl::make_unexpected(fmt::format("failed to load custom tokenizer '{}'", tok_name)); + } + } + } + ucl_object_iterate_free(it); + } } /* Validate cache */ diff --git a/src/libstat/CMakeLists.txt b/src/libstat/CMakeLists.txt index 64d572a57..eddf64e49 100644 --- a/src/libstat/CMakeLists.txt +++ b/src/libstat/CMakeLists.txt @@ -1,25 +1,26 @@ # Librspamdserver -SET(LIBSTATSRC ${CMAKE_CURRENT_SOURCE_DIR}/stat_config.c - ${CMAKE_CURRENT_SOURCE_DIR}/stat_process.c) +SET(LIBSTATSRC ${CMAKE_CURRENT_SOURCE_DIR}/stat_config.c + ${CMAKE_CURRENT_SOURCE_DIR}/stat_process.c) -SET(TOKENIZERSSRC ${CMAKE_CURRENT_SOURCE_DIR}/tokenizers/tokenizers.c - ${CMAKE_CURRENT_SOURCE_DIR}/tokenizers/osb.c) +SET(TOKENIZERSSRC ${CMAKE_CURRENT_SOURCE_DIR}/tokenizers/tokenizers.c + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizers/tokenizer_manager.c + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizers/osb.c) -SET(CLASSIFIERSSRC ${CMAKE_CURRENT_SOURCE_DIR}/classifiers/bayes.c - ${CMAKE_CURRENT_SOURCE_DIR}/classifiers/lua_classifier.c) +SET(CLASSIFIERSSRC 
${CMAKE_CURRENT_SOURCE_DIR}/classifiers/bayes.c + ${CMAKE_CURRENT_SOURCE_DIR}/classifiers/lua_classifier.c) -SET(BACKENDSSRC ${CMAKE_CURRENT_SOURCE_DIR}/backends/mmaped_file.c - ${CMAKE_CURRENT_SOURCE_DIR}/backends/sqlite3_backend.c - ${CMAKE_CURRENT_SOURCE_DIR}/backends/cdb_backend.cxx - ${CMAKE_CURRENT_SOURCE_DIR}/backends/http_backend.cxx - ${CMAKE_CURRENT_SOURCE_DIR}/backends/redis_backend.cxx) +SET(BACKENDSSRC ${CMAKE_CURRENT_SOURCE_DIR}/backends/mmaped_file.c + ${CMAKE_CURRENT_SOURCE_DIR}/backends/sqlite3_backend.c + ${CMAKE_CURRENT_SOURCE_DIR}/backends/cdb_backend.cxx + ${CMAKE_CURRENT_SOURCE_DIR}/backends/http_backend.cxx + ${CMAKE_CURRENT_SOURCE_DIR}/backends/redis_backend.cxx) -SET(CACHESSRC ${CMAKE_CURRENT_SOURCE_DIR}/learn_cache/sqlite3_cache.c +SET(CACHESSRC ${CMAKE_CURRENT_SOURCE_DIR}/learn_cache/sqlite3_cache.c ${CMAKE_CURRENT_SOURCE_DIR}/learn_cache/redis_cache.cxx) SET(RSPAMD_STAT ${LIBSTATSRC} - ${TOKENIZERSSRC} - ${CLASSIFIERSSRC} - ${BACKENDSSRC} - ${CACHESSRC} PARENT_SCOPE) + ${TOKENIZERSSRC} + ${CLASSIFIERSSRC} + ${BACKENDSSRC} + ${CACHESSRC} PARENT_SCOPE) diff --git a/src/libstat/tokenizers/custom_tokenizer.h b/src/libstat/tokenizers/custom_tokenizer.h new file mode 100644 index 000000000..bacb4e7cd --- /dev/null +++ b/src/libstat/tokenizers/custom_tokenizer.h @@ -0,0 +1,172 @@ +/* + * Copyright 2025 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef RSPAMD_CUSTOM_TOKENIZER_H +#define RSPAMD_CUSTOM_TOKENIZER_H + +#include "config.h" +#include "tokenizers.h" +#include "ucl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define RSPAMD_CUSTOM_TOKENIZER_API_VERSION 1 + +/** + * Tokenization result - array of word positions as (start, length) pairs + * The array is terminated by a pair with both values set to 0 + */ +struct rspamd_tokenizer_result { + unsigned int *positions; /* Array of (start, length) pairs */ + size_t count; /* Number of words (not array size!) */ +}; + +/** + * Custom tokenizer API that must be implemented by language-specific tokenizer plugins + * All functions use only plain C types to ensure clean boundaries + */ +typedef struct rspamd_custom_tokenizer_api { + /* API version for compatibility checking */ + unsigned int api_version; + + /* Name of the tokenizer (e.g., "japanese_mecab") */ + const char *name; + + /** + * Global initialization function called once when the tokenizer is loaded + * @param config UCL configuration object for this tokenizer (may be NULL) + * @param error_buf Buffer for error message (at least 256 bytes) + * @return 0 on success, non-zero on failure + */ + int (*init)(const ucl_object_t *config, char *error_buf, size_t error_buf_size); + + /** + * Global cleanup function called when the tokenizer is unloaded + */ + void (*deinit)(void); + + /** + * Quick language detection to check if this tokenizer can handle the text + * @param text UTF-8 text to analyze + * @param len Length of the text in bytes + * @return Confidence score 0.0-1.0, or -1.0 if cannot handle + */ + double (*detect_language)(const char *text, size_t len); + + /** + * Main tokenization function + * @param text UTF-8 text to tokenize + * @param len Length of the text in bytes + * @param result Output structure to fill with word positions + * @return 0 on success, non-zero on failure + * + * The tokenizer should allocate result->positions using its own allocator + * Rspamd will call 
cleanup_result() to free it after processing + */ + int (*tokenize)(const char *text, size_t len, + struct rspamd_tokenizer_result *result); + + /** + * Cleanup the result from tokenize() + * @param result Result structure returned by tokenize() + * + * This function should free result->positions using the same allocator + * that was used in tokenize() and reset the structure fields. + * This ensures proper memory management across DLL boundaries. + * Note: This does NOT free the result structure itself, only its contents. + */ + void (*cleanup_result)(struct rspamd_tokenizer_result *result); + + /** + * Optional: Get language hint for better language detection + * @return Language code (e.g., "ja", "zh") or NULL + */ + const char *(*get_language_hint)(void); + + /** + * Optional: Get minimum confidence threshold for this tokenizer + * @return Minimum confidence (0.0-1.0) or -1.0 to use default + */ + double (*get_min_confidence)(void); + +} rspamd_custom_tokenizer_api_t; + +/** + * Entry point function that plugins must export + * Must be named "rspamd_tokenizer_get_api" + */ +typedef const rspamd_custom_tokenizer_api_t *(*rspamd_tokenizer_get_api_func)(void); + +/* Internal Rspamd structures - not exposed to plugins */ +#ifdef RSPAMD_TOKENIZER_INTERNAL + +/** + * Custom tokenizer instance + */ +struct rspamd_custom_tokenizer { + char *name; /* Tokenizer name from config */ + char *path; /* Path to .so file */ + void *handle; /* dlopen handle */ + const rspamd_custom_tokenizer_api_t *api; /* API functions */ + double priority; /* Detection priority */ + double min_confidence; /* Minimum confidence threshold */ + gboolean enabled; /* Is tokenizer enabled */ + ucl_object_t *config; /* Tokenizer-specific config */ +}; + +/** + * Tokenizer manager structure + */ +struct rspamd_tokenizer_manager { + GHashTable *tokenizers; /* name -> rspamd_custom_tokenizer */ + GArray *detection_order; /* Ordered by priority */ + rspamd_mempool_t *pool; + double default_threshold; /* 
Default confidence threshold */ +}; + +/* Manager functions */ +struct rspamd_tokenizer_manager *rspamd_tokenizer_manager_new(rspamd_mempool_t *pool); +void rspamd_tokenizer_manager_destroy(struct rspamd_tokenizer_manager *mgr); + +gboolean rspamd_tokenizer_manager_load_tokenizer(struct rspamd_tokenizer_manager *mgr, + const char *name, + const ucl_object_t *config, + GError **err); + +struct rspamd_custom_tokenizer *rspamd_tokenizer_manager_detect( + struct rspamd_tokenizer_manager *mgr, + const char *text, size_t len, + double *confidence, + const char *lang_hint, + const char **detected_lang_hint); + +/* Helper function to tokenize with exceptions handling */ +GArray *rspamd_custom_tokenizer_tokenize_with_exceptions( + struct rspamd_custom_tokenizer *tokenizer, + const char *text, + gsize len, + GList *exceptions, + rspamd_mempool_t *pool); + +#endif /* RSPAMD_TOKENIZER_INTERNAL */ + +#ifdef __cplusplus +} +#endif + +#endif /* RSPAMD_CUSTOM_TOKENIZER_H */ diff --git a/src/libstat/tokenizers/tokenizer_manager.c b/src/libstat/tokenizers/tokenizer_manager.c new file mode 100644 index 000000000..cd18a5ff1 --- /dev/null +++ b/src/libstat/tokenizers/tokenizer_manager.c @@ -0,0 +1,462 @@ +/* + * Copyright 2025 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "config.h" +#define RSPAMD_TOKENIZER_INTERNAL +#include "custom_tokenizer.h" +#include "libutil/util.h" +#include "libserver/logger.h" +#include <dlfcn.h> + +#define msg_err_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "tokenizer", "", \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "tokenizer", "", \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "tokenizer", "", \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_tokenizer(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_tokenizer_log_id, "tokenizer", "", \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(tokenizer) + +static void +rspamd_custom_tokenizer_dtor(gpointer p) +{ + struct rspamd_custom_tokenizer *tok = p; + + if (tok) { + if (tok->api && tok->api->deinit) { + tok->api->deinit(); + } + + if (tok->handle) { + dlclose(tok->handle); + } + + if (tok->config) { + ucl_object_unref(tok->config); + } + + g_free(tok->name); + g_free(tok->path); + g_free(tok); + } +} + +static int +rspamd_custom_tokenizer_priority_cmp(gconstpointer a, gconstpointer b) +{ + const struct rspamd_custom_tokenizer *t1 = *(const struct rspamd_custom_tokenizer **) a; + const struct rspamd_custom_tokenizer *t2 = *(const struct rspamd_custom_tokenizer **) b; + + /* Higher priority first */ + if (t1->priority > t2->priority) { + return -1; + } + else if (t1->priority < t2->priority) { + return 1; + } + + return 0; +} + +struct rspamd_tokenizer_manager * +rspamd_tokenizer_manager_new(rspamd_mempool_t *pool) +{ + struct rspamd_tokenizer_manager *mgr; + + mgr = rspamd_mempool_alloc0(pool, sizeof(*mgr)); + mgr->pool = pool; + mgr->tokenizers = g_hash_table_new_full(rspamd_strcase_hash, + rspamd_strcase_equal, + NULL, + rspamd_custom_tokenizer_dtor); + mgr->detection_order = g_array_new(FALSE, FALSE, sizeof(struct rspamd_custom_tokenizer 
*)); + mgr->default_threshold = 0.7; /* Default confidence threshold */ + + rspamd_mempool_add_destructor(pool, + (rspamd_mempool_destruct_t) g_hash_table_unref, + mgr->tokenizers); + rspamd_mempool_add_destructor(pool, + (rspamd_mempool_destruct_t) g_array_free, + mgr->detection_order); + + return mgr; +} + +void rspamd_tokenizer_manager_destroy(struct rspamd_tokenizer_manager *mgr) +{ + /* Cleanup is handled by memory pool destructors */ +} + +gboolean +rspamd_tokenizer_manager_load_tokenizer(struct rspamd_tokenizer_manager *mgr, + const char *name, + const ucl_object_t *config, + GError **err) +{ + struct rspamd_custom_tokenizer *tok; + const ucl_object_t *elt; + rspamd_tokenizer_get_api_func get_api; + const rspamd_custom_tokenizer_api_t *api; + void *handle; + const char *path; + gboolean enabled = TRUE; + double priority = 50.0; + char error_buf[256]; + + g_assert(mgr != NULL); + g_assert(name != NULL); + g_assert(config != NULL); + + /* Check if enabled */ + elt = ucl_object_lookup(config, "enabled"); + if (elt && ucl_object_type(elt) == UCL_BOOLEAN) { + enabled = ucl_object_toboolean(elt); + } + + if (!enabled) { + msg_info_tokenizer("custom tokenizer %s is disabled", name); + return TRUE; + } + + /* Get path */ + elt = ucl_object_lookup(config, "path"); + if (!elt || ucl_object_type(elt) != UCL_STRING) { + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "missing 'path' for tokenizer %s", name); + return FALSE; + } + path = ucl_object_tostring(elt); + + /* Get priority */ + elt = ucl_object_lookup(config, "priority"); + if (elt) { + priority = ucl_object_todouble(elt); + } + + /* Load the shared library */ + handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); + if (!handle) { + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "cannot load tokenizer %s from %s: %s", + name, path, dlerror()); + return FALSE; + } + + /* Get the API entry point */ + get_api = (rspamd_tokenizer_get_api_func) dlsym(handle, 
"rspamd_tokenizer_get_api"); + if (!get_api) { + dlclose(handle); + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "cannot find entry point in %s: %s", + path, dlerror()); + return FALSE; + } + + /* Get the API */ + api = get_api(); + if (!api) { + dlclose(handle); + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "tokenizer %s returned NULL API", name); + return FALSE; + } + + /* Check API version */ + if (api->api_version != RSPAMD_CUSTOM_TOKENIZER_API_VERSION) { + dlclose(handle); + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "tokenizer %s has incompatible API version %u (expected %u)", + name, api->api_version, RSPAMD_CUSTOM_TOKENIZER_API_VERSION); + return FALSE; + } + + /* Create tokenizer instance */ + tok = g_malloc0(sizeof(*tok)); + tok->name = g_strdup(name); + tok->path = g_strdup(path); + tok->handle = handle; + tok->api = api; + tok->priority = priority; + tok->enabled = enabled; + + /* Get tokenizer config */ + elt = ucl_object_lookup(config, "config"); + if (elt) { + tok->config = ucl_object_ref(elt); + } + + /* Get minimum confidence */ + if (api->get_min_confidence) { + tok->min_confidence = api->get_min_confidence(); + } + else { + tok->min_confidence = mgr->default_threshold; + } + + /* Initialize the tokenizer */ + if (api->init) { + error_buf[0] = '\0'; + if (api->init(tok->config, error_buf, sizeof(error_buf)) != 0) { + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "failed to initialize tokenizer %s: %s", + name, error_buf[0] ? 
error_buf : "unknown error"); + rspamd_custom_tokenizer_dtor(tok); + return FALSE; + } + } + + /* Add to manager */ + g_hash_table_insert(mgr->tokenizers, tok->name, tok); + g_array_append_val(mgr->detection_order, tok); + + /* Re-sort by priority */ + g_array_sort(mgr->detection_order, rspamd_custom_tokenizer_priority_cmp); + + msg_info_tokenizer("loaded custom tokenizer %s (priority %.0f) from %s", + name, priority, path); + + return TRUE; +} + +struct rspamd_custom_tokenizer * +rspamd_tokenizer_manager_detect(struct rspamd_tokenizer_manager *mgr, + const char *text, size_t len, + double *confidence, + const char *lang_hint, + const char **detected_lang_hint) +{ + struct rspamd_custom_tokenizer *tok, *best_tok = NULL; + double conf, best_conf = 0.0; + unsigned int i; + + g_assert(mgr != NULL); + g_assert(text != NULL); + + if (confidence) { + *confidence = 0.0; + } + + if (detected_lang_hint) { + *detected_lang_hint = NULL; + } + + /* If we have a language hint, try to find a tokenizer for that language first */ + if (lang_hint) { + for (i = 0; i < mgr->detection_order->len; i++) { + tok = g_array_index(mgr->detection_order, struct rspamd_custom_tokenizer *, i); + + if (!tok->enabled || !tok->api->get_language_hint) { + continue; + } + + /* Check if this tokenizer handles the hinted language */ + const char *tok_lang = tok->api->get_language_hint(); + if (tok_lang && g_ascii_strcasecmp(tok_lang, lang_hint) == 0) { + /* Found a tokenizer for this language, check if it actually detects it */ + if (tok->api->detect_language) { + conf = tok->api->detect_language(text, len); + if (conf >= tok->min_confidence) { + /* Use this tokenizer */ + if (confidence) { + *confidence = conf; + } + if (detected_lang_hint) { + *detected_lang_hint = tok_lang; + } + return tok; + } + } + } + } + } + + /* Try each tokenizer in priority order */ + for (i = 0; i < mgr->detection_order->len; i++) { + tok = g_array_index(mgr->detection_order, struct rspamd_custom_tokenizer *, i); + + if 
(!tok->enabled || !tok->api->detect_language) { + continue; + } + + conf = tok->api->detect_language(text, len); + + if (conf > best_conf && conf >= tok->min_confidence) { + best_conf = conf; + best_tok = tok; + + /* Early exit if very confident */ + if (conf >= 0.95) { + break; + } + } + } + + if (confidence && best_tok) { + *confidence = best_conf; + } + + if (detected_lang_hint && best_tok && best_tok->api->get_language_hint) { + *detected_lang_hint = best_tok->api->get_language_hint(); + } + + return best_tok; +} + +/* Helper function to tokenize with a custom tokenizer handling exceptions */ +GArray * +rspamd_custom_tokenizer_tokenize_with_exceptions( + struct rspamd_custom_tokenizer *tokenizer, + const char *text, + gsize len, + GList *exceptions, + rspamd_mempool_t *pool) +{ + GArray *words; + struct rspamd_tokenizer_result result; + struct rspamd_process_exception *ex; + GList *cur_ex = exceptions; + gsize pos = 0; + unsigned int i; + int ret; + + words = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t), 128); + + /* If no exceptions, tokenize the whole text */ + if (!exceptions) { + result.positions = NULL; + result.count = 0; + + ret = tokenizer->api->tokenize(text, len, &result); + if (ret == 0 && result.positions) { + /* Convert positions to tokens */ + for (i = 0; i < result.count; i++) { + rspamd_stat_token_t tok; + unsigned int start = result.positions[i * 2]; + unsigned int length = result.positions[i * 2 + 1]; + + if (start + length <= len) { + memset(&tok, 0, sizeof(tok)); + tok.original.begin = text + start; + tok.original.len = length; + tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF; + g_array_append_val(words, tok); + } + } + + /* Use tokenizer's cleanup function */ + if (tokenizer->api->cleanup_result) { + tokenizer->api->cleanup_result(&result); + } + } + + return words; + } + + /* Process text with exceptions */ + while (pos < len && cur_ex) { + ex = (struct rspamd_process_exception *) cur_ex->data; + + /* 
Tokenize text before exception */ + if (ex->pos > pos) { + gsize segment_len = ex->pos - pos; + result.positions = NULL; + result.count = 0; + + ret = tokenizer->api->tokenize(text + pos, segment_len, &result); + if (ret == 0 && result.positions) { + /* Convert positions to tokens, adjusting for segment offset */ + for (i = 0; i < result.count; i++) { + rspamd_stat_token_t tok; + unsigned int start = result.positions[i * 2] + pos; + unsigned int length = result.positions[i * 2 + 1]; + + if (start + length <= ex->pos) { + memset(&tok, 0, sizeof(tok)); + tok.original.begin = text + start; + tok.original.len = length; + tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF; + g_array_append_val(words, tok); + } + } + + /* Use tokenizer's cleanup function */ + if (tokenizer->api->cleanup_result) { + tokenizer->api->cleanup_result(&result); + } + } + } + + /* Add exception as a special token */ + rspamd_stat_token_t ex_tok; + memset(&ex_tok, 0, sizeof(ex_tok)); + + if (ex->type == RSPAMD_EXCEPTION_URL) { + ex_tok.original.begin = "!!EX!!"; + ex_tok.original.len = 6; + } + else { + ex_tok.original.begin = text + ex->pos; + ex_tok.original.len = ex->len; + } + ex_tok.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; + g_array_append_val(words, ex_tok); + + /* Move past exception */ + pos = ex->pos + ex->len; + cur_ex = g_list_next(cur_ex); + } + + /* Process remaining text after last exception */ + if (pos < len) { + result.positions = NULL; + result.count = 0; + + ret = tokenizer->api->tokenize(text + pos, len - pos, &result); + if (ret == 0 && result.positions) { + /* Convert positions to tokens, adjusting for segment offset */ + for (i = 0; i < result.count; i++) { + rspamd_stat_token_t tok; + unsigned int start = result.positions[i * 2] + pos; + unsigned int length = result.positions[i * 2 + 1]; + + if (start + length <= len) { + memset(&tok, 0, sizeof(tok)); + tok.original.begin = text + start; + tok.original.len = length; + tok.flags = 
RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF; + g_array_append_val(words, tok); + } + } + + /* Use tokenizer's cleanup function */ + if (tokenizer->api->cleanup_result) { + tokenizer->api->cleanup_result(&result); + } + } + } + + return words; +} diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 0ea1bcfc6..4667976fb 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,8 @@ #include "contrib/mumhash/mum.h" #include "libmime/lang_detection.h" #include "libstemmer.h" +#define RSPAMD_TOKENIZER_INTERNAL +#include "custom_tokenizer.h" #include <unicode/utf8.h> #include <unicode/uchar.h> @@ -300,6 +302,9 @@ rspamd_tokenize_text(const char *text, gsize len, static const gsize long_text_limit = 1 * 1024 * 1024; static const ev_tstamp max_exec_time = 0.2; /* 200 ms */ ev_tstamp start; + struct rspamd_custom_tokenizer *custom_tok = NULL; + double custom_confidence = 0.0; + const char *detected_lang = NULL; if (text == NULL) { return cur_words; @@ -334,6 +339,54 @@ rspamd_tokenize_text(const char *text, gsize len, res = cur_words; } + /* Try custom tokenizers first if we're in UTF mode */ + if (cfg && cfg->tokenizer_manager && how == RSPAMD_TOKENIZE_UTF && utxt != NULL) { + custom_tok = rspamd_tokenizer_manager_detect( + cfg->tokenizer_manager, + text, len, + &custom_confidence, + NULL, /* no input language hint */ + &detected_lang); + + if (custom_tok && custom_confidence >= custom_tok->min_confidence) { + /* Use custom tokenizer with exception handling */ + GArray *custom_res = rspamd_custom_tokenizer_tokenize_with_exceptions( + custom_tok, text, len, exceptions, pool); + + if (custom_res) { + msg_debug_pool("using custom tokenizer %s (confidence: 
%.2f) for text tokenization", + custom_tok->name, custom_confidence); + + /* Calculate hash if needed */ + if (hash && custom_res->len > 0) { + unsigned int i; + for (i = 0; i < custom_res->len; i++) { + rspamd_stat_token_t *t = &g_array_index(custom_res, rspamd_stat_token_t, i); + if (t->original.len >= sizeof(uint64_t)) { + uint64_t tmp; + memcpy(&tmp, t->original.begin, sizeof(tmp)); + hv = mum_hash_step(hv, tmp); + } + } + *hash = mum_hash_finish(hv); + } + + /* If we had existing words, append to them */ + if (cur_words && custom_res != cur_words) { + g_array_append_vals(cur_words, custom_res->data, custom_res->len); + g_array_free(custom_res, TRUE); + return cur_words; + } + + return custom_res; + } + else { + msg_warn_pool("custom tokenizer %s failed to tokenize text, falling back to default", + custom_tok->name); + } + } + } + if (G_UNLIKELY(how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) { while (rspamd_tokenizer_get_word_raw(&buf, &pos, &token, &cur, &l, FALSE)) { if (l == 0 || (min_len > 0 && l < min_len) || @@ -952,4 +1005,4 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, } } } -}
\ No newline at end of file +} diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index d4a8824a8..f3066b5cf 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -1,5 +1,5 @@ /* - * Copyright 2023 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/lua/lua_parsers.c b/src/lua/lua_parsers.c index f77b36952..f1208abd2 100644 --- a/src/lua/lua_parsers.c +++ b/src/lua/lua_parsers.c @@ -1,11 +1,11 @@ -/*- - * Copyright 2020 Vsevolod Stakhov +/* + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, |