/* * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "config.h" #include "tokenizers.h" #define RSPAMD_TOKENIZER_INTERNAL #include "custom_tokenizer.h" #include "libutil/util.h" #include "libserver/logger.h" #include #define msg_err_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ "tokenizer", "", \ RSPAMD_LOG_FUNC, \ __VA_ARGS__) #define msg_warn_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ "tokenizer", "", \ RSPAMD_LOG_FUNC, \ __VA_ARGS__) #define msg_info_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ "tokenizer", "", \ RSPAMD_LOG_FUNC, \ __VA_ARGS__) #define msg_debug_tokenizer(...) rspamd_conditional_debug_fast(NULL, NULL, \ rspamd_tokenizer_log_id, "tokenizer", "", \ RSPAMD_LOG_FUNC, \ __VA_ARGS__) INIT_LOG_MODULE(tokenizer) static void rspamd_custom_tokenizer_dtor(gpointer p) { struct rspamd_custom_tokenizer *tok = p; if (tok) { if (tok->api && tok->api->deinit) { tok->api->deinit(); } if (tok->handle) { dlclose(tok->handle); } if (tok->config) { ucl_object_unref(tok->config); } g_free(tok->name); g_free(tok->path); g_free(tok); } } static int rspamd_custom_tokenizer_priority_cmp(gconstpointer a, gconstpointer b) { const struct rspamd_custom_tokenizer *t1 = *(const struct rspamd_custom_tokenizer **) a; const struct rspamd_custom_tokenizer *t2 = *(const struct rspamd_custom_tokenizer **) b; /* Higher priority first */ if (t1->priority > t2->priority) { return -1; } else if (t1->priority < t2->priority) { return 1; } return 0; } struct rspamd_tokenizer_manager * rspamd_tokenizer_manager_new(rspamd_mempool_t *pool) { struct rspamd_tokenizer_manager *mgr; mgr = rspamd_mempool_alloc0(pool, sizeof(*mgr)); mgr->pool = pool; mgr->tokenizers = g_hash_table_new_full(rspamd_strcase_hash, rspamd_strcase_equal, NULL, rspamd_custom_tokenizer_dtor); mgr->detection_order = g_array_new(FALSE, FALSE, sizeof(struct rspamd_custom_tokenizer *)); mgr->default_threshold = 0.7; /* Default confidence threshold */ rspamd_mempool_add_destructor(pool, (rspamd_mempool_destruct_t) g_hash_table_unref, mgr->tokenizers); rspamd_mempool_add_destructor(pool, (rspamd_mempool_destruct_t) rspamd_array_free_hard, mgr->detection_order); msg_info_tokenizer("created custom tokenizer manager with default confidence threshold %.3f", mgr->default_threshold); return mgr; } void rspamd_tokenizer_manager_destroy(struct rspamd_tokenizer_manager *mgr) { /* Cleanup is handled by memory pool destructors */ } gboolean rspamd_tokenizer_manager_load_tokenizer(struct rspamd_tokenizer_manager *mgr, const char *name, const ucl_object_t *config, GError **err) { struct rspamd_custom_tokenizer *tok; const ucl_object_t *elt; rspamd_tokenizer_get_api_func get_api; const rspamd_custom_tokenizer_api_t *api; void *handle; const char *path; gboolean enabled = TRUE; double priority = 50.0; char error_buf[256]; g_assert(mgr != NULL); g_assert(name != NULL); g_assert(config != NULL); msg_info_tokenizer("starting to load custom tokenizer '%s'", name); /* Check if enabled */ elt = ucl_object_lookup(config, "enabled"); if (elt && ucl_object_type(elt) == UCL_BOOLEAN) { enabled = ucl_object_toboolean(elt); } if (!enabled) { msg_info_tokenizer("custom tokenizer '%s' is disabled", name); return TRUE; } /* Get path */ elt = ucl_object_lookup(config, "path"); if (!elt || ucl_object_type(elt) != UCL_STRING) { g_set_error(err, g_quark_from_static_string("tokenizer"), EINVAL, "missing 'path' for tokenizer %s", name); return FALSE; } path = ucl_object_tostring(elt); msg_info_tokenizer("custom tokenizer '%s' will be loaded from path: %s", name, path); /* Get priority */ elt = ucl_object_lookup(config, "priority"); if (elt) { priority = ucl_object_todouble(elt); } msg_info_tokenizer("custom tokenizer '%s' priority set to %.1f", name, priority); /* Load the shared library */ msg_info_tokenizer("loading shared library for custom tokenizer '%s'", name); handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); if (!handle) { g_set_error(err, g_quark_from_static_string("tokenizer"), EINVAL, "cannot load tokenizer %s from %s: %s", name, path, dlerror()); return FALSE; } msg_info_tokenizer("successfully loaded shared library for custom tokenizer '%s'", name); /* Get the API entry point */ msg_info_tokenizer("looking up API entry point for custom tokenizer '%s'", name); get_api = (rspamd_tokenizer_get_api_func) dlsym(handle, "rspamd_tokenizer_get_api"); if (!get_api) { dlclose(handle); g_set_error(err, g_quark_from_static_string("tokenizer"), EINVAL, "cannot find entry point in %s: %s", path, dlerror()); return FALSE; } /* Get the API */ msg_info_tokenizer("calling API entry point for custom tokenizer '%s'", name); api = get_api(); if (!api) { dlclose(handle); g_set_error(err, g_quark_from_static_string("tokenizer"), EINVAL, "tokenizer %s returned NULL API", name); return FALSE; } msg_info_tokenizer("successfully obtained API from custom tokenizer '%s'", name); /* Check API version */ msg_info_tokenizer("checking API version for custom tokenizer '%s' (got %u, expected %u)", name, api->api_version, RSPAMD_CUSTOM_TOKENIZER_API_VERSION); if (api->api_version != RSPAMD_CUSTOM_TOKENIZER_API_VERSION) { dlclose(handle); g_set_error(err, g_quark_from_static_string("tokenizer"), EINVAL, "tokenizer %s has incompatible API version %u (expected %u)", name, api->api_version, RSPAMD_CUSTOM_TOKENIZER_API_VERSION); return FALSE; } /* Create tokenizer instance */ tok = g_malloc0(sizeof(*tok)); tok->name = g_strdup(name); tok->path = g_strdup(path); tok->handle = handle; tok->api = api; tok->priority = priority; tok->enabled = enabled; /* Get tokenizer config */ elt = ucl_object_lookup(config, "config"); if (elt) { tok->config = ucl_object_ref(elt); } /* Get minimum confidence */ if (api->get_min_confidence) { tok->min_confidence = api->get_min_confidence(); msg_info_tokenizer("custom tokenizer '%s' provides minimum confidence threshold: %.3f", name, tok->min_confidence); } else { tok->min_confidence = mgr->default_threshold; msg_info_tokenizer("custom tokenizer '%s' using default confidence threshold: %.3f", name, tok->min_confidence); } /* Initialize the tokenizer */ if (api->init) { msg_info_tokenizer("initializing custom tokenizer '%s'", name); error_buf[0] = '\0'; if (api->init(tok->config, error_buf, sizeof(error_buf)) != 0) { g_set_error(err, g_quark_from_static_string("tokenizer"), EINVAL, "failed to initialize tokenizer %s: %s", name, error_buf[0] ? error_buf : "unknown error"); rspamd_custom_tokenizer_dtor(tok); return FALSE; } msg_info_tokenizer("successfully initialized custom tokenizer '%s'", name); } else { msg_info_tokenizer("custom tokenizer '%s' does not require initialization", name); } /* Add to manager */ g_hash_table_insert(mgr->tokenizers, tok->name, tok); g_array_append_val(mgr->detection_order, tok); /* Re-sort by priority */ g_array_sort(mgr->detection_order, rspamd_custom_tokenizer_priority_cmp); msg_info_tokenizer("custom tokenizer '%s' registered and sorted by priority (total tokenizers: %u)", name, mgr->detection_order->len); msg_info_tokenizer("successfully loaded custom tokenizer '%s' (priority %.1f) from %s", name, priority, path); return TRUE; } struct rspamd_custom_tokenizer * rspamd_tokenizer_manager_detect(struct rspamd_tokenizer_manager *mgr, const char *text, size_t len, double *confidence, const char *lang_hint, const char **detected_lang_hint) { struct rspamd_custom_tokenizer *tok, *best_tok = NULL; double conf, best_conf = 0.0; unsigned int i; g_assert(mgr != NULL); g_assert(text != NULL); msg_debug_tokenizer("starting tokenizer detection for text of length %zu", len); if (confidence) { *confidence = 0.0; } if (detected_lang_hint) { *detected_lang_hint = NULL; } /* If we have a language hint, try to find a tokenizer for that language first */ if (lang_hint) { msg_info_tokenizer("trying to find tokenizer for language hint: %s", lang_hint); for (i = 0; i < mgr->detection_order->len; i++) { tok = g_array_index(mgr->detection_order, struct rspamd_custom_tokenizer *, i); if (!tok->enabled || !tok->api->get_language_hint) { continue; } /* Check if this tokenizer handles the hinted language */ const char *tok_lang = tok->api->get_language_hint(); if (tok_lang && g_ascii_strcasecmp(tok_lang, lang_hint) == 0) { msg_info_tokenizer("found tokenizer '%s' for language hint '%s'", tok->name, lang_hint); /* Found a tokenizer for this language, check if it actually detects it */ if (tok->api->detect_language) { conf = tok->api->detect_language(text, len); msg_info_tokenizer("tokenizer '%s' confidence for hinted language: %.3f (threshold: %.3f)", tok->name, conf, tok->min_confidence); if (conf >= tok->min_confidence) { /* Use this tokenizer */ msg_info_tokenizer("using tokenizer '%s' for language hint '%s' with confidence %.3f", tok->name, lang_hint, conf); if (confidence) { *confidence = conf; } if (detected_lang_hint) { *detected_lang_hint = tok_lang; } return tok; } } } } msg_info_tokenizer("no suitable tokenizer found for language hint '%s', falling back to general detection", lang_hint); } /* Try each tokenizer in priority order */ msg_info_tokenizer("trying %u tokenizers for general detection", mgr->detection_order->len); for (i = 0; i < mgr->detection_order->len; i++) { tok = g_array_index(mgr->detection_order, struct rspamd_custom_tokenizer *, i); if (!tok->enabled || !tok->api->detect_language) { msg_debug_tokenizer("skipping tokenizer '%s' (enabled: %s, has detect_language: %s)", tok->name, tok->enabled ? "yes" : "no", tok->api->detect_language ? "yes" : "no"); continue; } conf = tok->api->detect_language(text, len); msg_info_tokenizer("tokenizer '%s' detection confidence: %.3f (threshold: %.3f, current best: %.3f)", tok->name, conf, tok->min_confidence, best_conf); if (conf > best_conf && conf >= tok->min_confidence) { best_conf = conf; best_tok = tok; msg_info_tokenizer("tokenizer '%s' is new best with confidence %.3f", tok->name, best_conf); /* Early exit if very confident */ if (conf >= 0.95) { msg_info_tokenizer("very high confidence (%.3f >= 0.95), using tokenizer '%s' immediately", conf, tok->name); break; } } } if (best_tok) { msg_info_tokenizer("selected tokenizer '%s' with confidence %.3f", best_tok->name, best_conf); if (confidence) { *confidence = best_conf; } if (detected_lang_hint && best_tok->api->get_language_hint) { *detected_lang_hint = best_tok->api->get_language_hint(); msg_info_tokenizer("detected language hint: %s", *detected_lang_hint); } } else { msg_info_tokenizer("no suitable tokenizer found during detection"); } return best_tok; } /* Helper function to tokenize with a custom tokenizer handling exceptions */ rspamd_tokenizer_result_t * rspamd_custom_tokenizer_tokenize_with_exceptions( struct rspamd_custom_tokenizer *tokenizer, const char *text, gsize len, GList *exceptions, rspamd_mempool_t *pool) { rspamd_tokenizer_result_t *words; rspamd_tokenizer_result_t result; struct rspamd_process_exception *ex; GList *cur_ex = exceptions; gsize pos = 0; unsigned int i; int ret; /* Allocate result kvec in pool */ words = rspamd_mempool_alloc(pool, sizeof(*words)); kv_init(*words); /* If no exceptions, tokenize the whole text */ if (!exceptions) { kv_init(result); ret = tokenizer->api->tokenize(text, len, &result); if (ret == 0 && result.a) { /* Copy tokens from result to output */ for (i = 0; i < kv_size(result); i++) { rspamd_word_t tok = kv_A(result, i); kv_push(rspamd_word_t, *words, tok); } /* Use tokenizer's cleanup function */ if (tokenizer->api->cleanup_result) { tokenizer->api->cleanup_result(&result); } } return words; } /* Process text with exceptions */ while (pos < len && cur_ex) { ex = (struct rspamd_process_exception *) cur_ex->data; /* Tokenize text before exception */ if (ex->pos > pos) { gsize segment_len = ex->pos - pos; kv_init(result); ret = tokenizer->api->tokenize(text + pos, segment_len, &result); if (ret == 0 && result.a) { /* Copy tokens from result, adjusting positions for segment offset */ for (i = 0; i < kv_size(result); i++) { rspamd_word_t tok = kv_A(result, i); /* Adjust pointers to point to the original text */ gsize offset_in_segment = tok.original.begin - (text + pos); if (offset_in_segment < segment_len) { tok.original.begin = text + pos + offset_in_segment; /* Ensure we don't go past the exception boundary */ if (tok.original.begin + tok.original.len <= text + ex->pos) { kv_push(rspamd_word_t, *words, tok); } } } /* Use tokenizer's cleanup function */ if (tokenizer->api->cleanup_result) { tokenizer->api->cleanup_result(&result); } } } /* Add exception as a special token */ rspamd_word_t ex_tok; memset(&ex_tok, 0, sizeof(ex_tok)); if (ex->type == RSPAMD_EXCEPTION_URL) { ex_tok.original.begin = "!!EX!!"; ex_tok.original.len = 6; } else { ex_tok.original.begin = text + ex->pos; ex_tok.original.len = ex->len; } ex_tok.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; kv_push(rspamd_word_t, *words, ex_tok); /* Move past exception */ pos = ex->pos + ex->len; cur_ex = g_list_next(cur_ex); } /* Process remaining text after last exception */ if (pos < len) { kv_init(result); ret = tokenizer->api->tokenize(text + pos, len - pos, &result); if (ret == 0 && result.a) { /* Copy tokens from result, adjusting positions for segment offset */ for (i = 0; i < kv_size(result); i++) { rspamd_word_t tok = kv_A(result, i); /* Adjust pointers to point to the original text */ gsize offset_in_segment = tok.original.begin - (text + pos); if (offset_in_segment < (len - pos)) { tok.original.begin = text + pos + offset_in_segment; kv_push(rspamd_word_t, *words, tok); } } /* Use tokenizer's cleanup function */ if (tokenizer->api->cleanup_result) { tokenizer->api->cleanup_result(&result); } } } return words; }