Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r--  src/libstat/tokenizers/custom_tokenizer.h        177
-rw-r--r--  src/libstat/tokenizers/osb.c                       9
-rw-r--r--  src/libstat/tokenizers/rspamd_tokenizer_types.h   89
-rw-r--r--  src/libstat/tokenizers/tokenizer_manager.c       500
-rw-r--r--  src/libstat/tokenizers/tokenizers.c              202
-rw-r--r--  src/libstat/tokenizers/tokenizers.h               33
6 files changed, 928 insertions, 82 deletions
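
Editor's note: the new custom_tokenizer.h below defines a plain-C plugin boundary; a shared object only has to export rspamd_tokenizer_get_api() and fill in the function table. The following is a minimal illustrative sketch of such a plugin, not part of this commit: the "latin_simple" name, the my_* helpers and the use of malloc/free as the plugin-side allocator are assumptions made for the example.

/* my_tokenizer.c - hypothetical example plugin, built as a shared object */
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "custom_tokenizer.h" /* pulls in rspamd_tokenizer_types.h for plugins */

static int
my_init(const ucl_object_t *config, char *error_buf, size_t error_buf_size)
{
    (void) config; (void) error_buf; (void) error_buf_size;
    return 0; /* nothing to initialise in this toy plugin */
}

static void
my_deinit(void)
{
    /* nothing to release */
}

static double
my_detect_language(const char *text, size_t len)
{
    size_t ascii = 0;

    if (len == 0) {
        return -1.0; /* cannot handle empty input */
    }
    for (size_t i = 0; i < len; i++) {
        if ((unsigned char) text[i] < 0x80) {
            ascii++;
        }
    }
    /* Confidence is simply the ASCII fraction in this toy heuristic */
    return (double) ascii / (double) len;
}

static int
my_tokenize(const char *text, size_t len, rspamd_tokenizer_result_t *result)
{
    size_t i = 0, cap = 16, n = 0;
    rspamd_word_t *words = malloc(cap * sizeof(*words));

    if (words == NULL) {
        return -1;
    }

    while (i < len) {
        while (i < len && isspace((unsigned char) text[i])) {
            i++;
        }
        size_t start = i;
        while (i < len && !isspace((unsigned char) text[i])) {
            i++;
        }
        if (i > start) {
            if (n == cap) {
                rspamd_word_t *tmp = realloc(words, cap * 2 * sizeof(*words));
                if (tmp == NULL) {
                    free(words);
                    return -1;
                }
                words = tmp;
                cap *= 2;
            }
            memset(&words[n], 0, sizeof(words[n]));
            words[n].original.begin = text + start; /* points into the caller's buffer */
            words[n].original.len = i - start;
            words[n].flags = RSPAMD_WORD_FLAG_TEXT | RSPAMD_WORD_FLAG_UTF;
            n++;
        }
    }

    /* Fill the kvec fields; the backing array is owned by this plugin */
    result->a = words;
    result->n = n;
    result->m = cap;

    return 0;
}

static void
my_cleanup_result(rspamd_tokenizer_result_t *result)
{
    /* Free with the same allocator that my_tokenize() used */
    free(result->a);
    result->a = NULL;
    result->n = result->m = 0;
}

static const rspamd_custom_tokenizer_api_t my_api = {
    .api_version = RSPAMD_CUSTOM_TOKENIZER_API_VERSION,
    .name = "latin_simple",
    .init = my_init,
    .deinit = my_deinit,
    .detect_language = my_detect_language,
    .tokenize = my_tokenize,
    .cleanup_result = my_cleanup_result,
    .get_language_hint = NULL,   /* optional callbacks may be left NULL */
    .get_min_confidence = NULL,
};

const rspamd_custom_tokenizer_api_t *
rspamd_tokenizer_get_api(void)
{
    return &my_api;
}
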
diff --git a/src/libstat/tokenizers/custom_tokenizer.h b/src/libstat/tokenizers/custom_tokenizer.h
new file mode 100644
index 000000000..bc173a1da
--- /dev/null
+++ b/src/libstat/tokenizers/custom_tokenizer.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_CUSTOM_TOKENIZER_H
+#define RSPAMD_CUSTOM_TOKENIZER_H
+
+/* Check if we're being included by internal Rspamd code or external plugins */
+#ifdef RSPAMD_TOKENIZER_INTERNAL
+/* Internal Rspamd usage - use the full headers */
+#include "config.h"
+#include "ucl.h"
+#include "libserver/word.h"
+#else
+/* External plugin usage - use standalone types */
+#include "rspamd_tokenizer_types.h"
+/* Forward declaration for UCL object - plugins should include ucl.h if needed */
+typedef struct ucl_object_s ucl_object_t;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RSPAMD_CUSTOM_TOKENIZER_API_VERSION 1
+
+/**
+ * Tokenization result - compatible with both internal and external usage
+ */
+typedef rspamd_words_t rspamd_tokenizer_result_t;
+
+/**
+ * Custom tokenizer API that must be implemented by language-specific tokenizer plugins
+ * All functions use only plain C types to ensure clean boundaries
+ */
+typedef struct rspamd_custom_tokenizer_api {
+    /* API version for compatibility checking */
+    unsigned int api_version;
+
+    /* Name of the tokenizer (e.g., "japanese_mecab") */
+    const char *name;
+
+    /**
+     * Global initialization function called once when the tokenizer is loaded
+     * @param config UCL configuration object for this tokenizer (may be NULL)
+     * @param error_buf Buffer for error message (at least 256 bytes)
+     * @return 0 on success, non-zero on failure
+     */
+    int (*init)(const ucl_object_t *config, char *error_buf, size_t error_buf_size);
+
+    /**
+     * Global cleanup function called when the tokenizer is unloaded
+     */
+    void (*deinit)(void);
+
+    /**
+     * Quick language detection to check if this tokenizer can handle the text
+     * @param text UTF-8 text to analyze
+     * @param len Length of the text in bytes
+     * @return Confidence score 0.0-1.0, or -1.0 if cannot handle
+     */
+    double (*detect_language)(const char *text, size_t len);
+
+    /**
+     * Main tokenization function
+     * @param text UTF-8 text to tokenize
+     * @param len Length of the text in bytes
+     * @param result Output kvec to fill with rspamd_word_t elements
+     * @return 0 on success, non-zero on failure
+     *
+     * The tokenizer should allocate result->a using its own allocator
+     * Rspamd will call cleanup_result() to free it after processing
+     */
+    int (*tokenize)(const char *text, size_t len,
+                    rspamd_tokenizer_result_t *result);
+
+    /**
+     * Cleanup the result from tokenize()
+     * @param result Result kvec returned by tokenize()
+     *
+     * This function should free result->a using the same allocator
+     * that was used in tokenize() and reset the kvec fields.
+     * This ensures proper memory management across DLL boundaries.
+     * Note: This does NOT free the result structure itself, only its contents.
+     */
+    void (*cleanup_result)(rspamd_tokenizer_result_t *result);
+
+    /**
+     * Optional: Get language hint for better language detection
+     * @return Language code (e.g., "ja", "zh") or NULL
+     */
+    const char *(*get_language_hint)(void);
+
+    /**
+     * Optional: Get minimum confidence threshold for this tokenizer
+     * @return Minimum confidence (0.0-1.0) or -1.0 to use default
+     */
+    double (*get_min_confidence)(void);
+
+} rspamd_custom_tokenizer_api_t;
+
+/**
+ * Entry point function that plugins must export
+ * Must be named "rspamd_tokenizer_get_api"
+ */
+typedef const rspamd_custom_tokenizer_api_t *(*rspamd_tokenizer_get_api_func)(void);
+
+/* Internal Rspamd structures - not exposed to plugins */
+#ifdef RSPAMD_TOKENIZER_INTERNAL
+
+/**
+ * Custom tokenizer instance
+ */
+struct rspamd_custom_tokenizer {
+    char *name;                               /* Tokenizer name from config */
+    char *path;                               /* Path to .so file */
+    void *handle;                             /* dlopen handle */
+    const rspamd_custom_tokenizer_api_t *api; /* API functions */
+    double priority;                          /* Detection priority */
+    double min_confidence;                    /* Minimum confidence threshold */
+    gboolean enabled;                         /* Is tokenizer enabled */
+    ucl_object_t *config;                     /* Tokenizer-specific config */
+};
+
+/**
+ * Tokenizer manager structure
+ */
+struct rspamd_tokenizer_manager {
+    GHashTable *tokenizers;   /* name -> rspamd_custom_tokenizer */
+    GArray *detection_order;  /* Ordered by priority */
+    rspamd_mempool_t *pool;
+    double default_threshold; /* Default confidence threshold */
+};
+
+/* Manager functions */
+struct rspamd_tokenizer_manager *rspamd_tokenizer_manager_new(rspamd_mempool_t *pool);
+void rspamd_tokenizer_manager_destroy(struct rspamd_tokenizer_manager *mgr);
+
+gboolean rspamd_tokenizer_manager_load_tokenizer(struct rspamd_tokenizer_manager *mgr,
+                                                 const char *name,
+                                                 const ucl_object_t *config,
+                                                 GError **err);
+
+struct rspamd_custom_tokenizer *rspamd_tokenizer_manager_detect(
+    struct rspamd_tokenizer_manager *mgr,
+    const char *text, size_t len,
+    double *confidence,
+    const char *lang_hint,
+    const char **detected_lang_hint);
+
+/* Helper function to tokenize with exceptions handling */
+rspamd_tokenizer_result_t *rspamd_custom_tokenizer_tokenize_with_exceptions(
+    struct rspamd_custom_tokenizer *tokenizer,
+    const char *text,
+    gsize len,
+    GList *exceptions,
+    rspamd_mempool_t *pool);
+
+#endif /* RSPAMD_TOKENIZER_INTERNAL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RSPAMD_CUSTOM_TOKENIZER_H */
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index 0bc3414a5..360c71d36 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -21,6 +21,7 @@
 #include "tokenizers.h"
 #include "stat_internal.h"
 #include "libmime/lang_detection.h"
+#include "libserver/word.h"
 
 /* Size for features pipe */
 #define DEFAULT_FEATURE_WINDOW_SIZE 2
@@ -268,7 +269,7 @@ struct token_pipe_entry {
 
 int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
                          struct rspamd_task *task,
-                         GArray *words,
+                         rspamd_words_t *words,
                          gboolean is_utf,
                          const char *prefix,
                          GPtrArray *result)
@@ -282,7 +283,7 @@ int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
     gsize token_size;
     unsigned int processed = 0, i, w, window_size, token_flags = 0;
 
-    if (words == NULL) {
+    if (words == NULL || !words->a) {
         return FALSE;
     }
 
@@ -306,8 +307,8 @@ int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
                  sizeof(RSPAMD_TOKEN_VALUE_TYPE) * ctx->statfiles->len;
     g_assert(token_size > 0);
 
-    for (w = 0; w < words->len; w++) {
-        token = &g_array_index(words, rspamd_stat_token_t, w);
+    for (w = 0; w < kv_size(*words); w++) {
+        token = &kv_A(*words, w);
         token_flags = token->flags;
         const char *begin;
         gsize len;
diff --git a/src/libstat/tokenizers/rspamd_tokenizer_types.h b/src/libstat/tokenizers/rspamd_tokenizer_types.h
new file mode 100644
index 000000000..eb8518290
--- /dev/null
+++ b/src/libstat/tokenizers/rspamd_tokenizer_types.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_TOKENIZER_TYPES_H
+#define RSPAMD_TOKENIZER_TYPES_H
+
+/*
+ * Standalone type definitions for custom tokenizers
+ * This header is completely self-contained and does not depend on any external libraries.
+ * Custom tokenizers should include only this header to get access to all necessary types.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Basic string token structure
+ */
+typedef struct rspamd_ftok {
+    size_t len;
+    const char *begin;
+} rspamd_ftok_t;
+
+/**
+ * Unicode string token structure
+ */
+typedef struct rspamd_ftok_unicode {
+    size_t len;
+    const uint32_t *begin;
+} rspamd_ftok_unicode_t;
+
+/* Word flags */
+#define RSPAMD_WORD_FLAG_TEXT (1u << 0u)
+#define RSPAMD_WORD_FLAG_META (1u << 1u)
+#define RSPAMD_WORD_FLAG_LUA_META (1u << 2u)
+#define RSPAMD_WORD_FLAG_EXCEPTION (1u << 3u)
+#define RSPAMD_WORD_FLAG_HEADER (1u << 4u)
+#define RSPAMD_WORD_FLAG_UNIGRAM (1u << 5u)
+#define RSPAMD_WORD_FLAG_UTF (1u << 6u)
+#define RSPAMD_WORD_FLAG_NORMALISED (1u << 7u)
+#define RSPAMD_WORD_FLAG_STEMMED (1u << 8u)
+#define RSPAMD_WORD_FLAG_BROKEN_UNICODE (1u << 9u)
+#define RSPAMD_WORD_FLAG_STOP_WORD (1u << 10u)
+#define RSPAMD_WORD_FLAG_SKIPPED (1u << 11u)
+#define RSPAMD_WORD_FLAG_INVISIBLE_SPACES (1u << 12u)
+#define RSPAMD_WORD_FLAG_EMOJI (1u << 13u)
+
+/**
+ * Word structure
+ */
+typedef struct rspamd_word {
+    rspamd_ftok_t original;
+    rspamd_ftok_unicode_t unicode;
+    rspamd_ftok_t normalized;
+    rspamd_ftok_t stemmed;
+    unsigned int flags;
+} rspamd_word_t;
+
+/**
+ * Array of words
+ */
+typedef struct rspamd_words {
+    rspamd_word_t *a;
+    size_t n;
+    size_t m;
+} rspamd_words_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RSPAMD_TOKENIZER_TYPES_H */
diff --git a/src/libstat/tokenizers/tokenizer_manager.c b/src/libstat/tokenizers/tokenizer_manager.c
new file mode 100644
index 000000000..e6fb5e8d8
--- /dev/null
+++ b/src/libstat/tokenizers/tokenizer_manager.c
@@ -0,0 +1,500 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "tokenizers.h" +#define RSPAMD_TOKENIZER_INTERNAL +#include "custom_tokenizer.h" +#include "libutil/util.h" +#include "libserver/logger.h" +#include <dlfcn.h> + +#define msg_err_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "tokenizer", "", \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "tokenizer", "", \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "tokenizer", "", \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_tokenizer(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_tokenizer_log_id, "tokenizer", "", \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(tokenizer) + +static void +rspamd_custom_tokenizer_dtor(gpointer p) +{ + struct rspamd_custom_tokenizer *tok = p; + + if (tok) { + if (tok->api && tok->api->deinit) { + tok->api->deinit(); + } + + if (tok->handle) { + dlclose(tok->handle); + } + + if (tok->config) { + ucl_object_unref(tok->config); + } + + g_free(tok->name); + g_free(tok->path); + g_free(tok); + } +} + +static int +rspamd_custom_tokenizer_priority_cmp(gconstpointer a, gconstpointer b) +{ + const struct rspamd_custom_tokenizer *t1 = *(const struct rspamd_custom_tokenizer **) a; + const struct rspamd_custom_tokenizer *t2 = *(const struct rspamd_custom_tokenizer **) b; + + /* Higher priority first */ + if (t1->priority > t2->priority) { + return -1; + } + else if (t1->priority < t2->priority) { + return 1; + } + + return 0; +} + +struct rspamd_tokenizer_manager * +rspamd_tokenizer_manager_new(rspamd_mempool_t *pool) +{ + struct rspamd_tokenizer_manager *mgr; + + mgr = rspamd_mempool_alloc0(pool, sizeof(*mgr)); + mgr->pool = pool; + mgr->tokenizers = g_hash_table_new_full(rspamd_strcase_hash, + rspamd_strcase_equal, + NULL, + rspamd_custom_tokenizer_dtor); + mgr->detection_order = g_array_new(FALSE, FALSE, sizeof(struct rspamd_custom_tokenizer *)); + mgr->default_threshold = 0.7; /* Default confidence threshold */ + + rspamd_mempool_add_destructor(pool, + (rspamd_mempool_destruct_t) g_hash_table_unref, + mgr->tokenizers); + rspamd_mempool_add_destructor(pool, + (rspamd_mempool_destruct_t) rspamd_array_free_hard, + mgr->detection_order); + + msg_info_tokenizer("created custom tokenizer manager with default confidence threshold %.3f", + mgr->default_threshold); + + return mgr; +} + +void rspamd_tokenizer_manager_destroy(struct rspamd_tokenizer_manager *mgr) +{ + /* Cleanup is handled by memory pool destructors */ +} + +gboolean +rspamd_tokenizer_manager_load_tokenizer(struct rspamd_tokenizer_manager *mgr, + const char *name, + const ucl_object_t *config, + GError **err) +{ + struct rspamd_custom_tokenizer *tok; + const ucl_object_t *elt; + rspamd_tokenizer_get_api_func get_api; + const rspamd_custom_tokenizer_api_t *api; + void *handle; + const char *path; + gboolean enabled = TRUE; + double priority = 50.0; + char error_buf[256]; + + g_assert(mgr != NULL); + g_assert(name != NULL); + g_assert(config != NULL); + + msg_info_tokenizer("starting to load custom tokenizer '%s'", name); + + /* Check if enabled */ + elt = ucl_object_lookup(config, "enabled"); + if (elt && ucl_object_type(elt) == UCL_BOOLEAN) { + enabled = ucl_object_toboolean(elt); + } + + if (!enabled) { + msg_info_tokenizer("custom tokenizer '%s' is disabled", name); + return 
TRUE; + } + + /* Get path */ + elt = ucl_object_lookup(config, "path"); + if (!elt || ucl_object_type(elt) != UCL_STRING) { + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "missing 'path' for tokenizer %s", name); + return FALSE; + } + path = ucl_object_tostring(elt); + msg_info_tokenizer("custom tokenizer '%s' will be loaded from path: %s", name, path); + + /* Get priority */ + elt = ucl_object_lookup(config, "priority"); + if (elt) { + priority = ucl_object_todouble(elt); + } + msg_info_tokenizer("custom tokenizer '%s' priority set to %.1f", name, priority); + + /* Load the shared library */ + msg_info_tokenizer("loading shared library for custom tokenizer '%s'", name); + handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); + if (!handle) { + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "cannot load tokenizer %s from %s: %s", + name, path, dlerror()); + return FALSE; + } + msg_info_tokenizer("successfully loaded shared library for custom tokenizer '%s'", name); + + /* Get the API entry point */ + msg_info_tokenizer("looking up API entry point for custom tokenizer '%s'", name); + get_api = (rspamd_tokenizer_get_api_func) dlsym(handle, "rspamd_tokenizer_get_api"); + if (!get_api) { + dlclose(handle); + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "cannot find entry point in %s: %s", + path, dlerror()); + return FALSE; + } + + /* Get the API */ + msg_info_tokenizer("calling API entry point for custom tokenizer '%s'", name); + api = get_api(); + if (!api) { + dlclose(handle); + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "tokenizer %s returned NULL API", name); + return FALSE; + } + msg_info_tokenizer("successfully obtained API from custom tokenizer '%s'", name); + + /* Check API version */ + msg_info_tokenizer("checking API version for custom tokenizer '%s' (got %u, expected %u)", + name, api->api_version, RSPAMD_CUSTOM_TOKENIZER_API_VERSION); + if (api->api_version != RSPAMD_CUSTOM_TOKENIZER_API_VERSION) { + dlclose(handle); + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "tokenizer %s has incompatible API version %u (expected %u)", + name, api->api_version, RSPAMD_CUSTOM_TOKENIZER_API_VERSION); + return FALSE; + } + + /* Create tokenizer instance */ + tok = g_malloc0(sizeof(*tok)); + tok->name = g_strdup(name); + tok->path = g_strdup(path); + tok->handle = handle; + tok->api = api; + tok->priority = priority; + tok->enabled = enabled; + + /* Get tokenizer config */ + elt = ucl_object_lookup(config, "config"); + if (elt) { + tok->config = ucl_object_ref(elt); + } + + /* Get minimum confidence */ + if (api->get_min_confidence) { + tok->min_confidence = api->get_min_confidence(); + msg_info_tokenizer("custom tokenizer '%s' provides minimum confidence threshold: %.3f", + name, tok->min_confidence); + } + else { + tok->min_confidence = mgr->default_threshold; + msg_info_tokenizer("custom tokenizer '%s' using default confidence threshold: %.3f", + name, tok->min_confidence); + } + + /* Initialize the tokenizer */ + if (api->init) { + msg_info_tokenizer("initializing custom tokenizer '%s'", name); + error_buf[0] = '\0'; + if (api->init(tok->config, error_buf, sizeof(error_buf)) != 0) { + g_set_error(err, g_quark_from_static_string("tokenizer"), + EINVAL, "failed to initialize tokenizer %s: %s", + name, error_buf[0] ? 
error_buf : "unknown error"); + rspamd_custom_tokenizer_dtor(tok); + return FALSE; + } + msg_info_tokenizer("successfully initialized custom tokenizer '%s'", name); + } + else { + msg_info_tokenizer("custom tokenizer '%s' does not require initialization", name); + } + + /* Add to manager */ + g_hash_table_insert(mgr->tokenizers, tok->name, tok); + g_array_append_val(mgr->detection_order, tok); + + /* Re-sort by priority */ + g_array_sort(mgr->detection_order, rspamd_custom_tokenizer_priority_cmp); + msg_info_tokenizer("custom tokenizer '%s' registered and sorted by priority (total tokenizers: %u)", + name, mgr->detection_order->len); + + msg_info_tokenizer("successfully loaded custom tokenizer '%s' (priority %.1f) from %s", + name, priority, path); + + return TRUE; +} + +struct rspamd_custom_tokenizer * +rspamd_tokenizer_manager_detect(struct rspamd_tokenizer_manager *mgr, + const char *text, size_t len, + double *confidence, + const char *lang_hint, + const char **detected_lang_hint) +{ + struct rspamd_custom_tokenizer *tok, *best_tok = NULL; + double conf, best_conf = 0.0; + unsigned int i; + + g_assert(mgr != NULL); + g_assert(text != NULL); + + msg_debug_tokenizer("starting tokenizer detection for text of length %zu", len); + + if (confidence) { + *confidence = 0.0; + } + + if (detected_lang_hint) { + *detected_lang_hint = NULL; + } + + /* If we have a language hint, try to find a tokenizer for that language first */ + if (lang_hint) { + msg_info_tokenizer("trying to find tokenizer for language hint: %s", lang_hint); + for (i = 0; i < mgr->detection_order->len; i++) { + tok = g_array_index(mgr->detection_order, struct rspamd_custom_tokenizer *, i); + + if (!tok->enabled || !tok->api->get_language_hint) { + continue; + } + + /* Check if this tokenizer handles the hinted language */ + const char *tok_lang = tok->api->get_language_hint(); + if (tok_lang && g_ascii_strcasecmp(tok_lang, lang_hint) == 0) { + msg_info_tokenizer("found tokenizer '%s' for language hint '%s'", tok->name, lang_hint); + /* Found a tokenizer for this language, check if it actually detects it */ + if (tok->api->detect_language) { + conf = tok->api->detect_language(text, len); + msg_info_tokenizer("tokenizer '%s' confidence for hinted language: %.3f (threshold: %.3f)", + tok->name, conf, tok->min_confidence); + if (conf >= tok->min_confidence) { + /* Use this tokenizer */ + msg_info_tokenizer("using tokenizer '%s' for language hint '%s' with confidence %.3f", + tok->name, lang_hint, conf); + if (confidence) { + *confidence = conf; + } + if (detected_lang_hint) { + *detected_lang_hint = tok_lang; + } + return tok; + } + } + } + } + msg_info_tokenizer("no suitable tokenizer found for language hint '%s', falling back to general detection", lang_hint); + } + + /* Try each tokenizer in priority order */ + msg_info_tokenizer("trying %u tokenizers for general detection", mgr->detection_order->len); + for (i = 0; i < mgr->detection_order->len; i++) { + tok = g_array_index(mgr->detection_order, struct rspamd_custom_tokenizer *, i); + + if (!tok->enabled || !tok->api->detect_language) { + msg_debug_tokenizer("skipping tokenizer '%s' (enabled: %s, has detect_language: %s)", + tok->name, tok->enabled ? "yes" : "no", + tok->api->detect_language ? 
"yes" : "no"); + continue; + } + + conf = tok->api->detect_language(text, len); + msg_info_tokenizer("tokenizer '%s' detection confidence: %.3f (threshold: %.3f, current best: %.3f)", + tok->name, conf, tok->min_confidence, best_conf); + + if (conf > best_conf && conf >= tok->min_confidence) { + best_conf = conf; + best_tok = tok; + msg_info_tokenizer("tokenizer '%s' is new best with confidence %.3f", tok->name, best_conf); + + /* Early exit if very confident */ + if (conf >= 0.95) { + msg_info_tokenizer("very high confidence (%.3f >= 0.95), using tokenizer '%s' immediately", + conf, tok->name); + break; + } + } + } + + if (best_tok) { + msg_info_tokenizer("selected tokenizer '%s' with confidence %.3f", best_tok->name, best_conf); + if (confidence) { + *confidence = best_conf; + } + + if (detected_lang_hint && best_tok->api->get_language_hint) { + *detected_lang_hint = best_tok->api->get_language_hint(); + msg_info_tokenizer("detected language hint: %s", *detected_lang_hint); + } + } + else { + msg_info_tokenizer("no suitable tokenizer found during detection"); + } + + return best_tok; +} + +/* Helper function to tokenize with a custom tokenizer handling exceptions */ +rspamd_tokenizer_result_t * +rspamd_custom_tokenizer_tokenize_with_exceptions( + struct rspamd_custom_tokenizer *tokenizer, + const char *text, + gsize len, + GList *exceptions, + rspamd_mempool_t *pool) +{ + rspamd_tokenizer_result_t *words; + rspamd_tokenizer_result_t result; + struct rspamd_process_exception *ex; + GList *cur_ex = exceptions; + gsize pos = 0; + unsigned int i; + int ret; + + /* Allocate result kvec in pool */ + words = rspamd_mempool_alloc(pool, sizeof(*words)); + kv_init(*words); + + /* If no exceptions, tokenize the whole text */ + if (!exceptions) { + kv_init(result); + + ret = tokenizer->api->tokenize(text, len, &result); + if (ret == 0 && result.a) { + /* Copy tokens from result to output */ + for (i = 0; i < kv_size(result); i++) { + rspamd_word_t tok = kv_A(result, i); + kv_push(rspamd_word_t, *words, tok); + } + + /* Use tokenizer's cleanup function */ + if (tokenizer->api->cleanup_result) { + tokenizer->api->cleanup_result(&result); + } + } + + return words; + } + + /* Process text with exceptions */ + while (pos < len && cur_ex) { + ex = (struct rspamd_process_exception *) cur_ex->data; + + /* Tokenize text before exception */ + if (ex->pos > pos) { + gsize segment_len = ex->pos - pos; + kv_init(result); + + ret = tokenizer->api->tokenize(text + pos, segment_len, &result); + if (ret == 0 && result.a) { + /* Copy tokens from result, adjusting positions for segment offset */ + for (i = 0; i < kv_size(result); i++) { + rspamd_word_t tok = kv_A(result, i); + + /* Adjust pointers to point to the original text */ + gsize offset_in_segment = tok.original.begin - (text + pos); + if (offset_in_segment < segment_len) { + tok.original.begin = text + pos + offset_in_segment; + /* Ensure we don't go past the exception boundary */ + if (tok.original.begin + tok.original.len <= text + ex->pos) { + kv_push(rspamd_word_t, *words, tok); + } + } + } + + /* Use tokenizer's cleanup function */ + if (tokenizer->api->cleanup_result) { + tokenizer->api->cleanup_result(&result); + } + } + } + + /* Add exception as a special token */ + rspamd_word_t ex_tok; + memset(&ex_tok, 0, sizeof(ex_tok)); + + if (ex->type == RSPAMD_EXCEPTION_URL) { + ex_tok.original.begin = "!!EX!!"; + ex_tok.original.len = 6; + } + else { + ex_tok.original.begin = text + ex->pos; + ex_tok.original.len = ex->len; + } + ex_tok.flags = 
RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; + kv_push(rspamd_word_t, *words, ex_tok); + + /* Move past exception */ + pos = ex->pos + ex->len; + cur_ex = g_list_next(cur_ex); + } + + /* Process remaining text after last exception */ + if (pos < len) { + kv_init(result); + + ret = tokenizer->api->tokenize(text + pos, len - pos, &result); + if (ret == 0 && result.a) { + /* Copy tokens from result, adjusting positions for segment offset */ + for (i = 0; i < kv_size(result); i++) { + rspamd_word_t tok = kv_A(result, i); + + /* Adjust pointers to point to the original text */ + gsize offset_in_segment = tok.original.begin - (text + pos); + if (offset_in_segment < (len - pos)) { + tok.original.begin = text + pos + offset_in_segment; + kv_push(rspamd_word_t, *words, tok); + } + } + + /* Use tokenizer's cleanup function */ + if (tokenizer->api->cleanup_result) { + tokenizer->api->cleanup_result(&result); + } + } + } + + return words; +} diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 0ea1bcfc6..8a9f42992 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -1,5 +1,5 @@ /* - * Copyright 2024 Vsevolod Stakhov + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,8 @@ #include "contrib/mumhash/mum.h" #include "libmime/lang_detection.h" #include "libstemmer.h" +#define RSPAMD_TOKENIZER_INTERNAL +#include "custom_tokenizer.h" #include <unicode/utf8.h> #include <unicode/uchar.h> @@ -35,8 +37,8 @@ #include <math.h> -typedef gboolean (*token_get_function)(rspamd_stat_token_t *buf, char const **pos, - rspamd_stat_token_t *token, +typedef gboolean (*token_get_function)(rspamd_word_t *buf, char const **pos, + rspamd_word_t *token, GList **exceptions, gsize *rl, gboolean check_signature); const char t_delimiters[256] = { @@ -69,8 +71,8 @@ const char t_delimiters[256] = { /* Get next word from specified f_str_t buf */ static gboolean -rspamd_tokenizer_get_word_raw(rspamd_stat_token_t *buf, - char const **cur, rspamd_stat_token_t *token, +rspamd_tokenizer_get_word_raw(rspamd_word_t *buf, + char const **cur, rspamd_word_t *token, GList **exceptions, gsize *rl, gboolean unused) { gsize remain, pos; @@ -164,7 +166,7 @@ rspamd_tokenize_check_limit(gboolean decay, unsigned int nwords, uint64_t *hv, uint64_t *prob, - const rspamd_stat_token_t *token, + const rspamd_word_t *token, gssize remain, gssize total) { @@ -242,9 +244,9 @@ rspamd_utf_word_valid(const unsigned char *text, const unsigned char *end, } while (0) static inline void -rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res) +rspamd_tokenize_exception(struct rspamd_process_exception *ex, rspamd_words_t *res) { - rspamd_stat_token_t token; + rspamd_word_t token; memset(&token, 0, sizeof(token)); @@ -253,7 +255,7 @@ rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res) token.original.len = sizeof("!!EX!!") - 1; token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; - g_array_append_val(res, token); + kv_push_safe(rspamd_word_t, *res, token, exception_error); token.flags = 0; } else if (ex->type == RSPAMD_EXCEPTION_URL) { @@ -271,28 +273,33 @@ rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res) } token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; - g_array_append_val(res, token); + kv_push_safe(rspamd_word_t, *res, token, exception_error); token.flags = 0; } + return; + +exception_error: + /* On 
error, just skip this exception token */ + return; } -GArray * +rspamd_words_t * rspamd_tokenize_text(const char *text, gsize len, const UText *utxt, enum rspamd_tokenize_type how, struct rspamd_config *cfg, GList *exceptions, uint64_t *hash, - GArray *cur_words, + rspamd_words_t *output_kvec, rspamd_mempool_t *pool) { - rspamd_stat_token_t token, buf; + rspamd_word_t token, buf; const char *pos = NULL; gsize l = 0; - GArray *res; + rspamd_words_t *res; GList *cur = exceptions; - unsigned int min_len = 0, max_len = 0, word_decay = 0, initial_size = 128; + unsigned int min_len = 0, max_len = 0, word_decay = 0; uint64_t hv = 0; gboolean decay = FALSE, long_text_mode = FALSE; uint64_t prob = 0; @@ -300,9 +307,12 @@ rspamd_tokenize_text(const char *text, gsize len, static const gsize long_text_limit = 1 * 1024 * 1024; static const ev_tstamp max_exec_time = 0.2; /* 200 ms */ ev_tstamp start; + struct rspamd_custom_tokenizer *custom_tok = NULL; + double custom_confidence = 0.0; + const char *detected_lang = NULL; if (text == NULL) { - return cur_words; + return output_kvec; } if (len > long_text_limit) { @@ -323,15 +333,59 @@ rspamd_tokenize_text(const char *text, gsize len, min_len = cfg->min_word_len; max_len = cfg->max_word_len; word_decay = cfg->words_decay; - initial_size = word_decay * 2; } - if (!cur_words) { - res = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t), - initial_size); + if (!output_kvec) { + res = pool ? rspamd_mempool_alloc0(pool, sizeof(*res)) : g_malloc0(sizeof(*res)); + ; } else { - res = cur_words; + res = output_kvec; + } + + /* Try custom tokenizers first if we're in UTF mode */ + if (cfg && cfg->tokenizer_manager && how == RSPAMD_TOKENIZE_UTF && utxt != NULL) { + custom_tok = rspamd_tokenizer_manager_detect( + cfg->tokenizer_manager, + text, len, + &custom_confidence, + NULL, /* no input language hint */ + &detected_lang); + + if (custom_tok && custom_confidence >= custom_tok->min_confidence) { + /* Use custom tokenizer with exception handling */ + rspamd_tokenizer_result_t *custom_res = rspamd_custom_tokenizer_tokenize_with_exceptions( + custom_tok, text, len, exceptions, pool); + + if (custom_res) { + msg_debug_pool("using custom tokenizer %s (confidence: %.2f) for text tokenization", + custom_tok->name, custom_confidence); + + /* Copy custom tokenizer results to output kvec */ + for (unsigned int i = 0; i < kv_size(*custom_res); i++) { + kv_push_safe(rspamd_word_t, *res, kv_A(*custom_res, i), custom_tokenizer_error); + } + + /* Calculate hash if needed */ + if (hash && kv_size(*res) > 0) { + for (unsigned int i = 0; i < kv_size(*res); i++) { + rspamd_word_t *t = &kv_A(*res, i); + if (t->original.len >= sizeof(uint64_t)) { + uint64_t tmp; + memcpy(&tmp, t->original.begin, sizeof(tmp)); + hv = mum_hash_step(hv, tmp); + } + } + *hash = mum_hash_finish(hv); + } + + return res; + } + else { + msg_warn_pool("custom tokenizer %s failed to tokenize text, falling back to default", + custom_tok->name); + } + } } if (G_UNLIKELY(how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) { @@ -343,7 +397,7 @@ rspamd_tokenize_text(const char *text, gsize len, } if (token.original.len > 0 && - rspamd_tokenize_check_limit(decay, word_decay, res->len, + rspamd_tokenize_check_limit(decay, word_decay, kv_size(*res), &hv, &prob, &token, pos - text, len)) { if (!decay) { decay = TRUE; @@ -355,28 +409,28 @@ rspamd_tokenize_text(const char *text, gsize len, } if (long_text_mode) { - if ((res->len + 1) % 16 == 0) { + if ((kv_size(*res) + 1) % 16 == 0) { ev_tstamp now = ev_time(); if (now - 
start > max_exec_time) { msg_warn_pool_check( "too long time has been spent on tokenization:" - " %.1f ms, limit is %.1f ms; %d words added so far", + " %.1f ms, limit is %.1f ms; %z words added so far", (now - start) * 1e3, max_exec_time * 1e3, - res->len); + kv_size(*res)); goto end; } } } - g_array_append_val(res, token); + kv_push_safe(rspamd_word_t, *res, token, tokenize_error); - if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) { + if (kv_size(*res) * sizeof(token) > (0x1ull << 30u)) { /* Due to bug in glib ! */ msg_err_pool_check( - "too many words found: %d, stop tokenization to avoid DoS", - res->len); + "too many words found: %z, stop tokenization to avoid DoS", + kv_size(*res)); goto end; } @@ -523,7 +577,7 @@ rspamd_tokenize_text(const char *text, gsize len, } if (token.original.len > 0 && - rspamd_tokenize_check_limit(decay, word_decay, res->len, + rspamd_tokenize_check_limit(decay, word_decay, kv_size(*res), &hv, &prob, &token, p, len)) { if (!decay) { decay = TRUE; @@ -536,15 +590,15 @@ rspamd_tokenize_text(const char *text, gsize len, if (token.original.len > 0) { /* Additional check for number of words */ - if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) { + if (kv_size(*res) * sizeof(token) > (0x1ull << 30u)) { /* Due to bug in glib ! */ - msg_err("too many words found: %d, stop tokenization to avoid DoS", - res->len); + msg_err("too many words found: %z, stop tokenization to avoid DoS", + kv_size(*res)); goto end; } - g_array_append_val(res, token); + kv_push_safe(rspamd_word_t, *res, token, tokenize_error); } /* Also check for long text mode */ @@ -552,15 +606,15 @@ rspamd_tokenize_text(const char *text, gsize len, /* Check time each 128 words added */ const int words_check_mask = 0x7F; - if ((res->len & words_check_mask) == words_check_mask) { + if ((kv_size(*res) & words_check_mask) == words_check_mask) { ev_tstamp now = ev_time(); if (now - start > max_exec_time) { msg_warn_pool_check( "too long time has been spent on tokenization:" - " %.1f ms, limit is %.1f ms; %d words added so far", + " %.1f ms, limit is %.1f ms; %z words added so far", (now - start) * 1e3, max_exec_time * 1e3, - res->len); + kv_size(*res)); goto end; } @@ -590,8 +644,14 @@ end: } return res; + +tokenize_error: +custom_tokenizer_error: + msg_err_pool("failed to allocate memory for tokenization"); + return res; } + #undef SHIFT_EX static void @@ -625,32 +685,38 @@ rspamd_add_metawords_from_str(const char *beg, gsize len, #endif } + /* Initialize meta_words kvec if not already done */ + if (!task->meta_words.a) { + kv_init(task->meta_words); + } + if (valid_utf) { utext_openUTF8(&utxt, beg, len, &uc_err); - task->meta_words = rspamd_tokenize_text(beg, len, - &utxt, RSPAMD_TOKENIZE_UTF, - task->cfg, NULL, NULL, - task->meta_words, - task->task_pool); + rspamd_tokenize_text(beg, len, + &utxt, RSPAMD_TOKENIZE_UTF, + task->cfg, NULL, NULL, + &task->meta_words, + task->task_pool); utext_close(&utxt); } else { - task->meta_words = rspamd_tokenize_text(beg, len, - NULL, RSPAMD_TOKENIZE_RAW, - task->cfg, NULL, NULL, task->meta_words, - task->task_pool); + rspamd_tokenize_text(beg, len, + NULL, RSPAMD_TOKENIZE_RAW, + task->cfg, NULL, NULL, + &task->meta_words, + task->task_pool); } } void rspamd_tokenize_meta_words(struct rspamd_task *task) { unsigned int i = 0; - rspamd_stat_token_t *tok; + rspamd_word_t *tok; if (MESSAGE_FIELD(task, subject)) { rspamd_add_metawords_from_str(MESSAGE_FIELD(task, subject), @@ -667,7 +733,7 @@ void rspamd_tokenize_meta_words(struct rspamd_task *task) } } - 
if (task->meta_words != NULL) { + if (task->meta_words.a) { const char *language = NULL; if (MESSAGE_FIELD(task, text_parts) && @@ -680,12 +746,12 @@ void rspamd_tokenize_meta_words(struct rspamd_task *task) } } - rspamd_normalize_words(task->meta_words, task->task_pool); - rspamd_stem_words(task->meta_words, task->task_pool, language, + rspamd_normalize_words(&task->meta_words, task->task_pool); + rspamd_stem_words(&task->meta_words, task->task_pool, language, task->lang_det); - for (i = 0; i < task->meta_words->len; i++) { - tok = &g_array_index(task->meta_words, rspamd_stat_token_t, i); + for (i = 0; i < kv_size(task->meta_words); i++) { + tok = &kv_A(task->meta_words, i); tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER; } } @@ -759,7 +825,7 @@ rspamd_ucs32_to_normalised(rspamd_stat_token_t *tok, tok->normalized.begin = dest; } -void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool) +void rspamd_normalize_single_word(rspamd_word_t *tok, rspamd_mempool_t *pool) { UErrorCode uc_err = U_ZERO_ERROR; UConverter *utf8_converter; @@ -858,25 +924,27 @@ void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *po } } -void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool) + +void rspamd_normalize_words(rspamd_words_t *words, rspamd_mempool_t *pool) { - rspamd_stat_token_t *tok; + rspamd_word_t *tok; unsigned int i; - for (i = 0; i < words->len; i++) { - tok = &g_array_index(words, rspamd_stat_token_t, i); + for (i = 0; i < kv_size(*words); i++) { + tok = &kv_A(*words, i); rspamd_normalize_single_word(tok, pool); } } -void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, + +void rspamd_stem_words(rspamd_words_t *words, rspamd_mempool_t *pool, const char *language, struct rspamd_lang_detector *lang_detector) { static GHashTable *stemmers = NULL; struct sb_stemmer *stem = NULL; unsigned int i; - rspamd_stat_token_t *tok; + rspamd_word_t *tok; char *dest; gsize dlen; @@ -909,8 +977,18 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, stem = NULL; } } - for (i = 0; i < words->len; i++) { - tok = &g_array_index(words, rspamd_stat_token_t, i); + for (i = 0; i < kv_size(*words); i++) { + tok = &kv_A(*words, i); + + /* Skip stemming if token has already been stemmed by custom tokenizer */ + if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_STEMMED) { + /* Already stemmed, just check for stop words */ + if (tok->stemmed.len > 0 && lang_detector != NULL && + rspamd_language_detector_is_stop_word(lang_detector, tok->stemmed.begin, tok->stemmed.len)) { + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD; + } + continue; + } if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { if (stem) { @@ -952,4 +1030,4 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool, } } } -}
\ No newline at end of file
+}
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index d4a8824a8..bb0bb54e2 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include "fstring.h"
 #include "rspamd.h"
 #include "stat_api.h"
+#include "libserver/word.h"
 
 #include <unicode/utext.h>
 
@@ -43,7 +44,7 @@ struct rspamd_stat_tokenizer {
 
     int (*tokenize_func)(struct rspamd_stat_ctx *ctx,
                          struct rspamd_task *task,
-                         GArray *words,
+                         rspamd_words_t *words,
                          gboolean is_utf,
                          const char *prefix,
                          GPtrArray *result);
@@ -59,20 +60,20 @@ enum rspamd_tokenize_type {
 
 int token_node_compare_func(gconstpointer a, gconstpointer b);
 
-/* Tokenize text into array of words (rspamd_stat_token_t type) */
-GArray *rspamd_tokenize_text(const char *text, gsize len,
-                             const UText *utxt,
-                             enum rspamd_tokenize_type how,
-                             struct rspamd_config *cfg,
-                             GList *exceptions,
-                             uint64_t *hash,
-                             GArray *cur_words,
-                             rspamd_mempool_t *pool);
+/* Tokenize text into kvec of words (rspamd_word_t type) */
+rspamd_words_t *rspamd_tokenize_text(const char *text, gsize len,
+                                     const UText *utxt,
+                                     enum rspamd_tokenize_type how,
+                                     struct rspamd_config *cfg,
+                                     GList *exceptions,
+                                     uint64_t *hash,
+                                     rspamd_words_t *output_kvec,
+                                     rspamd_mempool_t *pool);
 
 /* OSB tokenize function */
 int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
                          struct rspamd_task *task,
-                         GArray *words,
+                         rspamd_words_t *words,
                          gboolean is_utf,
                          const char *prefix,
                          GPtrArray *result);
@@ -83,11 +84,11 @@ gpointer rspamd_tokenizer_osb_get_config(rspamd_mempool_t *pool,
 
 struct rspamd_lang_detector;
 
-void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool);
+void rspamd_normalize_single_word(rspamd_word_t *tok, rspamd_mempool_t *pool);
 
-void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool);
-
-void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
+/* Word processing functions */
+void rspamd_normalize_words(rspamd_words_t *words, rspamd_mempool_t *pool);
+void rspamd_stem_words(rspamd_words_t *words, rspamd_mempool_t *pool,
                        const char *language,
                        struct rspamd_lang_detector *lang_detector);
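
Editor's note: rspamd_tokenize_text() now fills an rspamd_words_t kvec instead of a GArray, so callers iterate with kv_size()/kv_A(). A minimal caller-side sketch, not part of this commit, assuming it is compiled inside the rspamd tree and that the standard kvec macro set (kv_init/kv_size/kv_A/kv_destroy) is available through the included headers; dump_words is a hypothetical helper.

/* dump_words.c - hypothetical caller-side example for the kvec-based API */
#include <stdio.h>
#include "tokenizers.h"

static void
dump_words(struct rspamd_config *cfg, rspamd_mempool_t *pool,
           const char *text, gsize len)
{
    rspamd_words_t words;

    kv_init(words);

    /* RAW mode needs no UText; utxt and hash may be NULL, cfg may be NULL too */
    rspamd_tokenize_text(text, len, NULL, RSPAMD_TOKENIZE_RAW,
                         cfg, NULL, NULL, &words, pool);

    for (gsize i = 0; i < kv_size(words); i++) {
        rspamd_word_t *w = &kv_A(words, i);
        printf("%.*s\n", (int) w->original.len, w->original.begin);
    }

    /* Free the kvec backing array owned by the caller */
    kv_destroy(words);
}
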