diff options
Diffstat (limited to 'src/libstat/tokenizers/custom_tokenizer.h')
-rw-r--r-- | src/libstat/tokenizers/custom_tokenizer.h | 177 |
1 files changed, 177 insertions, 0 deletions
diff --git a/src/libstat/tokenizers/custom_tokenizer.h b/src/libstat/tokenizers/custom_tokenizer.h new file mode 100644 index 000000000..bc173a1da --- /dev/null +++ b/src/libstat/tokenizers/custom_tokenizer.h @@ -0,0 +1,177 @@ +/* + * Copyright 2025 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_CUSTOM_TOKENIZER_H +#define RSPAMD_CUSTOM_TOKENIZER_H + +/* Check if we're being included by internal Rspamd code or external plugins */ +#ifdef RSPAMD_TOKENIZER_INTERNAL +/* Internal Rspamd usage - use the full headers */ +#include "config.h" +#include "ucl.h" +#include "libserver/word.h" +#else +/* External plugin usage - use standalone types */ +#include "rspamd_tokenizer_types.h" +/* Forward declaration for UCL object - plugins should include ucl.h if needed */ +typedef struct ucl_object_s ucl_object_t; +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define RSPAMD_CUSTOM_TOKENIZER_API_VERSION 1 + +/** + * Tokenization result - compatible with both internal and external usage + */ +typedef rspamd_words_t rspamd_tokenizer_result_t; + +/** + * Custom tokenizer API that must be implemented by language-specific tokenizer plugins + * All functions use only plain C types to ensure clean boundaries + */ +typedef struct rspamd_custom_tokenizer_api { + /* API version for compatibility checking */ + unsigned int api_version; + + /* Name of the tokenizer (e.g., "japanese_mecab") */ + const char *name; + + /** + * Global initialization function called once when the tokenizer is loaded + * @param config UCL configuration object for this tokenizer (may be NULL) + * @param error_buf Buffer for error message (at least 256 bytes) + * @return 0 on success, non-zero on failure + */ + int (*init)(const ucl_object_t *config, char *error_buf, size_t error_buf_size); + + /** + * Global cleanup function called when the tokenizer is unloaded + */ + void (*deinit)(void); + + /** + * Quick language detection to check if this tokenizer can handle the text + * @param text UTF-8 text to analyze + * @param len Length of the text in bytes + * @return Confidence score 0.0-1.0, or -1.0 if cannot handle + */ + double (*detect_language)(const char *text, size_t len); + + /** + * Main tokenization function + * @param text UTF-8 text to tokenize + * @param len Length of the text in bytes + * @param result Output kvec to fill with rspamd_word_t elements + * @return 0 on success, non-zero on failure + * + * The tokenizer should allocate result->a using its own allocator + * Rspamd will call cleanup_result() to free it after processing + */ + int (*tokenize)(const char *text, size_t len, + rspamd_tokenizer_result_t *result); + + /** + * Cleanup the result from tokenize() + * @param result Result kvec returned by tokenize() + * + * This function should free result->a using the same allocator + * that was used in tokenize() and reset the kvec fields. + * This ensures proper memory management across DLL boundaries. + * Note: This does NOT free the result structure itself, only its contents. + */ + void (*cleanup_result)(rspamd_tokenizer_result_t *result); + + /** + * Optional: Get language hint for better language detection + * @return Language code (e.g., "ja", "zh") or NULL + */ + const char *(*get_language_hint)(void); + + /** + * Optional: Get minimum confidence threshold for this tokenizer + * @return Minimum confidence (0.0-1.0) or -1.0 to use default + */ + double (*get_min_confidence)(void); + +} rspamd_custom_tokenizer_api_t; + +/** + * Entry point function that plugins must export + * Must be named "rspamd_tokenizer_get_api" + */ +typedef const rspamd_custom_tokenizer_api_t *(*rspamd_tokenizer_get_api_func)(void); + +/* Internal Rspamd structures - not exposed to plugins */ +#ifdef RSPAMD_TOKENIZER_INTERNAL + +/** + * Custom tokenizer instance + */ +struct rspamd_custom_tokenizer { + char *name; /* Tokenizer name from config */ + char *path; /* Path to .so file */ + void *handle; /* dlopen handle */ + const rspamd_custom_tokenizer_api_t *api; /* API functions */ + double priority; /* Detection priority */ + double min_confidence; /* Minimum confidence threshold */ + gboolean enabled; /* Is tokenizer enabled */ + ucl_object_t *config; /* Tokenizer-specific config */ +}; + +/** + * Tokenizer manager structure + */ +struct rspamd_tokenizer_manager { + GHashTable *tokenizers; /* name -> rspamd_custom_tokenizer */ + GArray *detection_order; /* Ordered by priority */ + rspamd_mempool_t *pool; + double default_threshold; /* Default confidence threshold */ +}; + +/* Manager functions */ +struct rspamd_tokenizer_manager *rspamd_tokenizer_manager_new(rspamd_mempool_t *pool); +void rspamd_tokenizer_manager_destroy(struct rspamd_tokenizer_manager *mgr); + +gboolean rspamd_tokenizer_manager_load_tokenizer(struct rspamd_tokenizer_manager *mgr, + const char *name, + const ucl_object_t *config, + GError **err); + +struct rspamd_custom_tokenizer *rspamd_tokenizer_manager_detect( + struct rspamd_tokenizer_manager *mgr, + const char *text, size_t len, + double *confidence, + const char *lang_hint, + const char **detected_lang_hint); + +/* Helper function to tokenize with exceptions handling */ +rspamd_tokenizer_result_t *rspamd_custom_tokenizer_tokenize_with_exceptions( + struct rspamd_custom_tokenizer *tokenizer, + const char *text, + gsize len, + GList *exceptions, + rspamd_mempool_t *pool); + +#endif /* RSPAMD_TOKENIZER_INTERNAL */ + +#ifdef __cplusplus +} +#endif + +#endif /* RSPAMD_CUSTOM_TOKENIZER_H */ |