aboutsummaryrefslogtreecommitdiffstats
path: root/src/libstat/tokenizers/custom_tokenizer.h
blob: bc173a1da4c2eb40ac5b659aa624b95d212a9aa6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
/*
 * Copyright 2025 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef RSPAMD_CUSTOM_TOKENIZER_H
#define RSPAMD_CUSTOM_TOKENIZER_H

/* Check if we're being included by internal Rspamd code or external plugins */
#ifdef RSPAMD_TOKENIZER_INTERNAL
/* Internal Rspamd usage - use the full headers */
#include "config.h"
#include "ucl.h"
#include "libserver/word.h"
#else
/* External plugin usage - use standalone types */
#include "rspamd_tokenizer_types.h"
/* Forward declaration for UCL object - plugins should include ucl.h if needed */
typedef struct ucl_object_s ucl_object_t;
#endif

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Version number of the plugin API described by this header.
 * NOTE(review): presumably the value a plugin is expected to store in
 * rspamd_custom_tokenizer_api.api_version - confirm against the loader code.
 */
#define RSPAMD_CUSTOM_TOKENIZER_API_VERSION 1

/**
 * Tokenization result - compatible with both internal and external usage.
 *
 * This is an alias for rspamd_words_t, which is not defined in this header.
 * NOTE(review): the type is expected to be provided by "libserver/word.h"
 * when RSPAMD_TOKENIZER_INTERNAL is defined and by
 * "rspamd_tokenizer_types.h" otherwise - confirm both definitions match.
 */
typedef rspamd_words_t rspamd_tokenizer_result_t;

/**
 * Custom tokenizer API that must be implemented by language-specific tokenizer plugins.
 * All functions use only plain C types to ensure clean boundaries.
 *
 * The structure is a table of function pointers plus two descriptive fields
 * (api_version and name). A plugin hands a pointer to such a table back to
 * the caller through a function of type rspamd_tokenizer_get_api_func.
 */
typedef struct rspamd_custom_tokenizer_api {
	/* API version for compatibility checking */
	unsigned int api_version;

	/* Name of the tokenizer (e.g., "japanese_mecab") */
	const char *name;

	/**
	 * Global initialization function called once when the tokenizer is loaded
	 * @param config UCL configuration object for this tokenizer (may be NULL)
	 * @param error_buf Buffer for error message (at least 256 bytes)
	 * @param error_buf_size Size of error_buf as passed by the caller
	 *        (presumably in bytes - TODO confirm against the caller)
	 * @return 0 on success, non-zero on failure
	 */
	int (*init)(const ucl_object_t *config, char *error_buf, size_t error_buf_size);

	/**
	 * Global cleanup function called when the tokenizer is unloaded
	 * Takes no arguments and returns nothing.
	 */
	void (*deinit)(void);

	/**
	 * Quick language detection to check if this tokenizer can handle the text
	 * @param text UTF-8 text to analyze
	 * @param len Length of the text in bytes
	 * @return Confidence score 0.0-1.0, or -1.0 if cannot handle
	 */
	double (*detect_language)(const char *text, size_t len);

	/**
	 * Main tokenization function
	 * @param text UTF-8 text to tokenize
	 * @param len Length of the text in bytes
	 * @param result Output kvec to fill with rspamd_word_t elements
	 * @return 0 on success, non-zero on failure
	 *
	 * The tokenizer should allocate result->a using its own allocator.
	 * Rspamd will call cleanup_result() to free it after processing, so
	 * the two callbacks must agree on the allocator used.
	 */
	int (*tokenize)(const char *text, size_t len,
					rspamd_tokenizer_result_t *result);

	/**
	 * Cleanup the result from tokenize()
	 * @param result Result kvec returned by tokenize()
	 *
	 * This function should free result->a using the same allocator
	 * that was used in tokenize() and reset the kvec fields.
	 * This ensures proper memory management across DLL boundaries.
	 * Note: This does NOT free the result structure itself, only its contents.
	 */
	void (*cleanup_result)(rspamd_tokenizer_result_t *result);

	/**
	 * Optional: Get language hint for better language detection
	 * @return Language code (e.g., "ja", "zh") or NULL
	 */
	const char *(*get_language_hint)(void);

	/**
	 * Optional: Get minimum confidence threshold for this tokenizer
	 * @return Minimum confidence (0.0-1.0) or -1.0 to use default
	 */
	double (*get_min_confidence)(void);

} rspamd_custom_tokenizer_api_t;

/**
 * Entry point function that plugins must export.
 * Must be named "rspamd_tokenizer_get_api".
 *
 * The function takes no arguments.
 * @return pointer to a const rspamd_custom_tokenizer_api_t table; the caller
 *         must not modify it through this pointer
 */
typedef const rspamd_custom_tokenizer_api_t *(*rspamd_tokenizer_get_api_func)(void);

/* Internal Rspamd structures - not exposed to plugins */
#ifdef RSPAMD_TOKENIZER_INTERNAL

/**
 * Custom tokenizer instance.
 *
 * Groups everything kept per tokenizer: its configured name, the path and
 * handle of the shared object, the API table, and the detection settings.
 * Only visible when RSPAMD_TOKENIZER_INTERNAL is defined.
 */
struct rspamd_custom_tokenizer {
	char *name;                               /* Tokenizer name from config */
	char *path;                               /* Path to .so file */
	void *handle;                             /* dlopen handle */
	const rspamd_custom_tokenizer_api_t *api; /* API functions (read-only table) */
	double priority;                          /* Detection priority */
	double min_confidence;                    /* Minimum confidence threshold */
	gboolean enabled;                         /* Is tokenizer enabled */
	ucl_object_t *config;                     /* Tokenizer-specific config */
};

/**
 * Tokenizer manager structure.
 *
 * Holds the set of known tokenizers, keyed by name, together with a
 * priority-ordered array and a default confidence threshold.
 * Only visible when RSPAMD_TOKENIZER_INTERNAL is defined.
 */
struct rspamd_tokenizer_manager {
	GHashTable *tokenizers;  /* name -> rspamd_custom_tokenizer */
	GArray *detection_order; /* Ordered by priority; NOTE(review): element type not visible here */
	rspamd_mempool_t *pool;  /* Memory pool associated with the manager */
	double default_threshold; /* Default confidence threshold */
};

/* Manager functions */
/**
 * Create a tokenizer manager.
 * @param pool memory pool passed to the manager (NOTE(review): presumably
 *             stored in rspamd_tokenizer_manager.pool - confirm in the
 *             implementation)
 * @return pointer to a struct rspamd_tokenizer_manager; failure behaviour is
 *         not visible from this header
 */
struct rspamd_tokenizer_manager *rspamd_tokenizer_manager_new(rspamd_mempool_t *pool);
/**
 * Destroy a tokenizer manager.
 * @param mgr manager to destroy (NOTE(review): presumably one obtained from
 *            rspamd_tokenizer_manager_new(); whether NULL is accepted is not
 *            visible from this header)
 */
void rspamd_tokenizer_manager_destroy(struct rspamd_tokenizer_manager *mgr);

/**
 * Load a tokenizer into the manager.
 * @param mgr manager that receives the tokenizer
 * @param name tokenizer name
 * @param config UCL configuration object for the tokenizer (read-only)
 * @param err location for a GError (NOTE(review): presumably filled on
 *            failure following the usual GLib convention - confirm)
 * @return gboolean result (NOTE(review): presumably TRUE on success)
 */
gboolean rspamd_tokenizer_manager_load_tokenizer(struct rspamd_tokenizer_manager *mgr,
												 const char *name,
												 const ucl_object_t *config,
												 GError **err);

/**
 * Select a tokenizer for the given text.
 * @param mgr manager holding the candidate tokenizers
 * @param text text to analyze (NOTE(review): presumably UTF-8, as in the
 *             plugin detect_language() callback - confirm)
 * @param len length of text
 * @param confidence output location for a confidence value
 *                   (NOTE(review): whether NULL is allowed is not visible here)
 * @param lang_hint language hint supplied by the caller
 * @param detected_lang_hint output location for a language hint string
 * @return pointer to a tokenizer (NOTE(review): presumably NULL when none
 *         matches - confirm in the implementation)
 */
struct rspamd_custom_tokenizer *rspamd_tokenizer_manager_detect(
	struct rspamd_tokenizer_manager *mgr,
	const char *text, size_t len,
	double *confidence,
	const char *lang_hint,
	const char **detected_lang_hint);

/**
 * Helper function to tokenize with exceptions handling.
 * @param tokenizer tokenizer instance to use
 * @param text text to tokenize
 * @param len length of text (declared as gsize here, while the other
 *            functions in this header use size_t)
 * @param exceptions GList of exceptions (NOTE(review): element type and
 *                   semantics are not visible from this header)
 * @param pool memory pool (NOTE(review): presumably used for allocations
 *             made while building the result - confirm)
 * @return pointer to a tokenization result; ownership and failure value are
 *         not visible from this header
 */
rspamd_tokenizer_result_t *rspamd_custom_tokenizer_tokenize_with_exceptions(
	struct rspamd_custom_tokenizer *tokenizer,
	const char *text,
	gsize len,
	GList *exceptions,
	rspamd_mempool_t *pool);

#endif /* RSPAMD_TOKENIZER_INTERNAL */

#ifdef __cplusplus
}
#endif

#endif /* RSPAMD_CUSTOM_TOKENIZER_H */