summary refs log tree commit diff stats
path: root/src/libstat/tokenizers
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-01-23 13:50:17 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-01-23 13:50:17 +0000
commit 8a05515078bc8fd3d642778fcae0d005a38ec7b0 (patch)
tree 239d86bbbb5ba6bafdfa6aeaa41e848dcdc46079 /src/libstat/tokenizers
parent 8f5509c65dc6907a7581518246a200236088423c (diff)
download rspamd-8a05515078bc8fd3d642778fcae0d005a38ec7b0.tar.gz
rspamd-8a05515078bc8fd3d642778fcae0d005a38ec7b0.zip
Reorganize libstat API.
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 18
-rw-r--r-- src/libstat/tokenizers/tokenizers.h 49
2 files changed, 49 insertions, 18 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 10e4b92d5..eee41a971 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -29,10 +29,6 @@
#include "main.h"
#include "tokenizers.h"
-struct tokenizer tokenizers[] = {
- {"osb-text", osb_tokenize_text, rspamd_tokenizer_get_word},
-};
-
const int primes[] = {
1, 7,
3, 13,
@@ -75,20 +71,6 @@ const gchar t_delimiters[255] = {
0, 0, 0, 0, 0
};
-struct tokenizer *
-rspamd_stat_get_tokenizer (const char *name)
-{
- guint i;
-
- for (i = 0; i < sizeof (tokenizers) / sizeof (tokenizers[0]); i++) {
- if (strcmp (tokenizers[i].name, name) == 0) {
- return &tokenizers[i];
- }
- }
-
- return NULL;
-}
-
int
token_node_compare_func (gconstpointer a, gconstpointer b)
{
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
new file mode 100644
index 000000000..8ee11cea1
--- /dev/null
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -0,0 +1,49 @@
+#ifndef TOKENIZERS_H
+#define TOKENIZERS_H
+
+#include "config.h"
+#include "mem_pool.h"
+#include "fstring.h"
+#include "main.h"
+#include "stat_api.h"
+
+/* Common tokenizer structure */
+struct tokenizer {
+ gchar *name; /* Tokenizer name (e.g. "osb-text") */
+ gint (*tokenize_func)(struct tokenizer *tokenizer,
+ rspamd_mempool_t *pool,
+ GArray *words,
+ GTree **cur,
+ gboolean save_token,
+ gboolean is_utf,
+ GList *exceptions); /* Tokenizing callback; same parameter list as osb_tokenize_text */
+ gchar * (*get_next_word)(rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions); /* Word extraction callback; same parameter list as rspamd_tokenizer_get_word */
+};
+
+/* Compare two token nodes */
+int token_node_compare_func (gconstpointer a, gconstpointer b);
+
+/* Get next word from specified rspamd_fstring_t buf */
+gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf,
+ rspamd_fstring_t *token, GList **exceptions);
+
+/* Tokenize text into array of words (rspamd_fstring_t type) */
+GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
+ gsize min_len, GList **exceptions);
+
+/* OSB tokenize function */
+int osb_tokenize_text (struct tokenizer *tokenizer,
+ rspamd_mempool_t *pool,
+ GArray *input,
+ GTree **cur,
+ gboolean save_token,
+ gboolean is_utf,
+ GList *exceptions);
+
+/* Make tokens for a subject */
+void tokenize_subject (struct rspamd_task *task, GTree ** tree);
+
+#endif
+/*
+ * vi:ts=4
+ */