diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-01-16 15:28:40 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-01-16 15:28:40 +0000 |
commit | b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f (patch) | |
tree | a647a4306708df37a3ea1d97666fd2d325e24464 /src/libstat/tokenizers | |
parent | ffd95d7c71307bb9540f07bbaac3b04859226837 (diff) | |
download | rspamd-b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f.tar.gz rspamd-b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f.zip |
Reorganize statfiles and classifiers into libstat.
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- | src/libstat/tokenizers/osb.c | 122 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 260 |
2 files changed, 382 insertions, 0 deletions
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c new file mode 100644 index 000000000..9dd12a8dd --- /dev/null +++ b/src/libstat/tokenizers/osb.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2009-2012, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * OSB tokenizer + */ + +#include <sys/types.h> +#include "tokenizers.h" + +/* Minimum length of token */ +#define MIN_LEN 4 + +extern const int primes[]; + +int +osb_tokenize_text (struct tokenizer *tokenizer, + rspamd_mempool_t * pool, + GArray * input, + GTree ** tree, + gboolean save_token, + gboolean is_utf, + GList *exceptions) +{ + token_node_t *new = NULL; + rspamd_fstring_t *token; + guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; + gint i, processed = 0; + guint w; + + if (input == NULL) { + return FALSE; + } + + if (*tree == NULL) { + *tree = g_tree_new (token_node_compare_func); + rspamd_mempool_add_destructor (pool, + (rspamd_mempool_destruct_t) g_tree_destroy, + *tree); + } + + memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0])); + + for (w = 0; w < input->len; w ++) { + token = &g_array_index (input, rspamd_fstring_t, w); + + if (processed < FEATURE_WINDOW_SIZE) { + /* Just fill a hashpipe */ + hashpipe[FEATURE_WINDOW_SIZE - ++processed] = + rspamd_fstrhash_lc (token, is_utf); + } + else { + /* Shift hashpipe */ + for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) { + hashpipe[i] = hashpipe[i - 1]; + } + hashpipe[0] = rspamd_fstrhash_lc (token, is_utf); + processed++; + + for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { + h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; + h2 = hashpipe[0] * primes[1] + hashpipe[i] * + primes[(i << 1) - 1]; + new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t)); + new->h1 = h1; + new->h2 = h2; + if (save_token) { + new->extra = + (uintptr_t)rspamd_mempool_fstrdup (pool, token); + } + + if (g_tree_lookup (*tree, new) == NULL) { + g_tree_insert (*tree, new, new); + } + } + } + } + + if (processed <= FEATURE_WINDOW_SIZE) { + for (i = 1; i < processed; i++) { + h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; + h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; + new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t)); + new->h1 = h1; + new->h2 = h2; + if (save_token) { + new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, token); + } + + if (g_tree_lookup (*tree, new) == NULL) { + g_tree_insert (*tree, new, new); + } + } + } + + return TRUE; +} + +/* + * vi:ts=4 + */ diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c new file mode 100644 index 000000000..3e6c745ec --- /dev/null +++ b/src/libstat/tokenizers/tokenizers.c @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2009-2012, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Common tokenization functions + */ + +#include <sys/types.h> +#include "main.h" +#include "tokenizers.h" + +struct tokenizer tokenizers[] = { + {"osb-text", osb_tokenize_text, rspamd_tokenizer_get_word}, +}; + +const int primes[] = { + 1, 7, + 3, 13, + 5, 29, + 11, 51, + 23, 101, + 47, 203, + 97, 407, + 197, 817, + 397, 1637, + 797, 3277, +}; + +const gchar t_delimiters[255] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0 +}; + +struct tokenizer * +get_tokenizer (const char *name) +{ + guint i; + + for (i = 0; i < sizeof (tokenizers) / sizeof (tokenizers[0]); i++) { + if (strcmp (tokenizers[i].name, name) == 0) { + return &tokenizers[i]; + } + } + + return NULL; +} + +int +token_node_compare_func (gconstpointer a, gconstpointer b) +{ + const token_node_t *aa = a, *bb = b; + + if (aa->h1 == bb->h1) { + return aa->h2 - bb->h2; + } + + return aa->h1 - bb->h1; +} + +/* Get next word from specified f_str_t buf */ +gchar * +rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions) +{ + gsize remain, pos; + guchar *p; + struct process_exception *ex = NULL; + + if (buf == NULL) { + return NULL; + } + + if (exceptions != NULL && *exceptions != NULL) { + ex = (*exceptions)->data; + } + + if (token->begin == NULL) { + if (ex != NULL) { + if (ex->pos == 0) { + token->begin = buf->begin + ex->len; + token->len = ex->len; + } + else { + token->begin = buf->begin; + token->len = 0; + } + } + else { + token->begin = buf->begin; + token->len = 0; + } + } + + token->len = 0; + + pos = token->begin - buf->begin; + if (pos >= buf->len) { + return NULL; + } + + remain = buf->len - pos; + p = token->begin; + /* Skip non delimiters symbols */ + do { + if (ex != NULL && ex->pos == pos) { + /* Go to the next exception */ + *exceptions = g_list_next (*exceptions); + return p + ex->len; + } + pos++; + p++; + remain--; + } while (remain > 0 && t_delimiters[*p]); + + token->begin = p; + + while (remain > 0 && !t_delimiters[*p]) { + if (ex != NULL && ex->pos == pos) { + *exceptions = g_list_next (*exceptions); + return p + ex->len; + } + token->len++; + pos++; + remain--; + p++; + } + + if (remain == 0) { + return NULL; + } + + return p; +} + +GArray * +rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, + gsize min_len, GList **exceptions) +{ + rspamd_fstring_t token, buf; + gchar *pos; + gsize l; + GArray *res; + + if (len == 0 || text == NULL) { + return NULL; + } + + buf.begin = text; + buf.len = len; + buf.size = buf.len; + token.begin = NULL; + token.len = 0; + + res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t)); + while ((pos = rspamd_tokenizer_get_word (&buf, + &token, exceptions)) != NULL) { + if (is_utf) { + l = g_utf8_strlen (token.begin, token.len); + } + else { + l = token.len; + } + if (min_len > 0 && l < min_len) { + token.begin = pos; + continue; + } + g_array_append_val (res, token); + + token.begin = pos; + } + + return res; +} + + +void +tokenize_subject (struct rspamd_task *task, GTree ** tree) +{ + gchar *sub; + struct tokenizer *osb_tokenizer; + GArray *words; + + if (*tree == NULL) { + *tree = g_tree_new (token_node_compare_func); + rspamd_mempool_add_destructor (task->task_pool, + (rspamd_mempool_destruct_t) g_tree_destroy, *tree); + } + + osb_tokenizer = get_tokenizer ("osb-text"); + + /* Try to use pre-defined subject */ + if (task->subject != NULL) { + sub = task->subject; + } + else { + sub = (gchar *)g_mime_message_get_subject (task->message); + } + + if (sub != NULL) { + words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL); + if (words != NULL) { + osb_tokenizer->tokenize_func (osb_tokenizer, + task->task_pool, + words, + tree, + FALSE, + TRUE, + NULL); + g_array_free (words, TRUE); + } + } +} + +/* + * vi:ts=4 + */ |