diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-01-16 15:28:40 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-01-16 15:28:40 +0000 |
commit | b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f (patch) | |
tree | a647a4306708df37a3ea1d97666fd2d325e24464 /src/tokenizers | |
parent | ffd95d7c71307bb9540f07bbaac3b04859226837 (diff) | |
download | rspamd-b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f.tar.gz rspamd-b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f.zip |
Reorganize statfiles and classifiers into libstat.
Diffstat (limited to 'src/tokenizers')
-rw-r--r-- | src/tokenizers/osb.c | 122 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.c | 260 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.h | 64 |
3 files changed, 0 insertions, 446 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c deleted file mode 100644 index 9dd12a8dd..000000000 --- a/src/tokenizers/osb.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2009-2012, Vsevolod Stakhov - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * OSB tokenizer - */ - -#include <sys/types.h> -#include "tokenizers.h" - -/* Minimum length of token */ -#define MIN_LEN 4 - -extern const int primes[]; - -int -osb_tokenize_text (struct tokenizer *tokenizer, - rspamd_mempool_t * pool, - GArray * input, - GTree ** tree, - gboolean save_token, - gboolean is_utf, - GList *exceptions) -{ - token_node_t *new = NULL; - rspamd_fstring_t *token; - guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; - gint i, processed = 0; - guint w; - - if (input == NULL) { - return FALSE; - } - - if (*tree == NULL) { - *tree = g_tree_new (token_node_compare_func); - rspamd_mempool_add_destructor (pool, - (rspamd_mempool_destruct_t) g_tree_destroy, - *tree); - } - - memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0])); - - for (w = 0; w < input->len; w ++) { - token = &g_array_index (input, rspamd_fstring_t, w); - - if (processed < FEATURE_WINDOW_SIZE) { - /* Just fill a hashpipe */ - hashpipe[FEATURE_WINDOW_SIZE - ++processed] = - rspamd_fstrhash_lc (token, is_utf); - } - else { - /* Shift hashpipe */ - for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) { - hashpipe[i] = hashpipe[i - 1]; - } - hashpipe[0] = rspamd_fstrhash_lc (token, is_utf); - processed++; - - for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { - h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; - h2 = hashpipe[0] * primes[1] + hashpipe[i] * - primes[(i << 1) - 1]; - new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t)); - new->h1 = h1; - new->h2 = h2; - if (save_token) { - new->extra = - (uintptr_t)rspamd_mempool_fstrdup (pool, token); - } - - if (g_tree_lookup (*tree, new) == NULL) { - g_tree_insert (*tree, new, new); - } - } - } - } - - if (processed <= FEATURE_WINDOW_SIZE) { - for (i = 1; i < processed; i++) { - h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; - h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; - new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t)); - new->h1 = h1; - new->h2 = h2; - if (save_token) { - new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, token); - } - - if (g_tree_lookup (*tree, new) == NULL) { - g_tree_insert (*tree, new, new); - } - } - } - - return TRUE; -} - -/* - * vi:ts=4 - */ diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c deleted file mode 100644 index 3e6c745ec..000000000 --- a/src/tokenizers/tokenizers.c +++ /dev/null @@ -1,260 +0,0 @@ -/* - * Copyright (c) 2009-2012, Vsevolod Stakhov - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Common tokenization functions - */ - -#include <sys/types.h> -#include "main.h" -#include "tokenizers.h" - -struct tokenizer tokenizers[] = { - {"osb-text", osb_tokenize_text, rspamd_tokenizer_get_word}, -}; - -const int primes[] = { - 1, 7, - 3, 13, - 5, 29, - 11, 51, - 23, 101, - 47, 203, - 97, 407, - 197, 817, - 397, 1637, - 797, 3277, -}; - -const gchar t_delimiters[255] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, - 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, - 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0 -}; - -struct tokenizer * -get_tokenizer (const char *name) -{ - guint i; - - for (i = 0; i < sizeof (tokenizers) / sizeof (tokenizers[0]); i++) { - if (strcmp (tokenizers[i].name, name) == 0) { - return &tokenizers[i]; - } - } - - return NULL; -} - -int -token_node_compare_func (gconstpointer a, gconstpointer b) -{ - const token_node_t *aa = a, *bb = b; - - if (aa->h1 == bb->h1) { - return aa->h2 - bb->h2; - } - - return aa->h1 - bb->h1; -} - -/* Get next word from specified f_str_t buf */ -gchar * -rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions) -{ - gsize remain, pos; - guchar *p; - struct process_exception *ex = NULL; - - if (buf == NULL) { - return NULL; - } - - if (exceptions != NULL && *exceptions != NULL) { - ex = (*exceptions)->data; - } - - if (token->begin == NULL) { - if (ex != NULL) { - if (ex->pos == 0) { - token->begin = buf->begin + ex->len; - token->len = ex->len; - } - else { - token->begin = buf->begin; - token->len = 0; - } - } - else { - token->begin = buf->begin; - token->len = 0; - } - } - - token->len = 0; - - pos = token->begin - buf->begin; - if (pos >= buf->len) { - return NULL; - } - - remain = buf->len - pos; - p = token->begin; - /* Skip non delimiters symbols */ - do { - if (ex != NULL && ex->pos == pos) { - /* Go to the next exception */ - *exceptions = g_list_next (*exceptions); - return p + ex->len; - } - pos++; - p++; - remain--; - } while (remain > 0 && t_delimiters[*p]); - - token->begin = p; - - while (remain > 0 && !t_delimiters[*p]) { - if (ex != NULL && ex->pos == pos) { - *exceptions = g_list_next (*exceptions); - return p + ex->len; - } - token->len++; - pos++; - remain--; - p++; - } - - if (remain == 0) { - return NULL; - } - - return p; -} - -GArray * -rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, - gsize min_len, GList **exceptions) -{ - rspamd_fstring_t token, buf; - gchar *pos; - gsize l; - GArray *res; - - if (len == 0 || text == NULL) { - return NULL; - } - - buf.begin = text; - buf.len = len; - buf.size = buf.len; - token.begin = NULL; - token.len = 0; - - res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t)); - while ((pos = rspamd_tokenizer_get_word (&buf, - &token, exceptions)) != NULL) { - if (is_utf) { - l = g_utf8_strlen (token.begin, token.len); - } - else { - l = token.len; - } - if (min_len > 0 && l < min_len) { - token.begin = pos; - continue; - } - g_array_append_val (res, token); - - token.begin = pos; - } - - return res; -} - - -void -tokenize_subject (struct rspamd_task *task, GTree ** tree) -{ - gchar *sub; - struct tokenizer *osb_tokenizer; - GArray *words; - - if (*tree == NULL) { - *tree = g_tree_new (token_node_compare_func); - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) g_tree_destroy, *tree); - } - - osb_tokenizer = get_tokenizer ("osb-text"); - - /* Try to use pre-defined subject */ - if (task->subject != NULL) { - sub = task->subject; - } - else { - sub = (gchar *)g_mime_message_get_subject (task->message); - } - - if (sub != NULL) { - words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL); - if (words != NULL) { - osb_tokenizer->tokenize_func (osb_tokenizer, - task->task_pool, - words, - tree, - FALSE, - TRUE, - NULL); - g_array_free (words, TRUE); - } - } -} - -/* - * vi:ts=4 - */ diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h deleted file mode 100644 index ed47e0add..000000000 --- a/src/tokenizers/tokenizers.h +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef TOKENIZERS_H -#define TOKENIZERS_H - -#include "config.h" -#include "mem_pool.h" -#include "fstring.h" -#include "main.h" - -/* Size for features pipe */ -#define FEATURE_WINDOW_SIZE 5 - -typedef struct token_node_s { - guint32 h1; - guint32 h2; - double value; - uintptr_t extra; -} token_node_t; - -/* Common tokenizer structure */ -struct tokenizer { - gchar *name; - gint (*tokenize_func)(struct tokenizer *tokenizer, - rspamd_mempool_t *pool, - GArray *words, - GTree **cur, - gboolean save_token, - gboolean is_utf, - GList *exceptions); - gchar * (*get_next_word)(rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions); -}; - -/* Compare two token nodes */ -int token_node_compare_func (gconstpointer a, gconstpointer b); - -/* Get tokenizer structure by name or return NULL if this name is not found */ -struct tokenizer * get_tokenizer (const char *name); - -/* Get next word from specified f_str_t buf */ -gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf, - rspamd_fstring_t *token, GList **exceptions); - -/* Tokenize text into array of words (rspamd_fstring_t type) */ -GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, - gsize min_len, GList **exceptions); - -/* OSB tokenize function */ -int osb_tokenize_text (struct tokenizer *tokenizer, - rspamd_mempool_t *pool, - GArray *input, - GTree **cur, - gboolean save_token, - gboolean is_utf, - GList *exceptions); - -/* Make tokens for a subject */ -void tokenize_subject (struct rspamd_task *task, GTree ** tree); - -/* Array of all defined tokenizers */ -extern struct tokenizer tokenizers[]; - -#endif -/* - * vi:ts=4 - */ |