diff options
Diffstat (limited to 'src/libstat/tokenizers/osb.c')
-rw-r--r-- | src/libstat/tokenizers/osb.c | 122 |
1 files changed, 122 insertions, 0 deletions
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c new file mode 100644 index 000000000..9dd12a8dd --- /dev/null +++ b/src/libstat/tokenizers/osb.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2009-2012, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * OSB tokenizer + */ + +#include <sys/types.h> +#include "tokenizers.h" + +/* Minimum length of token */ +#define MIN_LEN 4 + +extern const int primes[]; + +int +osb_tokenize_text (struct tokenizer *tokenizer, + rspamd_mempool_t * pool, + GArray * input, + GTree ** tree, + gboolean save_token, + gboolean is_utf, + GList *exceptions) +{ + token_node_t *new = NULL; + rspamd_fstring_t *token; + guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; + gint i, processed = 0; + guint w; + + if (input == NULL) { + return FALSE; + } + + if (*tree == NULL) { + *tree = g_tree_new (token_node_compare_func); + rspamd_mempool_add_destructor (pool, + (rspamd_mempool_destruct_t) g_tree_destroy, + *tree); + } + + memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0])); + + for (w = 0; w < input->len; w ++) { + token = &g_array_index (input, rspamd_fstring_t, w); + + if (processed < FEATURE_WINDOW_SIZE) { + /* Just fill a hashpipe */ + hashpipe[FEATURE_WINDOW_SIZE - ++processed] = + rspamd_fstrhash_lc (token, is_utf); + } + else { + /* Shift hashpipe */ + for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) { + hashpipe[i] = hashpipe[i - 1]; + } + hashpipe[0] = rspamd_fstrhash_lc (token, is_utf); + processed++; + + for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { + h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; + h2 = hashpipe[0] * primes[1] + hashpipe[i] * + primes[(i << 1) - 1]; + new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t)); + new->h1 = h1; + new->h2 = h2; + if (save_token) { + new->extra = + (uintptr_t)rspamd_mempool_fstrdup (pool, token); + } + + if (g_tree_lookup (*tree, new) == NULL) { + g_tree_insert (*tree, new, new); + } + } + } + } + + if (processed <= FEATURE_WINDOW_SIZE) { + for (i = 1; i < processed; i++) { + h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; + h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; + new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t)); + new->h1 = h1; + new->h2 = h2; + if (save_token) { + new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, token); + } + + if (g_tree_lookup (*tree, new) == NULL) { + g_tree_insert (*tree, new, new); + } + } + } + + return TRUE; +} + +/* + * vi:ts=4 + */ |