diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2008-11-07 19:35:13 +0300 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2008-11-07 19:35:13 +0300 |
commit | 62cdcc73c4f817516cbcb20e9e5bfad556bea4b7 (patch) | |
tree | a6ab9b606bf4e44e405fe9ee4dae983938c9cee4 /src/tokenizers | |
parent | 2175980532791f90807eb03ef99d6f7006ada4e6 (diff) | |
download | rspamd-62cdcc73c4f817516cbcb20e9e5bfad556bea4b7.tar.gz rspamd-62cdcc73c4f817516cbcb20e9e5bfad556bea4b7.zip |
* Add simple implementation of OSB tokenizer
Diffstat (limited to 'src/tokenizers')
-rw-r--r-- | src/tokenizers/osb.c | 69 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.c | 45 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.h | 29 |
3 files changed, 143 insertions, 0 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c new file mode 100644 index 000000000..f78e20992 --- /dev/null +++ b/src/tokenizers/osb.c @@ -0,0 +1,69 @@ +/* + * OSB tokenizer + */ + +#include <sys/types.h> +#include "tokenizers.h" + + +/* Coefficients that are used for OSB tokenizer */ +static const int primes[] = { + 1, 7, + 3, 13, + 5, 29, + 11, 51, + 23, 101, + 47, 203, + 97, 407, + 197, 817, + 397, 1637, + 797, 3277, +}; + +token_list_t * +osb_tokenize_text (memory_pool_t *pool, f_str_t *input) +{ + token_list_t *new = NULL, *head = NULL, *last = NULL; + f_str_t token = { NULL, 0, 0 }; + uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2; + int i; + + /* First set all bytes of hashpipe to some common value */ + for (i = 0; i < FEATURE_WINDOW_SIZE; i ++) { + hashpipe[i] = 0xABCDEF; + } + + while (get_next_word (input, &token)) { + /* Shift hashpipe */ + for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) { + hashpipe[i] = hashpipe[i - 1]; + } + hashpipe[0] = fstrhash (&token); + + for (i = 0; i < FEATURE_WINDOW_SIZE - 2; i ++) { + h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1]; + h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i<<1)-1]; + new = memory_pool_alloc (pool, sizeof (token_list_t)); + new->h1 = h1; + new->h2 = h2; + if (last) { + last->next = new; + } + else { + head = new; + } + last = new; + + msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2); + } + } + if (last) { + last->next = NULL; + } + + return head; +} + +/* + * vi:ts=4 + */ diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c new file mode 100644 index 000000000..132a57ce0 --- /dev/null +++ b/src/tokenizers/tokenizers.c @@ -0,0 +1,45 @@ +/* + * Common tokenization functions + */ + +#include <sys/types.h> +#include "tokenizers.h" + +/* Get next word from specified f_str_t buf */ +f_str_t * +get_next_word (f_str_t *buf, f_str_t *token) +{ + size_t remain; + char *pos; + + if (buf == NULL) { + return NULL; + } + + if (token->begin == NULL) { + token->begin = buf->begin; + } + + remain = buf->len - (token->begin - buf->begin); + if (remain <= 0) { + return NULL; + } + + token->begin = token->begin + token->len; + token->len = 0; + + pos = token->begin; + /* Skip non graph symbols */ + while (remain-- && !g_ascii_isgraph (*pos ++)) { + token->begin ++; + } + while (remain-- && g_ascii_isgraph (*pos ++)) { + token->len ++; + } + + return token; +} + +/* + * vi:ts=4 + */ diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h new file mode 100644 index 000000000..6b4bff5e0 --- /dev/null +++ b/src/tokenizers/tokenizers.h @@ -0,0 +1,29 @@ +#ifndef TOKENIZERS_H +#define TOKENIZERS_H + +#include <sys/types.h> +#include "../config.h" + +#ifdef HAVE_STDINT_H +#include <stdint.h> +#endif +#include "../mem_pool.h" +#include "../fstring.h" +#include "../main.h" + +/* Size for features pipe */ +#define FEATURE_WINDOW_SIZE 5 + +typedef struct token_list_s { + uint32_t h1; + uint32_t h2; + struct token_list_s *next; +} token_list_t; + +/* Get next word from specified f_str_t buf */ +f_str_t *get_next_word (f_str_t *buf, f_str_t *token); + +#endif +/* + * vi:ts=4 + */ |