From: Vsevolod Stakhov Date: Fri, 7 Nov 2008 16:35:13 +0000 (+0300) Subject: * Add simple implementation of OSB tokenizer X-Git-Tag: 0.2.7~345 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=62cdcc73c4f817516cbcb20e9e5bfad556bea4b7;p=rspamd.git * Add simple implementation of OSB tokenizer --- diff --git a/configure b/configure index a36fc05eb..2cd2af6f9 100755 --- a/configure +++ b/configure @@ -24,7 +24,7 @@ CACHE="config.cache" SRCDIR="src" OBJDIR="src/.obj" -SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c ${LEX_OUTPUT} ${YACC_OUTPUT}" +SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c ${LEX_OUTPUT} ${YACC_OUTPUT}" MODULES="surbl regexp" CFLAGS="$CFLAGS -W -Wpointer-arith -Wno-unused-parameter" @@ -228,13 +228,14 @@ check_function() fi done echo -n "Testing for $FUNCTION: " + echo >> config.log + echo "Testing for $FUNCTION: " >> config.log check_cache "function" "$FUNCTION" if [ $? -eq 0 ] ; then echo "-> OK (cached)" + echo "-> OK (cached)" >> config.log return 0 fi - echo >> config.log - echo "Testing for $FUNCTION: " >> config.log echo "#include " > autotest.c if [ "F$INCLUDE" != "F" ] ; then for inc in $INCLUDE ; do @@ -271,6 +272,7 @@ check_include() check_cache "include" "$INCLUDE" if [ $? -eq 0 ] ; then echo "-> OK (cached)" + echo "-> OK (cached)" >> config.log have_opt $CACHED return 0 fi @@ -311,13 +313,14 @@ check_macro() fi done echo -n "Testing for $MACRO: " + echo >> config.log + echo "Testing for $MACRO: " >> config.log check_cache "macro" "$MACRO" if [ $? -eq 0 ] ; then echo "-> OK (cached)" + echo "-> OK (cached)" >> config.log return 0 fi - echo >> config.log - echo "Testing for $MACRO: " >> config.log echo "#include " > autotest.c for inc in $INCLUDE ; do echo "#include \"$inc\"" >> autotest.c @@ -367,6 +370,7 @@ check_lib() if [ $? -eq 0 ] ; then LIBS="$LIBS -l$LIB" echo "-> OK (cached)" + echo "-> OK (cached)" >> config.log return 0 fi echo "#include " > autotest.c @@ -411,17 +415,18 @@ check_package() fi done echo -n "Testing for $PACKAGE: " + echo >> config.log + echo "Testing for $PACKAGE: " >> config.log check_cache "package" "$PACKAGE" if [ $? -eq 0 ] ; then echo "-> OK (cached)" + echo "-> OK (cached)" >> config.log _p_cflags=`echo $CACHED | cut -d ':' -f 1` _p_libs=`echo $CACHED | cut -d ':' -f 2` LIBS="$LIBS $_p_libs" CFLAGS="$CFLAGS $_p_cflags" return 0 fi - echo >> config.log - echo "Testing for $PACKAGE: " >> config.log echo "#include " > autotest.c if [ "F$INCLUDE" != "F" ] ; then for inc in $INCLUDE ; do @@ -460,6 +465,7 @@ check_perl() check_cache "perl" "$PERL" if [ $? -eq 0 ] ; then echo "-> OK (cached)" + echo "-> OK (cached)" >> config.log _p_cflags=`echo $CACHED | cut -d ':' -f 1` _p_libs=`echo $CACHED | cut -d ':' -f 2` PERLLDFLAGS="$LIBS $_p_libs" diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c new file mode 100644 index 000000000..f78e20992 --- /dev/null +++ b/src/tokenizers/osb.c @@ -0,0 +1,69 @@ +/* + * OSB tokenizer + */ + +#include +#include "tokenizers.h" + + +/* Coefficients that are used for OSB tokenizer */ +static const int primes[] = { + 1, 7, + 3, 13, + 5, 29, + 11, 51, + 23, 101, + 47, 203, + 97, 407, + 197, 817, + 397, 1637, + 797, 3277, +}; + +token_list_t * +osb_tokenize_text (memory_pool_t *pool, f_str_t *input) +{ + token_list_t *new = NULL, *head = NULL, *last = NULL; + f_str_t token = { NULL, 0, 0 }; + uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2; + int i; + + /* First set all bytes of hashpipe to some common value */ + for (i = 0; i < FEATURE_WINDOW_SIZE; i ++) { + hashpipe[i] = 0xABCDEF; + } + + while (get_next_word (input, &token)) { + /* Shift hashpipe */ + for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) { + hashpipe[i] = hashpipe[i - 1]; + } + hashpipe[0] = fstrhash (&token); + + for (i = 0; i < FEATURE_WINDOW_SIZE - 2; i ++) { + h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1]; + h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i<<1)-1]; + new = memory_pool_alloc (pool, sizeof (token_list_t)); + new->h1 = h1; + new->h2 = h2; + if (last) { + last->next = new; + } + else { + head = new; + } + last = new; + + msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2); + } + } + if (last) { + last->next = NULL; + } + + return head; +} + +/* + * vi:ts=4 + */ diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c new file mode 100644 index 000000000..132a57ce0 --- /dev/null +++ b/src/tokenizers/tokenizers.c @@ -0,0 +1,45 @@ +/* + * Common tokenization functions + */ + +#include +#include "tokenizers.h" + +/* Get next word from specified f_str_t buf */ +f_str_t * +get_next_word (f_str_t *buf, f_str_t *token) +{ + size_t remain; + char *pos; + + if (buf == NULL) { + return NULL; + } + + if (token->begin == NULL) { + token->begin = buf->begin; + } + + remain = buf->len - (token->begin - buf->begin); + if (remain <= 0) { + return NULL; + } + + token->begin = token->begin + token->len; + token->len = 0; + + pos = token->begin; + /* Skip non graph symbols */ + while (remain-- && !g_ascii_isgraph (*pos ++)) { + token->begin ++; + } + while (remain-- && g_ascii_isgraph (*pos ++)) { + token->len ++; + } + + return token; +} + +/* + * vi:ts=4 + */ diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h new file mode 100644 index 000000000..6b4bff5e0 --- /dev/null +++ b/src/tokenizers/tokenizers.h @@ -0,0 +1,29 @@ +#ifndef TOKENIZERS_H +#define TOKENIZERS_H + +#include +#include "../config.h" + +#ifdef HAVE_STDINT_H +#include +#endif +#include "../mem_pool.h" +#include "../fstring.h" +#include "../main.h" + +/* Size for features pipe */ +#define FEATURE_WINDOW_SIZE 5 + +typedef struct token_list_s { + uint32_t h1; + uint32_t h2; + struct token_list_s *next; +} token_list_t; + +/* Get next word from specified f_str_t buf */ +f_str_t *get_next_word (f_str_t *buf, f_str_t *token); + +#endif +/* + * vi:ts=4 + */