]> source.dussan.org Git - rspamd.git/commitdiff
* Add simple implementation of OSB tokenizer
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 7 Nov 2008 16:35:13 +0000 (19:35 +0300)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 7 Nov 2008 16:35:13 +0000 (19:35 +0300)
configure
src/tokenizers/osb.c [new file with mode: 0644]
src/tokenizers/tokenizers.c [new file with mode: 0644]
src/tokenizers/tokenizers.h [new file with mode: 0644]

index a36fc05eb66048a9ac5d4d6814b23c4ad76e4fc3..2cd2af6f97ff21bfc9697551db1e4f65c708764d 100755 (executable)
--- a/configure
+++ b/configure
@@ -24,7 +24,7 @@ CACHE="config.cache"
 
 SRCDIR="src"
 OBJDIR="src/.obj"
-SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
+SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
 MODULES="surbl regexp"
 
 CFLAGS="$CFLAGS -W -Wpointer-arith -Wno-unused-parameter"
@@ -228,13 +228,14 @@ check_function()
                fi
        done
        echo -n "Testing for $FUNCTION: "
+       echo >> config.log
+       echo "Testing for $FUNCTION: " >> config.log
        check_cache "function" "$FUNCTION"
        if [ $? -eq 0 ] ; then
                echo "-> OK (cached)"
+               echo "-> OK (cached)" >> config.log
                return 0
        fi
-       echo >> config.log
-       echo "Testing for $FUNCTION: " >> config.log
        echo "#include <sys/types.h>" > autotest.c
        if [ "F$INCLUDE" != "F" ] ; then
                for inc in $INCLUDE ; do
@@ -271,6 +272,7 @@ check_include()
        check_cache "include" "$INCLUDE"
        if [ $? -eq 0 ] ; then
                echo "-> OK (cached)"
+               echo "-> OK (cached)" >> config.log
                have_opt $CACHED
                return 0
        fi
@@ -311,13 +313,14 @@ check_macro()
                fi
        done
        echo -n "Testing for $MACRO: "
+       echo >> config.log
+       echo "Testing for $MACRO: " >> config.log
        check_cache "macro" "$MACRO"
        if [ $? -eq 0 ] ; then
                echo "-> OK (cached)"
+               echo "-> OK (cached)" >> config.log
                return 0
        fi
-       echo >> config.log
-       echo "Testing for $MACRO: " >> config.log
        echo "#include <sys/types.h>" > autotest.c
        for inc in $INCLUDE ; do
                echo "#include \"$inc\"" >> autotest.c
@@ -367,6 +370,7 @@ check_lib()
        if [ $? -eq 0 ] ; then
                LIBS="$LIBS -l$LIB"
                echo "-> OK (cached)"
+               echo "-> OK (cached)" >> config.log
                return 0
        fi
        echo "#include <sys/types.h>" > autotest.c
@@ -411,17 +415,18 @@ check_package()
                fi
        done
        echo -n "Testing for $PACKAGE: "
+       echo >> config.log
+       echo "Testing for $PACKAGE: " >> config.log
        check_cache "package" "$PACKAGE"
        if [ $? -eq 0 ] ; then
                echo "-> OK (cached)"
+               echo "-> OK (cached)" >> config.log
                _p_cflags=`echo $CACHED | cut -d ':' -f 1`
                _p_libs=`echo $CACHED | cut -d ':' -f 2`
                LIBS="$LIBS $_p_libs"
                CFLAGS="$CFLAGS $_p_cflags"
                return 0
        fi
-       echo >> config.log
-       echo "Testing for $PACKAGE: " >> config.log
        echo "#include <sys/types.h>" > autotest.c
        if [ "F$INCLUDE" != "F" ] ; then
                for inc in $INCLUDE ; do
@@ -460,6 +465,7 @@ check_perl()
        check_cache "perl" "$PERL"
        if [ $? -eq 0 ] ; then
                echo "-> OK (cached)"
+               echo "-> OK (cached)" >> config.log
                _p_cflags=`echo $CACHED | cut -d ':' -f 1`
                _p_libs=`echo $CACHED | cut -d ':' -f 2`
                PERLLDFLAGS="$LIBS $_p_libs"
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
new file mode 100644 (file)
index 0000000..f78e209
--- /dev/null
@@ -0,0 +1,69 @@
+/*
+ * OSB tokenizer
+ */
+
+#include <sys/types.h>
+#include "tokenizers.h"
+
+
+/*
+ * Multipliers used by the OSB tokenizer to mix pairs of word hashes.
+ * osb_tokenize_text() always reads primes[0] and primes[1] for the newest
+ * word, and primes[i << 1] / primes[(i << 1) - 1] for window slot i.
+ * Despite the name, several entries are not prime (51 = 3 * 17,
+ * 203 = 7 * 29, ...); the values must simply stay as they are.
+ */
+static const int primes[] = {
+       1, 7, 3, 13, 5, 29, 11, 51, 23, 101,
+       47, 203, 97, 407, 197, 817, 397, 1637, 797, 3277
+};
+
+/*
+ * Split input into words and build a list of OSB (orthogonal sparse bigram)
+ * features from them.
+ *
+ * The hashes of the FEATURE_WINDOW_SIZE most recent words are kept in
+ * hashpipe[], newest at index 0.  For every word the newest hash is paired
+ * with each older slot i (1 <= i < FEATURE_WINDOW_SIZE); every pair yields
+ * one list node carrying two hashes (h1, h2) built with different
+ * multipliers from primes[].  Slots that no word has reached yet still hold
+ * the 0xABCDEF seed.
+ *
+ * pool:  memory pool the list nodes are allocated from
+ * input: text to tokenize
+ *
+ * Returns the head of a NULL-terminated token list, or NULL when
+ * get_next_word() did not produce a single word.
+ */
+token_list_t *
+osb_tokenize_text (memory_pool_t *pool, f_str_t *input)
+{
+       token_list_t *new = NULL, *head = NULL, *last = NULL;
+       f_str_t token = { NULL, 0, 0 };
+       uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
+       int i;
+
+       /* Seed every slot so that the first words still have partners */
+       for (i = 0; i < FEATURE_WINDOW_SIZE; i ++) {
+               hashpipe[i] = 0xABCDEF;
+       }
+
+       while (get_next_word (input, &token)) {
+               /* Shift hashpipe: the oldest hash falls out, slot 0 becomes free */
+               for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
+                       hashpipe[i] = hashpipe[i - 1];
+               }
+               hashpipe[0] = fstrhash (&token);
+
+               /*
+                * Start at 1: slot 0 is the current word itself, and with i == 0
+                * the expression (i << 1) - 1 would index primes[-1], i.e. read
+                * outside the table.  Run up to the end of the window so that no
+                * shifted slot is left unused.
+                */
+               for (i = 1; i < FEATURE_WINDOW_SIZE; i ++) {
+                       h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
+                       h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
+
+                       new = memory_pool_alloc (pool, sizeof (token_list_t));
+                       new->h1 = h1;
+                       new->h2 = h2;
+                       /* Keep the list terminated after every append */
+                       new->next = NULL;
+                       if (last) {
+                               last->next = new;
+                       }
+                       else {
+                               head = new;
+                       }
+                       last = new;
+
+                       msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2);
+               }
+       }
+
+       return head;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
new file mode 100644 (file)
index 0000000..132a57c
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * Common tokenization functions
+ */
+
+#include <sys/types.h>
+#include "tokenizers.h"
+
+/*
+ * Get next word from specified f_str_t buf.
+ *
+ * A word is a maximal run of characters accepted by g_ascii_isgraph().
+ * `token` carries the scan position between calls: a token whose begin is
+ * NULL starts at buf->begin, otherwise the scan resumes right after the
+ * previously returned word (token->begin + token->len).
+ *
+ * Returns `token` with begin/len describing the next word, or NULL when buf
+ * or token is NULL or no further word is left in buf.  On a NULL return the
+ * token is not advanced.
+ */
+f_str_t *
+get_next_word (f_str_t *buf, f_str_t *token)
+{
+       size_t consumed, remain;
+       char *pos;
+
+       if (buf == NULL || token == NULL) {
+               return NULL;
+       }
+
+       if (token->begin == NULL) {
+               token->begin = buf->begin;
+               token->len = 0;
+       }
+
+       /* Bytes already consumed, including the previously returned word */
+       consumed = (size_t) (token->begin - buf->begin) + token->len;
+       if (consumed >= buf->len) {
+               return NULL;
+       }
+       remain = buf->len - consumed;
+       pos = token->begin + token->len;
+
+       /* Skip non graph symbols; check the bound before touching *pos */
+       while (remain > 0 && !g_ascii_isgraph (*pos)) {
+               pos ++;
+               remain --;
+       }
+       if (remain == 0) {
+               /* Only separators were left, so there is no word to return */
+               return NULL;
+       }
+
+       /* pos now sits on the first character of the word: measure it */
+       token->begin = pos;
+       token->len = 0;
+       while (remain > 0 && g_ascii_isgraph (*pos)) {
+               pos ++;
+               remain --;
+               token->len ++;
+       }
+
+       return token;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
new file mode 100644 (file)
index 0000000..6b4bff5
--- /dev/null
@@ -0,0 +1,29 @@
+#ifndef TOKENIZERS_H
+#define TOKENIZERS_H
+
+#include <sys/types.h>
+#include "../config.h"
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#include "../mem_pool.h"
+#include "../fstring.h"
+#include "../main.h"
+
+/* Number of word hashes kept in the feature window (hashpipe in osb.c) */
+#define FEATURE_WINDOW_SIZE 5
+
+/* One node of the singly linked list of features produced by a tokenizer */
+struct token_list_s {
+       uint32_t h1;                    /* first hash of the feature */
+       uint32_t h2;                    /* second hash of the feature */
+       struct token_list_s *next;      /* following node, NULL at the end of the list */
+};
+typedef struct token_list_s token_list_t;
+
+/*
+ * Get next word from specified f_str_t buf; implemented in tokenizers.c.
+ * token holds the scan position between calls and must have begin == NULL
+ * before the first call (osb.c starts from { NULL, 0, 0 }).
+ * Returns token, or NULL when buf is NULL or nothing is left to scan.
+ */
+f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
+
+#endif
+/*
+ * vi:ts=4
+ */