aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@rambler-co.ru> 2008-11-07 19:35:13 +0300
committer Vsevolod Stakhov <vsevolod@rambler-co.ru> 2008-11-07 19:35:13 +0300
commit 62cdcc73c4f817516cbcb20e9e5bfad556bea4b7 (patch)
tree a6ab9b606bf4e44e405fe9ee4dae983938c9cee4
parent 2175980532791f90807eb03ef99d6f7006ada4e6 (diff)
download rspamd-62cdcc73c4f817516cbcb20e9e5bfad556bea4b7.tar.gz
rspamd-62cdcc73c4f817516cbcb20e9e5bfad556bea4b7.zip
* Add simple implementation of OSB tokenizer
-rwxr-xr-x configure 20
-rw-r--r-- src/tokenizers/osb.c 69
-rw-r--r-- src/tokenizers/tokenizers.c 45
-rw-r--r-- src/tokenizers/tokenizers.h 29
4 files changed, 156 insertions, 7 deletions
diff --git a/configure b/configure
index a36fc05eb..2cd2af6f9 100755
--- a/configure
+++ b/configure
@@ -24,7 +24,7 @@ CACHE="config.cache"
SRCDIR="src"
OBJDIR="src/.obj"
-SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
+SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
MODULES="surbl regexp"
CFLAGS="$CFLAGS -W -Wpointer-arith -Wno-unused-parameter"
@@ -228,13 +228,14 @@ check_function()
fi
done
echo -n "Testing for $FUNCTION: "
+ echo >> config.log
+ echo "Testing for $FUNCTION: " >> config.log
check_cache "function" "$FUNCTION"
if [ $? -eq 0 ] ; then
echo "-> OK (cached)"
+ echo "-> OK (cached)" >> config.log
return 0
fi
- echo >> config.log
- echo "Testing for $FUNCTION: " >> config.log
echo "#include <sys/types.h>" > autotest.c
if [ "F$INCLUDE" != "F" ] ; then
for inc in $INCLUDE ; do
@@ -271,6 +272,7 @@ check_include()
check_cache "include" "$INCLUDE"
if [ $? -eq 0 ] ; then
echo "-> OK (cached)"
+ echo "-> OK (cached)" >> config.log
have_opt $CACHED
return 0
fi
@@ -311,13 +313,14 @@ check_macro()
fi
done
echo -n "Testing for $MACRO: "
+ echo >> config.log
+ echo "Testing for $MACRO: " >> config.log
check_cache "macro" "$MACRO"
if [ $? -eq 0 ] ; then
echo "-> OK (cached)"
+ echo "-> OK (cached)" >> config.log
return 0
fi
- echo >> config.log
- echo "Testing for $MACRO: " >> config.log
echo "#include <sys/types.h>" > autotest.c
for inc in $INCLUDE ; do
echo "#include \"$inc\"" >> autotest.c
@@ -367,6 +370,7 @@ check_lib()
if [ $? -eq 0 ] ; then
LIBS="$LIBS -l$LIB"
echo "-> OK (cached)"
+ echo "-> OK (cached)" >> config.log
return 0
fi
echo "#include <sys/types.h>" > autotest.c
@@ -411,17 +415,18 @@ check_package()
fi
done
echo -n "Testing for $PACKAGE: "
+ echo >> config.log
+ echo "Testing for $PACKAGE: " >> config.log
check_cache "package" "$PACKAGE"
if [ $? -eq 0 ] ; then
echo "-> OK (cached)"
+ echo "-> OK (cached)" >> config.log
_p_cflags=`echo $CACHED | cut -d ':' -f 1`
_p_libs=`echo $CACHED | cut -d ':' -f 2`
LIBS="$LIBS $_p_libs"
CFLAGS="$CFLAGS $_p_cflags"
return 0
fi
- echo >> config.log
- echo "Testing for $PACKAGE: " >> config.log
echo "#include <sys/types.h>" > autotest.c
if [ "F$INCLUDE" != "F" ] ; then
for inc in $INCLUDE ; do
@@ -460,6 +465,7 @@ check_perl()
check_cache "perl" "$PERL"
if [ $? -eq 0 ] ; then
echo "-> OK (cached)"
+ echo "-> OK (cached)" >> config.log
_p_cflags=`echo $CACHED | cut -d ':' -f 1`
_p_libs=`echo $CACHED | cut -d ':' -f 2`
PERLLDFLAGS="$LIBS $_p_libs"
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
new file mode 100644
index 000000000..f78e20992
--- /dev/null
+++ b/src/tokenizers/osb.c
@@ -0,0 +1,69 @@
+/*
+ * OSB tokenizer
+ */
+
+#include <sys/types.h>
+#include "tokenizers.h"
+
+
+/*
+ * Coefficients that are used for OSB tokenizer.
+ *
+ * osb_tokenize_text() multiplies the hash of the newest word by primes[0]
+ * (for h1) and primes[1] (for h2), and the hash found at pipe offset i by
+ * primes[i << 1] (for h1) and primes[(i << 1) - 1] (for h2).
+ *
+ * NOTE(review): despite the name, not every entry is a prime number
+ * (e.g. 1, and 51 = 3 * 17); the values are only used as multipliers.
+ */
+static const int primes[] = {
+ 1, 7,
+ 3, 13,
+ 5, 29,
+ 11, 51,
+ 23, 101,
+ 47, 203,
+ 97, 407,
+ 197, 817,
+ 397, 1637,
+ 797, 3277,
+};
+
+/*
+ * Split input into words and build a list of OSB (orthogonal sparse
+ * bigram) features from them.
+ *
+ * For every word returned by get_next_word() the word hash is pushed into
+ * a sliding window of the last FEATURE_WINDOW_SIZE word hashes.  The newest
+ * word is then paired with each older slot of the window and every pair
+ * yields one list node carrying two hashes (h1, h2).
+ *
+ * pool:  memory pool all list nodes are allocated from
+ * input: text to tokenize
+ *
+ * Returns the head of a NULL-terminated singly linked list, or NULL when
+ * no words were found in input.
+ */
+token_list_t *
+osb_tokenize_text (memory_pool_t *pool, f_str_t *input)
+{
+	token_list_t *node = NULL, *head = NULL, *last = NULL;
+	f_str_t token = { NULL, 0, 0 };
+	uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
+	int i;
+
+	/*
+	 * Fill every slot of the pipe with the same constant, so that features
+	 * produced before the window has seen FEATURE_WINDOW_SIZE words pair
+	 * real words with a fixed placeholder instead of uninitialized data.
+	 */
+	for (i = 0; i < FEATURE_WINDOW_SIZE; i ++) {
+		hashpipe[i] = 0xABCDEF;
+	}
+
+	while (get_next_word (input, &token)) {
+		/* Shift hashpipe: slot 0 is the newest word, higher slots are older */
+		for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
+			hashpipe[i] = hashpipe[i - 1];
+		}
+		hashpipe[0] = fstrhash (&token);
+
+		/*
+		 * Pair the newest word with every older slot.  The loop must start
+		 * at 1: for i == 0 the index (i << 1) - 1 would be -1, i.e. a read
+		 * before the start of primes[], and the word would only be paired
+		 * with itself.  The largest index used is
+		 * (FEATURE_WINDOW_SIZE - 1) << 1, which is inside the table.
+		 * Arithmetic is done in uint32_t and wraps on overflow by design.
+		 */
+		for (i = 1; i < FEATURE_WINDOW_SIZE; i ++) {
+			h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
+			h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
+
+			node = memory_pool_alloc (pool, sizeof (token_list_t));
+			node->h1 = h1;
+			node->h2 = h2;
+			/* Keep the list terminated at every step */
+			node->next = NULL;
+
+			if (last) {
+				last->next = node;
+			}
+			else {
+				head = node;
+			}
+			last = node;
+
+			msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2);
+		}
+	}
+
+	return head;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
new file mode 100644
index 000000000..132a57ce0
--- /dev/null
+++ b/src/tokenizers/tokenizers.c
@@ -0,0 +1,45 @@
+/*
+ * Common tokenization functions
+ */
+
+#include <sys/types.h>
+#include "tokenizers.h"
+
+/*
+ * Get next word from specified f_str_t buf.
+ *
+ * A word is a maximal run of characters for which g_ascii_isgraph() is
+ * true; everything else is treated as a separator.
+ *
+ * buf:   text being scanned
+ * token: iteration state and result.  Pass it with begin == NULL and
+ *        len == 0 for the first call and hand the same object back
+ *        unchanged for every following call; the scan resumes right after
+ *        the previously returned word.
+ *
+ * Returns token, with begin/len describing the next word inside buf, or
+ * NULL when buf or token is NULL, buf has no data, or no further word
+ * exists.  token is left untouched when NULL is returned.
+ */
+f_str_t *
+get_next_word (f_str_t *buf, f_str_t *token)
+{
+	size_t consumed, remain;
+	char *pos;
+
+	if (buf == NULL || token == NULL || buf->begin == NULL) {
+		return NULL;
+	}
+
+	if (token->begin == NULL) {
+		token->begin = buf->begin;
+	}
+
+	/*
+	 * The previous word occupies [token->begin, token->begin + token->len),
+	 * so the number of bytes already consumed includes token->len.  Compare
+	 * before subtracting: remain is unsigned and must never wrap around.
+	 */
+	consumed = (size_t)(token->begin - buf->begin) + token->len;
+	if (consumed >= buf->len) {
+		return NULL;
+	}
+	remain = buf->len - consumed;
+	pos = token->begin + token->len;
+
+	/* Skip non graph symbols; look at a byte only while it is in bounds */
+	while (remain > 0 && !g_ascii_isgraph (*pos)) {
+		pos ++;
+		remain --;
+	}
+	if (remain == 0) {
+		/* Only separators were left: report end of input, not an empty word */
+		return NULL;
+	}
+
+	/* pos is at the first character of the word; count the whole run */
+	token->begin = pos;
+	token->len = 0;
+	while (remain > 0 && g_ascii_isgraph (*pos)) {
+		pos ++;
+		remain --;
+		token->len ++;
+	}
+
+	return token;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
new file mode 100644
index 000000000..6b4bff5e0
--- /dev/null
+++ b/src/tokenizers/tokenizers.h
@@ -0,0 +1,29 @@
+#ifndef TOKENIZERS_H
+#define TOKENIZERS_H
+
+#include <sys/types.h>
+#include "../config.h"
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#include "../mem_pool.h"
+#include "../fstring.h"
+#include "../main.h"
+
+/* Size for features pipe */
+#define FEATURE_WINDOW_SIZE 5
+
+typedef struct token_list_s { /* Node of a singly linked list of features, one pair of hashes per node */
+ uint32_t h1; /* first hash of the feature */
+ uint32_t h2; /* second hash of the feature */
+ struct token_list_s *next; /* following node; NULL marks the end of the list */
+} token_list_t;
+
+/* Get next word from specified f_str_t buf */
+f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
+
+#endif
+/*
+ * vi:ts=4
+ */