summary | refs | log | tree | commit | diff | stats
path: root/src/tokenizers
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-07-09 20:45:11 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-07-09 20:45:11 +0400
commit: 2234daebbb352b444b322d43cc6c1093f0ce949c (patch)
tree: 320131facabccd4f5aa3eddc465bc50a707b2b00 /src/tokenizers
parent: 19baadf6a0e6b2554de67b674a2c6f30efda13bb (diff)
download: rspamd-2234daebbb352b444b322d43cc6c1093f0ce949c.tar.gz
download: rspamd-2234daebbb352b444b322d43cc6c1093f0ce949c.zip
* Make autolearn work
Diffstat (limited to 'src/tokenizers')
-rw-r--r-- src/tokenizers/osb.c 10
-rw-r--r-- src/tokenizers/tokenizers.c 8
2 files changed, 11 insertions, 7 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 32d6b902a..d2a1fe22f 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -29,6 +29,8 @@
#include <sys/types.h>
#include "tokenizers.h"
+/* Minimum length of token */
+#define MIN_LEN 4
extern const int primes[];
@@ -36,7 +38,7 @@ int
osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **tree)
{
token_node_t *new = NULL;
- f_str_t token = { NULL, 0, 0 };
+ f_str_t token = { NULL, 0, 0 }, *res;
uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
int i;
@@ -52,7 +54,11 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
msg_debug ("osb_tokenize_text: got input length: %zd", input->len);
- while (tokenizer->get_next_word (input, &token)) {
+ while ((res = tokenizer->get_next_word (input, &token)) != NULL) {
+ /* Skip small words */
+ if (token.len < MIN_LEN) {
+ continue;
+ }
/* Shift hashpipe */
for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
hashpipe[i] = hashpipe[i - 1];
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 4527e699c..7db1af12c 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -78,12 +78,11 @@ f_str_t *
get_next_word (f_str_t *buf, f_str_t *token)
{
size_t remain;
- char *pos;
+ unsigned char *pos;
if (buf == NULL) {
return NULL;
}
-
if (token->begin == NULL) {
token->begin = buf->begin;
}
@@ -95,15 +94,14 @@ get_next_word (f_str_t *buf, f_str_t *token)
if (remain <= 0) {
return NULL;
}
-
pos = token->begin;
/* Skip non graph symbols */
- while (remain > 0 && !g_ascii_isgraph (*pos)) {
+ while (remain > 0 && (!g_ascii_isgraph (*pos) && *pos < 127)) {
token->begin ++;
pos ++;
remain --;
}
- while (remain > 0 && g_ascii_isgraph (*pos)) {
+ while (remain > 0 && (g_ascii_isgraph (*pos) || *pos > 127)) {
token->len ++;
pos ++;
remain --;