summary | refs | log | tree | commit | diff | stats
path: root/src/tokenizers/tokenizers.h
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-06-03 20:23:13 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-06-03 20:23:13 +0400
commit: 92de380c2c5e8ce7073ce979df4e5c7868e52bb6 (patch)
tree: 27be3202d27f129f3d94d90298a4d1e0ecf2c281 /src/tokenizers/tokenizers.h
parent: 83a9452974ec2f9c7be262a77e54a1ea2557c795 (diff)
download: rspamd-92de380c2c5e8ce7073ce979df4e5c7868e52bb6.tar.gz
download: rspamd-92de380c2c5e8ce7073ce979df4e5c7868e52bb6.zip
* Skip short utf words in statistics
Diffstat (limited to 'src/tokenizers/tokenizers.h')
-rw-r--r-- src/tokenizers/tokenizers.h 4
1 files changed, 2 insertions, 2 deletions
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
index 741753328..df5481a1f 100644
--- a/src/tokenizers/tokenizers.h
+++ b/src/tokenizers/tokenizers.h
@@ -24,7 +24,7 @@ typedef struct token_node_s {
/* Common tokenizer structure */
struct tokenizer {
char *name;
- int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token);
+ int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
};
@@ -35,7 +35,7 @@ struct tokenizer* get_tokenizer (char *name);
/* Get next word from specified f_str_t buf */
f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
/* OSB tokenize function */
-int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token);
+int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf);
/* Common tokenizer for headers */
int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur);
/* Make tokens for a subject */