From 074de730239d987adcf08a92baa3f4b65e7d63d9 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 23 Dec 2014 14:09:01 +0000 Subject: [PATCH] Rework words tokenization. --- src/plugins/fuzzy_check.c | 12 +++-- src/tokenizers/tokenizers.c | 105 ++++++++++++------------------------ src/tokenizers/tokenizers.h | 12 +++-- 3 files changed, 48 insertions(+), 81 deletions(-) diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index 7f70ae2d1..bce8298ee 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -1155,14 +1155,16 @@ register_fuzzy_controller_call (struct rspamd_http_connection_entry *entry, struct fuzzy_learn_session *s; struct upstream *selected; gint sock; + gboolean ret = FALSE; /* Get upstream */ - selected = rspamd_upstream_get (rule->servers, RSPAMD_UPSTREAM_ROUND_ROBIN); - if (selected) { + + while ((selected = rspamd_upstream_get (rule->servers, + RSPAMD_UPSTREAM_SEQUENTIAL))) { /* Create UDP socket */ if ((sock = rspamd_inet_address_connect (rspamd_upstream_addr (selected), SOCK_DGRAM, TRUE)) == -1) { - return FALSE; + rspamd_upstream_fail (selected); } else { s = @@ -1183,11 +1185,11 @@ register_fuzzy_controller_call (struct rspamd_http_connection_entry *entry, rspamd_http_connection_ref (entry->conn); event_add (&s->ev, &s->tv); (*saved)++; - return TRUE; + ret = TRUE; } } - return FALSE; + return ret; } static void diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index 2adc86ff9..01456a11b 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -176,86 +176,47 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GLi return p; } -/* Struct to access gmime headers */ -struct raw_header { - struct raw_header *next; - char *name; - char *value; -}; - -typedef struct _GMimeHeader { - GHashTable *hash; - GHashTable *writers; - struct raw_header *headers; -} local_GMimeHeader; - -int -tokenize_headers (rspamd_mempool_t * pool, - struct rspamd_task *task, - GTree ** tree) +GArray * +rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, + gsize min_len, GList **exceptions) { - token_node_t *new = NULL; - rspamd_fstring_t headername; - rspamd_fstring_t headervalue; + rspamd_fstring_t token, buf; + gchar *pos; + gsize l; + GArray *res; - if (*tree == NULL) { - *tree = g_tree_new (token_node_compare_func); - rspamd_mempool_add_destructor (pool, - (rspamd_mempool_destruct_t) g_tree_destroy, - *tree); + if (len == 0 || text == NULL) { + return NULL; } -#ifndef GMIME24 - struct raw_header *h; - - h = GMIME_OBJECT (task->message)->headers->headers; - while (h) { - if (h->name && h->value) { - new = rspamd_mempool_alloc (pool, sizeof (token_node_t)); - headername.begin = h->name; - headername.len = strlen (h->name); - headervalue.begin = h->value; - headervalue.len = strlen (h->value); - new->h1 = rspamd_fstrhash (&headername) * primes[0]; - new->h2 = rspamd_fstrhash (&headervalue) * primes[1]; - if (g_tree_lookup (*tree, new) == NULL) { - g_tree_insert (*tree, new, new); - } + + buf.begin = text; + buf.len = len; + buf.size = buf.len; + token.begin = NULL; + token.len = 0; + + res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t)); + while ((pos = rspamd_tokenizer_get_word (&buf, + &token, exceptions)) != NULL) { + if (is_utf) { + l = g_utf8_strlen (token.begin, token.len); } - h = h->next; - } -#else - GMimeHeaderList *ls; - GMimeHeaderIter *iter; - const char *name; - const char *value; - - ls = GMIME_OBJECT (task->message)->headers; - iter = g_mime_header_iter_new (); - - if (g_mime_header_list_get_iter (ls, iter)) { - while (g_mime_header_iter_is_valid (iter)) { - new = rspamd_mempool_alloc (pool, sizeof (token_node_t)); - name = g_mime_header_iter_get_name (iter); - value = g_mime_header_iter_get_value (iter); - headername.begin = (u_char *)name; - headername.len = strlen (name); - headervalue.begin = (u_char *)value; - headervalue.len = strlen (value); - new->h1 = rspamd_fstrhash (&headername) * primes[0]; - new->h2 = rspamd_fstrhash (&headervalue) * primes[1]; - if (g_tree_lookup (*tree, new) == NULL) { - g_tree_insert (*tree, new, new); - } - if (!g_mime_header_iter_next (iter)) { - break; - } + else { + l = token.len; } + if (min_len > 0 && l < min_len) { + token.begin = pos; + continue; + } + g_array_append_val (res, token); + + token.begin = pos; } - g_mime_header_iter_free (iter); -#endif - return TRUE; + + return res; } + void tokenize_subject (struct rspamd_task *task, GTree ** tree) { diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h index efce5b307..51446b09d 100644 --- a/src/tokenizers/tokenizers.h +++ b/src/tokenizers/tokenizers.h @@ -27,11 +27,18 @@ struct tokenizer { /* Compare two token nodes */ int token_node_compare_func (gconstpointer a, gconstpointer b); + /* Get tokenizer structure by name or return NULL if this name is not found */ struct tokenizer * get_tokenizer (const char *name); + /* Get next word from specified f_str_t buf */ gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions); + +/* Tokenize text into array of words (rspamd_fstring_t type) */ +GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, + gsize min_len, GList **exceptions); + /* OSB tokenize function */ int osb_tokenize_text (struct tokenizer *tokenizer, rspamd_mempool_t *pool, @@ -40,10 +47,7 @@ int osb_tokenize_text (struct tokenizer *tokenizer, gboolean save_token, gboolean is_utf, GList *exceptions); -/* Common tokenizer for headers */ -int tokenize_headers (rspamd_mempool_t *pool, - struct rspamd_task *task, - GTree **cur); + /* Make tokens for a subject */ void tokenize_subject (struct rspamd_task *task, GTree ** tree); -- 2.39.5