about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-12-23 14:09:01 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-12-23 14:09:01 +0000
commit: 074de730239d987adcf08a92baa3f4b65e7d63d9 (patch)
tree: 935cebf738eb138aa8ffae979bdc4245337974e5
parent: ba06982886382ec25a53b1672fe04d6b97a7a461 (diff)
download: rspamd-074de730239d987adcf08a92baa3f4b65e7d63d9.tar.gz
download: rspamd-074de730239d987adcf08a92baa3f4b65e7d63d9.zip
Rework words tokenization.
-rw-r--r--  src/plugins/fuzzy_check.c      12
-rw-r--r--  src/tokenizers/tokenizers.c   105
-rw-r--r--  src/tokenizers/tokenizers.h    12
3 files changed, 48 insertions, 81 deletions
diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c
index 7f70ae2d1..bce8298ee 100644
--- a/src/plugins/fuzzy_check.c
+++ b/src/plugins/fuzzy_check.c
@@ -1155,14 +1155,16 @@ register_fuzzy_controller_call (struct rspamd_http_connection_entry *entry,
struct fuzzy_learn_session *s;
struct upstream *selected;
gint sock;
+ gboolean ret = FALSE;
/* Get upstream */
- selected = rspamd_upstream_get (rule->servers, RSPAMD_UPSTREAM_ROUND_ROBIN);
- if (selected) {
+
+ while ((selected = rspamd_upstream_get (rule->servers,
+ RSPAMD_UPSTREAM_SEQUENTIAL))) {
/* Create UDP socket */
if ((sock = rspamd_inet_address_connect (rspamd_upstream_addr (selected),
SOCK_DGRAM, TRUE)) == -1) {
- return FALSE;
+ rspamd_upstream_fail (selected);
}
else {
s =
@@ -1183,11 +1185,11 @@ register_fuzzy_controller_call (struct rspamd_http_connection_entry *entry,
rspamd_http_connection_ref (entry->conn);
event_add (&s->ev, &s->tv);
(*saved)++;
- return TRUE;
+ ret = TRUE;
}
}
- return FALSE;
+ return ret;
}
static void
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 2adc86ff9..01456a11b 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -176,86 +176,47 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GLi
return p;
}
-/* Struct to access gmime headers */
-struct raw_header {
- struct raw_header *next;
- char *name;
- char *value;
-};
-
-typedef struct _GMimeHeader {
- GHashTable *hash;
- GHashTable *writers;
- struct raw_header *headers;
-} local_GMimeHeader;
-
-int
-tokenize_headers (rspamd_mempool_t * pool,
- struct rspamd_task *task,
- GTree ** tree)
+GArray *
+rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
+ gsize min_len, GList **exceptions)
{
- token_node_t *new = NULL;
- rspamd_fstring_t headername;
- rspamd_fstring_t headervalue;
+ rspamd_fstring_t token, buf;
+ gchar *pos;
+ gsize l;
+ GArray *res;
- if (*tree == NULL) {
- *tree = g_tree_new (token_node_compare_func);
- rspamd_mempool_add_destructor (pool,
- (rspamd_mempool_destruct_t) g_tree_destroy,
- *tree);
+ if (len == 0 || text == NULL) {
+ return NULL;
}
-#ifndef GMIME24
- struct raw_header *h;
-
- h = GMIME_OBJECT (task->message)->headers->headers;
- while (h) {
- if (h->name && h->value) {
- new = rspamd_mempool_alloc (pool, sizeof (token_node_t));
- headername.begin = h->name;
- headername.len = strlen (h->name);
- headervalue.begin = h->value;
- headervalue.len = strlen (h->value);
- new->h1 = rspamd_fstrhash (&headername) * primes[0];
- new->h2 = rspamd_fstrhash (&headervalue) * primes[1];
- if (g_tree_lookup (*tree, new) == NULL) {
- g_tree_insert (*tree, new, new);
- }
+
+ buf.begin = text;
+ buf.len = len;
+ buf.size = buf.len;
+ token.begin = NULL;
+ token.len = 0;
+
+ res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
+ while ((pos = rspamd_tokenizer_get_word (&buf,
+ &token, exceptions)) != NULL) {
+ if (is_utf) {
+ l = g_utf8_strlen (token.begin, token.len);
}
- h = h->next;
- }
-#else
- GMimeHeaderList *ls;
- GMimeHeaderIter *iter;
- const char *name;
- const char *value;
-
- ls = GMIME_OBJECT (task->message)->headers;
- iter = g_mime_header_iter_new ();
-
- if (g_mime_header_list_get_iter (ls, iter)) {
- while (g_mime_header_iter_is_valid (iter)) {
- new = rspamd_mempool_alloc (pool, sizeof (token_node_t));
- name = g_mime_header_iter_get_name (iter);
- value = g_mime_header_iter_get_value (iter);
- headername.begin = (u_char *)name;
- headername.len = strlen (name);
- headervalue.begin = (u_char *)value;
- headervalue.len = strlen (value);
- new->h1 = rspamd_fstrhash (&headername) * primes[0];
- new->h2 = rspamd_fstrhash (&headervalue) * primes[1];
- if (g_tree_lookup (*tree, new) == NULL) {
- g_tree_insert (*tree, new, new);
- }
- if (!g_mime_header_iter_next (iter)) {
- break;
- }
+ else {
+ l = token.len;
}
+ if (min_len > 0 && l < min_len) {
+ token.begin = pos;
+ continue;
+ }
+ g_array_append_val (res, token);
+
+ token.begin = pos;
}
- g_mime_header_iter_free (iter);
-#endif
- return TRUE;
+
+ return res;
}
+
void
tokenize_subject (struct rspamd_task *task, GTree ** tree)
{
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
index efce5b307..51446b09d 100644
--- a/src/tokenizers/tokenizers.h
+++ b/src/tokenizers/tokenizers.h
@@ -27,11 +27,18 @@ struct tokenizer {
/* Compare two token nodes */
int token_node_compare_func (gconstpointer a, gconstpointer b);
+
/* Get tokenizer structure by name or return NULL if this name is not found */
struct tokenizer * get_tokenizer (const char *name);
+
/* Get next word from specified f_str_t buf */
gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf,
rspamd_fstring_t *token, GList **exceptions);
+
+/* Tokenize text into array of words (rspamd_fstring_t type) */
+GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
+ gsize min_len, GList **exceptions);
+
/* OSB tokenize function */
int osb_tokenize_text (struct tokenizer *tokenizer,
rspamd_mempool_t *pool,
@@ -40,10 +47,7 @@ int osb_tokenize_text (struct tokenizer *tokenizer,
gboolean save_token,
gboolean is_utf,
GList *exceptions);
-/* Common tokenizer for headers */
-int tokenize_headers (rspamd_mempool_t *pool,
- struct rspamd_task *task,
- GTree **cur);
+
/* Make tokens for a subject */
void tokenize_subject (struct rspamd_task *task, GTree ** tree);