source.dussan.org Git - rspamd.git/commitdiff
Rework words tokenization.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 23 Dec 2014 14:09:01 +0000 (14:09 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 23 Dec 2014 14:09:01 +0000 (14:09 +0000)
src/plugins/fuzzy_check.c
src/tokenizers/tokenizers.c
src/tokenizers/tokenizers.h

index 7f70ae2d1d0822c02884c1d4571beb6bfff7f83f..bce8298ee133e970905bc8b96fd2bbe0509ea463 100644 (file)
@@ -1155,14 +1155,16 @@ register_fuzzy_controller_call (struct rspamd_http_connection_entry *entry,
        struct fuzzy_learn_session *s;
        struct upstream *selected;
        gint sock;
+       gboolean ret = FALSE;
 
        /* Get upstream */
-       selected = rspamd_upstream_get (rule->servers, RSPAMD_UPSTREAM_ROUND_ROBIN);
-       if (selected) {
+
+       while ((selected = rspamd_upstream_get (rule->servers,
+                       RSPAMD_UPSTREAM_SEQUENTIAL))) {
                /* Create UDP socket */
                if ((sock = rspamd_inet_address_connect (rspamd_upstream_addr (selected),
                                SOCK_DGRAM, TRUE)) == -1) {
-                       return FALSE;
+                       rspamd_upstream_fail (selected);
                }
                else {
                        s =
@@ -1183,11 +1185,11 @@ register_fuzzy_controller_call (struct rspamd_http_connection_entry *entry,
                        rspamd_http_connection_ref (entry->conn);
                        event_add (&s->ev, &s->tv);
                        (*saved)++;
-                       return TRUE;
+                       ret = TRUE;
                }
        }
 
-       return FALSE;
+       return ret;
 }
 
 static void
index 2adc86ff9a500c9e377f1c8d7e35d8a262d8a02a..01456a11bd2d9cea622bc47beb89ac96e817cecd 100644 (file)
@@ -176,86 +176,47 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GLi
        return p;
 }
 
-/* Struct to access gmime headers */
-struct raw_header {
-       struct raw_header *next;
-       char *name;
-       char *value;
-};
-
-typedef struct _GMimeHeader {
-       GHashTable *hash;
-       GHashTable *writers;
-       struct raw_header *headers;
-} local_GMimeHeader;
-
-int
-tokenize_headers (rspamd_mempool_t * pool,
-       struct rspamd_task *task,
-       GTree ** tree)
+GArray *
+rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
+               gsize min_len, GList **exceptions)
 {
-       token_node_t *new = NULL;
-       rspamd_fstring_t headername;
-       rspamd_fstring_t headervalue;
+       rspamd_fstring_t token, buf;
+       gchar *pos;
+       gsize l;
+       GArray *res;
 
-       if (*tree == NULL) {
-               *tree = g_tree_new (token_node_compare_func);
-               rspamd_mempool_add_destructor (pool,
-                       (rspamd_mempool_destruct_t) g_tree_destroy,
-                       *tree);
+       if (len == 0 || text == NULL) {
+               return NULL;
        }
-#ifndef GMIME24
-       struct raw_header *h;
-
-       h = GMIME_OBJECT (task->message)->headers->headers;
-       while (h) {
-               if (h->name && h->value) {
-                       new = rspamd_mempool_alloc (pool, sizeof (token_node_t));
-                       headername.begin = h->name;
-                       headername.len = strlen (h->name);
-                       headervalue.begin = h->value;
-                       headervalue.len = strlen (h->value);
-                       new->h1 = rspamd_fstrhash (&headername) * primes[0];
-                       new->h2 = rspamd_fstrhash (&headervalue) * primes[1];
-                       if (g_tree_lookup (*tree, new) == NULL) {
-                               g_tree_insert (*tree, new, new);
-                       }
+
+       buf.begin = text;
+       buf.len = len;
+       buf.size = buf.len;
+       token.begin = NULL;
+       token.len = 0;
+
+       res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
+       while ((pos = rspamd_tokenizer_get_word (&buf,
+                       &token, exceptions)) != NULL) {
+               if (is_utf) {
+                       l = g_utf8_strlen (token.begin, token.len);
                }
-               h = h->next;
-       }
-#else
-       GMimeHeaderList *ls;
-       GMimeHeaderIter *iter;
-       const char *name;
-       const char *value;
-
-       ls = GMIME_OBJECT (task->message)->headers;
-       iter = g_mime_header_iter_new ();
-
-       if (g_mime_header_list_get_iter (ls, iter)) {
-               while (g_mime_header_iter_is_valid (iter)) {
-                       new = rspamd_mempool_alloc (pool, sizeof (token_node_t));
-                       name = g_mime_header_iter_get_name (iter);
-                       value = g_mime_header_iter_get_value (iter);
-                       headername.begin = (u_char *)name;
-                       headername.len = strlen (name);
-                       headervalue.begin = (u_char *)value;
-                       headervalue.len = strlen (value);
-                       new->h1 = rspamd_fstrhash (&headername) * primes[0];
-                       new->h2 = rspamd_fstrhash (&headervalue) * primes[1];
-                       if (g_tree_lookup (*tree, new) == NULL) {
-                               g_tree_insert (*tree, new, new);
-                       }
-                       if (!g_mime_header_iter_next (iter)) {
-                               break;
-                       }
+               else {
+                       l = token.len;
                }
+               if (min_len > 0 && l < min_len) {
+                       token.begin = pos;
+                       continue;
+               }
+               g_array_append_val (res, token);
+
+               token.begin = pos;
        }
-       g_mime_header_iter_free (iter);
-#endif
-       return TRUE;
+
+       return res;
 }
 
+
 void
 tokenize_subject (struct rspamd_task *task, GTree ** tree)
 {
index efce5b3075f173115b1cabc330c1691185b54590..51446b09d814546c2388b86758bb0f783541a3d6 100644 (file)
@@ -27,11 +27,18 @@ struct tokenizer {
 
 /* Compare two token nodes */
 int token_node_compare_func (gconstpointer a, gconstpointer b);
+
 /* Get tokenizer structure by name or return NULL if this name is not found */
 struct tokenizer * get_tokenizer (const char *name);
+
 /* Get next word from specified f_str_t buf */
 gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf,
                rspamd_fstring_t *token, GList **exceptions);
+
+/* Tokenize text into array of words (rspamd_fstring_t type) */
+GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
+               gsize min_len, GList **exceptions);
+
 /* OSB tokenize function */
 int osb_tokenize_text (struct tokenizer *tokenizer,
        rspamd_mempool_t *pool,
@@ -40,10 +47,7 @@ int osb_tokenize_text (struct tokenizer *tokenizer,
        gboolean save_token,
        gboolean is_utf,
        GList *exceptions);
-/* Common tokenizer for headers */
-int tokenize_headers (rspamd_mempool_t *pool,
-       struct rspamd_task *task,
-       GTree **cur);
+
 /* Make tokens for a subject */
 void tokenize_subject (struct rspamd_task *task, GTree ** tree);