From: Vsevolod Stakhov Date: Mon, 24 Jan 2011 17:45:54 +0000 (+0300) Subject: * Many fixes to fuzzy hashes logic and tokenization. X-Git-Tag: 0.3.7~75 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=76b69f300d8372969b6143e3e269376229d03edf;p=rspamd.git * Many fixes to fuzzy hashes logic and tokenization. --- diff --git a/src/fuzzy.c b/src/fuzzy.c index 286f1696d..61ef5647e 100644 --- a/src/fuzzy.c +++ b/src/fuzzy.c @@ -32,6 +32,8 @@ #define MIN_FUZZY_BLOCK_SIZE 3 #define HASH_INIT 0x28021967 +static const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + struct roll_state { guint32 h[3]; gchar window[ROLL_WINDOW_SIZE]; @@ -86,6 +88,7 @@ fuzzy_blocksize (guint32 len) return g_spaced_primes_closest (len / FUZZY_HASHLEN); } + /* Update hash with new symbol */ void fuzzy_update (fuzzy_hash_t * h, gchar c) @@ -94,7 +97,7 @@ fuzzy_update (fuzzy_hash_t * h, gchar c) h->h = fuzzy_fnv_hash (c, h->h); if (h->rh % h->block_size == (h->block_size - 1)) { - h->hash_pipe[h->hi] = h->h; + h->hash_pipe[h->hi] = b64[h->h % 64]; if (h->hi < FUZZY_HASHLEN - 2) { h->h = HASH_INIT; h->hi++; @@ -226,11 +229,27 @@ fuzzy_init (f_str_t * in, memory_pool_t * pool) { fuzzy_hash_t *new; gint i, repeats = 0; - gchar *c = in->begin, last = '\0'; + gchar *c = in->begin, last = '\0'; + gsize real_len = 0; new = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t)); bzero (&rs, sizeof (rs)); - new->block_size = fuzzy_blocksize (in->len); + for (i = 0; i < in->len; i++) { + if (*c == last) { + repeats++; + } + else { + repeats = 0; + } + if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) { + real_len ++; + } + last = *c; + c++; + } + + new->block_size = fuzzy_blocksize (real_len); + c = in->begin; for (i = 0; i < in->len; i++) { if (*c == last) { @@ -246,6 +265,11 @@ fuzzy_init (f_str_t * in, memory_pool_t * pool) c++; } + /* Check whether we have more bytes in a rolling window */ + if (new->rh != 0) { + new->hash_pipe[new->hi] = b64[new->h % 64]; + } + return new; } diff --git a/src/fuzzy.h b/src/fuzzy.h index 8db2779d1..b5b3856e6 100644 --- a/src/fuzzy.h +++ b/src/fuzzy.h @@ -13,7 +13,7 @@ #define FUZZY_HASHLEN 64 typedef struct fuzzy_hash_s { - gchar hash_pipe[FUZZY_HASHLEN]; /**< result hash */ + gchar hash_pipe[FUZZY_HASHLEN]; /**< result hash */ guint32 block_size; /**< current blocksize */ guint32 rh; /**< roll hash value */ guint32 h; /**< hash of block */ diff --git a/src/html.c b/src/html.c index 42ed9dbfa..64ebe362e 100644 --- a/src/html.c +++ b/src/html.c @@ -839,12 +839,13 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_ new = construct_html_node (pool, tag_text, tag_len); if (new == NULL) { debug_task ("cannot construct HTML node for text '%s'", tag_text); - return -1; + return FALSE; } data = new->data; if (data->tag && (data->tag->id == Tag_A || data->tag->id == Tag_IMG) && ((data->flags & FL_CLOSING) == 0)) { parse_tag_url (task, part, data->tag->id, tag_text, tag_len); } + if (data->flags & FL_CLOSING) { if (!*cur_level) { debug_task ("bad parent node"); @@ -857,10 +858,15 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_ } } else { + g_node_append (*cur_level, new); if ((data->flags & FL_CLOSED) == 0) { *cur_level = new; } + /* Skip some tags */ + if (data->tag->id == Tag_STYLE || data->tag->id == Tag_SCRIPT || data->tag->id == Tag_OBJECT) { + return FALSE; + } } } diff --git a/src/message.c b/src/message.c index 60072d45d..8e8b8feb0 100644 --- a/src/message.c +++ b/src/message.c @@ -42,6 +42,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex gint state = 0; GByteArray *buf; GNode *level_ptr = NULL; + gboolean erase = FALSE; if (stateptr) state = *stateptr; @@ -80,7 +81,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex br++; } } - else if (state == 0) { + else if (state == 0 && !erase) { *(rp++) = c; } break; @@ -92,7 +93,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex br--; } } - else if (state == 0) { + else if (state == 0 && !erase) { *(rp++) = c; } break; @@ -111,7 +112,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex case 1: /* HTML/XML */ lc = '>'; in_q = state = 0; - add_html_node (task, pool, part, tbegin, p - tbegin - 1, &level_ptr); + erase = !add_html_node (task, pool, part, tbegin, p - tbegin - 1, &level_ptr); break; case 2: /* PHP */ @@ -134,7 +135,9 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex break; default: - *(rp++) = c; + if (!erase) { + *(rp++) = c; + } break; } break; @@ -149,7 +152,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex lc = c; } } - else if (state == 0) { + else if (state == 0 && !erase) { *(rp++) = c; } if (state && p != src->data && *(p - 1) != '\\' && (!in_q || *p == in_q)) { @@ -169,7 +172,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex lc = c; } else { - if (state == 0) { + if (state == 0 && !erase) { *(rp++) = c; } } @@ -218,7 +221,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex /* fall-through */ default: reg_char: - if (state == 0) { + if (state == 0 && !erase) { *(rp++) = c; } break; diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c index 4e1f8f61c..ce6599e86 100644 --- a/src/plugins/regexp.c +++ b/src/plugins/regexp.c @@ -725,13 +725,15 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar } if (g_regex_match_full (regexp, ct, clen, 0, 0, NULL, &err) == TRUE) { if (G_UNLIKELY (re->is_test)) { - msg_info ("process test regexp %s for mime part returned TRUE", re->regexp_text); + msg_info ("process test regexp %s for mime part of length %d returned TRUE", re->regexp_text, + (gint)clen); } task_cache_add (task, re, 1); return 1; } else if (G_UNLIKELY (re->is_test)) { - msg_info ("process test regexp %s for mime part of length %d returned FALSE", re->regexp_text, (gint)part->orig->len); + msg_info ("process test regexp %s for mime part of length %d returned FALSE", re->regexp_text, + (gint)clen); } if (err != NULL) { msg_info ("error occured while processing regexp \"%s\": %s", re->regexp_text, err->message); diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index ab073a28c..5e3d39c50 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -47,6 +47,35 @@ const int primes[] = { 797, 3277, }; +const gchar t_delimiters[255] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, + 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0 +}; + struct tokenizer * get_tokenizer (char *name) { @@ -78,7 +107,7 @@ f_str_t * get_next_word (f_str_t * buf, f_str_t * token) { size_t remain; - unsigned char *pos; + guchar *pos; if (buf == NULL) { return NULL; @@ -95,13 +124,13 @@ get_next_word (f_str_t * buf, f_str_t * token) return NULL; } pos = token->begin; - /* Skip non graph symbols */ - while (remain > 0 && (!g_ascii_isgraph (*pos) && *pos < 127)) { + /* Skip non delimiters symbols */ + while (remain > 0 && t_delimiters[*pos]) { token->begin++; pos++; remain--; } - while (remain > 0 && (g_ascii_isgraph (*pos) || *pos > 127)) { + while (remain > 0 && !t_delimiters[*pos]) { token->len++; pos++; remain--;