From: Vsevolod Stakhov <vsevolod@rambler-co.ru>
Date: Mon, 24 Jan 2011 17:45:54 +0000 (+0300)
Subject: * Many fixes to fuzzy hashes logic and tokenization.
X-Git-Tag: 0.3.7~75
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=76b69f300d8372969b6143e3e269376229d03edf;p=rspamd.git

* Many fixes to fuzzy hashes logic and tokenization.
---

diff --git a/src/fuzzy.c b/src/fuzzy.c
index 286f1696d..61ef5647e 100644
--- a/src/fuzzy.c
+++ b/src/fuzzy.c
@@ -32,6 +32,8 @@
 #define MIN_FUZZY_BLOCK_SIZE 3
 #define HASH_INIT      0x28021967
 
+static const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";	/* 64-symbol alphabet; block hashes are stored as b64[h % 64] */
+
 struct roll_state {
 	guint32                         h[3];
 	gchar                           window[ROLL_WINDOW_SIZE];
@@ -86,6 +88,7 @@ fuzzy_blocksize (guint32 len)
 	return g_spaced_primes_closest (len / FUZZY_HASHLEN);
 }
 
+
 /* Update hash with new symbol */
 void
 fuzzy_update (fuzzy_hash_t * h, gchar c)
@@ -94,7 +97,7 @@ fuzzy_update (fuzzy_hash_t * h, gchar c)
 	h->h = fuzzy_fnv_hash (c, h->h);
 
 	if (h->rh % h->block_size == (h->block_size - 1)) {
-		h->hash_pipe[h->hi] = h->h;
+		h->hash_pipe[h->hi] = b64[h->h % 64];
 		if (h->hi < FUZZY_HASHLEN - 2) {
 			h->h = HASH_INIT;
 			h->hi++;
@@ -226,11 +229,27 @@ fuzzy_init (f_str_t * in, memory_pool_t * pool)
 {
 	fuzzy_hash_t                   *new;
 	gint                            i, repeats = 0;
-	gchar                           *c = in->begin, last = '\0';
+	gchar                          *c = in->begin, last = '\0';
+	gsize                           real_len = 0;
 
 	new = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t));
 	bzero (&rs, sizeof (rs));
-	new->block_size = fuzzy_blocksize (in->len);
+	for (i = 0; i < in->len; i++) {
+		if (*c == last) {
+			repeats++;
+		}
+		else {
+			repeats = 0;
+		}
+		if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) {
+			real_len ++;
+		}
+		last = *c;
+		c++;
+	}
+	new->block_size = fuzzy_blocksize (real_len);
+	/* Rewind the scan state: the hashing pass below must filter input exactly like the counting pass above */
+	c = in->begin; last = '\0'; repeats = 0;
 
 	for (i = 0; i < in->len; i++) {
 		if (*c == last) {
@@ -246,6 +265,11 @@ fuzzy_init (f_str_t * in, memory_pool_t * pool)
 		c++;
 	}
 
+	/* Check whether we have more bytes in a rolling window */
+	if (new->rh != 0) {
+		new->hash_pipe[new->hi] = b64[new->h % 64];
+	}
+
 	return new;
 }
 
diff --git a/src/fuzzy.h b/src/fuzzy.h
index 8db2779d1..b5b3856e6 100644
--- a/src/fuzzy.h
+++ b/src/fuzzy.h
@@ -13,7 +13,7 @@
 #define FUZZY_HASHLEN 64
 
 typedef struct fuzzy_hash_s {
-	gchar hash_pipe[FUZZY_HASHLEN];			/**< result hash					*/
+	gchar hash_pipe[FUZZY_HASHLEN];		/**< result hash					*/
 	guint32 block_size;					/**< current blocksize				*/
 	guint32 rh;							/**< roll hash value				*/
 	guint32 h;								/**< hash of block					*/
diff --git a/src/html.c b/src/html.c
index 42ed9dbfa..64ebe362e 100644
--- a/src/html.c
+++ b/src/html.c
@@ -839,12 +839,13 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_
 		new = construct_html_node (pool, tag_text, tag_len);
 		if (new == NULL) {
 			debug_task ("cannot construct HTML node for text '%s'", tag_text);
-			return -1;
+			return FALSE;
 		}
 		data = new->data;
 		if (data->tag && (data->tag->id == Tag_A || data->tag->id == Tag_IMG) && ((data->flags & FL_CLOSING) == 0)) {
 			parse_tag_url (task, part, data->tag->id, tag_text, tag_len);
 		}
+
 		if (data->flags & FL_CLOSING) {
 			if (!*cur_level) {
 				debug_task ("bad parent node");
@@ -857,10 +858,15 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_
 			}
 		}
 		else {
+
 			g_node_append (*cur_level, new);
 			if ((data->flags & FL_CLOSED) == 0) {
 				*cur_level = new;
 			}
+			/* Return FALSE for STYLE/SCRIPT/OBJECT so strip_html_tags erases their text; data->tag may be NULL (it is NULL-checked before parse_tag_url above) */
+			if (data->tag && (data->tag->id == Tag_STYLE || data->tag->id == Tag_SCRIPT || data->tag->id == Tag_OBJECT)) {
+				return FALSE;
+			}
 		}
 	}
 
diff --git a/src/message.c b/src/message.c
index 60072d45d..8e8b8feb0 100644
--- a/src/message.c
+++ b/src/message.c
@@ -42,6 +42,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
 	gint                            state = 0;
 	GByteArray                     *buf;
 	GNode                          *level_ptr = NULL;
+	gboolean                        erase = FALSE;
 
 	if (stateptr)
 		state = *stateptr;
@@ -80,7 +81,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
 					br++;
 				}
 			}
-			else if (state == 0) {
+			else if (state == 0 && !erase) {
 				*(rp++) = c;
 			}
 			break;
@@ -92,7 +93,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
 					br--;
 				}
 			}
-			else if (state == 0) {
+			else if (state == 0 && !erase) {
 				*(rp++) = c;
 			}
 			break;
@@ -111,7 +112,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
 			case 1:			/* HTML/XML */
 				lc = '>';
 				in_q = state = 0;
-				add_html_node (task, pool, part, tbegin, p - tbegin - 1, &level_ptr);
+				erase = !add_html_node (task, pool, part, tbegin, p - tbegin - 1, &level_ptr);
 				break;
 
 			case 2:			/* PHP */
@@ -134,7 +135,9 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
 				break;
 
 			default:
-				*(rp++) = c;
+				if (!erase) {
+					*(rp++) = c;
+				}
 				break;
 			}
 			break;
@@ -149,7 +152,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
 					lc = c;
 				}
 			}
-			else if (state == 0) {
+			else if (state == 0 && !erase) {
 				*(rp++) = c;
 			}
 			if (state && p != src->data && *(p - 1) != '\\' && (!in_q || *p == in_q)) {
@@ -169,7 +172,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
 				lc = c;
 			}
 			else {
-				if (state == 0) {
+				if (state == 0 && !erase) {
 					*(rp++) = c;
 				}
 			}
@@ -218,7 +221,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
 			/* fall-through */
 		default:
 		  reg_char:
-			if (state == 0) {
+			if (state == 0 && !erase) {
 				*(rp++) = c;
 			}
 			break;
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c
index 4e1f8f61c..ce6599e86 100644
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -725,13 +725,15 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar
 			}
 			if (g_regex_match_full (regexp, ct, clen, 0, 0, NULL, &err) == TRUE) {
 				if (G_UNLIKELY (re->is_test)) {
-					msg_info ("process test regexp %s for mime part returned TRUE", re->regexp_text);
+					msg_info ("process test regexp %s for mime part of length %d returned TRUE", re->regexp_text,
+							(gint)clen);
 				}
 				task_cache_add (task, re, 1);
 				return 1;
 			}
 			else if (G_UNLIKELY (re->is_test)) {
-				msg_info ("process test regexp %s for mime part of length %d returned FALSE", re->regexp_text, (gint)part->orig->len);
+				msg_info ("process test regexp %s for mime part of length %d returned FALSE", re->regexp_text,
+						(gint)clen);
 			}
 			if (err != NULL) {
 				msg_info ("error occured while processing regexp \"%s\": %s", re->regexp_text, err->message);
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index ab073a28c..5e3d39c50 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -47,6 +47,35 @@ const int                       primes[] = {
 	797, 3277,
 };
 
+const gchar t_delimiters[256] = {	/* 1 = token delimiter; 256 slots so every guchar index is in range */
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+		1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 1, 0, 1, 1, 1, 1, 1, 0,	/* 30-39: space (32) is a delimiter, as it was for the g_ascii_isgraph test */
+		1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
+		1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0	/* 250-254; index 255 is implicitly zero-initialized */
+};
+
 struct tokenizer               *
 get_tokenizer (char *name)
 {
@@ -78,7 +107,7 @@ f_str_t                        *
 get_next_word (f_str_t * buf, f_str_t * token)
 {
 	size_t                          remain;
-	unsigned char                  *pos;
+	guchar                         *pos;
 
 	if (buf == NULL) {
 		return NULL;
@@ -95,13 +124,13 @@ get_next_word (f_str_t * buf, f_str_t * token)
 		return NULL;
 	}
 	pos = token->begin;
-	/* Skip non graph symbols */
-	while (remain > 0 && (!g_ascii_isgraph (*pos) && *pos < 127)) {
+	/* Skip leading delimiter symbols */
+	while (remain > 0 && t_delimiters[*pos]) {
 		token->begin++;
 		pos++;
 		remain--;
 	}
-	while (remain > 0 && (g_ascii_isgraph (*pos) || *pos > 127)) {
+	while (remain > 0 && !t_delimiters[*pos]) {
 		token->len++;
 		pos++;
 		remain--;