aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2011-01-24 20:45:54 +0300
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2011-01-24 20:45:54 +0300
commit76b69f300d8372969b6143e3e269376229d03edf (patch)
treed9c4dc4bfed5635869f2c9d83e9ebb94d00903a1 /src
parentb0d0a4ce50733ce162ce9738da2d416497f98763 (diff)
downloadrspamd-76b69f300d8372969b6143e3e269376229d03edf.tar.gz
rspamd-76b69f300d8372969b6143e3e269376229d03edf.zip
* Many fixes to fuzzy hashes logic and tokenization.
Diffstat (limited to 'src')
-rw-r--r--src/fuzzy.c30
-rw-r--r--src/fuzzy.h2
-rw-r--r--src/html.c8
-rw-r--r--src/message.c17
-rw-r--r--src/plugins/regexp.c6
-rw-r--r--src/tokenizers/tokenizers.c37
6 files changed, 82 insertions, 18 deletions
diff --git a/src/fuzzy.c b/src/fuzzy.c
index 286f1696d..61ef5647e 100644
--- a/src/fuzzy.c
+++ b/src/fuzzy.c
@@ -32,6 +32,8 @@
#define MIN_FUZZY_BLOCK_SIZE 3
#define HASH_INIT 0x28021967
+static const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
struct roll_state {
guint32 h[3];
gchar window[ROLL_WINDOW_SIZE];
@@ -86,6 +88,7 @@ fuzzy_blocksize (guint32 len)
return g_spaced_primes_closest (len / FUZZY_HASHLEN);
}
+
/* Update hash with new symbol */
void
fuzzy_update (fuzzy_hash_t * h, gchar c)
@@ -94,7 +97,7 @@ fuzzy_update (fuzzy_hash_t * h, gchar c)
h->h = fuzzy_fnv_hash (c, h->h);
if (h->rh % h->block_size == (h->block_size - 1)) {
- h->hash_pipe[h->hi] = h->h;
+ h->hash_pipe[h->hi] = b64[h->h % 64];
if (h->hi < FUZZY_HASHLEN - 2) {
h->h = HASH_INIT;
h->hi++;
@@ -226,11 +229,27 @@ fuzzy_init (f_str_t * in, memory_pool_t * pool)
{
fuzzy_hash_t *new;
gint i, repeats = 0;
- gchar *c = in->begin, last = '\0';
+ gchar *c = in->begin, last = '\0';
+ gsize real_len = 0;
new = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t));
bzero (&rs, sizeof (rs));
- new->block_size = fuzzy_blocksize (in->len);
+ for (i = 0; i < in->len; i++) {
+ if (*c == last) {
+ repeats++;
+ }
+ else {
+ repeats = 0;
+ }
+ if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) {
+ real_len ++;
+ }
+ last = *c;
+ c++;
+ }
+
+ new->block_size = fuzzy_blocksize (real_len);
+ c = in->begin;
for (i = 0; i < in->len; i++) {
if (*c == last) {
@@ -246,6 +265,11 @@ fuzzy_init (f_str_t * in, memory_pool_t * pool)
c++;
}
+ /* Check whether we have more bytes in a rolling window */
+ if (new->rh != 0) {
+ new->hash_pipe[new->hi] = b64[new->h % 64];
+ }
+
return new;
}
diff --git a/src/fuzzy.h b/src/fuzzy.h
index 8db2779d1..b5b3856e6 100644
--- a/src/fuzzy.h
+++ b/src/fuzzy.h
@@ -13,7 +13,7 @@
#define FUZZY_HASHLEN 64
typedef struct fuzzy_hash_s {
- gchar hash_pipe[FUZZY_HASHLEN]; /**< result hash */
+ gchar hash_pipe[FUZZY_HASHLEN]; /**< result hash */
guint32 block_size; /**< current blocksize */
guint32 rh; /**< roll hash value */
guint32 h; /**< hash of block */
diff --git a/src/html.c b/src/html.c
index 42ed9dbfa..64ebe362e 100644
--- a/src/html.c
+++ b/src/html.c
@@ -839,12 +839,13 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_
new = construct_html_node (pool, tag_text, tag_len);
if (new == NULL) {
debug_task ("cannot construct HTML node for text '%s'", tag_text);
- return -1;
+ return FALSE;
}
data = new->data;
if (data->tag && (data->tag->id == Tag_A || data->tag->id == Tag_IMG) && ((data->flags & FL_CLOSING) == 0)) {
parse_tag_url (task, part, data->tag->id, tag_text, tag_len);
}
+
if (data->flags & FL_CLOSING) {
if (!*cur_level) {
debug_task ("bad parent node");
@@ -857,10 +858,15 @@ add_html_node (struct worker_task *task, memory_pool_t * pool, struct mime_text_
}
}
else {
+
g_node_append (*cur_level, new);
if ((data->flags & FL_CLOSED) == 0) {
*cur_level = new;
}
+ /* Skip some tags */
+ if (data->tag->id == Tag_STYLE || data->tag->id == Tag_SCRIPT || data->tag->id == Tag_OBJECT) {
+ return FALSE;
+ }
}
}
diff --git a/src/message.c b/src/message.c
index 60072d45d..8e8b8feb0 100644
--- a/src/message.c
+++ b/src/message.c
@@ -42,6 +42,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
gint state = 0;
GByteArray *buf;
GNode *level_ptr = NULL;
+ gboolean erase = FALSE;
if (stateptr)
state = *stateptr;
@@ -80,7 +81,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
br++;
}
}
- else if (state == 0) {
+ else if (state == 0 && !erase) {
*(rp++) = c;
}
break;
@@ -92,7 +93,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
br--;
}
}
- else if (state == 0) {
+ else if (state == 0 && !erase) {
*(rp++) = c;
}
break;
@@ -111,7 +112,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
case 1: /* HTML/XML */
lc = '>';
in_q = state = 0;
- add_html_node (task, pool, part, tbegin, p - tbegin - 1, &level_ptr);
+ erase = !add_html_node (task, pool, part, tbegin, p - tbegin - 1, &level_ptr);
break;
case 2: /* PHP */
@@ -134,7 +135,9 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
break;
default:
- *(rp++) = c;
+ if (!erase) {
+ *(rp++) = c;
+ }
break;
}
break;
@@ -149,7 +152,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
lc = c;
}
}
- else if (state == 0) {
+ else if (state == 0 && !erase) {
*(rp++) = c;
}
if (state && p != src->data && *(p - 1) != '\\' && (!in_q || *p == in_q)) {
@@ -169,7 +172,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
lc = c;
}
else {
- if (state == 0) {
+ if (state == 0 && !erase) {
*(rp++) = c;
}
}
@@ -218,7 +221,7 @@ strip_html_tags (struct worker_task *task, memory_pool_t * pool, struct mime_tex
/* fall-through */
default:
reg_char:
- if (state == 0) {
+ if (state == 0 && !erase) {
*(rp++) = c;
}
break;
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c
index 4e1f8f61c..ce6599e86 100644
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -725,13 +725,15 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar
}
if (g_regex_match_full (regexp, ct, clen, 0, 0, NULL, &err) == TRUE) {
if (G_UNLIKELY (re->is_test)) {
- msg_info ("process test regexp %s for mime part returned TRUE", re->regexp_text);
+ msg_info ("process test regexp %s for mime part of length %d returned TRUE", re->regexp_text,
+ (gint)clen);
}
task_cache_add (task, re, 1);
return 1;
}
else if (G_UNLIKELY (re->is_test)) {
- msg_info ("process test regexp %s for mime part of length %d returned FALSE", re->regexp_text, (gint)part->orig->len);
+ msg_info ("process test regexp %s for mime part of length %d returned FALSE", re->regexp_text,
+ (gint)clen);
}
if (err != NULL) {
msg_info ("error occured while processing regexp \"%s\": %s", re->regexp_text, err->message);
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index ab073a28c..5e3d39c50 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -47,6 +47,35 @@ const int primes[] = {
797, 3277,
};
+/*
+ * Lookup table of token delimiters, indexed by byte value: a non-zero entry
+ * marks a byte that separates words.  The table needs 256 entries because
+ * get_next_word () indexes it with a guchar, whose value can be as high as
+ * 255; with only 255 entries byte 0xFF would read past the end of the array.
+ * Space and '!' are flagged as delimiters so that words are split on blanks.
+ */
+const gchar t_delimiters[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /*   0 -   9: TAB */
+ 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, /*  10 -  19: LF, CR */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*  20 -  29 */
+ 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, /*  30 -  39: space ! " # $ % & */
+ 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, /*  40 -  49: ( ) * + , . / */
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, /*  50 -  59: : ; */
+ 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /*  60 -  69: < = > ? */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*  70 -  79 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*  80 -  89 */
+ 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, /*  90 -  99: [ \ ] ^ _ ` */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 100 - 109 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 110 - 119 */
+ 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, /* 120 - 129: { | } ~ */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 130 - 139 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 140 - 149 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 150 - 159 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160 - 169 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 170 - 179 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 180 - 189 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 190 - 199 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 200 - 209 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 210 - 219 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 220 - 229 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 230 - 239 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240 - 249 */
+ 0, 0, 0, 0, 0, 0              /* 250 - 255 */
+};
+
struct tokenizer *
get_tokenizer (char *name)
{
@@ -78,7 +107,7 @@ f_str_t *
get_next_word (f_str_t * buf, f_str_t * token)
{
size_t remain;
- unsigned char *pos;
+ guchar *pos;
if (buf == NULL) {
return NULL;
@@ -95,13 +124,13 @@ get_next_word (f_str_t * buf, f_str_t * token)
return NULL;
}
pos = token->begin;
- /* Skip non graph symbols */
- while (remain > 0 && (!g_ascii_isgraph (*pos) && *pos < 127)) {
+ /* Skip leading delimiter symbols */
+ while (remain > 0 && t_delimiters[*pos]) {
token->begin++;
pos++;
remain--;
}
- while (remain > 0 && (g_ascii_isgraph (*pos) || *pos > 127)) {
+ while (remain > 0 && !t_delimiters[*pos]) {
token->len++;
pos++;
remain--;