aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2009-03-23 14:10:07 +0300
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2009-03-23 14:10:07 +0300
commitc79b5ccd22cbc1c273479f4f88189a18effda533 (patch)
tree1741743779a70146a61cd1767936aa43d671e36b /src
parentafdaddc4d0745a5bcefad73dd74fd4c03ae3de15 (diff)
downloadrspamd-c79b5ccd22cbc1c273479f4f88189a18effda533.tar.gz
rspamd-c79b5ccd22cbc1c273479f4f88189a18effda533.zip
* Fix error in expression parser that caused failures on expressions that have a regexp at the end
* Improve test for fuzzy hashes * Add new object - TextPart to perl XS library that allows access to stripped parts and fuzzy hashes * Add documentation for expressions parser and for Mail::Rspamd::TextPart * Always calculate fuzzy hash for text parts * Store text parts separately from other parts * Add compare_parts_distance for expressions that calculates the difference between the 2 text parts of a message * Do not try to substitute variables in empty strings
Diffstat (limited to 'src')
-rw-r--r--src/cfg_utils.c5
-rw-r--r--src/expressions.c64
-rw-r--r--src/filter.c12
-rw-r--r--src/fuzzy.c13
-rw-r--r--src/fuzzy.h1
-rw-r--r--src/main.h1
-rw-r--r--src/message.c19
-rw-r--r--src/message.h8
8 files changed, 110 insertions, 13 deletions
diff --git a/src/cfg_utils.c b/src/cfg_utils.c
index 1eeb518ed..037f23754 100644
--- a/src/cfg_utils.c
+++ b/src/cfg_utils.c
@@ -355,6 +355,11 @@ substitute_variable (struct config_file *cfg, char *str, u_char recursive)
char *var, *new, *v_begin, *v_end;
size_t len;
+ if (str == NULL) {
+ yywarn ("substitute_variable: trying to substitute variable in NULL string");
+ return NULL;
+ }
+
while ((v_begin = strstr (str, "${")) != NULL) {
len = strlen (str);
*v_begin = '\0';
diff --git a/src/expressions.c b/src/expressions.c
index 5cb30e4c3..eefd11f78 100644
--- a/src/expressions.c
+++ b/src/expressions.c
@@ -26,12 +26,15 @@
#include "util.h"
#include "cfg_file.h"
#include "main.h"
+#include "message.h"
+#include "fuzzy.h"
#include "expressions.h"
typedef gboolean (*rspamd_internal_func_t)(struct worker_task *, GList *args);
gboolean rspamd_compare_encoding (struct worker_task *task, GList *args);
gboolean rspamd_header_exists (struct worker_task *task, GList *args);
+gboolean rspamd_parts_distance (struct worker_task *task, GList *args);
/*
* List of internal functions of rspamd
* Sorted by name to use bsearch
@@ -41,6 +44,7 @@ static struct _fl {
rspamd_internal_func_t func;
} rspamd_functions_list[] = {
{ "compare_encoding", rspamd_compare_encoding },
+ { "compare_parts_distance", rspamd_parts_distance },
{ "header_exists", rspamd_header_exists },
};
@@ -273,7 +277,9 @@ parse_expression (memory_pool_t *pool, char *line)
case READ_REGEXP:
if (*p == '/' && *(p - 1) != '\\') {
- p ++;
+ if (*(p + 1)) {
+ p ++;
+ }
state = READ_REGEXP_FLAGS;
}
else {
@@ -285,14 +291,17 @@ parse_expression (memory_pool_t *pool, char *line)
if (!is_regexp_flag (*p) || *(p + 1) == '\0') {
if (c != p) {
/* Copy operand */
- str = memory_pool_alloc (pool, p - c + 3);
- g_strlcpy (str, c - 1, (p - c + 3));
+ if (*(p + 1) == '\0') {
+ p++;
+ }
+ str = memory_pool_alloc (pool, p - c + 2);
+ g_strlcpy (str, c - 1, (p - c + 2));
g_strstrip (str);
if (strlen (str) > 0) {
insert_expression (pool, &expr, EXPR_REGEXP, 0, str);
}
}
- c = ++p;
+ c = p;
state = SKIP_SPACES;
}
else {
@@ -594,5 +603,52 @@ rspamd_header_exists (struct worker_task *task, GList *args)
}
/*
+ * This function is designed to find difference between text/html and text/plain parts
+ * It takes one argument: difference threshold, if we have two text parts, compare
+ * its hashes and check for threshold, if value is greater than threshold, return TRUE
+ * and return FALSE otherwise.
+ */
+gboolean
+rspamd_parts_distance (struct worker_task *task, GList *args)
+{
+ int threshold;
+ struct mime_text_part *p1, *p2;
+ GList *cur;
+
+ if (args == NULL) {
+ msg_debug ("rspamd_parts_distance: no threshold is specified, assume it 100");
+ threshold = 100;
+ }
+ else {
+ errno = 0;
+ threshold = strtoul ((char *)args->data, NULL, 10);
+ if (errno != 0) {
+ msg_info ("rspamd_parts_distance: bad numeric value for threshold \"%s\", assume it 100", (char *)args->data);
+ threshold = 100;
+ }
+ }
+
+ if (g_list_length (task->text_parts) == 2) {
+ cur = g_list_first (task->text_parts);
+ p1 = cur->data;
+ cur = g_list_next (cur);
+ if (cur == NULL) {
+ msg_info ("rspamd_parts_distance: bad parts list");
+ return FALSE;
+ }
+ p2 = cur->data;
+ if (fuzzy_compare_hashes (p1->fuzzy, p2->fuzzy) >= threshold) {
+ return TRUE;
+ }
+ }
+ else {
+ msg_debug ("rspamd_parts_distance: message has too many text parts, so do not try to compare them with each other");
+ return FALSE;
+ }
+
+ return FALSE;
+}
+
+/*
* vi:ts=4
*/
diff --git a/src/filter.c b/src/filter.c
index 766cd16e4..1b6cdc1b0 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -437,10 +437,10 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
struct classifier *classifier;
struct statfile_result_data *res_data;
struct metric *metric;
+ struct mime_text_part *text_part;
GTree *tokens = NULL;
- GList *cur = NULL;
- GByteArray *content;
+ GList *cur;
char *filename;
f_str_t c;
@@ -457,10 +457,12 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
return;
}
+ cur = g_list_first (task->text_parts);
if ((tokens = g_hash_table_lookup (data->tokens, st->tokenizer)) == NULL) {
- while ((content = get_next_text_part (task->task_pool, task->parts, &cur)) != NULL) {
- c.begin = content->data;
- c.len = content->len;
+ while (cur != NULL) {
+ text_part = (struct mime_text_part *)cur->data;
+ c.begin = text_part->content->data;
+ c.len = text_part->content->len;
/* Tree would be freed at task pool freeing */
if (!st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, &c, &tokens)) {
msg_info ("statfiles_callback: cannot tokenize input");
diff --git a/src/fuzzy.c b/src/fuzzy.c
index 08814eaa1..4dfec3fb7 100644
--- a/src/fuzzy.c
+++ b/src/fuzzy.c
@@ -95,7 +95,7 @@ fuzzy_update (fuzzy_hash_t *h, char c)
if (h->rh % h->block_size == (h->block_size - 1)) {
h->hash_pipe[h->hi] = h->h;
- if (h->hi < FUZZY_HASHLEN - 1) {
+ if (h->hi < FUZZY_HASHLEN - 2) {
h->h = HASH_INIT;
h->hi ++;
}
@@ -249,6 +249,17 @@ fuzzy_init (f_str_t *in, memory_pool_t *pool)
return new;
}
+fuzzy_hash_t *
+fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool)
+{
+ f_str_t f;
+
+ f.begin = in->data;
+ f.len = in->len;
+
+ return fuzzy_init (&f, pool);
+}
+
/* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */
int
fuzzy_compare_hashes (fuzzy_hash_t *h1, fuzzy_hash_t *h2)
diff --git a/src/fuzzy.h b/src/fuzzy.h
index 91e6512c6..50d1a9110 100644
--- a/src/fuzzy.h
+++ b/src/fuzzy.h
@@ -27,6 +27,7 @@ typedef struct fuzzy_hash_s {
* @return fuzzy_hash object allocated in pool
*/
fuzzy_hash_t * fuzzy_init (f_str_t *in, memory_pool_t *pool);
+fuzzy_hash_t * fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool);
/**
* Compare score of difference between two hashes
diff --git a/src/main.h b/src/main.h
index 28eb64297..e26ab3fda 100644
--- a/src/main.h
+++ b/src/main.h
@@ -174,6 +174,7 @@ struct worker_task {
int parts_count; /**< mime parts count */
GMimeMessage *message; /**< message, parsed with GMime */
GList *parts; /**< list of parsed parts */
+ GList *text_parts; /**< list of text parts */
char *raw_headers; /**< list of raw headers */
TAILQ_HEAD (uriq, uri) urls; /**< list of parsed urls */
GHashTable *results; /**< hash table of metric_result indexed by
diff --git a/src/message.c b/src/message.c
index 76743f7de..807463a82 100644
--- a/src/message.c
+++ b/src/message.c
@@ -242,6 +242,7 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
{
struct worker_task *task = (struct worker_task *)user_data;
struct mime_part *mime_part;
+ struct mime_text_part *text_part;
GMimeContentType *type;
GMimeDataWrapper *wrapper;
GMimeStream *part_stream;
@@ -302,13 +303,27 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
mime_part->content = part_content;
msg_debug ("mime_foreach_callback: found part with content-type: %s/%s", type->type, type->subtype);
task->parts = g_list_prepend (task->parts, mime_part);
- if (g_mime_content_type_is_type (type, "text", "html")) {
+ /* Now do special processing for text parts of message */
+ if (g_mime_content_type_is_type (type, "text", "html") || g_mime_content_type_is_type (type, "text", "xhtml")) {
msg_debug ("mime_foreach_callback: got urls from text/html part");
url_parse_html (task, part_content);
+
+ text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
+ text_part->content = strip_html_tags (part_content, NULL);
+ text_part->is_html = TRUE;
+ text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
+ memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content);
+ task->text_parts = g_list_prepend (task->text_parts, text_part);
}
else if (g_mime_content_type_is_type (type, "text", "plain")) {
- url_parse_text (task, part_content);
msg_debug ("mime_foreach_callback: got urls from text/plain part");
+ url_parse_text (task, part_content);
+
+ text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
+ text_part->content = part_content;
+ text_part->is_html = FALSE;
+ text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
+ task->text_parts = g_list_prepend (task->text_parts, text_part);
}
}
else {
diff --git a/src/message.h b/src/message.h
index 1122e7e3b..eaf9f493e 100644
--- a/src/message.h
+++ b/src/message.h
@@ -7,11 +7,17 @@
#define RSPAMD_MESSAGE_H
#include "config.h"
+#include "fuzzy.h"
struct mime_part {
GMimeContentType *type;
GByteArray *content;
- TAILQ_ENTRY (mime_part) next;
+};
+
+struct mime_text_part {
+ gboolean is_html;
+ GByteArray *content;
+ fuzzy_hash_t *fuzzy;
};
/**