summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-07-13 17:03:27 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-07-13 17:07:23 +0100
commit 70cbb6d39a06eb6f71832517bfd788ad217b6965 (patch)
tree 5e0e41033565b271021072aa5c2455f0e79a91a7
parent d2af2a1d52a8f9b26b7c77b12ce555db24f07df4 (diff)
download rspamd-70cbb6d39a06eb6f71832517bfd788ad217b6965.tar.gz
rspamd-70cbb6d39a06eb6f71832517bfd788ad217b6965.zip
[Rework] Rework exceptions and newlines processing
-rw-r--r-- src/libmime/message.c 158
-rw-r--r-- src/libmime/message.h 2
-rw-r--r-- src/libserver/html.c 3
-rw-r--r-- src/libserver/url.c 15
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 22
-rw-r--r-- src/lua/lua_util.c 2
-rw-r--r-- src/rspamd.h 11
7 files changed, 126 insertions, 87 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index f6c023294..4605d1c69 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -658,15 +658,15 @@ detect_text_language (struct rspamd_mime_text_part *part)
}
static void
-rspamd_normalize_text_part (struct rspamd_task *task,
+rspamd_extract_words (struct rspamd_task *task,
struct rspamd_mime_text_part *part)
{
#ifdef WITH_SNOWBALL
struct sb_stemmer *stem = NULL;
#endif
rspamd_ftok_t *w;
- const guchar *r, *p, *c, *end;
gchar *temp_word;
+ const guchar *r;
guint i, nlen;
#ifdef WITH_SNOWBALL
@@ -674,71 +674,14 @@ rspamd_normalize_text_part (struct rspamd_task *task,
stem = sb_stemmer_new (part->language, "UTF_8");
if (stem == NULL) {
msg_info_task ("<%s> cannot create lemmatizer for %s language",
- task->message_id, part->language);
+ task->message_id, part->language);
}
}
#endif
- /* Strip newlines */
- part->stripped_content = g_byte_array_sized_new (part->content->len);
- part->newlines = g_ptr_array_sized_new (128);
- p = part->content->data;
- c = p;
- end = p + part->content->len;
-
- while (p < end) {
- p = memchr (c, '\n', end - c);
-
- if (p) {
- if (*(p - 1) == '\r') {
- p --;
- }
-
- if (p > c) {
- g_byte_array_append (part->stripped_content, c, p - c);
- }
-
- /* As it could cause reallocation, we initially store offsets */
- g_ptr_array_add (part->newlines,
- GUINT_TO_POINTER (part->stripped_content->len));
- part->nlines ++;
- p ++;
-
- while (p < end && (*p == '\r' || *p == '\n')) {
- if (*p == '\n') {
- part->nlines ++;
- }
-
- p ++;
- }
- c = p;
- }
- else {
- p = end;
- break;
- }
- }
-
- if (p > c) {
- g_byte_array_append (part->stripped_content, c, p - c);
- }
-
- /* Now convert offsets to real pointers for convenience */
- for (i = 0; i < part->newlines->len; i ++) {
- guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
- g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
- }
-
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) free_byte_array_callback,
- part->stripped_content);
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
- part->newlines);
-
/* Ugly workaround */
part->normalized_words = rspamd_tokenize_text (part->content->data,
part->content->len, IS_PART_UTF (part), task->cfg,
- part->urls_offset, FALSE,
+ part->exceptions, FALSE,
NULL);
if (part->normalized_words) {
@@ -798,6 +741,78 @@ rspamd_normalize_text_part (struct rspamd_task *task,
#endif
}
+/**
+ * Build the newline-stripped copy of a text part.
+ *
+ * Copies part->content into part->stripped_content with every run of CR/LF
+ * bytes removed. For each run, the offset in the stripped buffer at which
+ * the run was removed is stored in part->newlines (turned into a pointer
+ * into the stripped buffer once copying is finished), and a zero-length
+ * RSPAMD_EXCEPTION_NEWLINE exception carrying the same offset is prepended
+ * to part->exceptions. part->nlines is incremented once per run and once
+ * more for every '\n' met while skipping the remainder of the run.
+ *
+ * Destructors for part->stripped_content and part->newlines are registered
+ * on task->task_pool; the exception records are allocated from that pool.
+ *
+ * @param task task whose memory pool is used for allocations and destructors
+ * @param part text part to process; part->content is read, the fields named
+ *             above are written
+ */
+static void
+rspamd_normalize_text_part (struct rspamd_task *task,
+		struct rspamd_mime_text_part *part)
+{
+	const guchar *p, *c, *end;
+	guint i;
+	struct rspamd_process_exception *ex;
+
+	/* Strip newlines */
+	part->stripped_content = g_byte_array_sized_new (part->content->len);
+	part->newlines = g_ptr_array_sized_new (128);
+	p = part->content->data;
+	c = p;
+	end = p + part->content->len;
+
+	while (p < end) {
+		p = memchr (c, '\n', end - c);
+
+		if (p) {
+			/*
+			 * Treat "\r\n" as a single terminator. The p > c guard avoids
+			 * reading the byte before the buffer when the content starts
+			 * with '\n'; on later iterations c always points at a byte that
+			 * is neither '\r' nor '\n', so p > c holds there anyway.
+			 */
+			if (p > c && *(p - 1) == '\r') {
+				p --;
+			}
+
+			if (p > c) {
+				g_byte_array_append (part->stripped_content, c, p - c);
+			}
+
+			/* As it could cause reallocation, we initially store offsets */
+			g_ptr_array_add (part->newlines,
+					GUINT_TO_POINTER (part->stripped_content->len));
+			ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
+			ex->pos = part->stripped_content->len;
+			ex->len = 0;
+			ex->type = RSPAMD_EXCEPTION_NEWLINE;
+			part->exceptions = g_list_prepend (part->exceptions, ex);
+			part->nlines ++;
+			p ++;
+
+			/*
+			 * Swallow the rest of the CR/LF run.
+			 * NOTE(review): after the "\r\n" step-back above, p ++ lands on
+			 * the '\n' itself, so a CRLF terminator bumps nlines twice while
+			 * a bare LF bumps it once -- confirm this is intended.
+			 */
+			while (p < end && (*p == '\r' || *p == '\n')) {
+				if (*p == '\n') {
+					part->nlines ++;
+				}
+
+				p ++;
+			}
+			c = p;
+		}
+		else {
+			p = end;
+			break;
+		}
+	}
+
+	/* Copy whatever follows the last newline run */
+	if (p > c) {
+		g_byte_array_append (part->stripped_content, c, p - c);
+	}
+
+	/* Now convert offsets to real pointers for convenience */
+	for (i = 0; i < part->newlines->len; i ++) {
+		guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
+		g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
+	}
+
+	rspamd_mempool_add_destructor (task->task_pool,
+			(rspamd_mempool_destruct_t) free_byte_array_callback,
+			part->stripped_content);
+	rspamd_mempool_add_destructor (task->task_pool,
+			(rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
+			part->newlines);
+}
+
#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
static guint
@@ -872,6 +887,14 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
return FALSE;
}
+/**
+ * Comparison callback ordering exceptions by ascending start position.
+ *
+ * @param a pointer to a struct rspamd_process_exception
+ * @param b pointer to a struct rspamd_process_exception
+ * @return negative, zero or positive if a starts before, at or after b
+ */
+static gint
+exceptions_compare_func (gconstpointer a, gconstpointer b)
+{
+	const struct rspamd_process_exception *ea = a, *eb = b;
+
+	/*
+	 * pos is a goffset (64 bit): compare rather than subtract, so the result
+	 * cannot be truncated or change sign when narrowed to gint.
+	 */
+	return (ea->pos > eb->pos) - (ea->pos < eb->pos);
+}
+
static void
process_text_part (struct rspamd_task *task,
GByteArray *part_content,
@@ -932,7 +955,7 @@ process_text_part (struct rspamd_task *task,
task->task_pool,
text_part->html,
part_content,
- &text_part->urls_offset,
+ &text_part->exceptions,
task->urls,
task->emails);
@@ -941,10 +964,10 @@ process_text_part (struct rspamd_task *task,
}
/* Handle offsets of this part */
- if (text_part->urls_offset != NULL) {
- text_part->urls_offset = g_list_reverse (text_part->urls_offset);
+ if (text_part->exceptions != NULL) {
+ text_part->exceptions = g_list_reverse (text_part->exceptions);
rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) g_list_free, text_part->urls_offset);
+ (rspamd_mempool_destruct_t) g_list_free, text_part->exceptions);
}
rspamd_mempool_add_destructor (task->task_pool,
@@ -1006,6 +1029,11 @@ process_text_part (struct rspamd_task *task,
if (!IS_PART_HTML (text_part)) {
rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
}
+
+ text_part->exceptions = g_list_sort (text_part->exceptions,
+ exceptions_compare_func);
+
+ rspamd_extract_words (task, text_part);
}
struct mime_foreach_data {
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 0d2ae74b4..3fe26e685 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -57,7 +57,7 @@ struct rspamd_mime_text_part {
GByteArray *stripped_content; /**< no newlines or html tags */
GPtrArray *newlines; /**< positions of newlines in text */
struct html_content *html;
- GList *urls_offset; /**< list of offsets of urls */
+ GList *exceptions; /**< list of processing exceptions (url and newline positions) */
GMimeObject *parent;
struct rspamd_mime_part *mime_part;
GArray *normalized_words;
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 0a25e488a..1188515c5 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1601,7 +1601,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
gint substate = 0, len, href_offset = -1;
struct html_tag *cur_tag = NULL;
struct rspamd_url *url = NULL, *turl;
- struct process_exception *ex;
+ struct rspamd_process_exception *ex;
enum {
parse_start = 0,
tag_begin,
@@ -1977,6 +1977,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
ex = rspamd_mempool_alloc (pool, sizeof (*ex));
ex->pos = href_offset;
ex->len = dest->len - href_offset;
+ ex->type = RSPAMD_EXCEPTION_URL;
*exceptions = g_list_prepend (*exceptions, ex);
}
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 823e32a43..1ccc91a27 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -2294,17 +2294,18 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
gsize end_offset, gpointer ud)
{
struct rspamd_url_mimepart_cbdata *cbd = ud;
- struct process_exception *ex;
+ struct rspamd_process_exception *ex;
struct rspamd_task *task;
gchar *url_str = NULL;
struct rspamd_url *query_url;
gint rc;
task = cbd->task;
- ex = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct process_exception));
+ ex = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_process_exception));
ex->pos = start_offset;
ex->len = end_offset - start_offset;
+ ex->type = RSPAMD_EXCEPTION_URL;
if (url->protocol == PROTOCOL_MAILTO) {
if (url->userlen > 0) {
@@ -2320,8 +2321,8 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
}
}
- cbd->part->urls_offset = g_list_prepend (
- cbd->part->urls_offset,
+ cbd->part->exceptions = g_list_prepend (
+ cbd->part->exceptions,
ex);
/* We also search the query for additional url inside */
@@ -2376,10 +2377,10 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
rspamd_url_text_part_callback, &mcbd);
/* Handle offsets of this part */
- if (part->urls_offset != NULL) {
- part->urls_offset = g_list_reverse (part->urls_offset);
+ if (part->exceptions != NULL) {
+ part->exceptions = g_list_reverse (part->exceptions);
rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) g_list_free, part->urls_offset);
+ (rspamd_mempool_destruct_t) g_list_free, part->exceptions);
}
}
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 4e0e4b75d..6eab11f98 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -75,7 +75,7 @@ rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf,
{
gsize remain, pos;
const gchar *p;
- struct process_exception *ex = NULL;
+ struct rspamd_process_exception *ex = NULL;
if (buf == NULL) {
return FALSE;
@@ -166,11 +166,12 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
GList **exceptions, gboolean is_utf, gsize *rl,
gboolean check_signature)
{
- gsize remain, pos, siglen = 0;
+ gsize remain, siglen = 0;
+ goffset pos;
const gchar *p, *next_p, *sig = NULL;
gunichar uc;
guint processed = 0;
- struct process_exception *ex = NULL;
+ struct rspamd_process_exception *ex = NULL;
enum {
skip_delimiters = 0,
feed_token,
@@ -214,10 +215,12 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
switch (state) {
case skip_delimiters:
- if (ex != NULL && p - buf->begin == (gint)ex->pos) {
- token->begin = "!!EX!!";
- token->len = sizeof ("!!EX!!") - 1;
- processed = token->len;
+ if (ex != NULL && p - buf->begin == ex->pos) {
+ if (ex->type == RSPAMD_EXCEPTION_URL) {
+ token->begin = "!!EX!!";
+ token->len = sizeof ("!!EX!!") - 1;
+ processed = token->len;
+ }
state = skip_exception;
continue;
}
@@ -270,12 +273,13 @@ set_token:
*rl = processed;
}
- if (token->len == 0) {
+ if (token->len == 0 && processed > 0) {
token->len = p - token->begin;
g_assert (token->len > 0);
- *cur = p;
}
+ *cur = p;
+
return TRUE;
}
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index ccbcec6e6..6ce4179f1 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -733,7 +733,7 @@ lua_util_tokenize_text (lua_State *L)
gsize len, pos, ex_len, i;
GList *exceptions = NULL, *cur;
struct rspamd_lua_text *t;
- struct process_exception *ex;
+ struct rspamd_process_exception *ex;
GArray *res;
rspamd_ftok_t *w;
gboolean compat = FALSE;
diff --git a/src/rspamd.h b/src/rspamd.h
index 5626337fc..ffebfe387 100644
--- a/src/rspamd.h
+++ b/src/rspamd.h
@@ -265,12 +265,17 @@ struct rspamd_main {
struct event_base *ev_base;
};
+enum rspamd_exception_type {
+ RSPAMD_EXCEPTION_NEWLINE = 0, /**< zero-length marker recorded where a newline run was stripped */
+ RSPAMD_EXCEPTION_URL, /**< span recorded for a url found in the text */
+};
/**
* Structure to point exception in text from processing
*/
-struct process_exception {
- gsize pos;
- gsize len;
+struct rspamd_process_exception {
+ goffset pos; /**< offset at which the excepted span starts */
+ guint len; /**< length of the span (0 for newline exceptions) */
+ enum rspamd_exception_type type; /**< kind of exception, see enum rspamd_exception_type */
};
/**