summaryrefslogtreecommitdiffstats
path: root/src/libmime
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2016-07-13 17:03:27 +0100
committerVsevolod Stakhov <vsevolod@highsecure.ru>2016-07-13 17:07:23 +0100
commit70cbb6d39a06eb6f71832517bfd788ad217b6965 (patch)
tree5e0e41033565b271021072aa5c2455f0e79a91a7 /src/libmime
parentd2af2a1d52a8f9b26b7c77b12ce555db24f07df4 (diff)
downloadrspamd-70cbb6d39a06eb6f71832517bfd788ad217b6965.tar.gz
rspamd-70cbb6d39a06eb6f71832517bfd788ad217b6965.zip
[Rework] Rework exceptions and newlines processing
Diffstat (limited to 'src/libmime')
-rw-r--r--src/libmime/message.c158
-rw-r--r--src/libmime/message.h2
2 files changed, 94 insertions, 66 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index f6c023294..4605d1c69 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -658,15 +658,15 @@ detect_text_language (struct rspamd_mime_text_part *part)
}
static void
-rspamd_normalize_text_part (struct rspamd_task *task,
+rspamd_extract_words (struct rspamd_task *task,
struct rspamd_mime_text_part *part)
{
#ifdef WITH_SNOWBALL
struct sb_stemmer *stem = NULL;
#endif
rspamd_ftok_t *w;
- const guchar *r, *p, *c, *end;
gchar *temp_word;
+ const guchar *r;
guint i, nlen;
#ifdef WITH_SNOWBALL
@@ -674,71 +674,14 @@ rspamd_normalize_text_part (struct rspamd_task *task,
stem = sb_stemmer_new (part->language, "UTF_8");
if (stem == NULL) {
msg_info_task ("<%s> cannot create lemmatizer for %s language",
- task->message_id, part->language);
+ task->message_id, part->language);
}
}
#endif
- /* Strip newlines */
- part->stripped_content = g_byte_array_sized_new (part->content->len);
- part->newlines = g_ptr_array_sized_new (128);
- p = part->content->data;
- c = p;
- end = p + part->content->len;
-
- while (p < end) {
- p = memchr (c, '\n', end - c);
-
- if (p) {
- if (*(p - 1) == '\r') {
- p --;
- }
-
- if (p > c) {
- g_byte_array_append (part->stripped_content, c, p - c);
- }
-
- /* As it could cause reallocation, we initially store offsets */
- g_ptr_array_add (part->newlines,
- GUINT_TO_POINTER (part->stripped_content->len));
- part->nlines ++;
- p ++;
-
- while (p < end && (*p == '\r' || *p == '\n')) {
- if (*p == '\n') {
- part->nlines ++;
- }
-
- p ++;
- }
- c = p;
- }
- else {
- p = end;
- break;
- }
- }
-
- if (p > c) {
- g_byte_array_append (part->stripped_content, c, p - c);
- }
-
- /* Now convert offsets to real pointers for convenience */
- for (i = 0; i < part->newlines->len; i ++) {
- guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
- g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
- }
-
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) free_byte_array_callback,
- part->stripped_content);
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
- part->newlines);
-
/* Ugly workaround */
part->normalized_words = rspamd_tokenize_text (part->content->data,
part->content->len, IS_PART_UTF (part), task->cfg,
- part->urls_offset, FALSE,
+ part->exceptions, FALSE,
NULL);
if (part->normalized_words) {
@@ -798,6 +741,78 @@ rspamd_normalize_text_part (struct rspamd_task *task,
#endif
}
+static void
+rspamd_normalize_text_part (struct rspamd_task *task,
+ struct rspamd_mime_text_part *part)
+{
+
+ const guchar *p, *c, *end;
+ guint i;
+ struct rspamd_process_exception *ex;
+
+ /* Strip newlines */
+ part->stripped_content = g_byte_array_sized_new (part->content->len);
+ part->newlines = g_ptr_array_sized_new (128);
+ p = part->content->data;
+ c = p;
+ end = p + part->content->len;
+
+ while (p < end) {
+ p = memchr (c, '\n', end - c);
+
+ if (p) {
+ if (*(p - 1) == '\r') {
+ p --;
+ }
+
+ if (p > c) {
+ g_byte_array_append (part->stripped_content, c, p - c);
+ }
+
+ /* As it could cause reallocation, we initially store offsets */
+ g_ptr_array_add (part->newlines,
+ GUINT_TO_POINTER (part->stripped_content->len));
+ ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
+ ex->pos = part->stripped_content->len;
+ ex->len = 0;
+ ex->type = RSPAMD_EXCEPTION_NEWLINE;
+ part->exceptions = g_list_prepend (part->exceptions, ex);
+ part->nlines ++;
+ p ++;
+
+ while (p < end && (*p == '\r' || *p == '\n')) {
+ if (*p == '\n') {
+ part->nlines ++;
+ }
+
+ p ++;
+ }
+ c = p;
+ }
+ else {
+ p = end;
+ break;
+ }
+ }
+
+ if (p > c) {
+ g_byte_array_append (part->stripped_content, c, p - c);
+ }
+
+ /* Now convert offsets to real pointers for convenience */
+ for (i = 0; i < part->newlines->len; i ++) {
+ guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
+ g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
+ }
+
+ rspamd_mempool_add_destructor (task->task_pool,
+ (rspamd_mempool_destruct_t) free_byte_array_callback,
+ part->stripped_content);
+ rspamd_mempool_add_destructor (task->task_pool,
+ (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
+ part->newlines);
+}
+
#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
static guint
@@ -872,6 +887,14 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
return FALSE;
}
+static gint
+exceptions_compare_func (gconstpointer a, gconstpointer b)
+{
+ const struct rspamd_process_exception *ea = a, *eb = b;
+
+ return ea->pos - eb->pos;
+}
+
static void
process_text_part (struct rspamd_task *task,
GByteArray *part_content,
@@ -932,7 +955,7 @@ process_text_part (struct rspamd_task *task,
task->task_pool,
text_part->html,
part_content,
- &text_part->urls_offset,
+ &text_part->exceptions,
task->urls,
task->emails);
@@ -941,10 +964,10 @@ process_text_part (struct rspamd_task *task,
}
/* Handle offsets of this part */
- if (text_part->urls_offset != NULL) {
- text_part->urls_offset = g_list_reverse (text_part->urls_offset);
+ if (text_part->exceptions != NULL) {
+ text_part->exceptions = g_list_reverse (text_part->exceptions);
rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) g_list_free, text_part->urls_offset);
+ (rspamd_mempool_destruct_t) g_list_free, text_part->exceptions);
}
rspamd_mempool_add_destructor (task->task_pool,
@@ -1006,6 +1029,11 @@ process_text_part (struct rspamd_task *task,
if (!IS_PART_HTML (text_part)) {
rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
}
+
+ text_part->exceptions = g_list_sort (text_part->exceptions,
+ exceptions_compare_func);
+
+ rspamd_extract_words (task, text_part);
}
struct mime_foreach_data {
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 0d2ae74b4..3fe26e685 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -57,7 +57,7 @@ struct rspamd_mime_text_part {
GByteArray *stripped_content; /**< no newlines or html tags */
GPtrArray *newlines; /**< positions of newlines in text */
struct html_content *html;
- GList *urls_offset; /**< list of offsets of urls */
+ GList *exceptions; /**< list of offsets of urls */
GMimeObject *parent;
struct rspamd_mime_part *mime_part;
GArray *normalized_words;