}
static void
-rspamd_normalize_text_part (struct rspamd_task *task,
+rspamd_extract_words (struct rspamd_task *task,
struct rspamd_mime_text_part *part)
{
#ifdef WITH_SNOWBALL
struct sb_stemmer *stem = NULL;
#endif
rspamd_ftok_t *w;
- const guchar *r, *p, *c, *end;
gchar *temp_word;
+ const guchar *r;
guint i, nlen;
#ifdef WITH_SNOWBALL
stem = sb_stemmer_new (part->language, "UTF_8");
if (stem == NULL) {
msg_info_task ("<%s> cannot create lemmatizer for %s language",
- task->message_id, part->language);
+ task->message_id, part->language);
}
}
#endif
- /* Strip newlines */
- part->stripped_content = g_byte_array_sized_new (part->content->len);
- part->newlines = g_ptr_array_sized_new (128);
- p = part->content->data;
- c = p;
- end = p + part->content->len;
-
- while (p < end) {
- p = memchr (c, '\n', end - c);
-
- if (p) {
- if (*(p - 1) == '\r') {
- p --;
- }
-
- if (p > c) {
- g_byte_array_append (part->stripped_content, c, p - c);
- }
-
- /* As it could cause reallocation, we initially store offsets */
- g_ptr_array_add (part->newlines,
- GUINT_TO_POINTER (part->stripped_content->len));
- part->nlines ++;
- p ++;
-
- while (p < end && (*p == '\r' || *p == '\n')) {
- if (*p == '\n') {
- part->nlines ++;
- }
-
- p ++;
- }
- c = p;
- }
- else {
- p = end;
- break;
- }
- }
-
- if (p > c) {
- g_byte_array_append (part->stripped_content, c, p - c);
- }
-
- /* Now convert offsets to real pointers for convenience */
- for (i = 0; i < part->newlines->len; i ++) {
- guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
- g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
- }
-
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) free_byte_array_callback,
- part->stripped_content);
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
- part->newlines);
-
/* Ugly workaround */
part->normalized_words = rspamd_tokenize_text (part->content->data,
part->content->len, IS_PART_UTF (part), task->cfg,
- part->urls_offset, FALSE,
+ part->exceptions, FALSE,
NULL);
if (part->normalized_words) {
#endif
}
+static void
+rspamd_normalize_text_part (struct rspamd_task *task,
+ struct rspamd_mime_text_part *part)
+{
+
+ const guchar *p, *c, *end;
+ guint i;
+ struct rspamd_process_exception *ex;
+
+ /* Strip newlines */
+ part->stripped_content = g_byte_array_sized_new (part->content->len);
+ part->newlines = g_ptr_array_sized_new (128);
+ p = part->content->data;
+ c = p;
+ end = p + part->content->len;
+
+ while (p < end) {
+ p = memchr (c, '\n', end - c);
+
+ if (p) {
+ if (*(p - 1) == '\r') {
+ p --;
+ }
+
+ if (p > c) {
+ g_byte_array_append (part->stripped_content, c, p - c);
+ }
+
+ /* As it could cause reallocation, we initially store offsets */
+ g_ptr_array_add (part->newlines,
+ GUINT_TO_POINTER (part->stripped_content->len));
+ ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
+ ex->pos = part->stripped_content->len;
+ ex->len = 0;
+ ex->type = RSPAMD_EXCEPTION_NEWLINE;
+ part->exceptions = g_list_prepend (part->exceptions, ex);
+ part->nlines ++;
+ p ++;
+
+ while (p < end && (*p == '\r' || *p == '\n')) {
+ if (*p == '\n') {
+ part->nlines ++;
+ }
+
+ p ++;
+ }
+ c = p;
+ }
+ else {
+ p = end;
+ break;
+ }
+ }
+
+ if (p > c) {
+ g_byte_array_append (part->stripped_content, c, p - c);
+ }
+
+ /* Now convert offsets to real pointers for convenience */
+ for (i = 0; i < part->newlines->len; i ++) {
+ guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
+ g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
+ }
+
+ rspamd_mempool_add_destructor (task->task_pool,
+ (rspamd_mempool_destruct_t) free_byte_array_callback,
+ part->stripped_content);
+ rspamd_mempool_add_destructor (task->task_pool,
+ (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
+ part->newlines);
+}
+
#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
static guint
return FALSE;
}
+static gint
+exceptions_compare_func (gconstpointer a, gconstpointer b)
+{
+ const struct rspamd_process_exception *ea = a, *eb = b;
+
+ return ea->pos - eb->pos;
+}
+
static void
process_text_part (struct rspamd_task *task,
GByteArray *part_content,
task->task_pool,
text_part->html,
part_content,
- &text_part->urls_offset,
+ &text_part->exceptions,
task->urls,
task->emails);
}
/* Handle offsets of this part */
- if (text_part->urls_offset != NULL) {
- text_part->urls_offset = g_list_reverse (text_part->urls_offset);
+ if (text_part->exceptions != NULL) {
+ text_part->exceptions = g_list_reverse (text_part->exceptions);
rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) g_list_free, text_part->urls_offset);
+ (rspamd_mempool_destruct_t) g_list_free, text_part->exceptions);
}
rspamd_mempool_add_destructor (task->task_pool,
if (!IS_PART_HTML (text_part)) {
rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
}
+
+ text_part->exceptions = g_list_sort (text_part->exceptions,
+ exceptions_compare_func);
+
+ rspamd_extract_words (task, text_part);
}
struct mime_foreach_data {
gsize end_offset, gpointer ud)
{
struct rspamd_url_mimepart_cbdata *cbd = ud;
- struct process_exception *ex;
+ struct rspamd_process_exception *ex;
struct rspamd_task *task;
gchar *url_str = NULL;
struct rspamd_url *query_url;
gint rc;
task = cbd->task;
- ex = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct process_exception));
+ ex = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_process_exception));
ex->pos = start_offset;
ex->len = end_offset - start_offset;
+ ex->type = RSPAMD_EXCEPTION_URL;
if (url->protocol == PROTOCOL_MAILTO) {
if (url->userlen > 0) {
}
}
- cbd->part->urls_offset = g_list_prepend (
- cbd->part->urls_offset,
+ cbd->part->exceptions = g_list_prepend (
+ cbd->part->exceptions,
ex);
/* We also search the query for additional url inside */
rspamd_url_text_part_callback, &mcbd);
/* Handle offsets of this part */
- if (part->urls_offset != NULL) {
- part->urls_offset = g_list_reverse (part->urls_offset);
+ if (part->exceptions != NULL) {
+ part->exceptions = g_list_reverse (part->exceptions);
rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) g_list_free, part->urls_offset);
+ (rspamd_mempool_destruct_t) g_list_free, part->exceptions);
}
}
{
gsize remain, pos;
const gchar *p;
- struct process_exception *ex = NULL;
+ struct rspamd_process_exception *ex = NULL;
if (buf == NULL) {
return FALSE;
GList **exceptions, gboolean is_utf, gsize *rl,
gboolean check_signature)
{
- gsize remain, pos, siglen = 0;
+ gsize remain, siglen = 0;
+ goffset pos;
const gchar *p, *next_p, *sig = NULL;
gunichar uc;
guint processed = 0;
- struct process_exception *ex = NULL;
+ struct rspamd_process_exception *ex = NULL;
enum {
skip_delimiters = 0,
feed_token,
switch (state) {
case skip_delimiters:
- if (ex != NULL && p - buf->begin == (gint)ex->pos) {
- token->begin = "!!EX!!";
- token->len = sizeof ("!!EX!!") - 1;
- processed = token->len;
+ if (ex != NULL && p - buf->begin == ex->pos) {
+ if (ex->type == RSPAMD_EXCEPTION_URL) {
+ token->begin = "!!EX!!";
+ token->len = sizeof ("!!EX!!") - 1;
+ processed = token->len;
+ }
state = skip_exception;
continue;
}
*rl = processed;
}
- if (token->len == 0) {
+ if (token->len == 0 && processed > 0) {
token->len = p - token->begin;
g_assert (token->len > 0);
- *cur = p;
}
+ *cur = p;
+
return TRUE;
}