#include "html.h"
#include "images.h"
#include "utlist.h"
+#include "tokenizers/tokenizers.h"
+
#include <iconv.h>
#define RECURSION_LIMIT 30
{
struct mime_text_part *text_part;
const gchar *cd;
+ gchar *pos;
+ rspamd_fstring_t token, buf;
/* Skip attachments */
#ifndef GMIME24
if (g_mime_content_type_is_type (type, "text",
"html") || g_mime_content_type_is_type (type, "text", "xhtml")) {
- debug_task ("got urls from text/html part");
text_part =
rspamd_mempool_alloc0 (task->task_pool,
task->text_parts = g_list_prepend (task->text_parts, text_part);
}
else if (g_mime_content_type_is_type (type, "text", "*")) {
- debug_task ("got urls from text/plain part");
text_part =
rspamd_mempool_alloc0 (task->task_pool,
rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff);
task->text_parts = g_list_prepend (task->text_parts, text_part);
}
+ else {
+ return;
+ }
+
+ /* Post process part */
+ buf.begin = text_part->content->data;
+ buf.len = text_part->content->len;
+ buf.size = buf.len;
+
+ text_part->words = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
+ while ((pos = rspamd_tokenizer_get_word (&buf,
+ &token, &text_part->urls_offset)) != NULL) {
+ g_array_append_val (text_part->words, token);
+ }
}
#ifdef GMIME24
GByteArray *orig;
GByteArray *content;
GNode *html_nodes;
- GList *urls_offset; /**< list of offsets of urls */
+ GList *urls_offset; /**< list of offsets of urls */
rspamd_fuzzy_t *fuzzy;
rspamd_fuzzy_t *double_fuzzy;
GMimeObject *parent;
GUnicodeScript script;
rspamd_fstring_t *diff_str;
+ GArray *words;
};
struct received_header {
{
GList *part;
struct mime_part *p;
+ struct mime_text_part *tp;
if (task) {
debug_task ("free pointer %p", task);
g_list_free_1 (part);
}
if (task->text_parts) {
+ part = task->text_parts;
+ while (part) {
+ tp = (struct mime_text_part *)part->data;
+ if (tp->words) {
+ g_array_free (tp->words, TRUE);
+ }
+ part = g_list_next (part);
+ }
+
g_list_free (task->text_parts);
}
if (task->images) {
#include "tokenizers.h"
struct tokenizer tokenizers[] = {
- {"osb-text", osb_tokenize_text, get_next_word},
+ {"osb-text", osb_tokenize_text, rspamd_tokenizer_get_word},
};
const int primes[] = {
/* Get next word from specified rspamd_fstring_t buf */
gchar *
-get_next_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions)
+rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions)
{
gsize remain, pos;
guchar *p;
/* Get tokenizer structure by name or return NULL if this name is not found */
struct tokenizer * get_tokenizer (const char *name);
/* Get next word from specified rspamd_fstring_t buf */
-gchar * get_next_word (rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions);
+gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf,
+ rspamd_fstring_t *token, GList **exceptions);
/* OSB tokenize function */
int osb_tokenize_text (struct tokenizer *tokenizer,
rspamd_mempool_t *pool,