#define RECURSION_LIMIT 30
#define UTF8_CHARSET "UTF-8"
+#define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_PART_FLAG_UTF) /* mark part as raw: clears the UTF bit, leaves the other flag bits untouched */
+#define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_PART_FLAG_UTF) /* mark part as UTF: sets the UTF bit, leaves the other flag bits untouched */
+
GByteArray *
strip_html_tags (struct rspamd_task *task,
rspamd_mempool_t * pool,
/* Check tag balancing */
if (level_ptr && level_ptr->data != NULL) {
- part->is_balanced = FALSE;
+ part->flags &= ~RSPAMD_MIME_PART_FLAG_BALANCED;
+ }
+ else {
+ part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED;
}
if (stateptr) {
return res;
}
+
static GByteArray *
convert_text_to_utf (struct rspamd_task *task,
GByteArray * part_content,
GByteArray *result_array;
if (task->cfg->raw_mode) {
- text_part->is_raw = TRUE;
+ SET_PART_RAW (text_part);
return part_content;
}
if ((charset =
g_mime_content_type_get_parameter (type, "charset")) == NULL) {
- text_part->is_raw = TRUE;
+ SET_PART_RAW (text_part);
return part_content;
}
if (!charset_validate (task->task_pool, charset, &ocharset)) {
msg_info (
"<%s>: has invalid charset",
task->message_id);
- text_part->is_raw = TRUE;
+ SET_PART_RAW (text_part);
return part_content;
}
if (g_ascii_strcasecmp (ocharset,
"utf-8") == 0 || g_ascii_strcasecmp (ocharset, "utf8") == 0) {
if (g_utf8_validate (part_content->data, part_content->len, NULL)) {
- text_part->is_raw = FALSE;
- text_part->is_utf = TRUE;
+ SET_PART_UTF (text_part);
return part_content;
}
else {
msg_info (
"<%s>: contains invalid utf8 characters, assume it as raw",
task->message_id);
- text_part->is_raw = TRUE;
+ SET_PART_RAW (text_part);
return part_content;
}
}
task->message_id,
ocharset,
err ? err->message : "unknown problem");
- text_part->is_raw = TRUE;
- text_part->is_utf = FALSE;
+ SET_PART_RAW (text_part);
g_error_free (err);
return part_content;
}
result_array = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
result_array->data = res_str;
result_array->len = write_bytes;
- text_part->is_raw = FALSE;
- text_part->is_utf = TRUE;
+ SET_PART_UTF (text_part);
return result_array;
}
const int max_chars = 32;
if (part != NULL) {
- if (part->is_utf) {
+ if (IS_PART_UTF (part)) {
/* Try to detect encoding by several symbols */
const gchar *p, *pp;
gunichar c;
guint i, nlen;
GArray *tmp;
- if (part->language && part->language[0] != '\0' && part->is_utf) {
+ if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) {
stem = sb_stemmer_new (part->language, "UTF_8");
if (stem == NULL) {
msg_info ("<%s> cannot create lemmatizer for %s language",
/* Ugly workaround */
tmp = rspamd_tokenize_text (part->content->data,
- part->content->len, part->is_utf, task->cfg->min_word_len,
+ part->content->len, IS_PART_UTF (part), task->cfg->min_word_len,
part->urls_offset, FALSE);
if (tmp) {
w->len = nlen;
}
else {
- if (part->is_utf) {
+ if (IS_PART_UTF (part)) {
rspamd_str_lc_utf8 (w->begin, w->len);
}
else {
text_part =
rspamd_mempool_alloc0 (task->task_pool,
sizeof (struct mime_text_part));
- text_part->is_html = TRUE;
+ text_part->flags |= RSPAMD_MIME_PART_FLAG_HTML;
if (is_empty) {
- text_part->is_empty = TRUE;
+ text_part->flags |= RSPAMD_MIME_PART_FLAG_EMPTY;
text_part->orig = NULL;
text_part->content = NULL;
task->text_parts = g_list_prepend (task->text_parts, text_part);
text_part->orig,
type,
text_part);
- text_part->is_balanced = TRUE;
text_part->html_nodes = NULL;
text_part->parent = parent;
+ text_part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED;
text_part->content = strip_html_tags (task,
task->task_pool,
text_part,
text_part =
rspamd_mempool_alloc0 (task->task_pool,
sizeof (struct mime_text_part));
- text_part->is_html = FALSE;
text_part->parent = parent;
if (is_empty) {
- text_part->is_empty = TRUE;
+ text_part->flags |= RSPAMD_MIME_PART_FLAG_EMPTY;
text_part->orig = NULL;
text_part->content = NULL;
task->text_parts = g_list_prepend (task->text_parts, text_part);
/* Post process part */
detect_text_language (text_part);
text_part->words = rspamd_tokenize_text (text_part->content->data,
- text_part->content->len, text_part->is_utf, task->cfg->min_word_len,
+ text_part->content->len, IS_PART_UTF (text_part), task->cfg->min_word_len,
text_part->urls_offset, TRUE);
rspamd_normalize_text_part (task, text_part);
}
const gchar *filename;
};
+#define RSPAMD_MIME_PART_FLAG_UTF (1 << 0) /* bit in mime_text_part.flags; takes over from the removed is_utf field, absence means "raw" */
+#define RSPAMD_MIME_PART_FLAG_BALANCED (1 << 1) /* bit in mime_text_part.flags; takes over from the removed is_balanced field */
+#define RSPAMD_MIME_PART_FLAG_EMPTY (1 << 2) /* bit in mime_text_part.flags; takes over from the removed is_empty field */
+#define RSPAMD_MIME_PART_FLAG_HTML (1 << 3) /* bit in mime_text_part.flags; takes over from the removed is_html field */
+
+#define IS_PART_EMPTY(part) ((part)->flags & RSPAMD_MIME_PART_FLAG_EMPTY) /* non-zero (the masked bit, not necessarily 1) when the EMPTY flag is set */
+#define IS_PART_UTF(part) ((part)->flags & RSPAMD_MIME_PART_FLAG_UTF) /* non-zero when the UTF flag is set */
+#define IS_PART_RAW(part) (!((part)->flags & RSPAMD_MIME_PART_FLAG_UTF)) /* exact negation of IS_PART_UTF; yields 0 or 1 */
+#define IS_PART_HTML(part) ((part)->flags & RSPAMD_MIME_PART_FLAG_HTML) /* non-zero (the masked bit, not necessarily 1) when the HTML flag is set */
+
struct mime_text_part {
- gboolean is_html;
- gboolean is_raw;
- gboolean is_balanced;
- gboolean is_empty;
- gboolean is_utf;
+ guint flags;
GUnicodeScript script;
const gchar *lang_code;
const gchar *language;
while (cur) {
part = (struct mime_text_part *)cur->data;
/* Skip empty parts */
- if (part->is_empty) {
+ if (IS_PART_EMPTY (part)) {
cur = g_list_next (cur);
continue;
}
/* Check raw flags */
- if (part->is_raw) {
+ if (!IS_PART_UTF (part)) {
raw = TRUE;
}
/* Select data for regexp */
NULL);
return FALSE;
}
- if (!p1->is_empty && !p2->is_empty) {
+ if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2)) {
if (p1->diff_str != NULL && p2->diff_str != NULL) {
diff = rspamd_diff_distance_normalized (p1->diff_str,
p2->diff_str);
}
}
}
- else if ((p1->is_empty &&
- !p2->is_empty) || (!p1->is_empty && p2->is_empty)) {
+ else if ((IS_PART_EMPTY (p1) &&
+ !IS_PART_EMPTY (p2)) || (!IS_PART_EMPTY (p1)&& IS_PART_EMPTY (p2))) {
/* Empty and non empty parts are different */
*pdiff = 0;
rspamd_mempool_set_variable (task->task_pool,
cur = g_list_first (task->text_parts);
while (cur) {
p = cur->data;
- if (p->is_html) {
+ if (IS_PART_HTML (p)) {
res = TRUE;
}
else {
cur = g_list_first (task->text_parts);
while (cur) {
p = cur->data;
- if (!p->is_empty && p->is_html) {
- if (p->is_balanced) {
+ if (!IS_PART_EMPTY (p) && IS_PART_HTML (p)) {
+ if (p->flags & RSPAMD_MIME_PART_FLAG_BALANCED) {
res = TRUE;
}
else {
while (cur && res == FALSE) {
p = cur->data;
- if (!p->is_empty && p->is_html && p->html_nodes) {
+ if (!IS_PART_EMPTY (p) && IS_PART_HTML (p) && p->html_nodes) {
g_node_traverse (p->html_nodes,
G_PRE_ORDER,
G_TRAVERSE_ALL,
while (cur && res == FALSE) {
p = cur->data;
- if (!p->is_empty && p->is_html && p->html_nodes == NULL) {
+ if (!IS_PART_EMPTY (p) && IS_PART_HTML (p) && p->html_nodes == NULL) {
res = TRUE;
}
cur = g_list_next (cur);
if (!check_balance (new, cur_level)) {
debug_task (
"mark part as unbalanced as it has not pairable closing tags");
- part->is_balanced = FALSE;
+ part->flags &= ~RSPAMD_MIME_PART_FLAG_BALANCED;
}
}
else if ((data->flags & (FL_XML|FL_SGML)) == 0) {
while (cur != NULL) {
part = (struct mime_text_part *)cur->data;
- if (!part->is_empty && part->words != NULL) {
+ if (!IS_PART_EMPTY (part) && part->words != NULL) {
if (compat) {
tok->tokenizer->tokenize_func (cf, task->task_pool,
- part->words, tok->tokens, part->is_utf);
+ part->words, tok->tokens, IS_PART_UTF (part));
}
else {
tok->tokenizer->tokenize_func (cf, task->task_pool,
- part->normalized_words, tok->tokens, part->is_utf);
+ part->normalized_words, tok->tokens, IS_PART_UTF (part));
}
}
bzero (&rs, sizeof (rs));
end = c + len;
- if (part->is_utf) {
+ if (IS_PART_UTF (part)) {
while (c < end) {
if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
c += cur_ex->len + 1;
begin = (gchar *)part->content->data;
c = begin;
end = c + len;
- if (part->is_utf) {
+ if (IS_PART_UTF (part)) {
while (c < end) {
if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
{
struct mime_text_part *part = lua_check_textpart (L);
- if (part == NULL || part->is_empty) {
+ if (part == NULL || IS_PART_EMPTY (part)) {
lua_pushboolean (L, FALSE);
return 1;
}
- lua_pushboolean (L, part->is_utf);
+ lua_pushboolean (L, IS_PART_UTF (part));
return 1;
}
struct mime_text_part *part = lua_check_textpart (L);
struct rspamd_lua_text *t;
- if (part == NULL || part->is_empty) {
+ if (part == NULL || IS_PART_EMPTY (part)) {
lua_pushnil (L);
return 1;
}
return 1;
}
- if (part->is_empty) {
+ if (IS_PART_EMPTY (part)) {
lua_pushnumber (L, 0);
}
else {
return 1;
}
- lua_pushboolean (L, part->is_empty);
+ lua_pushboolean (L, IS_PART_EMPTY (part));
return 1;
}
return 1;
}
- lua_pushboolean (L, part->is_html);
+ lua_pushboolean (L, IS_PART_HTML (part));
return 1;
}
struct mime_text_part *part = lua_check_textpart (L);
gchar *out;
- if (part == NULL || part->is_empty) {
+ if (part == NULL || IS_PART_EMPTY (part)) {
lua_pushnil (L);
return 1;
}
}
else {
- if (!part->is_empty && !other->is_empty) {
+ if (!IS_PART_EMPTY (part) && !IS_PART_EMPTY (other)) {
if (part->diff_str != NULL && other->diff_str != NULL) {
diff = rspamd_diff_distance (part->diff_str,
other->diff_str);
diff = rspamd_fuzzy_compare_parts (part, other);
}
}
- else if ((part->is_empty &&
- !other->is_empty) || (!part->is_empty && other->is_empty)) {
+ else if ((IS_PART_EMPTY (part) &&
+ !IS_PART_EMPTY (other)) || (!IS_PART_EMPTY (part) &&
+ IS_PART_EMPTY (other))) {
/* Empty and non empty parts are different */
diff = 0;
}
while (cur) {
part = cur->data;
- if (!part->is_empty && part->content != NULL) {
+ if (!IS_PART_EMPTY (part) && part->content != NULL) {
text = part->content->data;
len = part->content->len;
p = part->content->data;
- if (part->is_raw || raw_mode) {
+ /* The loop below inspects single bytes (ASCII letter next to a high-bit byte),
+  * so it must run for raw parts, exactly as the removed is_raw test did.
+  * Testing IS_PART_UTF here would invert the original condition. */
+ if (IS_PART_RAW (part) || raw_mode) {
while (remain > 1) {
if ((g_ascii_isalpha (*p) &&
(*(p + 1) & 0x80)) ||
cur = g_list_first (task->text_parts);
while (cur) {
part = cur->data;
- if (!part->is_empty && check_part (part, task->cfg->raw_mode)) {
+ if (!IS_PART_EMPTY (part) && check_part (part, task->cfg->raw_mode)) {
rspamd_task_insert_result (task, chartable_module_ctx->symbol, 1, NULL);
}
cur = g_list_next (cur);
{
GArray *res;
- if (!part->is_utf || !part->language || part->language[0] == '\0' ||
+ if (!IS_PART_UTF (part) || !part->language || part->language[0] == '\0' ||
part->normalized_words == NULL) {
res = part->words;
}
while (cur) {
part = cur->data;
- if (part->is_empty) {
+ if (IS_PART_EMPTY (part)) {
cur = g_list_next (cur);
continue;
}