result->regexp = g_regex_new (begin, regexp_flags, 0, &err);
result->regexp_text = memory_pool_strdup (pool, begin);
memory_pool_add_destructor (pool, (pool_destruct_func)g_regex_unref, (void *)result->regexp);
- *end = '/';
if (result->regexp == NULL || err != NULL) {
+ *end = '/';
msg_warn ("parse_regexp: could not read regexp: %s while reading regexp %s", err->message, src);
return NULL;
}
+ result->raw_regexp = g_regex_new (begin, regexp_flags | G_REGEX_RAW, 0, &err);
+ memory_pool_add_destructor (pool, (pool_destruct_func)g_regex_unref, (void *)result->raw_regexp);
+ *end = '/';
+
+ if (result->raw_regexp == NULL || err != NULL) {
+ msg_warn ("parse_regexp: could not read raw regexp: %s while reading regexp %s", err->message, src);
+ return NULL;
+ }
/* Add to cache for further usage */
re_cache_add (result->regexp_text, result);
}
/*
 * Pick the byte array that will be stored as a text part's `orig` data,
 * converting it to UTF-8 when a conversion is needed and succeeds.
 *
 * task         - worker task; its task_pool receives the destructor for the
 *                converted buffer
 * part_content - original bytes of the part; returned as-is on every
 *                fallback path
 * type         - MIME content type the "charset" parameter is read from
 * text_part    - part descriptor; its is_raw flag is written on every
 *                return path visible here
 *
 * Returns part_content itself (is_raw = TRUE) when there is no charset
 * parameter, when the charset is utf-8/utf8, or when conversion fails;
 * otherwise returns result_array wrapping the converted string
 * (is_raw = FALSE).
 */
static GByteArray *
-convert_text_to_utf (struct worker_task *task, GByteArray *part_content, GMimeContentType *type)
+convert_text_to_utf (struct worker_task *task, GByteArray *part_content, GMimeContentType *type, struct mime_text_part *text_part)
{
GError *err = NULL;
gsize read_bytes, write_bytes;
GByteArray *result_array;
/* No charset parameter on the content type: the new code no longer assumes
 * "ASCII" but returns the original bytes and marks the part as raw. */
if ((charset = g_mime_content_type_get_parameter (type, "charset")) == NULL) {
- charset = "ASCII";
+ text_part->is_raw = TRUE;
+ return part_content;
}
/* Charset already names UTF-8 (case-insensitive "utf-8" or "utf8"): no
 * conversion is attempted and the original bytes are returned, marked raw. */
if (g_ascii_strcasecmp (charset, "utf-8") == 0 || g_ascii_strcasecmp (charset, "utf8") == 0) {
+ text_part->is_raw = TRUE;
return part_content;
}
/* Tail of the conversion call; the start of the statement is not visible in
 * this excerpt. */
&read_bytes, &write_bytes, &err);
/* Conversion failed: log the reason (or "unknown problem" when err is unset)
 * and fall back to the original bytes, marked raw. */
if (res_str == NULL) {
msg_warn ("convert_text_to_utf: cannot convert from %s to utf8: %s", charset, err ? err->message : "unknown problem");
+ text_part->is_raw = TRUE;
return part_content;
}
result_array->data = res_str;
/* NOTE(review): len is write_bytes + 1 — presumably to cover a trailing NUL;
 * confirm, since callers pass orig->len to the regexp matcher. */
result_array->len = write_bytes + 1;
/* Register g_free for res_str with the task pool. */
memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, res_str);
+ text_part->is_raw = FALSE;
return result_array;
}
url_parse_html (task, part_content);
text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
- text_part->orig = convert_text_to_utf (task, part_content, type);
+ text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
text_part->content = strip_html_tags (part_content, NULL);
text_part->is_html = TRUE;
text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
url_parse_text (task, part_content);
text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
- text_part->orig = convert_text_to_utf (task, part_content, type);
+ text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
text_part->content = part_content;
text_part->is_html = FALSE;
text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
char *headerv, *c, t;
struct mime_text_part *part;
GList *cur, *headerlist;
+ GRegex *regexp;
struct uri *url;
int r;
cur = g_list_first (task->text_parts);
while (cur) {
part = (struct mime_text_part *)cur->data;
- if (g_regex_match_full (re->regexp, part->orig->data, part->orig->len, 0, 0, NULL, NULL) == TRUE) {
+ if (part->is_raw) {
+ regexp = re->raw_regexp;
+ }
+ else {
+ regexp = re->regexp;
+ }
+ if (g_regex_match_full (regexp, part->orig->data, part->orig->len, 0, 0, NULL, NULL) == TRUE) {
task_cache_add (task, re, 1);
return 1;
}