summary | refs | log | tree | commit | diff | stats
path: root/src/libserver/html.c
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-16 13:01:09 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-16 13:01:09 +0100
commit e5345b46dda5a1fc93ed34fce7bde76a3768320f (patch)
tree 899c012e69c5e415440f43709859668450607e51 /src/libserver/html.c
parent 86069f37686e5709411f1b9e97c0f664dc8a833e (diff)
download rspamd-e5345b46dda5a1fc93ed34fce7bde76a3768320f.tar.gz
rspamd-e5345b46dda5a1fc93ed34fce7bde76a3768320f.zip
Fix content saving in html parser.
Diffstat (limited to 'src/libserver/html.c')
-rw-r--r-- src/libserver/html.c 47
1 file changed, 43 insertions, 4 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 5bf042153..b979b3f8c 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1391,6 +1391,7 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
xml_tag_end,
content_ignore,
content_write,
+ content_ignore_sp
} state = parse_start;
g_assert (in != NULL);
@@ -1558,11 +1559,26 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
case content_write:
if (t != '<') {
- p ++;
-
if (t == '&') {
need_decode = TRUE;
}
+ else if (g_ascii_isspace (t)) {
+
+ if (c != p) {
+ if (need_decode) {
+ len = rspamd_html_decode_entitles_inplace ((gchar *)c,
+ p - c);
+ }
+ else {
+ len = p - c;
+ }
+
+ g_byte_array_append (dest, c, len);
+ }
+
+ c = p;
+ state = content_ignore_sp;
+ }
}
else {
if (c != p) {
@@ -1579,7 +1595,28 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
}
state = tag_begin;
+ continue;
}
+
+ p ++;
+ break;
+
+ case content_ignore_sp:
+ if (!g_ascii_isspace (t)) {
+ c = p;
+ state = content_write;
+
+ if (t != '<') {
+ /* Append one space if needed */
+ if (dest->len > 0 &&
+ !g_ascii_isspace (dest->data[dest->len - 1])) {
+ g_byte_array_append (dest, " ", 1);
+ }
+ }
+ continue;
+ }
+
+ p ++;
break;
case sgml_content:
@@ -1627,9 +1664,11 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
}
}
else {
- /* Do not save content of SGML/XML tags */
- state = content_ignore;
+ state = content_write;
}
+
+ p++;
+ c = p;
cur_tag = NULL;
break;
}