summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-16 13:59:24 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-16 13:59:24 +0100
commit: a0f8924ef0cf0351bacbdc06a2c26ff60ed85b66 (patch)
tree: b43526b4b539fa9f24959947a27fadb1c74f1648
parent: e5345b46dda5a1fc93ed34fce7bde76a3768320f (diff)
download: rspamd-a0f8924ef0cf0351bacbdc06a2c26ff60ed85b66.tar.gz
download: rspamd-a0f8924ef0cf0351bacbdc06a2c26ff60ed85b66.zip
More fixes to html parsing.
-rw-r--r-- src/libserver/html.c 45
-rw-r--r-- test/lua/unit/html.lua 25
2 files changed, 55 insertions, 15 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c
index b979b3f8c..0f7f98758 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -638,8 +638,16 @@ tag_cmp (const void *m1, const void *m2)
{
const struct html_tag_def *p1 = m1;
const struct html_tag_def *p2 = m2;
+ gsize l1, l2;
- return g_ascii_strcasecmp (p1->name, p2->name);
+ l1 = strlen (p1->name);
+ l2 = strlen (p2->name);
+
+ if (l1 == l2) {
+ return g_ascii_strcasecmp (p1->name, p2->name);
+ }
+
+ return l1 - l2;
}
static gint
@@ -647,8 +655,15 @@ tag_find (const void *skey, const void *elt)
{
const struct html_tag *tag = skey;
const struct html_tag_def *d = elt;
+ gsize tlen;
- return g_ascii_strncasecmp (tag->name.start, d->name, tag->name.len);
+ tlen = strlen (d->name);
+
+ if (tlen == tag->name.len) {
+ return g_ascii_strncasecmp (tag->name.start, d->name, tag->name.len);
+ }
+
+ return tag->name.len - tlen;
}
static gint
@@ -1058,7 +1073,7 @@ rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc,
return TRUE;
}
- return FALSE;
+ return TRUE;
}
static gboolean
@@ -1371,12 +1386,12 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
{
const guchar *p, *c, *end, *tag_start = NULL, *savep = NULL;
guchar t;
- gboolean closing = FALSE, need_decode = FALSE;
+ gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE;
GByteArray *dest;
guint obrace = 0, ebrace = 0;
GNode *cur_level = NULL;
gint substate, len;
- struct html_tag *cur_tag;
+ struct html_tag *cur_tag = NULL;
enum {
parse_start = 0,
tag_begin,
@@ -1558,11 +1573,13 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
break;
case content_write:
+
if (t != '<') {
if (t == '&') {
need_decode = TRUE;
}
else if (g_ascii_isspace (t)) {
+ save_space = TRUE;
if (c != p) {
if (need_decode) {
@@ -1579,6 +1596,16 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
c = p;
state = content_ignore_sp;
}
+ else {
+ if (save_space) {
+ /* Append one space if needed */
+ if (dest->len > 0 &&
+ !g_ascii_isspace (dest->data[dest->len - 1])) {
+ g_byte_array_append (dest, " ", 1);
+ }
+ save_space = FALSE;
+ }
+ }
}
else {
if (c != p) {
@@ -1605,14 +1632,6 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
if (!g_ascii_isspace (t)) {
c = p;
state = content_write;
-
- if (t != '<') {
- /* Append one space if needed */
- if (dest->len > 0 &&
- !g_ascii_isspace (dest->data[dest->len - 1])) {
- g_byte_array_append (dest, " ", 1);
- }
- }
continue;
}
diff --git a/test/lua/unit/html.lua b/test/lua/unit/html.lua
index 5c58e209a..f9788c349 100644
--- a/test/lua/unit/html.lua
+++ b/test/lua/unit/html.lua
@@ -15,10 +15,31 @@ context("HTML processing", function()
</head>
<body>
<!-- page content -->
- Hello, world!
+ Hello, world! <b>test</b>
+ <p>data<>
+ </P>
+ <b>stuff</p>?
</body>
</html>
- ]], 'Hello, world!'},
+ ]], 'Hello, world! test data stuff?'},
+ {[[
+<?xml version="1.0" encoding="iso-8859-1"?>
+ <!DOCTYPE html
+ PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+ <title>
+ Wikibooks
+ </title>
+ </head>
+ <body>
+ <p>
+ Hello, world!
+
+ </p>
+ </body>
+ </html>]], 'Hello, world!'},
}
for _,c in ipairs(cases) do