diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-16 13:59:24 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-16 13:59:24 +0100 |
commit | a0f8924ef0cf0351bacbdc06a2c26ff60ed85b66 (patch) | |
tree | b43526b4b539fa9f24959947a27fadb1c74f1648 | |
parent | e5345b46dda5a1fc93ed34fce7bde76a3768320f (diff) | |
download | rspamd-a0f8924ef0cf0351bacbdc06a2c26ff60ed85b66.tar.gz rspamd-a0f8924ef0cf0351bacbdc06a2c26ff60ed85b66.zip |
More fixes to html parsing.
-rw-r--r-- | src/libserver/html.c | 45 | ||||
-rw-r--r-- | test/lua/unit/html.lua | 25 |
2 files changed, 55 insertions, 15 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c index b979b3f8c..0f7f98758 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -638,8 +638,16 @@ tag_cmp (const void *m1, const void *m2) { const struct html_tag_def *p1 = m1; const struct html_tag_def *p2 = m2; + gsize l1, l2; - return g_ascii_strcasecmp (p1->name, p2->name); + l1 = strlen (p1->name); + l2 = strlen (p2->name); + + if (l1 == l2) { + return g_ascii_strcasecmp (p1->name, p2->name); + } + + return l1 - l2; } static gint @@ -647,8 +655,15 @@ tag_find (const void *skey, const void *elt) { const struct html_tag *tag = skey; const struct html_tag_def *d = elt; + gsize tlen; - return g_ascii_strncasecmp (tag->name.start, d->name, tag->name.len); + tlen = strlen (d->name); + + if (tlen == tag->name.len) { + return g_ascii_strncasecmp (tag->name.start, d->name, tag->name.len); + } + + return tag->name.len - tlen; } static gint @@ -1058,7 +1073,7 @@ rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc, return TRUE; } - return FALSE; + return TRUE; } static gboolean @@ -1371,12 +1386,12 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc, { const guchar *p, *c, *end, *tag_start = NULL, *savep = NULL; guchar t; - gboolean closing = FALSE, need_decode = FALSE; + gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE; GByteArray *dest; guint obrace = 0, ebrace = 0; GNode *cur_level = NULL; gint substate, len; - struct html_tag *cur_tag; + struct html_tag *cur_tag = NULL; enum { parse_start = 0, tag_begin, @@ -1558,11 +1573,13 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc, break; case content_write: + if (t != '<') { if (t == '&') { need_decode = TRUE; } else if (g_ascii_isspace (t)) { + save_space = TRUE; if (c != p) { if (need_decode) { @@ -1579,6 +1596,16 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc, c = p; state = content_ignore_sp; } + else { + if (save_space) { + /* Append one space if needed */ + if (dest->len > 0 && + !g_ascii_isspace (dest->data[dest->len - 1])) { + g_byte_array_append (dest, " ", 1); + } + save_space = FALSE; + } + } } else { if (c != p) { @@ -1605,14 +1632,6 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc, if (!g_ascii_isspace (t)) { c = p; state = content_write; - - if (t != '<') { - /* Append one space if needed */ - if (dest->len > 0 && - !g_ascii_isspace (dest->data[dest->len - 1])) { - g_byte_array_append (dest, " ", 1); - } - } continue; } diff --git a/test/lua/unit/html.lua b/test/lua/unit/html.lua index 5c58e209a..f9788c349 100644 --- a/test/lua/unit/html.lua +++ b/test/lua/unit/html.lua @@ -15,10 +15,31 @@ context("HTML processing", function() </head> <body> <!-- page content --> - Hello, world! + Hello, world! <b>test</b> + <p>data<> + </P> + <b>stuff</p>? </body> </html> - ]], 'Hello, world!'}, + ]], 'Hello, world! test data stuff?'}, + {[[ +<?xml version="1.0" encoding="iso-8859-1"?> + <!DOCTYPE html + PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> + <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> + <head> + <title> + Wikibooks + </title> + </head> + <body> + <p> + Hello, world! + + </p> + </body> + </html>]], 'Hello, world!'}, } for _,c in ipairs(cases) do |