diff options
Diffstat (limited to 'test/lua/unit/html.lua')
-rw-r--r-- | test/lua/unit/html.lua | 414 |
1 files changed, 397 insertions, 17 deletions
diff --git a/test/lua/unit/html.lua b/test/lua/unit/html.lua index 81c52ec1b..1802dc984 100644 --- a/test/lua/unit/html.lua +++ b/test/lua/unit/html.lua @@ -1,11 +1,10 @@ context("HTML processing", function() local rspamd_util = require("rspamd_util") - local logger = require("rspamd_logger") local cases = { - -- Entities - {[[<html><body>.firebaseapp.com</body></html>]], - [[.firebaseapp.com]]}, - {[[ + -- Entities + { [[<html><body>.firebaseapp.com</body></html>]], + [[.firebaseapp.com]] }, + { [[ <?xml version="1.0" encoding="iso-8859-1"?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" @@ -22,8 +21,8 @@ context("HTML processing", function() </p> </body> - </html>]], 'Hello, world!\n'}, - {[[ + </html>]], 'Hello, world!\n' }, + { [[ <!DOCTYPE html> <html lang="en"> <head> @@ -39,8 +38,8 @@ context("HTML processing", function() Hello, world! </body> </html> - ]], 'Hello, world!'}, - {[[ + ]], 'Hello, world!' }, + { [[ <html lang="en"> <head> <meta charset="utf-8"> @@ -56,8 +55,8 @@ context("HTML processing", function() </div> </body> </html> - ]], 'Hello, world!\ntest\ncontentmore content\ncontent inside div\n'}, - {[[ + ]], 'Hello, world!\ntest\ncontentmore content\ncontent inside div\n' }, + { [[ <html lang="en"> <head> <meta charset="utf-8"> @@ -83,8 +82,8 @@ context("HTML processing", function() </body> </html> - ]], 'content\nheada headb\ndata1 data2\n'}, - {[[ + ]], 'content\nheada headb\ndata1 data2\n' }, + { [[ <html lang="en"> <head> <meta charset="utf-8"> @@ -97,17 +96,398 @@ context("HTML processing", function() a b a > b a < b a & b 'a "a" </body> </html> - ]], 'a b a > b a < b a & b \'a "a"'}, + ]], 'a b a > b a < b a & b \'a "a"' }, } - for i,c in ipairs(cases) do + for i, c in ipairs(cases) do test("Extract text from HTML " .. tostring(i), function() local t = rspamd_util.parse_html(c[1]) assert_not_nil(t) assert_equal(c[2], tostring(t), string.format("'%s' doesn't match with '%s'", - c[2], t)) - + c[2], t)) end) end + + -- Test cases for new HTML tag API methods + local function parse_html_and_extract_tags(html_content, pool) + local rspamd_parsers = require("rspamd_parsers") + + local parsed = rspamd_parsers.parse_html_content(html_content, pool) + local tags = {} + + if parsed then + parsed:foreach_tag("any", function(tag, content_length, is_leaf) + table.insert(tags, tag) + return false + end) + end + + return parsed, tags + end + + test("HTML tag get_all_attributes basic test", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[<div class="test-class" id="test-id" style="color: red;" width="100">content</div>]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + assert_true(#tags > 0) + + -- Find the div tag + local div_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "div" then + div_tag = tag + break + end + end + + assert_not_nil(div_tag) + + local attrs = div_tag:get_all_attributes() + assert_not_nil(attrs) + + -- Check that we have the expected attributes + assert_equal("test-class", attrs["class"]) + assert_equal("test-id", attrs["id"]) + assert_equal("color: red;", attrs["style"]) + assert_equal("100", attrs["width"]) + + pool:destroy() + end) + + test("HTML tag has_attribute test", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[<img src="test.jpg" width="100" height="50" alt="Test image" hidden />]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + local img_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "img" then + img_tag = tag + break + end + end + + assert_not_nil(img_tag) + + -- Test existing attributes + assert_true(img_tag:has_attribute("src")) + assert_true(img_tag:has_attribute("width")) + assert_true(img_tag:has_attribute("height")) + assert_true(img_tag:has_attribute("alt")) + assert_true(img_tag:has_attribute("hidden")) + + -- Test non-existing attributes + assert_false(img_tag:has_attribute("nonexistent")) + assert_false(img_tag:has_attribute("class")) + assert_false(img_tag:has_attribute("")) + + pool:destroy() + end) + + test("HTML tag get_numeric_attribute test", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[<div width="200" height="150" font-size="14" opacity="0.8" tabindex="5">content</div>]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + local div_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "div" then + div_tag = tag + break + end + end + + assert_not_nil(div_tag) + + -- Test numeric attributes + assert_equal(200, div_tag:get_numeric_attribute("width")) + assert_equal(150, div_tag:get_numeric_attribute("height")) + assert_equal(14, div_tag:get_numeric_attribute("font-size")) + + -- Test opacity with floating-point tolerance + local opacity = div_tag:get_numeric_attribute("opacity") + assert_not_nil(opacity) + assert_true(math.abs(opacity - 0.8) < 0.01, string.format("Expected opacity ~0.8, got %f", opacity)) + + assert_equal(5, div_tag:get_numeric_attribute("tabindex")) + + -- Test non-numeric attributes + assert_nil(div_tag:get_numeric_attribute("nonexistent")) + + pool:destroy() + end) + + test("HTML tag get_unknown_attributes test", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[<div class="known" data-track="analytics" unknown-attr="test-value" custom-id="12345">content</div>]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + local div_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "div" then + div_tag = tag + break + end + end + + assert_not_nil(div_tag) + + local unknown_attrs = div_tag:get_unknown_attributes() + assert_not_nil(unknown_attrs) + + -- Should include unknown attributes but not known ones like "class" + assert_not_nil(unknown_attrs["unknown-attr"]) + assert_equal("test-value", unknown_attrs["unknown-attr"]) + assert_not_nil(unknown_attrs["custom-id"]) + assert_equal("12345", unknown_attrs["custom-id"]) + + -- data-track should be recognized as a known attribute now + -- but if not, it would appear in unknown attributes + + pool:destroy() + end) + + test("HTML tag get_children test", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[ + <div id="parent"> + <p>First child</p> + <span>Second child</span> + <img src="test.jpg" /> + </div> + ]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + local parent_div = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "div" and tag:has_attribute("id") and tag:get_attribute("id") == "parent" then + parent_div = tag + break + end + end + + assert_not_nil(parent_div) + + local children = parent_div:get_children() + assert_not_nil(children) + assert_equal(3, #children) + + -- Check child types + local child_types = {} + for _, child in ipairs(children) do + table.insert(child_types, child:get_type()) + end + + -- Should contain p, span, and img + local child_types_str = table.concat(child_types, ",") + assert_true(child_types_str:find("p") ~= nil) + assert_true(child_types_str:find("span") ~= nil) + assert_true(child_types_str:find("img") ~= nil) + + pool:destroy() + end) + + test("HTML tag get_attribute vs get_all_attributes consistency", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[<a href="https://example.com" class="link" target="_blank" title="Example Link">Link</a>]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + local a_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "a" then + a_tag = tag + break + end + end + + assert_not_nil(a_tag) + + local all_attrs = a_tag:get_all_attributes() + + -- Test that individual get_attribute calls match get_all_attributes + for attr_name, attr_value in pairs(all_attrs) do + assert_equal(attr_value, a_tag:get_attribute(attr_name), + string.format("Attribute '%s' mismatch: get_attribute='%s', get_all_attributes='%s'", + attr_name, a_tag:get_attribute(attr_name) or "nil", attr_value)) + end + + -- Test specific expected attributes + assert_equal("https://example.com", a_tag:get_attribute("href")) + assert_equal("link", a_tag:get_attribute("class")) + assert_equal("_blank", a_tag:get_attribute("target")) + assert_equal("Example Link", a_tag:get_attribute("title")) + + pool:destroy() + end) + + + + test("HTML tag attribute edge cases", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[<div class="" hidden style=" " width="0" height="abc">content</div>]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + local div_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "div" then + div_tag = tag + break + end + end + + assert_not_nil(div_tag) + + -- Test empty attribute value + assert_true(div_tag:has_attribute("class")) + assert_equal("", div_tag:get_attribute("class")) + + -- Test boolean attribute (hidden) + assert_true(div_tag:has_attribute("hidden")) + + -- Test whitespace-only attribute + assert_true(div_tag:has_attribute("style")) + assert_equal(" ", div_tag:get_attribute("style")) + + -- Test numeric attributes with edge cases + assert_equal(0, div_tag:get_numeric_attribute("width")) + assert_nil(div_tag:get_numeric_attribute("height")) -- "abc" is not numeric + + -- Test non-existent attribute + assert_false(div_tag:has_attribute("nonexistent")) + assert_nil(div_tag:get_attribute("nonexistent")) + assert_nil(div_tag:get_numeric_attribute("nonexistent")) + + pool:destroy() + end) + + test("HTML tag complex nested structure", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[ + <table cellpadding="5" cellspacing="2" border="1"> + <tr> + <td align="center" valign="top" width="100"> + <img src="image1.jpg" width="80" height="60" alt="Image 1" /> + </td> + <td align="left" valign="middle"> + <p font-size="12">Text content</p> + </td> + </tr> + </table> + ]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + -- Find table tag + local table_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "table" then + table_tag = tag + break + end + end + + assert_not_nil(table_tag) + + -- Test table attributes + assert_equal(5, table_tag:get_numeric_attribute("cellpadding")) + assert_equal(2, table_tag:get_numeric_attribute("cellspacing")) + assert_equal("1", table_tag:get_attribute("border")) + + -- Test that table has children + local children = table_tag:get_children() + assert_not_nil(children) + assert_true(#children > 0) + + -- Find img tag + local img_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "img" then + img_tag = tag + break + end + end + + assert_not_nil(img_tag) + + -- Test img attributes + assert_equal("image1.jpg", img_tag:get_attribute("src")) + assert_equal(80, img_tag:get_numeric_attribute("width")) + assert_equal(60, img_tag:get_numeric_attribute("height")) + assert_equal("Image 1", img_tag:get_attribute("alt")) + + pool:destroy() + end) + + test("HTML tag with mixed known and unknown attributes", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = + [[<div class="container" data-analytics="track" custom-attr="value" style="color: blue;" unknown123="test">content</div>]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + local div_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "div" then + div_tag = tag + break + end + end + + assert_not_nil(div_tag) + + local all_attrs = div_tag:get_all_attributes() + local unknown_attrs = div_tag:get_unknown_attributes() + + -- All attributes should include both known and unknown + assert_not_nil(all_attrs["class"]) -- known + assert_not_nil(all_attrs["style"]) -- known + assert_not_nil(all_attrs["custom-attr"]) -- unknown + assert_not_nil(all_attrs["unknown123"]) -- unknown + + -- Unknown attributes should only include unrecognized ones + assert_nil(unknown_attrs["class"]) -- known, shouldn't be here + assert_nil(unknown_attrs["style"]) -- known, shouldn't be here + assert_not_nil(unknown_attrs["custom-attr"]) -- unknown, should be here + assert_not_nil(unknown_attrs["unknown123"]) -- unknown, should be here + + assert_equal("value", unknown_attrs["custom-attr"]) + assert_equal("test", unknown_attrs["unknown123"]) + + pool:destroy() + end) end) |