-- Unit tests for HTML-to-text extraction through rspamd_util.parse_html.
context("HTML processing", function()
  local rspamd_util = require("rspamd_util")

  test("Extract text from HTML", function()
    -- Each case is { html_input, expected_text }.
    -- The HTML fixtures are long-bracket literals deliberately kept on a
    -- single line each: every byte inside [[...]] is part of the input handed
    -- to the parser, so re-wrapping them would change what is being tested.
    local cases = {
      -- HTML5 document with an empty tag (<>), a mismatched-case closing tag
      -- (</P>) and an unclosed <b>.
      {
        [[ <!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"> <title>title</title> <link rel="stylesheet" href="style.css"> <script src="script.js"></script> </head> <body> <!-- page content --> Hello, world! <b>test</b> <p>data<> </P> <b>stuff</p>? </body> </html> ]],
        "Hello, world! test\r\ndata\r\nstuff?",
      },
      -- XHTML document with an XML prolog and a PUBLIC doctype.
      {
        [[ <?xml version="1.0" encoding="iso-8859-1"?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> <head> <title> Wikibooks </title> </head> <body> <p> Hello, world! </p> </body> </html>]],
        '\r\nHello, world!\r\n',
      },
      -- <style> element that is never closed and holds a comment-like run of
      -- dashes before </head>.
      {
        [[ <!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"> <title>title</title> <link rel="stylesheet" href="style.css"> <script src="script.js"></script> <style><!-- - -a -a -a -- --- - --></head> <body> <!-- page content --> Hello, world! </body> </html> ]],
        'Hello, world!',
      },
    }

    for _, case in ipairs(cases) do
      local html, expected = case[1], case[2]
      local parsed = rspamd_util.parse_html(html)

      -- The parser must return a value, and its string form must match the
      -- expected extracted text exactly.
      assert_not_nil(parsed)
      assert_equal(expected, tostring(parsed))
    end
  end)
end)