rspamd/test/lua/unit/html.lua

129 lines
2.8 KiB
Lua
Raw Normal View History

2015-07-16 14:00:51 +02:00
-- Unit tests for HTML-to-text extraction through rspamd_util.parse_html.
context("HTML processing", function()
  local rspamd_util = require("rspamd_util")
  -- NOTE(review): logger is not referenced anywhere in this block.
  local logger = require("rspamd_logger")

  test("Extract text from HTML", function()
    -- Each case is a pair: { html_input, expected_text }.
    -- expected_text is compared against tostring() of the parse result.
    -- The fixture bodies are deliberately kept at column 0: anything placed
    -- inside the long brackets becomes part of the input string.
    local cases = {
      -- Full document with sloppy markup: empty tag, upper-case closing tag,
      -- mismatched closing tag.
      {[[
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- page content -->
Hello, world! <b>test</b>
<p>data<>
</P>
<b>stuff</p>?
</body>
</html>
]], "Hello, world! test\r\ndata\r\nstuff\r\n?"},
      -- XHTML document with an XML prolog and a multi-line DOCTYPE.
      {[[
<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>
Wikibooks
</title>
</head>
<body>
<p>
Hello, world!
</p>
</body>
</html>]], 'Hello, world!\r\n'},
      -- <style> element that is never closed and wraps an HTML comment.
      {[[
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
<style><!--
- -a -a -a -- --- -
--></head>
<body>
<!-- page content -->
Hello, world!
</body>
</html>
]], 'Hello, world!'},
      -- Line-breaking tags (<br>, stray </br> and </hr>) and a <div>.
      {[[
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- page content -->
Hello, world!<br>test</br><br>content</hr>more content<br>
<div>
content inside div
</div>
</body>
</html>
]], 'Hello, world!\r\ntest\r\ncontent\r\nmore content\r\ncontent inside div\r\n'},
      -- Tables: bare text inside <table>, then header and data rows.
      {[[
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- tabular content -->
<table>
content
</table>
<table>
<tr>
<th>heada</th>
<th>headb</th>
</tr>
<tr>
<td>data1</td>
<td>data2</td>
</tr>
</table>
</body>
</html>
]], 'content\r\nheada headb\r\ndata1 data2\r\n'},
      -- Character entities: nbsp, gt, lt, amp, apos, quot.
      {[[
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- escape content -->
a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;
</body>
</html>
]], 'a b a > b a < b a & b \'a "a"'},
    }

    for _, c in ipairs(cases) do
      local t = rspamd_util.parse_html(c[1])

      assert_not_nil(t)
      -- Stringify once and reuse it for both the comparison and the failure
      -- message, so the message is built from a plain string rather than from
      -- whatever object parse_html returned.
      local actual = tostring(t)
      assert_equal(c[2], actual, string.format("'%s' doesn't match with '%s'",
        c[2], actual))
    end
  end)
end)