|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113 |
-- Text-extraction tests for the HTML parser.
-- Every case pairs an HTML document with the plain text expected from
-- `tostring(rspamd_util.parse_html(html))`; one test is registered per case.
context("HTML processing", function()
  local rspamd_util = require("rspamd_util")

  -- Each entry is { html_input, expected_text }.
  local cases = {
    -- Entities
    -- NOTE(review): input and expected text are identical here; presumably the
    -- input was meant to be spelled with HTML entities -- confirm against the
    -- canonical copy of this test.
    {[[<html><body>.firebaseapp.com</body></html>]],
     [[.firebaseapp.com]]},
    -- XHTML document with an XML prolog and a multi-line DOCTYPE
    {[[
<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>
Wikibooks
</title>
</head>
<body>
<p>
Hello, world!

</p>
</body>
</html>]], 'Hello, world!\n'},
    -- Unterminated <style> element holding a comment full of dashes
    {[[
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
<style><!--
- -a -a -a -- --- -
--></head>
<body>
<!-- page content -->
Hello, world!
</body>
</html>
]], 'Hello, world!'},
    -- Line breaks, stray closing tags (</br>, </hr>) and a <div> block
    {[[
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- page content -->
Hello, world!<br>test</br><br>content</hr>more content<br>
<div>
content inside div
</div>
</body>
</html>
]], 'Hello, world!\ntest\ncontentmore content\ncontent inside div\n'},
    -- Tables: bare text inside <table>, then header and data rows
    {[[
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- tabular content -->
<table>
content
</table>
<table>
<tr>
<th>heada</th>
<th>headb</th>
</tr>
<tr>
<td>data1</td>
<td>data2</td>
</tr>
</table>

</body>
</html>
]], 'content\nheada headb\ndata1 data2\n'},
    -- Escaped content
    -- NOTE(review): the special characters below appear literally; presumably
    -- the input was meant to use HTML escapes -- confirm against the canonical
    -- copy of this test.
    {[[
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- escape content -->
a b a > b a < b a & b 'a "a"
</body>
</html>
]], 'a b a > b a < b a & b \'a "a"'},
  }

  for i, c in ipairs(cases) do
    test("Extract text from HTML " .. tostring(i), function()
      local t = rspamd_util.parse_html(c[1])

      assert_not_nil(t)

      -- Stringify once and reuse it: the message below is built eagerly, and
      -- on Lua 5.1 `%s` rejects arguments that are neither strings nor
      -- numbers, which would raise before the comparison is even attempted.
      local extracted = tostring(t)
      assert_equal(c[2], extracted,
        string.format("'%s' doesn't match with '%s'", c[2], extracted))
    end)
  end
end)
|