aboutsummaryrefslogtreecommitdiffstats
path: root/test/lua/unit/html.lua
blob: 81c52ec1bb65a9a4bdf4d3ebff482a98290f6545 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
-- Unit tests for HTML-to-text extraction: every case feeds an HTML document
-- to rspamd_util.parse_html and compares the stringified result with the
-- expected plain text.
context("HTML processing", function()
  local rspamd_util = require("rspamd_util")

  -- Each case is a pair: { html_input, expected_text }.
  -- NOTE: the inputs are long-bracket strings, so every character up to the
  -- closing brackets (including the indentation in front of them) is part of
  -- the input. Do not re-indent the fixtures.
  local cases = {
      -- Entities
      {[[<html><body>.&#102;&#105;&#114;&#101;&#98;&#97;&#115;&#101;&#97;&#112;&#112;.&#99;&#111;&#109;</body></html>]],
       [[.firebaseapp.com]]},
      -- XHTML document with an XML prolog and DOCTYPE; only the paragraph
      -- text is expected, with the run of spaces collapsed to one
      {[[
<?xml version="1.0" encoding="iso-8859-1"?>
 <!DOCTYPE html
   PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
   <head>
     <title>
       Wikibooks
     </title>
   </head>
   <body>
     <p>
       Hello,          world!

     </p>
   </body>
 </html>]], 'Hello, world!\n'},
      -- <style> element that is never closed and contains a comment with
      -- stray dashes; only the body text is expected
      {[[
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>title</title>
    <link rel="stylesheet" href="style.css">
    <script src="script.js"></script>
    <style><!--
- -a -a -a -- --- -
  --></head>
  <body>
    <!-- page content -->
    Hello, world!
  </body>
</html>
      ]], 'Hello, world!'},
      -- <br> and <div> handling, including stray </br> and </hr> closing
      -- tags (the expected text has no separator where </hr> appears)
      {[[
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>title</title>
    <link rel="stylesheet" href="style.css">
    <script src="script.js"></script>
  </head>
  <body>
    <!-- page content -->
    Hello, world!<br>test</br><br>content</hr>more content<br>
    <div>
      content inside div
    </div>
  </body>
</html>
      ]], 'Hello, world!\ntest\ncontentmore content\ncontent inside div\n'},
      -- Tables: cells of a row are expected to be joined by a space and
      -- each row terminated by a newline
      {[[
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>title</title>
    <link rel="stylesheet" href="style.css">
    <script src="script.js"></script>
  </head>
  <body>
    <!-- tabular content -->
    <table>
      content
    </table>
    <table>
      <tr>
        <th>heada</th>
        <th>headb</th>
      </tr>
      <tr>
        <td>data1</td>
        <td>data2</td>
      </tr>
    </table>

  </body>
</html>
      ]], 'content\nheada headb\ndata1 data2\n'},
      -- Named character entities
      {[[
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>title</title>
    <link rel="stylesheet" href="style.css">
    <script src="script.js"></script>
  </head>
  <body>
    <!-- escape content -->
    a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;
  </body>
</html>
      ]], 'a b a > b a < b a & b \'a "a"'},
  }

  -- Register one test per case; the case index is part of the test name so a
  -- failure can be traced back to its fixture.
  for i, c in ipairs(cases) do
    test("Extract text from HTML " .. tostring(i), function()
      local t = rspamd_util.parse_html(c[1])

      assert_not_nil(t)

      -- Stringify once and use that value for both the comparison and the
      -- failure message. The message is built eagerly, even when the test
      -- passes, and on Lua 5.1 string.format's %s rejects a non-string
      -- argument, so formatting the raw parse result could raise before the
      -- assertion is evaluated.
      -- NOTE(review): assumes parse_html returns a non-string object with a
      -- __tostring metamethod; confirm against the rspamd_util API.
      local text = tostring(t)
      assert_equal(c[2], text, string.format("'%s' doesn't match with '%s'",
          c[2], text))
    end)
  end
end)