You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

html.lua 2.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  -- Unit test for rspamd_util.parse_html: each case feeds an HTML document to
  -- the parser and compares the stringified result with the expected plain text.
  -- Every entry in `cases` is a pair: { html_source, expected_text }.
  context("HTML processing", function()
    local rspamd_util = require("rspamd_util")

    test("Extract text from HTML", function()
      local cases = {
        -- Full document with malformed markup in the body: an empty tag (`<>`),
        -- an upper-case closing tag and mismatched <b>/<p> nesting.
        {[[
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- page content -->
Hello, world! <b>test</b>
<p>data<>
</P>
<b>stuff</p>?
</body>
</html>
]], "Hello, world! test\r\ndata\r\nstuff\r\n?"},
        -- XHTML document with an XML declaration and a multi-line DOCTYPE.
        {[[
<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>
Wikibooks
</title>
</head>
<body>
<p>
Hello, world!
</p>
</body>
</html>]], 'Hello, world!\r\n'},
        -- <style> that is never closed and wraps a comment full of dashes.
        {[[
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
<style><!--
- -a -a -a -- --- -
--></head>
<body>
<!-- page content -->
Hello, world!
</body>
</html>
]], 'Hello, world!'},
        -- Line-breaking elements: <br>, stray </br> and </hr>, and a <div>.
        {[[
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- page content -->
Hello, world!<br>test</br><br>content</hr>more content<br>
<div>
content inside div
</div>
</body>
</html>
]], 'Hello, world!\r\ntest\r\ncontent\r\nmore content\r\ncontent inside div\r\n'},
        -- Tables: bare text inside <table>, then header and data rows.
        {[[
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- tabular content -->
<table>
content
</table>
<table>
<tr>
<th>heada</th>
<th>headb</th>
</tr>
<tr>
<td>data1</td>
<td>data2</td>
</tr>
</table>
</body>
</html>
]], 'content\r\nheada headb\r\ndata1 data2\r\n'},
        -- Character entities: nbsp, gt, lt, amp, apos and quot.
        {[[
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- escape content -->
a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;
</body>
</html>
]], 'a b a > b a < b a & b \'a "a"'},
      }

      for _, case in ipairs(cases) do
        local html, expected = case[1], case[2]
        local parsed = rspamd_util.parse_html(html)
        assert_not_nil(parsed)

        -- Stringify once and reuse the value. The failure message is built
        -- eagerly on every iteration, and `%s` in string.format only accepts
        -- strings/numbers on Lua 5.1-era runtimes, so passing the raw parser
        -- result there could raise even when the comparison itself succeeds.
        local actual = tostring(parsed)
        assert_equal(expected, actual,
          string.format("'%s' doesn't match with '%s'", expected, actual))
      end
    end)
  end)