You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

html.lua 2.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  -- Unit tests for HTML text extraction.
  --
  -- Each entry in `cases` is a pair { html_source, expected_text }. For every
  -- pair the HTML is passed to `rspamd_util.parse_html` and the string form of
  -- the result is compared with the expected text.
  context("HTML processing", function()
    local rspamd_util = require("rspamd_util")

    local cases = {
      -- Entities: numeric character references decode to plain characters
      {[[<html><body>.&#102;&#105;&#114;&#101;&#98;&#97;&#115;&#101;&#97;&#112;&#112;.&#99;&#111;&#109;</body></html>]],
       [[.firebaseapp.com]]},

      -- XHTML document with an XML prolog and a multi-line DOCTYPE; only the
      -- paragraph text is expected (the <title> text is not)
      {[[
<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>
Wikibooks
</title>
</head>
<body>
<p>
Hello, world!
</p>
</body>
</html>]], 'Hello, world!\n'},

      -- <style> that is never closed, wrapping a comment full of dashes;
      -- the body text must still be extracted
      {[[
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
<style><!--
- -a -a -a -- --- -
--></head>
<body>
<!-- page content -->
Hello, world!
</body>
</html>
]], 'Hello, world!'},

      -- Line breaks and block tags: <br> and <div> yield newlines, while the
      -- stray closing tags (</br>, </hr>) are expected to add no extra text
      {[[
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- page content -->
Hello, world!<br>test</br><br>content</hr>more content<br>
<div>
content inside div
</div>
</body>
</html>
]], 'Hello, world!\ntest\ncontentmore content\ncontent inside div\n'},

      -- Tables: cells of a row are expected space-separated, one row per line
      {[[
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- tabular content -->
<table>
content
</table>
<table>
<tr>
<th>heada</th>
<th>headb</th>
</tr>
<tr>
<td>data1</td>
<td>data2</td>
</tr>
</table>
</body>
</html>
]], 'content\nheada headb\ndata1 data2\n'},

      -- Named entities (&nbsp; &gt; &lt; &amp; &apos; &quot;)
      {[[
<html lang="en">
<head>
<meta charset="utf-8">
<title>title</title>
<link rel="stylesheet" href="style.css">
<script src="script.js"></script>
</head>
<body>
<!-- escape content -->
a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;
</body>
</html>
]], 'a b a > b a < b a & b \'a "a"'},
    }

    for i, c in ipairs(cases) do
      test("Extract text from HTML " .. tostring(i), function()
        local t = rspamd_util.parse_html(c[1])
        assert_not_nil(t)
        -- Convert once and reuse: the parse result is compared through
        -- tostring(), so it may not be a plain string, and string.format's
        -- "%s" rejects non-string arguments on stock Lua 5.1. The message is
        -- built eagerly, so this matters even when the assertion passes.
        local extracted = tostring(t)
        assert_equal(c[2], extracted, string.format("'%s' doesn't match with '%s'",
            c[2], extracted))
      end)
    end
  end)