You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

tokenizer.lua 2.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. context("Text tokenization test", function()
  2. local util = require "rspamd_util"
  3. local logger = require "rspamd_logger"
  4. local cases = {
  5. {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
  6. {"Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
  7. "Integer", "mattis", "nibh"
  8. }
  9. },
  10. {"Հետաքրքրվողների համար ոտորև ներկայացված",
  11. {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
  12. },
  13. {"", {}},
  14. {",,,,,", {}},
  15. {"word,,,,,word ", {"word", "word"}},
  16. {"word", {"word"}},
  17. {",,,,word,,,", {"word"}}
  18. }
  19. for i,c in ipairs(cases) do
  20. test("Tokenize simple " .. i, function()
  21. local w = util.tokenize_text(c[1])
  22. if #c[2] == 0 then
  23. assert_equal(#w, 0, "must not have tokens " .. c[1])
  24. else
  25. assert_not_nil(w, "must tokenize " .. c[1])
  26. for i,wrd in ipairs(w) do
  27. assert_equal(wrd, c[2][i])
  28. end
  29. end
  30. end)
  31. end
  32. cases = {
  33. {"word https://example.com/path word",
  34. {{5, 24}},
  35. {"word", "!!EX!!", "word"}
  36. },
  37. {"համար https://example.com/path համար",
  38. {{11, 24}},
  39. {"համար", "!!EX!!", "համար"}
  40. },
  41. {"word https://example.com/path https://example.com/path word",
  42. {{5, 24}, {30, 24}},
  43. {"word", "!!EX!!", "!!EX!!", "word"}
  44. },
  45. {"word https://example.com/path https://example.com/path",
  46. {{5, 24}, {30, 24}},
  47. {"word", "!!EX!!", "!!EX!!"}
  48. },
  49. {"https://example.com/path https://example.com/path word",
  50. {{0, 24}, {25, 24}},
  51. {"!!EX!!", "!!EX!!", "word"}
  52. },
  53. {"https://example.com/path https://example.com/path",
  54. {{0, 24}, {25, 24}},
  55. {"!!EX!!", "!!EX!!"}
  56. },
  57. {",,,,https://example.com/path https://example.com/path ",
  58. {{4, 24}, {29, 24}},
  59. {"!!EX!!", "!!EX!!"}
  60. },
  61. }
  62. for i,c in ipairs(cases) do
  63. test("Tokenize with exceptions " .. i, function()
  64. local w = util.tokenize_text(c[1], c[2])
  65. if #c[3] == 0 then
  66. assert_equal(#w, 0, "must not have tokens " .. c[1])
  67. else
  68. assert_not_nil(w, "must tokenize " .. c[1])
  69. for i,wrd in ipairs(w) do
  70. assert_equal(wrd, c[3][i])
  71. end
  72. end
  73. end)
  74. end
  75. end)