aboutsummaryrefslogtreecommitdiffstats
path: root/test/lua/unit/tokenizer.lua
blob: fbf7ee3e73742b6f416063d73a9ecd871d68f796 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
-- Unit tests for rspamd_util.tokenize_text: plain tokenization and
-- tokenization with a list of exceptions passed as the second argument.
context("Text tokenization test", function()
  local util = require "rspamd_util"
  local logger = require "rspamd_logger"

  -- Verifies that `actual` (the tokenizer output for `input`) matches the
  -- `expected` list of tokens exactly.
  --
  -- The token count is asserted explicitly: comparing position by position
  -- while iterating over `actual` alone would pass vacuously whenever the
  -- tokenizer returned fewer tokens than expected (including none at all).
  local function check_tokens(input, actual, expected)
    -- Fail with a readable assertion instead of a raw "attempt to get
    -- length of a nil value" error when nothing was returned.
    assert_not_nil(actual, "must tokenize " .. input)

    if #expected == 0 then
      assert_equal(#actual, 0, "must not have tokens " .. input)
      return
    end

    assert_equal(#actual, #expected, "token count mismatch for " .. input)

    for pos, word in ipairs(actual) do
      assert_equal(word, expected[pos])
    end
  end

  -- Each case is { input text, expected tokens }.
  local simple_cases = {
    {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
     {"Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
      "Integer", "mattis", "nibh"
     }
    },
    {"Հետաքրքրվողների համար ոտորև ներկայացված",
     {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
    },
    {"", {}},
    {",,,,,", {}},
    {"word,,,,,word    ", {"word", "word"}},
    {"word", {"word"}},
    {",,,,word,,,", {"word"}}
  }

  for case_no, case in ipairs(simple_cases) do
    test("Tokenize simple " .. case_no, function()
      local input, expected = case[1], case[2]
      check_tokens(input, util.tokenize_text(input), expected)
    end)
  end

  -- Each case is { input text, exceptions, expected tokens }.
  -- NOTE(review): every exception looks like { offset, length } measured in
  -- bytes from the start of the text (the Armenian prefix below is 10 bytes
  -- plus a space, hence 11; each URL is 24 bytes long) -- confirm against the
  -- tokenize_text documentation. The tests expect each excepted range to be
  -- reported as the single token "!!EX!!".
  local exception_cases = {
    {"word https://example.com/path word",
     {{5, 24}},
     {"word", "!!EX!!", "word"}
    },
    {"համար https://example.com/path համար",
     {{11, 24}},
     {"համար", "!!EX!!", "համար"}
    },
    {"word https://example.com/path https://example.com/path word",
     {{5, 24}, {30, 24}},
     {"word", "!!EX!!", "!!EX!!", "word"}
    },
    {"word https://example.com/path https://example.com/path",
     {{5, 24}, {30, 24}},
     {"word", "!!EX!!", "!!EX!!"}
    },
    {"https://example.com/path https://example.com/path word",
     {{0, 24}, {25, 24}},
     {"!!EX!!", "!!EX!!", "word"}
    },
    {"https://example.com/path https://example.com/path",
     {{0, 24}, {25, 24}},
     {"!!EX!!", "!!EX!!"}
    },
    {",,,,https://example.com/path https://example.com/path    ",
     {{4, 24}, {29, 24}},
     {"!!EX!!", "!!EX!!"}
    },
  }

  for case_no, case in ipairs(exception_cases) do
    test("Tokenize with exceptions " .. case_no, function()
      local input, exceptions, expected = case[1], case[2], case[3]
      check_tokens(input, util.tokenize_text(input, exceptions), expected)
    end)
  end

end)