-- test/lua/unit/tokenizer.lua
context("Text tokenization test", function()
  local util = require "rspamd_util"
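  -- The tests below exercise util.tokenize_text(text[, exceptions[, legacy]])
  -- as used in this suite: default tokenization, the legacy tokenizer, and
  -- tokenization around exception ranges (e.g. URLs), which are emitted as
  -- the "!!EX!!" placeholder token.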
  test("Tokenize simple text", function()
    local cases = {
      {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
        {"Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
        "Integer", "mattis", "nibh"
        }
      },
      {"Հետաքրքրվողների համար ոտորև ներկայացված",
        {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
      },
      {"", {}},
      {",,,,,", {}},
      {"word,,,,,word    ", {"word", "word"}},
      {"word", {"word"}},
      {",,,,word,,,", {"word"}}
    }
    
    for _,c in ipairs(cases) do
      local w = util.tokenize_text(c[1])
      if #c[2] == 0 then
        assert_equal(#w, 0, "must not have tokens " .. c[1])
      else
        assert_not_nil(w, "must tokenize " .. c[1])
        assert_equal(#w, #c[2], "wrong token count for " .. c[1])

        for i,wrd in ipairs(w) do
          assert_equal(wrd, c[2][i])
        end
      end
    end
  end)
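  -- The legacy tokenizer mangles the first token and mishandles UTF-8; the
  -- cases below pin down these known quirks rather than desired behaviour.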
    test("Tokenize simple text (legacy)", function()
    local cases = {
      -- Known quirk: the legacy tokenizer drops the first character of the
      -- first token ("Lorem" becomes "orem"; see also "ord" below)
      {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
        {"orem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
        "Integer", "mattis", "nibh"
        }
      },
      -- Known quirk: the legacy tokenizer mishandles UTF-8, so this case is
      -- disabled
      --{"Հետաքրքրվողների համար ոտորև ներկայացված",
      --  {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
      --},
      {"", {}},
      {",,,,,", {}},
      {"word,,,,,word    ", {"ord", "word"}},
      {"word", {"ord"}},
      {",,,,word,,,", {"word"}}
    }
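    -- The third argument (true) selects the legacy tokenizer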
    
    for _,c in ipairs(cases) do
      local w = util.tokenize_text(c[1], {}, true)
      if #c[2] == 0 then
        assert_equal(#w, 0, "must not have tokens " .. c[1])
      else
        assert_not_nil(w, "must tokenize " .. c[1])
        assert_equal(#w, #c[2], "wrong token count for " .. c[1])

        for i,wrd in ipairs(w) do
          assert_equal(wrd, c[2][i])
        end
      end
    end
  end)
  test("Tokenize with exceptions", function()
    local cases = {
      {"word https://example.com/path word",
        {{5, 24}},
        {"word", "!!EX!!", "word"}
      },
      {"համար https://example.com/path համար",
        {{11, 24}},
        {"համար", "!!EX!!", "համար"}
      },
      {"word https://example.com/path https://example.com/path word",
        {{5, 24}, {30, 24}},
        {"word", "!!EX!!", "!!EX!!", "word"}
      },
      {"word https://example.com/path https://example.com/path",
        {{5, 24}, {30, 24}},
        {"word", "!!EX!!", "!!EX!!"}
      },
      {"https://example.com/path https://example.com/path word",
        {{0, 24}, {25, 24}},
        {"!!EX!!", "!!EX!!", "word"}
      },
      {"https://example.com/path https://example.com/path",
        {{0, 24}, {25, 24}},
        {"!!EX!!", "!!EX!!"}
      },
      {",,,,https://example.com/path https://example.com/path    ",
        {{4, 24}, {29, 24}},
        {"!!EX!!", "!!EX!!"}
      },
    }
    
    for _,c in ipairs(cases) do
      local w = util.tokenize_text(c[1], c[2])
      if #c[3] == 0 then
        assert_equal(#w, 0, "must not have tokens " .. c[1])
      else
        assert_not_nil(w, "must tokenize " .. c[1])
        assert_equal(#w, #c[3], "wrong token count for " .. c[1])

        for i,wrd in ipairs(w) do
          assert_equal(wrd, c[3][i])
        end
      end
    end
  end)
end)
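
-- Illustrative sketch (comments only, not executed by the suite): because
-- exception offsets are 0-based byte positions, they can be derived from
-- string.find, which returns 1-based inclusive byte indices:
--
--   local text = "see https://example.com/path here"
--   local s, e = text:find("https://%S+")
--   local toks = util.tokenize_text(text, {{s - 1, e - s + 1}})
--   -- per the cases above, toks should be {"see", "!!EX!!", "here"}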