1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
|
context("Text tokenization test", function()
local util = require "rspamd_util"
local logger = require "rspamd_logger"
-- Verifies that util.tokenize_text splits plain text into the expected list
-- of words for ASCII and non-ASCII input, and yields no tokens for empty or
-- punctuation-only input.
test("Tokenize simple text", function()
  -- Each case is { input text, expected tokens }
  local cases = {
    {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
      {"Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
        "Integer", "mattis", "nibh"
      }
    },
    {"Հետաքրքրվողների համար ոտորև ներկայացված",
      {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
    },
    {"", {}},
    {",,,,,", {}},
    {"word,,,,,word ", {"word", "word"}},
    {"word", {"word"}},
    {",,,,word,,,", {"word"}}
  }
  for _, c in ipairs(cases) do
    local text, expected = c[1], c[2]
    local w = util.tokenize_text(text)
    if #expected == 0 then
      assert_equal(#w, 0, "must not have tokens " .. text)
    else
      assert_not_nil(w, "must tokenize " .. text)
      -- The loop below walks the returned tokens only, so without this count
      -- check a truncated result (missing trailing tokens) would pass.
      assert_equal(#w, #expected, "unexpected number of tokens for " .. text)
      for i, wrd in ipairs(w) do
        assert_equal(wrd, expected[i])
      end
    end
  end
end)
-- Same scenarios as the plain tokenization test, but calling
-- util.tokenize_text with an empty exceptions table and `true` as the third
-- argument (the legacy mode, going by the test name). The expectations
-- deliberately encode the current behaviour of that mode, including its
-- known defects noted below.
test("Tokenize simple text (legacy)", function()
  -- Each case is { input text, expected tokens }
  local cases = {
    -- First token is bad: its leading character is lost ("orem", "ord")
    {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
      {"orem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
        "Integer", "mattis", "nibh"
      }
    },
    -- Unicode is broken, so this case stays disabled
    --{"Հետաքրքրվողների համար ոտորև ներկայացված",
    --  {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
    --},
    {"", {}},
    {",,,,,", {}},
    {"word,,,,,word ", {"ord", "word"}},
    {"word", {"ord"}},
    {",,,,word,,,", {"word"}}
  }
  for _, c in ipairs(cases) do
    local text, expected = c[1], c[2]
    local w = util.tokenize_text(text, {}, true)
    if #expected == 0 then
      assert_equal(#w, 0, "must not have tokens " .. text)
    else
      assert_not_nil(w, "must tokenize " .. text)
      -- The loop below walks the returned tokens only, so without this count
      -- check a truncated result (missing trailing tokens) would pass.
      assert_equal(#w, #expected, "unexpected number of tokens for " .. text)
      for i, wrd in ipairs(w) do
        assert_equal(wrd, expected[i])
      end
    end
  end
end)
-- Verifies that util.tokenize_text, when given a list of exceptions, emits
-- the "!!EX!!" placeholder token for each excepted span and still tokenizes
-- the surrounding words normally.
test("Tokenize with exceptions", function()
  -- Each case is { input text, exceptions, expected tokens }.
  -- In these fixtures every exception is a { offset, length } pair covering
  -- one URL; offsets are 0-based and counted in bytes (the Armenian prefix
  -- below is 5 two-byte characters plus a space, hence offset 11).
  local cases = {
    {"word https://example.com/path word",
      {{5, 24}},
      {"word", "!!EX!!", "word"}
    },
    {"համար https://example.com/path համար",
      {{11, 24}},
      {"համար", "!!EX!!", "համար"}
    },
    {"word https://example.com/path https://example.com/path word",
      {{5, 24}, {30, 24}},
      {"word", "!!EX!!", "!!EX!!", "word"}
    },
    {"word https://example.com/path https://example.com/path",
      {{5, 24}, {30, 24}},
      {"word", "!!EX!!", "!!EX!!"}
    },
    {"https://example.com/path https://example.com/path word",
      {{0, 24}, {25, 24}},
      {"!!EX!!", "!!EX!!", "word"}
    },
    {"https://example.com/path https://example.com/path",
      {{0, 24}, {25, 24}},
      {"!!EX!!", "!!EX!!"}
    },
    {",,,,https://example.com/path https://example.com/path ",
      {{4, 24}, {29, 24}},
      {"!!EX!!", "!!EX!!"}
    },
  }
  for _, c in ipairs(cases) do
    local text, exceptions, expected = c[1], c[2], c[3]
    local w = util.tokenize_text(text, exceptions)
    if #expected == 0 then
      assert_equal(#w, 0, "must not have tokens " .. text)
    else
      assert_not_nil(w, "must tokenize " .. text)
      -- The loop below walks the returned tokens only, so without this count
      -- check a result missing a trailing word or placeholder would pass.
      assert_equal(#w, #expected, "unexpected number of tokens for " .. text)
      for i, wrd in ipairs(w) do
        assert_equal(wrd, expected[i])
      end
    end
  end
end)
end)
|