author     Vsevolod Stakhov <vsevolod@highsecure.ru>  2018-03-30 14:27:14 +0100
committer  Vsevolod Stakhov <vsevolod@highsecure.ru>  2018-03-30 14:27:14 +0100
commit     8160aa803846616f28d94519486d310a771f58bf (patch)
tree       56ccb9681b77f574d42f054f1b2431609d80dbd6 /test
parent     f3d6aacab592f53a346b7573911dee2489392b3a (diff)
download   rspamd-8160aa803846616f28d94519486d310a771f58bf.tar.gz
           rspamd-8160aa803846616f28d94519486d310a771f58bf.zip
[Test] Improve tokenization tests
Diffstat (limited to 'test')
-rw-r--r--  test/lua/unit/tokenizer.lua  164
1 file changed, 84 insertions, 80 deletions
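The change below follows a common data-driven refactoring: instead of one test() body looping over every case, each case registers its own named test, so a failure pinpoints the offending input rather than aborting the whole loop at the first mismatch. A minimal sketch of the pattern (the test and assert_equal helpers come from rspamd's Lua unit-test framework, as seen in the diff; the cases here are a subset of those below):

    local util = require "rspamd_util"

    local cases = {
      {"word", {"word"}},   -- input, expected tokens
      {",,,,,", {}},        -- punctuation only: no tokens expected
    }

    for i, c in ipairs(cases) do
      -- One named test per case: each case passes or fails independently.
      test("Tokenize simple " .. i, function()
        local w = util.tokenize_text(c[1])
        assert_equal(#w, #c[2], "token count for " .. c[1])
        for j, wrd in ipairs(w) do
          assert_equal(wrd, c[2][j])
        end
      end)
    end
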
diff --git a/test/lua/unit/tokenizer.lua b/test/lua/unit/tokenizer.lua
index e05f74d86..16f8f1846 100644
--- a/test/lua/unit/tokenizer.lua
+++ b/test/lua/unit/tokenizer.lua
@@ -1,111 +1,115 @@
context("Text tokenization test", function()
local util = require "rspamd_util"
local logger = require "rspamd_logger"
- test("Tokenize simple text", function()
- local cases = {
- {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
- {"Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
- "Integer", "mattis", "nibh"
- }
- },
- {"Հետաքրքրվողների համար ոտորև ներկայացված",
- {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
- },
- {"", {}},
- {",,,,,", {}},
- {"word,,,,,word ", {"word", "word"}},
- {"word", {"word"}},
- {",,,,word,,,", {"word"}}
- }
-
- for _,c in ipairs(cases) do
+
+ local cases = {
+ {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
+ {"Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
+ "Integer", "mattis", "nibh"
+ }
+ },
+ {"Հետաքրքրվողների համար ոտորև ներկայացված",
+ {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
+ },
+ {"", {}},
+ {",,,,,", {}},
+ {"word,,,,,word ", {"word", "word"}},
+ {"word", {"word"}},
+ {",,,,word,,,", {"word"}}
+ }
+
+ for i,c in ipairs(cases) do
+ test("Tokenize simple " .. i, function()
local w = util.tokenize_text(c[1])
if #c[2] == 0 then
assert_equal(#w, 0, "must not have tokens " .. c[1])
else
assert_not_nil(w, "must tokenize " .. c[1])
-
+
for i,wrd in ipairs(w) do
assert_equal(wrd, c[2][i])
end
end
- end
- end)
- test("Tokenize simple text (legacy)", function()
- local cases = {
- -- First token is bad
- {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
- {"orem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
- "Integer", "mattis", "nibh"
- }
- },
- -- Unicode is broken
- --{"Հետաքրքրվողների համար ոտորև ներկայացված",
- -- {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
- --},
- {"", {}},
- {",,,,,", {}},
- {"word,,,,,word ", {"ord", "word"}},
- {"word", {"ord"}},
- {",,,,word,,,", {"word"}}
- }
-
- for _,c in ipairs(cases) do
+ end)
+ end
+
+
+ cases = {
+ -- First token is bad
+ {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
+ {"orem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
+ "Integer", "mattis", "nibh"
+ }
+ },
+ -- Unicode is broken
+ --{"Հետաքրքրվողների համար ոտորև ներկայացված",
+ -- {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
+ --},
+ {"", {}},
+ {",,,,,", {}},
+ {"word,,,,,word ", {"ord", "word"}},
+ {"word", {"ord"}},
+ {",,,,word,,,", {"word"}}
+ }
+
+ for i,c in ipairs(cases) do
+ test("Tokenize simple text (legacy) " .. i, function()
local w = util.tokenize_text(c[1], {}, true)
if #c[2] == 0 then
assert_equal(#w, 0, "must not have tokens " .. c[1])
else
assert_not_nil(w, "must tokenize " .. c[1])
-
+
for i,wrd in ipairs(w) do
assert_equal(wrd, c[2][i])
end
end
- end
- end)
- test("Tokenize with exceptions", function()
- local cases = {
- {"word https://example.com/path word",
- {{5, 24}},
- {"word", "!!EX!!", "word"}
- },
- {"համար https://example.com/path համար",
- {{11, 24}},
- {"համար", "!!EX!!", "համար"}
- },
- {"word https://example.com/path https://example.com/path word",
- {{5, 24}, {30, 24}},
- {"word", "!!EX!!", "!!EX!!", "word"}
- },
- {"word https://example.com/path https://example.com/path",
- {{5, 24}, {30, 24}},
- {"word", "!!EX!!", "!!EX!!"}
- },
- {"https://example.com/path https://example.com/path word",
- {{0, 24}, {25, 24}},
- {"!!EX!!", "!!EX!!", "word"}
- },
- {"https://example.com/path https://example.com/path",
- {{0, 24}, {25, 24}},
- {"!!EX!!", "!!EX!!"}
- },
- {",,,,https://example.com/path https://example.com/path ",
- {{4, 24}, {29, 24}},
- {"!!EX!!", "!!EX!!"}
- },
- }
-
- for _,c in ipairs(cases) do
+ end)
+ end
+
+ cases = {
+ {"word https://example.com/path word",
+ {{5, 24}},
+ {"word", "!!EX!!", "word"}
+ },
+ {"համար https://example.com/path համար",
+ {{11, 24}},
+ {"համար", "!!EX!!", "համար"}
+ },
+ {"word https://example.com/path https://example.com/path word",
+ {{5, 24}, {30, 24}},
+ {"word", "!!EX!!", "!!EX!!", "word"}
+ },
+ {"word https://example.com/path https://example.com/path",
+ {{5, 24}, {30, 24}},
+ {"word", "!!EX!!", "!!EX!!"}
+ },
+ {"https://example.com/path https://example.com/path word",
+ {{0, 24}, {25, 24}},
+ {"!!EX!!", "!!EX!!", "word"}
+ },
+ {"https://example.com/path https://example.com/path",
+ {{0, 24}, {25, 24}},
+ {"!!EX!!", "!!EX!!"}
+ },
+ {",,,,https://example.com/path https://example.com/path ",
+ {{4, 24}, {29, 24}},
+ {"!!EX!!", "!!EX!!"}
+ },
+ }
+
+ for i,c in ipairs(cases) do
+ test("Tokenize with exceptions " .. i, function()
local w = util.tokenize_text(c[1], c[2])
if #c[3] == 0 then
assert_equal(#w, 0, "must not have tokens " .. c[1])
else
assert_not_nil(w, "must tokenize " .. c[1])
-
for i,wrd in ipairs(w) do
assert_equal(wrd, c[3][i])
end
end
- end
- end)
+ end)
+ end
+
end)
\ No newline at end of file
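
For reference, the exception cases above suggest that the second argument to util.tokenize_text is a list of {offset, length} pairs (0-based byte offsets) marking spans, such as URLs, that are excluded from tokenization; each excluded span surfaces as the placeholder token "!!EX!!". A standalone sketch under that assumption:

    local util = require "rspamd_util"

    -- "https://example.com/path" starts at 0-based offset 5 and spans 24 bytes.
    local text = "word https://example.com/path word"
    local words = util.tokenize_text(text, {{5, 24}})

    for _, w in ipairs(words) do
      print(w)  -- per the cases above: "word", "!!EX!!", "word"
    end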