about summary refs log tree commit diff stats
path: root/test/lua/unit
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-05-21 10:23:52 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-05-21 10:23:52 +0100
commit 97f3f341e396a2df2d00e5a3491e2eb4bc882547 (patch)
tree cd565444844b2bfef06324e7d5dbd25cd90322dd /test/lua/unit
parent c44ddedf8c4950cc679073bb809e8d27b0186951 (diff)
download rspamd-97f3f341e396a2df2d00e5a3491e2eb4bc882547.tar.gz
rspamd-97f3f341e396a2df2d00e5a3491e2eb4bc882547.zip
Add more unit tests for tokenization.
Diffstat (limited to 'test/lua/unit')
-rw-r--r-- test/lua/unit/tokenizer.lua 95
1 file changed, 90 insertions, 5 deletions
diff --git a/test/lua/unit/tokenizer.lua b/test/lua/unit/tokenizer.lua
index 628d70298..6aefeec6f 100644
--- a/test/lua/unit/tokenizer.lua
+++ b/test/lua/unit/tokenizer.lua
@@ -11,15 +11,100 @@ context("Text tokenization test", function()
{"Հետաքրքրվողների համար ոտորև ներկայացված",
{"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
},
+ {"", {}},
+ {",,,,,", {}},
+ {"word,,,,,word ", {"word", "word"}},
+ {"word", {"word"}},
+ {",,,,word,,,", {"word"}}
}
for _,c in ipairs(cases) do
local w = util.tokenize_text(c[1])
- assert_not_nil(w, "cannot tokenize " .. c[1])
-
- for i,wrd in ipairs(w) do
- logger.infox('%1:%2', i, wrd)
- assert_equal(wrd, c[2][i])
+ if #c[2] == 0 then
+ assert_equal(#w, 0, "must not have tokens " .. c[1])
+ else
+ assert_not_nil(w, "must tokenize " .. c[1])
+
+ for i,wrd in ipairs(w) do
+ assert_equal(wrd, c[2][i])
+ end
+ end
+ end
+ end)
+ test("Tokenize simple text (legacy)", function()
+ -- Each case is {input, expected tokens}. The extra `{}, true` arguments passed to
+ -- util.tokenize_text below presumably select the legacy tokenizer (TODO confirm).
+ local cases = {
+ -- First token is bad
+ {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
+ {"orem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
+ "Integer", "mattis", "nibh"
+ }
+ },
+ -- Unicode is broken
+ --{"Հետաքրքրվողների համար ոտորև ներկայացված",
+ -- {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
+ --},
+ {"", {}},
+ {",,,,,", {}},
+ {"word,,,,,word ", {"ord", "word"}},
+ {"word", {"ord"}},
+ {",,,,word,,,", {"word"}}
+ }
+ for _,c in ipairs(cases) do
+ local w = util.tokenize_text(c[1], {}, true)
+ -- Check for nil before taking #w, and pin the token count so that a
+ -- result with missing trailing tokens cannot pass silently.
+ assert_not_nil(w, "must tokenize " .. c[1])
+ assert_equal(#w, #c[2], "unexpected number of tokens for " .. c[1])
+
+ for i,wrd in ipairs(w) do
+ assert_equal(wrd, c[2][i])
+ end
+ end
+ end)
+ test("Tokenize with exceptions", function()
+ -- Each case is {input, exceptions, expected tokens}. Exceptions appear to be
+ -- {offset, length} pairs: the values match 0-based byte offsets of the 24-byte URLs.
+ local cases = {
+ {"word https://example.com/path word",
+ {{5, 24}},
+ {"word", "exception", "word"}
+ },
+ {"համար https://example.com/path համար",
+ {{11, 24}},
+ {"համար", "exception", "համար"}
+ },
+ {"word https://example.com/path https://example.com/path word",
+ {{5, 24}, {30, 24}},
+ {"word", "exception", "exception", "word"}
+ },
+ {"word https://example.com/path https://example.com/path",
+ {{5, 24}, {30, 24}},
+ {"word", "exception", "exception"}
+ },
+ {"https://example.com/path https://example.com/path word",
+ {{0, 24}, {25, 24}},
+ {"exception", "exception", "word"}
+ },
+ {"https://example.com/path https://example.com/path",
+ {{0, 24}, {25, 24}},
+ {"exception", "exception"}
+ },
+ {",,,,https://example.com/path https://example.com/path ",
+ {{4, 24}, {29, 24}},
+ {"exception", "exception"}
+ },
+ }
+ for _,c in ipairs(cases) do
+ local w = util.tokenize_text(c[1], c[2])
+ -- Check for nil before taking #w, and pin the token count so that a
+ -- result with missing trailing tokens cannot pass silently.
+ assert_not_nil(w, "must tokenize " .. c[1])
+ assert_equal(#w, #c[3], "unexpected number of tokens for " .. c[1])
+
+ for i,wrd in ipairs(w) do
+ assert_equal(wrd, c[3][i])
end
end
end)