author    Vsevolod Stakhov <vsevolod@highsecure.ru>  2015-05-21 10:23:52 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru>  2015-05-21 10:23:52 +0100
commit    97f3f341e396a2df2d00e5a3491e2eb4bc882547 (patch)
tree      cd565444844b2bfef06324e7d5dbd25cd90322dd /test/lua/unit
parent    c44ddedf8c4950cc679073bb809e8d27b0186951 (diff)
Add more unit tests for tokenization.
Diffstat (limited to 'test/lua/unit')
-rw-r--r--  test/lua/unit/tokenizer.lua | 95
1 file changed, 90 insertions(+), 5 deletions(-)
diff --git a/test/lua/unit/tokenizer.lua b/test/lua/unit/tokenizer.lua
index 628d70298..6aefeec6f 100644
--- a/test/lua/unit/tokenizer.lua
+++ b/test/lua/unit/tokenizer.lua
@@ -11,15 +11,100 @@ context("Text tokenization test", function()
       {"Հետաքրքրվողների համար ոտորև ներկայացված",
         {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
       },
+      {"", {}},
+      {",,,,,", {}},
+      {"word,,,,,word ", {"word", "word"}},
+      {"word", {"word"}},
+      {",,,,word,,,", {"word"}}
     }
 
     for _,c in ipairs(cases) do
       local w = util.tokenize_text(c[1])
-      assert_not_nil(w, "cannot tokenize " .. c[1])
-
-      for i,wrd in ipairs(w) do
-        logger.infox('%1:%2', i, wrd)
-        assert_equal(wrd, c[2][i])
+      if #c[2] == 0 then
+        assert_equal(#w, 0, "must not have tokens " .. c[1])
+      else
+        assert_not_nil(w, "must tokenize " .. c[1])
+
+        for i,wrd in ipairs(w) do
+          assert_equal(wrd, c[2][i])
+        end
+      end
+    end
+  end)
+  test("Tokenize simple text (legacy)", function()
+    local cases = {
+      -- First token is bad
+      {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
+        {"orem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
+         "Integer", "mattis", "nibh"
+        }
+      },
+      -- Unicode is broken
+      --{"Հետաքրքրվողների համար ոտորև ներկայացված",
+      -- {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
+      --},
+      {"", {}},
+      {",,,,,", {}},
+      {"word,,,,,word ", {"ord", "word"}},
+      {"word", {"ord"}},
+      {",,,,word,,,", {"word"}}
+    }
+
+    for _,c in ipairs(cases) do
+      local w = util.tokenize_text(c[1], {}, true)
+      if #c[2] == 0 then
+        assert_equal(#w, 0, "must not have tokens " .. c[1])
+      else
+        assert_not_nil(w, "must tokenize " .. c[1])
+
+        for i,wrd in ipairs(w) do
+          assert_equal(wrd, c[2][i])
+        end
+      end
+    end
+  end)
+  test("Tokenize with exceptions", function()
+    local cases = {
+      {"word https://example.com/path word",
+        {{5, 24}},
+        {"word", "exception", "word"}
+      },
+      {"համար https://example.com/path համար",
+        {{11, 24}},
+        {"համար", "exception", "համար"}
+      },
+      {"word https://example.com/path https://example.com/path word",
+        {{5, 24}, {30, 24}},
+        {"word", "exception", "exception", "word"}
+      },
+      {"word https://example.com/path https://example.com/path",
+        {{5, 24}, {30, 24}},
+        {"word", "exception", "exception"}
+      },
+      {"https://example.com/path https://example.com/path word",
+        {{0, 24}, {25, 24}},
+        {"exception", "exception", "word"}
+      },
+      {"https://example.com/path https://example.com/path",
+        {{0, 24}, {25, 24}},
+        {"exception", "exception"}
+      },
+      {",,,,https://example.com/path https://example.com/path ",
+        {{4, 24}, {29, 24}},
+        {"exception", "exception"}
+      },
+    }
+
+    for _,c in ipairs(cases) do
+      local w = util.tokenize_text(c[1], c[2])
+      if #c[3] == 0 then
+        assert_equal(#w, 0, "must not have tokens " .. c[1])
+      else
+        assert_not_nil(w, "must tokenize " .. c[1])
+
+        for i,wrd in ipairs(w) do
+          assert_equal(wrd, c[3][i])
+        end
       end
     end
   end)
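A note on the API these tests exercise: util here is the rspamd_util Lua module (the require name follows the other rspamd Lua unit tests; treat it as an assumption outside this suite). tokenize_text takes the text, an optional list of exception ranges, and an optional legacy-compatibility flag. As the cases above show, each exception is a {offset, length} pair in zero-based byte offsets ("համար" occupies 10 UTF-8 bytes plus a space, hence {11, 24}), and every excepted span comes back as the literal token "exception". The legacy mode deliberately reproduces the old tokenizer's known defects, fixed by the first test: the first character of the leading token is clipped ("Lorem" becomes "orem") and Unicode handling is broken. A minimal usage sketch, assuming the rspamd Lua environment:

  -- Sketch only: assumes tokenize_text is reachable via require "rspamd_util",
  -- as in rspamd's Lua unit test suite.
  local util = require "rspamd_util"

  -- Plain tokenization: separators such as commas and spaces are dropped.
  local words = util.tokenize_text("word,,,,,word ")
  -- words -> {"word", "word"}

  -- Exception-aware tokenization: {5, 24} covers the 24-byte URL starting
  -- at byte offset 5; that span is replaced by the token "exception".
  local tokens = util.tokenize_text("word https://example.com/path word", {{5, 24}})
  for i, tok in ipairs(tokens) do
    print(i, tok) -- 1 word / 2 exception / 3 word
  end

Passing exceptions this way lets the statistics code skip URLs and similar spans while keeping token positions stable, which is exactly what the "Tokenize with exceptions" cases assert.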