| author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-03-30 14:27:14 +0100 |
|---|---|---|
| committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-03-30 14:27:14 +0100 |
| commit | 8160aa803846616f28d94519486d310a771f58bf (patch) | |
| tree | 56ccb9681b77f574d42f054f1b2431609d80dbd6 /test | |
| parent | f3d6aacab592f53a346b7573911dee2489392b3a (diff) | |
| download | rspamd-8160aa803846616f28d94519486d310a771f58bf.tar.gz, rspamd-8160aa803846616f28d94519486d310a771f58bf.zip | |
[Test] Improve tokenization tests
Diffstat (limited to 'test')
-rw-r--r-- | test/lua/unit/tokenizer.lua | 164 |
1 file changed, 84 insertions(+), 80 deletions(-)
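
A reader's note (not part of the commit): the change converts each monolithic test into a data-driven loop that registers one named test per case, so a single failing input no longer masks the remaining cases. A minimal sketch of that idiom, assuming the harness globals `test` and `assert_equal` that this file already uses:

```lua
-- Per-case test registration: the idiom this commit applies (a sketch, not a
-- verbatim excerpt). `test` and `assert_equal` come from the test harness.
local util = require "rspamd_util"

local cases = {
  {"word,,,,,word ", {"word", "word"}},
  {",,,,word,,,", {"word"}},
}

for i, c in ipairs(cases) do
  -- One named test per case: a failure reports the case index directly.
  test("Tokenize simple " .. i, function()
    local w = util.tokenize_text(c[1])
    for j, wrd in ipairs(w) do
      assert_equal(wrd, c[2][j])
    end
  end)
end
```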
diff --git a/test/lua/unit/tokenizer.lua b/test/lua/unit/tokenizer.lua
index e05f74d86..16f8f1846 100644
--- a/test/lua/unit/tokenizer.lua
+++ b/test/lua/unit/tokenizer.lua
@@ -1,111 +1,115 @@
 context("Text tokenization test", function()
   local util = require "rspamd_util"
   local logger = require "rspamd_logger"
-  test("Tokenize simple text", function()
-    local cases = {
-      {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
-        {"Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
-         "Integer", "mattis", "nibh"
-        }
-      },
-      {"Հետաքրքրվողների համար ոտորև ներկայացված",
-        {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
-      },
-      {"", {}},
-      {",,,,,", {}},
-      {"word,,,,,word ", {"word", "word"}},
-      {"word", {"word"}},
-      {",,,,word,,,", {"word"}}
-    }
-
-    for _,c in ipairs(cases) do
+
+  local cases = {
+    {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
+      {"Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
+       "Integer", "mattis", "nibh"
+      }
+    },
+    {"Հետաքրքրվողների համար ոտորև ներկայացված",
+      {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
+    },
+    {"", {}},
+    {",,,,,", {}},
+    {"word,,,,,word ", {"word", "word"}},
+    {"word", {"word"}},
+    {",,,,word,,,", {"word"}}
+  }
+
+  for i,c in ipairs(cases) do
+    test("Tokenize simple " .. i, function()
       local w = util.tokenize_text(c[1])
       if #c[2] == 0 then
         assert_equal(#w, 0, "must not have tokens " .. c[1])
       else
         assert_not_nil(w, "must tokenize " .. c[1])
-
+
         for i,wrd in ipairs(w) do
           assert_equal(wrd, c[2][i])
         end
       end
-    end
-  end)
-  test("Tokenize simple text (legacy)", function()
-    local cases = {
-      -- First token is bad
-      {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
-        {"orem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
-         "Integer", "mattis", "nibh"
-        }
-      },
-      -- Unicode is broken
-      --{"Հետաքրքրվողների համար ոտորև ներկայացված",
-      --  {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
-      --},
-      {"", {}},
-      {",,,,,", {}},
-      {"word,,,,,word ", {"ord", "word"}},
-      {"word", {"ord"}},
-      {",,,,word,,,", {"word"}}
-    }
-
-    for _,c in ipairs(cases) do
+    end)
+  end
+
+
+  cases = {
+    -- First token is bad
+    {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
+      {"orem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
+       "Integer", "mattis", "nibh"
+      }
+    },
+    -- Unicode is broken
+    --{"Հետաքրքրվողների համար ոտորև ներկայացված",
+    --  {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
+    --},
+    {"", {}},
+    {",,,,,", {}},
+    {"word,,,,,word ", {"ord", "word"}},
+    {"word", {"ord"}},
+    {",,,,word,,,", {"word"}}
+  }
+
+  for i,c in ipairs(cases) do
+    test("Tokenize simple text (legacy) " .. i, function()
      local w = util.tokenize_text(c[1], {}, true)
       if #c[2] == 0 then
         assert_equal(#w, 0, "must not have tokens " .. c[1])
       else
         assert_not_nil(w, "must tokenize " .. c[1])
-
+
         for i,wrd in ipairs(w) do
           assert_equal(wrd, c[2][i])
         end
       end
-    end
-  end)
-  test("Tokenize with exceptions", function()
-    local cases = {
-      {"word https://example.com/path word",
-        {{5, 24}},
-        {"word", "!!EX!!", "word"}
-      },
-      {"համար https://example.com/path համար",
-        {{11, 24}},
-        {"համար", "!!EX!!", "համար"}
-      },
-      {"word https://example.com/path https://example.com/path word",
-        {{5, 24}, {30, 24}},
-        {"word", "!!EX!!", "!!EX!!", "word"}
-      },
-      {"word https://example.com/path https://example.com/path",
-        {{5, 24}, {30, 24}},
-        {"word", "!!EX!!", "!!EX!!"}
-      },
-      {"https://example.com/path https://example.com/path word",
-        {{0, 24}, {25, 24}},
-        {"!!EX!!", "!!EX!!", "word"}
-      },
-      {"https://example.com/path https://example.com/path",
-        {{0, 24}, {25, 24}},
-        {"!!EX!!", "!!EX!!"}
-      },
-      {",,,,https://example.com/path https://example.com/path ",
-        {{4, 24}, {29, 24}},
-        {"!!EX!!", "!!EX!!"}
-      },
-    }
-
-    for _,c in ipairs(cases) do
+    end)
+  end
+
+  cases = {
+    {"word https://example.com/path word",
+      {{5, 24}},
+      {"word", "!!EX!!", "word"}
+    },
+    {"համար https://example.com/path համար",
+      {{11, 24}},
+      {"համար", "!!EX!!", "համար"}
+    },
+    {"word https://example.com/path https://example.com/path word",
+      {{5, 24}, {30, 24}},
+      {"word", "!!EX!!", "!!EX!!", "word"}
+    },
+    {"word https://example.com/path https://example.com/path",
+      {{5, 24}, {30, 24}},
+      {"word", "!!EX!!", "!!EX!!"}
+    },
+    {"https://example.com/path https://example.com/path word",
+      {{0, 24}, {25, 24}},
+      {"!!EX!!", "!!EX!!", "word"}
+    },
+    {"https://example.com/path https://example.com/path",
+      {{0, 24}, {25, 24}},
+      {"!!EX!!", "!!EX!!"}
+    },
+    {",,,,https://example.com/path https://example.com/path ",
+      {{4, 24}, {29, 24}},
+      {"!!EX!!", "!!EX!!"}
+    },
+  }
+
+  for i,c in ipairs(cases) do
+    test("Tokenize with exceptions " .. i, function()
       local w = util.tokenize_text(c[1], c[2])
       if #c[3] == 0 then
         assert_equal(#w, 0, "must not have tokens " .. c[1])
       else
         assert_not_nil(w, "must tokenize " .. c[1])
-
         for i,wrd in ipairs(w) do
           assert_equal(wrd, c[3][i])
         end
       end
-    end
-  end)
+    end)
+  end
+
 end)
\ No newline at end of file
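
A usage note for readers unfamiliar with the API under test, inferred from the calls and expectations in this file rather than from separate documentation: `util.tokenize_text(text[, exceptions[, legacy]])` returns an array of words; each exception is an `{offset, length}` pair, and an excepted span appears in the output as the `!!EX!!` placeholder. A minimal sketch:

```lua
-- Minimal sketch of the calls these tests exercise; expected values are taken
-- from the case tables above, and argument meanings are inferred from the tests.
local util = require "rspamd_util"

-- Plain tokenization: separator characters are stripped.
local words = util.tokenize_text("word,,,,,word ")
-- per the cases above: {"word", "word"}

-- With an exception covering "https://example.com/path" (offset 5, length 24),
-- the excepted span comes back as the "!!EX!!" placeholder.
local with_ex = util.tokenize_text("word https://example.com/path word", {{5, 24}})
-- per the cases above: {"word", "!!EX!!", "word"}

-- Legacy mode (third argument true); the legacy cases document a known defect
-- where the first token loses its leading character ("word" -> "ord").
local legacy = util.tokenize_text("word", {}, true)
-- per the cases above: {"ord"}
```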