|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081 |
--- Unit tests for rspamd_util.tokenize_text.
-- Covers plain tokenization (ASCII and UTF-8 input, empty strings,
-- punctuation-only strings) and tokenization with exception ranges,
-- where each exception {offset, length} is replaced by the "!!EX!!"
-- placeholder token.
context("Text tokenization test", function()
  local util = require "rspamd_util"

  -- Simple cases: { input_text, expected_token_list }
  local cases = {
    {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
      {"Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
       "Integer", "mattis", "nibh"
      }
    },
    -- UTF-8 (Armenian) input: tokenizer must split on whitespace, not bytes
    {"Հետաքրքրվողների համար ոտորև ներկայացված",
      {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
    },
    {"", {}},
    {",,,,,", {}},
    {"word,,,,,word	", {"word", "word"}},
    {"word", {"word"}},
    {",,,,word,,,", {"word"}}
  }

  for i, c in ipairs(cases) do
    test("Tokenize simple " .. i, function()
      local w = util.tokenize_text(c[1])
      if #c[2] == 0 then
        assert_equal(#w, 0, "must not have tokens " .. c[1])
      else
        assert_not_nil(w, "must tokenize " .. c[1])
        -- Fix: assert the token COUNT as well. The original only looped
        -- over the returned tokens, so a result with *fewer* tokens than
        -- expected passed silently.
        assert_equal(#w, #c[2], "wrong token count for " .. c[1])
        -- Use a distinct index name: the original shadowed the outer
        -- case index `i` here.
        for j, wrd in ipairs(w) do
          assert_equal(wrd, c[2][j])
        end
      end
    end)
  end

  -- Exception cases: { input_text, { {offset, length}, ... }, expected_tokens }
  -- Each exception range (e.g. a URL) is replaced by the "!!EX!!" placeholder.
  cases = {
    {"word https://example.com/path word",
      {{5, 24}},
      {"word", "!!EX!!", "word"}
    },
    {"համար https://example.com/path համար",
      {{11, 24}},
      {"համար", "!!EX!!", "համար"}
    },
    {"word https://example.com/path https://example.com/path word",
      {{5, 24}, {30, 24}},
      {"word", "!!EX!!", "!!EX!!", "word"}
    },
    {"word https://example.com/path https://example.com/path",
      {{5, 24}, {30, 24}},
      {"word", "!!EX!!", "!!EX!!"}
    },
    {"https://example.com/path https://example.com/path word",
      {{0, 24}, {25, 24}},
      {"!!EX!!", "!!EX!!", "word"}
    },
    {"https://example.com/path https://example.com/path",
      {{0, 24}, {25, 24}},
      {"!!EX!!", "!!EX!!"}
    },
    {",,,,https://example.com/path https://example.com/path  ",
      {{4, 24}, {29, 24}},
      {"!!EX!!", "!!EX!!"}
    },
  }

  for i, c in ipairs(cases) do
    test("Tokenize with exceptions " .. i, function()
      local w = util.tokenize_text(c[1], c[2])
      if #c[3] == 0 then
        assert_equal(#w, 0, "must not have tokens " .. c[1])
      else
        assert_not_nil(w, "must tokenize " .. c[1])
        -- Same count fix as above: too-few tokens must fail the test.
        assert_equal(#w, #c[3], "wrong token count for " .. c[1])
        for j, wrd in ipairs(w) do
          assert_equal(wrd, c[3][j])
        end
      end
    end)
  end

end)
|