author    Vsevolod Stakhov <vsevolod@highsecure.ru>  2015-05-21 10:23:52 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru>  2015-05-21 10:23:52 +0100
commit    97f3f341e396a2df2d00e5a3491e2eb4bc882547 (patch)
tree      cd565444844b2bfef06324e7d5dbd25cd90322dd /test/lua/unit
parent    c44ddedf8c4950cc679073bb809e8d27b0186951 (diff)
Add more unit tests for tokenization.
Diffstat (limited to 'test/lua/unit')
-rw-r--r--  test/lua/unit/tokenizer.lua | 95
1 file changed, 90 insertions(+), 5 deletions(-)
diff --git a/test/lua/unit/tokenizer.lua b/test/lua/unit/tokenizer.lua
index 628d70298..6aefeec6f 100644
--- a/test/lua/unit/tokenizer.lua
+++ b/test/lua/unit/tokenizer.lua
@@ -11,15 +11,100 @@ context("Text tokenization test", function()
       {"Հետաքրքրվողների համար ոտորև ներկայացված",
         {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
       },
+      {"", {}},
+      {",,,,,", {}},
+      {"word,,,,,word ", {"word", "word"}},
+      {"word", {"word"}},
+      {",,,,word,,,", {"word"}}
     }
 
     for _,c in ipairs(cases) do
       local w = util.tokenize_text(c[1])
-      assert_not_nil(w, "cannot tokenize " .. c[1])
-
-      for i,wrd in ipairs(w) do
-        logger.infox('%1:%2', i, wrd)
-        assert_equal(wrd, c[2][i])
+      if #c[2] == 0 then
+        assert_equal(#w, 0, "must not have tokens " .. c[1])
+      else
+        assert_not_nil(w, "must tokenize " .. c[1])
+
+        for i,wrd in ipairs(w) do
+          assert_equal(wrd, c[2][i])
+        end
+      end
+    end
+  end)
+  test("Tokenize simple text (legacy)", function()
+    local cases = {
+      -- First token is bad
+      {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
+        {"orem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
+         "Integer", "mattis", "nibh"
+        }
+      },
+      -- Unicode is broken
+      --{"Հետաքրքրվողների համար ոտորև ներկայացված",
+      -- {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
+      --},
+      {"", {}},
+      {",,,,,", {}},
+      {"word,,,,,word ", {"ord", "word"}},
+      {"word", {"ord"}},
+      {",,,,word,,,", {"word"}}
+    }
+
+    for _,c in ipairs(cases) do
+      local w = util.tokenize_text(c[1], {}, true)
+      if #c[2] == 0 then
+        assert_equal(#w, 0, "must not have tokens " .. c[1])
+      else
+        assert_not_nil(w, "must tokenize " .. c[1])
+
+        for i,wrd in ipairs(w) do
+          assert_equal(wrd, c[2][i])
+        end
+      end
+    end
+  end)
+  test("Tokenize with exceptions", function()
+    local cases = {
+      {"word https://example.com/path word",
+        {{5, 24}},
+        {"word", "exception", "word"}
+      },
+      {"համար https://example.com/path համար",
+        {{11, 24}},
+        {"համար", "exception", "համար"}
+      },
+      {"word https://example.com/path https://example.com/path word",
+        {{5, 24}, {30, 24}},
+        {"word", "exception", "exception", "word"}
+      },
+      {"word https://example.com/path https://example.com/path",
+        {{5, 24}, {30, 24}},
+        {"word", "exception", "exception"}
+      },
+      {"https://example.com/path https://example.com/path word",
+        {{0, 24}, {25, 24}},
+        {"exception", "exception", "word"}
+      },
+      {"https://example.com/path https://example.com/path",
+        {{0, 24}, {25, 24}},
+        {"exception", "exception"}
+      },
+      {",,,,https://example.com/path https://example.com/path ",
+        {{4, 24}, {29, 24}},
+        {"exception", "exception"}
+      },
+    }
+
+    for _,c in ipairs(cases) do
+      local w = util.tokenize_text(c[1], c[2])
+      if #c[3] == 0 then
+        assert_equal(#w, 0, "must not have tokens " .. c[1])
+      else
+        assert_not_nil(w, "must tokenize " .. c[1])
+
+        for i,wrd in ipairs(w) do
+          assert_equal(wrd, c[3][i])
+        end
       end
     end
   end)
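A note on the API these tests exercise: util here is the rspamd_util Lua module (the require name follows the other rspamd Lua unit tests; treat it as an assumption outside this suite). tokenize_text takes the text, an optional list of exception ranges, and an optional legacy-compatibility flag. As the cases above show, each exception is a {offset, length} pair in zero-based byte offsets ("համար" occupies 10 UTF-8 bytes plus a space, hence {11, 24}), and every excepted span comes back as the literal token "exception". The legacy mode deliberately reproduces the old tokenizer's known defects, fixed by the first test: the first character of the leading token is clipped ("Lorem" becomes "orem") and Unicode handling is broken. A minimal usage sketch, assuming the rspamd Lua environment:

  -- Sketch only: assumes tokenize_text is reachable via require "rspamd_util",
  -- as in rspamd's Lua unit test suite.
  local util = require "rspamd_util"

  -- Plain tokenization: separators such as commas and spaces are dropped.
  local words = util.tokenize_text("word,,,,,word ")
  -- words -> {"word", "word"}

  -- Exception-aware tokenization: {5, 24} covers the 24-byte URL starting
  -- at byte offset 5; that span is replaced by the token "exception".
  local tokens = util.tokenize_text("word https://example.com/path word", {{5, 24}})
  for i, tok in ipairs(tokens) do
    print(i, tok) -- 1 word / 2 exception / 3 word
  end

Passing exceptions this way lets the statistics code skip URLs and similar spans while keeping token positions stable, which is exactly what the "Tokenize with exceptions" cases assert.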