aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2015-05-20 22:17:45 +0100
committerVsevolod Stakhov <vsevolod@highsecure.ru>2015-05-20 22:17:45 +0100
commit255340bb9fbc5e13d3eae070762a5af5f8275fe3 (patch)
tree68e255d6ab5b78cc3a050e8a694fde14bfa267c5
parentfc91dd7a8186ef9d4d9f2e861f5d6e482033e748 (diff)
downloadrspamd-255340bb9fbc5e13d3eae070762a5af5f8275fe3.tar.gz
rspamd-255340bb9fbc5e13d3eae070762a5af5f8275fe3.zip
Add lua bindings to tokenizer.
-rw-r--r--src/lua/lua_util.c96
1 files changed, 96 insertions, 0 deletions
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 20c14d24d..37bf98ead 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -25,6 +25,7 @@
#include "task.h"
#include "main.h"
#include "cfg_rcl.h"
+#include "tokenizers/tokenizers.h"
/***
* @function util.create_event_base()
@@ -52,6 +53,14 @@ LUA_FUNCTION_DEF (util, config_from_ucl);
* @return {rspamd_text} encoded data chunk
*/
LUA_FUNCTION_DEF (util, encode_base64);
+/***
+ * @function util.tokenize_text(input[, exceptions])
+ * Create tokens from a text using optional exceptions list
+ * @param {text/string} input input data
+ * @param {table} exceptions, a table of pairs containing <start_pos,lenght> of exceptions in the input
+ * @return {table/strings} list of strings representing words in the text
+ */
+LUA_FUNCTION_DEF (util, tokenize_text);
LUA_FUNCTION_DEF (util, process_message);
static const struct luaL_reg utillib_f[] = {
@@ -60,6 +69,7 @@ static const struct luaL_reg utillib_f[] = {
LUA_INTERFACE_DEF (util, config_from_ucl),
LUA_INTERFACE_DEF (util, process_message),
LUA_INTERFACE_DEF (util, encode_base64),
+ LUA_INTERFACE_DEF (util, tokenize_text),
{NULL, NULL}
};
@@ -251,6 +261,92 @@ lua_util_encode_base64 (lua_State *L)
}
static gint
+lua_util_tokenize_text (lua_State *L)
+{
+ const gchar *in = NULL;
+ gsize len, pos, ex_len, i;
+ GList *exceptions = NULL, *cur;
+ struct rspamd_lua_text *t;
+ struct process_exception *ex;
+ GArray *res;
+ rspamd_fstring_t *w;
+ gboolean compat = FALSE;
+
+ if (lua_type (L, 1) == LUA_TSTRING) {
+ in = luaL_checklstring (L, 1, &len);
+ }
+ else if (lua_type (L, 1) == LUA_TTABLE) {
+ t = lua_check_text (L, 1);
+
+ if (t) {
+ in = t->start;
+ len = t->len;
+ }
+ }
+
+ if (in == NULL) {
+ lua_pushnil (L);
+ return 1;
+ }
+
+ if (lua_gettop (L) > 1 && lua_type (L, 2) == LUA_TTABLE) {
+ lua_pushvalue (L, 2);
+ lua_pushnil (L);
+
+ while (lua_next (L, -2) != 0) {
+ if (lua_type (L, -1) == LUA_TTABLE) {
+ lua_rawgeti (L, -1, 1);
+ pos = luaL_checknumber (L, -1);
+ lua_pop (L, 1);
+ lua_rawgeti (L, -1, 2);
+ ex_len = luaL_checknumber (L, -1);
+ lua_pop (L, 1);
+
+ if (ex_len > 0) {
+ ex = g_slice_alloc (sizeof (*ex));
+ ex->pos = pos;
+ ex->len = ex_len;
+ exceptions = g_list_prepend (exceptions, ex);
+ }
+ }
+ lua_pop (L, 1);
+ }
+
+ lua_pop (L, 1);
+ }
+
+ if (lua_gettop (L) > 2 && lua_type (L, 3) == LUA_TBOOLEAN) {
+ compat = lua_toboolean (L, 3);
+ }
+
+ res = rspamd_tokenize_text ((gchar *)in, len, TRUE, 0, exceptions, compat);
+
+ if (res == NULL) {
+ lua_pushnil (L);
+ }
+ else {
+ lua_newtable (L);
+
+ for (i = 0; i < res->len; i ++) {
+ w = &g_array_index (res, rspamd_fstring_t, i);
+ lua_pushlstring (L, w->begin, w->len);
+ lua_settable (L, i + 1);
+ }
+ }
+
+ cur = exceptions;
+ while (cur) {
+ ex = cur->data;
+ g_slice_free1 (sizeof (*ex), ex);
+ cur = g_list_next (cur);
+ }
+
+ g_list_free (exceptions);
+
+ return 1;
+}
+
+static gint
lua_load_util (lua_State * L)
{
lua_newtable (L);