123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500 |
- /*
- * Copyright 2024 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #include "lua_common.h"
- #include "message.h"
- #include "libutil/multipattern.h"
-
- /***
- * @module rspamd_trie
- * Rspamd trie module provides the data structure suitable for searching of many
- * patterns in arbitrary texts (or binary chunks). The algorithmic complexity of
- * this algorithm is at most O(n + m + z), where `n` is the length of text, `m` is a length of pattern and `z` is a number of patterns in the text.
- *
- * Here is a typical example of trie usage:
- * @example
- local rspamd_trie = require "rspamd_trie"
- local patterns = {'aab', 'ab', 'bcd\0ef'}
-
- local trie = rspamd_trie.create(patterns)
-
- local function trie_callback(number, pos)
- print('Matched pattern number ' .. tostring(number) .. ' at pos: ' .. tostring(pos))
- end
-
- trie:match('some big text', trie_callback)
- */
-
- /* Suffix trie */
- LUA_FUNCTION_DEF(trie, create);
- LUA_FUNCTION_DEF(trie, has_hyperscan);
- LUA_FUNCTION_DEF(trie, match);
- LUA_FUNCTION_DEF(trie, search_mime);
- LUA_FUNCTION_DEF(trie, search_rawmsg);
- LUA_FUNCTION_DEF(trie, search_rawbody);
- LUA_FUNCTION_DEF(trie, destroy);
-
- static const struct luaL_reg trielib_m[] = {
- LUA_INTERFACE_DEF(trie, match),
- LUA_INTERFACE_DEF(trie, search_mime),
- LUA_INTERFACE_DEF(trie, search_rawmsg),
- LUA_INTERFACE_DEF(trie, search_rawbody),
- {"__tostring", rspamd_lua_class_tostring},
- {"__gc", lua_trie_destroy},
- {NULL, NULL}};
- static const struct luaL_reg trielib_f[] = {
- LUA_INTERFACE_DEF(trie, create),
- LUA_INTERFACE_DEF(trie, has_hyperscan),
- {NULL, NULL}};
-
- static struct rspamd_multipattern *
- lua_check_trie(lua_State *L, int idx)
- {
- void *ud = rspamd_lua_check_udata(L, 1, rspamd_trie_classname);
-
- luaL_argcheck(L, ud != NULL, 1, "'trie' expected");
- return ud ? *((struct rspamd_multipattern **) ud) : NULL;
- }
-
- static int
- lua_trie_destroy(lua_State *L)
- {
- struct rspamd_multipattern *trie = lua_check_trie(L, 1);
-
- if (trie) {
- rspamd_multipattern_destroy(trie);
- }
-
- return 0;
- }
-
- /***
- * function trie.has_hyperscan()
- * Checks for hyperscan support
- *
- * @return {bool} true if hyperscan is supported
- */
- static int
- lua_trie_has_hyperscan(lua_State *L)
- {
- lua_pushboolean(L, rspamd_multipattern_has_hyperscan());
- return 1;
- }
-
- /***
- * function trie.create(patterns, [flags])
- * Creates new trie data structure
- * @param {table} array of string patterns
- * @return {trie} new trie object
- */
- static int
- lua_trie_create(lua_State *L)
- {
- struct rspamd_multipattern *trie, **ptrie;
- int npat = 0, flags = RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_GLOB;
- GError *err = NULL;
-
- if (lua_isnumber(L, 2)) {
- flags = lua_tointeger(L, 2);
- }
-
- if (!lua_istable(L, 1)) {
- return luaL_error(L, "lua trie expects array of patterns for now");
- }
- else {
- lua_pushvalue(L, 1);
- lua_pushnil(L);
-
- while (lua_next(L, -2) != 0) {
- if (lua_isstring(L, -1)) {
- npat++;
- }
-
- lua_pop(L, 1);
- }
-
- trie = rspamd_multipattern_create_sized(npat, flags);
- lua_pushnil(L);
-
- while (lua_next(L, -2) != 0) {
- if (lua_isstring(L, -1)) {
- const char *pat;
- gsize patlen;
-
- pat = lua_tolstring(L, -1, &patlen);
- rspamd_multipattern_add_pattern_len(trie, pat, patlen, flags);
- }
-
- lua_pop(L, 1);
- }
-
- lua_pop(L, 1); /* table */
-
- if (!rspamd_multipattern_compile(trie, 0, &err)) {
- msg_err("cannot compile multipattern: %e", err);
- g_error_free(err);
- rspamd_multipattern_destroy(trie);
- lua_pushnil(L);
- }
- else {
- ptrie = lua_newuserdata(L, sizeof(void *));
- rspamd_lua_setclass(L, rspamd_trie_classname, -1);
- *ptrie = trie;
- }
- }
-
- return 1;
- }
-
- #define PUSH_TRIE_MATCH(L, start, end, report_start) \
- do { \
- if (report_start) { \
- lua_createtable(L, 2, 0); \
- lua_pushinteger(L, (start)); \
- lua_rawseti(L, -2, 1); \
- lua_pushinteger(L, (end)); \
- lua_rawseti(L, -2, 2); \
- } \
- else { \
- lua_pushinteger(L, (end)); \
- } \
- } while (0)
-
- /* Normal callback type */
- static int
- lua_trie_lua_cb_callback(struct rspamd_multipattern *mp,
- unsigned int strnum,
- int match_start,
- int textpos,
- const char *text,
- gsize len,
- void *context)
- {
- lua_State *L = context;
- int ret;
-
- gboolean report_start = lua_toboolean(L, -1);
-
- /* Function */
- lua_pushvalue(L, 3);
- lua_pushinteger(L, strnum + 1);
-
- PUSH_TRIE_MATCH(L, match_start, textpos, report_start);
-
- if (lua_pcall(L, 2, 1, 0) != 0) {
- msg_info("call to trie callback has failed: %s",
- lua_tostring(L, -1));
- lua_pop(L, 1);
-
- return 1;
- }
-
- ret = lua_tonumber(L, -1);
- lua_pop(L, 1);
-
- return ret;
- }
-
- /* Table like callback, expect result table on top of the stack */
- static int
- lua_trie_table_callback(struct rspamd_multipattern *mp,
- unsigned int strnum,
- int match_start,
- int textpos,
- const char *text,
- gsize len,
- void *context)
- {
- lua_State *L = context;
-
- int report_start = lua_toboolean(L, -2);
- /* Set table, indexed by pattern number */
- lua_rawgeti(L, -1, strnum + 1);
-
- if (lua_istable(L, -1)) {
- /* Already have table, add offset */
- gsize last = rspamd_lua_table_size(L, -1);
- PUSH_TRIE_MATCH(L, match_start, textpos, report_start);
- lua_rawseti(L, -2, last + 1);
- /* Remove table from the stack */
- lua_pop(L, 1);
- }
- else {
- /* Pop none */
- lua_pop(L, 1);
- /* New table */
- lua_newtable(L);
- PUSH_TRIE_MATCH(L, match_start, textpos, report_start);
- lua_rawseti(L, -2, 1);
- lua_rawseti(L, -2, strnum + 1);
- }
-
- return 0;
- }
-
- /*
- * We assume that callback argument is at pos 3 and icase is in position 4
- */
- static int
- lua_trie_search_str(lua_State *L, struct rspamd_multipattern *trie,
- const char *str, gsize len, rspamd_multipattern_cb_t cb)
- {
- int ret;
- unsigned int nfound = 0;
-
- if ((ret = rspamd_multipattern_lookup(trie, str, len,
- cb, L, &nfound)) == 0) {
- return nfound;
- }
-
- return ret;
- }
-
- /***
- * @method trie:match(input, [cb][, report_start])
- * Search for patterns in `input` invoking `cb` optionally ignoring case
- * @param {table or string} input one or several (if `input` is an array) strings of input text
- * @param {function} cb callback called on each pattern match in form `function (idx, pos)` where `idx` is a numeric index of pattern (starting from 1) and `pos` is a numeric offset where the pattern ends
- * @param {boolean} report_start report both start and end offset when matching patterns
- * @return {boolean} `true` if any pattern has been found (`cb` might be called multiple times however). If `cb` is not defined then it returns a table of match positions indexed by pattern number
- */
- static int
- lua_trie_match(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_multipattern *trie = lua_check_trie(L, 1);
- const char *text;
- gsize len;
- gboolean found = FALSE, report_start = FALSE;
- struct rspamd_lua_text *t;
- rspamd_multipattern_cb_t cb = lua_trie_lua_cb_callback;
-
- int old_top = lua_gettop(L);
-
- if (trie) {
- if (lua_type(L, 3) != LUA_TFUNCTION) {
- if (lua_isboolean(L, 3)) {
- report_start = lua_toboolean(L, 3);
- }
-
- lua_pushboolean(L, report_start);
- /* Table like match */
- lua_newtable(L);
- cb = lua_trie_table_callback;
- }
- else {
- if (lua_isboolean(L, 4)) {
- report_start = lua_toboolean(L, 4);
- }
- lua_pushboolean(L, report_start);
- }
-
- if (lua_type(L, 2) == LUA_TTABLE) {
- lua_pushvalue(L, 2);
- lua_pushnil(L);
-
- while (lua_next(L, -2) != 0) {
- if (lua_isstring(L, -1)) {
- text = lua_tolstring(L, -1, &len);
-
- if (lua_trie_search_str(L, trie, text, len, cb)) {
- found = TRUE;
- }
- }
- else if (lua_isuserdata(L, -1)) {
- t = lua_check_text(L, -1);
-
- if (t) {
- if (lua_trie_search_str(L, trie, t->start, t->len, cb)) {
- found = TRUE;
- }
- }
- }
- lua_pop(L, 1);
- }
- }
- else if (lua_type(L, 2) == LUA_TSTRING) {
- text = lua_tolstring(L, 2, &len);
-
- if (lua_trie_search_str(L, trie, text, len, cb)) {
- found = TRUE;
- }
- }
- else if (lua_type(L, 2) == LUA_TUSERDATA) {
- t = lua_check_text(L, 2);
-
- if (t && lua_trie_search_str(L, trie, t->start, t->len, cb)) {
- found = TRUE;
- }
- }
- }
-
- if (lua_type(L, 3) == LUA_TFUNCTION) {
- lua_settop(L, old_top);
- lua_pushboolean(L, found);
- }
- else {
- lua_remove(L, -2);
- }
-
- return 1;
- }
-
- /***
- * @method trie:search_mime(task, cb)
- * This is a helper mehthod to search pattern within text parts of a message in rspamd task
- * @param {task} task object
- * @param {function} cb callback called on each pattern match @see trie:match
- * @param {boolean} caseless if `true` then match ignores symbols case (ASCII only)
- * @return {boolean} `true` if any pattern has been found (`cb` might be called multiple times however)
- */
- static int
- lua_trie_search_mime(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_multipattern *trie = lua_check_trie(L, 1);
- struct rspamd_task *task = lua_check_task(L, 2);
- struct rspamd_mime_text_part *part;
- const char *text;
- gsize len, i;
- gboolean found = FALSE;
- rspamd_multipattern_cb_t cb = lua_trie_lua_cb_callback;
-
- if (trie && task) {
- PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part)
- {
- if (!IS_TEXT_PART_EMPTY(part) && part->utf_content.len > 0) {
- text = part->utf_content.begin;
- len = part->utf_content.len;
-
- if (lua_trie_search_str(L, trie, text, len, cb) != 0) {
- found = TRUE;
- }
- }
- }
- }
-
- lua_pushboolean(L, found);
- return 1;
- }
-
- /***
- * @method trie:search_rawmsg(task, cb[, caseless])
- * This is a helper mehthod to search pattern within the whole undecoded content of rspamd task
- * @param {task} task object
- * @param {function} cb callback called on each pattern match @see trie:match
- * @param {boolean} caseless if `true` then match ignores symbols case (ASCII only)
- * @return {boolean} `true` if any pattern has been found (`cb` might be called multiple times however)
- */
- static int
- lua_trie_search_rawmsg(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_multipattern *trie = lua_check_trie(L, 1);
- struct rspamd_task *task = lua_check_task(L, 2);
- const char *text;
- gsize len;
- gboolean found = FALSE;
-
- if (trie && task) {
- text = task->msg.begin;
- len = task->msg.len;
-
- if (lua_trie_search_str(L, trie, text, len, lua_trie_lua_cb_callback) != 0) {
- found = TRUE;
- }
- }
-
- lua_pushboolean(L, found);
- return 1;
- }
-
- /***
- * @method trie:search_rawbody(task, cb[, caseless])
- * This is a helper mehthod to search pattern within the whole undecoded content of task's body (not including headers)
- * @param {task} task object
- * @param {function} cb callback called on each pattern match @see trie:match
- * @param {boolean} caseless if `true` then match ignores symbols case (ASCII only)
- * @return {boolean} `true` if any pattern has been found (`cb` might be called multiple times however)
- */
- static int
- lua_trie_search_rawbody(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_multipattern *trie = lua_check_trie(L, 1);
- struct rspamd_task *task = lua_check_task(L, 2);
- const char *text;
- gsize len;
- gboolean found = FALSE;
-
- if (trie && task) {
- if (MESSAGE_FIELD(task, raw_headers_content).len > 0) {
- text = task->msg.begin + MESSAGE_FIELD(task, raw_headers_content).len;
- len = task->msg.len - MESSAGE_FIELD(task, raw_headers_content).len;
- }
- else {
- /* Treat as raw message */
- text = task->msg.begin;
- len = task->msg.len;
- }
-
- if (lua_trie_search_str(L, trie, text, len, lua_trie_lua_cb_callback) != 0) {
- found = TRUE;
- }
- }
-
- lua_pushboolean(L, found);
- return 1;
- }
-
- static int
- lua_load_trie(lua_State *L)
- {
- lua_newtable(L);
-
- /* Flags */
- lua_pushstring(L, "flags");
- lua_newtable(L);
-
- lua_pushinteger(L, RSPAMD_MULTIPATTERN_GLOB);
- lua_setfield(L, -2, "glob");
- lua_pushinteger(L, RSPAMD_MULTIPATTERN_RE);
- lua_setfield(L, -2, "re");
- lua_pushinteger(L, RSPAMD_MULTIPATTERN_ICASE);
- lua_setfield(L, -2, "icase");
- lua_pushinteger(L, RSPAMD_MULTIPATTERN_UTF8);
- lua_setfield(L, -2, "utf8");
- lua_pushinteger(L, RSPAMD_MULTIPATTERN_TLD);
- lua_setfield(L, -2, "tld");
- lua_pushinteger(L, RSPAMD_MULTIPATTERN_DOTALL);
- lua_setfield(L, -2, "dot_all");
- lua_pushinteger(L, RSPAMD_MULTIPATTERN_SINGLEMATCH);
- lua_setfield(L, -2, "single_match");
- lua_pushinteger(L, RSPAMD_MULTIPATTERN_NO_START);
- lua_setfield(L, -2, "no_start");
- lua_settable(L, -3);
-
- /* Main content */
- luaL_register(L, NULL, trielib_f);
-
- return 1;
- }
-
- void luaopen_trie(lua_State *L)
- {
- rspamd_lua_new_class(L, rspamd_trie_classname, trielib_m);
- lua_pop(L, 1);
- rspamd_lua_add_preload(L, "rspamd_trie", lua_load_trie);
- }
|