*/
LUA_FUNCTION_DEF (textpart, get_words);
+/***
+ * @method text_part:filter_words(regexp[, how[, max]])
+ * Returns the words of this part that match a regexp; `how` selects which form of each word is matched and returned:
+ * - `stem`: stemmed words (default)
+ * - `norm`: normalised words (utf normalised + lowercased)
+ * - `raw`: raw words in utf (if possible)
+ * - `full`: list of tables, each table has the following fields:
+ * - [1] - stemmed word
+ * - [2] - normalised word
+ * - [3] - raw word
+ * - [4] - flags (table of strings)
+ * @param {rspamd_regexp} regexp regexp to match
+ * @param {string} how what words to extract
+ * @param {number} max maximum number of hits returned (all hits if <= 0 or nil)
+ * @return {table/strings} words matching regexp
+ */
+LUA_FUNCTION_DEF (textpart, filter_words);
+
/***
* @method text_part:is_empty()
* Returns `true` if the specified part is empty
LUA_INTERFACE_DEF (textpart, get_lines_count),
LUA_INTERFACE_DEF (textpart, get_words_count),
LUA_INTERFACE_DEF (textpart, get_words),
+ LUA_INTERFACE_DEF (textpart, filter_words),
LUA_INTERFACE_DEF (textpart, is_empty),
LUA_INTERFACE_DEF (textpart, is_html),
LUA_INTERFACE_DEF (textpart, get_html),
return 1;
}
+/**
+ * Translate a textual extraction mode into the words type enumeration.
+ * @param how_str mode name: "stem", "norm", "raw" or "full"; it is passed
+ * straight to strcmp, so it must not be NULL
+ * @return the matching type, or RSPAMD_LUA_WORDS_MAX when the name is unknown
+ */
+static inline enum rspamd_lua_words_type
+word_extract_type_from_string (const gchar *how_str)
+{
+	/* Lookup order mirrors the documented order of the modes */
+	static const struct {
+		const gchar *name;
+		enum rspamd_lua_words_type type;
+	} known_modes[] = {
+		{"stem", RSPAMD_LUA_WORDS_STEM},
+		{"norm", RSPAMD_LUA_WORDS_NORM},
+		{"raw", RSPAMD_LUA_WORDS_RAW},
+		{"full", RSPAMD_LUA_WORDS_FULL},
+	};
+	unsigned int idx;
+
+	for (idx = 0; idx < sizeof (known_modes) / sizeof (known_modes[0]); idx++) {
+		if (strcmp (how_str, known_modes[idx].name) == 0) {
+			return known_modes[idx].type;
+		}
+	}
+
+	/* Nothing matched: report the sentinel so callers can raise an error */
+	return RSPAMD_LUA_WORDS_MAX;
+}
+
static gint
lua_textpart_get_words (lua_State *L)
{
if (lua_type (L, 2) == LUA_TSTRING) {
const gchar *how_str = lua_tostring (L, 2);
- if (strcmp (how_str, "stem") == 0) {
- how = RSPAMD_LUA_WORDS_STEM;
- }
- else if (strcmp (how_str, "norm") == 0) {
- how = RSPAMD_LUA_WORDS_NORM;
+ how = word_extract_type_from_string (how_str);
+
+ if (how == RSPAMD_LUA_WORDS_MAX) {
+ return luaL_error (L, "invalid extraction type: %s", how_str);
}
- else if (strcmp (how_str, "raw") == 0) {
- how = RSPAMD_LUA_WORDS_RAW;
+ }
+
+ return rspamd_lua_push_words (L, part->utf_words, how);
+ }
+
+ return 1;
+}
+
+/*
+ * Lua binding for text_part:filter_words(regexp[, how[, max]]).
+ *
+ * Stack layout: 1 - text part, 2 - rspamd{regexp}, 3 - optional extraction
+ * type string ("stem" is used when absent), 4 - optional numeric limit on the
+ * number of hits (no limit when absent or <= 0).
+ *
+ * Pushes a table of the words matching the regexp (an empty table when the
+ * part is empty or has no words). Raises a Lua error on invalid arguments or
+ * on an unknown extraction type.
+ */
+static gint
+lua_textpart_filter_words (lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_mime_text_part *part = lua_check_textpart (L);
+	struct rspamd_lua_regexp *re = lua_check_regexp (L, 2);
+	gint lim = -1;
+	enum rspamd_lua_words_type how = RSPAMD_LUA_WORDS_STEM;
+
+	if (part == NULL || re == NULL) {
+		return luaL_error (L, "invalid arguments");
+	}
+
+	if (IS_PART_EMPTY (part) || part->utf_words == NULL) {
+		lua_createtable (L, 0, 0);
+	}
+	else {
+		if (lua_type (L, 3) == LUA_TSTRING) {
+			/* The extraction type is argument 3; argument 2 is the regexp */
+			const gchar *how_str = lua_tostring (L, 3);
+
+			how = word_extract_type_from_string (how_str);
+
+			if (how == RSPAMD_LUA_WORDS_MAX) {
+				return luaL_error (L, "invalid extraction type: %s", how_str);
 			}
-			else if (strcmp (how_str, "full") == 0) {
-				how = RSPAMD_LUA_WORDS_FULL;
+		}
+
+		if (lua_type (L, 4) == LUA_TNUMBER) {
+			lim = lua_tointeger (L, 4);
+		}
+
+		guint cnt, i;
+
+		lua_createtable (L, 8, 0);
+
+		/* cnt is the next free (1-based) index in the resulting table */
+		for (i = 0, cnt = 1; i < part->utf_words->len; i ++) {
+			rspamd_stat_token_t *w = &g_array_index (part->utf_words,
+					rspamd_stat_token_t, i);
+
+			switch (how) {
+			case RSPAMD_LUA_WORDS_STEM:
+				if (w->stemmed.len > 0) {
+					if (rspamd_regexp_match (re->re, w->stemmed.begin,
+							w->stemmed.len, FALSE)) {
+						lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
+						lua_rawseti (L, -2, cnt++);
+					}
+				}
+				break;
+			case RSPAMD_LUA_WORDS_NORM:
+				if (w->normalized.len > 0) {
+					if (rspamd_regexp_match (re->re, w->normalized.begin,
+							w->normalized.len, FALSE)) {
+						lua_pushlstring (L, w->normalized.begin, w->normalized.len);
+						lua_rawseti (L, -2, cnt++);
+					}
+				}
+				break;
+			case RSPAMD_LUA_WORDS_RAW:
+				if (w->original.len > 0) {
+					if (rspamd_regexp_match (re->re, w->original.begin,
+							w->original.len, TRUE)) {
+						lua_pushlstring (L, w->original.begin, w->original.len);
+						lua_rawseti (L, -2, cnt++);
+					}
+				}
+				break;
+			case RSPAMD_LUA_WORDS_FULL:
+				/* Matched on the normalised form; skip empty ones as the other modes do */
+				if (w->normalized.len > 0) {
+					if (rspamd_regexp_match (re->re, w->normalized.begin,
+							w->normalized.len, FALSE)) {
+						rspamd_lua_push_full_word (L, w);
+						/* Push to the resulting vector */
+						lua_rawseti (L, -2, cnt++);
+					}
+				}
+				break;
+			default:
+				break;
 			}
-			else {
-				return luaL_error (L, "unknown words type: %s", how_str);
+
+			/* cnt - 1 hits are stored so far: stop once exactly lim were collected */
+			if (lim > 0 && cnt > (guint)lim) {
+				break;
 			}
 		}
-
-		return rspamd_lua_push_words (L, part->utf_words, how);
 	}
 	return 1;
rspamd_mempool_t *regexp_static_pool = NULL;
-static struct rspamd_lua_regexp *
-lua_check_regexp (lua_State * L)
+/**
+ * Fetch the regexp object wrapped by the userdata at stack index `pos`.
+ * The value is checked against the "rspamd{regexp}" class; a failed check is
+ * reported through luaL_argcheck for argument `pos`.
+ * No longer static, so that code outside this file can validate a regexp
+ * argument at an arbitrary position.
+ * @param L Lua state
+ * @param pos stack index of the expected regexp userdata
+ * @return the wrapped regexp pointer, or NULL if the userdata check failed
+ */
+struct rspamd_lua_regexp *
+lua_check_regexp (lua_State * L, gint pos)
{
- void *ud = rspamd_lua_check_udata (L, 1, "rspamd{regexp}");
+ void *ud = rspamd_lua_check_udata (L, pos, "rspamd{regexp}");
- luaL_argcheck (L, ud != NULL, 1, "'regexp' expected");
+ luaL_argcheck (L, ud != NULL, pos, "'regexp' expected");
return ud ? *((struct rspamd_lua_regexp **)ud) : NULL;
}
lua_regexp_get_pattern (lua_State *L)
{
LUA_TRACE_POINT;
- struct rspamd_lua_regexp *re = lua_check_regexp (L);
+ struct rspamd_lua_regexp *re = lua_check_regexp (L, 1);
if (re && re->re && !IS_DESTROYED (re)) {
lua_pushstring (L, rspamd_regexp_get_pattern (re->re));
lua_regexp_set_limit (lua_State *L)
{
LUA_TRACE_POINT;
- struct rspamd_lua_regexp *re = lua_check_regexp (L);
+ struct rspamd_lua_regexp *re = lua_check_regexp (L, 1);
gint64 lim;
- lim = luaL_checknumber (L, 2);
+ lim = lua_tointeger (L, 2);
if (re && re->re && !IS_DESTROYED (re)) {
if (lim > 0) {
lua_regexp_set_max_hits (lua_State *L)
{
LUA_TRACE_POINT;
- struct rspamd_lua_regexp *re = lua_check_regexp (L);
+ struct rspamd_lua_regexp *re = lua_check_regexp (L, 1);
guint lim;
lim = luaL_checkinteger (L, 2);
lua_regexp_get_max_hits (lua_State *L)
{
LUA_TRACE_POINT;
- struct rspamd_lua_regexp *re = lua_check_regexp (L);
+ struct rspamd_lua_regexp *re = lua_check_regexp (L, 1);
if (re && re->re && !IS_DESTROYED (re)) {
lua_pushinteger (L, rspamd_regexp_get_maxhits (re->re));
lua_regexp_search (lua_State *L)
{
LUA_TRACE_POINT;
- struct rspamd_lua_regexp *re = lua_check_regexp (L);
+ struct rspamd_lua_regexp *re = lua_check_regexp (L, 1);
const gchar *data = NULL;
struct rspamd_lua_text *t;
const gchar *start = NULL, *end = NULL;
gint i;
- gsize len, capn;
+ gsize len = 0, capn;
gboolean matched = FALSE, capture = FALSE, raw = FALSE;
GArray *captures = NULL;
struct rspamd_re_capture *cap;
lua_regexp_match (lua_State *L)
{
LUA_TRACE_POINT;
- struct rspamd_lua_regexp *re = lua_check_regexp (L);
+ struct rspamd_lua_regexp *re = lua_check_regexp (L, 1);
struct rspamd_lua_text *t;
const gchar *data = NULL;
gsize len = 0;
lua_regexp_matchn (lua_State *L)
{
LUA_TRACE_POINT;
- struct rspamd_lua_regexp *re = lua_check_regexp (L);
+ struct rspamd_lua_regexp *re = lua_check_regexp (L, 1);
struct rspamd_lua_text *t;
const gchar *data = NULL, *start = NULL, *end = NULL;
gint max_matches, matches;
}
}
- max_matches = lua_tonumber (L, 3);
+ max_matches = lua_tointeger (L, 3);
if (lua_gettop (L) == 4) {
raw = lua_toboolean (L, 4);
lua_regexp_split (lua_State *L)
{
LUA_TRACE_POINT;
- struct rspamd_lua_regexp *re = lua_check_regexp (L);
+ struct rspamd_lua_regexp *re = lua_check_regexp (L, 1);
const gchar *data = NULL;
struct rspamd_lua_text *t;
gboolean matched = FALSE, is_text = FALSE;
lua_regexp_destroy (lua_State *L)
{
LUA_TRACE_POINT;
- struct rspamd_lua_regexp *to_del = lua_check_regexp (L);
+ struct rspamd_lua_regexp *to_del = lua_check_regexp (L, 1);
if (to_del) {
rspamd_regexp_cache_remove (NULL, to_del->re);
lua_regexp_gc (lua_State *L)
{
LUA_TRACE_POINT;
- struct rspamd_lua_regexp *to_del = lua_check_regexp (L);
+ struct rspamd_lua_regexp *to_del = lua_check_regexp (L, 1);
if (to_del) {
if (!IS_DESTROYED (to_del)) {