diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-04-21 16:07:40 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-04-21 16:07:40 +0100 |
commit | 4385e1701570617eda31009299817e0b38a90be5 (patch) | |
tree | cbaba3cbfe48818de2510def4b71cbc4d4c4ed49 /src | |
parent | 2a7ad799437105f91dfe7e89ba50de655c1d06eb (diff) | |
download | rspamd-4385e1701570617eda31009299817e0b38a90be5.tar.gz rspamd-4385e1701570617eda31009299817e0b38a90be5.zip |
[Rework] Further rework of lua urls extraction API
Diffstat (limited to 'src')
-rw-r--r-- | src/lua/lua_task.c | 32 | ||||
-rw-r--r-- | src/lua/lua_url.c | 148 | ||||
-rw-r--r-- | src/lua/lua_url.h | 14 |
3 files changed, 150 insertions, 44 deletions
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index 5c7a8b0a4..2ceb1c3c2 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -2256,6 +2256,8 @@ lua_task_get_urls (lua_State * L) struct rspamd_task *task = lua_check_task (L, 1); struct lua_tree_cb_data cb; struct rspamd_url *u; + static const gint default_protocols_mask = PROTOCOL_HTTP|PROTOCOL_HTTPS| + PROTOCOL_FILE|PROTOCOL_FTP; gsize sz, max_urls = 0; if (task) { @@ -2269,15 +2271,15 @@ lua_task_get_urls (lua_State * L) return 1; } - if (!lua_url_cbdata_fill (L, 2, &cb)) { + /* Exclude RSPAMD_URL_FLAG_CONTENT to preserve backward compatibility */ + if (!lua_url_cbdata_fill (L, 2, &cb, default_protocols_mask, + (~RSPAMD_URL_FLAG_CONTENT), max_urls)) { return luaL_error (L, "invalid arguments"); } - memset (&cb, 0, sizeof (cb)); - sz = kh_size (MESSAGE_FIELD (task, urls)); sz = lua_url_adjust_skip_prob (task->task_timestamp, - MESSAGE_FIELD (task, digest), &cb, sz, max_urls); + MESSAGE_FIELD (task, digest), &cb, sz); lua_createtable (L, sz, 0); @@ -2425,20 +2427,26 @@ lua_task_get_emails (lua_State * L) struct rspamd_task *task = lua_check_task (L, 1); struct lua_tree_cb_data cb; struct rspamd_url *u; + gsize max_urls = 0, sz; if (task) { if (task->message) { - lua_createtable (L, kh_size (MESSAGE_FIELD (task, urls)), 0); - memset (&cb, 0, sizeof (cb)); - cb.i = 1; - cb.L = L; - cb.mask = PROTOCOL_MAILTO; + if (!lua_url_cbdata_fill (L, 2, &cb, PROTOCOL_MAILTO, + (~RSPAMD_URL_FLAG_CONTENT), max_urls)) { + return luaL_error (L, "invalid arguments"); + } + + sz = kh_size (MESSAGE_FIELD (task, urls)); + sz = lua_url_adjust_skip_prob (task->task_timestamp, + MESSAGE_FIELD (task, digest), &cb, sz); + + lua_createtable (L, sz, 0); kh_foreach_key (MESSAGE_FIELD (task, urls), u, { - if ((u->protocol & PROTOCOL_MAILTO)) { - lua_tree_url_callback (u, u, &cb); - } + lua_tree_url_callback (u, u, &cb); }); + + lua_url_cbdata_dtor (&cb); } else { lua_newtable (L); diff --git a/src/lua/lua_url.c b/src/lua/lua_url.c index 65f0569a5..45f9ab683 100644 --- a/src/lua/lua_url.c +++ b/src/lua/lua_url.c @@ -933,10 +933,7 @@ lua_tree_url_callback (gpointer key, gpointer value, gpointer ud) struct rspamd_url *url = (struct rspamd_url *)value; struct lua_tree_cb_data *cb = ud; - if (url->protocol & cb->mask) { - if (!cb->need_images && (url->flags & RSPAMD_URL_FLAG_IMAGE)) { - return; - } + if ((url->protocol & cb->protocols_mask) && (url->flags & cb->flags_mask)) { if (cb->skip_prob > 0) { gdouble coin = rspamd_random_double_fast_seed (cb->xoroshiro_state); @@ -955,35 +952,126 @@ lua_tree_url_callback (gpointer key, gpointer value, gpointer ud) } gboolean -lua_url_cbdata_fill (lua_State *L, gint pos, struct lua_tree_cb_data *cbd) +lua_url_cbdata_fill (lua_State *L, + gint pos, + struct lua_tree_cb_data *cbd, + guint default_protocols, + guint default_flags, + gsize max_urls) { - gboolean need_images = FALSE; gint protocols_mask = 0; - static const gint default_mask = PROTOCOL_HTTP|PROTOCOL_HTTPS| - PROTOCOL_FILE|PROTOCOL_FTP; + gint pos_arg_type = lua_type (L, pos); + guint flags_mask = default_flags; if (pos_arg_type == LUA_TBOOLEAN) { - protocols_mask = default_mask; + protocols_mask = default_protocols; if (lua_toboolean (L, 2)) { protocols_mask |= PROTOCOL_MAILTO; } } else if (pos_arg_type == LUA_TTABLE) { - for (lua_pushnil (L); lua_next (L, pos); lua_pop (L, 1)) { - int nmask; - const gchar *pname = lua_tostring (L, -1); + if (rspamd_lua_geti (L, 1, pos) == LUA_TNIL) { + /* New method: indexed table */ + + lua_getfield (L, pos, "flags"); + if (lua_istable (L, -1)) { + for (lua_pushnil (L); lua_next (L, pos); lua_pop (L, 1)) { + int nmask = 0; + const gchar *fname = lua_tostring (L, -1); + + + if (rspamd_url_flag_from_string (fname, &nmask)) { + flags_mask |= nmask; + } + else { + msg_info ("bad url flag: %s", fname); + return FALSE; + } + } + } + else { + flags_mask |= default_flags; + } + lua_pop (L, 1); + + lua_getfield (L, pos, "protocols"); + if (lua_istable (L, -1)) { + for (lua_pushnil (L); lua_next (L, pos); lua_pop (L, 1)) { + int nmask; + const gchar *pname = lua_tostring (L, -1); + + nmask = rspamd_url_protocol_from_string (pname); + + if (nmask != PROTOCOL_UNKNOWN) { + protocols_mask |= nmask; + } + else { + msg_info ("bad url protocol: %s", pname); + return FALSE; + } + } + } + else { + protocols_mask = default_protocols; + } + lua_pop (L, 1); - nmask = rspamd_url_protocol_from_string (pname); + lua_getfield (L, pos, "emails"); + if (lua_isboolean (L, -1)) { + if (lua_toboolean (L, -1)) { + protocols_mask |= PROTOCOL_MAILTO; + } + } + lua_pop (L, 1); - if (nmask != PROTOCOL_UNKNOWN) { - protocols_mask |= nmask; + lua_getfield (L, pos, "images"); + if (lua_isboolean (L, -1)) { + if (lua_toboolean (L, -1)) { + flags_mask |= RSPAMD_URL_FLAG_IMAGE; + } + else { + flags_mask &= ~RSPAMD_URL_FLAG_IMAGE; + } } - else { - msg_info ("bad url protocol: %s", pname); - return FALSE; + lua_pop (L, 1); + + lua_getfield (L, pos, "content"); + if (lua_isboolean (L, -1)) { + if (lua_toboolean (L, -1)) { + flags_mask |= RSPAMD_URL_FLAG_CONTENT; + } + else { + flags_mask &= ~RSPAMD_URL_FLAG_CONTENT; + } } + lua_pop (L, 1); + + lua_getfield (L, pos, "max_urls"); + if (lua_isnumber (L, -1)) { + max_urls = lua_tonumber (L, -1); + } + lua_pop (L, 1); } + else { + /* Plain table of the protocols */ + for (lua_pushnil (L); lua_next (L, pos); lua_pop (L, 1)) { + int nmask; + const gchar *pname = lua_tostring (L, -1); + + nmask = rspamd_url_protocol_from_string (pname); + + if (nmask != PROTOCOL_UNKNOWN) { + protocols_mask |= nmask; + } + else { + msg_info ("bad url protocol: %s", pname); + return FALSE; + } + } + } + + lua_pop (L, 1); /* After rspamd_lua_geti */ } else if (pos_arg_type == LUA_TSTRING) { const gchar *plist = lua_tostring (L, pos); @@ -1012,22 +1100,29 @@ lua_url_cbdata_fill (lua_State *L, gint pos, struct lua_tree_cb_data *cbd) g_strfreev (strvec); } else if (pos_arg_type == LUA_TNONE || pos_arg_type == LUA_TNIL) { - protocols_mask = default_mask; + protocols_mask = default_protocols; + flags_mask = default_flags; } else { return FALSE; } if (lua_type (L, pos + 1) == LUA_TBOOLEAN) { - need_images = lua_toboolean (L, pos + 1); + if (lua_toboolean (L, pos + 1)) { + flags_mask |= RSPAMD_URL_FLAG_IMAGE; + } + else { + flags_mask &= ~RSPAMD_URL_FLAG_IMAGE; + } } memset (cbd, 0, sizeof (*cbd)); cbd->i = 1; cbd->L = L; - cbd->mask = protocols_mask; - cbd->need_images = need_images; + cbd->max_urls = max_urls; + cbd->protocols_mask = protocols_mask; + cbd->flags_mask = flags_mask; /* This needs to be removed from the stack */ rspamd_lua_class_metatable (L, "rspamd{url}"); @@ -1049,11 +1144,10 @@ gsize lua_url_adjust_skip_prob (gdouble timestamp, guchar *digest, struct lua_tree_cb_data *cb, - gsize sz, - gsize max_urls) + gsize sz) { - if (max_urls > 0 && sz > max_urls) { - cb->skip_prob = 1.0 - ((gdouble)max_urls) / (gdouble)sz; + if (cb->max_urls > 0 && sz > cb->max_urls) { + cb->skip_prob = 1.0 - ((gdouble)cb->max_urls) / (gdouble)sz; /* * Use task dependent probabilistic seed to ensure that * consequent task:get_urls return the same list of urls @@ -1062,7 +1156,7 @@ lua_url_adjust_skip_prob (gdouble timestamp, MIN (sizeof (cb->xoroshiro_state[0]), sizeof (timestamp))); memcpy (&cb->xoroshiro_state[1], digest, sizeof (cb->xoroshiro_state[1]) * 3); - sz = max_urls; + sz = cb->max_urls; } return sz; diff --git a/src/lua/lua_url.h b/src/lua/lua_url.h index 57d20f920..0ea2186d8 100644 --- a/src/lua/lua_url.h +++ b/src/lua/lua_url.h @@ -26,8 +26,9 @@ struct lua_tree_cb_data { lua_State *L; int i; int metatable_pos; - gint mask; - gint need_images; + guint flags_mask; + guint protocols_mask; + gsize max_urls; gdouble skip_prob; guint64 xoroshiro_state[4]; }; @@ -41,7 +42,11 @@ void lua_tree_url_callback (gpointer key, gpointer value, gpointer ud); * @param cbd * @return */ -gboolean lua_url_cbdata_fill (lua_State *L, gint pos, struct lua_tree_cb_data *cbd); +gboolean lua_url_cbdata_fill (lua_State *L, gint pos, + struct lua_tree_cb_data *cbd, + guint default_protocols, + guint default_flags, + gsize max_urls); /** * Cleanup url cbdata @@ -61,8 +66,7 @@ void lua_url_cbdata_dtor (struct lua_tree_cb_data *cbd); gsize lua_url_adjust_skip_prob (gdouble timestamp, guchar *digest, struct lua_tree_cb_data *cb, - gsize sz, - gsize max_urls); + gsize sz); #ifdef __cplusplus } |