12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475 |
- /*-
- * Copyright 2016 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #include "lua_common.h"
- #include "lua_url.h"
-
-
/***
 * @module rspamd_url
 * This module provides routines to handle URLs and to extract URLs from text.
 * Objects of this class are returned, for example, by `task:get_urls()` or `task:get_emails()`.
 * You can also create an `rspamd_url` from any text.
 * @example
local url = require "rspamd_url"
local mpool = require "rspamd_mempool"

url.init("/usr/share/rspamd/effective_tld_names.dat")
local pool = mpool.create()
local res = url.create(pool, 'Look at: http://user@test.example.com/test?query')
local t = res:to_table()
-- Content of t:
-- url = ['http://test.example.com/test?query']
-- host = ['test.example.com']
-- user = ['user']
-- path = ['test']
-- tld = ['example.com']

pool:destroy() -- res is destroyed here, so you should not use it afterwards

local mistake = res:to_table() -- INVALID! as pool is destroyed
 */
-
/* Forward declarations of every Lua-callable handler in this module */

/* Plain accessors */
LUA_FUNCTION_DEF(url, get_length);
LUA_FUNCTION_DEF(url, get_host);
LUA_FUNCTION_DEF(url, get_port);
LUA_FUNCTION_DEF(url, get_user);
LUA_FUNCTION_DEF(url, get_path);
LUA_FUNCTION_DEF(url, get_query);
LUA_FUNCTION_DEF(url, get_fragment);
LUA_FUNCTION_DEF(url, get_text);
LUA_FUNCTION_DEF(url, get_raw);
LUA_FUNCTION_DEF(url, get_tld);
LUA_FUNCTION_DEF(url, get_protocol);
LUA_FUNCTION_DEF(url, get_visible);
LUA_FUNCTION_DEF(url, get_count);
LUA_FUNCTION_DEF(url, get_order);
LUA_FUNCTION_DEF(url, get_part_order);

/* Conversions */
LUA_FUNCTION_DEF(url, tostring);
LUA_FUNCTION_DEF(url, to_table);

/* Flags */
LUA_FUNCTION_DEF(url, get_flags);
LUA_FUNCTION_DEF(url, get_flags_num);
LUA_FUNCTION_DEF(url, is_phished);
LUA_FUNCTION_DEF(url, is_redirected);
LUA_FUNCTION_DEF(url, is_obscured);
LUA_FUNCTION_DEF(url, is_html_displayed);
LUA_FUNCTION_DEF(url, is_subject);

/* Linked (phished/redirected) url */
LUA_FUNCTION_DEF(url, get_phished);
LUA_FUNCTION_DEF(url, set_redirected);

/* Metamethods */
LUA_FUNCTION_DEF(url, lt);
LUA_FUNCTION_DEF(url, eq);

/* Module-level functions */
LUA_FUNCTION_DEF(url, create);
LUA_FUNCTION_DEF(url, init);
LUA_FUNCTION_DEF(url, all);
-
- static const struct luaL_reg urllib_m[] = {
- LUA_INTERFACE_DEF(url, get_length),
- LUA_INTERFACE_DEF(url, get_host),
- LUA_INTERFACE_DEF(url, get_port),
- LUA_INTERFACE_DEF(url, get_user),
- LUA_INTERFACE_DEF(url, get_path),
- LUA_INTERFACE_DEF(url, get_query),
- LUA_INTERFACE_DEF(url, get_fragment),
- LUA_INTERFACE_DEF(url, get_text),
- LUA_INTERFACE_DEF(url, get_tld),
- LUA_INTERFACE_DEF(url, get_raw),
- LUA_INTERFACE_DEF(url, get_protocol),
- LUA_INTERFACE_DEF(url, to_table),
- LUA_INTERFACE_DEF(url, is_phished),
- LUA_INTERFACE_DEF(url, is_redirected),
- LUA_INTERFACE_DEF(url, is_obscured),
- LUA_INTERFACE_DEF(url, is_html_displayed),
- LUA_INTERFACE_DEF(url, is_subject),
- LUA_INTERFACE_DEF(url, get_phished),
-
- LUA_INTERFACE_DEF(url, get_visible),
- LUA_INTERFACE_DEF(url, get_count),
- LUA_INTERFACE_DEF(url, get_flags),
- LUA_INTERFACE_DEF(url, get_flags_num),
- LUA_INTERFACE_DEF(url, get_order),
- LUA_INTERFACE_DEF(url, get_part_order),
- {"get_redirected", lua_url_get_phished},
- LUA_INTERFACE_DEF(url, set_redirected),
- {"__tostring", lua_url_tostring},
- {"__eq", lua_url_eq},
- {"__lt", lua_url_lt},
- {NULL, NULL}};
-
- static const struct luaL_reg urllib_f[] = {
- LUA_INTERFACE_DEF(url, init),
- LUA_INTERFACE_DEF(url, create),
- LUA_INTERFACE_DEF(url, all),
- {NULL, NULL}};
-
- struct rspamd_lua_url *
- lua_check_url(lua_State *L, gint pos)
- {
- void *ud = rspamd_lua_check_udata(L, pos, "rspamd{url}");
- luaL_argcheck(L, ud != NULL, pos, "'url' expected");
- return ud ? ((struct rspamd_lua_url *) ud) : NULL;
- }
-
- static gboolean
- lua_url_single_inserter(struct rspamd_url *url, gsize start_offset,
- gsize end_offset, gpointer ud)
- {
- lua_State *L = ud;
- struct rspamd_lua_url *lua_url;
-
- lua_url = lua_newuserdata(L, sizeof(struct rspamd_lua_url));
- rspamd_lua_setclass(L, "rspamd{url}", -1);
- lua_url->url = url;
-
- return TRUE;
- }
-
- /***
- * @method url:get_length()
- * Get length of the url
- * @return {number} length of url in bytes
- */
- static gint
- lua_url_get_length(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL) {
- lua_pushinteger(L, url->url->urllen);
- }
- else {
- lua_pushnil(L);
- }
- return 1;
- }
-
- /***
- * @method url:get_host()
- * Get domain part of the url
- * @return {string} domain part of URL
- */
- static gint
- lua_url_get_host(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL && url->url && url->url->hostlen > 0) {
- lua_pushlstring(L, rspamd_url_host(url->url), url->url->hostlen);
- }
- else {
- lua_pushnil(L);
- }
- return 1;
- }
-
- /***
- * @method url:get_port()
- * Get port of the url
- * @return {number} url port
- */
- static gint
- lua_url_get_port(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL) {
- if (rspamd_url_get_port_if_special(url->url) == 0) {
- lua_pushnil(L);
- }
- else {
- lua_pushinteger(L, rspamd_url_get_port_if_special(url->url));
- }
- }
- else {
- lua_pushnil(L);
- }
- return 1;
- }
-
- /***
- * @method url:get_user()
- * Get user part of the url (e.g. username in email)
- * @return {string} user part of URL
- */
- static gint
- lua_url_get_user(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL && rspamd_url_user(url->url) != NULL) {
- lua_pushlstring(L, rspamd_url_user(url->url), url->url->userlen);
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- /***
- * @method url:get_path()
- * Get path of the url
- * @return {string} path part of URL
- */
- static gint
- lua_url_get_path(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL && url->url->datalen > 0) {
- lua_pushlstring(L, rspamd_url_data_unsafe(url->url), url->url->datalen);
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- /***
- * @method url:get_query()
- * Get query of the url
- * @return {string} query part of URL
- */
- static gint
- lua_url_get_query(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL && url->url->querylen > 0) {
- lua_pushlstring(L, rspamd_url_query_unsafe(url->url), url->url->querylen);
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- /***
- * @method url:get_fragment()
- * Get fragment of the url
- * @return {string} fragment part of URL
- */
- static gint
- lua_url_get_fragment(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL && url->url->fragmentlen > 0) {
- lua_pushlstring(L, rspamd_url_fragment_unsafe(url->url), url->url->fragmentlen);
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- /***
- * @method url:get_text()
- * Get full content of the url
- * @return {string} url string
- */
- static gint
- lua_url_get_text(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL) {
- lua_pushlstring(L, url->url->string, url->url->urllen);
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- /***
- * @method url:tostring()
- * Get full content of the url or user@domain in case of email
- * @return {string} url as a string
- */
- static gint
- lua_url_tostring(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL && url->url != NULL) {
- if (url->url->protocol == PROTOCOL_MAILTO) {
- gchar *tmp = g_malloc(url->url->userlen + 1 +
- url->url->hostlen);
- if (url->url->userlen) {
- memcpy(tmp, url->url->string + url->url->usershift, url->url->userlen);
- }
-
- tmp[url->url->userlen] = '@';
- memcpy(tmp + url->url->userlen + 1, rspamd_url_host_unsafe(url->url),
- url->url->hostlen);
-
- lua_pushlstring(L, tmp, url->url->userlen + 1 + url->url->hostlen);
- g_free(tmp);
- }
- else {
- lua_pushlstring(L, url->url->string, url->url->urllen);
- }
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- /***
- * @method url:get_raw()
- * Get full content of the url as it was parsed (e.g. with urldecode)
- * @return {string} url string
- */
- static gint
- lua_url_get_raw(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL) {
- lua_pushlstring(L, url->url->raw, url->url->rawlen);
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- /***
- * @method url:is_phished()
- * Check whether URL is treated as phished
- * @return {boolean} `true` if URL is phished
- */
- static gint
- lua_url_is_phished(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL) {
- lua_pushboolean(L, url->url->flags & RSPAMD_URL_FLAG_PHISHED);
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- /***
- * @method url:is_redirected()
- * Check whether URL was redirected
- * @return {boolean} `true` if URL is redirected
- */
- static gint
- lua_url_is_redirected(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL) {
- lua_pushboolean(L, url->url->flags & RSPAMD_URL_FLAG_REDIRECTED);
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- /***
- * @method url:is_obscured()
- * Check whether URL is treated as obscured or obfuscated (e.g. numbers in IP address or other hacks)
- * @return {boolean} `true` if URL is obscured
- */
- static gint
- lua_url_is_obscured(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL) {
- lua_pushboolean(L, url->url->flags & RSPAMD_URL_FLAG_OBSCURED);
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
-
- /***
- * @method url:is_html_displayed()
- * Check whether URL is just displayed in HTML (e.g. NOT a real href)
- * @return {boolean} `true` if URL is displayed only
- */
- static gint
- lua_url_is_html_displayed(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL) {
- lua_pushboolean(L, url->url->flags & RSPAMD_URL_FLAG_HTML_DISPLAYED);
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- /***
- * @method url:is_subject()
- * Check whether URL is found in subject
- * @return {boolean} `true` if URL is found in subject
- */
- static gint
- lua_url_is_subject(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL) {
- lua_pushboolean(L, url->url->flags & RSPAMD_URL_FLAG_SUBJECT);
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- /***
- * @method url:get_phished()
- * Get another URL that pretends to be this URL (e.g. used in phishing)
- * @return {url} phished URL
- */
- static gint
- lua_url_get_phished(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *purl, *url = lua_check_url(L, 1);
-
- if (url) {
- if (url->url->ext && url->url->ext->linked_url != NULL) {
- /* XXX: in fact, this is the only possible combination of flags, so this check is redundant */
- if (url->url->flags &
- (RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_REDIRECTED)) {
- purl = lua_newuserdata(L, sizeof(struct rspamd_lua_url));
- rspamd_lua_setclass(L, "rspamd{url}", -1);
- purl->url = url->url->ext->linked_url;
-
- return 1;
- }
- }
- }
-
- lua_pushnil(L);
- return 1;
- }
-
- /***
- * @method url:set_redirected(url,[ pool])
- * Set url as redirected to another url
- * @param {string|url} url new url that is redirecting an old one
- * @param {pool} pool if url is a string this is required for parsing
- * @return {url} parsed redirected url (if needed)
- */
- static gint
- lua_url_set_redirected(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1), *redir;
- rspamd_mempool_t *pool = NULL;
-
- if (url == NULL) {
- return luaL_error(L, "url is required as the first argument");
- }
-
- if (lua_type(L, 2) == LUA_TSTRING) {
- /* Parse url */
- if (lua_type(L, 3) != LUA_TUSERDATA) {
- return luaL_error(L, "mempool is required as the third argument");
- }
-
- pool = rspamd_lua_check_mempool(L, 3);
-
- if (pool == NULL) {
- return luaL_error(L, "mempool is required as the third argument");
- }
-
- gsize len;
- const gchar *urlstr = lua_tolstring(L, 2, &len);
-
- rspamd_url_find_single(pool, urlstr, len, RSPAMD_URL_FIND_ALL,
- lua_url_single_inserter, L);
-
- if (lua_type(L, -1) != LUA_TUSERDATA) {
- /* URL is actually not found */
- lua_pushnil(L);
- }
- else {
- redir = lua_check_url(L, -1);
-
- url->url->flags |= RSPAMD_URL_FLAG_REDIRECTED;
-
- if (url->url->ext == NULL) {
- url->url->ext = rspamd_mempool_alloc0_type(pool, struct rspamd_url_ext);
- }
- url->url->ext->linked_url = redir->url;
- }
- }
- else {
- redir = lua_check_url(L, 2);
-
- if (redir == NULL) {
- return luaL_error(L, "url is required as the second argument");
- }
-
- url->url->flags |= RSPAMD_URL_FLAG_REDIRECTED;
- if (url->url->ext == NULL) {
- url->url->ext = rspamd_mempool_alloc0_type(pool, struct rspamd_url_ext);
- }
- url->url->ext->linked_url = redir->url;
-
- /* Push back on stack */
- lua_pushvalue(L, 2);
- }
-
- return 1;
- }
-
- /***
- * @method url:get_tld()
- * Get effective second level domain part (eSLD) of the url host
- * @return {string} effective second level domain part (eSLD) of the url host
- */
- static gint
- lua_url_get_tld(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL && url->url->tldlen > 0) {
- lua_pushlstring(L, rspamd_url_tld_unsafe(url->url), url->url->tldlen);
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- /***
- * @method url:get_protocol()
- * Get protocol name
- * @return {string} protocol as a string
- */
- static gint
- lua_url_get_protocol(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL && url->url->protocol != PROTOCOL_UNKNOWN) {
- lua_pushstring(L, rspamd_url_protocol_name(url->url->protocol));
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- /***
- * @method url:get_count()
- * Return number of occurrences for this particular URL
- * @return {number} number of occurrences
- */
- static gint
- lua_url_get_count(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL && url->url != NULL) {
- lua_pushinteger(L, url->url->count);
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- /***
- * @method url:get_visible()
- * Get visible part of the url with html tags stripped
- * @return {string} url string
- */
- static gint
- lua_url_get_visible(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url != NULL && url->url->ext && url->url->ext->visible_part) {
- lua_pushstring(L, url->url->ext->visible_part);
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- /***
- * @method url:to_table()
- * Return url as a table with the following fields:
- *
- * - `url`: full content
- * - `host`: hostname part
- * - `user`: user part
- * - `path`: path part
- * - `tld`: top level domain
- * - `protocol`: url protocol
- * @return {table} URL as a table
- */
- static gint
- lua_url_to_table(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
- struct rspamd_url *u;
-
- if (url != NULL) {
- u = url->url;
- lua_createtable(L, 0, 12);
- lua_pushstring(L, "url");
- lua_pushlstring(L, u->string, u->urllen);
- lua_settable(L, -3);
-
- if (u->hostlen > 0) {
- lua_pushstring(L, "host");
- lua_pushlstring(L, rspamd_url_host_unsafe(u), u->hostlen);
- lua_settable(L, -3);
- }
-
- if (rspamd_url_get_port_if_special(u) != 0) {
- lua_pushstring(L, "port");
- lua_pushinteger(L, rspamd_url_get_port_if_special(u));
- lua_settable(L, -3);
- }
-
- if (u->tldlen > 0) {
- lua_pushstring(L, "tld");
- lua_pushlstring(L, rspamd_url_tld_unsafe(u), u->tldlen);
- lua_settable(L, -3);
- }
-
- if (u->userlen > 0) {
- lua_pushstring(L, "user");
- lua_pushlstring(L, rspamd_url_user(u), u->userlen);
- lua_settable(L, -3);
- }
-
- if (u->datalen > 0) {
- lua_pushstring(L, "path");
- lua_pushlstring(L, rspamd_url_data_unsafe(u), u->datalen);
- lua_settable(L, -3);
- }
-
- if (u->querylen > 0) {
- lua_pushstring(L, "query");
- lua_pushlstring(L, rspamd_url_query_unsafe(u), u->querylen);
- lua_settable(L, -3);
- }
-
- if (u->fragmentlen > 0) {
- lua_pushstring(L, "fragment");
- lua_pushlstring(L, rspamd_url_fragment_unsafe(u), u->fragmentlen);
- lua_settable(L, -3);
- }
-
-
- lua_pushstring(L, "protocol");
- lua_pushstring(L, rspamd_url_protocol_name(u->protocol));
- lua_settable(L, -3);
- }
- else {
- lua_pushnil(L);
- }
-
- return 1;
- }
-
- static rspamd_mempool_t *static_lua_url_pool;
-
- RSPAMD_CONSTRUCTOR(rspamd_urls_static_pool_ctor)
- {
- static_lua_url_pool = rspamd_mempool_new(rspamd_mempool_suggest_size(),
- "static_lua_url", 0);
- }
-
- RSPAMD_DESTRUCTOR(rspamd_urls_static_pool_dtor)
- {
- rspamd_mempool_delete(static_lua_url_pool);
- }
-
- /***
- * @function url.create([mempool,] str, [{flags_table}])
- * @param {rspamd_mempool} memory pool for URL, e.g. `task:get_mempool()`
- * @param {string} text that contains URL (can also contain other stuff)
- * @return {url} new url object that exists as long as the corresponding mempool exists
- */
- static gint
- lua_url_create(lua_State *L)
- {
- LUA_TRACE_POINT;
- rspamd_mempool_t *pool;
- struct rspamd_lua_text *t;
- struct rspamd_lua_url *u;
-
- if (lua_type(L, 1) == LUA_TUSERDATA) {
- pool = rspamd_lua_check_mempool(L, 1);
- t = lua_check_text_or_string(L, 2);
- }
- else {
- pool = static_lua_url_pool;
- t = lua_check_text_or_string(L, 2);
- }
-
- if (pool == NULL || t == NULL) {
- return luaL_error(L, "invalid arguments");
- }
- else {
- rspamd_url_find_single(pool, t->start, t->len, RSPAMD_URL_FIND_ALL,
- lua_url_single_inserter, L);
-
- if (lua_type(L, -1) != LUA_TUSERDATA) {
- /* URL is actually not found */
- lua_pushnil(L);
-
- return 1;
- }
-
- u = (struct rspamd_lua_url *) lua_touserdata(L, -1);
-
- if (lua_type(L, 3) == LUA_TTABLE) {
- /* Add flags */
- for (lua_pushnil(L); lua_next(L, 3); lua_pop(L, 1)) {
- int nmask = 0;
- const gchar *fname = lua_tostring(L, -1);
-
- if (rspamd_url_flag_from_string(fname, &nmask)) {
- u->url->flags |= nmask;
- }
- else {
- lua_pop(L, 1);
- return luaL_error(L, "invalid flag: %s", fname);
- }
- }
- }
- }
-
- return 1;
- }
-
- /***
- * @function url.init(tld_file)
- * Initialize url library if not initialized yet by Rspamd
- * @param {string} tld_file path to effective_tld_names.dat file (public suffix list)
- * @return nothing
- */
- static gint
- lua_url_init(lua_State *L)
- {
- const gchar *tld_path;
-
- tld_path = luaL_checkstring(L, 1);
-
- rspamd_url_init(tld_path);
-
- return 0;
- }
-
- static gboolean
- lua_url_table_inserter(struct rspamd_url *url, gsize start_offset,
- gsize end_offset, gpointer ud)
- {
- lua_State *L = ud;
- struct rspamd_lua_url *lua_url;
- gint n;
-
- n = rspamd_lua_table_size(L, -1);
- lua_url = lua_newuserdata(L, sizeof(struct rspamd_lua_url));
- rspamd_lua_setclass(L, "rspamd{url}", -1);
- lua_url->url = url;
- lua_rawseti(L, -2, n + 1);
-
- return TRUE;
- }
-
-
- static gint
- lua_url_all(lua_State *L)
- {
- LUA_TRACE_POINT;
- rspamd_mempool_t *pool = rspamd_lua_check_mempool(L, 1);
- const gchar *text;
- size_t length;
-
- if (pool == NULL) {
- lua_pushnil(L);
- }
- else {
- text = luaL_checklstring(L, 2, &length);
-
- if (text != NULL) {
- lua_newtable(L);
- rspamd_url_find_multiple(pool, text, length,
- RSPAMD_URL_FIND_ALL, NULL,
- lua_url_table_inserter, L);
- }
- else {
- lua_pushnil(L);
- }
- }
-
- return 1;
- }
-
- /***
- * @method url:get_flags()
- * Return flags for a specified URL as map 'flag'->true for all flags set,
- * possible flags are:
- *
- * - `phished`: URL is likely phished
- * - `numeric`: URL is numeric (e.g. IP address)
- * - `obscured`: URL was obscured
- * - `redirected`: URL comes from redirector
- * - `html_displayed`: URL is used just for displaying purposes
- * - `text`: URL comes from the text
- * - `subject`: URL comes from the subject
- * - `host_encoded`: URL host part is encoded
- * - `schema_encoded`: URL schema part is encoded
- * - `query_encoded`: URL query part is encoded
- * - `missing_slashes`: URL has some slashes missing
- * - `idn`: URL has international characters
- * - `has_port`: URL has port
- * - `has_user`: URL has user part
- * - `schemaless`: URL has no schema
- * - `unnormalised`: URL has some unicode unnormalities
- * - `zw_spaces`: URL has some zero width spaces
- * - `url_displayed`: URL has some other url-like string in visible part
- * - `image`: URL is from src attribute of img HTML tag
- * @return {table} URL flags
- */
- #define PUSH_FLAG(fl) \
- do { \
- if (flags & (fl)) { \
- lua_pushstring(L, rspamd_url_flag_to_string(fl)); \
- lua_pushboolean(L, true); \
- lua_settable(L, -3); \
- } \
- } while (0)
-
- static gint
- lua_url_get_flags(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
- enum rspamd_url_flags flags;
-
- if (url != NULL) {
- flags = url->url->flags;
-
- lua_createtable(L, 0, 4);
-
- for (gint i = 0; i < RSPAMD_URL_MAX_FLAG_SHIFT; i++) {
- PUSH_FLAG(1u << i);
- }
- }
- else {
- return luaL_error(L, "invalid arguments");
- }
-
- return 1;
- }
-
- #undef PUSH_FLAG
-
- static gint
- lua_url_get_flags_num(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url) {
- lua_pushinteger(L, url->url->flags);
- }
- else {
- return luaL_error(L, "invalid arguments");
- }
-
- return 1;
- }
-
- static gint
- lua_url_get_order(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url) {
- if (url->url->order != (uint16_t) -1) {
- lua_pushinteger(L, url->url->order);
- }
- else {
- lua_pushnil(L);
- }
- }
- else {
- return luaL_error(L, "invalid arguments");
- }
-
- return 1;
- }
-
- static gint
- lua_url_get_part_order(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *url = lua_check_url(L, 1);
-
- if (url) {
- if (url->url->part_order != (uint16_t) -1) {
- lua_pushinteger(L, url->url->part_order);
- }
- else {
- lua_pushnil(L);
- }
- }
- else {
- return luaL_error(L, "invalid arguments");
- }
-
- return 1;
- }
-
- void lua_tree_url_callback(gpointer key, gpointer value, gpointer ud)
- {
- struct rspamd_lua_url *lua_url;
- struct rspamd_url *url = (struct rspamd_url *) value;
- struct lua_tree_cb_data *cb = ud;
-
- if ((url->protocol & cb->protocols_mask) == url->protocol) {
-
- /* Handle different flags application logic */
- switch (cb->flags_mode) {
- case url_flags_mode_include_any:
- if (url->flags != (url->flags & cb->flags_mask)) {
- return;
- }
- break;
- case url_flags_mode_include_explicit:
- if ((url->flags & cb->flags_mask) != cb->flags_mask) {
- return;
- }
- break;
- case url_flags_mode_exclude_include:
- if ((url->flags & cb->flags_exclude_mask) != 0) {
- return;
- }
- if ((url->flags & cb->flags_mask) == 0) {
- return;
- }
- break;
- }
-
- if (cb->skip_prob > 0) {
- gdouble coin = rspamd_random_double_fast_seed(&cb->random_seed);
-
- if (coin < cb->skip_prob) {
- return;
- }
- }
-
- lua_url = lua_newuserdata(cb->L, sizeof(struct rspamd_lua_url));
- lua_pushvalue(cb->L, cb->metatable_pos);
- lua_setmetatable(cb->L, -2);
- lua_url->url = url;
- lua_rawseti(cb->L, -2, cb->i++);
- }
- }
-
- gboolean
- lua_url_cbdata_fill(lua_State *L,
- gint pos,
- struct lua_tree_cb_data *cbd,
- guint default_protocols,
- guint default_flags,
- gsize max_urls)
- {
- gint protocols_mask = 0;
-
- gint pos_arg_type = lua_type(L, pos);
- guint flags_mask = default_flags;
- gboolean seen_flags = FALSE, seen_protocols = FALSE;
-
- memset(cbd, 0, sizeof(*cbd));
- cbd->flags_mode = url_flags_mode_include_any;
-
- if (pos_arg_type == LUA_TBOOLEAN) {
- protocols_mask = default_protocols;
- if (lua_toboolean(L, 2)) {
- protocols_mask |= PROTOCOL_MAILTO;
- }
- }
- else if (pos_arg_type == LUA_TTABLE) {
- if (rspamd_lua_geti(L, 1, pos) == LUA_TNIL) {
- /* New method: indexed table */
-
- lua_getfield(L, pos, "flags");
- if (lua_istable(L, -1)) {
- gint top = lua_gettop(L);
-
- lua_getfield(L, pos, "flags_mode");
- if (lua_isstring(L, -1)) {
- const gchar *mode_str = lua_tostring(L, -1);
-
- if (strcmp(mode_str, "explicit") == 0) {
- cbd->flags_mode = url_flags_mode_include_explicit;
- /*
- * Ignore default flags in this mode and include
- * merely flags specified by a caller
- */
- flags_mask = 0;
- }
- }
- lua_pop(L, 1);
-
- for (lua_pushnil(L); lua_next(L, top); lua_pop(L, 1)) {
- int nmask = 0;
-
-
- if (lua_type(L, -1) == LUA_TSTRING) {
- const gchar *fname = lua_tostring(L, -1);
-
-
- if (rspamd_url_flag_from_string(fname, &nmask)) {
- flags_mask |= nmask;
- }
- else {
- msg_info("bad url flag: %s", fname);
- return FALSE;
- }
- }
- else {
- flags_mask |= lua_tointeger(L, -1);
- }
- }
-
- seen_flags = TRUE;
- }
- else {
- flags_mask |= default_flags;
- }
- lua_pop(L, 1);
-
- lua_getfield(L, pos, "protocols");
- if (lua_istable(L, -1)) {
- gint top = lua_gettop(L);
-
- for (lua_pushnil(L); lua_next(L, top); lua_pop(L, 1)) {
- int nmask;
- const gchar *pname = lua_tostring(L, -1);
-
- nmask = rspamd_url_protocol_from_string(pname);
-
- if (nmask != PROTOCOL_UNKNOWN) {
- protocols_mask |= nmask;
- }
- else {
- msg_info("bad url protocol: %s", pname);
- return FALSE;
- }
- }
- seen_protocols = TRUE;
- }
- else {
- protocols_mask = default_protocols;
- }
- lua_pop(L, 1);
-
- if (!seen_protocols) {
- lua_getfield(L, pos, "emails");
- if (lua_isboolean(L, -1)) {
- if (lua_toboolean(L, -1)) {
- protocols_mask |= PROTOCOL_MAILTO;
- }
- }
- lua_pop(L, 1);
- }
-
- if (!seen_flags) {
- lua_getfield(L, pos, "images");
- if (lua_isboolean(L, -1)) {
- if (lua_toboolean(L, -1)) {
- flags_mask |= RSPAMD_URL_FLAG_IMAGE;
- }
- else {
- flags_mask &= ~RSPAMD_URL_FLAG_IMAGE;
- }
- }
- else {
- flags_mask &= ~RSPAMD_URL_FLAG_IMAGE;
- }
- lua_pop(L, 1);
- }
-
- if (!seen_flags) {
- lua_getfield(L, pos, "content");
- if (lua_isboolean(L, -1)) {
- if (lua_toboolean(L, -1)) {
- flags_mask |= RSPAMD_URL_FLAG_CONTENT;
- }
- else {
- flags_mask &= ~RSPAMD_URL_FLAG_CONTENT;
- }
- }
- else {
- flags_mask &= ~RSPAMD_URL_FLAG_CONTENT;
- }
- lua_pop(L, 1);
- }
-
- lua_getfield(L, pos, "max_urls");
- if (lua_isnumber(L, -1)) {
- max_urls = lua_tonumber(L, -1);
- }
- lua_pop(L, 1);
-
- lua_getfield(L, pos, "sort");
- if (lua_isboolean(L, -1)) {
- cbd->sort = TRUE;
- }
- lua_pop(L, 1);
- }
- else {
- /* Plain table of the protocols */
- for (lua_pushnil(L); lua_next(L, pos); lua_pop(L, 1)) {
- int nmask;
- const gchar *pname = lua_tostring(L, -1);
-
- nmask = rspamd_url_protocol_from_string(pname);
-
- if (nmask != PROTOCOL_UNKNOWN) {
- protocols_mask |= nmask;
- }
- else {
- msg_info("bad url protocol: %s", pname);
- return FALSE;
- }
- }
- }
-
- lua_pop(L, 1); /* After rspamd_lua_geti */
- }
- else if (pos_arg_type == LUA_TSTRING) {
- const gchar *plist = lua_tostring(L, pos);
- gchar **strvec;
- gchar *const *cvec;
-
- strvec = g_strsplit_set(plist, ",;", -1);
- cvec = strvec;
-
- while (*cvec) {
- int nmask;
-
- nmask = rspamd_url_protocol_from_string(*cvec);
-
- if (nmask != PROTOCOL_UNKNOWN) {
- protocols_mask |= nmask;
- }
- else {
- msg_info("bad url protocol: %s", *cvec);
- g_strfreev(strvec);
-
- return FALSE;
- }
-
- cvec++;
- }
-
- g_strfreev(strvec);
- }
- else if (pos_arg_type == LUA_TNONE || pos_arg_type == LUA_TNIL) {
- protocols_mask = default_protocols;
- flags_mask = default_flags;
- }
- else {
- return FALSE;
- }
-
- if (lua_type(L, pos + 1) == LUA_TBOOLEAN) {
- if (lua_toboolean(L, pos + 1)) {
- flags_mask |= RSPAMD_URL_FLAG_IMAGE;
- }
- else {
- flags_mask &= ~RSPAMD_URL_FLAG_IMAGE;
- }
- }
-
- cbd->i = 1;
- cbd->L = L;
- cbd->max_urls = max_urls;
- cbd->protocols_mask = protocols_mask;
- cbd->flags_mask = flags_mask;
-
- /* This needs to be removed from the stack */
- rspamd_lua_class_metatable(L, "rspamd{url}");
- cbd->metatable_pos = lua_gettop(L);
- (void) lua_checkstack(L, cbd->metatable_pos + 4);
-
- return TRUE;
- }
-
- gboolean
- lua_url_cbdata_fill_exclude_include(lua_State *L,
- gint pos,
- struct lua_tree_cb_data *cbd,
- guint default_protocols,
- gsize max_urls)
- {
- guint protocols_mask = default_protocols;
- guint include_flags_mask, exclude_flags_mask;
-
- gint pos_arg_type = lua_type(L, pos);
-
- memset(cbd, 0, sizeof(*cbd));
- cbd->flags_mode = url_flags_mode_exclude_include;
-
- /* Include flags */
- if (pos_arg_type == LUA_TTABLE) {
- include_flags_mask = 0; /* Reset to no flags */
-
- for (lua_pushnil(L); lua_next(L, pos); lua_pop(L, 1)) {
- int nmask = 0;
-
- if (lua_type(L, -1) == LUA_TSTRING) {
- const gchar *fname = lua_tostring(L, -1);
-
- if (rspamd_url_flag_from_string(fname, &nmask)) {
- include_flags_mask |= nmask;
- }
- else {
- msg_info("bad url include flag: %s", fname);
- return FALSE;
- }
- }
- else {
- include_flags_mask |= lua_tointeger(L, -1);
- }
- }
- }
- else if (pos_arg_type == LUA_TNIL || pos_arg_type == LUA_TNONE) {
- /* Include all flags */
- include_flags_mask = ~0U;
- }
- else {
- msg_info("bad arguments: wrong include mask");
- return FALSE;
- }
-
- /* Exclude flags */
- pos_arg_type = lua_type(L, pos + 1);
- if (pos_arg_type == LUA_TTABLE) {
- exclude_flags_mask = 0; /* Reset to no flags */
-
- for (lua_pushnil(L); lua_next(L, pos + 1); lua_pop(L, 1)) {
- int nmask = 0;
-
- if (lua_type(L, -1) == LUA_TSTRING) {
- const gchar *fname = lua_tostring(L, -1);
-
- if (rspamd_url_flag_from_string(fname, &nmask)) {
- exclude_flags_mask |= nmask;
- }
- else {
- msg_info("bad url exclude flag: %s", fname);
- return FALSE;
- }
- }
- else {
- exclude_flags_mask |= lua_tointeger(L, -1);
- }
- }
- }
- else if (pos_arg_type == LUA_TNIL || pos_arg_type == LUA_TNONE) {
- /* Empty all exclude flags */
- exclude_flags_mask = 0U;
- }
- else {
- msg_info("bad arguments: wrong exclude mask");
- return FALSE;
- }
-
- if (lua_type(L, pos + 2) == LUA_TTABLE) {
- protocols_mask = 0U; /* Reset all protocols */
-
- for (lua_pushnil(L); lua_next(L, pos + 2); lua_pop(L, 1)) {
- int nmask;
- const gchar *pname = lua_tostring(L, -1);
-
- nmask = rspamd_url_protocol_from_string(pname);
-
- if (nmask != PROTOCOL_UNKNOWN) {
- protocols_mask |= nmask;
- }
- else {
- msg_info("bad url protocol: %s", pname);
- return FALSE;
- }
- }
- }
- else {
- protocols_mask = default_protocols;
- }
-
- cbd->i = 1;
- cbd->L = L;
- cbd->max_urls = max_urls;
- cbd->protocols_mask = protocols_mask;
- cbd->flags_mask = include_flags_mask;
- cbd->flags_exclude_mask = exclude_flags_mask;
-
- /* This needs to be removed from the stack */
- rspamd_lua_class_metatable(L, "rspamd{url}");
- cbd->metatable_pos = lua_gettop(L);
- (void) lua_checkstack(L, cbd->metatable_pos + 4);
-
- return TRUE;
- }
-
-
- void lua_url_cbdata_dtor(struct lua_tree_cb_data *cbd)
- {
- if (cbd->metatable_pos != -1) {
- lua_remove(cbd->L, cbd->metatable_pos);
- }
- }
-
- gsize lua_url_adjust_skip_prob(float timestamp,
- guchar digest[16],
- struct lua_tree_cb_data *cb,
- gsize sz)
- {
- if (cb->max_urls > 0 && sz > cb->max_urls) {
- cb->skip_prob = 1.0 - ((gdouble) cb->max_urls) / (gdouble) sz;
- /*
- * Use task dependent probabilistic seed to ensure that
- * consequent task:get_urls return the same list of urls
- * We use both digest and timestamp here to avoid attack surface
- * based just on digest.
- */
- memcpy(&cb->random_seed, digest, 4);
- memcpy(((unsigned char *) &cb->random_seed) + 4, ×tamp, 4);
- sz = cb->max_urls;
- }
-
- return sz;
- }
-
- static gint
- lua_url_eq(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *u1 = lua_check_url(L, 1),
- *u2 = lua_check_url(L, 2);
-
- if (u1 && u2) {
- lua_pushboolean(L, (rspamd_url_cmp(u1->url, u2->url) == 0));
- }
- else {
- lua_pushboolean(L, false);
- }
-
- return 1;
- }
-
- static gint
- lua_url_lt(lua_State *L)
- {
- LUA_TRACE_POINT;
- struct rspamd_lua_url *u1 = lua_check_url(L, 1),
- *u2 = lua_check_url(L, 2);
-
- if (u1 && u2) {
- lua_pushinteger(L, rspamd_url_cmp(u1->url, u2->url));
- }
- else {
- return luaL_error(L, "invalid arguments");
- }
-
- return 1;
- }
-
- static gint
- lua_load_url(lua_State *L)
- {
- lua_newtable(L);
- luaL_register(L, NULL, urllib_f);
-
- /* Push flags */
- lua_createtable(L, 0, RSPAMD_URL_MAX_FLAG_SHIFT);
- for (int i = 0; i < RSPAMD_URL_MAX_FLAG_SHIFT; i++) {
- guint flag = 1u << i;
-
- lua_pushinteger(L, flag);
- lua_setfield(L, -2, rspamd_url_flag_to_string(flag));
- }
-
- lua_setfield(L, -2, "flags");
-
- return 1;
- }
-
- void luaopen_url(lua_State *L)
- {
- rspamd_lua_new_class(L, "rspamd{url}", urllib_m);
- lua_pop(L, 1);
-
- rspamd_lua_add_preload(L, "rspamd_url", lua_load_url);
- }
|