You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

lua_parsers.c 9.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410
  1. /*-
  2. * Copyright 2020 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "lua_common.h"
  17. #include "tokenizers/tokenizers.h"
  18. #include "contrib/uthash/utlist.h"
  19. #include "libserver/html/html.h"
  20. #include "libmime/email_addr.h"
  21. #include "libmime/content_type.h"
  22. #include "libmime/mime_headers.h"
  23. #include "libmime/smtp_parsers.h"
  24. #include "lua_parsers.h"
  25. /***
  26. * @module rspamd_parsers
  27. * This module contains Lua-C interfaces to Rspamd parsers of different kind.
  28. */
  29. /***
  30. * @function parsers.tokenize_text(input[, exceptions])
  31. * Create tokens from a text using optional exceptions list
  32. * @param {text/string} input input data
  33. * @param {table} exceptions, a table of pairs containing <start_pos,length> of exceptions in the input
  34. * @return {table/strings} list of strings representing words in the text
  35. */
  36. /***
  37. * @function parsers.parse_html(input)
  38. * Parses HTML and returns the according text
  39. * @param {string|text} in input HTML
  40. * @return {rspamd_text} processed text with no HTML tags
  41. */
  42. /***
  43. * @function parsers.parse_mail_address(str[, pool[, max_addrs]])
  44. * Parses email address and returns a table of tables in the following format:
  45. *
  46. * - `raw` - the original value without any processing
  47. * - `name` - name of internet address in UTF8, e.g. for `Vsevolod Stakhov <blah@foo.com>` it returns `Vsevolod Stakhov`
  48. * - `addr` - address part of the address
  49. * - `user` - user part (if present) of the address, e.g. `blah`
  50. * - `domain` - domain part (if present), e.g. `foo.com`
  51. * - `flags` - table with following keys set to true if given condition fulfilled:
  52. * - [valid] - valid SMTP address in conformity with https://tools.ietf.org/html/rfc5321#section-4.1.
  53. * - [ip] - domain is IPv4/IPv6 address
  54. * - [braced] - angled `<blah@foo.com>` address
  55. * - [quoted] - quoted user part
  56. * - [empty] - empty address
  57. * - [backslash] - user part contains backslash
  58. * - [8bit] - contains 8bit characters
  59. *
  60. * @param {string} str input string
  61. * @param {rspamd_mempool} pool memory pool to use
  62. * @return {table/tables} parsed list of mail addresses
  63. */
  64. /***
  65. * @function parsers.parse_content_type(ct_string, mempool)
  66. * Parses content-type string to a table:
  67. * - `type`
  68. * - `subtype`
  69. * - `charset`
  70. * - `boundary`
  71. * - other attributes
  72. *
  73. * @param {string} ct_string content type as string
  74. * @param {rspamd_mempool} mempool needed to store temporary data (e.g. task pool)
  75. * @return table or nil if cannot parse content type
  76. */
  77. /***
  78. * @function parsers.parse_smtp_date(str[, local_tz])
  79. * Converts an SMTP date string to unix timestamp
  80. * @param {string} str input string
  81. * @param {boolean} local_tz convert to local tz if `true`
  82. * @return {number} time as unix timestamp (converted to float), or `nil` and an error message if the date cannot be parsed
  83. */
  84. static const struct luaL_reg parserslib_f[] = {
  85. LUA_INTERFACE_DEF(parsers, tokenize_text),
  86. LUA_INTERFACE_DEF(parsers, parse_html),
  87. LUA_INTERFACE_DEF(parsers, parse_mail_address),
  88. LUA_INTERFACE_DEF(parsers, parse_content_type),
  89. LUA_INTERFACE_DEF(parsers, parse_smtp_date),
  90. {NULL, NULL}};
  91. int lua_parsers_tokenize_text(lua_State *L)
  92. {
  93. LUA_TRACE_POINT;
  94. const char *in = NULL;
  95. gsize len = 0, pos, ex_len, i;
  96. GList *exceptions = NULL, *cur;
  97. struct rspamd_lua_text *t;
  98. struct rspamd_process_exception *ex;
  99. UText utxt = UTEXT_INITIALIZER;
  100. GArray *res;
  101. rspamd_stat_token_t *w;
  102. if (lua_type(L, 1) == LUA_TSTRING) {
  103. in = luaL_checklstring(L, 1, &len);
  104. }
  105. else if (lua_type(L, 1) == LUA_TUSERDATA) {
  106. t = lua_check_text(L, 1);
  107. if (t) {
  108. in = t->start;
  109. len = t->len;
  110. }
  111. }
  112. if (in == NULL) {
  113. lua_pushnil(L);
  114. return 1;
  115. }
  116. if (lua_gettop(L) > 1 && lua_type(L, 2) == LUA_TTABLE) {
  117. lua_pushvalue(L, 2);
  118. lua_pushnil(L);
  119. while (lua_next(L, -2) != 0) {
  120. if (lua_type(L, -1) == LUA_TTABLE) {
  121. lua_rawgeti(L, -1, 1);
  122. pos = luaL_checknumber(L, -1);
  123. lua_pop(L, 1);
  124. lua_rawgeti(L, -1, 2);
  125. ex_len = luaL_checknumber(L, -1);
  126. lua_pop(L, 1);
  127. if (ex_len > 0) {
  128. ex = g_malloc0(sizeof(*ex));
  129. ex->pos = pos;
  130. ex->len = ex_len;
  131. ex->type = RSPAMD_EXCEPTION_GENERIC;
  132. exceptions = g_list_prepend(exceptions, ex);
  133. }
  134. }
  135. lua_pop(L, 1);
  136. }
  137. lua_pop(L, 1);
  138. }
  139. if (exceptions) {
  140. exceptions = g_list_reverse(exceptions);
  141. }
  142. UErrorCode uc_err = U_ZERO_ERROR;
  143. utext_openUTF8(&utxt,
  144. in,
  145. len,
  146. &uc_err);
  147. res = rspamd_tokenize_text((char *) in, len,
  148. &utxt,
  149. RSPAMD_TOKENIZE_UTF, NULL,
  150. exceptions,
  151. NULL, NULL, NULL);
  152. if (res == NULL) {
  153. lua_pushnil(L);
  154. }
  155. else {
  156. lua_createtable(L, res->len, 0);
  157. for (i = 0; i < res->len; i++) {
  158. w = &g_array_index(res, rspamd_stat_token_t, i);
  159. lua_pushlstring(L, w->original.begin, w->original.len);
  160. lua_rawseti(L, -2, i + 1);
  161. }
  162. }
  163. cur = exceptions;
  164. while (cur) {
  165. ex = cur->data;
  166. g_free(ex);
  167. cur = g_list_next(cur);
  168. }
  169. g_list_free(exceptions);
  170. utext_close(&utxt);
  171. return 1;
  172. }
  173. int lua_parsers_parse_html(lua_State *L)
  174. {
  175. LUA_TRACE_POINT;
  176. struct rspamd_lua_text *t;
  177. const char *start = NULL;
  178. gsize len;
  179. GByteArray *in;
  180. rspamd_mempool_t *pool;
  181. void *hc;
  182. if (lua_type(L, 1) == LUA_TUSERDATA) {
  183. t = lua_check_text(L, 1);
  184. if (t != NULL) {
  185. start = t->start;
  186. len = t->len;
  187. }
  188. }
  189. else if (lua_type(L, 1) == LUA_TSTRING) {
  190. start = luaL_checklstring(L, 1, &len);
  191. }
  192. if (start != NULL) {
  193. pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), NULL, 0);
  194. in = g_byte_array_sized_new(len);
  195. g_byte_array_append(in, start, len);
  196. hc = rspamd_html_process_part(pool, in);
  197. rspamd_ftok_t res;
  198. rspamd_html_get_parsed_content(hc, &res);
  199. lua_new_text(L, res.begin, res.len, TRUE);
  200. g_byte_array_free(in, TRUE);
  201. rspamd_mempool_delete(pool);
  202. }
  203. else {
  204. lua_pushnil(L);
  205. }
  206. return 1;
  207. }
  208. int lua_parsers_parse_mail_address(lua_State *L)
  209. {
  210. LUA_TRACE_POINT;
  211. GPtrArray *addrs;
  212. gsize len;
  213. const char *str = luaL_checklstring(L, 1, &len);
  214. int max_addrs = luaL_optinteger(L, 3, 10240);
  215. rspamd_mempool_t *pool;
  216. gboolean own_pool = FALSE;
  217. if (str) {
  218. if (lua_type(L, 2) == LUA_TUSERDATA) {
  219. pool = rspamd_lua_check_mempool(L, 2);
  220. if (pool == NULL) {
  221. return luaL_error(L, "invalid arguments");
  222. }
  223. }
  224. else {
  225. pool = rspamd_mempool_new(rspamd_mempool_suggest_size(),
  226. "lua parsers", 0);
  227. own_pool = TRUE;
  228. }
  229. addrs = rspamd_email_address_from_mime(pool, str, len, NULL, max_addrs);
  230. if (addrs == NULL) {
  231. lua_pushnil(L);
  232. }
  233. else {
  234. lua_push_emails_address_list(L, addrs, 0);
  235. }
  236. if (own_pool) {
  237. rspamd_mempool_delete(pool);
  238. }
  239. }
  240. else {
  241. lua_pushnil(L);
  242. }
  243. return 1;
  244. }
  245. int lua_parsers_parse_content_type(lua_State *L)
  246. {
  247. LUA_TRACE_POINT;
  248. gsize len;
  249. const char *ct_str = luaL_checklstring(L, 1, &len);
  250. rspamd_mempool_t *pool = rspamd_lua_check_mempool(L, 2);
  251. struct rspamd_content_type *ct;
  252. if (!ct_str || !pool) {
  253. return luaL_error(L, "invalid arguments");
  254. }
  255. ct = rspamd_content_type_parse(ct_str, len, pool);
  256. if (ct == NULL) {
  257. lua_pushnil(L);
  258. }
  259. else {
  260. GHashTableIter it;
  261. gpointer k, v;
  262. lua_createtable(L, 0, 4 + (ct->attrs ? g_hash_table_size(ct->attrs) : 0));
  263. if (ct->type.len > 0) {
  264. lua_pushstring(L, "type");
  265. lua_pushlstring(L, ct->type.begin, ct->type.len);
  266. lua_settable(L, -3);
  267. }
  268. if (ct->subtype.len > 0) {
  269. lua_pushstring(L, "subtype");
  270. lua_pushlstring(L, ct->subtype.begin, ct->subtype.len);
  271. lua_settable(L, -3);
  272. }
  273. if (ct->charset.len > 0) {
  274. lua_pushstring(L, "charset");
  275. lua_pushlstring(L, ct->charset.begin, ct->charset.len);
  276. lua_settable(L, -3);
  277. }
  278. if (ct->orig_boundary.len > 0) {
  279. lua_pushstring(L, "boundary");
  280. lua_pushlstring(L, ct->orig_boundary.begin, ct->orig_boundary.len);
  281. lua_settable(L, -3);
  282. }
  283. if (ct->attrs) {
  284. g_hash_table_iter_init(&it, ct->attrs);
  285. while (g_hash_table_iter_next(&it, &k, &v)) {
  286. struct rspamd_content_type_param *param =
  287. (struct rspamd_content_type_param *) v,
  288. *cur;
  289. unsigned int i = 1;
  290. lua_pushlstring(L, param->name.begin, param->name.len);
  291. lua_createtable(L, 1, 0);
  292. DL_FOREACH(param, cur)
  293. {
  294. lua_pushlstring(L, cur->value.begin, cur->value.len);
  295. lua_rawseti(L, -2, i++);
  296. }
  297. lua_settable(L, -3);
  298. }
  299. }
  300. }
  301. return 1;
  302. }
  303. int lua_parsers_parse_smtp_date(lua_State *L)
  304. {
  305. gsize slen;
  306. const char *str = lua_tolstring(L, 1, &slen);
  307. GError *err = NULL;
  308. if (str == NULL) {
  309. return luaL_argerror(L, 1, "invalid argument");
  310. }
  311. time_t tt = rspamd_parse_smtp_date(str, slen, &err);
  312. if (err == NULL) {
  313. if (lua_isboolean(L, 2) && !!lua_toboolean(L, 2)) {
  314. struct tm t;
  315. rspamd_localtime(tt, &t);
  316. #if !defined(__sun)
  317. t.tm_gmtoff = 0;
  318. #endif
  319. t.tm_isdst = 0;
  320. tt = mktime(&t);
  321. }
  322. lua_pushnumber(L, tt);
  323. }
  324. else {
  325. lua_pushnil(L);
  326. lua_pushstring(L, err->message);
  327. g_error_free(err);
  328. return 2;
  329. }
  330. return 1;
  331. }
  332. static int
  333. lua_load_parsers(lua_State *L)
  334. {
  335. lua_newtable(L);
  336. luaL_register(L, NULL, parserslib_f);
  337. return 1;
  338. }
  339. void luaopen_parsers(lua_State *L)
  340. {
  341. rspamd_lua_add_preload(L, "rspamd_parsers", lua_load_parsers);
  342. }