You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

lua_parsers.c 9.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418
  1. /*-
  2. * Copyright 2020 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "lua_common.h"
  17. #include "tokenizers/tokenizers.h"
  18. #include "contrib/uthash/utlist.h"
  19. #include "libserver/html.h"
  20. #include "libmime/email_addr.h"
  21. #include "libmime/content_type.h"
  22. #include "libmime/mime_headers.h"
  23. #include "libmime/smtp_parsers.h"
  24. #include "lua_parsers.h"
  25. /***
  26. * @module rspamd_parsers
  27. * This module contains Lua-C interfaces to Rspamd parsers of different kinds.
  28. */
  29. /***
  30. * @function parsers.tokenize_text(input[, exceptions])
  31. * Create tokens from a text using optional exceptions list
  32. * @param {text/string} input input data
  33. * @param {table} exceptions, a table of pairs containing <start_pos,length> of exceptions in the input
  34. * @return {table/strings} list of strings representing words in the text
  35. */
  36. /***
  37. * @function parsers.parse_html(input)
  38. * Parses HTML and returns the corresponding text
  39. * @param {string|text} in input HTML
  40. * @return {rspamd_text} processed text with no HTML tags
  41. */
  42. /***
  43. * @function parsers.parse_mail_address(str, [pool])
  44. * Parses email address and returns a table of tables in the following format:
  45. *
  46. * - `raw` - the original value without any processing
  47. * - `name` - name of internet address in UTF8, e.g. for `Vsevolod Stakhov <blah@foo.com>` it returns `Vsevolod Stakhov`
  48. * - `addr` - address part of the address
  49. * - `user` - user part (if present) of the address, e.g. `blah`
  50. * - `domain` - domain part (if present), e.g. `foo.com`
  51. * - `flags` - table with following keys set to true if given condition fulfilled:
  52. * - [valid] - valid SMTP address in conformity with https://tools.ietf.org/html/rfc5321#section-4.1.
  53. * - [ip] - domain is IPv4/IPv6 address
  54. * - [braced] - angled `<blah@foo.com>` address
  55. * - [quoted] - quoted user part
  56. * - [empty] - empty address
  57. * - [backslash] - user part contains backslash
  58. * - [8bit] - contains 8bit characters
  59. *
  60. * @param {string} str input string
  61. * @param {rspamd_mempool} pool memory pool to use
  62. * @return {table/tables} parsed list of mail addresses
  63. */
  64. /***
  65. * @function parsers.parse_content_type(ct_string, mempool)
  66. * Parses content-type string to a table:
  67. * - `type`
  68. * - `subtype`
  69. * - `charset`
  70. * - `boundary`
  71. * - other attributes
  72. *
  73. * @param {string} ct_string content type as string
  74. * @param {rspamd_mempool} mempool needed to store temporary data (e.g. task pool)
  75. * @return table or nil if cannot parse content type
  76. */
  77. /***
  78. * @function parsers.parse_smtp_date(str[, local_tz])
  79. * Converts an SMTP date string to unix timestamp
  80. * @param {string} str input string
  81. * @param {boolean} local_tz convert to local tz if `true`
  82. * @return {number} time as unix timestamp (converted to float)
  83. */
  84. static const struct luaL_reg parserslib_f[] = {
  85. LUA_INTERFACE_DEF (parsers, tokenize_text),
  86. LUA_INTERFACE_DEF (parsers, parse_html),
  87. LUA_INTERFACE_DEF (parsers, parse_mail_address),
  88. LUA_INTERFACE_DEF (parsers, parse_content_type),
  89. LUA_INTERFACE_DEF (parsers, parse_smtp_date),
  90. {NULL, NULL}
  91. };
  92. gint
  93. lua_parsers_tokenize_text (lua_State *L)
  94. {
  95. LUA_TRACE_POINT;
  96. const gchar *in = NULL;
  97. gsize len = 0, pos, ex_len, i;
  98. GList *exceptions = NULL, *cur;
  99. struct rspamd_lua_text *t;
  100. struct rspamd_process_exception *ex;
  101. UText utxt = UTEXT_INITIALIZER;
  102. GArray *res;
  103. rspamd_stat_token_t *w;
  104. if (lua_type (L, 1) == LUA_TSTRING) {
  105. in = luaL_checklstring (L, 1, &len);
  106. }
  107. else if (lua_type (L, 1) == LUA_TUSERDATA) {
  108. t = lua_check_text (L, 1);
  109. if (t) {
  110. in = t->start;
  111. len = t->len;
  112. }
  113. }
  114. if (in == NULL) {
  115. lua_pushnil (L);
  116. return 1;
  117. }
  118. if (lua_gettop (L) > 1 && lua_type (L, 2) == LUA_TTABLE) {
  119. lua_pushvalue (L, 2);
  120. lua_pushnil (L);
  121. while (lua_next (L, -2) != 0) {
  122. if (lua_type (L, -1) == LUA_TTABLE) {
  123. lua_rawgeti (L, -1, 1);
  124. pos = luaL_checknumber (L, -1);
  125. lua_pop (L, 1);
  126. lua_rawgeti (L, -1, 2);
  127. ex_len = luaL_checknumber (L, -1);
  128. lua_pop (L, 1);
  129. if (ex_len > 0) {
  130. ex = g_malloc0 (sizeof (*ex));
  131. ex->pos = pos;
  132. ex->len = ex_len;
  133. ex->type = RSPAMD_EXCEPTION_GENERIC;
  134. exceptions = g_list_prepend (exceptions, ex);
  135. }
  136. }
  137. lua_pop (L, 1);
  138. }
  139. lua_pop (L, 1);
  140. }
  141. if (exceptions) {
  142. exceptions = g_list_reverse (exceptions);
  143. }
  144. UErrorCode uc_err = U_ZERO_ERROR;
  145. utext_openUTF8 (&utxt,
  146. in,
  147. len,
  148. &uc_err);
  149. res = rspamd_tokenize_text ((gchar *)in, len,
  150. &utxt,
  151. RSPAMD_TOKENIZE_UTF, NULL,
  152. exceptions,
  153. NULL, NULL, NULL);
  154. if (res == NULL) {
  155. lua_pushnil (L);
  156. }
  157. else {
  158. lua_createtable (L, res->len, 0);
  159. for (i = 0; i < res->len; i ++) {
  160. w = &g_array_index (res, rspamd_stat_token_t, i);
  161. lua_pushlstring (L, w->original.begin, w->original.len);
  162. lua_rawseti (L, -2, i + 1);
  163. }
  164. }
  165. cur = exceptions;
  166. while (cur) {
  167. ex = cur->data;
  168. g_free (ex);
  169. cur = g_list_next (cur);
  170. }
  171. g_list_free (exceptions);
  172. utext_close (&utxt);
  173. return 1;
  174. }
  175. gint
  176. lua_parsers_parse_html (lua_State *L)
  177. {
  178. LUA_TRACE_POINT;
  179. struct rspamd_lua_text *t;
  180. const gchar *start = NULL;
  181. gsize len;
  182. GByteArray *res, *in;
  183. rspamd_mempool_t *pool;
  184. struct html_content *hc;
  185. if (lua_type (L, 1) == LUA_TUSERDATA) {
  186. t = lua_check_text (L, 1);
  187. if (t != NULL) {
  188. start = t->start;
  189. len = t->len;
  190. }
  191. }
  192. else if (lua_type (L, 1) == LUA_TSTRING) {
  193. start = luaL_checklstring (L, 1, &len);
  194. }
  195. if (start != NULL) {
  196. pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), NULL, 0);
  197. hc = rspamd_mempool_alloc0 (pool, sizeof (*hc));
  198. in = g_byte_array_sized_new (len);
  199. g_byte_array_append (in, start, len);
  200. res = rspamd_html_process_part (pool, hc, in);
  201. t = lua_newuserdata (L, sizeof (*t));
  202. rspamd_lua_setclass (L, "rspamd{text}", -1);
  203. t->start = res->data;
  204. t->len = res->len;
  205. t->flags = RSPAMD_TEXT_FLAG_OWN;
  206. g_byte_array_free (res, FALSE);
  207. g_byte_array_free (in, TRUE);
  208. rspamd_mempool_delete (pool);
  209. }
  210. else {
  211. lua_pushnil (L);
  212. }
  213. return 1;
  214. }
  215. gint
  216. lua_parsers_parse_mail_address (lua_State *L)
  217. {
  218. LUA_TRACE_POINT;
  219. GPtrArray *addrs;
  220. gsize len;
  221. const gchar *str = luaL_checklstring (L, 1, &len);
  222. rspamd_mempool_t *pool;
  223. gboolean own_pool = FALSE;
  224. if (str) {
  225. if (lua_type (L, 2) == LUA_TUSERDATA) {
  226. pool = rspamd_lua_check_mempool (L, 2);
  227. if (pool == NULL) {
  228. return luaL_error (L, "invalid arguments");
  229. }
  230. }
  231. else {
  232. pool = rspamd_mempool_new (rspamd_mempool_suggest_size (),
  233. "lua parsers", 0);
  234. own_pool = TRUE;
  235. }
  236. addrs = rspamd_email_address_from_mime (pool, str, len, NULL, -1);
  237. if (addrs == NULL) {
  238. lua_pushnil (L);
  239. }
  240. else {
  241. lua_push_emails_address_list (L, addrs, 0);
  242. }
  243. if (own_pool) {
  244. rspamd_mempool_delete (pool);
  245. }
  246. }
  247. else {
  248. lua_pushnil (L);
  249. }
  250. return 1;
  251. }
  252. gint
  253. lua_parsers_parse_content_type (lua_State *L)
  254. {
  255. LUA_TRACE_POINT;
  256. gsize len;
  257. const gchar *ct_str = luaL_checklstring (L, 1, &len);
  258. rspamd_mempool_t *pool = rspamd_lua_check_mempool (L, 2);
  259. struct rspamd_content_type *ct;
  260. if (!ct_str || !pool) {
  261. return luaL_error (L, "invalid arguments");
  262. }
  263. ct = rspamd_content_type_parse (ct_str, len, pool);
  264. if (ct == NULL) {
  265. lua_pushnil (L);
  266. }
  267. else {
  268. GHashTableIter it;
  269. gpointer k, v;
  270. lua_createtable (L, 0, 4 + (ct->attrs ? g_hash_table_size (ct->attrs) : 0));
  271. if (ct->type.len > 0) {
  272. lua_pushstring (L, "type");
  273. lua_pushlstring (L, ct->type.begin, ct->type.len);
  274. lua_settable (L, -3);
  275. }
  276. if (ct->subtype.len > 0) {
  277. lua_pushstring (L, "subtype");
  278. lua_pushlstring (L, ct->subtype.begin, ct->subtype.len);
  279. lua_settable (L, -3);
  280. }
  281. if (ct->charset.len > 0) {
  282. lua_pushstring (L, "charset");
  283. lua_pushlstring (L, ct->charset.begin, ct->charset.len);
  284. lua_settable (L, -3);
  285. }
  286. if (ct->orig_boundary.len > 0) {
  287. lua_pushstring (L, "boundary");
  288. lua_pushlstring (L, ct->orig_boundary.begin, ct->orig_boundary.len);
  289. lua_settable (L, -3);
  290. }
  291. if (ct->attrs) {
  292. g_hash_table_iter_init (&it, ct->attrs);
  293. while (g_hash_table_iter_next (&it, &k, &v)) {
  294. struct rspamd_content_type_param *param =
  295. (struct rspamd_content_type_param *)v, *cur;
  296. guint i = 1;
  297. lua_pushlstring (L, param->name.begin, param->name.len);
  298. lua_createtable (L, 1, 0);
  299. DL_FOREACH (param, cur) {
  300. lua_pushlstring (L, cur->value.begin, cur->value.len);
  301. lua_rawseti (L, -2, i++);
  302. }
  303. lua_settable (L, -3);
  304. }
  305. }
  306. }
  307. return 1;
  308. }
  309. int
  310. lua_parsers_parse_smtp_date (lua_State *L)
  311. {
  312. gsize slen;
  313. const gchar *str = lua_tolstring (L, 1, &slen);
  314. GError *err = NULL;
  315. if (str == NULL) {
  316. return luaL_argerror (L, 1, "invalid argument");
  317. }
  318. time_t tt = rspamd_parse_smtp_date (str, slen, &err);
  319. if (err == NULL) {
  320. if (lua_isboolean (L, 2) && !!lua_toboolean (L, 2)) {
  321. struct tm t;
  322. rspamd_localtime (tt, &t);
  323. #if !defined(__sun)
  324. t.tm_gmtoff = 0;
  325. #endif
  326. t.tm_isdst = 0;
  327. tt = mktime (&t);
  328. }
  329. lua_pushnumber (L, tt);
  330. }
  331. else {
  332. lua_pushnil (L);
  333. lua_pushstring (L, err->message);
  334. g_error_free (err);
  335. return 2;
  336. }
  337. return 1;
  338. }
  339. static gint
  340. lua_load_parsers (lua_State * L)
  341. {
  342. lua_newtable (L);
  343. luaL_register (L, NULL, parserslib_f);
  344. return 1;
  345. }
  346. void
  347. luaopen_parsers (lua_State * L)
  348. {
  349. rspamd_lua_add_preload (L, "rspamd_parsers", lua_load_parsers);
  350. }