You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

regexp.c 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564
  1. /*
  2. * Copyright 2024 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /***MODULE:regexp
  17. * rspamd module that implements different regexp rules
  18. */
  19. #include "config.h"
  20. #include "libmime/message.h"
  21. #include "expression.h"
  22. #include "mime_expressions.h"
  23. #include "libserver/maps/map.h"
  24. #include "lua/lua_common.h"
/*
 * Marker value written into the `magic` field of every regexp_module_item
 * created by this module. Nothing in this file reads it back.
 */
static const guint64 rspamd_regexp_cb_magic = 0xca9d9649fc3e2659ULL;
/*
 * One configured rule of the regexp module; passed as user data to
 * process_regexp_item().
 */
struct regexp_module_item {
	/* Set to rspamd_regexp_cb_magic when the item is created */
	guint64 magic;
	/* Parsed expression (filled by read_regexp_expression()); NULL for Lua rules */
	struct rspamd_expression *expr;
	/* Symbol name, taken from the UCL key of the rule */
	const gchar *symbol;
	/* Lua closure for rules backed by a Lua function; NULL for expression rules */
	struct ucl_lua_funcdata *lua_function;
};
/* Per-configuration state of the regexp module. */
struct regexp_ctx {
	/* Generic module context; kept first because the struct is cast to struct module_ctx * */
	struct module_ctx ctx;
	/* Value of the `max_size` option, forwarded to rspamd_re_cache_set_limit() */
	gsize max_size;
};
/* Symbol callback registered in the symbols cache for every rule of this module */
static void process_regexp_item(struct rspamd_task *task,
								struct rspamd_symcache_dynamic_item *item,
								void *user_data);
/* Initialization */
/* Module entry points referenced from the regexp_module descriptor below */
gint regexp_module_init(struct rspamd_config *cfg, struct module_ctx **ctx);
gint regexp_module_config(struct rspamd_config *cfg, bool validate);
gint regexp_module_reconfig(struct rspamd_config *cfg);
/*
 * Module descriptor: name followed by the init/config/reconfig entry points.
 * NOTE(review): the meaning of the trailing NULL and of `(guint) -1`
 * (presumably the not-yet-assigned ctx_offset read by regexp_get_context())
 * depends on the module_t definition, which is not visible here — confirm.
 */
module_t regexp_module = {
	"regexp",
	regexp_module_init,
	regexp_module_config,
	regexp_module_reconfig,
	NULL,
	RSPAMD_MODULE_VER,
	(guint) -1,
};
  52. static inline struct regexp_ctx *
  53. regexp_get_context(struct rspamd_config *cfg)
  54. {
  55. return (struct regexp_ctx *) g_ptr_array_index(cfg->c_modules,
  56. regexp_module.ctx_offset);
  57. }
  58. /* Process regexp expression */
  59. static gboolean
  60. read_regexp_expression(rspamd_mempool_t *pool,
  61. struct regexp_module_item *chain,
  62. const gchar *symbol,
  63. const gchar *line,
  64. struct rspamd_mime_expr_ud *ud)
  65. {
  66. struct rspamd_expression *e = NULL;
  67. GError *err = NULL;
  68. if (!rspamd_parse_expression(line, 0, &mime_expr_subr, ud, pool, &err,
  69. &e)) {
  70. msg_warn_pool("%s = \"%s\" is invalid regexp expression: %e", symbol,
  71. line,
  72. err);
  73. g_error_free(err);
  74. return FALSE;
  75. }
  76. g_assert(e != NULL);
  77. chain->expr = e;
  78. return TRUE;
  79. }
  80. /* Init function */
  81. gint regexp_module_init(struct rspamd_config *cfg, struct module_ctx **ctx)
  82. {
  83. struct regexp_ctx *regexp_module_ctx;
  84. regexp_module_ctx = rspamd_mempool_alloc0(cfg->cfg_pool,
  85. sizeof(*regexp_module_ctx));
  86. *ctx = (struct module_ctx *) regexp_module_ctx;
  87. rspamd_rcl_add_doc_by_path(cfg,
  88. NULL,
  89. "Regular expressions rules plugin",
  90. "regexp",
  91. UCL_OBJECT,
  92. NULL,
  93. 0,
  94. NULL,
  95. 0);
  96. rspamd_rcl_add_doc_by_path(cfg,
  97. "regexp",
  98. "Maximum size of data chunk scanned with any regexp (further data is truncated)",
  99. "max_size",
  100. UCL_INT,
  101. NULL,
  102. 0,
  103. NULL,
  104. 0);
  105. return 0;
  106. }
  107. gint regexp_module_config(struct rspamd_config *cfg, bool validate)
  108. {
  109. struct regexp_ctx *regexp_module_ctx = regexp_get_context(cfg);
  110. struct regexp_module_item *cur_item = NULL;
  111. const ucl_object_t *sec, *value, *elt;
  112. ucl_object_iter_t it = NULL;
  113. gint res = TRUE, nre = 0, nlua = 0, nshots = cfg->default_max_shots;
  114. if (!rspamd_config_is_module_enabled(cfg, "regexp")) {
  115. return TRUE;
  116. }
  117. sec = ucl_object_lookup(cfg->cfg_ucl_obj, "regexp");
  118. if (sec == NULL) {
  119. msg_err_config("regexp module enabled, but no rules are defined");
  120. return TRUE;
  121. }
  122. regexp_module_ctx->max_size = 0;
  123. while ((value = ucl_object_iterate(sec, &it, true)) != NULL) {
  124. if (g_ascii_strncasecmp(ucl_object_key(value), "max_size",
  125. sizeof("max_size") - 1) == 0) {
  126. regexp_module_ctx->max_size = ucl_obj_toint(value);
  127. rspamd_re_cache_set_limit(cfg->re_cache, regexp_module_ctx->max_size);
  128. }
  129. else if (g_ascii_strncasecmp(ucl_object_key(value), "max_threads",
  130. sizeof("max_threads") - 1) == 0) {
  131. msg_warn_config("regexp module is now single threaded, max_threads is ignored");
  132. }
  133. else if (value->type == UCL_STRING) {
  134. struct rspamd_mime_expr_ud ud;
  135. cur_item = rspamd_mempool_alloc0(cfg->cfg_pool,
  136. sizeof(struct regexp_module_item));
  137. cur_item->symbol = ucl_object_key(value);
  138. cur_item->magic = rspamd_regexp_cb_magic;
  139. ud.conf_obj = NULL;
  140. ud.cfg = cfg;
  141. if (!read_regexp_expression(cfg->cfg_pool,
  142. cur_item, ucl_object_key(value),
  143. ucl_obj_tostring(value), &ud)) {
  144. if (validate) {
  145. return FALSE;
  146. }
  147. }
  148. else {
  149. rspamd_symcache_add_symbol(cfg->cache,
  150. cur_item->symbol,
  151. 0,
  152. process_regexp_item,
  153. cur_item,
  154. SYMBOL_TYPE_NORMAL, -1);
  155. nre++;
  156. }
  157. }
  158. else if (value->type == UCL_USERDATA) {
  159. /* Just a lua function */
  160. cur_item = rspamd_mempool_alloc0(cfg->cfg_pool,
  161. sizeof(struct regexp_module_item));
  162. cur_item->magic = rspamd_regexp_cb_magic;
  163. cur_item->symbol = ucl_object_key(value);
  164. cur_item->lua_function = ucl_object_toclosure(value);
  165. rspamd_symcache_add_symbol(cfg->cache,
  166. cur_item->symbol,
  167. 0,
  168. process_regexp_item,
  169. cur_item,
  170. SYMBOL_TYPE_NORMAL, -1);
  171. nlua++;
  172. }
  173. else if (value->type == UCL_OBJECT) {
  174. const gchar *description = NULL, *group = NULL;
  175. gdouble score = 0.0;
  176. guint flags = 0, priority = 0;
  177. gboolean is_lua = FALSE, valid_expression = TRUE;
  178. struct rspamd_mime_expr_ud ud;
  179. /* We have some lua table, extract its arguments */
  180. elt = ucl_object_lookup(value, "callback");
  181. if (elt == NULL || elt->type != UCL_USERDATA) {
  182. /* Try plain regexp expression */
  183. elt = ucl_object_lookup_any(value, "regexp", "re", NULL);
  184. if (elt != NULL && ucl_object_type(elt) == UCL_STRING) {
  185. cur_item = rspamd_mempool_alloc0(cfg->cfg_pool,
  186. sizeof(struct regexp_module_item));
  187. cur_item->symbol = ucl_object_key(value);
  188. cur_item->magic = rspamd_regexp_cb_magic;
  189. ud.cfg = cfg;
  190. ud.conf_obj = value;
  191. if (!read_regexp_expression(cfg->cfg_pool,
  192. cur_item, ucl_object_key(value),
  193. ucl_obj_tostring(elt), &ud)) {
  194. if (validate) {
  195. return FALSE;
  196. }
  197. }
  198. else {
  199. valid_expression = TRUE;
  200. nre++;
  201. }
  202. }
  203. else {
  204. msg_err_config(
  205. "no callback/expression defined for regexp symbol: "
  206. "%s",
  207. ucl_object_key(value));
  208. }
  209. }
  210. else {
  211. is_lua = TRUE;
  212. nlua++;
  213. cur_item = rspamd_mempool_alloc0(
  214. cfg->cfg_pool,
  215. sizeof(struct regexp_module_item));
  216. cur_item->magic = rspamd_regexp_cb_magic;
  217. cur_item->symbol = ucl_object_key(value);
  218. cur_item->lua_function = ucl_object_toclosure(value);
  219. }
  220. if (cur_item && (is_lua || valid_expression)) {
  221. flags = SYMBOL_TYPE_NORMAL;
  222. elt = ucl_object_lookup(value, "mime_only");
  223. if (elt) {
  224. if (ucl_object_type(elt) != UCL_BOOLEAN) {
  225. msg_err_config(
  226. "mime_only attribute is not boolean for symbol: '%s'",
  227. cur_item->symbol);
  228. if (validate) {
  229. return FALSE;
  230. }
  231. }
  232. else {
  233. if (ucl_object_toboolean(elt)) {
  234. flags |= SYMBOL_TYPE_MIME_ONLY;
  235. }
  236. }
  237. }
  238. rspamd_symcache_add_symbol(cfg->cache,
  239. cur_item->symbol,
  240. 0,
  241. process_regexp_item,
  242. cur_item,
  243. flags, -1);
  244. /* Reset flags */
  245. flags = 0;
  246. elt = ucl_object_lookup(value, "condition");
  247. if (elt != NULL && ucl_object_type(elt) == UCL_USERDATA) {
  248. struct ucl_lua_funcdata *conddata;
  249. g_assert(cur_item->symbol != NULL);
  250. conddata = ucl_object_toclosure(elt);
  251. rspamd_symcache_add_condition_delayed(cfg->cache,
  252. cur_item->symbol,
  253. conddata->L, conddata->idx);
  254. }
  255. elt = ucl_object_lookup(value, "description");
  256. if (elt) {
  257. description = ucl_object_tostring(elt);
  258. }
  259. elt = ucl_object_lookup(value, "group");
  260. if (elt) {
  261. group = ucl_object_tostring(elt);
  262. }
  263. elt = ucl_object_lookup(value, "score");
  264. if (elt) {
  265. if (ucl_object_type(elt) != UCL_FLOAT && ucl_object_type(elt) != UCL_INT) {
  266. msg_err_config(
  267. "score attribute is not numeric for symbol: '%s'",
  268. cur_item->symbol);
  269. if (validate) {
  270. return FALSE;
  271. }
  272. }
  273. else {
  274. score = ucl_object_todouble(elt);
  275. }
  276. }
  277. elt = ucl_object_lookup(value, "one_shot");
  278. if (elt) {
  279. if (ucl_object_type(elt) != UCL_BOOLEAN) {
  280. msg_err_config(
  281. "one_shot attribute is not boolean for symbol: '%s'",
  282. cur_item->symbol);
  283. if (validate) {
  284. return FALSE;
  285. }
  286. }
  287. else {
  288. if (ucl_object_toboolean(elt)) {
  289. nshots = 1;
  290. }
  291. }
  292. }
  293. if ((elt = ucl_object_lookup(value, "any_shot")) != NULL) {
  294. if (ucl_object_type(elt) != UCL_BOOLEAN) {
  295. msg_err_config(
  296. "any_shot attribute is not boolean for symbol: '%s'",
  297. cur_item->symbol);
  298. if (validate) {
  299. return FALSE;
  300. }
  301. }
  302. else {
  303. if (ucl_object_toboolean(elt)) {
  304. nshots = -1;
  305. }
  306. }
  307. }
  308. if ((elt = ucl_object_lookup(value, "nshots")) != NULL) {
  309. if (ucl_object_type(elt) != UCL_FLOAT && ucl_object_type(elt) != UCL_INT) {
  310. msg_err_config(
  311. "nshots attribute is not numeric for symbol: '%s'",
  312. cur_item->symbol);
  313. if (validate) {
  314. return FALSE;
  315. }
  316. }
  317. else {
  318. nshots = ucl_object_toint(elt);
  319. }
  320. }
  321. elt = ucl_object_lookup(value, "one_param");
  322. if (elt) {
  323. if (ucl_object_type(elt) != UCL_BOOLEAN) {
  324. msg_err_config(
  325. "one_param attribute is not boolean for symbol: '%s'",
  326. cur_item->symbol);
  327. if (validate) {
  328. return FALSE;
  329. }
  330. }
  331. else {
  332. if (ucl_object_toboolean(elt)) {
  333. flags |= RSPAMD_SYMBOL_FLAG_ONEPARAM;
  334. }
  335. }
  336. }
  337. elt = ucl_object_lookup(value, "priority");
  338. if (elt) {
  339. if (ucl_object_type(elt) != UCL_FLOAT && ucl_object_type(elt) != UCL_INT) {
  340. msg_err_config(
  341. "priority attribute is not numeric for symbol: '%s'",
  342. cur_item->symbol);
  343. if (validate) {
  344. return FALSE;
  345. }
  346. }
  347. else {
  348. priority = ucl_object_toint(elt);
  349. }
  350. }
  351. else {
  352. priority = 0;
  353. }
  354. rspamd_config_add_symbol(cfg, cur_item->symbol,
  355. score, description, group, flags, priority, nshots);
  356. elt = ucl_object_lookup(value, "groups");
  357. if (elt) {
  358. ucl_object_iter_t gr_it;
  359. const ucl_object_t *cur_gr;
  360. gr_it = ucl_object_iterate_new(elt);
  361. while ((cur_gr = ucl_object_iterate_safe(gr_it, true)) != NULL) {
  362. rspamd_config_add_symbol_group(cfg, cur_item->symbol,
  363. ucl_object_tostring(cur_gr));
  364. }
  365. ucl_object_iterate_free(gr_it);
  366. }
  367. }
  368. }
  369. else {
  370. msg_warn_config("unknown type of attribute %s for regexp module",
  371. ucl_object_key(value));
  372. }
  373. }
  374. if (res) {
  375. msg_info_config("init internal regexp module, %d regexp rules and %d "
  376. "lua rules are loaded",
  377. nre, nlua);
  378. }
  379. else {
  380. msg_err_config("fatal regexp module error");
  381. }
  382. return res;
  383. }
  384. gint regexp_module_reconfig(struct rspamd_config *cfg)
  385. {
  386. return regexp_module_config(cfg, false);
  387. }
  388. static gboolean
  389. rspamd_lua_call_expression_func(struct ucl_lua_funcdata *lua_data,
  390. struct rspamd_task *task,
  391. GArray *args, gdouble *res,
  392. const gchar *symbol)
  393. {
  394. lua_State *L = lua_data->L;
  395. struct rspamd_task **ptask;
  396. struct expression_argument *arg;
  397. gint pop = 0, i, nargs = 0;
  398. lua_rawgeti(L, LUA_REGISTRYINDEX, lua_data->idx);
  399. /* Now we got function in top of stack */
  400. ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
  401. rspamd_lua_setclass(L, rspamd_task_classname, -1);
  402. *ptask = task;
  403. /* Now push all arguments */
  404. if (args) {
  405. for (i = 0; i < (gint) args->len; i++) {
  406. arg = &g_array_index(args, struct expression_argument, i);
  407. if (arg) {
  408. switch (arg->type) {
  409. case EXPRESSION_ARGUMENT_NORMAL:
  410. lua_pushstring(L, (const gchar *) arg->data);
  411. break;
  412. case EXPRESSION_ARGUMENT_BOOL:
  413. lua_pushboolean(L, (gboolean) GPOINTER_TO_SIZE(arg->data));
  414. break;
  415. default:
  416. msg_err_task("%s: cannot pass custom params to lua function",
  417. symbol);
  418. return FALSE;
  419. }
  420. }
  421. }
  422. nargs = args->len;
  423. }
  424. if (lua_pcall(L, nargs + 1, 1, 0) != 0) {
  425. msg_info_task("%s: call to lua function failed: %s", symbol,
  426. lua_tostring(L, -1));
  427. lua_pop(L, 1);
  428. return FALSE;
  429. }
  430. pop++;
  431. if (lua_type(L, -1) == LUA_TNUMBER) {
  432. *res = lua_tonumber(L, -1);
  433. }
  434. else if (lua_type(L, -1) == LUA_TBOOLEAN) {
  435. *res = lua_toboolean(L, -1);
  436. }
  437. else {
  438. msg_info_task("%s: lua function must return a boolean", symbol);
  439. *res = FALSE;
  440. }
  441. lua_pop(L, pop);
  442. return TRUE;
  443. }
  444. static void
  445. process_regexp_item(struct rspamd_task *task,
  446. struct rspamd_symcache_dynamic_item *symcache_item,
  447. void *user_data)
  448. {
  449. struct regexp_module_item *item = user_data;
  450. gdouble res = FALSE;
  451. /* Non-threaded version */
  452. if (item->lua_function) {
  453. /* Just call function */
  454. res = FALSE;
  455. if (!rspamd_lua_call_expression_func(item->lua_function, task, NULL,
  456. &res, item->symbol)) {
  457. msg_err_task("error occurred when checking symbol %s",
  458. item->symbol);
  459. }
  460. }
  461. else {
  462. /* Process expression */
  463. if (item->expr) {
  464. res = rspamd_process_expression(item->expr, 0, task);
  465. }
  466. else {
  467. msg_warn_task("FIXME: %s symbol is broken with new expressions",
  468. item->symbol);
  469. }
  470. }
  471. if (res != 0) {
  472. rspamd_task_insert_result(task, item->symbol, res, NULL);
  473. }
  474. rspamd_symcache_finalize_item(task, symcache_item);
  475. }