You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

regexp.c 14KB

15 vuotta sitten
15 vuotta sitten
15 vuotta sitten
10 vuotta sitten
10 vuotta sitten
10 vuotta sitten
10 vuotta sitten
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564
  1. /*
  2. * Copyright 2024 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /***MODULE:regexp
  17. * rspamd module that implements different regexp rules
  18. */
  19. #include "config.h"
  20. #include "libmime/message.h"
  21. #include "expression.h"
  22. #include "mime_expressions.h"
  23. #include "libserver/maps/map.h"
  24. #include "lua/lua_common.h"
  25. static const guint64 rspamd_regexp_cb_magic = 0xca9d9649fc3e2659ULL;
  26. struct regexp_module_item {
  27. guint64 magic;
  28. struct rspamd_expression *expr;
  29. const gchar *symbol;
  30. struct ucl_lua_funcdata *lua_function;
  31. };
  32. struct regexp_ctx {
  33. struct module_ctx ctx;
  34. gsize max_size;
  35. };
  36. static void process_regexp_item(struct rspamd_task *task,
  37. struct rspamd_symcache_dynamic_item *item,
  38. void *user_data);
  39. /* Initialization */
  40. gint regexp_module_init(struct rspamd_config *cfg, struct module_ctx **ctx);
  41. gint regexp_module_config(struct rspamd_config *cfg, bool validate);
  42. gint regexp_module_reconfig(struct rspamd_config *cfg);
  43. module_t regexp_module = {
  44. "regexp",
  45. regexp_module_init,
  46. regexp_module_config,
  47. regexp_module_reconfig,
  48. NULL,
  49. RSPAMD_MODULE_VER,
  50. (guint) -1,
  51. };
  52. static inline struct regexp_ctx *
  53. regexp_get_context(struct rspamd_config *cfg)
  54. {
  55. return (struct regexp_ctx *) g_ptr_array_index(cfg->c_modules,
  56. regexp_module.ctx_offset);
  57. }
  58. /* Process regexp expression */
  59. static gboolean
  60. read_regexp_expression(rspamd_mempool_t *pool,
  61. struct regexp_module_item *chain,
  62. const gchar *symbol,
  63. const gchar *line,
  64. struct rspamd_mime_expr_ud *ud)
  65. {
  66. struct rspamd_expression *e = NULL;
  67. GError *err = NULL;
  68. if (!rspamd_parse_expression(line, 0, &mime_expr_subr, ud, pool, &err,
  69. &e)) {
  70. msg_warn_pool("%s = \"%s\" is invalid regexp expression: %e", symbol,
  71. line,
  72. err);
  73. g_error_free(err);
  74. return FALSE;
  75. }
  76. g_assert(e != NULL);
  77. chain->expr = e;
  78. return TRUE;
  79. }
  80. /* Init function */
  81. gint regexp_module_init(struct rspamd_config *cfg, struct module_ctx **ctx)
  82. {
  83. struct regexp_ctx *regexp_module_ctx;
  84. regexp_module_ctx = rspamd_mempool_alloc0(cfg->cfg_pool,
  85. sizeof(*regexp_module_ctx));
  86. *ctx = (struct module_ctx *) regexp_module_ctx;
  87. rspamd_rcl_add_doc_by_path(cfg,
  88. NULL,
  89. "Regular expressions rules plugin",
  90. "regexp",
  91. UCL_OBJECT,
  92. NULL,
  93. 0,
  94. NULL,
  95. 0);
  96. rspamd_rcl_add_doc_by_path(cfg,
  97. "regexp",
  98. "Maximum size of data chunk scanned with any regexp (further data is truncated)",
  99. "max_size",
  100. UCL_INT,
  101. NULL,
  102. 0,
  103. NULL,
  104. 0);
  105. return 0;
  106. }
  107. gint regexp_module_config(struct rspamd_config *cfg, bool validate)
  108. {
  109. struct regexp_ctx *regexp_module_ctx = regexp_get_context(cfg);
  110. struct regexp_module_item *cur_item = NULL;
  111. const ucl_object_t *sec, *value, *elt;
  112. ucl_object_iter_t it = NULL;
  113. gint res = TRUE, nre = 0, nlua = 0, nshots = cfg->default_max_shots;
  114. if (!rspamd_config_is_module_enabled(cfg, "regexp")) {
  115. return TRUE;
  116. }
  117. sec = ucl_object_lookup(cfg->cfg_ucl_obj, "regexp");
  118. if (sec == NULL) {
  119. msg_err_config("regexp module enabled, but no rules are defined");
  120. return TRUE;
  121. }
  122. regexp_module_ctx->max_size = 0;
  123. while ((value = ucl_object_iterate(sec, &it, true)) != NULL) {
  124. if (g_ascii_strncasecmp(ucl_object_key(value), "max_size",
  125. sizeof("max_size") - 1) == 0) {
  126. regexp_module_ctx->max_size = ucl_obj_toint(value);
  127. rspamd_re_cache_set_limit(cfg->re_cache, regexp_module_ctx->max_size);
  128. }
  129. else if (g_ascii_strncasecmp(ucl_object_key(value), "max_threads",
  130. sizeof("max_threads") - 1) == 0) {
  131. msg_warn_config("regexp module is now single threaded, max_threads is ignored");
  132. }
  133. else if (value->type == UCL_STRING) {
  134. struct rspamd_mime_expr_ud ud;
  135. cur_item = rspamd_mempool_alloc0(cfg->cfg_pool,
  136. sizeof(struct regexp_module_item));
  137. cur_item->symbol = ucl_object_key(value);
  138. cur_item->magic = rspamd_regexp_cb_magic;
  139. ud.conf_obj = NULL;
  140. ud.cfg = cfg;
  141. if (!read_regexp_expression(cfg->cfg_pool,
  142. cur_item, ucl_object_key(value),
  143. ucl_obj_tostring(value), &ud)) {
  144. if (validate) {
  145. return FALSE;
  146. }
  147. }
  148. else {
  149. rspamd_symcache_add_symbol(cfg->cache,
  150. cur_item->symbol,
  151. 0,
  152. process_regexp_item,
  153. cur_item,
  154. SYMBOL_TYPE_NORMAL, -1);
  155. nre++;
  156. }
  157. }
  158. else if (value->type == UCL_USERDATA) {
  159. /* Just a lua function */
  160. cur_item = rspamd_mempool_alloc0(cfg->cfg_pool,
  161. sizeof(struct regexp_module_item));
  162. cur_item->magic = rspamd_regexp_cb_magic;
  163. cur_item->symbol = ucl_object_key(value);
  164. cur_item->lua_function = ucl_object_toclosure(value);
  165. rspamd_symcache_add_symbol(cfg->cache,
  166. cur_item->symbol,
  167. 0,
  168. process_regexp_item,
  169. cur_item,
  170. SYMBOL_TYPE_NORMAL, -1);
  171. nlua++;
  172. }
  173. else if (value->type == UCL_OBJECT) {
  174. const gchar *description = NULL, *group = NULL;
  175. gdouble score = 0.0;
  176. guint flags = 0, priority = 0;
  177. gboolean is_lua = FALSE, valid_expression = TRUE;
  178. struct rspamd_mime_expr_ud ud;
  179. /* We have some lua table, extract its arguments */
  180. elt = ucl_object_lookup(value, "callback");
  181. if (elt == NULL || elt->type != UCL_USERDATA) {
  182. /* Try plain regexp expression */
  183. elt = ucl_object_lookup_any(value, "regexp", "re", NULL);
  184. if (elt != NULL && ucl_object_type(elt) == UCL_STRING) {
  185. cur_item = rspamd_mempool_alloc0(cfg->cfg_pool,
  186. sizeof(struct regexp_module_item));
  187. cur_item->symbol = ucl_object_key(value);
  188. cur_item->magic = rspamd_regexp_cb_magic;
  189. ud.cfg = cfg;
  190. ud.conf_obj = value;
  191. if (!read_regexp_expression(cfg->cfg_pool,
  192. cur_item, ucl_object_key(value),
  193. ucl_obj_tostring(elt), &ud)) {
  194. if (validate) {
  195. return FALSE;
  196. }
  197. }
  198. else {
  199. valid_expression = TRUE;
  200. nre++;
  201. }
  202. }
  203. else {
  204. msg_err_config(
  205. "no callback/expression defined for regexp symbol: "
  206. "%s",
  207. ucl_object_key(value));
  208. }
  209. }
  210. else {
  211. is_lua = TRUE;
  212. nlua++;
  213. cur_item = rspamd_mempool_alloc0(
  214. cfg->cfg_pool,
  215. sizeof(struct regexp_module_item));
  216. cur_item->magic = rspamd_regexp_cb_magic;
  217. cur_item->symbol = ucl_object_key(value);
  218. cur_item->lua_function = ucl_object_toclosure(value);
  219. }
  220. if (cur_item && (is_lua || valid_expression)) {
  221. flags = SYMBOL_TYPE_NORMAL;
  222. elt = ucl_object_lookup(value, "mime_only");
  223. if (elt) {
  224. if (ucl_object_type(elt) != UCL_BOOLEAN) {
  225. msg_err_config(
  226. "mime_only attribute is not boolean for symbol: '%s'",
  227. cur_item->symbol);
  228. if (validate) {
  229. return FALSE;
  230. }
  231. }
  232. else {
  233. if (ucl_object_toboolean(elt)) {
  234. flags |= SYMBOL_TYPE_MIME_ONLY;
  235. }
  236. }
  237. }
  238. rspamd_symcache_add_symbol(cfg->cache,
  239. cur_item->symbol,
  240. 0,
  241. process_regexp_item,
  242. cur_item,
  243. flags, -1);
  244. /* Reset flags */
  245. flags = 0;
  246. elt = ucl_object_lookup(value, "condition");
  247. if (elt != NULL && ucl_object_type(elt) == UCL_USERDATA) {
  248. struct ucl_lua_funcdata *conddata;
  249. g_assert(cur_item->symbol != NULL);
  250. conddata = ucl_object_toclosure(elt);
  251. rspamd_symcache_add_condition_delayed(cfg->cache,
  252. cur_item->symbol,
  253. conddata->L, conddata->idx);
  254. }
  255. elt = ucl_object_lookup(value, "description");
  256. if (elt) {
  257. description = ucl_object_tostring(elt);
  258. }
  259. elt = ucl_object_lookup(value, "group");
  260. if (elt) {
  261. group = ucl_object_tostring(elt);
  262. }
  263. elt = ucl_object_lookup(value, "score");
  264. if (elt) {
  265. if (ucl_object_type(elt) != UCL_FLOAT && ucl_object_type(elt) != UCL_INT) {
  266. msg_err_config(
  267. "score attribute is not numeric for symbol: '%s'",
  268. cur_item->symbol);
  269. if (validate) {
  270. return FALSE;
  271. }
  272. }
  273. else {
  274. score = ucl_object_todouble(elt);
  275. }
  276. }
  277. elt = ucl_object_lookup(value, "one_shot");
  278. if (elt) {
  279. if (ucl_object_type(elt) != UCL_BOOLEAN) {
  280. msg_err_config(
  281. "one_shot attribute is not boolean for symbol: '%s'",
  282. cur_item->symbol);
  283. if (validate) {
  284. return FALSE;
  285. }
  286. }
  287. else {
  288. if (ucl_object_toboolean(elt)) {
  289. nshots = 1;
  290. }
  291. }
  292. }
  293. if ((elt = ucl_object_lookup(value, "any_shot")) != NULL) {
  294. if (ucl_object_type(elt) != UCL_BOOLEAN) {
  295. msg_err_config(
  296. "any_shot attribute is not boolean for symbol: '%s'",
  297. cur_item->symbol);
  298. if (validate) {
  299. return FALSE;
  300. }
  301. }
  302. else {
  303. if (ucl_object_toboolean(elt)) {
  304. nshots = -1;
  305. }
  306. }
  307. }
  308. if ((elt = ucl_object_lookup(value, "nshots")) != NULL) {
  309. if (ucl_object_type(elt) != UCL_FLOAT && ucl_object_type(elt) != UCL_INT) {
  310. msg_err_config(
  311. "nshots attribute is not numeric for symbol: '%s'",
  312. cur_item->symbol);
  313. if (validate) {
  314. return FALSE;
  315. }
  316. }
  317. else {
  318. nshots = ucl_object_toint(elt);
  319. }
  320. }
  321. elt = ucl_object_lookup(value, "one_param");
  322. if (elt) {
  323. if (ucl_object_type(elt) != UCL_BOOLEAN) {
  324. msg_err_config(
  325. "one_param attribute is not boolean for symbol: '%s'",
  326. cur_item->symbol);
  327. if (validate) {
  328. return FALSE;
  329. }
  330. }
  331. else {
  332. if (ucl_object_toboolean(elt)) {
  333. flags |= RSPAMD_SYMBOL_FLAG_ONEPARAM;
  334. }
  335. }
  336. }
  337. elt = ucl_object_lookup(value, "priority");
  338. if (elt) {
  339. if (ucl_object_type(elt) != UCL_FLOAT && ucl_object_type(elt) != UCL_INT) {
  340. msg_err_config(
  341. "priority attribute is not numeric for symbol: '%s'",
  342. cur_item->symbol);
  343. if (validate) {
  344. return FALSE;
  345. }
  346. }
  347. else {
  348. priority = ucl_object_toint(elt);
  349. }
  350. }
  351. else {
  352. priority = 0;
  353. }
  354. rspamd_config_add_symbol(cfg, cur_item->symbol,
  355. score, description, group, flags, priority, nshots);
  356. elt = ucl_object_lookup(value, "groups");
  357. if (elt) {
  358. ucl_object_iter_t gr_it;
  359. const ucl_object_t *cur_gr;
  360. gr_it = ucl_object_iterate_new(elt);
  361. while ((cur_gr = ucl_object_iterate_safe(gr_it, true)) != NULL) {
  362. rspamd_config_add_symbol_group(cfg, cur_item->symbol,
  363. ucl_object_tostring(cur_gr));
  364. }
  365. ucl_object_iterate_free(gr_it);
  366. }
  367. }
  368. }
  369. else {
  370. msg_warn_config("unknown type of attribute %s for regexp module",
  371. ucl_object_key(value));
  372. }
  373. }
  374. if (res) {
  375. msg_info_config("init internal regexp module, %d regexp rules and %d "
  376. "lua rules are loaded",
  377. nre, nlua);
  378. }
  379. else {
  380. msg_err_config("fatal regexp module error");
  381. }
  382. return res;
  383. }
  384. gint regexp_module_reconfig(struct rspamd_config *cfg)
  385. {
  386. return regexp_module_config(cfg, false);
  387. }
  388. static gboolean
  389. rspamd_lua_call_expression_func(struct ucl_lua_funcdata *lua_data,
  390. struct rspamd_task *task,
  391. GArray *args, gdouble *res,
  392. const gchar *symbol)
  393. {
  394. lua_State *L = lua_data->L;
  395. struct rspamd_task **ptask;
  396. struct expression_argument *arg;
  397. gint pop = 0, i, nargs = 0;
  398. lua_rawgeti(L, LUA_REGISTRYINDEX, lua_data->idx);
  399. /* Now we got function in top of stack */
  400. ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
  401. rspamd_lua_setclass(L, rspamd_task_classname, -1);
  402. *ptask = task;
  403. /* Now push all arguments */
  404. if (args) {
  405. for (i = 0; i < (gint) args->len; i++) {
  406. arg = &g_array_index(args, struct expression_argument, i);
  407. if (arg) {
  408. switch (arg->type) {
  409. case EXPRESSION_ARGUMENT_NORMAL:
  410. lua_pushstring(L, (const gchar *) arg->data);
  411. break;
  412. case EXPRESSION_ARGUMENT_BOOL:
  413. lua_pushboolean(L, (gboolean) GPOINTER_TO_SIZE(arg->data));
  414. break;
  415. default:
  416. msg_err_task("%s: cannot pass custom params to lua function",
  417. symbol);
  418. return FALSE;
  419. }
  420. }
  421. }
  422. nargs = args->len;
  423. }
  424. if (lua_pcall(L, nargs + 1, 1, 0) != 0) {
  425. msg_info_task("%s: call to lua function failed: %s", symbol,
  426. lua_tostring(L, -1));
  427. lua_pop(L, 1);
  428. return FALSE;
  429. }
  430. pop++;
  431. if (lua_type(L, -1) == LUA_TNUMBER) {
  432. *res = lua_tonumber(L, -1);
  433. }
  434. else if (lua_type(L, -1) == LUA_TBOOLEAN) {
  435. *res = lua_toboolean(L, -1);
  436. }
  437. else {
  438. msg_info_task("%s: lua function must return a boolean", symbol);
  439. *res = FALSE;
  440. }
  441. lua_pop(L, pop);
  442. return TRUE;
  443. }
  444. static void
  445. process_regexp_item(struct rspamd_task *task,
  446. struct rspamd_symcache_dynamic_item *symcache_item,
  447. void *user_data)
  448. {
  449. struct regexp_module_item *item = user_data;
  450. gdouble res = FALSE;
  451. /* Non-threaded version */
  452. if (item->lua_function) {
  453. /* Just call function */
  454. res = FALSE;
  455. if (!rspamd_lua_call_expression_func(item->lua_function, task, NULL,
  456. &res, item->symbol)) {
  457. msg_err_task("error occurred when checking symbol %s",
  458. item->symbol);
  459. }
  460. }
  461. else {
  462. /* Process expression */
  463. if (item->expr) {
  464. res = rspamd_process_expression(item->expr, 0, task);
  465. }
  466. else {
  467. msg_warn_task("FIXME: %s symbol is broken with new expressions",
  468. item->symbol);
  469. }
  470. }
  471. if (res != 0) {
  472. rspamd_task_insert_result(task, item->symbol, res, NULL);
  473. }
  474. rspamd_symcache_finalize_item(task, symcache_item);
  475. }