Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "regexp.h"
  18. #include "cryptobox.h"
  19. #include "ref.h"
  20. #include "util.h"
  21. #include "rspamd.h"
  22. #ifndef WITH_PCRE2
  23. /* Normal pcre path */
  24. #include <pcre.h>
  25. #define PCRE_T pcre
  26. #define PCRE_EXTRA_T pcre_extra
  27. #define PCRE_JIT_T pcre_jit_stack
  28. #define PCRE_FREE pcre_free
  29. #define PCRE_JIT_STACK_FREE pcre_jit_stack_free
  30. #define PCRE_FLAG(x) G_PASTE(PCRE_, x)
  31. #else
  32. /* PCRE 2 path */
  33. #ifndef PCRE2_CODE_UNIT_WIDTH
  34. #define PCRE2_CODE_UNIT_WIDTH 8
  35. #endif
  36. #include <pcre2.h>
  37. #define PCRE_T pcre2_code
  38. #define PCRE_JIT_T pcre2_jit_stack
  39. #define PCRE_FREE pcre2_code_free
  40. #define PCRE_JIT_STACK_FREE pcre2_jit_stack_free
  41. #define PCRE_FLAG(x) G_PASTE(PCRE2_, x)
  42. #endif
  43. typedef guchar regexp_id_t[rspamd_cryptobox_HASHBYTES];
  44. #undef DISABLE_JIT_FAST
  45. struct rspamd_regexp_s {
  46. gdouble exec_time;
  47. gchar *pattern;
  48. PCRE_T *re;
  49. PCRE_T *raw_re;
  50. #ifndef WITH_PCRE2
  51. PCRE_EXTRA_T *extra;
  52. PCRE_EXTRA_T *raw_extra;
  53. #else
  54. pcre2_match_context *mcontext;
  55. pcre2_match_context *raw_mcontext;
  56. #endif
  57. regexp_id_t id;
  58. ref_entry_t ref;
  59. gpointer ud;
  60. gpointer re_class;
  61. guint64 cache_id;
  62. guint max_hits;
  63. gint flags;
  64. gint pcre_flags;
  65. gint ncaptures;
  66. gint nbackref;
  67. };
  68. struct rspamd_regexp_cache {
  69. GHashTable *tbl;
  70. #ifdef HAVE_PCRE_JIT
  71. PCRE_JIT_T *jstack;
  72. #endif
  73. };
  74. static struct rspamd_regexp_cache *global_re_cache = NULL;
  75. static gboolean can_jit = FALSE;
  76. static gboolean check_jit = TRUE;
  77. #ifdef WITH_PCRE2
  78. static pcre2_compile_context *pcre2_ctx = NULL;
  79. #endif
  80. static GQuark
  81. rspamd_regexp_quark (void)
  82. {
  83. return g_quark_from_static_string ("rspamd-regexp");
  84. }
  85. static void
  86. rspamd_regexp_generate_id (const gchar *pattern, const gchar *flags,
  87. regexp_id_t out)
  88. {
  89. rspamd_cryptobox_hash_state_t st;
  90. rspamd_cryptobox_hash_init (&st, NULL, 0);
  91. if (flags) {
  92. rspamd_cryptobox_hash_update (&st, flags, strlen (flags));
  93. }
  94. rspamd_cryptobox_hash_update (&st, pattern, strlen (pattern));
  95. rspamd_cryptobox_hash_final (&st, out);
  96. }
  97. static void
  98. rspamd_regexp_dtor (rspamd_regexp_t *re)
  99. {
  100. if (re) {
  101. if (re->raw_re && re->raw_re != re->re) {
  102. #ifndef WITH_PCRE2
  103. #ifdef HAVE_PCRE_JIT
  104. if (re->raw_extra) {
  105. pcre_free_study (re->raw_extra);
  106. }
  107. #endif
  108. #else
  109. if (re->mcontext) {
  110. pcre2_match_context_free (re->mcontext);
  111. }
  112. #endif
  113. PCRE_FREE (re->raw_re);
  114. }
  115. if (re->re) {
  116. #ifndef WITH_PCRE2
  117. #ifdef HAVE_PCRE_JIT
  118. if (re->extra) {
  119. pcre_free_study (re->extra);
  120. }
  121. #endif
  122. #else
  123. if (re->raw_mcontext) {
  124. pcre2_match_context_free (re->raw_mcontext);
  125. }
  126. #endif
  127. PCRE_FREE (re->re);
  128. }
  129. if (re->pattern) {
  130. g_free (re->pattern);
  131. }
  132. g_free (re);
  133. }
  134. }
  135. static void
  136. rspamd_regexp_post_process (rspamd_regexp_t *r)
  137. {
  138. if (global_re_cache == NULL) {
  139. rspamd_regexp_library_init (NULL);
  140. }
  141. #if defined(WITH_PCRE2)
  142. gsize jsz;
  143. guint jit_flags = PCRE2_JIT_COMPLETE;
  144. /* Create match context */
  145. r->mcontext = pcre2_match_context_create (NULL);
  146. if (r->re != r->raw_re) {
  147. r->raw_mcontext = pcre2_match_context_create (NULL);
  148. }
  149. else {
  150. r->raw_mcontext = r->mcontext;
  151. }
  152. #ifdef HAVE_PCRE_JIT
  153. if (pcre2_jit_compile (r->re, jit_flags) < 0) {
  154. msg_err ("jit compilation of %s is not supported: %d", r->pattern, jit_flags);
  155. r->flags |= RSPAMD_REGEXP_FLAG_DISABLE_JIT;
  156. }
  157. else {
  158. if (!(pcre2_pattern_info (r->re, PCRE2_INFO_JITSIZE, &jsz) >= 0 && jsz > 0)) {
  159. msg_err ("jit compilation of %s is not supported", r->pattern);
  160. r->flags |= RSPAMD_REGEXP_FLAG_DISABLE_JIT;
  161. }
  162. }
  163. if (!(r->flags & RSPAMD_REGEXP_FLAG_DISABLE_JIT)) {
  164. pcre2_jit_stack_assign (r->mcontext, NULL, global_re_cache->jstack);
  165. }
  166. if (r->re != r->raw_re) {
  167. if (pcre2_jit_compile (r->raw_re, jit_flags) < 0) {
  168. msg_debug ("jit compilation of %s is not supported", r->pattern);
  169. r->flags |= RSPAMD_REGEXP_FLAG_DISABLE_JIT;
  170. }
  171. if (!(pcre2_pattern_info (r->raw_re, PCRE2_INFO_JITSIZE, &jsz) >= 0 && jsz > 0)) {
  172. msg_debug ("jit compilation of raw %s is not supported", r->pattern);
  173. }
  174. else if (!(r->flags & RSPAMD_REGEXP_FLAG_DISABLE_JIT)) {
  175. pcre2_jit_stack_assign (r->raw_mcontext, NULL, global_re_cache->jstack);
  176. }
  177. }
  178. #endif
  179. #else
  180. const gchar *err_str = "unknown";
  181. gboolean try_jit = TRUE, try_raw_jit = TRUE;
  182. gint study_flags = 0;
  183. #if defined(HAVE_PCRE_JIT)
  184. study_flags |= PCRE_STUDY_JIT_COMPILE;
  185. #endif
  186. /* Pcre 1 needs study */
  187. if (r->re) {
  188. r->extra = pcre_study (r->re, study_flags, &err_str);
  189. if (r->extra == NULL) {
  190. msg_debug ("cannot optimize regexp pattern: '%s': %s",
  191. r->pattern, err_str);
  192. try_jit = FALSE;
  193. r->flags |= RSPAMD_REGEXP_FLAG_DISABLE_JIT;
  194. }
  195. }
  196. else {
  197. g_assert_not_reached ();
  198. }
  199. if (r->raw_re && r->raw_re != r->re) {
  200. r->raw_extra = pcre_study (r->re, study_flags, &err_str);
  201. }
  202. else if (r->raw_re == r->re) {
  203. r->raw_extra = r->extra;
  204. }
  205. if (r->raw_extra == NULL) {
  206. msg_debug ("cannot optimize raw regexp pattern: '%s': %s",
  207. r->pattern, err_str);
  208. try_raw_jit = FALSE;
  209. }
  210. /* JIT path */
  211. if (try_jit) {
  212. #ifdef HAVE_PCRE_JIT
  213. gint jit, n;
  214. if (can_jit) {
  215. jit = 0;
  216. n = pcre_fullinfo (r->re, r->extra,
  217. PCRE_INFO_JIT, &jit);
  218. if (n != 0 || jit != 1) {
  219. msg_debug ("jit compilation of %s is not supported", r->pattern);
  220. r->flags |= RSPAMD_REGEXP_FLAG_DISABLE_JIT;
  221. }
  222. else {
  223. pcre_assign_jit_stack (r->extra, NULL, global_re_cache->jstack);
  224. }
  225. }
  226. #endif
  227. }
  228. else {
  229. msg_debug ("cannot optimize regexp pattern: '%s': %s",
  230. r->pattern, err_str);
  231. r->flags |= RSPAMD_REGEXP_FLAG_DISABLE_JIT;
  232. }
  233. if (try_raw_jit) {
  234. #ifdef HAVE_PCRE_JIT
  235. gint jit, n;
  236. if (can_jit) {
  237. if (r->raw_re != r->re) {
  238. jit = 0;
  239. n = pcre_fullinfo (r->raw_re, r->raw_extra,
  240. PCRE_INFO_JIT, &jit);
  241. if (n != 0 || jit != 1) {
  242. msg_debug ("jit compilation of %s is not supported", r->pattern);
  243. r->flags |= RSPAMD_REGEXP_FLAG_DISABLE_JIT;
  244. }
  245. else {
  246. pcre_assign_jit_stack (r->raw_extra, NULL,
  247. global_re_cache->jstack);
  248. }
  249. }
  250. }
  251. #endif
  252. }
  253. #endif /* WITH_PCRE2 */
  254. }
  255. rspamd_regexp_t*
  256. rspamd_regexp_new (const gchar *pattern, const gchar *flags,
  257. GError **err)
  258. {
  259. const gchar *start = pattern, *end, *flags_str = NULL;
  260. gchar *err_str;
  261. rspamd_regexp_t *res;
  262. gboolean explicit_utf = FALSE;
  263. PCRE_T *r;
  264. gchar sep = 0, *real_pattern;
  265. #ifndef WITH_PCRE2
  266. gint err_off;
  267. #else
  268. gsize err_off;
  269. #endif
  270. gint regexp_flags = 0, rspamd_flags = 0, err_code, ncaptures;
  271. gboolean strict_flags = FALSE;
  272. rspamd_regexp_library_init (NULL);
  273. if (flags == NULL) {
  274. /* We need to parse pattern and detect flags set */
  275. if (*start == '/') {
  276. sep = '/';
  277. }
  278. else if (*start == 'm') {
  279. start ++;
  280. sep = *start;
  281. /* Paired braces */
  282. if (sep == '{') {
  283. sep = '}';
  284. }
  285. rspamd_flags |= RSPAMD_REGEXP_FLAG_FULL_MATCH;
  286. }
  287. if (sep == '\0' || g_ascii_isalnum (sep)) {
  288. /* We have no flags, no separators and just use all line as expr */
  289. start = pattern;
  290. end = start + strlen (pattern);
  291. rspamd_flags &= ~RSPAMD_REGEXP_FLAG_FULL_MATCH;
  292. }
  293. else {
  294. end = strrchr (pattern, sep);
  295. if (end == NULL || end <= start) {
  296. g_set_error (err, rspamd_regexp_quark(), EINVAL,
  297. "pattern is not enclosed with %c: %s",
  298. sep, pattern);
  299. return NULL;
  300. }
  301. flags_str = end + 1;
  302. start ++;
  303. }
  304. }
  305. else {
  306. /* Strictly check all flags */
  307. strict_flags = TRUE;
  308. start = pattern;
  309. end = pattern + strlen (pattern);
  310. flags_str = flags;
  311. }
  312. rspamd_flags |= RSPAMD_REGEXP_FLAG_RAW;
  313. #ifndef WITH_PCRE2
  314. regexp_flags &= ~PCRE_FLAG(UTF8);
  315. regexp_flags |= PCRE_FLAG(NEWLINE_ANYCRLF);
  316. #else
  317. regexp_flags &= ~PCRE_FLAG(UTF);
  318. #endif
  319. if (flags_str != NULL) {
  320. while (*flags_str) {
  321. switch (*flags_str) {
  322. case 'i':
  323. regexp_flags |= PCRE_FLAG(CASELESS);
  324. break;
  325. case 'm':
  326. regexp_flags |= PCRE_FLAG(MULTILINE);
  327. break;
  328. case 's':
  329. regexp_flags |= PCRE_FLAG(DOTALL);
  330. break;
  331. case 'x':
  332. regexp_flags |= PCRE_FLAG(EXTENDED);
  333. break;
  334. case 'u':
  335. rspamd_flags &= ~RSPAMD_REGEXP_FLAG_RAW;
  336. rspamd_flags |= RSPAMD_REGEXP_FLAG_UTF;
  337. #ifndef WITH_PCRE2
  338. regexp_flags |= PCRE_FLAG(UTF8);
  339. #else
  340. regexp_flags |= PCRE_FLAG(UTF);
  341. #endif
  342. explicit_utf = TRUE;
  343. break;
  344. case 'O':
  345. /* We optimize all regexps by default */
  346. rspamd_flags |= RSPAMD_REGEXP_FLAG_NOOPT;
  347. break;
  348. case 'r':
  349. rspamd_flags |= RSPAMD_REGEXP_FLAG_RAW;
  350. rspamd_flags &= ~RSPAMD_REGEXP_FLAG_UTF;
  351. #ifndef WITH_PCRE2
  352. regexp_flags &= ~PCRE_FLAG(UTF8);
  353. #else
  354. regexp_flags &= ~PCRE_FLAG(UTF);
  355. #endif
  356. break;
  357. default:
  358. if (strict_flags) {
  359. g_set_error (err, rspamd_regexp_quark(), EINVAL,
  360. "invalid regexp flag: %c in pattern %s",
  361. *flags_str, pattern);
  362. return NULL;
  363. }
  364. msg_warn ("invalid flag '%c' in pattern %s", *flags_str, pattern);
  365. goto fin;
  366. break;
  367. }
  368. flags_str++;
  369. }
  370. }
  371. fin:
  372. real_pattern = g_malloc (end - start + 1);
  373. rspamd_strlcpy (real_pattern, start, end - start + 1);
  374. #ifndef WITH_PCRE2
  375. r = pcre_compile (real_pattern, regexp_flags,
  376. (const char **)&err_str, &err_off, NULL);
  377. (void)err_code;
  378. #else
  379. r = pcre2_compile (real_pattern, PCRE2_ZERO_TERMINATED,
  380. regexp_flags,
  381. &err_code, &err_off, pcre2_ctx);
  382. if (r == NULL) {
  383. err_str = g_alloca (1024);
  384. memset (err_str, 0, 1024);
  385. pcre2_get_error_message (err_code, err_str, 1024);
  386. }
  387. #endif
  388. if (r == NULL) {
  389. g_set_error (err, rspamd_regexp_quark(), EINVAL,
  390. "regexp parsing error: '%s' at position %d",
  391. err_str, (gint)err_off);
  392. g_free (real_pattern);
  393. return NULL;
  394. }
  395. /* Now allocate the target structure */
  396. res = g_malloc0 (sizeof (*res));
  397. REF_INIT_RETAIN (res, rspamd_regexp_dtor);
  398. res->flags = rspamd_flags;
  399. res->pattern = real_pattern;
  400. res->cache_id = RSPAMD_INVALID_ID;
  401. res->pcre_flags = regexp_flags;
  402. res->max_hits = 0;
  403. res->re = r;
  404. if (rspamd_flags & RSPAMD_REGEXP_FLAG_RAW) {
  405. res->raw_re = r;
  406. }
  407. else if (!explicit_utf) {
  408. #ifndef WITH_PCRE2
  409. res->raw_re = pcre_compile (real_pattern, regexp_flags & ~PCRE_FLAG(UTF8),
  410. (const char **)&err_str, &err_off, NULL);
  411. (void)err_code;
  412. #else
  413. res->raw_re = pcre2_compile (real_pattern, PCRE2_ZERO_TERMINATED,
  414. regexp_flags & ~PCRE_FLAG(UTF),
  415. &err_code, &err_off, pcre2_ctx);
  416. if (res->raw_re == NULL) {
  417. err_str = g_alloca (1024);
  418. memset (err_str, 0, 1024);
  419. pcre2_get_error_message (err_code, err_str, 1024);
  420. }
  421. #endif
  422. if (res->raw_re == NULL) {
  423. msg_warn ("raw regexp parsing error: '%s': '%s' at position %d",
  424. err_str, real_pattern, (gint)err_off);
  425. }
  426. }
  427. rspamd_regexp_post_process (res);
  428. rspamd_regexp_generate_id (pattern, flags, res->id);
  429. #ifndef WITH_PCRE2
  430. /* Check number of captures */
  431. if (pcre_fullinfo (res->raw_re, res->extra, PCRE_INFO_CAPTURECOUNT,
  432. &ncaptures) == 0) {
  433. res->ncaptures = ncaptures;
  434. }
  435. /* Check number of backrefs */
  436. if (pcre_fullinfo (res->raw_re, res->extra, PCRE_INFO_BACKREFMAX,
  437. &ncaptures) == 0) {
  438. res->nbackref = ncaptures;
  439. }
  440. #else
  441. /* Check number of captures */
  442. if (pcre2_pattern_info (res->raw_re, PCRE2_INFO_CAPTURECOUNT,
  443. &ncaptures) == 0) {
  444. res->ncaptures = ncaptures;
  445. }
  446. /* Check number of backrefs */
  447. if (pcre2_pattern_info (res->raw_re, PCRE2_INFO_BACKREFMAX,
  448. &ncaptures) == 0) {
  449. res->nbackref = ncaptures;
  450. }
  451. #endif
  452. return res;
  453. }
  454. #ifndef WITH_PCRE2
  455. gboolean
  456. rspamd_regexp_search (rspamd_regexp_t *re, const gchar *text, gsize len,
  457. const gchar **start, const gchar **end, gboolean raw,
  458. GArray *captures)
  459. {
  460. pcre *r;
  461. pcre_extra *ext;
  462. #if defined(HAVE_PCRE_JIT) && defined(HAVE_PCRE_JIT_FAST) && !defined(DISABLE_JIT_FAST)
  463. pcre_jit_stack *st = NULL;
  464. #endif
  465. const gchar *mt;
  466. gsize remain = 0;
  467. gint rc, match_flags = 0, *ovec, ncaptures, i;
  468. g_assert (re != NULL);
  469. g_assert (text != NULL);
  470. if (len == 0) {
  471. len = strlen (text);
  472. }
  473. if (end != NULL && *end != NULL) {
  474. /* Incremental search */
  475. mt = (*end);
  476. if ((gint)len > (mt - text)) {
  477. remain = len - (mt - text);
  478. }
  479. }
  480. else {
  481. mt = text;
  482. remain = len;
  483. }
  484. if (remain == 0) {
  485. return FALSE;
  486. }
  487. match_flags = PCRE_NEWLINE_ANYCRLF;
  488. if ((re->flags & RSPAMD_REGEXP_FLAG_RAW) || raw) {
  489. r = re->raw_re;
  490. ext = re->raw_extra;
  491. #if defined(HAVE_PCRE_JIT) && defined(HAVE_PCRE_JIT_FAST) && !defined(DISABLE_JIT_FAST)
  492. st = global_re_cache->jstack;
  493. #endif
  494. }
  495. else {
  496. r = re->re;
  497. ext = re->extra;
  498. #if defined(HAVE_PCRE_JIT) && defined(HAVE_PCRE_JIT_FAST) && !defined(DISABLE_JIT_FAST)
  499. if (g_utf8_validate (mt, remain, NULL)) {
  500. st = global_re_cache->jstack;
  501. }
  502. else {
  503. msg_err ("bad utf8 input for JIT re");
  504. return FALSE;
  505. }
  506. #endif
  507. }
  508. if (r == NULL) {
  509. /* Invalid regexp type for the specified input */
  510. return FALSE;
  511. }
  512. ncaptures = (re->ncaptures + 1) * 3;
  513. ovec = g_alloca (sizeof (gint) * ncaptures);
  514. if (!(re->flags & RSPAMD_REGEXP_FLAG_NOOPT)) {
  515. #ifdef HAVE_PCRE_JIT
  516. # if defined(HAVE_PCRE_JIT_FAST) && !defined(DISABLE_JIT_FAST)
  517. /* XXX: flags seems to be broken with jit fast path */
  518. g_assert (remain > 0);
  519. g_assert (mt != NULL);
  520. if (st != NULL && !(re->flags & RSPAMD_REGEXP_FLAG_DISABLE_JIT) && can_jit) {
  521. rc = pcre_jit_exec (r, ext, mt, remain, 0, 0, ovec,
  522. ncaptures, st);
  523. }
  524. else {
  525. rc = pcre_exec (r, ext, mt, remain, 0, match_flags, ovec,
  526. ncaptures);
  527. }
  528. # else
  529. rc = pcre_exec (r, ext, mt, remain, 0, match_flags, ovec,
  530. ncaptures);
  531. #endif
  532. #else
  533. rc = pcre_exec (r, ext, mt, remain, 0, match_flags, ovec,
  534. ncaptures);
  535. #endif
  536. }
  537. else {
  538. rc = pcre_exec (r, ext, mt, remain, 0, match_flags, ovec,
  539. ncaptures);
  540. }
  541. if (rc >= 0) {
  542. if (start) {
  543. *start = mt + ovec[0];
  544. }
  545. if (end) {
  546. *end = mt + ovec[1];
  547. }
  548. if (captures != NULL && rc > 1) {
  549. struct rspamd_re_capture *elt;
  550. g_assert (g_array_get_element_size (captures) ==
  551. sizeof (struct rspamd_re_capture));
  552. g_array_set_size (captures, rc);
  553. for (i = 0; i < rc; i ++) {
  554. elt = &g_array_index (captures, struct rspamd_re_capture, i);
  555. elt->p = mt + ovec[i * 2];
  556. elt->len = (mt + ovec[i * 2 + 1]) - elt->p;
  557. }
  558. }
  559. if (re->flags & RSPAMD_REGEXP_FLAG_FULL_MATCH) {
  560. /* We also ensure that the match is full */
  561. if (ovec[0] != 0 || (guint)ovec[1] < len) {
  562. return FALSE;
  563. }
  564. }
  565. return TRUE;
  566. }
  567. return FALSE;
  568. }
  569. #else
  570. /* PCRE 2 version */
  571. gboolean
  572. rspamd_regexp_search (rspamd_regexp_t *re, const gchar *text, gsize len,
  573. const gchar **start, const gchar **end, gboolean raw,
  574. GArray *captures)
  575. {
  576. pcre2_match_data *match_data;
  577. pcre2_match_context *mcontext;
  578. PCRE_T *r;
  579. const gchar *mt;
  580. gsize remain = 0, *ovec;
  581. gint rc, match_flags, novec, i;
  582. gboolean ret = FALSE;
  583. g_assert (re != NULL);
  584. g_assert (text != NULL);
  585. if (len == 0) {
  586. len = strlen (text);
  587. }
  588. if (end != NULL && *end != NULL) {
  589. /* Incremental search */
  590. mt = (*end);
  591. if ((gint)len > (mt - text)) {
  592. remain = len - (mt - text);
  593. }
  594. }
  595. else {
  596. mt = text;
  597. remain = len;
  598. }
  599. if (remain == 0) {
  600. return FALSE;
  601. }
  602. match_flags = 0;
  603. if (raw || re->re == re->raw_re) {
  604. r = re->raw_re;
  605. mcontext = re->raw_mcontext;
  606. }
  607. else {
  608. r = re->re;
  609. mcontext = re->mcontext;
  610. }
  611. match_data = pcre2_match_data_create (re->ncaptures + 1, NULL);
  612. #ifdef HAVE_PCRE_JIT
  613. if (!(re->flags & RSPAMD_REGEXP_FLAG_DISABLE_JIT) && can_jit) {
  614. if (re->re != re->raw_re && !g_utf8_validate (mt, remain, NULL)) {
  615. msg_err ("bad utf8 input for JIT re");
  616. return FALSE;
  617. }
  618. rc = pcre2_jit_match (r, mt, remain, 0, match_flags, match_data,
  619. mcontext);
  620. }
  621. else {
  622. rc = pcre2_match (r, mt, remain, 0, match_flags, match_data,
  623. mcontext);
  624. }
  625. #else
  626. rc = pcre2_match (r, mt, remain, 0, match_flags, match_data,
  627. mcontext);
  628. #endif
  629. if (rc >= 0) {
  630. novec = pcre2_get_ovector_count (match_data);
  631. ovec = pcre2_get_ovector_pointer (match_data);
  632. if (start) {
  633. *start = mt + ovec[0];
  634. }
  635. if (end) {
  636. *end = mt + ovec[1];
  637. }
  638. if (captures != NULL && novec > 1) {
  639. struct rspamd_re_capture *elt;
  640. g_assert (g_array_get_element_size (captures) ==
  641. sizeof (struct rspamd_re_capture));
  642. g_array_set_size (captures, novec);
  643. for (i = 0; i < novec; i ++) {
  644. elt = &g_array_index (captures, struct rspamd_re_capture, i);
  645. elt->p = mt + ovec[i * 2];
  646. elt->len = (mt + ovec[i * 2 + 1]) - elt->p;
  647. }
  648. }
  649. ret = TRUE;
  650. if (re->flags & RSPAMD_REGEXP_FLAG_FULL_MATCH) {
  651. /* We also ensure that the match is full */
  652. if (ovec[0] != 0 || (guint)ovec[1] < len) {
  653. ret = FALSE;
  654. }
  655. }
  656. }
  657. pcre2_match_data_free (match_data);
  658. return ret;
  659. }
  660. #endif
  661. const char*
  662. rspamd_regexp_get_pattern (rspamd_regexp_t *re)
  663. {
  664. g_assert (re != NULL);
  665. return re->pattern;
  666. }
  667. guint
  668. rspamd_regexp_set_flags (rspamd_regexp_t *re, guint new_flags)
  669. {
  670. guint old_flags;
  671. g_assert (re != NULL);
  672. old_flags = re->flags;
  673. re->flags = new_flags;
  674. return old_flags;
  675. }
  676. guint
  677. rspamd_regexp_get_flags (rspamd_regexp_t *re)
  678. {
  679. g_assert (re != NULL);
  680. return re->flags;
  681. }
  682. guint
  683. rspamd_regexp_get_pcre_flags (rspamd_regexp_t *re)
  684. {
  685. g_assert (re != NULL);
  686. return re->pcre_flags;
  687. }
  688. gint
  689. rspamd_regexp_get_nbackrefs (rspamd_regexp_t *re)
  690. {
  691. g_assert (re != NULL);
  692. return re->nbackref;
  693. }
  694. gint
  695. rspamd_regexp_get_ncaptures (rspamd_regexp_t *re)
  696. {
  697. g_assert (re != NULL);
  698. return re->ncaptures;
  699. }
  700. guint
  701. rspamd_regexp_get_maxhits (rspamd_regexp_t *re)
  702. {
  703. g_assert (re != NULL);
  704. return re->max_hits;
  705. }
  706. guint
  707. rspamd_regexp_set_maxhits (rspamd_regexp_t *re, guint new_maxhits)
  708. {
  709. guint old_hits;
  710. g_assert (re != NULL);
  711. old_hits = re->max_hits;
  712. re->max_hits = new_maxhits;
  713. return old_hits;
  714. }
  715. guint64
  716. rspamd_regexp_get_cache_id (rspamd_regexp_t *re)
  717. {
  718. g_assert (re != NULL);
  719. return re->cache_id;
  720. }
  721. guint64
  722. rspamd_regexp_set_cache_id (rspamd_regexp_t *re, guint64 id)
  723. {
  724. guint64 old;
  725. g_assert (re != NULL);
  726. old = re->cache_id;
  727. re->cache_id = id;
  728. return old;
  729. }
  730. gboolean
  731. rspamd_regexp_match (rspamd_regexp_t *re, const gchar *text, gsize len,
  732. gboolean raw)
  733. {
  734. const gchar *start = NULL, *end = NULL;
  735. g_assert (re != NULL);
  736. g_assert (text != NULL);
  737. if (rspamd_regexp_search (re, text, len, &start, &end, raw, NULL)) {
  738. if (start == text && end == text + len) {
  739. return TRUE;
  740. }
  741. }
  742. return FALSE;
  743. }
  744. void
  745. rspamd_regexp_unref (rspamd_regexp_t *re)
  746. {
  747. REF_RELEASE (re);
  748. }
  749. rspamd_regexp_t*
  750. rspamd_regexp_ref (rspamd_regexp_t *re)
  751. {
  752. g_assert (re != NULL);
  753. REF_RETAIN (re);
  754. return re;
  755. }
  756. void
  757. rspamd_regexp_set_ud (rspamd_regexp_t *re, gpointer ud)
  758. {
  759. g_assert (re != NULL);
  760. re->ud = ud;
  761. }
  762. gpointer
  763. rspamd_regexp_get_ud (rspamd_regexp_t *re)
  764. {
  765. g_assert (re != NULL);
  766. return re->ud;
  767. }
  768. gboolean
  769. rspamd_regexp_equal (gconstpointer a, gconstpointer b)
  770. {
  771. const guchar *ia = a, *ib = b;
  772. return (memcmp (ia, ib, sizeof (regexp_id_t)) == 0);
  773. }
  774. guint32
  775. rspamd_regexp_hash (gconstpointer a)
  776. {
  777. const guchar *ia = a;
  778. guint32 res;
  779. memcpy (&res, ia, sizeof (res));
  780. return res;
  781. }
  782. gboolean
  783. rspamd_regexp_cmp (gconstpointer a, gconstpointer b)
  784. {
  785. const guchar *ia = a, *ib = b;
  786. return memcmp (ia, ib, sizeof (regexp_id_t));
  787. }
  788. struct rspamd_regexp_cache*
  789. rspamd_regexp_cache_new (void)
  790. {
  791. struct rspamd_regexp_cache *ncache;
  792. ncache = g_malloc0 (sizeof (*ncache));
  793. ncache->tbl = g_hash_table_new_full (rspamd_regexp_hash, rspamd_regexp_equal,
  794. NULL, (GDestroyNotify)rspamd_regexp_unref);
  795. #ifdef HAVE_PCRE_JIT
  796. #ifdef WITH_PCRE2
  797. ncache->jstack = pcre2_jit_stack_create (32 * 1024, 1024 * 1024, NULL);
  798. #else
  799. ncache->jstack = pcre_jit_stack_alloc (32 * 1024, 1024 * 1024);
  800. #endif
  801. #endif
  802. return ncache;
  803. }
  804. rspamd_regexp_t*
  805. rspamd_regexp_cache_query (struct rspamd_regexp_cache* cache,
  806. const gchar *pattern,
  807. const gchar *flags)
  808. {
  809. rspamd_regexp_t *res = NULL;
  810. regexp_id_t id;
  811. if (cache == NULL) {
  812. rspamd_regexp_library_init (NULL);
  813. cache = global_re_cache;
  814. }
  815. g_assert (cache != NULL);
  816. rspamd_regexp_generate_id (pattern, flags, id);
  817. res = g_hash_table_lookup (cache->tbl, id);
  818. return res;
  819. }
  820. rspamd_regexp_t*
  821. rspamd_regexp_cache_create (struct rspamd_regexp_cache *cache,
  822. const gchar *pattern,
  823. const gchar *flags, GError **err)
  824. {
  825. rspamd_regexp_t *res;
  826. if (cache == NULL) {
  827. rspamd_regexp_library_init (NULL);
  828. cache = global_re_cache;
  829. }
  830. g_assert (cache != NULL);
  831. res = rspamd_regexp_cache_query (cache, pattern, flags);
  832. if (res != NULL) {
  833. return res;
  834. }
  835. res = rspamd_regexp_new (pattern, flags, err);
  836. if (res) {
  837. REF_RETAIN (res);
  838. g_hash_table_insert (cache->tbl, res->id, res);
  839. }
  840. return res;
  841. }
  842. void rspamd_regexp_cache_insert (struct rspamd_regexp_cache* cache,
  843. const gchar *pattern,
  844. const gchar *flags, rspamd_regexp_t *re)
  845. {
  846. g_assert (re != NULL);
  847. g_assert (pattern != NULL);
  848. if (cache == NULL) {
  849. rspamd_regexp_library_init (NULL);
  850. cache = global_re_cache;
  851. }
  852. g_assert (cache != NULL);
  853. /* Generate custom id */
  854. rspamd_regexp_generate_id (pattern, flags, re->id);
  855. REF_RETAIN (re);
  856. g_hash_table_insert (cache->tbl, re->id, re);
  857. }
  858. gboolean
  859. rspamd_regexp_cache_remove (struct rspamd_regexp_cache *cache,
  860. rspamd_regexp_t *re)
  861. {
  862. if (cache == NULL) {
  863. cache = global_re_cache;
  864. }
  865. g_assert (cache != NULL);
  866. g_assert (re != NULL);
  867. return g_hash_table_remove (cache->tbl, re->id);
  868. }
  869. void
  870. rspamd_regexp_cache_destroy (struct rspamd_regexp_cache *cache)
  871. {
  872. if (cache != NULL) {
  873. g_hash_table_destroy (cache->tbl);
  874. #ifdef HAVE_PCRE_JIT
  875. #ifdef WITH_PCRE2
  876. if (cache->jstack) {
  877. pcre2_jit_stack_free (cache->jstack);
  878. }
  879. #else
  880. if (cache->jstack) {
  881. pcre_jit_stack_free (cache->jstack);
  882. }
  883. #endif
  884. #endif
  885. }
  886. }
  887. void
  888. rspamd_regexp_library_init (struct rspamd_config *cfg)
  889. {
  890. if (cfg) {
  891. if (cfg->disable_pcre_jit) {
  892. can_jit = FALSE;
  893. check_jit = FALSE;
  894. }
  895. }
  896. if (global_re_cache == NULL) {
  897. global_re_cache = rspamd_regexp_cache_new ();
  898. #ifdef HAVE_PCRE_JIT
  899. gint jit, rc;
  900. gchar *str;
  901. if (check_jit) {
  902. #ifdef WITH_PCRE2
  903. pcre2_ctx = pcre2_compile_context_create (NULL);
  904. pcre2_set_newline (pcre2_ctx, PCRE_FLAG(NEWLINE_ANY));
  905. #endif
  906. #ifndef WITH_PCRE2
  907. rc = pcre_config (PCRE_CONFIG_JIT, &jit);
  908. #else
  909. rc = pcre2_config (PCRE2_CONFIG_JIT, &jit);
  910. #endif
  911. if (rc == 0 && jit == 1) {
  912. #ifndef WITH_PCRE2
  913. #ifdef PCRE_CONFIG_JITTARGET
  914. pcre_config (PCRE_CONFIG_JITTARGET, &str);
  915. msg_info ("pcre is compiled with JIT for %s", str);
  916. #else
  917. msg_info ("pcre is compiled with JIT for unknown target");
  918. #endif
  919. #else
  920. rc = pcre2_config (PCRE2_CONFIG_JITTARGET, NULL);
  921. if (rc > 0) {
  922. str = g_alloca (rc);
  923. pcre2_config (PCRE2_CONFIG_JITTARGET, str);
  924. msg_info ("pcre2 is compiled with JIT for %s", str);
  925. }
  926. else {
  927. msg_info ("pcre2 is compiled with JIT for unknown");
  928. }
  929. #endif /* WITH_PCRE2 */
  930. if (getenv ("VALGRIND") == NULL) {
  931. can_jit = TRUE;
  932. } else {
  933. msg_info ("disabling PCRE jit as it does not play well with valgrind");
  934. can_jit = FALSE;
  935. }
  936. } else {
  937. msg_info ("pcre is compiled without JIT support, so many optimizations"
  938. " are impossible");
  939. }
  940. }
  941. #else
  942. msg_info ("pcre is too old and has no JIT support, so many optimizations"
  943. " are impossible");
  944. #endif
  945. }
  946. }
  947. void
  948. rspamd_regexp_library_finalize (void)
  949. {
  950. if (global_re_cache != NULL) {
  951. rspamd_regexp_cache_destroy (global_re_cache);
  952. #ifdef WITH_PCRE2
  953. pcre2_compile_context_free (pcre2_ctx);
  954. #endif
  955. }
  956. }
  957. gpointer
  958. rspamd_regexp_get_id (rspamd_regexp_t *re)
  959. {
  960. g_assert (re != NULL);
  961. return re->id;
  962. }
  963. gpointer
  964. rspamd_regexp_get_class (rspamd_regexp_t *re)
  965. {
  966. g_assert (re != NULL);
  967. return re->re_class;
  968. }
  969. gpointer
  970. rspamd_regexp_set_class (rspamd_regexp_t *re, gpointer re_class)
  971. {
  972. gpointer old_class;
  973. g_assert (re != NULL);
  974. old_class = re->re_class;
  975. re->re_class = re_class;
  976. return old_class;
  977. }
  978. rspamd_regexp_t *
  979. rspamd_regexp_from_glob (const gchar *gl, gsize sz, GError **err)
  980. {
  981. GString *out;
  982. rspamd_regexp_t *re;
  983. const gchar *end;
  984. gboolean escaping = FALSE;
  985. gint nbraces = 0;
  986. g_assert (gl != NULL);
  987. if (sz == 0) {
  988. sz = strlen (gl);
  989. }
  990. end = gl + sz;
  991. out = g_string_sized_new (sz + 2);
  992. g_string_append_c (out, '^');
  993. while (gl < end) {
  994. switch (*gl) {
  995. case '*':
  996. if (escaping) {
  997. g_string_append (out, "\\*");
  998. }
  999. else {
  1000. g_string_append (out, ".*");
  1001. }
  1002. escaping = FALSE;
  1003. break;
  1004. case '?':
  1005. if (escaping) {
  1006. g_string_append (out, "\\?");
  1007. }
  1008. else {
  1009. g_string_append (out, ".");
  1010. }
  1011. escaping = FALSE;
  1012. break;
  1013. case '.':
  1014. case '(':
  1015. case ')':
  1016. case '+':
  1017. case '|':
  1018. case '^':
  1019. case '$':
  1020. case '@':
  1021. case '%':
  1022. g_string_append_c (out, '\\');
  1023. g_string_append_c (out, *gl);
  1024. escaping = FALSE;
  1025. break;
  1026. case '\\':
  1027. if (escaping) {
  1028. g_string_append (out, "\\\\");
  1029. escaping = FALSE;
  1030. }
  1031. else {
  1032. escaping = TRUE;
  1033. }
  1034. break;
  1035. case '{':
  1036. if (escaping) {
  1037. g_string_append (out, "\\{");
  1038. }
  1039. else {
  1040. g_string_append_c (out, '(');
  1041. nbraces++;
  1042. }
  1043. escaping = FALSE;
  1044. break;
  1045. case '}':
  1046. if (nbraces > 0 && !escaping) {
  1047. g_string_append_c (out, ')');
  1048. nbraces--;
  1049. }
  1050. else if (escaping) {
  1051. g_string_append (out, "\\}");
  1052. }
  1053. else {
  1054. g_string_append (out, "}");
  1055. }
  1056. escaping = FALSE;
  1057. break;
  1058. case ',':
  1059. if (nbraces > 0 && !escaping) {
  1060. g_string_append_c (out, '|');
  1061. }
  1062. else if (escaping) {
  1063. g_string_append (out, "\\,");
  1064. }
  1065. else {
  1066. g_string_append_c (out, ',');
  1067. }
  1068. break;
  1069. default:
  1070. escaping = FALSE;
  1071. g_string_append_c (out, *gl);
  1072. break;
  1073. }
  1074. gl ++;
  1075. }
  1076. g_string_append_c (out, '$');
  1077. re = rspamd_regexp_new (out->str, "i", err);
  1078. g_string_free (out, TRUE);
  1079. return re;
  1080. }