You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

multipattern.c 18KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830
  1. /*
  2. * Copyright 2024 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "libutil/multipattern.h"
  18. #include "libutil/str_util.h"
  19. #include "libcryptobox/cryptobox.h"
  20. #ifdef WITH_HYPERSCAN
  21. #include "logger.h"
  22. #include "unix-std.h"
  23. #include "hs.h"
  24. #include "libserver/hyperscan_tools.h"
  25. #endif
  26. #include "acism.h"
  27. #include "libutil/regexp.h"
  28. #include <stdalign.h>
  /* Number of hyperscan scratch areas allocated for every multipattern */
  #define MAX_SCRATCH 4

  /* Outcome of the lazy check performed by rspamd_hs_check() */
  enum rspamd_hs_check_state {
      RSPAMD_HS_UNCHECKED = 0, /* the platform has not been probed yet */
      RSPAMD_HS_SUPPORTED,     /* hs_valid_platform() reported success */
      RSPAMD_HS_UNSUPPORTED    /* hs_valid_platform() reported a failure */
  };

  /* Cache directory passed to rspamd_multipattern_library_init(); NULL disables caching */
  static const char *hs_cache_dir = NULL;
  /* Memoised result of the platform probe */
  static enum rspamd_hs_check_state hs_suitable_cpu = RSPAMD_HS_UNCHECKED;
  /*
   * A set of patterns that is matched against a text in one pass.
   * Depending on rspamd_hs_check() either the hyperscan members or the
   * acism/regexp members are used; the other group stays zeroed.
   */
  struct RSPAMD_ALIGNED(64) rspamd_multipattern {
  #ifdef WITH_HYPERSCAN
      /* Running hash of all added patterns, finalised on compile to name the cache file */
      rspamd_cryptobox_hash_state_t hash_state;
      /* Compiled (or loaded from cache) hyperscan database */
      rspamd_hyperscan_t *hs_db;
      /* Scratch areas allocated on compile */
      hs_scratch_t *scratch[MAX_SCRATCH];
      /* Escaped patterns (char *), one per added pattern */
      GArray *hs_pats;
      /* Pattern identifiers (int), equal to the insertion index */
      GArray *hs_ids;
      /* Hyperscan HS_FLAG_* values (int), one per pattern */
      GArray *hs_flags;
      /* Bitmask of scratch[] slots that are currently used by a lookup */
      unsigned int scratch_used;
  #endif
      /* Aho-Corasick trie built on compile for plain patterns */
      ac_trie_t *t;
      /* Filtered patterns (ac_trie_pat_t) for the non-hyperscan path */
      GArray *pats;
      /* Compiled regexps (rspamd_regexp_t *) for the glob/regexp fallback */
      GArray *res;
      /* TRUE once rspamd_multipattern_compile() has succeeded */
      gboolean compiled;
      /* Number of patterns added so far */
      unsigned int cnt;
      /* Flags passed on creation */
      enum rspamd_multipattern_flags flags;
  };
  54. static GQuark
  55. rspamd_multipattern_quark(void)
  56. {
  57. return g_quark_from_static_string("multipattern");
  58. }
  59. static inline gboolean
  60. rspamd_hs_check(void)
  61. {
  62. #ifdef WITH_HYPERSCAN
  63. if (G_UNLIKELY(hs_suitable_cpu == RSPAMD_HS_UNCHECKED)) {
  64. if (hs_valid_platform() == HS_SUCCESS) {
  65. hs_suitable_cpu = RSPAMD_HS_SUPPORTED;
  66. }
  67. else {
  68. hs_suitable_cpu = RSPAMD_HS_UNSUPPORTED;
  69. }
  70. }
  71. #endif
  72. return hs_suitable_cpu == RSPAMD_HS_SUPPORTED;
  73. }
  74. void rspamd_multipattern_library_init(const char *cache_dir)
  75. {
  76. hs_cache_dir = cache_dir;
  77. #ifdef WITH_HYPERSCAN
  78. rspamd_hs_check();
  79. #endif
  80. }
  81. #ifdef WITH_HYPERSCAN
  82. static char *
  83. rspamd_multipattern_escape_tld_hyperscan(const char *pattern, gsize slen,
  84. gsize *dst_len)
  85. {
  86. gsize len;
  87. const char *p, *prefix, *suffix;
  88. char *res;
  89. /*
  90. * We understand the following cases
  91. * 1) blah -> .blah\b
  92. * 2) *.blah -> ..*\\.blah\b|$
  93. * 3) ???
  94. */
  95. if (pattern[0] == '*') {
  96. p = strchr(pattern, '.');
  97. if (p == NULL) {
  98. /* XXX: bad */
  99. p = pattern;
  100. }
  101. else {
  102. p++;
  103. }
  104. prefix = "\\.";
  105. len = slen + strlen(prefix);
  106. }
  107. else {
  108. prefix = "\\.";
  109. p = pattern;
  110. len = slen + strlen(prefix);
  111. }
  112. suffix = "(:?\\b|$)";
  113. len += strlen(suffix);
  114. res = g_malloc(len + 1);
  115. slen = rspamd_strlcpy(res, prefix, len + 1);
  116. slen += rspamd_strlcpy(res + slen, p, len + 1 - slen);
  117. slen += rspamd_strlcpy(res + slen, suffix, len + 1 - slen);
  118. *dst_len = slen;
  119. return res;
  120. }
  121. #endif
  122. static char *
  123. rspamd_multipattern_escape_tld_acism(const char *pattern, gsize len,
  124. gsize *dst_len)
  125. {
  126. gsize dlen, slen;
  127. const char *p, *prefix;
  128. char *res;
  129. /*
  130. * We understand the following cases
  131. * 1) blah -> \\.blah
  132. * 2) *.blah -> \\..*\\.blah
  133. * 3) ???
  134. */
  135. slen = len;
  136. if (pattern[0] == '*') {
  137. dlen = slen;
  138. p = memchr(pattern, '.', len);
  139. if (p == NULL) {
  140. /* XXX: bad */
  141. p = pattern;
  142. }
  143. else {
  144. p++;
  145. }
  146. dlen -= p - pattern;
  147. prefix = ".";
  148. dlen++;
  149. }
  150. else {
  151. dlen = slen + 1;
  152. prefix = ".";
  153. p = pattern;
  154. }
  155. res = g_malloc(dlen + 1);
  156. slen = strlen(prefix);
  157. memcpy(res, prefix, slen);
  158. rspamd_strlcpy(res + slen, p, dlen - slen + 1);
  159. *dst_len = dlen;
  160. return res;
  161. }
  162. /*
  163. * Escapes special characters from specific pattern
  164. */
  165. static char *
  166. rspamd_multipattern_pattern_filter(const char *pattern, gsize len,
  167. enum rspamd_multipattern_flags flags,
  168. gsize *dst_len)
  169. {
  170. char *ret = NULL;
  171. int gl_flags = RSPAMD_REGEXP_ESCAPE_ASCII;
  172. if (flags & RSPAMD_MULTIPATTERN_UTF8) {
  173. gl_flags |= RSPAMD_REGEXP_ESCAPE_UTF;
  174. }
  175. #ifdef WITH_HYPERSCAN
  176. if (rspamd_hs_check()) {
  177. if (flags & RSPAMD_MULTIPATTERN_TLD) {
  178. char *tmp;
  179. gsize tlen;
  180. tmp = rspamd_multipattern_escape_tld_hyperscan(pattern, len, &tlen);
  181. ret = rspamd_str_regexp_escape(tmp, tlen, dst_len,
  182. gl_flags | RSPAMD_REGEXP_ESCAPE_RE);
  183. g_free(tmp);
  184. }
  185. else if (flags & RSPAMD_MULTIPATTERN_RE) {
  186. ret = rspamd_str_regexp_escape(pattern, len, dst_len, gl_flags | RSPAMD_REGEXP_ESCAPE_RE);
  187. }
  188. else if (flags & RSPAMD_MULTIPATTERN_GLOB) {
  189. ret = rspamd_str_regexp_escape(pattern, len, dst_len,
  190. gl_flags | RSPAMD_REGEXP_ESCAPE_GLOB);
  191. }
  192. else {
  193. ret = rspamd_str_regexp_escape(pattern, len, dst_len, gl_flags);
  194. }
  195. return ret;
  196. }
  197. #endif
  198. if (flags & RSPAMD_MULTIPATTERN_TLD) {
  199. ret = rspamd_multipattern_escape_tld_acism(pattern, len, dst_len);
  200. }
  201. else if (flags & RSPAMD_MULTIPATTERN_RE) {
  202. ret = rspamd_str_regexp_escape(pattern, len, dst_len, gl_flags | RSPAMD_REGEXP_ESCAPE_RE);
  203. }
  204. else if (flags & RSPAMD_MULTIPATTERN_GLOB) {
  205. ret = rspamd_str_regexp_escape(pattern, len, dst_len,
  206. gl_flags | RSPAMD_REGEXP_ESCAPE_GLOB);
  207. }
  208. else {
  209. ret = malloc(len + 1);
  210. *dst_len = rspamd_strlcpy(ret, pattern, len + 1);
  211. }
  212. return ret;
  213. }
  214. struct rspamd_multipattern *
  215. rspamd_multipattern_create(enum rspamd_multipattern_flags flags)
  216. {
  217. struct rspamd_multipattern *mp;
  218. /* Align due to blake2b state */
  219. (void) !posix_memalign((void **) &mp, RSPAMD_ALIGNOF(struct rspamd_multipattern),
  220. sizeof(*mp));
  221. g_assert(mp != NULL);
  222. memset(mp, 0, sizeof(*mp));
  223. mp->flags = flags;
  224. #ifdef WITH_HYPERSCAN
  225. if (rspamd_hs_check()) {
  226. mp->hs_pats = g_array_new(FALSE, TRUE, sizeof(char *));
  227. mp->hs_flags = g_array_new(FALSE, TRUE, sizeof(int));
  228. mp->hs_ids = g_array_new(FALSE, TRUE, sizeof(int));
  229. rspamd_cryptobox_hash_init(&mp->hash_state, NULL, 0);
  230. return mp;
  231. }
  232. #endif
  233. mp->pats = g_array_new(FALSE, TRUE, sizeof(ac_trie_pat_t));
  234. return mp;
  235. }
  236. struct rspamd_multipattern *
  237. rspamd_multipattern_create_sized(unsigned int npatterns,
  238. enum rspamd_multipattern_flags flags)
  239. {
  240. struct rspamd_multipattern *mp;
  241. /* Align due to blake2b state */
  242. (void) !posix_memalign((void **) &mp, RSPAMD_ALIGNOF(struct rspamd_multipattern), sizeof(*mp));
  243. g_assert(mp != NULL);
  244. memset(mp, 0, sizeof(*mp));
  245. mp->flags = flags;
  246. #ifdef WITH_HYPERSCAN
  247. if (rspamd_hs_check()) {
  248. mp->hs_pats = g_array_sized_new(FALSE, TRUE, sizeof(char *), npatterns);
  249. mp->hs_flags = g_array_sized_new(FALSE, TRUE, sizeof(int), npatterns);
  250. mp->hs_ids = g_array_sized_new(FALSE, TRUE, sizeof(int), npatterns);
  251. rspamd_cryptobox_hash_init(&mp->hash_state, NULL, 0);
  252. return mp;
  253. }
  254. #endif
  255. mp->pats = g_array_sized_new(FALSE, TRUE, sizeof(ac_trie_pat_t), npatterns);
  256. return mp;
  257. }
  258. void rspamd_multipattern_add_pattern(struct rspamd_multipattern *mp,
  259. const char *pattern, int flags)
  260. {
  261. g_assert(pattern != NULL);
  262. rspamd_multipattern_add_pattern_len(mp, pattern, strlen(pattern), flags);
  263. }
  264. void rspamd_multipattern_add_pattern_len(struct rspamd_multipattern *mp,
  265. const char *pattern, gsize patlen, int flags)
  266. {
  267. gsize dlen;
  268. g_assert(pattern != NULL);
  269. g_assert(mp != NULL);
  270. g_assert(!mp->compiled);
  271. #ifdef WITH_HYPERSCAN
  272. if (rspamd_hs_check()) {
  273. char *np;
  274. int fl = HS_FLAG_SOM_LEFTMOST;
  275. int adjusted_flags = mp->flags | flags;
  276. if (adjusted_flags & RSPAMD_MULTIPATTERN_ICASE) {
  277. fl |= HS_FLAG_CASELESS;
  278. }
  279. if (adjusted_flags & RSPAMD_MULTIPATTERN_UTF8) {
  280. if (adjusted_flags & RSPAMD_MULTIPATTERN_TLD) {
  281. fl |= HS_FLAG_UTF8;
  282. }
  283. else {
  284. fl |= HS_FLAG_UTF8 | HS_FLAG_UCP;
  285. }
  286. }
  287. if (adjusted_flags & RSPAMD_MULTIPATTERN_DOTALL) {
  288. fl |= HS_FLAG_DOTALL;
  289. }
  290. if (adjusted_flags & RSPAMD_MULTIPATTERN_SINGLEMATCH) {
  291. fl |= HS_FLAG_SINGLEMATCH;
  292. fl &= ~HS_FLAG_SOM_LEFTMOST; /* According to hyperscan docs */
  293. }
  294. if (adjusted_flags & RSPAMD_MULTIPATTERN_NO_START) {
  295. fl &= ~HS_FLAG_SOM_LEFTMOST;
  296. }
  297. g_array_append_val(mp->hs_flags, fl);
  298. np = rspamd_multipattern_pattern_filter(pattern, patlen, flags, &dlen);
  299. g_array_append_val(mp->hs_pats, np);
  300. fl = mp->cnt;
  301. g_array_append_val(mp->hs_ids, fl);
  302. rspamd_cryptobox_hash_update(&mp->hash_state, np, dlen);
  303. mp->cnt++;
  304. return;
  305. }
  306. #endif
  307. ac_trie_pat_t pat;
  308. pat.ptr = rspamd_multipattern_pattern_filter(pattern, patlen, flags, &dlen);
  309. pat.len = dlen;
  310. g_array_append_val(mp->pats, pat);
  311. mp->cnt++;
  312. }
  313. struct rspamd_multipattern *
  314. rspamd_multipattern_create_full(const char **patterns,
  315. unsigned int npatterns, enum rspamd_multipattern_flags flags)
  316. {
  317. struct rspamd_multipattern *mp;
  318. unsigned int i;
  319. g_assert(npatterns > 0);
  320. g_assert(patterns != NULL);
  321. mp = rspamd_multipattern_create_sized(npatterns, flags);
  322. for (i = 0; i < npatterns; i++) {
  323. rspamd_multipattern_add_pattern(mp, patterns[i], flags);
  324. }
  325. return mp;
  326. }
  327. #ifdef WITH_HYPERSCAN
  328. static gboolean
  329. rspamd_multipattern_try_load_hs(struct rspamd_multipattern *mp,
  330. const unsigned char *hash)
  331. {
  332. char fp[PATH_MAX];
  333. if (hs_cache_dir == NULL) {
  334. return FALSE;
  335. }
  336. rspamd_snprintf(fp, sizeof(fp), "%s/%*xs.hsmp", hs_cache_dir,
  337. (int) rspamd_cryptobox_HASHBYTES / 2, hash);
  338. mp->hs_db = rspamd_hyperscan_maybe_load(fp, 0);
  339. return mp->hs_db != NULL;
  340. }
  341. static void
  342. rspamd_multipattern_try_save_hs(struct rspamd_multipattern *mp,
  343. const unsigned char *hash)
  344. {
  345. char fp[PATH_MAX], np[PATH_MAX];
  346. char *bytes = NULL;
  347. gsize len;
  348. int fd;
  349. if (hs_cache_dir == NULL) {
  350. return;
  351. }
  352. rspamd_snprintf(fp, sizeof(fp), "%s%shsmp-XXXXXXXXXXXXX", G_DIR_SEPARATOR_S,
  353. hs_cache_dir);
  354. if ((fd = g_mkstemp_full(fp, O_CREAT | O_EXCL | O_WRONLY, 00644)) != -1) {
  355. int ret;
  356. if ((ret = hs_serialize_database(rspamd_hyperscan_get_database(mp->hs_db), &bytes, &len)) == HS_SUCCESS) {
  357. if (write(fd, bytes, len) == -1) {
  358. msg_warn("cannot write hyperscan cache to %s: %s",
  359. fp, strerror(errno));
  360. unlink(fp);
  361. free(bytes);
  362. }
  363. else {
  364. free(bytes);
  365. fsync(fd);
  366. rspamd_snprintf(np, sizeof(np), "%s/%*xs.hsmp", hs_cache_dir,
  367. (int) rspamd_cryptobox_HASHBYTES / 2, hash);
  368. if (rename(fp, np) == -1) {
  369. msg_warn("cannot rename hyperscan cache from %s to %s: %s",
  370. fp, np, strerror(errno));
  371. unlink(fp);
  372. }
  373. else {
  374. rspamd_hyperscan_notice_known(np);
  375. }
  376. }
  377. }
  378. else {
  379. msg_warn("cannot serialize hyperscan cache to %s: error code %d",
  380. fp, ret);
  381. unlink(fp);
  382. }
  383. close(fd);
  384. }
  385. else {
  386. msg_warn("cannot open a temp file %s to write hyperscan cache: %s", fp, strerror(errno));
  387. }
  388. }
  389. #endif
  390. gboolean
  391. rspamd_multipattern_compile(struct rspamd_multipattern *mp, int flags, GError **err)
  392. {
  393. g_assert(mp != NULL);
  394. g_assert(!mp->compiled);
  395. #ifdef WITH_HYPERSCAN
  396. if (rspamd_hs_check()) {
  397. unsigned int i;
  398. hs_platform_info_t plt;
  399. hs_compile_error_t *hs_errors;
  400. unsigned char hash[rspamd_cryptobox_HASHBYTES];
  401. if (mp->cnt > 0) {
  402. g_assert(hs_populate_platform(&plt) == HS_SUCCESS);
  403. rspamd_cryptobox_hash_update(&mp->hash_state, (void *) &plt, sizeof(plt));
  404. rspamd_cryptobox_hash_final(&mp->hash_state, hash);
  405. if ((flags & RSPAMD_MULTIPATTERN_COMPILE_NO_FS) || !rspamd_multipattern_try_load_hs(mp, hash)) {
  406. hs_database_t *db = NULL;
  407. if (hs_compile_multi((const char *const *) mp->hs_pats->data,
  408. (const unsigned int *) mp->hs_flags->data,
  409. (const unsigned int *) mp->hs_ids->data,
  410. mp->cnt,
  411. HS_MODE_BLOCK,
  412. &plt,
  413. &db,
  414. &hs_errors) != HS_SUCCESS) {
  415. g_set_error(err, rspamd_multipattern_quark(), EINVAL,
  416. "cannot create tree of regexp when processing '%s': %s",
  417. g_array_index(mp->hs_pats, char *, hs_errors->expression),
  418. hs_errors->message);
  419. hs_free_compile_error(hs_errors);
  420. return FALSE;
  421. }
  422. if (!(flags & RSPAMD_MULTIPATTERN_COMPILE_NO_FS)) {
  423. if (hs_cache_dir != NULL) {
  424. char fpath[PATH_MAX];
  425. rspamd_snprintf(fpath, sizeof(fpath), "%s/%*xs.hsmp", hs_cache_dir,
  426. (int) rspamd_cryptobox_HASHBYTES / 2, hash);
  427. mp->hs_db = rspamd_hyperscan_from_raw_db(db, fpath);
  428. }
  429. else {
  430. /* Should not happen in the real life */
  431. mp->hs_db = rspamd_hyperscan_from_raw_db(db, NULL);
  432. }
  433. rspamd_multipattern_try_save_hs(mp, hash);
  434. }
  435. else {
  436. mp->hs_db = rspamd_hyperscan_from_raw_db(db, NULL);
  437. }
  438. }
  439. for (i = 0; i < MAX_SCRATCH; i++) {
  440. mp->scratch[i] = NULL;
  441. }
  442. for (i = 0; i < MAX_SCRATCH; i++) {
  443. int ret;
  444. if ((ret = hs_alloc_scratch(rspamd_hyperscan_get_database(mp->hs_db), &mp->scratch[i])) != HS_SUCCESS) {
  445. msg_err("cannot allocate scratch space for hyperscan: error code %d", ret);
  446. /* Clean all scratches that are non-NULL */
  447. for (int ii = 0; ii < MAX_SCRATCH; ii++) {
  448. if (mp->scratch[ii] != NULL) {
  449. hs_free_scratch(mp->scratch[ii]);
  450. }
  451. }
  452. g_set_error(err, rspamd_multipattern_quark(), EINVAL,
  453. "cannot allocate scratch space for hyperscan: error code %d", ret);
  454. rspamd_hyperscan_free(mp->hs_db, true);
  455. mp->hs_db = NULL;
  456. return FALSE;
  457. }
  458. }
  459. }
  460. mp->compiled = TRUE;
  461. return TRUE;
  462. }
  463. #endif
  464. if (mp->cnt > 0) {
  465. if (mp->flags & (RSPAMD_MULTIPATTERN_GLOB | RSPAMD_MULTIPATTERN_RE)) {
  466. /* Fallback to pcre... */
  467. rspamd_regexp_t *re;
  468. mp->res = g_array_sized_new(FALSE, TRUE,
  469. sizeof(rspamd_regexp_t *), mp->cnt);
  470. for (unsigned int i = 0; i < mp->cnt; i++) {
  471. const ac_trie_pat_t *pat;
  472. const char *pat_flags = NULL;
  473. if (mp->flags & RSPAMD_MULTIPATTERN_UTF8) {
  474. pat_flags = "u";
  475. }
  476. pat = &g_array_index(mp->pats, ac_trie_pat_t, i);
  477. re = rspamd_regexp_new(pat->ptr, pat_flags, err);
  478. if (re == NULL) {
  479. return FALSE;
  480. }
  481. g_array_append_val(mp->res, re);
  482. }
  483. }
  484. else {
  485. mp->t = acism_create((const ac_trie_pat_t *) mp->pats->data, mp->cnt);
  486. }
  487. }
  488. mp->compiled = TRUE;
  489. return TRUE;
  490. }
  491. struct rspamd_multipattern_cbdata {
  492. struct rspamd_multipattern *mp;
  493. const char *in;
  494. gsize len;
  495. rspamd_multipattern_cb_t cb;
  496. gpointer ud;
  497. unsigned int nfound;
  498. int ret;
  499. };
  500. #ifdef WITH_HYPERSCAN
  501. static int
  502. rspamd_multipattern_hs_cb(unsigned int id,
  503. unsigned long long from,
  504. unsigned long long to,
  505. unsigned int flags,
  506. void *ud)
  507. {
  508. struct rspamd_multipattern_cbdata *cbd = ud;
  509. int ret = 0;
  510. if (to > 0) {
  511. if (from == HS_OFFSET_PAST_HORIZON) {
  512. from = 0;
  513. }
  514. ret = cbd->cb(cbd->mp, id, from, to, cbd->in, cbd->len, cbd->ud);
  515. cbd->nfound++;
  516. cbd->ret = ret;
  517. }
  518. return ret;
  519. }
  520. #endif
  521. static int
  522. rspamd_multipattern_acism_cb(int strnum, int textpos, void *context)
  523. {
  524. struct rspamd_multipattern_cbdata *cbd = context;
  525. int ret;
  526. ac_trie_pat_t pat;
  527. pat = g_array_index(cbd->mp->pats, ac_trie_pat_t, strnum);
  528. ret = cbd->cb(cbd->mp, strnum, textpos - pat.len,
  529. textpos, cbd->in, cbd->len, cbd->ud);
  530. cbd->nfound++;
  531. cbd->ret = ret;
  532. return ret;
  533. }
  534. int rspamd_multipattern_lookup(struct rspamd_multipattern *mp,
  535. const char *in, gsize len, rspamd_multipattern_cb_t cb,
  536. gpointer ud, unsigned int *pnfound)
  537. {
  538. struct rspamd_multipattern_cbdata cbd;
  539. int ret = 0;
  540. g_assert(mp != NULL);
  541. if (mp->cnt == 0 || !mp->compiled || len == 0) {
  542. return 0;
  543. }
  544. cbd.mp = mp;
  545. cbd.in = in;
  546. cbd.len = len;
  547. cbd.cb = cb;
  548. cbd.ud = ud;
  549. cbd.nfound = 0;
  550. cbd.ret = 0;
  551. #ifdef WITH_HYPERSCAN
  552. if (rspamd_hs_check()) {
  553. hs_scratch_t *scr = NULL;
  554. unsigned int i;
  555. for (i = 0; i < MAX_SCRATCH; i++) {
  556. if (!(mp->scratch_used & (1 << i))) {
  557. mp->scratch_used |= (1 << i);
  558. scr = mp->scratch[i];
  559. break;
  560. }
  561. }
  562. g_assert(scr != NULL);
  563. ret = hs_scan(rspamd_hyperscan_get_database(mp->hs_db), in, len, 0, scr,
  564. rspamd_multipattern_hs_cb, &cbd);
  565. mp->scratch_used &= ~(1 << i);
  566. if (ret == HS_SUCCESS) {
  567. ret = 0;
  568. }
  569. else if (ret == HS_SCAN_TERMINATED) {
  570. ret = cbd.ret;
  571. }
  572. if (pnfound) {
  573. *pnfound = cbd.nfound;
  574. }
  575. return ret;
  576. }
  577. #endif
  578. int state = 0;
  579. if (mp->flags & (RSPAMD_MULTIPATTERN_GLOB | RSPAMD_MULTIPATTERN_RE)) {
  580. /* Terribly inefficient, but who cares - just use hyperscan */
  581. for (unsigned int i = 0; i < mp->cnt; i++) {
  582. rspamd_regexp_t *re = g_array_index(mp->res, rspamd_regexp_t *, i);
  583. const char *start = NULL, *end = NULL;
  584. while (rspamd_regexp_search(re,
  585. in,
  586. len,
  587. &start,
  588. &end,
  589. TRUE,
  590. NULL)) {
  591. if (start >= end) {
  592. /* We found all matches, so no more hits are possible (protect from empty patterns) */
  593. break;
  594. }
  595. if (rspamd_multipattern_acism_cb(i, end - in, &cbd)) {
  596. goto out;
  597. }
  598. }
  599. }
  600. out:
  601. ret = cbd.ret;
  602. if (pnfound) {
  603. *pnfound = cbd.nfound;
  604. }
  605. }
  606. else {
  607. /* Plain trie */
  608. ret = acism_lookup(mp->t, in, len, rspamd_multipattern_acism_cb, &cbd,
  609. &state, mp->flags & RSPAMD_MULTIPATTERN_ICASE);
  610. if (pnfound) {
  611. *pnfound = cbd.nfound;
  612. }
  613. }
  614. return ret;
  615. }
  616. void rspamd_multipattern_destroy(struct rspamd_multipattern *mp)
  617. {
  618. unsigned int i;
  619. if (mp) {
  620. #ifdef WITH_HYPERSCAN
  621. if (rspamd_hs_check()) {
  622. char *p;
  623. if (mp->compiled && mp->cnt > 0) {
  624. for (i = 0; i < MAX_SCRATCH; i++) {
  625. hs_free_scratch(mp->scratch[i]);
  626. }
  627. if (mp->hs_db) {
  628. rspamd_hyperscan_free(mp->hs_db, false);
  629. }
  630. }
  631. for (i = 0; i < mp->cnt; i++) {
  632. p = g_array_index(mp->hs_pats, char *, i);
  633. g_free(p);
  634. }
  635. g_array_free(mp->hs_pats, TRUE);
  636. g_array_free(mp->hs_ids, TRUE);
  637. g_array_free(mp->hs_flags, TRUE);
  638. free(mp); /* Due to posix_memalign */
  639. return;
  640. }
  641. #endif
  642. ac_trie_pat_t pat;
  643. if (mp->compiled && mp->cnt > 0) {
  644. acism_destroy(mp->t);
  645. }
  646. for (i = 0; i < mp->cnt; i++) {
  647. pat = g_array_index(mp->pats, ac_trie_pat_t, i);
  648. g_free((char *) pat.ptr);
  649. }
  650. g_array_free(mp->pats, TRUE);
  651. g_free(mp);
  652. }
  653. }
  654. const char *
  655. rspamd_multipattern_get_pattern(struct rspamd_multipattern *mp,
  656. unsigned int index)
  657. {
  658. g_assert(mp != NULL);
  659. g_assert(index < mp->cnt);
  660. #ifdef WITH_HYPERSCAN
  661. if (rspamd_hs_check()) {
  662. return g_array_index(mp->hs_pats, char *, index);
  663. }
  664. #endif
  665. ac_trie_pat_t pat;
  666. pat = g_array_index(mp->pats, ac_trie_pat_t, index);
  667. return pat.ptr;
  668. }
  669. unsigned int rspamd_multipattern_get_npatterns(struct rspamd_multipattern *mp)
  670. {
  671. g_assert(mp != NULL);
  672. return mp->cnt;
  673. }
  674. gboolean
  675. rspamd_multipattern_has_hyperscan(void)
  676. {
  677. return rspamd_hs_check();
  678. }