You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

multipattern.c 14KB


  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "libutil/multipattern.h"
  18. #include "libutil/str_util.h"
  19. #include "libcryptobox/cryptobox.h"
  20. #ifdef WITH_HYPERSCAN
  21. #include "logger.h"
  22. #include "unix-std.h"
  23. #include "hs.h"
  24. #endif
  25. #include "acism.h"
  /* Number of Hyperscan scratch slots kept in every multipattern object. */
  #define MAX_SCRATCH 4

  /* Cached verdict of the Hyperscan platform probe. */
  enum rspamd_hs_check_state {
      RSPAMD_HS_UNCHECKED = 0, /* the probe has not been run yet */
      RSPAMD_HS_SUPPORTED,     /* hs_valid_platform () reported success */
      RSPAMD_HS_UNSUPPORTED    /* hs_valid_platform () reported failure */
  };

  /* Probe result shared by all multipatterns; filled in lazily. */
  static enum rspamd_hs_check_state hs_suitable_cpu = RSPAMD_HS_UNCHECKED;

  /* Directory for serialized Hyperscan databases; NULL disables caching. */
  static const char *hs_cache_dir = NULL;
  34. struct rspamd_multipattern {
  35. #ifdef WITH_HYPERSCAN
  36. hs_database_t *db;
  37. hs_scratch_t *scratch[MAX_SCRATCH];
  38. GArray *hs_pats;
  39. GArray *hs_ids;
  40. GArray *hs_flags;
  41. rspamd_cryptobox_hash_state_t hash_state;
  42. guint scratch_used;
  43. #endif
  44. ac_trie_t *t;
  45. GArray *pats;
  46. gboolean compiled;
  47. guint cnt;
  48. enum rspamd_multipattern_flags flags;
  49. };
  50. static GQuark
  51. rspamd_multipattern_quark (void)
  52. {
  53. return g_quark_from_static_string ("multipattern");
  54. }
  55. static inline gboolean
  56. rspamd_hs_check (void)
  57. {
  58. #ifdef WITH_HYPERSCAN
  59. if (G_UNLIKELY (hs_suitable_cpu == RSPAMD_HS_UNCHECKED)) {
  60. if (hs_valid_platform () == HS_SUCCESS) {
  61. hs_suitable_cpu = RSPAMD_HS_SUPPORTED;
  62. }
  63. else {
  64. hs_suitable_cpu = RSPAMD_HS_UNSUPPORTED;
  65. }
  66. }
  67. #endif
  68. return hs_suitable_cpu == RSPAMD_HS_SUPPORTED;
  69. }
  70. void
  71. rspamd_multipattern_library_init (const gchar *cache_dir)
  72. {
  73. hs_cache_dir = cache_dir;
  74. #ifdef WITH_HYPERSCAN
  75. rspamd_hs_check ();
  76. #endif
  77. }
  #ifdef WITH_HYPERSCAN
  /*
   * Builds the intermediate form of a TLD pattern for the Hyperscan backend.
   * The caller is expected to pass the result through the regexp escaper.
   *
   * Cases handled:
   * 1) "blah"   -> ".blah"
   * 2) "*.blah" -> ".*blah" (everything up to and including the first dot
   *                is replaced by the ".*" prefix)
   * 3) "*blah"  -> ".*" followed by the whole pattern (no dot found)
   *
   * Only the first `slen` bytes of `pattern` are examined and copied, so the
   * input does not have to be NUL-terminated.
   *
   * @param pattern source pattern
   * @param slen length of the pattern in bytes
   * @param dst_len receives the length of the result (without the final NUL)
   * @return newly allocated string; the caller releases it with g_free
   */
  static gchar *
  rspamd_multipattern_escape_tld_hyperscan (const gchar *pattern, gsize slen,
          gsize *dst_len)
  {
      gsize len, tail_len;
      const gchar *p, *prefix;
      gchar *res;

      if (slen > 0 && pattern[0] == '*') {
          len = slen + 4;
          /* Bounded search: never look beyond the declared pattern length */
          p = memchr (pattern, '.', slen);

          if (p == NULL) {
              /* XXX: bad */
              p = pattern;
          }
          else {
              p ++;
          }

          prefix = ".*";
      }
      else {
          len = slen + 2;
          prefix = ".";
          p = pattern;
      }

      /* Number of pattern bytes that remain starting from p */
      tail_len = slen - (gsize)(p - pattern);

      res = g_malloc (len + 1);
      /*
       * NOTE(review): as in the original code, this relies on rspamd_strlcpy
       * copying at most size - 1 bytes, terminating the output and returning
       * the number of bytes copied.
       */
      slen = rspamd_strlcpy (res, prefix, len + 1);
      /* tail_len + 1 never exceeds the space left in res (len + 1 - slen) */
      slen += rspamd_strlcpy (res + slen, p, tail_len + 1);

      *dst_len = slen;

      return res;
  }
  #endif
  116. static gchar *
  117. rspamd_multipattern_escape_tld_acism (const gchar *pattern, gsize len,
  118. gsize *dst_len)
  119. {
  120. gsize dlen, slen;
  121. const gchar *p, *prefix;
  122. gchar *res;
  123. /*
  124. * We understand the following cases
  125. * 1) blah -> \\.blah
  126. * 2) *.blah -> \\..*\\.blah
  127. * 3) ???
  128. */
  129. slen = len;
  130. if (pattern[0] == '*') {
  131. dlen = slen;
  132. p = memchr (pattern, '.', len);
  133. if (p == NULL) {
  134. /* XXX: bad */
  135. p = pattern;
  136. }
  137. else {
  138. p ++;
  139. }
  140. dlen -= p - pattern;
  141. prefix = ".";
  142. dlen ++;
  143. }
  144. else {
  145. dlen = slen + 1;
  146. prefix = ".";
  147. p = pattern;
  148. }
  149. res = g_malloc (dlen + 1);
  150. slen = strlen (prefix);
  151. memcpy (res, prefix, slen);
  152. rspamd_strlcpy (res + slen, p, dlen - slen + 1);
  153. *dst_len = dlen;
  154. return res;
  155. }
  156. /*
  157. * Escapes special characters from specific pattern
  158. */
  159. static gchar *
  160. rspamd_multipattern_pattern_filter (const gchar *pattern, gsize len,
  161. enum rspamd_multipattern_flags flags,
  162. gsize *dst_len)
  163. {
  164. gchar *ret = NULL;
  165. #ifdef WITH_HYPERSCAN
  166. if (rspamd_hs_check ()) {
  167. gint gl_flags = RSPAMD_REGEXP_ESCAPE_ASCII;
  168. if (flags & RSPAMD_MULTIPATTERN_UTF8) {
  169. gl_flags |= RSPAMD_REGEXP_ESCAPE_UTF;
  170. }
  171. if (flags & RSPAMD_MULTIPATTERN_TLD) {
  172. gchar *tmp;
  173. gsize tlen;
  174. tmp = rspamd_multipattern_escape_tld_hyperscan (pattern, len, &tlen);
  175. ret = rspamd_str_regexp_escape (tmp, tlen, dst_len,
  176. gl_flags|RSPAMD_REGEXP_ESCAPE_GLOB);
  177. g_free (tmp);
  178. }
  179. else if (flags & RSPAMD_MULTIPATTERN_RE) {
  180. ret = malloc (len + 1);
  181. ret = rspamd_str_regexp_escape (pattern, len, dst_len, gl_flags |
  182. RSPAMD_REGEXP_ESCAPE_RE);
  183. }
  184. else if (flags & RSPAMD_MULTIPATTERN_GLOB) {
  185. ret = rspamd_str_regexp_escape (pattern, len, dst_len,
  186. gl_flags | RSPAMD_REGEXP_ESCAPE_GLOB);
  187. }
  188. else {
  189. ret = rspamd_str_regexp_escape (pattern, len, dst_len, gl_flags);
  190. }
  191. return ret;
  192. }
  193. #endif
  194. if (flags & RSPAMD_MULTIPATTERN_TLD) {
  195. ret = rspamd_multipattern_escape_tld_acism (pattern, len, dst_len);
  196. }
  197. else {
  198. ret = malloc (len + 1);
  199. *dst_len = rspamd_strlcpy (ret, pattern, len + 1);
  200. }
  201. return ret;
  202. }
  203. struct rspamd_multipattern *
  204. rspamd_multipattern_create (enum rspamd_multipattern_flags flags)
  205. {
  206. struct rspamd_multipattern *mp;
  207. mp = g_malloc0 (sizeof (*mp));
  208. mp->flags = flags;
  209. #ifdef WITH_HYPERSCAN
  210. if (rspamd_hs_check ()) {
  211. mp->hs_pats = g_array_new (FALSE, TRUE, sizeof (gchar *));
  212. mp->hs_flags = g_array_new (FALSE, TRUE, sizeof (gint));
  213. mp->hs_ids = g_array_new (FALSE, TRUE, sizeof (gint));
  214. rspamd_cryptobox_hash_init (&mp->hash_state, NULL, 0);
  215. return mp;
  216. }
  217. #endif
  218. mp->pats = g_array_new (FALSE, TRUE, sizeof (ac_trie_pat_t));
  219. return mp;
  220. }
  221. struct rspamd_multipattern *
  222. rspamd_multipattern_create_sized (guint npatterns,
  223. enum rspamd_multipattern_flags flags)
  224. {
  225. struct rspamd_multipattern *mp;
  226. mp = g_malloc0 (sizeof (*mp));
  227. mp->flags = flags;
  228. #ifdef WITH_HYPERSCAN
  229. if (rspamd_hs_check ()) {
  230. mp->hs_pats = g_array_sized_new (FALSE, TRUE, sizeof (gchar *), npatterns);
  231. mp->hs_flags = g_array_sized_new (FALSE, TRUE, sizeof (gint), npatterns);
  232. mp->hs_ids = g_array_sized_new (FALSE, TRUE, sizeof (gint), npatterns);
  233. rspamd_cryptobox_hash_init (&mp->hash_state, NULL, 0);
  234. return mp;
  235. }
  236. #endif
  237. mp->pats = g_array_sized_new (FALSE, TRUE, sizeof (ac_trie_pat_t), npatterns);
  238. return mp;
  239. }
  240. void
  241. rspamd_multipattern_add_pattern (struct rspamd_multipattern *mp,
  242. const gchar *pattern, gint flags)
  243. {
  244. g_assert (pattern != NULL);
  245. rspamd_multipattern_add_pattern_len (mp, pattern, strlen (pattern), flags);
  246. }
  247. void
  248. rspamd_multipattern_add_pattern_len (struct rspamd_multipattern *mp,
  249. const gchar *pattern, gsize patlen, gint flags)
  250. {
  251. gsize dlen;
  252. g_assert (pattern != NULL);
  253. g_assert (mp != NULL);
  254. g_assert (!mp->compiled);
  255. #ifdef WITH_HYPERSCAN
  256. if (rspamd_hs_check ()) {
  257. gchar *np;
  258. gint fl = HS_FLAG_SOM_LEFTMOST;
  259. if (mp->flags & RSPAMD_MULTIPATTERN_ICASE) {
  260. fl |= HS_FLAG_CASELESS;
  261. }
  262. if (mp->flags & RSPAMD_MULTIPATTERN_UTF8) {
  263. fl |= HS_FLAG_UTF8|HS_FLAG_UCP;
  264. }
  265. g_array_append_val (mp->hs_flags, fl);
  266. np = rspamd_multipattern_pattern_filter (pattern, patlen, flags, &dlen);
  267. g_array_append_val (mp->hs_pats, np);
  268. fl = mp->cnt;
  269. g_array_append_val (mp->hs_ids, fl);
  270. rspamd_cryptobox_hash_update (&mp->hash_state, np, dlen);
  271. mp->cnt ++;
  272. return;
  273. }
  274. #endif
  275. ac_trie_pat_t pat;
  276. pat.ptr = rspamd_multipattern_pattern_filter (pattern, patlen, flags, &dlen);
  277. pat.len = dlen;
  278. g_array_append_val (mp->pats, pat);
  279. mp->cnt ++;
  280. }
  281. struct rspamd_multipattern *
  282. rspamd_multipattern_create_full (const gchar **patterns,
  283. guint npatterns, enum rspamd_multipattern_flags flags)
  284. {
  285. struct rspamd_multipattern *mp;
  286. guint i;
  287. g_assert (npatterns > 0);
  288. g_assert (patterns != NULL);
  289. mp = rspamd_multipattern_create_sized (npatterns, flags);
  290. for (i = 0; i < npatterns; i++) {
  291. rspamd_multipattern_add_pattern (mp, patterns[i], flags);
  292. }
  293. return mp;
  294. }
  295. #ifdef WITH_HYPERSCAN
  296. static gboolean
  297. rspamd_multipattern_try_load_hs (struct rspamd_multipattern *mp,
  298. const guchar *hash)
  299. {
  300. gchar fp[PATH_MAX];
  301. gpointer map;
  302. gsize len;
  303. if (hs_cache_dir == NULL) {
  304. return FALSE;
  305. }
  306. rspamd_snprintf (fp, sizeof (fp), "%s/%*xs.hsmp", hs_cache_dir,
  307. (gint)rspamd_cryptobox_HASHBYTES / 2, hash);
  308. if ((map = rspamd_file_xmap (fp, PROT_READ, &len, TRUE)) != NULL) {
  309. if (hs_deserialize_database (map, len, &mp->db) == HS_SUCCESS) {
  310. munmap (map, len);
  311. return TRUE;
  312. }
  313. munmap (map, len);
  314. /* Remove stale file */
  315. (void)unlink (fp);
  316. }
  317. return FALSE;
  318. }
  319. static void
  320. rspamd_multipattern_try_save_hs (struct rspamd_multipattern *mp,
  321. const guchar *hash)
  322. {
  323. gchar fp[PATH_MAX], np[PATH_MAX];
  324. char *bytes = NULL;
  325. gsize len;
  326. gint fd;
  327. if (hs_cache_dir == NULL) {
  328. return;
  329. }
  330. rspamd_snprintf (fp, sizeof (fp), "%s/%*xs.hsmp.tmp", hs_cache_dir,
  331. (gint)rspamd_cryptobox_HASHBYTES / 2, hash);
  332. if ((fd = rspamd_file_xopen (fp, O_WRONLY | O_CREAT | O_EXCL, 00644, 0)) != -1) {
  333. if (hs_serialize_database (mp->db, &bytes, &len) == HS_SUCCESS) {
  334. if (write (fd, bytes, len) == -1) {
  335. msg_warn ("cannot write hyperscan cache to %s: %s",
  336. fp, strerror (errno));
  337. unlink (fp);
  338. free (bytes);
  339. }
  340. else {
  341. free (bytes);
  342. fsync (fd);
  343. rspamd_snprintf (np, sizeof (np), "%s/%*xs.hsmp", hs_cache_dir,
  344. (gint)rspamd_cryptobox_HASHBYTES / 2, hash);
  345. if (rename (fp, np) == -1) {
  346. msg_warn ("cannot rename hyperscan cache from %s to %s: %s",
  347. fp, np, strerror (errno));
  348. unlink (fp);
  349. }
  350. }
  351. }
  352. else {
  353. msg_warn ("cannot serialize hyperscan cache to %s: %s",
  354. fp, strerror (errno));
  355. unlink (fp);
  356. }
  357. close (fd);
  358. }
  359. }
  360. #endif
  361. gboolean
  362. rspamd_multipattern_compile (struct rspamd_multipattern *mp, GError **err)
  363. {
  364. g_assert (mp != NULL);
  365. g_assert (!mp->compiled);
  366. #ifdef WITH_HYPERSCAN
  367. if (rspamd_hs_check ()) {
  368. guint i;
  369. hs_platform_info_t plt;
  370. hs_compile_error_t *hs_errors;
  371. guchar hash[rspamd_cryptobox_HASHBYTES];
  372. if (mp->cnt > 0) {
  373. g_assert (hs_populate_platform (&plt) == HS_SUCCESS);
  374. rspamd_cryptobox_hash_update (&mp->hash_state, (void *)&plt, sizeof (plt));
  375. rspamd_cryptobox_hash_final (&mp->hash_state, hash);
  376. if (!rspamd_multipattern_try_load_hs (mp, hash)) {
  377. if (hs_compile_multi ((const char *const *)mp->hs_pats->data,
  378. (const unsigned int *)mp->hs_flags->data,
  379. (const unsigned int *)mp->hs_ids->data,
  380. mp->cnt,
  381. HS_MODE_BLOCK,
  382. &plt,
  383. &mp->db,
  384. &hs_errors) != HS_SUCCESS) {
  385. g_set_error (err, rspamd_multipattern_quark (), EINVAL,
  386. "cannot create tree of regexp when processing '%s': %s",
  387. g_array_index (mp->hs_pats, char *, hs_errors->expression),
  388. hs_errors->message);
  389. hs_free_compile_error (hs_errors);
  390. return FALSE;
  391. }
  392. }
  393. rspamd_multipattern_try_save_hs (mp, hash);
  394. for (i = 0; i < MAX_SCRATCH; i ++) {
  395. g_assert (hs_alloc_scratch (mp->db, &mp->scratch[i]) == HS_SUCCESS);
  396. }
  397. }
  398. mp->compiled = TRUE;
  399. return TRUE;
  400. }
  401. #endif
  402. if (mp->cnt > 0) {
  403. mp->t = acism_create ((const ac_trie_pat_t *)mp->pats->data, mp->cnt);
  404. }
  405. mp->compiled = TRUE;
  406. return TRUE;
  407. }
  408. struct rspamd_multipattern_cbdata {
  409. struct rspamd_multipattern *mp;
  410. const gchar *in;
  411. gsize len;
  412. rspamd_multipattern_cb_t cb;
  413. gpointer ud;
  414. guint nfound;
  415. gint ret;
  416. };
  #ifdef WITH_HYPERSCAN
  /*
   * Hyperscan match handler: forwards a match to the user callback.
   *
   * Matches that end at offset 0 are ignored. A start offset equal to
   * HS_OFFSET_PAST_HORIZON is reported as 0.
   *
   * @param id id of the matched pattern
   * @param from start offset of the match
   * @param to end offset of the match
   * @param flags unused
   * @param ud pointer to struct rspamd_multipattern_cbdata
   * @return value returned by the user callback, or 0 for an ignored match
   */
  static gint
  rspamd_multipattern_hs_cb (unsigned int id,
          unsigned long long from,
          unsigned long long to,
          unsigned int flags,
          void *ud)
  {
      struct rspamd_multipattern_cbdata *cbd = ud;
      unsigned long long start = from;
      gint rc;

      if (to == 0) {
          return 0;
      }

      if (start == HS_OFFSET_PAST_HORIZON) {
          start = 0;
      }

      rc = cbd->cb (cbd->mp, id, start, to, cbd->in, cbd->len, cbd->ud);
      cbd->ret = rc;
      cbd->nfound ++;

      return rc;
  }
  #endif
  438. static gint
  439. rspamd_multipattern_acism_cb (int strnum, int textpos, void *context)
  440. {
  441. struct rspamd_multipattern_cbdata *cbd = context;
  442. gint ret;
  443. ac_trie_pat_t pat;
  444. pat = g_array_index (cbd->mp->pats, ac_trie_pat_t, strnum);
  445. ret = cbd->cb (cbd->mp, strnum, textpos - pat.len,
  446. textpos, cbd->in, cbd->len, cbd->ud);
  447. cbd->nfound ++;
  448. cbd->ret = ret;
  449. return ret;
  450. }
  451. gint
  452. rspamd_multipattern_lookup (struct rspamd_multipattern *mp,
  453. const gchar *in, gsize len, rspamd_multipattern_cb_t cb,
  454. gpointer ud, guint *pnfound)
  455. {
  456. struct rspamd_multipattern_cbdata cbd;
  457. gint ret = 0;
  458. g_assert (mp != NULL);
  459. if (mp->cnt == 0 || !mp->compiled) {
  460. return 0;
  461. }
  462. cbd.mp = mp;
  463. cbd.in = in;
  464. cbd.len = len;
  465. cbd.cb = cb;
  466. cbd.ud = ud;
  467. cbd.nfound = 0;
  468. cbd.ret = 0;
  469. #ifdef WITH_HYPERSCAN
  470. if (rspamd_hs_check ()) {
  471. hs_scratch_t *scr = NULL;
  472. guint i;
  473. for (i = 0; i < MAX_SCRATCH; i ++) {
  474. if (!(mp->scratch_used & (1 << i))) {
  475. mp->scratch_used |= (1 << i);
  476. scr = mp->scratch[i];
  477. break;
  478. }
  479. }
  480. g_assert (scr != NULL);
  481. ret = hs_scan (mp->db, in, len, 0, scr,
  482. rspamd_multipattern_hs_cb, &cbd);
  483. mp->scratch_used &= ~(1 << i);
  484. if (ret == HS_SUCCESS) {
  485. ret = 0;
  486. }
  487. else if (ret == HS_SCAN_TERMINATED) {
  488. ret = cbd.ret;
  489. }
  490. if (pnfound) {
  491. *pnfound = cbd.nfound;
  492. }
  493. return ret;
  494. }
  495. #endif
  496. gint state = 0;
  497. ret = acism_lookup (mp->t, in, len, rspamd_multipattern_acism_cb, &cbd,
  498. &state, mp->flags & RSPAMD_MULTIPATTERN_ICASE);
  499. if (pnfound) {
  500. *pnfound = cbd.nfound;
  501. }
  502. return ret;
  503. }
  504. void
  505. rspamd_multipattern_destroy (struct rspamd_multipattern *mp)
  506. {
  507. guint i;
  508. if (mp) {
  509. #ifdef WITH_HYPERSCAN
  510. if (rspamd_hs_check ()) {
  511. gchar *p;
  512. if (mp->compiled && mp->cnt > 0) {
  513. for (i = 0; i < MAX_SCRATCH; i ++) {
  514. hs_free_scratch (mp->scratch[i]);
  515. }
  516. hs_free_database (mp->db);
  517. }
  518. for (i = 0; i < mp->cnt; i ++) {
  519. p = g_array_index (mp->hs_pats, gchar *, i);
  520. g_free (p);
  521. }
  522. g_array_free (mp->hs_pats, TRUE);
  523. g_array_free (mp->hs_ids, TRUE);
  524. g_array_free (mp->hs_flags, TRUE);
  525. g_free (mp);
  526. return;
  527. }
  528. #endif
  529. ac_trie_pat_t pat;
  530. if (mp->compiled && mp->cnt > 0) {
  531. acism_destroy (mp->t);
  532. }
  533. for (i = 0; i < mp->cnt; i ++) {
  534. pat = g_array_index (mp->pats, ac_trie_pat_t, i);
  535. g_free ((gchar *)pat.ptr);
  536. }
  537. g_array_free (mp->pats, TRUE);
  538. g_free (mp);
  539. }
  540. }
  541. const gchar*
  542. rspamd_multipattern_get_pattern (struct rspamd_multipattern *mp,
  543. guint index)
  544. {
  545. g_assert (mp != NULL);
  546. g_assert (index < mp->cnt);
  547. #ifdef WITH_HYPERSCAN
  548. if (rspamd_hs_check ()) {
  549. return g_array_index (mp->hs_pats, gchar *, index);
  550. }
  551. #endif
  552. ac_trie_pat_t pat;
  553. pat = g_array_index (mp->pats, ac_trie_pat_t, index);
  554. return pat.ptr;
  555. }
  556. guint
  557. rspamd_multipattern_get_npatterns (struct rspamd_multipattern *mp)
  558. {
  559. g_assert (mp != NULL);
  560. return mp->cnt;
  561. }
  562. gboolean
  563. rspamd_multipattern_has_hyperscan (void)
  564. {
  565. return rspamd_hs_check ();
  566. }