/*- * Copyright 2016 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "libmime/message.h" #include "re_cache.h" #include "cryptobox.h" #include "ref.h" #include "libserver/url.h" #include "libserver/task.h" #include "libserver/cfg_file.h" #include "libutil/util.h" #include "libutil/regexp.h" #include "lua/lua_common.h" #ifdef WITH_HYPERSCAN #include "hs.h" #include "unix-std.h" #include #ifndef WITH_PCRE2 #include #else #include #endif #ifdef HAVE_SYS_WAIT_H #include #endif #endif #define msg_err_re_cache(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \ "re_cache", cache->hash, \ G_STRFUNC, \ __VA_ARGS__) #define msg_warn_re_cache(...) rspamd_default_log_function (G_LOG_LEVEL_WARNING, \ "re_cache", cache->hash, \ G_STRFUNC, \ __VA_ARGS__) #define msg_info_re_cache(...) rspamd_default_log_function (G_LOG_LEVEL_INFO, \ "re_cache", cache->hash, \ G_STRFUNC, \ __VA_ARGS__) #define msg_debug_re_task(...) rspamd_conditional_debug_fast (NULL, NULL, \ rspamd_re_cache_log_id, "re_cache", task->task_pool->tag.uid, \ G_STRFUNC, \ __VA_ARGS__) #define msg_debug_re_cache(...) rspamd_conditional_debug_fast (NULL, NULL, \ rspamd_re_cache_log_id, "re_cache", cache->hash, \ G_STRFUNC, \ __VA_ARGS__) INIT_LOG_MODULE(re_cache) #ifdef WITH_HYPERSCAN #define RSPAMD_HS_MAGIC_LEN (sizeof (rspamd_hs_magic)) static const guchar rspamd_hs_magic[] = {'r', 's', 'h', 's', 'r', 'e', '1', '1'}, rspamd_hs_magic_vector[] = {'r', 's', 'h', 's', 'r', 'v', '1', '1'}; #endif struct rspamd_re_class { guint64 id; enum rspamd_re_type type; gpointer type_data; gsize type_len; GHashTable *re; gchar hash[rspamd_cryptobox_HASHBYTES + 1]; rspamd_cryptobox_hash_state_t *st; #ifdef WITH_HYPERSCAN hs_database_t *hs_db; hs_scratch_t *hs_scratch; gint *hs_ids; guint nhs; #endif }; enum rspamd_re_cache_elt_match_type { RSPAMD_RE_CACHE_PCRE = 0, RSPAMD_RE_CACHE_HYPERSCAN, RSPAMD_RE_CACHE_HYPERSCAN_PRE }; struct rspamd_re_cache_elt { rspamd_regexp_t *re; enum rspamd_re_cache_elt_match_type match_type; }; struct rspamd_re_cache { GHashTable *re_classes; GPtrArray *re; ref_entry_t ref; guint nre; guint max_re_data; gchar hash[rspamd_cryptobox_HASHBYTES + 1]; #ifdef WITH_HYPERSCAN gboolean hyperscan_loaded; gboolean disable_hyperscan; gboolean vectorized_hyperscan; hs_platform_info_t plt; #endif }; struct rspamd_re_runtime { guchar *checked; guchar *results; struct rspamd_re_cache *cache; struct rspamd_re_cache_stat stat; gboolean has_hs; }; static GQuark rspamd_re_cache_quark (void) { return g_quark_from_static_string ("re_cache"); } static guint64 rspamd_re_cache_class_id (enum rspamd_re_type type, gpointer type_data, gsize datalen) { rspamd_cryptobox_fast_hash_state_t st; rspamd_cryptobox_fast_hash_init (&st, 0xdeadbabe); rspamd_cryptobox_fast_hash_update (&st, &type, sizeof (type)); if (datalen > 0) { rspamd_cryptobox_fast_hash_update (&st, type_data, datalen); } return rspamd_cryptobox_fast_hash_final (&st); } static void rspamd_re_cache_destroy (struct rspamd_re_cache *cache) { GHashTableIter it; gpointer k, v; struct rspamd_re_class *re_class; g_assert (cache != NULL); g_hash_table_iter_init (&it, cache->re_classes); while (g_hash_table_iter_next (&it, &k, &v)) { re_class = v; g_hash_table_iter_steal (&it); g_hash_table_unref (re_class->re); if (re_class->type_data) { g_free (re_class->type_data); } #ifdef WITH_HYPERSCAN if (re_class->hs_db) { hs_free_database (re_class->hs_db); } if (re_class->hs_scratch) { hs_free_scratch (re_class->hs_scratch); } if (re_class->hs_ids) { g_free (re_class->hs_ids); } #endif g_free (re_class); } g_hash_table_unref (cache->re_classes); g_ptr_array_free (cache->re, TRUE); g_free (cache); } static void rspamd_re_cache_elt_dtor (gpointer e) { struct rspamd_re_cache_elt *elt = e; rspamd_regexp_unref (elt->re); g_free (elt); } struct rspamd_re_cache * rspamd_re_cache_new (void) { struct rspamd_re_cache *cache; cache = g_malloc0 (sizeof (*cache)); cache->re_classes = g_hash_table_new (g_int64_hash, g_int64_equal); cache->nre = 0; cache->re = g_ptr_array_new_full (256, rspamd_re_cache_elt_dtor); #ifdef WITH_HYPERSCAN cache->hyperscan_loaded = FALSE; #endif REF_INIT_RETAIN (cache, rspamd_re_cache_destroy); return cache; } gboolean rspamd_re_cache_is_hs_loaded (struct rspamd_re_cache *cache) { g_assert (cache != NULL); #ifdef WITH_HYPERSCAN return cache->hyperscan_loaded; #else return FALSE; #endif } rspamd_regexp_t * rspamd_re_cache_add (struct rspamd_re_cache *cache, rspamd_regexp_t *re, enum rspamd_re_type type, gpointer type_data, gsize datalen) { guint64 class_id; struct rspamd_re_class *re_class; rspamd_regexp_t *nre; struct rspamd_re_cache_elt *elt; g_assert (cache != NULL); g_assert (re != NULL); class_id = rspamd_re_cache_class_id (type, type_data, datalen); re_class = g_hash_table_lookup (cache->re_classes, &class_id); if (re_class == NULL) { re_class = g_malloc0 (sizeof (*re_class)); re_class->id = class_id; re_class->type_len = datalen; re_class->type = type; re_class->re = g_hash_table_new_full (rspamd_regexp_hash, rspamd_regexp_equal, NULL, (GDestroyNotify)rspamd_regexp_unref); if (datalen > 0) { re_class->type_data = g_malloc0 (datalen); memcpy (re_class->type_data, type_data, datalen); } g_hash_table_insert (cache->re_classes, &re_class->id, re_class); } if ((nre = g_hash_table_lookup (re_class->re, rspamd_regexp_get_id (re))) == NULL) { /* * We set re id based on the global position in the cache */ elt = g_malloc0 (sizeof (*elt)); /* One ref for re_class */ nre = rspamd_regexp_ref (re); rspamd_regexp_set_cache_id (re, cache->nre++); /* One ref for cache */ elt->re = rspamd_regexp_ref (re); g_ptr_array_add (cache->re, elt); rspamd_regexp_set_class (re, re_class); g_hash_table_insert (re_class->re, rspamd_regexp_get_id (nre), nre); } return nre; } void rspamd_re_cache_replace (struct rspamd_re_cache *cache, rspamd_regexp_t *what, rspamd_regexp_t *with) { guint64 re_id; struct rspamd_re_class *re_class; rspamd_regexp_t *src; struct rspamd_re_cache_elt *elt; g_assert (cache != NULL); g_assert (what != NULL); g_assert (with != NULL); re_class = rspamd_regexp_get_class (what); if (re_class != NULL) { re_id = rspamd_regexp_get_cache_id (what); g_assert (re_id != RSPAMD_INVALID_ID); src = g_hash_table_lookup (re_class->re, rspamd_regexp_get_id (what)); elt = g_ptr_array_index (cache->re, re_id); g_assert (elt != NULL); g_assert (src != NULL); rspamd_regexp_set_cache_id (what, RSPAMD_INVALID_ID); rspamd_regexp_set_class (what, NULL); rspamd_regexp_set_cache_id (with, re_id); rspamd_regexp_set_class (with, re_class); /* * On calling of this function, we actually unref old re (what) */ g_hash_table_insert (re_class->re, rspamd_regexp_get_id (what), rspamd_regexp_ref (with)); rspamd_regexp_unref (elt->re); elt->re = rspamd_regexp_ref (with); /* XXX: do not touch match type here */ } } static gint rspamd_re_cache_sort_func (gconstpointer a, gconstpointer b) { struct rspamd_re_cache_elt * const *re1 = a, * const *re2 = b; return rspamd_regexp_cmp (rspamd_regexp_get_id ((*re1)->re), rspamd_regexp_get_id ((*re2)->re)); } void rspamd_re_cache_init (struct rspamd_re_cache *cache, struct rspamd_config *cfg) { guint i, fl; GHashTableIter it; gpointer k, v; struct rspamd_re_class *re_class; rspamd_cryptobox_hash_state_t st_global; rspamd_regexp_t *re; struct rspamd_re_cache_elt *elt; guchar hash_out[rspamd_cryptobox_HASHBYTES]; g_assert (cache != NULL); rspamd_cryptobox_hash_init (&st_global, NULL, 0); /* Resort all regexps */ g_ptr_array_sort (cache->re, rspamd_re_cache_sort_func); for (i = 0; i < cache->re->len; i ++) { elt = g_ptr_array_index (cache->re, i); re = elt->re; re_class = rspamd_regexp_get_class (re); g_assert (re_class != NULL); rspamd_regexp_set_cache_id (re, i); if (re_class->st == NULL) { re_class->st = g_malloc (sizeof (*re_class->st)); rspamd_cryptobox_hash_init (re_class->st, NULL, 0); } /* Update hashes */ /* Id of re class */ rspamd_cryptobox_hash_update (re_class->st, (gpointer) &re_class->id, sizeof (re_class->id)); rspamd_cryptobox_hash_update (&st_global, (gpointer) &re_class->id, sizeof (re_class->id)); /* Id of re expression */ rspamd_cryptobox_hash_update (re_class->st, rspamd_regexp_get_id (re), rspamd_cryptobox_HASHBYTES); rspamd_cryptobox_hash_update (&st_global, rspamd_regexp_get_id (re), rspamd_cryptobox_HASHBYTES); /* PCRE flags */ fl = rspamd_regexp_get_pcre_flags (re); rspamd_cryptobox_hash_update (re_class->st, (const guchar *)&fl, sizeof (fl)); rspamd_cryptobox_hash_update (&st_global, (const guchar *) &fl, sizeof (fl)); /* Rspamd flags */ fl = rspamd_regexp_get_flags (re); rspamd_cryptobox_hash_update (re_class->st, (const guchar *) &fl, sizeof (fl)); rspamd_cryptobox_hash_update (&st_global, (const guchar *) &fl, sizeof (fl)); /* Limit of hits */ fl = rspamd_regexp_get_maxhits (re); rspamd_cryptobox_hash_update (re_class->st, (const guchar *) &fl, sizeof (fl)); rspamd_cryptobox_hash_update (&st_global, (const guchar *) &fl, sizeof (fl)); /* Numberic order */ rspamd_cryptobox_hash_update (re_class->st, (const guchar *)&i, sizeof (i)); rspamd_cryptobox_hash_update (&st_global, (const guchar *)&i, sizeof (i)); } rspamd_cryptobox_hash_final (&st_global, hash_out); rspamd_snprintf (cache->hash, sizeof (cache->hash), "%*xs", (gint) rspamd_cryptobox_HASHBYTES, hash_out); /* Now finalize all classes */ g_hash_table_iter_init (&it, cache->re_classes); while (g_hash_table_iter_next (&it, &k, &v)) { re_class = v; if (re_class->st) { /* * We finally update all classes with the number of expressions * in the cache to ensure that if even a single re has been changed * we won't be broken due to id mismatch */ rspamd_cryptobox_hash_update (re_class->st, (gpointer)&cache->re->len, sizeof (cache->re->len)); rspamd_cryptobox_hash_final (re_class->st, hash_out); rspamd_snprintf (re_class->hash, sizeof (re_class->hash), "%*xs", (gint) rspamd_cryptobox_HASHBYTES, hash_out); g_free (re_class->st); re_class->st = NULL; } } #ifdef WITH_HYPERSCAN const gchar *platform = "generic"; rspamd_fstring_t *features = rspamd_fstring_new (); cache->disable_hyperscan = cfg->disable_hyperscan; cache->vectorized_hyperscan = cfg->vectorized_hyperscan; g_assert (hs_populate_platform (&cache->plt) == HS_SUCCESS); /* Now decode what we do have */ switch (cache->plt.tune) { case HS_TUNE_FAMILY_HSW: platform = "haswell"; break; case HS_TUNE_FAMILY_SNB: platform = "sandy"; break; case HS_TUNE_FAMILY_BDW: platform = "broadwell"; break; case HS_TUNE_FAMILY_IVB: platform = "ivy"; break; default: break; } if (cache->plt.cpu_features & HS_CPU_FEATURES_AVX2) { features = rspamd_fstring_append (features, "AVX2", 4); } hs_set_allocator (g_malloc, g_free); msg_info_re_cache ("loaded hyperscan engine with cpu tune '%s' and features '%V'", platform, features); rspamd_fstring_free (features); #endif } struct rspamd_re_runtime * rspamd_re_cache_runtime_new (struct rspamd_re_cache *cache) { struct rspamd_re_runtime *rt; g_assert (cache != NULL); rt = g_malloc0 (sizeof (*rt)); rt->cache = cache; REF_RETAIN (cache); rt->checked = g_malloc0 (NBYTES (cache->nre)); rt->results = g_malloc0 (cache->nre); rt->stat.regexp_total = cache->nre; #ifdef WITH_HYPERSCAN rt->has_hs = cache->hyperscan_loaded; #endif return rt; } const struct rspamd_re_cache_stat * rspamd_re_cache_get_stat (struct rspamd_re_runtime *rt) { g_assert (rt != NULL); return &rt->stat; } static guint rspamd_re_cache_process_pcre (struct rspamd_re_runtime *rt, rspamd_regexp_t *re, struct rspamd_task *task, const guchar *in, gsize len, gboolean is_raw) { guint r = 0; const gchar *start = NULL, *end = NULL; guint max_hits = rspamd_regexp_get_maxhits (re); guint64 id = rspamd_regexp_get_cache_id (re); gdouble t1, t2, pr; const gdouble slow_time = 1e8; if (in == NULL) { return rt->results[id]; } if (len == 0) { len = strlen (in); } if (rt->cache->max_re_data > 0 && len > rt->cache->max_re_data) { len = rt->cache->max_re_data; } r = rt->results[id]; if (max_hits == 0 || r < max_hits) { pr = rspamd_random_double_fast (); if (pr > 0.9) { t1 = rspamd_get_ticks (TRUE); } while (rspamd_regexp_search (re, in, len, &start, &end, is_raw, NULL)) { r++; msg_debug_re_task ("found regexp /%s/, total hits: %d", rspamd_regexp_get_pattern (re), r); if (max_hits > 0 && r >= max_hits) { break; } } rt->results[id] += r; rt->stat.regexp_checked++; rt->stat.bytes_scanned_pcre += len; rt->stat.bytes_scanned += len; if (r > 0) { rt->stat.regexp_matched += r; } if (pr > 0.9) { t2 = rspamd_get_ticks (TRUE); if (t2 - t1 > slow_time) { msg_info_task ("regexp '%16s' took %.0f ticks to execute", rspamd_regexp_get_pattern (re), t2 - t1); } } } return r; } #ifdef WITH_HYPERSCAN struct rspamd_re_hyperscan_cbdata { struct rspamd_re_runtime *rt; const guchar **ins; const guint *lens; guint count; rspamd_regexp_t *re; struct rspamd_task *task; }; static gint rspamd_re_cache_hyperscan_cb (unsigned int id, unsigned long long from, unsigned long long to, unsigned int flags, void *ud) { struct rspamd_re_hyperscan_cbdata *cbdata = ud; struct rspamd_re_runtime *rt; struct rspamd_re_cache_elt *pcre_elt; guint ret, maxhits, i, processed; struct rspamd_task *task; rt = cbdata->rt; task = cbdata->task; pcre_elt = g_ptr_array_index (rt->cache->re, id); maxhits = rspamd_regexp_get_maxhits (pcre_elt->re); if (pcre_elt->match_type == RSPAMD_RE_CACHE_HYPERSCAN) { ret = 1; setbit (rt->checked, id); if (maxhits == 0 || rt->results[id] < maxhits) { rt->results[id] += ret; rt->stat.regexp_matched++; } msg_debug_re_task ("found regexp /%s/ using hyperscan only, total hits: %d", rspamd_regexp_get_pattern (pcre_elt->re), rt->results[id]); } else { if (!isset (rt->checked, id)) { processed = 0; for (i = 0; i < cbdata->count; i ++) { rspamd_re_cache_process_pcre (rt, pcre_elt->re, cbdata->task, cbdata->ins[i], cbdata->lens[i], FALSE); setbit (rt->checked, id); processed += cbdata->lens[i]; if (processed >= to) { break; } } } } return 0; } #endif static guint rspamd_re_cache_process_regexp_data (struct rspamd_re_runtime *rt, rspamd_regexp_t *re, struct rspamd_task *task, const guchar **in, guint *lens, guint count, gboolean is_raw) { guint64 re_id; guint ret = 0; guint i; re_id = rspamd_regexp_get_cache_id (re); if (count == 0 || in == NULL) { /* We assume this as absence of the specified data */ setbit (rt->checked, re_id); rt->results[re_id] = ret; return ret; } #ifndef WITH_HYPERSCAN for (i = 0; i < count; i++) { ret = rspamd_re_cache_process_pcre (rt, re, task, in[i], lens[i], is_raw); rt->results[re_id] = ret; } setbit (rt->checked, re_id); #else struct rspamd_re_cache_elt *elt; struct rspamd_re_class *re_class; struct rspamd_re_hyperscan_cbdata cbdata; elt = g_ptr_array_index (rt->cache->re, re_id); re_class = rspamd_regexp_get_class (re); if (rt->cache->disable_hyperscan || elt->match_type == RSPAMD_RE_CACHE_PCRE || !rt->has_hs) { for (i = 0; i < count; i++) { ret = rspamd_re_cache_process_pcre (rt, re, task, in[i], lens[i], is_raw); } setbit (rt->checked, re_id); } else { for (i = 0; i < count; i ++) { if (rt->cache->max_re_data > 0 && lens[i] > rt->cache->max_re_data) { lens[i] = rt->cache->max_re_data; } rt->stat.bytes_scanned += lens[i]; } g_assert (re_class->hs_scratch != NULL); g_assert (re_class->hs_db != NULL); /* Go through hyperscan API */ if (!rt->cache->vectorized_hyperscan) { for (i = 0; i < count; i++) { cbdata.ins = &in[i]; cbdata.re = re; cbdata.rt = rt; cbdata.lens = &lens[i]; cbdata.count = 1; cbdata.task = task; if ((hs_scan (re_class->hs_db, in[i], lens[i], 0, re_class->hs_scratch, rspamd_re_cache_hyperscan_cb, &cbdata)) != HS_SUCCESS) { ret = 0; } else { ret = rt->results[re_id]; } } } else { cbdata.ins = in; cbdata.re = re; cbdata.rt = rt; cbdata.lens = lens; cbdata.count = 1; cbdata.task = task; if ((hs_scan_vector (re_class->hs_db, (const char **)in, lens, count, 0, re_class->hs_scratch, rspamd_re_cache_hyperscan_cb, &cbdata)) != HS_SUCCESS) { ret = 0; } else { ret = rt->results[re_id]; } } } #endif return ret; } static void rspamd_re_cache_finish_class (struct rspamd_re_runtime *rt, struct rspamd_re_class *re_class) { #ifdef WITH_HYPERSCAN guint i; guint64 re_id; /* Set all bits unchecked */ for (i = 0; i < re_class->nhs; i++) { re_id = re_class->hs_ids[i]; if (!isset (rt->checked, re_id)) { g_assert (rt->results[re_id] == 0); rt->results[re_id] = 0; setbit (rt->checked, re_id); } } #endif } /* * Calculates the specified regexp for the specified class if it's not calculated */ static guint rspamd_re_cache_exec_re (struct rspamd_task *task, struct rspamd_re_runtime *rt, rspamd_regexp_t *re, struct rspamd_re_class *re_class, gboolean is_strong) { guint ret = 0, i, re_id; GPtrArray *headerlist; GHashTableIter it; struct rspamd_mime_header *rh; const gchar *in, *end; const guchar **scvec; guint *lenvec; gboolean raw = FALSE; struct rspamd_mime_text_part *part; struct rspamd_url *url; gpointer k, v; guint len, cnt; msg_debug_re_task ("check re type: %s: /%s/", rspamd_re_cache_type_to_string (re_class->type), rspamd_regexp_get_pattern (re)); re_id = rspamd_regexp_get_cache_id (re); switch (re_class->type) { case RSPAMD_RE_HEADER: case RSPAMD_RE_RAWHEADER: /* Get list of specified headers */ headerlist = rspamd_message_get_header_array (task, re_class->type_data, is_strong); if (headerlist && headerlist->len > 0) { scvec = g_malloc (sizeof (*scvec) * headerlist->len); lenvec = g_malloc (sizeof (*lenvec) * headerlist->len); for (i = 0; i < headerlist->len; i ++) { rh = g_ptr_array_index (headerlist, i); if (re_class->type == RSPAMD_RE_RAWHEADER) { in = rh->value; raw = TRUE; lenvec[i] = strlen (rh->value); } else { in = rh->decoded; /* Validate input */ if (!in || !g_utf8_validate (in, -1, &end)) { lenvec[i] = 0; scvec[i] = (guchar *)""; continue; } lenvec[i] = end - in; } scvec[i] = (guchar *)in; } ret = rspamd_re_cache_process_regexp_data (rt, re, task, scvec, lenvec, headerlist->len, raw); msg_debug_re_task ("checking header %s regexp: %s=%*s -> %d", re_class->type_data, rspamd_regexp_get_pattern (re), (int)lenvec[0], scvec[0], ret); g_free (scvec); g_free (lenvec); } break; case RSPAMD_RE_ALLHEADER: raw = TRUE; in = task->raw_headers_content.begin; len = task->raw_headers_content.len; ret = rspamd_re_cache_process_regexp_data (rt, re, task, (const guchar **)&in, &len, 1, raw); msg_debug_re_task ("checking allheader regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); break; case RSPAMD_RE_MIMEHEADER: headerlist = rspamd_message_get_mime_header_array (task, re_class->type_data, is_strong); if (headerlist && headerlist->len > 0) { scvec = g_malloc (sizeof (*scvec) * headerlist->len); lenvec = g_malloc (sizeof (*lenvec) * headerlist->len); for (i = 0; i < headerlist->len; i ++) { rh = g_ptr_array_index (headerlist, i); if (re_class->type == RSPAMD_RE_RAWHEADER) { in = rh->value; raw = TRUE; lenvec[i] = strlen (rh->value); } else { in = rh->decoded; /* Validate input */ if (!in || !g_utf8_validate (in, -1, &end)) { lenvec[i] = 0; scvec[i] = (guchar *)""; continue; } lenvec[i] = end - in; } scvec[i] = (guchar *)in; } ret = rspamd_re_cache_process_regexp_data (rt, re, task, scvec, lenvec, headerlist->len, raw); msg_debug_re_task ("checking mime header %s regexp: %s -> %d", re_class->type_data, rspamd_regexp_get_pattern (re), ret); g_free (scvec); g_free (lenvec); } break; case RSPAMD_RE_MIME: case RSPAMD_RE_RAWMIME: /* Iterate through text parts */ if (task->text_parts->len > 0) { cnt = task->text_parts->len; scvec = g_malloc (sizeof (*scvec) * cnt); lenvec = g_malloc (sizeof (*lenvec) * cnt); for (i = 0; i < task->text_parts->len; i++) { part = g_ptr_array_index (task->text_parts, i); /* Skip empty parts */ if (IS_PART_EMPTY (part)) { lenvec[i] = 0; scvec[i] = (guchar *) ""; continue; } /* Check raw flags */ if (!IS_PART_UTF (part)) { raw = TRUE; } /* Select data for regexp */ if (re_class->type == RSPAMD_RE_RAWMIME) { in = part->raw.begin; len = part->raw.len; raw = TRUE; } else { in = part->content->data; len = part->content->len; } scvec[i] = (guchar *) in; lenvec[i] = len; } ret = rspamd_re_cache_process_regexp_data (rt, re, task, scvec, lenvec, cnt, raw); msg_debug_re_task ("checking mime regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); g_free (scvec); g_free (lenvec); } break; case RSPAMD_RE_URL: cnt = g_hash_table_size (task->urls) + g_hash_table_size (task->emails); if (cnt > 0) { scvec = g_malloc (sizeof (*scvec) * cnt); lenvec = g_malloc (sizeof (*lenvec) * cnt); g_hash_table_iter_init (&it, task->urls); i = 0; while (g_hash_table_iter_next (&it, &k, &v)) { url = v; in = url->string; len = url->urllen; raw = FALSE; scvec[i] = (guchar *)in; lenvec[i++] = len; } g_hash_table_iter_init (&it, task->emails); while (g_hash_table_iter_next (&it, &k, &v)) { url = v; in = url->string; len = url->urllen; raw = FALSE; scvec[i] = (guchar *) in; lenvec[i++] = len; } g_assert (i == cnt); ret = rspamd_re_cache_process_regexp_data (rt, re, task, scvec, lenvec, i, raw); msg_debug_re_task ("checking url regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); g_free (scvec); g_free (lenvec); } break; case RSPAMD_RE_BODY: raw = TRUE; in = task->msg.begin; len = task->msg.len; ret = rspamd_re_cache_process_regexp_data (rt, re, task, (const guchar **)&in, &len, 1, raw); msg_debug_re_task ("checking rawbody regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); break; case RSPAMD_RE_SABODY: /* According to SA docs: * The 'body' in this case is the textual parts of the message body; * any non-text MIME parts are stripped, and the message decoded from * Quoted-Printable or Base-64-encoded format if necessary. The message * Subject header is considered part of the body and becomes the first * paragraph when running the rules. All HTML tags and line breaks will * be removed before matching. */ cnt = task->text_parts->len + 1; scvec = g_malloc (sizeof (*scvec) * cnt); lenvec = g_malloc (sizeof (*lenvec) * cnt); /* * Body rules also include the Subject as the first line * of the body content. */ headerlist = rspamd_message_get_header_array (task, "Subject", FALSE); if (headerlist && headerlist->len > 0) { rh = g_ptr_array_index (headerlist, 0); scvec[0] = (guchar *)rh->decoded; lenvec[0] = strlen (rh->decoded); } else { scvec[0] = (guchar *)""; lenvec[0] = 0; } for (i = 0; i < task->text_parts->len; i++) { part = g_ptr_array_index (task->text_parts, i); if (part->stripped_content) { scvec[i + 1] = (guchar *)part->stripped_content->data; lenvec[i + 1] = part->stripped_content->len; } else { scvec[i + 1] = (guchar *)""; lenvec[i + 1] = 0; } } ret = rspamd_re_cache_process_regexp_data (rt, re, task, scvec, lenvec, cnt, TRUE); msg_debug_re_task ("checking sa body regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); g_free (scvec); g_free (lenvec); break; case RSPAMD_RE_SARAWBODY: /* According to SA docs: * The 'raw body' of a message is the raw data inside all textual * parts. The text will be decoded from base64 or quoted-printable * encoding, but HTML tags and line breaks will still be present. * Multiline expressions will need to be used to match strings that are * broken by line breaks. */ if (task->text_parts->len > 0) { cnt = task->text_parts->len; scvec = g_malloc (sizeof (*scvec) * cnt); lenvec = g_malloc (sizeof (*lenvec) * cnt); for (i = 0; i < task->text_parts->len; i++) { part = g_ptr_array_index (task->text_parts, i); if (part->parsed.len > 0) { scvec[i] = (guchar *)part->parsed.begin; lenvec[i] = part->parsed.len; } else { scvec[i] = (guchar *)""; lenvec[i] = 0; } } ret = rspamd_re_cache_process_regexp_data (rt, re, task, scvec, lenvec, cnt, TRUE); msg_debug_re_task ("checking sa rawbody regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); g_free (scvec); g_free (lenvec); } break; case RSPAMD_RE_MAX: msg_err_task ("regexp of class invalid has been called: %s", rspamd_regexp_get_pattern (re)); break; } #if WITH_HYPERSCAN if (!rt->cache->disable_hyperscan && rt->has_hs) { rspamd_re_cache_finish_class (rt, re_class); } #endif setbit (rt->checked, re_id); return rt->results[re_id]; } gint rspamd_re_cache_process (struct rspamd_task *task, rspamd_regexp_t *re, enum rspamd_re_type type, gpointer type_data, gsize datalen, gboolean is_strong) { guint64 re_id; struct rspamd_re_class *re_class; struct rspamd_re_cache *cache; struct rspamd_re_runtime *rt; g_assert (task != NULL); rt = task->re_rt; g_assert (rt != NULL); g_assert (re != NULL); cache = rt->cache; re_id = rspamd_regexp_get_cache_id (re); if (re_id == RSPAMD_INVALID_ID || re_id > cache->nre) { msg_err_task ("re '%s' has no valid id for the cache", rspamd_regexp_get_pattern (re)); return 0; } if (isset (rt->checked, re_id)) { /* Fast path */ rt->stat.regexp_fast_cached ++; return rt->results[re_id]; } else { /* Slow path */ re_class = rspamd_regexp_get_class (re); if (re_class == NULL) { msg_err_task ("cannot find re class for regexp '%s'", rspamd_regexp_get_pattern (re)); return 0; } return rspamd_re_cache_exec_re (task, rt, re, re_class, is_strong); } return 0; } int rspamd_re_cache_process_ffi (void *ptask, void *pre, int type, void *type_data, int is_strong) { struct rspamd_lua_regexp **lua_re = pre; struct rspamd_task **real_task = ptask; gsize typelen = 0; if (type_data) { typelen = strlen (type_data); } return rspamd_re_cache_process (*real_task, (*lua_re)->re, type, type_data, typelen, is_strong); } void rspamd_re_cache_runtime_destroy (struct rspamd_re_runtime *rt) { g_assert (rt != NULL); g_free (rt->checked); g_free (rt->results); REF_RELEASE (rt->cache); g_free (rt); } void rspamd_re_cache_unref (struct rspamd_re_cache *cache) { if (cache) { REF_RELEASE (cache); } } struct rspamd_re_cache * rspamd_re_cache_ref (struct rspamd_re_cache *cache) { if (cache) { REF_RETAIN (cache); } return cache; } guint rspamd_re_cache_set_limit (struct rspamd_re_cache *cache, guint limit) { guint old; g_assert (cache != NULL); old = cache->max_re_data; cache->max_re_data = limit; return old; } const gchar * rspamd_re_cache_type_to_string (enum rspamd_re_type type) { const gchar *ret = "unknown"; switch (type) { case RSPAMD_RE_HEADER: ret = "header"; break; case RSPAMD_RE_RAWHEADER: ret = "raw header"; break; case RSPAMD_RE_MIMEHEADER: ret = "mime header"; break; case RSPAMD_RE_ALLHEADER: ret = "all headers"; break; case RSPAMD_RE_MIME: ret = "part"; break; case RSPAMD_RE_RAWMIME: ret = "raw part"; break; case RSPAMD_RE_BODY: ret = "rawbody"; break; case RSPAMD_RE_URL: ret = "url"; break; case RSPAMD_RE_SABODY: ret = "sa body"; break; case RSPAMD_RE_SARAWBODY: ret = "sa body"; break; case RSPAMD_RE_MAX: ret = "invalid class"; break; } return ret; } enum rspamd_re_type rspamd_re_cache_type_from_string (const char *str) { enum rspamd_re_type ret; guint64 h; /* * To optimize this function, we apply hash to input string and * pre-select it from the values */ if (str != NULL) { h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64, str, strlen (str), 0xdeadbabe); switch (h) { case G_GUINT64_CONSTANT(0x298b9c8a58887d44): /* header */ ret = RSPAMD_RE_HEADER; break; case G_GUINT64_CONSTANT(0x467bfb5cd7ddf890): /* rawheader */ ret = RSPAMD_RE_RAWHEADER; break; case G_GUINT64_CONSTANT(0xda081341fb600389): /* mime */ ret = RSPAMD_RE_MIME; break; case G_GUINT64_CONSTANT(0xc35831e067a8221d): /* rawmime */ ret = RSPAMD_RE_RAWMIME; break; case G_GUINT64_CONSTANT(0xc625e13dbe636de2): /* body */ case G_GUINT64_CONSTANT(0xCCDEBA43518F721C): /* message */ ret = RSPAMD_RE_BODY; break; case G_GUINT64_CONSTANT(0x286edbe164c791d2): /* url */ case G_GUINT64_CONSTANT(0x7D9ACDF6685661A1): /* uri */ ret = RSPAMD_RE_URL; break; case G_GUINT64_CONSTANT(0x796d62205a8778c7): /* allheader */ ret = RSPAMD_RE_ALLHEADER; break; case G_GUINT64_CONSTANT(0xa3c6c153b3b00a5e): /* mimeheader */ ret = RSPAMD_RE_MIMEHEADER; break; case G_GUINT64_CONSTANT(0x7794501506e604e9): /* sabody */ ret = RSPAMD_RE_SABODY; break; case G_GUINT64_CONSTANT(0x28828962E7D2A05F): /* sarawbody */ ret = RSPAMD_RE_SARAWBODY; break; default: ret = RSPAMD_RE_MAX; break; } } else { ret = RSPAMD_RE_MAX; } return ret; } #ifdef WITH_HYPERSCAN static gboolean rspamd_re_cache_is_finite (struct rspamd_re_cache *cache, rspamd_regexp_t *re, gint flags, gdouble max_time) { pid_t cld; gint status; struct timespec ts; hs_compile_error_t *hs_errors; hs_database_t *test_db; gdouble wait_time; const gint max_tries = 10; gint tries = 0, rc; wait_time = max_time / max_tries; /* We need to restore SIGCHLD processing */ signal (SIGCHLD, SIG_DFL); cld = fork (); g_assert (cld != -1); if (cld == 0) { /* Try to compile pattern */ if (hs_compile (rspamd_regexp_get_pattern (re), flags | HS_FLAG_PREFILTER, cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK, &cache->plt, &test_db, &hs_errors) != HS_SUCCESS) { exit (EXIT_FAILURE); } exit (EXIT_SUCCESS); } else { double_to_ts (wait_time, &ts); while ((rc = waitpid (cld, &status, WNOHANG)) == 0 && tries ++ < max_tries) { (void)nanosleep (&ts, NULL); } /* Child has been terminated */ if (rc > 0) { /* Forget about SIGCHLD after this point */ signal (SIGCHLD, SIG_IGN); if (WIFEXITED (status) && WEXITSTATUS (status) == EXIT_SUCCESS) { return TRUE; } else { msg_err_re_cache ( "cannot approximate %s to hyperscan", rspamd_regexp_get_pattern (re)); return FALSE; } } else { /* We consider that as timeout */ kill (cld, SIGKILL); g_assert (waitpid (cld, &status, 0) != -1); msg_err_re_cache ( "cannot approximate %s to hyperscan: timeout waiting", rspamd_regexp_get_pattern (re)); signal (SIGCHLD, SIG_IGN); } } return FALSE; } #endif gint rspamd_re_cache_compile_hyperscan (struct rspamd_re_cache *cache, const char *cache_dir, gdouble max_time, gboolean silent, GError **err) { g_assert (cache != NULL); g_assert (cache_dir != NULL); #ifndef WITH_HYPERSCAN g_set_error (err, rspamd_re_cache_quark (), EINVAL, "hyperscan is disabled"); return -1; #else GHashTableIter it, cit; gpointer k, v; struct rspamd_re_class *re_class; gchar path[PATH_MAX], npath[PATH_MAX]; hs_database_t *test_db; gint fd, i, n, *hs_ids = NULL, pcre_flags, re_flags; rspamd_cryptobox_fast_hash_state_t crc_st; guint64 crc; rspamd_regexp_t *re; hs_compile_error_t *hs_errors; guint *hs_flags = NULL; const hs_expr_ext_t **hs_exts = NULL; const gchar **hs_pats = NULL; gchar *hs_serialized; gsize serialized_len, total = 0; struct iovec iov[7]; g_hash_table_iter_init (&it, cache->re_classes); while (g_hash_table_iter_next (&it, &k, &v)) { re_class = v; rspamd_snprintf (path, sizeof (path), "%s%c%s.hs", cache_dir, G_DIR_SEPARATOR, re_class->hash); if (rspamd_re_cache_is_valid_hyperscan_file (cache, path, TRUE, TRUE)) { fd = open (path, O_RDONLY, 00600); /* Read number of regexps */ g_assert (fd != -1); lseek (fd, RSPAMD_HS_MAGIC_LEN + sizeof (cache->plt), SEEK_SET); g_assert (read (fd, &n, sizeof (n)) == sizeof (n)); close (fd); if (re_class->type_len > 0) { if (!silent) { msg_info_re_cache ( "skip already valid class %s(%*s) to cache %6s, %d regexps", rspamd_re_cache_type_to_string (re_class->type), (gint) re_class->type_len - 1, re_class->type_data, re_class->hash, n); } } else { if (!silent) { msg_info_re_cache ( "skip already valid class %s to cache %6s, %d regexps", rspamd_re_cache_type_to_string (re_class->type), re_class->hash, n); } } continue; } rspamd_snprintf (path, sizeof (path), "%s%c%s.hs.new", cache_dir, G_DIR_SEPARATOR, re_class->hash); fd = open (path, O_CREAT|O_TRUNC|O_EXCL|O_WRONLY, 00600); if (fd == -1) { g_set_error (err, rspamd_re_cache_quark (), errno, "cannot open file " "%s: %s", path, strerror (errno)); return -1; } g_hash_table_iter_init (&cit, re_class->re); n = g_hash_table_size (re_class->re); hs_flags = g_malloc0 (sizeof (*hs_flags) * n); hs_ids = g_malloc (sizeof (*hs_ids) * n); hs_pats = g_malloc (sizeof (*hs_pats) * n); hs_exts = g_malloc0 (sizeof (*hs_exts) * n); i = 0; while (g_hash_table_iter_next (&cit, &k, &v)) { re = v; pcre_flags = rspamd_regexp_get_pcre_flags (re); re_flags = rspamd_regexp_get_flags (re); if (re_flags & RSPAMD_REGEXP_FLAG_PCRE_ONLY) { /* Do not try to compile bad regexp */ msg_info_re_cache ( "do not try compile %s to hyperscan as it is PCRE only", rspamd_regexp_get_pattern (re)); continue; } hs_flags[i] = 0; hs_exts[i] = NULL; #ifndef WITH_PCRE2 if (pcre_flags & PCRE_FLAG(UTF8)) { hs_flags[i] |= HS_FLAG_UTF8; } #else if (pcre_flags & PCRE_FLAG(UTF)) { hs_flags[i] |= HS_FLAG_UTF8; } #endif if (pcre_flags & PCRE_FLAG(CASELESS)) { hs_flags[i] |= HS_FLAG_CASELESS; } if (pcre_flags & PCRE_FLAG(MULTILINE)) { hs_flags[i] |= HS_FLAG_MULTILINE; } if (pcre_flags & PCRE_FLAG(DOTALL)) { hs_flags[i] |= HS_FLAG_DOTALL; } if (rspamd_regexp_get_maxhits (re) == 1) { hs_flags[i] |= HS_FLAG_SINGLEMATCH; } if (hs_compile (rspamd_regexp_get_pattern (re), hs_flags[i], cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK, &cache->plt, &test_db, &hs_errors) != HS_SUCCESS) { msg_info_re_cache ("cannot compile %s to hyperscan, try prefilter match", rspamd_regexp_get_pattern (re)); hs_free_compile_error (hs_errors); /* The approximation operation might take a significant * amount of time, so we need to check if it's finite */ if (rspamd_re_cache_is_finite (cache, re, hs_flags[i], max_time)) { hs_flags[i] |= HS_FLAG_PREFILTER; hs_ids[i] = rspamd_regexp_get_cache_id (re); hs_pats[i] = rspamd_regexp_get_pattern (re); i++; } } else { hs_ids[i] = rspamd_regexp_get_cache_id (re); hs_pats[i] = rspamd_regexp_get_pattern (re); i ++; hs_free_database (test_db); } } /* Adjust real re number */ n = i; if (n > 0) { /* Create the hs tree */ if (hs_compile_ext_multi (hs_pats, hs_flags, hs_ids, hs_exts, n, cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK, &cache->plt, &test_db, &hs_errors) != HS_SUCCESS) { g_set_error (err, rspamd_re_cache_quark (), EINVAL, "cannot create tree of regexp when processing '%s': %s", hs_pats[hs_errors->expression], hs_errors->message); g_free (hs_flags); g_free (hs_ids); g_free (hs_pats); g_free (hs_exts); close (fd); unlink (path); hs_free_compile_error (hs_errors); return -1; } g_free (hs_pats); g_free (hs_exts); if (hs_serialize_database (test_db, &hs_serialized, &serialized_len) != HS_SUCCESS) { g_set_error (err, rspamd_re_cache_quark (), errno, "cannot serialize tree of regexp for %s", re_class->hash); close (fd); unlink (path); g_free (hs_ids); g_free (hs_flags); hs_free_database (test_db); return -1; } hs_free_database (test_db); /* * Magic - 8 bytes * Platform - sizeof (platform) * n - number of regexps * n * * n * * crc - 8 bytes checksum * */ rspamd_cryptobox_fast_hash_init (&crc_st, 0xdeadbabe); /* IDs -> Flags -> Hs blob */ rspamd_cryptobox_fast_hash_update (&crc_st, hs_ids, sizeof (*hs_ids) * n); rspamd_cryptobox_fast_hash_update (&crc_st, hs_flags, sizeof (*hs_flags) * n); rspamd_cryptobox_fast_hash_update (&crc_st, hs_serialized, serialized_len); crc = rspamd_cryptobox_fast_hash_final (&crc_st); if (cache->vectorized_hyperscan) { iov[0].iov_base = (void *) rspamd_hs_magic_vector; } else { iov[0].iov_base = (void *) rspamd_hs_magic; } iov[0].iov_len = RSPAMD_HS_MAGIC_LEN; iov[1].iov_base = &cache->plt; iov[1].iov_len = sizeof (cache->plt); iov[2].iov_base = &n; iov[2].iov_len = sizeof (n); iov[3].iov_base = hs_ids; iov[3].iov_len = sizeof (*hs_ids) * n; iov[4].iov_base = hs_flags; iov[4].iov_len = sizeof (*hs_flags) * n; iov[5].iov_base = &crc; iov[5].iov_len = sizeof (crc); iov[6].iov_base = hs_serialized; iov[6].iov_len = serialized_len; if (writev (fd, iov, G_N_ELEMENTS (iov)) == -1) { g_set_error (err, rspamd_re_cache_quark (), errno, "cannot serialize tree of regexp to %s: %s", path, strerror (errno)); close (fd); unlink (path); g_free (hs_ids); g_free (hs_flags); g_free (hs_serialized); return -1; } if (re_class->type_len > 0) { msg_info_re_cache ( "compiled class %s(%*s) to cache %6s, %d regexps", rspamd_re_cache_type_to_string (re_class->type), (gint) re_class->type_len - 1, re_class->type_data, re_class->hash, n); } else { msg_info_re_cache ( "compiled class %s to cache %6s, %d regexps", rspamd_re_cache_type_to_string (re_class->type), re_class->hash, n); } total += n; g_free (hs_serialized); g_free (hs_ids); g_free (hs_flags); } fsync (fd); /* Now rename temporary file to the new .hs file */ rspamd_snprintf (npath, sizeof (path), "%s%c%s.hs", cache_dir, G_DIR_SEPARATOR, re_class->hash); if (rename (path, npath) == -1) { g_set_error (err, rspamd_re_cache_quark (), errno, "cannot rename %s to %s: %s", path, npath, strerror (errno)); unlink (path); close (fd); return -1; } close (fd); } return total; #endif } gboolean rspamd_re_cache_is_valid_hyperscan_file (struct rspamd_re_cache *cache, const char *path, gboolean silent, gboolean try_load) { g_assert (cache != NULL); g_assert (path != NULL); #ifndef WITH_HYPERSCAN return FALSE; #else gint fd, n, ret; guchar magicbuf[RSPAMD_HS_MAGIC_LEN]; const guchar *mb; GHashTableIter it; gpointer k, v; struct rspamd_re_class *re_class; gsize len; const gchar *hash_pos; hs_platform_info_t test_plt; hs_database_t *test_db = NULL; guchar *map, *p, *end; rspamd_cryptobox_fast_hash_state_t crc_st; guint64 crc, valid_crc; len = strlen (path); if (len < sizeof (rspamd_cryptobox_HASHBYTES + 3)) { return FALSE; } if (memcmp (path + len - 3, ".hs", 3) != 0) { return FALSE; } hash_pos = path + len - 3 - (sizeof (re_class->hash) - 1); g_hash_table_iter_init (&it, cache->re_classes); while (g_hash_table_iter_next (&it, &k, &v)) { re_class = v; if (memcmp (hash_pos, re_class->hash, sizeof (re_class->hash) - 1) == 0) { /* Open file and check magic */ fd = open (path, O_RDONLY); if (fd == -1) { if (!silent) { msg_err_re_cache ("cannot open hyperscan cache file %s: %s", path, strerror (errno)); } return FALSE; } if (read (fd, magicbuf, sizeof (magicbuf)) != sizeof (magicbuf)) { msg_err_re_cache ("cannot read hyperscan cache file %s: %s", path, strerror (errno)); close (fd); return FALSE; } if (cache->vectorized_hyperscan) { mb = rspamd_hs_magic_vector; } else { mb = rspamd_hs_magic; } if (memcmp (magicbuf, mb, sizeof (magicbuf)) != 0) { msg_err_re_cache ("cannot open hyperscan cache file %s: " "bad magic ('%*xs', '%*xs' expected)", path, (int) RSPAMD_HS_MAGIC_LEN, magicbuf, (int) RSPAMD_HS_MAGIC_LEN, mb); close (fd); return FALSE; } if (read (fd, &test_plt, sizeof (test_plt)) != sizeof (test_plt)) { msg_err_re_cache ("cannot read hyperscan cache file %s: %s", path, strerror (errno)); close (fd); return FALSE; } if (memcmp (&test_plt, &cache->plt, sizeof (test_plt)) != 0) { msg_err_re_cache ("cannot open hyperscan cache file %s: " "compiled for a different platform", path); close (fd); return FALSE; } close (fd); if (try_load) { map = rspamd_file_xmap (path, PROT_READ, &len, TRUE); if (map == NULL) { msg_err_re_cache ("cannot mmap hyperscan cache file %s: " "%s", path, strerror (errno)); return FALSE; } p = map + RSPAMD_HS_MAGIC_LEN + sizeof (test_plt); end = map + len; n = *(gint *)p; p += sizeof (gint); if (n <= 0 || 2 * n * sizeof (gint) + /* IDs + flags */ sizeof (guint64) + /* crc */ RSPAMD_HS_MAGIC_LEN + /* header */ sizeof (cache->plt) > len) { /* Some wrong amount of regexps */ msg_err_re_cache ("bad number of expressions in %s: %d", path, n); munmap (map, len); return FALSE; } /* * Magic - 8 bytes * Platform - sizeof (platform) * n - number of regexps * n * * n * * crc - 8 bytes checksum * */ memcpy (&crc, p + n * 2 * sizeof (gint), sizeof (crc)); rspamd_cryptobox_fast_hash_init (&crc_st, 0xdeadbabe); /* IDs */ rspamd_cryptobox_fast_hash_update (&crc_st, p, n * sizeof (gint)); /* Flags */ rspamd_cryptobox_fast_hash_update (&crc_st, p + n * sizeof (gint), n * sizeof (gint)); /* HS database */ p += n * sizeof (gint) * 2 + sizeof (guint64); rspamd_cryptobox_fast_hash_update (&crc_st, p, end - p); valid_crc = rspamd_cryptobox_fast_hash_final (&crc_st); if (crc != valid_crc) { msg_warn_re_cache ("outdated or invalid hs database in %s: " "crc read %xL, crc expected %xL", path, crc, valid_crc); munmap (map, len); return FALSE; } if ((ret = hs_deserialize_database (p, end - p, &test_db)) != HS_SUCCESS) { msg_err_re_cache ("bad hs database in %s: %d", path, ret); munmap (map, len); return FALSE; } hs_free_database (test_db); munmap (map, len); } /* XXX: add crc check */ return TRUE; } } if (!silent) { msg_warn_re_cache ("unknown hyperscan cache file %s", path); } return FALSE; #endif } gboolean rspamd_re_cache_load_hyperscan (struct rspamd_re_cache *cache, const char *cache_dir) { g_assert (cache != NULL); g_assert (cache_dir != NULL); #ifndef WITH_HYPERSCAN return FALSE; #else gchar path[PATH_MAX]; gint fd, i, n, *hs_ids = NULL, *hs_flags = NULL, total = 0, ret; GHashTableIter it; gpointer k, v; guint8 *map, *p, *end; struct rspamd_re_class *re_class; struct rspamd_re_cache_elt *elt; struct stat st; g_hash_table_iter_init (&it, cache->re_classes); while (g_hash_table_iter_next (&it, &k, &v)) { re_class = v; rspamd_snprintf (path, sizeof (path), "%s%c%s.hs", cache_dir, G_DIR_SEPARATOR, re_class->hash); if (rspamd_re_cache_is_valid_hyperscan_file (cache, path, FALSE, FALSE)) { msg_debug_re_cache ("load hyperscan database from '%s'", re_class->hash); fd = open (path, O_RDONLY); /* Read number of regexps */ g_assert (fd != -1); fstat (fd, &st); map = mmap (NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0); if (map == MAP_FAILED) { msg_err_re_cache ("cannot mmap %s: %s", path, strerror (errno)); close (fd); return FALSE; } close (fd); end = map + st.st_size; p = map + RSPAMD_HS_MAGIC_LEN + sizeof (cache->plt); n = *(gint *)p; if (n <= 0 || 2 * n * sizeof (gint) + /* IDs + flags */ sizeof (guint64) + /* crc */ RSPAMD_HS_MAGIC_LEN + /* header */ sizeof (cache->plt) > (gsize)st.st_size) { /* Some wrong amount of regexps */ msg_err_re_cache ("bad number of expressions in %s: %d", path, n); munmap (map, st.st_size); return FALSE; } total += n; p += sizeof (n); hs_ids = g_malloc (n * sizeof (*hs_ids)); memcpy (hs_ids, p, n * sizeof (*hs_ids)); p += n * sizeof (*hs_ids); hs_flags = g_malloc (n * sizeof (*hs_flags)); memcpy (hs_flags, p, n * sizeof (*hs_flags)); /* Skip crc */ p += n * sizeof (*hs_ids) + sizeof (guint64); /* Cleanup */ if (re_class->hs_scratch != NULL) { hs_free_scratch (re_class->hs_scratch); } if (re_class->hs_db != NULL) { hs_free_database (re_class->hs_db); } if (re_class->hs_ids) { g_free (re_class->hs_ids); } re_class->hs_ids = NULL; re_class->hs_scratch = NULL; re_class->hs_db = NULL; if ((ret = hs_deserialize_database (p, end - p, &re_class->hs_db)) != HS_SUCCESS) { msg_err_re_cache ("bad hs database in %s: %d", path, ret); munmap (map, st.st_size); g_free (hs_ids); g_free (hs_flags); return FALSE; } munmap (map, st.st_size); g_assert (hs_alloc_scratch (re_class->hs_db, &re_class->hs_scratch) == HS_SUCCESS); /* * Now find hyperscan elts that are successfully compiled and * specify that they should be matched using hyperscan */ for (i = 0; i < n; i ++) { g_assert ((gint)cache->re->len > hs_ids[i] && hs_ids[i] >= 0); elt = g_ptr_array_index (cache->re, hs_ids[i]); if (hs_flags[i] & HS_FLAG_PREFILTER) { elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN_PRE; } else { elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN; } } re_class->hs_ids = hs_ids; g_free (hs_flags); re_class->nhs = n; } else { msg_err_re_cache ("invalid hyperscan hash file '%s'", path); return FALSE; } } msg_info_re_cache ("hyperscan database of %d regexps has been loaded", total); cache->hyperscan_loaded = TRUE; return TRUE; #endif }