From fffeb9ff378e41e1b7c7bfb9fb4215261fd3c636 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 22 Oct 2022 15:52:59 +0100 Subject: [PATCH] [Rework] Convert multipattern to use hyperscan tools --- src/libserver/hyperscan_tools.cxx | 73 ++++++++++++++- src/libserver/hyperscan_tools.h | 13 +++ src/libutil/multipattern.c | 151 ++++-------------------------- 3 files changed, 103 insertions(+), 134 deletions(-) diff --git a/src/libserver/hyperscan_tools.cxx b/src/libserver/hyperscan_tools.cxx index 309f821dd..4f3ac013a 100644 --- a/src/libserver/hyperscan_tools.cxx +++ b/src/libserver/hyperscan_tools.cxx @@ -108,6 +108,38 @@ private: } } } + /* Have to duplicate raii_file methods to use raw filenames */ + static auto get_dir(std::string_view fname) -> std::string_view { + auto sep_pos = fname.rfind(G_DIR_SEPARATOR); + + if (sep_pos == std::string::npos) { + return std::string_view{fname}; + } + + while (sep_pos >= 1 && fname[sep_pos - 1] == G_DIR_SEPARATOR) { + sep_pos --; + } + + return std::string_view{fname.data(), sep_pos}; + } + + static auto get_extension(std::string_view fname) -> std::string_view { + auto sep_pos = fname.rfind(G_DIR_SEPARATOR); + + if (sep_pos == std::string::npos) { + sep_pos = 0; + } + + auto filename = std::string_view{fname.data() + sep_pos}; + auto dot_pos = filename.find('.'); + + if (dot_pos == std::string::npos) { + return std::string_view{}; + } + else { + return std::string_view{filename.data() + dot_pos + 1, filename.size() - dot_pos - 1}; + } + } public: hs_known_files_cache(const hs_known_files_cache &) = delete; hs_known_files_cache(hs_known_files_cache &&) = delete; @@ -135,10 +167,31 @@ public: cache_extensions.emplace_back(std::string{ext}); } - known_cached_files.insert(file.get_name()); - msg_debug_hyperscan("added new known hyperscan file: %*s", (int)file.get_name().size(), + auto is_known = known_cached_files.insert(file.get_name()); + msg_debug_hyperscan("added %s known hyperscan file: %*s", + is_known.second ? 
"new" : "already", + (int)file.get_name().size(), file.get_name().data()); } + + void add_cached_file(const char *fname) { + auto dir = hs_known_files_cache::get_dir(fname); + auto ext = hs_known_files_cache::get_extension(fname); + + if (std::find_if(cache_dirs.begin(), cache_dirs.end(), + [&](const auto& item){ return item == dir; }) == std::end(cache_dirs)) { + cache_dirs.emplace_back(std::string{dir}); + } + if (std::find_if(cache_extensions.begin(), cache_extensions.end(), + [&](const auto& item){ return item == ext; }) == std::end(cache_extensions)) { + cache_extensions.emplace_back(std::string{ext}); + } + + auto is_known = known_cached_files.insert(fname); + msg_debug_hyperscan("added %s known hyperscan file: %s", + is_known.second ? "new" : "already", + fname); + } }; @@ -312,7 +365,7 @@ auto load_cached_hs_file(const char *fname) -> tl::expected(obj)) rspamd_hyperscan_t * -rspamd_maybe_load_hyperscan(const char *filename) +rspamd_hyperscan_maybe_load(const char *filename) { auto maybe_db = rspamd::util::load_cached_hs_file(filename); @@ -350,6 +403,14 @@ rspamd_hyperscan_get_database(rspamd_hyperscan_t *db) return real_db->db; } +rspamd_hyperscan_t * +rspamd_hyperscan_from_raw_db(hs_database_t *db) +{ + auto *ndb = new rspamd::util::hs_shared_database{db}; + + return C_DB_FROM_CXX(ndb); +} + void rspamd_hyperscan_free(rspamd_hyperscan_t *db) { @@ -358,4 +419,10 @@ rspamd_hyperscan_free(rspamd_hyperscan_t *db) delete real_db; } +void +rspamd_hyperscan_notice_known(const char *fname) +{ + rspamd::util::hs_known_files_cache::get().add_cached_file(fname); +} + #endif // WITH_HYPERSCAN \ No newline at end of file diff --git a/src/libserver/hyperscan_tools.h b/src/libserver/hyperscan_tools.h index 31139e6af..50ca51543 100644 --- a/src/libserver/hyperscan_tools.h +++ b/src/libserver/hyperscan_tools.h @@ -35,6 +35,13 @@ typedef struct rspamd_hyperscan_s rspamd_hyperscan_t; * @return cached database if available */ rspamd_hyperscan_t 
*rspamd_hyperscan_maybe_load(const char *filename); + +/** + * Creates a wrapper for a raw hs db. Ownership is transferred to the enclosing object returned + * @param db raw hyperscan database; ownership passes to the returned wrapper + * @return wrapper object around db, to be released with rspamd_hyperscan_free() + */ +rspamd_hyperscan_t *rspamd_hyperscan_from_raw_db(hs_database_t *db); /** * Get the internal database * @param db @@ -47,6 +54,12 @@ hs_database_t* rspamd_hyperscan_get_database(rspamd_hyperscan_t *db); */ void rspamd_hyperscan_free(rspamd_hyperscan_t *db); +/** + * Notice a known hyperscan file (e.g. externally serialized) + * @param fname + */ +void rspamd_hyperscan_notice_known(const char *fname); + G_END_DECLS #endif diff --git a/src/libutil/multipattern.c b/src/libutil/multipattern.c index d795da3b4..dd9a37cec 100644 --- a/src/libutil/multipattern.c +++ b/src/libutil/multipattern.c @@ -23,6 +23,7 @@ #include "logger.h" #include "unix-std.h" #include "hs.h" +#include "libserver/hyperscan_tools.h" #endif #include "acism.h" #include "libutil/regexp.h" @@ -43,15 +44,12 @@ static enum rspamd_hs_check_state hs_suitable_cpu = RSPAMD_HS_UNCHECKED; struct RSPAMD_ALIGNED(64) rspamd_multipattern { #ifdef WITH_HYPERSCAN rspamd_cryptobox_hash_state_t hash_state; - hs_database_t *db; + rspamd_hyperscan_t *hs_db; hs_scratch_t *scratch[MAX_SCRATCH]; GArray *hs_pats; GArray *hs_ids; GArray *hs_flags; guint scratch_used; - /* If serialized into shared memory */ - gboolean unser_fd; - gsize unser_size; #endif ac_trie_t *t; GArray *pats; @@ -403,8 +401,6 @@ rspamd_multipattern_try_load_hs (struct rspamd_multipattern *mp, const guchar *hash) { gchar fp[PATH_MAX]; - gpointer map; - gsize len; if (hs_cache_dir == NULL) { return FALSE; @@ -412,119 +408,9 @@ rspamd_multipattern_try_load_hs (struct rspamd_multipattern *mp, rspamd_snprintf (fp, sizeof (fp), "%s/%*xs.hsmp", hs_cache_dir, (gint)rspamd_cryptobox_HASHBYTES / 2, hash); + mp->hs_db = rspamd_hyperscan_maybe_load(fp); - if ((map = rspamd_file_xmap (fp, PROT_READ, &len, TRUE)) != NULL) { - - mp->unser_fd = -1; -#if defined(HS_MAJOR) && 
defined(HS_MINOR) && HS_MAJOR >= 5 && HS_MINOR >= 4 - /* Here is a logic to use a shared memory for hyperscan database */ - rspamd_snprintf (fp, sizeof (fp), "%s/%*xs.hsmp.unser", hs_cache_dir, - (gint)rspamd_cryptobox_HASHBYTES / 2, hash); - /* Try to create a new file and lock it */ - mp->unser_fd = rspamd_file_xopen (fp, O_CREAT|O_RDWR|O_EXCL, 00644, false); - if (mp->unser_fd == -1) { - /* A file can be already existing */ - mp->unser_fd = rspamd_file_xopen (fp, O_RDONLY, 00644, false); - } - else { - /* Allocate new file, write database and reopen it in RO mode afterwards */ - gchar tmpfp[PATH_MAX]; - rspamd_snprintf (tmpfp, sizeof (tmpfp), "%s/hsmp-XXXXXXXXXXXXXXXXXX", hs_cache_dir); - int tmp_fd = g_mkstemp_full(tmpfp, O_CREAT|O_RDWR|O_EXCL, 00600); - g_assert(tmp_fd != -1); - hs_serialized_database_size (map, len, &mp->unser_size); - msg_debug("multipattern: create new database in %s; %Hz size", tmpfp, mp->unser_size); - void *buf; - posix_memalign(&buf, 16, mp->unser_size); - if (buf == NULL) { - g_abort(); - } - - int ret; - - if ((ret = hs_deserialize_database_at (map, len, (hs_database_t *)buf)) != HS_SUCCESS) { - msg_err ("cannot deserialize hyperscan database: %d", ret); - (void)unlink(tmpfp); - close (tmp_fd); - mp->unser_fd = -1; - free (buf); - } - else { - if (write(tmp_fd, buf, mp->unser_size) == -1) { - msg_err ("cannot write to %s: %s", fp, strerror(errno)); - close(tmp_fd); - (void)unlink(tmpfp); - mp->unser_fd = -1; - free(buf); - } - else { - free(buf); - if (rename(tmpfp, fp) == -1) { - if (errno != EEXIST) { - msg_err("cannot rename %s -> %s: %s", tmpfp, fp, - strerror(errno)); - } - (void)unlink(tmpfp); - close(tmp_fd); - } - else { - (void) unlink(tmpfp); - close(tmp_fd); - } - /* Reopen in RO mode */ - mp->unser_fd = rspamd_file_xopen (fp, O_RDONLY, 00644, false); - } - } - - } -#endif - if (mp->unser_fd != -1) { - /* We have a prepared database, so we can just use it */ - struct stat st; - - g_assert(fstat(mp->unser_fd, &st) != -1); - 
mp->unser_size = st.st_size; - mp->db = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, mp->unser_fd, 0); - - if (mp->db == MAP_FAILED) { - mp->db = NULL; - msg_err ("cannot open cached hyperscan database: %s", strerror(errno)); - close(mp->unser_fd); - mp->unser_fd = -1; - mp->unser_size = 0; - (void)unlink(fp); - } - else { - close(mp->unser_fd); - mp->unser_fd = -1; - msg_debug("multipattern: loaded hyperscan db from: %s, size = %Hz", fp, mp->unser_size); - - return TRUE; - } - munmap(map, len); - - } - else { - int ret; - if ((ret = hs_deserialize_database(map, len, &mp->db)) == HS_SUCCESS) { - munmap(map, len); - return TRUE; - } - else { - msg_err ("cannot deserialize hyperscan database: %d", ret); - } - } - - munmap (map, len); - if (mp->unser_fd != -1) { - close (mp->unser_fd); - munmap (mp->db, mp->unser_size); - } - /* Remove stale file */ - (void)unlink (fp); - } - - return FALSE; + return mp->hs_db != NULL; } static void @@ -544,7 +430,7 @@ rspamd_multipattern_try_save_hs (struct rspamd_multipattern *mp, (gint)rspamd_cryptobox_HASHBYTES / 2, hash); if ((fd = rspamd_file_xopen (fp, O_WRONLY | O_CREAT | O_EXCL, 00644, 0)) != -1) { - if (hs_serialize_database (mp->db, &bytes, &len) == HS_SUCCESS) { + if (hs_serialize_database (rspamd_hyperscan_get_database(mp->hs_db), &bytes, &len) == HS_SUCCESS) { if (write (fd, bytes, len) == -1) { msg_warn ("cannot write hyperscan cache to %s: %s", fp, strerror (errno)); @@ -563,6 +449,9 @@ rspamd_multipattern_try_save_hs (struct rspamd_multipattern *mp, fp, np, strerror (errno)); unlink (fp); } + else { + rspamd_hyperscan_notice_known(np); + } } } else { @@ -596,13 +485,15 @@ rspamd_multipattern_compile (struct rspamd_multipattern *mp, GError **err) rspamd_cryptobox_hash_final (&mp->hash_state, hash); if (!rspamd_multipattern_try_load_hs (mp, hash)) { + hs_database_t *db = NULL; + if (hs_compile_multi ((const char *const *)mp->hs_pats->data, (const unsigned int *)mp->hs_flags->data, (const unsigned int 
*)mp->hs_ids->data, mp->cnt, HS_MODE_BLOCK, &plt, - &mp->db, + &db, &hs_errors) != HS_SUCCESS) { g_set_error (err, rspamd_multipattern_quark (), EINVAL, @@ -613,12 +504,17 @@ rspamd_multipattern_compile (struct rspamd_multipattern *mp, GError **err) return FALSE; } + mp->hs_db = rspamd_hyperscan_from_raw_db(db); } rspamd_multipattern_try_save_hs (mp, hash); for (i = 0; i < MAX_SCRATCH; i ++) { - g_assert (hs_alloc_scratch (mp->db, &mp->scratch[i]) == HS_SUCCESS); + int ret; + if ((ret = hs_alloc_scratch (rspamd_hyperscan_get_database(mp->hs_db), &mp->scratch[i])) != HS_SUCCESS) { + msg_err("fatal error: cannot allocate scratch space for hyperscan: %d", ret); + g_abort(); + } } } @@ -755,7 +651,7 @@ rspamd_multipattern_lookup (struct rspamd_multipattern *mp, g_assert (scr != NULL); - ret = hs_scan (mp->db, in, len, 0, scr, + ret = hs_scan (rspamd_hyperscan_get_database(mp->hs_db), in, len, 0, scr, rspamd_multipattern_hs_cb, &cbd); mp->scratch_used &= ~(1 << i); @@ -831,15 +727,8 @@ rspamd_multipattern_destroy (struct rspamd_multipattern *mp) hs_free_scratch (mp->scratch[i]); } - if (mp->db) { - if (mp->unser_size) { - /* Mmapped database */ - munmap(mp->db, mp->unser_size); - } - else { - /* Allocated database */ - hs_free_database (mp->db); - } + if (mp->hs_db) { + rspamd_hyperscan_free(mp->hs_db); } } -- 2.39.5