From 49b60a37470e59aeadc7c25a966a656d054a2bc2 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sun, 16 Oct 2022 15:22:27 +0100 Subject: [PATCH] [Project] (Re)implement hyperscan caching --- src/libutil/cxx/hyperscan_tools.cxx | 200 +++++++++++++++++++++++++++- 1 file changed, 197 insertions(+), 3 deletions(-) diff --git a/src/libutil/cxx/hyperscan_tools.cxx b/src/libutil/cxx/hyperscan_tools.cxx index 82664d810..f5086d1e4 100644 --- a/src/libutil/cxx/hyperscan_tools.cxx +++ b/src/libutil/cxx/hyperscan_tools.cxx @@ -20,10 +20,27 @@ #include "contrib/ankerl/unordered_dense.h" #include "contrib/ankerl/svector.h" #include "fmt/core.h" +#include "locked_file.hxx" +#include "hs.h" +#include "logger.h" +#include "locked_file.hxx" /* NOTE(review): locked_file.hxx is already added three lines above in this same hunk; this second include is redundant. */ #include /* for glob */ -#include /* for stat */ #include /* for unlink */ +#include +#include "unix-std.h" + +#define msg_info_hyperscan(...) rspamd_default_log_function (G_LOG_LEVEL_INFO, \ + "hyperscan", "", \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +#define msg_debug_hyperscan(...) 
rspamd_conditional_debug_fast (NULL, NULL, \ + rspamd_hyperscan_log_id, "hyperscan", "", \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE_PUBLIC(hyperscan) namespace rspamd::util { @@ -45,7 +62,7 @@ private: glob_t globbuf; auto glob_pattern = fmt::format("{}{}*.{}", - dir, G_DIR_SEPARATOR_S, ext); + dir, G_DIR_SEPARATOR_S, ext); memset(&globbuf, 0, sizeof(globbuf)); if (glob(glob_pattern.c_str(), 0, nullptr, &globbuf) == 0) { @@ -54,13 +71,20 @@ private: struct stat st; if (stat(path, &st) == -1) { + msg_debug_hyperscan("cannot stat file %s: %s", + path, strerror(errno)); continue; } if (S_ISREG(st.st_mode)) { if (!known_cached_files.contains(path)) { + msg_info_hyperscan("remove stale hyperscan file %s", path); unlink(path); } + else { + msg_debug_hyperscan("found known hyperscan file %s, size: %Hz", + path, st.st_size); + } } } } @@ -88,10 +112,180 @@ public: return *singleton; } + + /** Remember a cache file: append its directory and extension to cache_dirs and cache_extensions when they are not already present, insert its name into known_cached_files (the set consulted before a globbed file is unlinked as stale) and emit a debug log line. */ void add_cached_file(const raii_file &file) { + auto dir = file.get_dir(); + auto ext = file.get_extension(); + + if (std::find_if(cache_dirs.begin(), cache_dirs.end(), + [&](const auto& item){ return item == dir; }) == std::end(cache_dirs)) { + cache_dirs.emplace_back(std::string{dir}); + } + if (std::find_if(cache_extensions.begin(), cache_extensions.end(), + [&](const auto& item){ return item == ext; }) == std::end(cache_extensions)) { + cache_extensions.emplace_back(std::string{ext}); + } + + known_cached_files.insert(file.get_name()); + msg_debug_hyperscan("added new known hyperscan file: %*s", (int)file.get_name().size(), + file.get_name().data()); + } }; +/** + * This is a higher level representation of the cached hyperscan file + */ /* The destructor calls hs_free_database(db) only when maybe_map is empty; when a mapping is held, db is the mapping address (see hs_shared_from_unserialized) and is left to the maybe_map destructor. NOTE(review): a destructor is user-defined but no copy or move operations are declared - confirm the type cannot be copied, otherwise a copy without a mapping would free db twice. NOTE(review): the template argument of std::optional appears to have been stripped during extraction - presumably raii_mmaped_file, judging by the first constructor; verify against the repository. */ +struct hs_shared_database { + hs_database_t *db; /**< internal database (might be in a shared memory) */ + std::optional maybe_map; + + ~hs_shared_database() { + if (!maybe_map) { + hs_free_database(db); + } + // Otherwise, handled by maybe_map dtor + } + + explicit hs_shared_database(raii_mmaped_file &&map, hs_database_t *db) : db(db), 
maybe_map(std::move(map)) {} + explicit hs_shared_database(hs_database_t *db) : db(db), maybe_map(std::nullopt) {} +}; + /** Wrap an mmapped file that already holds a deserialized database: the mapping address itself is used as the hs_database_t pointer and the mapping is moved into the result. NOTE(review): the template arguments of tl::expected look stripped by extraction here and in the functions below. */ +static auto +hs_shared_from_unserialized(raii_mmaped_file &&map) -> tl::expected +{ + auto ptr = map.get_map(); + return tl::expected{tl::in_place, std::move(map), (hs_database_t *)ptr}; +} + /** Build a database from a mapped serialized file by calling hs_deserialize_database on the mapped bytes; the result is constructed from the target pointer alone, so maybe_map is empty. Any status other than HS_SUCCESS yields the error text cannot deserialize database. */ +static auto +hs_shared_from_serialized(raii_mmaped_file &&map) -> tl::expected +{ + hs_database_t *target = nullptr; + + if (auto ret = hs_deserialize_database((const char *)map.get_map(), map.get_size(), &target); ret != HS_SUCCESS) { + return tl::make_unexpected("cannot deserialize database"); + } + + return tl::expected{tl::in_place, target}; +} + /** Map fname read-only and return a database for it. When the version guard below holds, it first tries to create the file named fname plus the suffix .unser exclusively; on success it writes the output of hs_deserialize_database_at into a temporary file and renames that file onto the .unser name, and if the exclusive create fails it opens the existing .unser file read-only. A non-empty .unser file is mmapped and used in place; an empty or unavailable one falls back to hs_shared_from_serialized. Every file used is registered through hs_known_files_cache::add_cached_file. NOTE(review): write() is called once, so a short write is not detected. */ +auto load_cached_hs_file(const char *fname, bool need_cleanup) -> tl::expected +{ + auto &hs_cache = hs_known_files_cache::get(need_cleanup); + + return raii_mmaped_file::mmap_shared(fname, O_RDONLY, PROT_READ) + .and_then([&](T &&cached_serialized) -> tl::expected { /* NOTE(review): this guard requires HS_MINOR >= 4 whatever HS_MAJOR is, so for example a 6.0 release would take the #else branch; (HS_MAJOR > 5 || (HS_MAJOR == 5 && HS_MINOR >= 4)) is probably what was meant - confirm. */ +#if defined(HS_MAJOR) && defined(HS_MINOR) && HS_MAJOR >= 5 && HS_MINOR >= 4 + auto unserialized_fname = fmt::format("{}.unser", fname); + auto unserialized_file = raii_locked_file::create(unserialized_fname.c_str(), O_CREAT | O_RDWR | O_EXCL, + 00644) + .and_then([&](auto &&new_file_locked) -> tl::expected { + auto tmpfile_pattern = fmt::format("{}{}hsmp-XXXXXXXXXXXXXXXXXX", + cached_serialized.get_file().get_dir(), G_DIR_SEPARATOR); + auto tmpfile = raii_locked_file::mkstemp(tmpfile_pattern.data(), O_CREAT | O_RDWR | O_EXCL, + 00644); + + if (!tmpfile) { + return tl::make_unexpected(tmpfile.error()); + } + else { + auto &tmpfile_checked = tmpfile.value(); + std::size_t unserialized_size; /* NOTE(review): unserialized_size has no initializer and the status returned by hs_serialized_database_size below is ignored, so a failure there leaves the size indeterminate for the log line, the allocation and the write. */ + + hs_serialized_database_size((const char *)cached_serialized.get_map(), + cached_serialized.get_size(), &unserialized_size); + + msg_debug("multipattern: create new database in %s; %Hz size", + tmpfile_pattern.data(), unserialized_size); + void *buf; /* NOTE(review): buf has no initializer and the return value of posix_memalign is discarded, so the NULL test below can read an indeterminate pointer when the allocation fails; initialise buf to nullptr and test the return code instead. Also, the hs_deserialize_database_at failure return further down leaves without free(buf), unlike both write() branches, so that path leaks the buffer. */ + posix_memalign(&buf, 16, unserialized_size); + if (buf == NULL) { + return tl::make_unexpected("Cannot 
allocate memory"); + } + + // Store owned string + auto tmpfile_name = std::string{tmpfile_checked.get_name()}; + + if (auto ret = hs_deserialize_database_at((const char *)cached_serialized.get_map(), + cached_serialized.get_size(), (hs_database_t *) buf); ret != HS_SUCCESS) { + return tl::make_unexpected( + fmt::format("cannot deserialize hyperscan database: {}", ret)); + } + else { + if (write(tmpfile_checked.get_fd(), buf, unserialized_size) == -1) { + free(buf); + return tl::make_unexpected(fmt::format("cannot write to {}: {}", + tmpfile_name, ::strerror(errno))); + } + else { + free(buf); + + /* + * Unlink target file before renaming to avoid + * race condition. + * So what we have is that `new_file_locked` + * will have flock on that file, so it will be + * replaced after unlink safely, and also unlocked. + */ + (void) unlink(unserialized_fname.c_str()); + if (rename(tmpfile_name.c_str(), + unserialized_fname.c_str()) == -1) { + if (errno != EEXIST) { + msg_err("cannot rename %s -> %s: %s", + tmpfile_name.c_str(), + unserialized_fname.c_str(), + strerror(errno)); + } + } + else { + /* Unlock file but mark it as immortal first to avoid deletion */ + tmpfile_checked.make_immortal(); + (void) tmpfile_checked.unlock(); + } + } + } + /* Reopen in RO mode */ + return raii_file::open(unserialized_fname.c_str(), O_RDONLY); + }; + }) + .or_else([&](auto unused) -> tl::expected { + // Cannot create file, so try to open it in RO mode + return raii_file::open(unserialized_fname.c_str(), O_RDONLY); + }); + + hs_cache.add_cached_file(cached_serialized.get_file()); + + if (unserialized_file.has_value()) { + + auto &unserialized_checked = unserialized_file.value(); + hs_cache.add_cached_file(unserialized_checked); + + if (unserialized_checked.get_size() == 0) { + /* + * This is a case when we have a file that is currently + * being created by another process. + * We cannot use it! 
+ */ + return hs_shared_from_serialized(std::forward(cached_serialized)); + } + else { + return raii_mmaped_file::mmap_shared(std::move(unserialized_checked), PROT_READ) + .and_then([&](U &&mmapped_unserialized) -> auto { + return hs_shared_from_unserialized(std::forward(mmapped_unserialized)); + }); + } + } + else { + return hs_shared_from_serialized(std::forward(cached_serialized)); + } +#else // defined(HS_MAJOR) && defined(HS_MINOR) && HS_MAJOR >= 5 && HS_MINOR >= 4 + hs_cache.add_cached_file(cached_serialized.get_file()); + return hs_shared_from_serialized(std::forward(cached_serialized)); +#endif // defined(HS_MAJOR) && defined(HS_MINOR) && HS_MAJOR >= 5 && HS_MINOR >= 4 + }); +} } // namespace rspamd::util -#endif \ No newline at end of file +#endif // WITH_HYPERSCAN \ No newline at end of file -- 2.39.5