From c85595e6230a5563055eabd69135c8342d31b207 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sun, 23 Oct 2022 21:37:38 +0100 Subject: [PATCH] [Project] Rework cleanup --- src/libserver/hyperscan_tools.cxx | 115 ++++++++++-------- src/libserver/hyperscan_tools.h | 5 + src/libserver/maps/map_helpers.c | 194 +++--------------------------- src/rspamd.c | 7 ++ 4 files changed, 98 insertions(+), 223 deletions(-) diff --git a/src/libserver/hyperscan_tools.cxx b/src/libserver/hyperscan_tools.cxx index 3383915c3..6ec5f7c36 100644 --- a/src/libserver/hyperscan_tools.cxx +++ b/src/libserver/hyperscan_tools.cxx @@ -67,49 +67,9 @@ private: virtual ~hs_known_files_cache() { // Cleanup cache dir - /* We clean dir merely if we are running from the main process */ - if (rspamd_current_worker == nullptr) { - auto cleanup_dir = [&](std::string_view dir) -> void { - for (const auto &ext : cache_extensions) { - glob_t globbuf; - - auto glob_pattern = fmt::format("{}{}*.{}", - dir, G_DIR_SEPARATOR_S, ext); - memset(&globbuf, 0, sizeof(globbuf)); - - if (glob(glob_pattern.c_str(), 0, nullptr, &globbuf) == 0) { - for (auto i = 0; i < globbuf.gl_pathc; i++) { - const auto *path = globbuf.gl_pathv[i]; - struct stat st; - - if (stat(path, &st) == -1) { - msg_debug_hyperscan("cannot stat file %s: %s", - path, strerror(errno)); - continue; - } - - if (S_ISREG(st.st_mode)) { - if (!known_cached_files.contains(path)) { - msg_info_hyperscan("remove stale hyperscan file %s", path); - unlink(path); - } - else { - msg_debug_hyperscan("found known hyperscan file %s, size: %Hz", - path, st.st_size); - } - } - } - } - - globfree(&globbuf); - } - }; - - for (const auto &dir: cache_dirs) { - cleanup_dir(dir); - } - } + cleanup_maybe(); } + /* Have to duplicate raii_file methods to use raw filenames */ static auto get_dir(std::string_view fname) -> std::string_view { auto sep_pos = fname.rfind(G_DIR_SEPARATOR); @@ -177,8 +137,13 @@ public: } void add_cached_file(const char *fname) { - auto dir = hs_known_files_cache::get_dir(fname); - auto ext = hs_known_files_cache::get_extension(fname); + + auto mut_fname = std::string{fname}; + std::size_t sz; + rspamd_http_normalize_path_inplace(mut_fname.data(), mut_fname.size(), &sz); + mut_fname.resize(sz); + auto dir = hs_known_files_cache::get_dir(mut_fname); + auto ext = hs_known_files_cache::get_extension(mut_fname); if (std::find_if(cache_dirs.begin(), cache_dirs.end(), [&](const auto& item){ return item == dir; }) == std::end(cache_dirs)) { @@ -189,10 +154,60 @@ public: cache_extensions.emplace_back(std::string{ext}); } - auto is_known = known_cached_files.insert(fname); + auto is_known = known_cached_files.insert(mut_fname); msg_debug_hyperscan("added %s known hyperscan file: %s", is_known.second ? "new" : "already", - fname); + mut_fname.c_str()); + } + + auto cleanup_maybe() -> void { + /* We clean dir merely if we are running from the main process */ + if (rspamd_current_worker == nullptr) { + auto cleanup_dir = [&](std::string_view dir) -> void { + for (const auto &ext : cache_extensions) { + glob_t globbuf; + + auto glob_pattern = fmt::format("{}{}*.{}", + dir, G_DIR_SEPARATOR_S, ext); + memset(&globbuf, 0, sizeof(globbuf)); + + if (glob(glob_pattern.c_str(), 0, nullptr, &globbuf) == 0) { + for (auto i = 0; i < globbuf.gl_pathc; i++) { + const auto *path = globbuf.gl_pathv[i]; + struct stat st; + + if (stat(path, &st) == -1) { + msg_debug_hyperscan("cannot stat file %s: %s", + path, strerror(errno)); + continue; + } + + if (S_ISREG(st.st_mode)) { + if (!known_cached_files.contains(path)) { + msg_info_hyperscan("remove stale hyperscan file %s", path); + unlink(path); + } + else { + msg_debug_hyperscan("found known hyperscan file %s, size: %Hz", + path, st.st_size); + } + } + } + } + + globfree(&globbuf); + } + }; + + for (const auto &dir: cache_dirs) { + msg_debug_hyperscan("cleaning up directory %s", dir.c_str()); + cleanup_dir(dir); + } + + cache_dirs.clear(); + cache_extensions.clear(); + known_cached_files.clear(); + } } }; @@ -333,7 +348,6 @@ auto load_cached_hs_file(const char *fname) -> tl::expected tl::expected(cached_serialized)); } else { + hs_cache.add_cached_file(unserialized_checked); return raii_mmaped_file::mmap_shared(std::move(unserialized_checked), PROT_READ) .and_then([&](U &&mmapped_unserialized) -> auto { return hs_shared_from_unserialized(std::forward(mmapped_unserialized)); @@ -444,4 +459,10 @@ rspamd_hyperscan_notice_known(const char *fname) } } +void +rspamd_hyperscan_cleanup_maybe(void) +{ + rspamd::util::hs_known_files_cache::get().cleanup_maybe(); +} + #endif // WITH_HYPERSCAN \ No newline at end of file diff --git a/src/libserver/hyperscan_tools.h b/src/libserver/hyperscan_tools.h index 50ca51543..5d50e07ec 100644 --- a/src/libserver/hyperscan_tools.h +++ b/src/libserver/hyperscan_tools.h @@ -60,6 +60,11 @@ void rspamd_hyperscan_free(rspamd_hyperscan_t *db); */ void rspamd_hyperscan_notice_known(const char *fname); +/** + * Cleans up old files. This method should be called on config free (in the main process) + */ +void rspamd_hyperscan_cleanup_maybe(void); + G_END_DECLS #endif diff --git a/src/libserver/maps/map_helpers.c b/src/libserver/maps/map_helpers.c index 6381e6d51..8850d052c 100644 --- a/src/libserver/maps/map_helpers.c +++ b/src/libserver/maps/map_helpers.c @@ -26,6 +26,7 @@ #ifdef WITH_HYPERSCAN #include "hs.h" +#include "hyperscan_tools.h" #endif #ifndef WITH_PCRE2 #include @@ -82,7 +83,7 @@ struct rspamd_regexp_map_helper { khash_t(rspamd_map_hash) *htb; enum rspamd_regexp_map_flags map_flags; #ifdef WITH_HYPERSCAN - hs_database_t *hs_db; + rspamd_hyperscan_t *hs_db; hs_scratch_t *hs_scratch; gchar **patterns; gint *flags; @@ -883,7 +884,7 @@ rspamd_map_helper_destroy_regexp (struct rspamd_regexp_map_helper *re_map) hs_free_scratch (re_map->hs_scratch); } if (re_map->hs_db) { - hs_free_database (re_map->hs_db); + rspamd_hyperscan_free(re_map->hs_db); } if (re_map->patterns) { for (i = 0; i < re_map->regexps->len; i ++) { @@ -1055,112 +1056,11 @@ rspamd_radix_dtor (struct map_cb_data *data) } #ifdef WITH_HYPERSCAN -struct rspamd_re_maps_cache_dtor_cbdata { - struct rspamd_config *cfg; - GHashTable *valid_re_hashes; - gchar *dirname; -}; - -static void -rspamd_re_maps_cache_cleanup_dtor (gpointer ud) -{ - struct rspamd_re_maps_cache_dtor_cbdata *cbd = - (struct rspamd_re_maps_cache_dtor_cbdata *)ud; - GPtrArray *cache_files; - GError *err = NULL; - struct rspamd_config *cfg; - - cfg = cbd->cfg; - - if (cfg->cur_worker != NULL) { - /* Skip dtor, limit it to main process only */ - return; - } - - cache_files = rspamd_glob_path (cbd->dirname, "*.hsmc", FALSE, &err); - - if (!cache_files) { - msg_err_config ("cannot glob files in %s: %e", cbd->dirname, err); - g_error_free (err); - } - else { - const gchar *fname; - guint i; - - PTR_ARRAY_FOREACH (cache_files, i, fname) { - gchar *basename = g_path_get_basename (fname); - - if (g_hash_table_lookup (cbd->valid_re_hashes, basename) == NULL) { - gchar *dir; - - dir = g_path_get_dirname (fname); - - /* Sanity check to avoid removal of something bad */ - if (strcmp (dir, cbd->dirname) != 0) { - msg_err_config ("bogus file found: %s in %s, skip deleting", - fname, dir); - } - else { - if (unlink (fname) == -1) { - msg_err_config ("cannot delete obsolete file %s in %s: %s", - fname, dir, strerror (errno)); - } - else { - msg_info_config ("deleted obsolete file %s in %s", - fname, dir); - } - } - - g_free (dir); - } - else { - msg_debug_config ("valid re cache file %s", fname); - } - - g_free (basename); - } - - g_ptr_array_free (cache_files, TRUE); - } - - g_hash_table_unref (cbd->valid_re_hashes); - g_free (cbd->dirname); -} - -static void -rspamd_re_map_cache_update (const gchar *fname, struct rspamd_config *cfg) -{ - GHashTable *valid_re_hashes; - - valid_re_hashes = rspamd_mempool_get_variable (cfg->cfg_pool, - RSPAMD_MEMPOOL_RE_MAPS_CACHE); - - if (!valid_re_hashes) { - valid_re_hashes = g_hash_table_new_full (g_str_hash, g_str_equal, - g_free, NULL); - rspamd_mempool_set_variable (cfg->cfg_pool, - RSPAMD_MEMPOOL_RE_MAPS_CACHE, - valid_re_hashes, (rspamd_mempool_destruct_t)g_hash_table_unref); - - /* We also add a cleanup dtor for all hashes */ - static struct rspamd_re_maps_cache_dtor_cbdata cbd; - - cbd.valid_re_hashes = g_hash_table_ref (valid_re_hashes); - cbd.cfg = cfg; - cbd.dirname = g_path_get_dirname (fname); - rspamd_mempool_add_destructor (cfg->cfg_pool, - rspamd_re_maps_cache_cleanup_dtor, &cbd); - } - - g_hash_table_insert (valid_re_hashes, g_path_get_basename (fname), "1"); -} static gboolean rspamd_try_load_re_map_cache (struct rspamd_regexp_map_helper *re_map) { gchar fp[PATH_MAX]; - gpointer data; - gsize len; struct rspamd_map *map; map = re_map->map; @@ -1173,25 +1073,9 @@ rspamd_try_load_re_map_cache (struct rspamd_regexp_map_helper *re_map) map->cfg->hs_cache_dir, (gint)rspamd_cryptobox_HASHBYTES / 2, re_map->re_digest); - if ((data = rspamd_file_xmap (fp, PROT_READ, &len, TRUE)) != NULL) { - if (hs_deserialize_database (data, len, &re_map->hs_db) == HS_SUCCESS) { - rspamd_re_map_cache_update (fp, map->cfg); - munmap (data, len); - - msg_info_map ("loaded hypersan cache from %s (%Hz length) for %s", - fp, len, map->name); - - return TRUE; - } - - msg_info_map ("invalid hypersan cache in %s (%Hz length) for %s, removing file", - fp, len, map->name); - munmap (data, len); - /* Remove stale file */ - (void)unlink (fp); - } + re_map->hs_db = rspamd_hyperscan_maybe_load(fp); - return FALSE; + return re_map->hs_db != NULL; } static gboolean @@ -1214,7 +1098,7 @@ rspamd_try_save_re_map_cache (struct rspamd_regexp_map_helper *re_map) (gint)rspamd_cryptobox_HASHBYTES / 2, re_map->re_digest); if ((fd = rspamd_file_xopen (fp, O_WRONLY | O_CREAT | O_EXCL, 00644, 0)) != -1) { - if (hs_serialize_database (re_map->hs_db, &bytes, &len) == HS_SUCCESS) { + if (hs_serialize_database (rspamd_hyperscan_get_database(re_map->hs_db), &bytes, &len) == HS_SUCCESS) { if (write (fd, bytes, len) == -1) { msg_warn_map ("cannot write hyperscan cache to %s: %s", fp, strerror (errno)); @@ -1237,8 +1121,7 @@ rspamd_try_save_re_map_cache (struct rspamd_regexp_map_helper *re_map) else { msg_info_map ("written cached hyperscan data for %s to %s (%Hz length)", map->name, np, len); - - rspamd_re_map_cache_update (np, map->cfg); + rspamd_hyperscan_notice_known(np); } } } @@ -1255,43 +1138,6 @@ rspamd_try_save_re_map_cache (struct rspamd_regexp_map_helper *re_map) return FALSE; } -static gboolean -rspamd_re_map_cache_cleanup_old (struct rspamd_regexp_map_helper *old_re_map) -{ - gchar fp[PATH_MAX]; - struct rspamd_map *map; - gboolean ret = TRUE; - - map = old_re_map->map; - - if (!map->cfg->hs_cache_dir) { - return FALSE; - } - - rspamd_snprintf (fp, sizeof (fp), "%s/%*xs.hsmc", - map->cfg->hs_cache_dir, - (gint)rspamd_cryptobox_HASHBYTES / 2, old_re_map->re_digest); - - msg_info_map ("unlink stale cache file for %s: %s", map->name, fp); - - if (unlink (fp) == -1) { - msg_warn_map ("cannot unlink stale cache file for %s (%s): %s", - map->name, fp, strerror (errno)); - ret = FALSE; - } - - GHashTable *valid_re_hashes; - - valid_re_hashes = rspamd_mempool_get_variable (map->cfg->cfg_pool, - RSPAMD_MEMPOOL_RE_MAPS_CACHE); - - if (valid_re_hashes) { - g_hash_table_remove (valid_re_hashes, fp); - } - - return ret; -} - #endif static void @@ -1376,6 +1222,7 @@ rspamd_re_map_finalize (struct rspamd_regexp_map_helper *re_map) if (!rspamd_try_load_re_map_cache (re_map)) { gdouble ts1 = rspamd_get_ticks (FALSE); + hs_database_t *hs_db = NULL; if (hs_compile_multi ((const gchar **) re_map->patterns, re_map->flags, @@ -1383,7 +1230,7 @@ rspamd_re_map_finalize (struct rspamd_regexp_map_helper *re_map) re_map->regexps->len, HS_MODE_BLOCK, &plt, - &re_map->hs_db, + &hs_db, &err) != HS_SUCCESS) { msg_err_map ("cannot create tree of regexp when processing '%s': %s", @@ -1396,6 +1243,8 @@ rspamd_re_map_finalize (struct rspamd_regexp_map_helper *re_map) return; } + re_map->hs_db = rspamd_hyperscan_from_raw_db(hs_db); + ts1 = (rspamd_get_ticks (FALSE) - ts1) * 1000.0; msg_info_map ("hyperscan compiled %d regular expressions from %s in %.1f ms", re_map->regexps->len, re_map->map->name, ts1); @@ -1406,9 +1255,9 @@ rspamd_re_map_finalize (struct rspamd_regexp_map_helper *re_map) re_map->regexps->len, re_map->map->name); } - if (hs_alloc_scratch (re_map->hs_db, &re_map->hs_scratch) != HS_SUCCESS) { + if (hs_alloc_scratch (rspamd_hyperscan_get_database(re_map->hs_db), &re_map->hs_scratch) != HS_SUCCESS) { msg_err_map ("cannot allocate scratch space for hyperscan"); - hs_free_database (re_map->hs_db); + rspamd_hyperscan_free(re_map->hs_db); re_map->hs_db = NULL; } } @@ -1547,15 +1396,6 @@ rspamd_regexp_list_fin (struct map_cb_data *data, void **target) if (data->prev_data) { old_re_map = data->prev_data; - -#ifdef WITH_HYPERSCAN - if (re_map && memcmp(re_map->re_digest, old_re_map->re_digest, - sizeof(re_map->re_digest)) != 0) { - /* Cleanup old stuff */ - rspamd_re_map_cache_cleanup_old(old_re_map); - } -#endif - rspamd_map_helper_destroy_regexp(old_re_map); } } @@ -1614,8 +1454,9 @@ rspamd_match_regexp_map_single (struct rspamd_regexp_map_helper *map, if (validated) { - res = hs_scan (map->hs_db, in, len, 0, map->hs_scratch, - rspamd_match_hs_single_handler, (void *)&i); + res = hs_scan (rspamd_hyperscan_get_database(map->hs_db), in, len, 0, + map->hs_scratch, + rspamd_match_hs_single_handler, (void *)&i); if (res == HS_SCAN_TERMINATED) { res = 1; @@ -1711,7 +1552,8 @@ rspamd_match_regexp_map_all (struct rspamd_regexp_map_helper *map, cbd.ar = ret; cbd.map = map; - if (hs_scan (map->hs_db, in, len, 0, map->hs_scratch, + if (hs_scan (rspamd_hyperscan_get_database(map->hs_db), in, len, + 0, map->hs_scratch, rspamd_match_hs_multiple_handler, &cbd) == HS_SUCCESS) { res = 1; } diff --git a/src/rspamd.c b/src/rspamd.c index 3779e7f8e..d8371de55 100644 --- a/src/rspamd.c +++ b/src/rspamd.c @@ -53,6 +53,10 @@ #include "sqlite3.h" #include "contrib/libev/ev.h" +#ifdef WITH_HYPERSCAN +#include "libserver/hyperscan_tools.h" +#endif + /* 2 seconds to fork new process in place of dead one */ #define SOFT_FORK_TIME 2 @@ -1643,6 +1647,9 @@ main (gint argc, gchar **argv, gchar **env) msg_info_main ("terminating..."); +#ifdef WITH_HYPERSCAN + rspamd_hyperscan_cleanup_maybe(); +#endif REF_RELEASE (rspamd_main->cfg); rspamd_log_close (rspamd_main->logger); g_hash_table_unref (rspamd_main->spairs); -- 2.39.5