From 261c54963d3e48834100180125a2a17e1759cb61 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 21 Sep 2019 16:27:25 +0100 Subject: [PATCH] [Project] Allow to kill workers that hang up --- src/libserver/cfg_file.h | 1 + src/libserver/cfg_rcl.c | 7 +++++++ src/libserver/worker_util.c | 25 +++++++++++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h index 7186a73ec..d42fbfba9 100644 --- a/src/libserver/cfg_file.h +++ b/src/libserver/cfg_file.h @@ -380,6 +380,7 @@ struct rspamd_config { gsize images_cache_size; /**< size of LRU cache for DCT data from images */ gdouble task_timeout; /**< maximum message processing time */ gint default_max_shots; /**< default maximum count of symbols hits permitted (-1 for unlimited) */ + gint32 heartbeats_loss_max; /**< number of lost heartbeats after which a worker is terminated (0 - disabled) */ gdouble heartbeat_interval; /**< interval for heartbeats for workers */ enum rspamd_log_type log_type; /**< log type */ diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c index 5a1d3a639..11c378d5d 100644 --- a/src/libserver/cfg_rcl.c +++ b/src/libserver/cfg_rcl.c @@ -2188,6 +2188,13 @@ rspamd_rcl_config_init (struct rspamd_config *cfg, GHashTable *skip_sections) G_STRUCT_OFFSET (struct rspamd_config, heartbeat_interval), RSPAMD_CL_FLAG_TIME_FLOAT, "Time between workers heartbeats"); + rspamd_rcl_add_default_handler (sub, + "heartbeats_loss_max", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET (struct rspamd_config, heartbeats_loss_max), + RSPAMD_CL_FLAG_INT_32, + "Maximum count of heartbeats to be lost before trying to " + "terminate a worker (default: 0 - disabled)"); /* Neighbours configuration */ rspamd_rcl_add_section_doc (&sub->subsections, "neighbours", "name", diff --git a/src/libserver/worker_util.c b/src/libserver/worker_util.c index d2e52d5a1..883e7e8a9 100644 --- a/src/libserver/worker_util.c +++ b/src/libserver/worker_util.c @@ -756,6 +756,31 @@ 
rspamd_main_heartbeat_cb (EV_P_ ev_timer *w, int revents) g_quark_to_string (wrk->type), wrk->pid, timebuf); + /* If enabled (heartbeats_loss_max > 0), signal a worker whose lost heartbeats reached the limit */ + if (rspamd_main->cfg->heartbeats_loss_max > 0 && + -(wrk->hb.nbeats) >= rspamd_main->cfg->heartbeats_loss_max) { + + /* Loss count beyond the limit: escalate to SIGKILL; at exactly the limit send SIGTERM */ + if (-(wrk->hb.nbeats) >= rspamd_main->cfg->heartbeats_loss_max + 1) { + msg_err_main ("force kill worker type %s with pid %P, " + "last beat on: %s; %L heartbeat lost", + g_quark_to_string (wrk->type), + wrk->pid, + timebuf, + -(wrk->hb.nbeats)); + kill (wrk->pid, SIGKILL); + } + else { + msg_err_main ("terminate worker type %s with pid %P, " + "last beat on: %s; %L heartbeat lost", + g_quark_to_string (wrk->type), + wrk->pid, + timebuf, + -(wrk->hb.nbeats)); + kill (wrk->pid, SIGTERM); + } + + } } } else if (wrk->hb.nbeats < 0) { -- 2.39.5