source.dussan.org Git - rspamd.git/commitdiff
[Project] Allow to kill workers that hang up
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 21 Sep 2019 15:27:25 +0000 (16:27 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 21 Sep 2019 15:27:25 +0000 (16:27 +0100)
src/libserver/cfg_file.h
src/libserver/cfg_rcl.c
src/libserver/worker_util.c

index 7186a73ecf0f4a08b6b07df898c736fb43eb83e5..d42fbfba9ec07e8e03ad033b8da4b53224aa4e55 100644 (file)
@@ -380,6 +380,7 @@ struct rspamd_config {
        gsize images_cache_size;                        /**< size of LRU cache for DCT data from images                 */
        gdouble task_timeout;                           /**< maximum message processing time                                    */
        gint default_max_shots;                         /**< default maximum count of symbols hits permitted (-1 for unlimited) */
+       gint32 heartbeats_loss_max;                     /**< number of heartbeats lost to consider worker's termination */
        gdouble heartbeat_interval;                     /**< interval for heartbeats for workers                                */
 
        enum rspamd_log_type log_type;                  /**< log type                                                                                   */
index 5a1d3a639d668b225b6ded3b7796322144b2f77c..11c378d5de60ffbfe6a266cf49fa00b15a72f052 100644 (file)
@@ -2188,6 +2188,13 @@ rspamd_rcl_config_init (struct rspamd_config *cfg, GHashTable *skip_sections)
                                G_STRUCT_OFFSET (struct rspamd_config, heartbeat_interval),
                                RSPAMD_CL_FLAG_TIME_FLOAT,
                                "Time between workers heartbeats");
+               rspamd_rcl_add_default_handler (sub, /* threshold of lost heartbeats before a worker is signalled */
+                               "heartbeats_loss_max",
+                               rspamd_rcl_parse_struct_integer,
+                               G_STRUCT_OFFSET (struct rspamd_config, heartbeats_loss_max), /* own gint32 field, not the gdouble heartbeat_interval */
+                               RSPAMD_CL_FLAG_INT_32,
+                               "Maximum count of heartbeats to be lost before trying to "
+                               "terminate a worker (default: 0 - disabled)");
 
                /* Neighbours configuration */
                rspamd_rcl_add_section_doc (&sub->subsections, "neighbours", "name",
index d2e52d5a1c3207cf6f0f90871ad2d810e1b9db8a..883e7e8a9df4477affeecd0192695d3336b8f4fe 100644 (file)
@@ -756,6 +756,31 @@ rspamd_main_heartbeat_cb (EV_P_ ev_timer *w, int revents)
                                        g_quark_to_string (wrk->type),
                                        wrk->pid,
                                        timebuf);
+
+                       if (rspamd_main->cfg->heartbeats_loss_max > 0 &&
+                               -(wrk->hb.nbeats) >= rspamd_main->cfg->heartbeats_loss_max) {
+                               /* Escalate: SIGTERM on the beat that reaches the limit, SIGKILL only when further beats are lost
+                                * after that. NOTE(review): assumes nbeats keeps going more negative while the worker is silent. */
+                               if (-(wrk->hb.nbeats) >= rspamd_main->cfg->heartbeats_loss_max + 1) {
+                                       msg_err_main ("force kill worker type %s with pid %P, "
+                                                                 "last beat on: %s; %L heartbeat lost",
+                                                       g_quark_to_string (wrk->type),
+                                                       wrk->pid,
+                                                       timebuf,
+                                                       -(wrk->hb.nbeats));
+                                       kill (wrk->pid, SIGKILL);
+                               }
+                               else {
+                                       msg_err_main ("terminate worker type %s with pid %P, "
+                                                                 "last beat on: %s; %L heartbeat lost",
+                                                       g_quark_to_string (wrk->type),
+                                                       wrk->pid,
+                                                       timebuf,
+                                                       -(wrk->hb.nbeats));
+                                       kill (wrk->pid, SIGTERM);
+                               }

+                       }
                }
        }
        else if (wrk->hb.nbeats < 0) {