6311392 The Quick and the Dead: rapid-fire svcadm restart leads to maintenance
author Sean Wilcox <Sean.Wilcox@Sun.COM>
Wed, 13 Jan 2010 09:08:53 -0800
changeset 11482 7315713fb22c
parent 11481 f3d007acb905
child 11483 802d270d2ab7
6311392 The Quick and the Dead: rapid-fire svcadm restart leads to maintenance
6219078 svc.startd's algorithm for detecting restart loops should be sensible
usr/src/cmd/cmd-inet/usr.lib/inetd/inetd.c
usr/src/cmd/svc/startd/graph.c
usr/src/cmd/svc/startd/restarter.c
usr/src/cmd/svc/startd/startd.h
usr/src/lib/librestart/common/librestart.h
--- a/usr/src/cmd/cmd-inet/usr.lib/inetd/inetd.c	Wed Jan 13 14:57:39 2010 -0800
+++ b/usr/src/cmd/cmd-inet/usr.lib/inetd/inetd.c	Wed Jan 13 09:08:53 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -1726,6 +1726,7 @@
 	case RESTARTER_EVENT_TYPE_REMOVE_INSTANCE:
 		remove_instance(instance);
 		goto done;
+	case RESTARTER_EVENT_TYPE_STOP_RESET:
 	case RESTARTER_EVENT_TYPE_STOP:
 		switch (instance->cur_istate) {
 		case IIS_OFFLINE_CONRATE:
--- a/usr/src/cmd/svc/startd/graph.c	Wed Jan 13 14:57:39 2010 -0800
+++ b/usr/src/cmd/svc/startd/graph.c	Wed Jan 13 09:08:53 2010 -0800
@@ -825,6 +825,7 @@
 		assert(v->gv_state != RESTARTER_STATE_DISABLED);
 		break;
 
+	case RESTARTER_EVENT_TYPE_STOP_RESET:
 	case RESTARTER_EVENT_TYPE_STOP:
 		log_framework(LOG_DEBUG, "Stopping %s.\n", v->gv_name);
 		assert(v->gv_state == RESTARTER_STATE_DEGRADED ||
@@ -1730,8 +1731,14 @@
 	switch (v->gv_type) {
 	case GVT_INST:
 		/* Restarter */
-		if (err > RERR_NONE && inst_running(v))
-			vertex_send_event(v, RESTARTER_EVENT_TYPE_STOP);
+		if (err > RERR_NONE && inst_running(v)) {
+			if (err == RERR_RESTART) {
+				vertex_send_event(v,
+				    RESTARTER_EVENT_TYPE_STOP_RESET);
+			} else {
+				vertex_send_event(v, RESTARTER_EVENT_TYPE_STOP);
+			}
+		}
 		break;
 
 	case GVT_SVC:
@@ -1759,8 +1766,15 @@
 		e = uu_list_first(v->gv_dependents);
 		svc = e->ge_vertex;
 
-		if (inst_running(svc))
-			vertex_send_event(svc, RESTARTER_EVENT_TYPE_STOP);
+		if (inst_running(svc)) {
+			if (err == RERR_RESTART) {
+				vertex_send_event(svc,
+				    RESTARTER_EVENT_TYPE_STOP_RESET);
+			} else {
+				vertex_send_event(svc,
+				    RESTARTER_EVENT_TYPE_STOP);
+			}
+		}
 		break;
 
 	default:
@@ -1846,7 +1860,7 @@
 	(void) scf_handle_unbind(h);
 	scf_handle_destroy(h);
 
-	vertex_send_event(v, RESTARTER_EVENT_TYPE_STOP);
+	vertex_send_event(v, RESTARTER_EVENT_TYPE_STOP_RESET);
 }
 
 /*
--- a/usr/src/cmd/svc/startd/restarter.c	Wed Jan 13 14:57:39 2010 -0800
+++ b/usr/src/cmd/svc/startd/restarter.c	Wed Jan 13 09:08:53 2010 -0800
@@ -140,6 +140,18 @@
 
 static uu_list_pool_t *restarter_queue_pool;
 
+/*
+ * Clear an instance's recorded start times: rewind ri_start_index to 0 and
+ * zero the whole ri_start_time array.  Used when an administrative task
+ * comes along and essentially makes the times in this array ineffective.
+ */
+static void
+reset_start_times(restarter_inst_t *inst)
+{
+	inst->ri_start_index = 0;
+	bzero(inst->ri_start_time, sizeof (inst->ri_start_time));
+}
+
 /*ARGSUSED*/
 static int
 restarter_instance_compare(const void *lc_arg, const void *rc_arg,
@@ -1643,8 +1655,12 @@
 		switch (event->riq_type) {
 		case RESTARTER_EVENT_TYPE_ENABLE:
 		case RESTARTER_EVENT_TYPE_DISABLE:
+			(void) enable_inst(h, inst, event->riq_type);
+			break;
+
 		case RESTARTER_EVENT_TYPE_ADMIN_DISABLE:
-			(void) enable_inst(h, inst, event->riq_type);
+			if (enable_inst(h, inst, event->riq_type) == 0)
+				reset_start_times(inst);
 			break;
 
 		case RESTARTER_EVENT_TYPE_REMOVE_INSTANCE:
@@ -1652,6 +1668,9 @@
 			inst = NULL;
 			goto cont;
 
+		case RESTARTER_EVENT_TYPE_STOP_RESET:
+			reset_start_times(inst);
+			/* FALLTHROUGH */
 		case RESTARTER_EVENT_TYPE_STOP:
 			(void) stop_instance(h, inst, RSTOP_DEPENDENCY);
 			break;
@@ -1710,7 +1729,8 @@
 				 * Stop the instance.  If it can be restarted,
 				 * the graph engine will send a new event.
 				 */
-				(void) stop_instance(h, inst, RSTOP_RESTART);
+				if (stop_instance(h, inst, RSTOP_RESTART) == 0)
+					reset_start_times(inst);
 			}
 			break;
 
@@ -1980,6 +2000,9 @@
 		 * process we're monitoring, then the
 		 * wait_thread will stop the instance.
 		 */
+		if (type == CT_PR_EV_EMPTY)
+			reset_start_times(inst);
+
 		log_framework(LOG_DEBUG,
 		    "%s: ignoring contract event on wait-style service\n",
 		    fmri);
--- a/usr/src/cmd/svc/startd/startd.h	Wed Jan 13 14:57:39 2010 -0800
+++ b/usr/src/cmd/svc/startd/startd.h	Wed Jan 13 09:08:53 2010 -0800
@@ -394,8 +394,8 @@
 
 #define	RINST_RETAKE_MASK	0x0f000000
 
-#define	RINST_START_TIMES	10		/* failures to consider */
-#define	RINST_FAILURE_RATE_NS	1000000000LL	/* 1 failure/second */
+#define	RINST_START_TIMES	5		/* failures to consider */
+#define	RINST_FAILURE_RATE_NS	600000000000LL	/* 1 failure/10 minutes */
 
 /* Number of events in the queue when we start dropping ADMIN events. */
 #define	RINST_QUEUE_THRESHOLD	100
@@ -653,6 +653,7 @@
 
 int walk_dependency_pgs(scf_instance_t *, callback_t, void *);
 int walk_property_astrings(scf_property_t *, callback_t, void *);
+void libscf_reset_start_times(restarter_inst_t *, int);
 
 /* libscf.c - used by restarter.c/method.c/expand.c */
 char *libscf_get_method(scf_handle_t *, int, restarter_inst_t *,
--- a/usr/src/lib/librestart/common/librestart.h	Wed Jan 13 14:57:39 2010 -0800
+++ b/usr/src/lib/librestart/common/librestart.h	Wed Jan 13 09:08:53 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -124,6 +124,7 @@
 #define	RESTARTER_EVENT_TYPE_DEPENDENCY_CYCLE		13
 #define	RESTARTER_EVENT_TYPE_INVALID_DEPENDENCY		14
 #define	RESTARTER_EVENT_TYPE_ADMIN_DISABLE		15
+#define	RESTARTER_EVENT_TYPE_STOP_RESET			16
 
 #define	RESTARTER_EVENT_ERROR			-1