Start of intrd implementation.
author Albert Lee <trisk@forkgnu.org>
Mon, 19 Apr 2010 14:47:06 -0400
changeset 3 380ada8fd621
parent 2 ee32231c211b
child 4 a42e422f55c0
Start of intrd implementation.
intrd.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/intrd.c	Mon Apr 19 14:47:06 2010 -0400
@@ -0,0 +1,419 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates.  All rights reserved.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include <libgen.h>
+#include <syslog.h>
+#include <kstat.h>
+#include <sys/processor.h>
+
+#include "intrs.h"
+
+/*
+ * One interrupt vector.  The field meanings below follow the pci_intrs
+ * layout notes that precede getstat() in this file; nothing in the code
+ * shown here fills these fields in yet.
+ */
+struct ivec {
+	int cookie;		/* pci_intrs cookie */
+	hrtime_t time;		/* pci_intrs "time" (in nsec) */
+	hrtime_t crtime;	/* pci_intrs kstat crtime */
+	int pil;		/* pci_intrs "pil" */
+	int ino;		/* pci_intrs "ino" */
+	int ihs;		/* pci_intrs "ihs" */
+	int num_ino;		/* inos of one device instance sharing this entry */
+	int origcpu;		/* presumably the CPU first observed -- TODO confirm */
+	int nowcpu;		/* presumably the CPU currently targeted -- TODO confirm */
+	int inum;		/* presumably the pci_intrs instance number -- TODO confirm */
+};
+typedef struct ivec ivec_t;
+
+/*
+ * Interrupt vectors grouped by bus.
+ *
+ * The self-reference must use the struct tag: the bus_stat_t typedef name
+ * does not exist until the end of this declaration.  buspath is a character
+ * buffer of MAXPATHLEN bytes (it was previously declared as an array of
+ * MAXPATHLEN pointers, which cannot hold a path).
+ */
+typedef struct bus_stat {
+	struct bus_stat *next;		/* presumably the next bus in a list */
+	char buspath[MAXPATHLEN];	/* bus path string */
+	int num_intr;			/* presumably the entry count of ivecs */
+	ivec_t *ivecs;			/* interrupt vectors on this bus */
+} bus_stat_t;
+
+/*
+ * Per-CPU sample, stored in an array indexed by CPU id (see getstat()).
+ */
+struct cpu_stat {
+	int state;		/* P_ONLINE once getstat() saw the CPU on-line */
+	uint64_t tot;		/* sum of cpu_nsec_{idle,user,kernel} */
+	hrtime_t crtime;	/* creation time of the cpu "sys" kstat */
+	bus_stat_t *bus_stats;	/* not populated by the code shown here */
+};
+typedef struct cpu_stat cpu_stat_t;
+
+/*
+ * One snapshot of the system as gathered by getstat().
+ */
+typedef struct intr_stat {
+	double snaptime;	/* earliest kstat snaptime seen by getstat() */
+	cpu_stat_t *cpus;	/* array of max_cpus entries, indexed by CPU id */
+} intr_stat_t;
+
+/*
+ * Sleep intervals.  ONECPU_SLEEPTIME is handed directly to sleep(), so the
+ * values are in seconds.
+ */
+enum sleeptime {
+	NORMAL_SLEEPTIME = 10,		/* between samples */
+	IDLE_SLEEPTIME = 45,		/* when idle */
+	ONECPU_SLEEPTIME = 15 * 60	/* when the system has only 1 CPU */
+};
+typedef enum sleeptime sleeptime_t;
+
+int using_scengen;	/* 1 if using scenario simulator (set by -S) */
+int debug;		/* 1 if -D was given; raises the syslog mask to LOG_DEBUG */
+int foreground;		/* 1 if -f or -S was given; main() then skips daemon() */
+
+int max_cpus;		/* sysconf(_SC_CPUID_MAX) + 1; sizes the per-CPU array */
+
+sleeptime_t sleeptime = NORMAL_SLEEPTIME;	/* one of NORMAL_, IDLE_ or ONECPU_SLEEPTIME */
+
+float idle_intrload = 0.1; 			/* idle if interrupt load < 10% */
+
+/* getstat() rejects a sample whose gathering time / sleeptime exceeds this */
+float timerange_toohi = 0.1;
+int statslen = 60;	/* time period (in secs) of deltas to keep */
+
+/* Defined later in this file; declared here because main() calls it. */
+static void load_simulator(const char *file);
+
+/*
+ * Entry point of intrd.
+ *
+ * Parses the (private, testing-only) options, daemonizes unless asked to
+ * stay in the foreground, opens syslog and the kstat chain, and allocates
+ * the per-CPU statistics array.  The sampling loop is not implemented yet.
+ *
+ * Returns 0 on success and 1 on any setup failure.
+ */
+int main(int argc, char **argv)
+{
+	const char *cmdname;
+	kstat_ctl_t *kc;
+	kstat_t *ksp;
+	intr_stat_t stat;
+	long cpuid_max;
+
+	/* sysconf() reports failure as -1; refuse to size arrays from that. */
+	cpuid_max = sysconf(_SC_CPUID_MAX);
+	if (cpuid_max < 0 || cpuid_max >= INT_MAX) {
+		return 1;
+	}
+	max_cpus = (int)cpuid_max + 1;
+
+	cmdname = basename(argv[0]);
+
+	/*
+	 * Parse arguments. intrd does not accept any public arguments; the
+	 * arguments below are meant for testing purposes. -D generates a
+	 * significant amount of syslog output. -S <filename> hands the
+	 * filename to load_simulator(), which is expected to provide a kstat
+	 * "simulator" that can be used to feed information to intrd and
+	 * verify intrd's responses. -f keeps intrd in the foreground.
+	 *
+	 * Inside the loop argv[1] is always the argument being examined.
+	 * Anything that is not exactly "-X" is silently ignored.
+	 */
+	for (; --argc > 0; ++argv) {
+		const char *arg = argv[1];
+
+		if (arg[0] != '-' || arg[1] == '\0' || arg[2] != '\0') {
+			continue;
+		}
+
+		switch (arg[1]) {
+			case 'S':
+				using_scengen = 1;
+				foreground = 1;
+				if (argc > 1) {
+					/*
+					 * Consume the following argument as
+					 * the simulator filename.
+					 */
+					--argc;
+					++argv;
+					load_simulator(argv[1]);
+				}
+				break;
+			case 'D':
+				debug = 1;
+				break;
+			case 'f':
+				foreground = 1;
+				break;
+			default:
+				/* unknown options are ignored */
+				break;
+		}
+	}
+
+	if (!foreground) {
+		if (daemon(0, 0) == -1) {
+			return 1;
+		}
+	}
+
+	if (using_scengen) {
+		/* scengen not implemented */
+		return 1;
+	}
+
+	openlog(cmdname, LOG_PID, LOG_DAEMON);
+	(void) setlogmask(LOG_UPTO(debug ? LOG_DEBUG : LOG_INFO));
+
+	kc = kstat_open();
+	if (kc == NULL) {
+		return 1;
+	}
+
+	/*
+	 * If no pci_intrs kstats were found, we need to exit, but we can't
+	 * because SMF will restart us and/or report an error to the
+	 * administrator. But there's nothing an administrator can do. So
+	 * print out a message to syslog and silently pause forever.
+	 */
+	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
+		if ((ksp->ks_type == KSTAT_TYPE_NAMED) &&
+		    !strcmp(ksp->ks_module, "pci_intrs")) {
+			break;
+		}
+	}
+	if (ksp == NULL) {
+		kstat_close(kc);
+		syslog(LOG_INFO, "no interrupts were found: "
+			"your I/O bus may not yet be supported\n");
+		/* sleep() returns non-zero only when interrupted early */
+		do {} while (!sleep(ONECPU_SLEEPTIME));
+		return 0;
+	}
+
+	stat.cpus = calloc((size_t)max_cpus, sizeof (*stat.cpus));
+	if (stat.cpus == NULL) {
+		kstat_close(kc);
+		return 1;
+	}
+
+	/* TODO: the sampling / rebalancing loop is not implemented yet. */
+
+	free(stat.cpus);
+	kstat_close(kc);
+	return 0;
+}
+
+
+/*
+ * Check a condition; when it does not hold, log msg to syslog at LOG_DEBUG
+ * with a "VERIFY: " prefix.
+ *
+ * Returns 1 if the condition failed and 0 if it held.
+ */
+static int verify(int condition, const char *msg)
+{
+	if (condition) {
+		return 0;
+	}
+
+	syslog(LOG_DEBUG, "VERIFY: %s", msg);
+	return 1;
+}
+
+/*
+ * Placeholder for loading the scenario simulator named by file (the -S
+ * option).  Currently does nothing.
+ */
+static void load_simulator(const char *file)
+{
+	(void) file;	/* not implemented yet */
+}
+
+/* Gathers one kstat snapshot into *stat; returns 1 if usable, 0 if not. */
+static int getstat(kstat_ctl_t *kc, intr_stat_t *stat);
+/*
+int generate_delta($$);
+int compress_deltas($);
+int dumpdelta($);
+
+int goodness($);
+int imbalanced($$);
+int do_reconfig($);
+
+int goodness_cpu($$);		# private function
+int move_intr($$$$);		# private function
+int ivecs_to_string(@);		# private function
+int do_find_goal($$$$);		# private function
+int find_goal($$);		# private function
+int do_reconfig_cpu2cpu($$$$);	# private function
+int do_reconfig_cpu($$$);	
+*/
+
+
+/*
+ *
+ * What follow are the basic data structure routines of intrd.
+ *
+ * getstat() is responsible for reading the kstats and generating a "stat" hash.
+ *
+ * generate_delta() is responsible for taking two "stat" hashes and creating
+ * a new "delta" hash that represents what has changed over time.
+ *
+ * compress_deltas() is responsible for taking a list of deltas and generating
+ * a single delta hash that encompasses all the time periods described by the
+ * deltas.
+*/
+
+
+/*
+ *
+ * getstat() is handed a kstat control handle and an intr_stat_t, and fills
+ * the latter in with all the fields from the kstats which we need.
+ * If it returns 0, it failed to gather usable kstats, and the caller
+ * should react accordingly.
+ *
+ * getstat() is also responsible for maintaining a reasonable sleeptime.
+ *
+ * {"snaptime"}          kstat's snaptime
+ * {<cpuid>}             one hash reference per online cpu
+ *  ->{"tot"}            == cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle}
+ *  ->{"crtime"}         == cpu:<cpuid>:sys:crtime
+ *  ->{"ivecs"}
+ *     ->{<cookie#>}     iterates over pci_intrs::<nexus>:cookie
+ *        ->{"time"}     == pci_intrs:<ivec#>:<nexus>:time (in nsec)
+ *        ->{"pil"}      == pci_intrs:<ivec#>:<nexus>:pil
+ *        ->{"crtime"}   == pci_intrs:<ivec#>:<nexus>:crtime
+ *        ->{"ino"}      == pci_intrs:<ivec#>:<nexus>:ino
+ *        ->{"num_ino"}  == num inos of single device instance sharing this entry
+ *				Will be > 1 on pcplusmp X86 systems for devices
+ *				with multiple MSI interrupts.
+ *        ->{"buspath"}  == pci_intrs:<ivec#>:<nexus>:buspath
+ *        ->{"name"}     == pci_intrs:<ivec#>:<nexus>:name
+ *        ->{"ihs"}      == pci_intrs:<ivec#>:<nexus>:ihs
+ *
+*/
+
+/*
+ * Interpret a named kstat value as an unsigned 64-bit integer according to
+ * its declared data_type.  Returns 0 on success, or -1 if the kstat does
+ * not hold an integer.
+ */
+static int kn_get_u64(const kstat_named_t *knp, uint64_t *valp)
+{
+	switch (knp->data_type) {
+		case KSTAT_DATA_INT32:
+			*valp = (uint64_t)knp->value.i32;
+			return 0;
+		case KSTAT_DATA_UINT32:
+			*valp = knp->value.ui32;
+			return 0;
+		case KSTAT_DATA_INT64:
+			*valp = (uint64_t)knp->value.i64;
+			return 0;
+		case KSTAT_DATA_UINT64:
+			*valp = knp->value.ui64;
+			return 0;
+		default:
+			return -1;
+	}
+}
+
+/*
+ * Return the string held by a named kstat, or NULL if the kstat is not of
+ * a string type (or holds no string).
+ */
+static const char *kn_get_str(const kstat_named_t *knp)
+{
+	switch (knp->data_type) {
+		case KSTAT_DATA_CHAR:
+			return knp->value.c;
+		case KSTAT_DATA_STRING:
+			return KSTAT_NAMED_STR_PTR(knp);
+		default:
+			return NULL;
+	}
+}
+
+/*
+ * Fold one kstat snaptime into the running minimum/maximum.  kstat
+ * snaptimes are hrtime_t values in nanoseconds; they are converted to
+ * seconds here so that they can be compared against sleeptime, which is in
+ * seconds.  *minsnap == -1 means "no snaptime seen yet".
+ */
+static void note_snaptime(hrtime_t hr, double *minsnap, double *maxsnap)
+{
+	double snaptime = (double)hr / 1e9;
+
+	if (*minsnap == -1 || snaptime < *minsnap) {
+		*minsnap = snaptime;
+	}
+	if (snaptime > *maxsnap) {
+		*maxsnap = snaptime;
+	}
+}
+
+/*
+ * Gather one snapshot of the cpu and pci_intrs kstats into *stat.
+ *
+ * kc	open kstat control handle
+ * stat	snapshot to fill in; stat->cpus must point to max_cpus entries
+ *
+ * Returns 1 if the snapshot is usable.  Returns 0 if there is at most one
+ * on-line CPU (sleeptime is then set to ONECPU_SLEEPTIME), if the kstats
+ * took too long to gather relative to sleeptime, or if an argument is
+ * missing.
+ */
+int getstat(kstat_ctl_t *kc, intr_stat_t *stat)
+{
+	int cpucnt = 0;
+	kstat_t *ksp;
+	cpu_stat_t *cpu_stats;
+	double minsnap, maxsnap;
+
+	if (kc == NULL || stat == NULL || stat->cpus == NULL) {
+		return 0;
+	}
+
+	/*
+	 * kstats are not generated atomically. Each kstat hierarchy will
+	 * have been generated within the kernel at a different time. On a
+	 * thrashing system, we may not run quickly enough in order to get
+	 * coherent kstat timing information across all the kstats. To
+	 * determine if this is occurring, minsnap/maxsnap are used to
+	 * find the breadth between the first and last snaptime of all the
+	 * kstats we access. maxsnap - minsnap roughly represents the
+	 * total time taken up in getstat(). If this time approaches the
+	 * time between snapshots, our results may not be useful.
+	 */
+
+	minsnap = -1;		/* snaptime is always a positive number */
+	maxsnap = minsnap;
+
+	/*
+	 * iterate over the cpus in cpu:<cpuid>::. check
+	 * cpu_info:<cpuid>:cpu_info<cpuid>:state to make sure the
+	 * processor is "on-line". if not, it isn't accepting interrupts
+	 * and doesn't concern us.
+	 *
+	 * record cpu:<cpuid>:sys:snaptime, and check minsnap/maxsnap.
+	 */
+
+	cpu_stats = stat->cpus;
+	memset(cpu_stats, 0, sizeof (cpu_stat_t) * max_cpus);
+
+	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
+		kstat_t *ksp_sys;
+		kstat_named_t *knp;
+		const char *state;
+		unsigned int i;
+		int cpu;
+
+		if ((ksp->ks_type != KSTAT_TYPE_NAMED) ||
+		    strcmp(ksp->ks_module, "cpu_info") ||
+		    (kstat_read(kc, ksp, NULL) == -1)) {
+			continue;
+		}
+		knp = kstat_data_lookup(ksp, "state");
+		if (knp == NULL) {
+			continue;
+		}
+		/* compare the kstat's value, not its name, to "on-line" */
+		state = kn_get_str(knp);
+		if ((state == NULL) || strcmp(state, PS_ONLINE)) {
+			continue;
+		}
+		cpu = ksp->ks_instance;
+		if ((cpu < 0) || (cpu >= max_cpus)) {
+			continue;	/* would index outside cpu_stats[] */
+		}
+		ksp_sys = kstat_lookup(kc, "cpu", cpu, "sys");
+		if ((ksp_sys == NULL) ||
+		    (kstat_read(kc, ksp_sys, NULL) == -1)) {
+			continue;
+		}
+		cpu_stats[cpu].state = P_ONLINE;
+		knp = ksp_sys->ks_data;
+		for (i = 0; i < ksp_sys->ks_ndata; i++) {
+			if (!strcmp(knp[i].name, "cpu_nsec_idle") ||
+			    !strcmp(knp[i].name, "cpu_nsec_user") ||
+			    !strcmp(knp[i].name, "cpu_nsec_kernel")) {
+				cpu_stats[cpu].tot += knp[i].value.ui64;
+			}
+		}
+		cpu_stats[cpu].crtime = ksp_sys->ks_crtime;
+		note_snaptime(ksp_sys->ks_snaptime, &minsnap, &maxsnap);
+		cpucnt++;
+	}
+
+	if (cpucnt <= 1) {
+		sleeptime = ONECPU_SLEEPTIME;
+		return 0;	/* nothing to do with 1 CPU */
+	}
+
+	/*
+	 * Iterate over the ivecs. If the cpu is not on-line, ignore the
+	 * ivecs mapped to it, if any. Disabled ivecs are ignored as well.
+	 *
+	 * TODO: record pci_intrs:{inum}:<nexus>:time, crtime, pil, ino,
+	 * name, and buspath. For now only minsnap/maxsnap are updated.
+	 */
+
+	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
+		kstat_named_t *knp;
+		const char *type;
+		uint64_t cpu;
+
+		if ((ksp->ks_type != KSTAT_TYPE_NAMED) ||
+		    strcmp(ksp->ks_module, "pci_intrs") ||
+		    (kstat_read(kc, ksp, NULL) == -1)) {
+			continue;
+		}
+		knp = kstat_data_lookup(ksp, "cpu");
+		if ((knp == NULL) || (kn_get_u64(knp, &cpu) != 0) ||
+		    (cpu >= (uint64_t)max_cpus) ||
+		    (cpu_stats[cpu].state != P_ONLINE)) {
+			continue;
+		}
+		knp = kstat_data_lookup(ksp, "type");
+		if (knp == NULL) {
+			continue;
+		}
+		type = kn_get_str(knp);
+		if ((type == NULL) || !strcmp(type, "disabled")) {
+			continue;
+		}
+		knp = kstat_data_lookup(ksp, "buspath");
+		if (knp == NULL) {
+			continue;
+		}
+		note_snaptime(ksp->ks_snaptime, &minsnap, &maxsnap);
+	}
+
+	/*
+	 * All MSI interrupts of a device instance share a single MSI address.
+	 * On X86 systems with an APIC, this MSI address is interpreted as CPU
+	 * routing info by the APIC.  For this reason, on these platforms, all
+	 * interrupts for MSI devices must be moved to the same CPU at the same
+	 * time.
+	 *
+	 * Since all interrupts will be on the same CPU on these platforms, all
+	 * interrupts can be consolidated into one ivec entry.  For such devices,
+	 * num_ino will be > 1 to denote that a group move is needed.
+	 *
+	 * TODO: loop thru all MSI devices on X86 pcplusmp systems (nop on
+	 * other systems), using a table that matches (MSI device, ino)
+	 * combos to kstats. Not implemented yet.
+	 */
+
+	/*
+	 * We define the timerange as the amount of time spent gathering the
+	 * various kstats, divided by our sleeptime. If we take a lot of time
+	 * to access the kstats, and then we create a delta comparing these
+	 * kstats with a prior set of kstats, that delta will cover
+	 * substantially different amounts of time depending upon which
+	 * interrupt or CPU is being examined.
+	 *
+	 * By checking the timerange here, we guarantee that any deltas
+	 * created from these kstats will contain self-consistent data,
+	 * in that all CPUs and interrupts cover a similar span of time.
+	 *
+	 * timerange_toohi is the upper bound. Any timerange above
+	 * this is thrown out as garbage. If the stat is safely within this
+	 * bound, we treat the stat as representing an instant in time, rather
+	 * than the time range it actually spans. We arbitrarily choose minsnap
+	 * as the snaptime of the stat.
+	 */
+
+	stat->snaptime = minsnap;
+	if (((maxsnap - minsnap) / sleeptime) > timerange_toohi) {
+		return 0;
+	}
+	return 1;
+}
+