--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/intrd.c Mon Apr 19 14:47:06 2010 -0400
@@ -0,0 +1,419 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include <libgen.h>
+#include <syslog.h>
+#include <kstat.h>
+#include <sys/processor.h>
+
+#include "intrs.h"
+
+typedef struct ivec {
+ int cookie; /* pci_intrs cookie, per the getstat() notes in this file */
+ hrtime_t time; /* pci_intrs "time" kstat, in nsec */
+ hrtime_t crtime; /* pci_intrs kstat crtime */
+ int pil; /* pci_intrs "pil" kstat */
+ int ino; /* pci_intrs "ino" kstat */
+ int ihs; /* pci_intrs "ihs" kstat */
+ int num_ino; /* num inos of single device instance sharing this entry */
+ int origcpu; /* NOTE(review): presumably the CPU first seen for this ivec - confirm */
+ int nowcpu; /* NOTE(review): presumably the CPU currently targeted - confirm */
+ int inum; /* NOTE(review): presumably the pci_intrs kstat instance - confirm */
+} ivec_t;
+
+typedef struct bus_stat {
+    struct bus_stat *next; /* link to another bus_stat; the typedef name is not declared yet here */
+    char buspath[MAXPATHLEN]; /* one path string, not MAXPATHLEN separate pointers */
+    int num_intr; /* NOTE(review): presumably the entry count of ivecs - confirm */
+    ivec_t *ivecs; /* interrupt vector records for this bus */
+} bus_stat_t;
+
+typedef struct cpu_stat {
+ int state; /* getstat() zeroes this, then sets P_ONLINE for each CPU it samples */
+ uint64_t tot; /* sum of the cpu_nsec_idle, cpu_nsec_user and cpu_nsec_kernel kstats */
+ hrtime_t crtime; /* crtime of the cpu:<cpuid>:sys kstat */
+ bus_stat_t *bus_stats; /* NOTE(review): not populated by the code shown; presumably per-bus data */
+} cpu_stat_t;
+
+typedef struct intr_stat {
+    double snaptime; /* getstat() stores the earliest kstat snaptime it saw here */
+    cpu_stat_t *cpus; /* indexed by CPU id; main() allocates max_cpus entries */
+} intr_stat_t;
+
+typedef enum sleeptime {
+ NORMAL_SLEEPTIME = 10, /* seconds to sleep between samples */
+ IDLE_SLEEPTIME = 45, /* seconds to sleep when idle */
+ ONECPU_SLEEPTIME = 60 * 15, /* seconds to sleep if only 1 CPU on system */
+} sleeptime_t;
+
+int using_scengen; /* 1 if using scenario simulator (set by -S) */
+int debug; /* set by -D; main() then lets LOG_DEBUG through the syslog mask */
+int foreground; /* set by -f or -S; main() then skips daemon() */
+
+int max_cpus; /* sysconf(_SC_CPUID_MAX) + 1, set in main() */
+
+sleeptime_t sleeptime = NORMAL_SLEEPTIME; /* one of the sleeptime_t values */
+
+float idle_intrload = 0.1; /* idle if interrupt load < 10% */
+
+float timerange_toohi = 0.1; /* getstat() fails above this gather-time / sleeptime ratio */
+int statslen = 60; /* time period (in secs) of deltas to keep */
+
+static void load_simulator(const char *file);
+
+/*
+ * Entry point. intrd does not accept any public arguments; the options
+ * below are meant for testing purposes. -D generates a significant amount
+ * of syslog output. -S <filename> names a kstat "simulator" file and
+ * implies -f. -f skips daemon(). Returns 1 when daemon(), kstat_open()
+ * or malloc() fails or when -S is given (scengen is not implemented).
+ */
+int main(int argc, char **argv)
+{
+    const char *cmdname;
+    kstat_ctl_t *kc;
+    kstat_t *ksp;
+    intr_stat_t stat;
+
+    max_cpus = sysconf(_SC_CPUID_MAX) + 1;
+    cmdname = basename(argv[0]);
+
+    /* argv[1] is the argument under inspection on each pass. */
+    for (; --argc > 0; ++argv) {
+        if (argv[1][0] != '-' || argv[1][1] == '\0' ||
+            argv[1][2] != '\0') {
+            continue;
+        }
+        switch (argv[1][1]) {
+        case 'S':
+            using_scengen = 1;
+            foreground = 1;
+            if (argc > 1) {
+                /* Step to the file name so the loop skips over it. */
+                --argc;
+                ++argv;
+                load_simulator(argv[1]);
+            }
+            break;
+        case 'D':
+            debug = 1;
+            break;
+        case 'f':
+            foreground = 1;
+            break;
+        default:
+            break;
+        }
+    }
+
+    if (!foreground && daemon(0, 0) == -1) {
+        return 1;
+    }
+    if (using_scengen) {
+        /* scengen not implemented */
+        return 1;
+    }
+
+    openlog(cmdname, LOG_PID, LOG_DAEMON);
+    (void) setlogmask(LOG_UPTO(debug ? LOG_DEBUG : LOG_INFO));
+
+    kc = kstat_open();
+    if (kc == NULL) {
+        return 1;
+    }
+
+    /*
+     * If no pci_intrs kstats were found, we need to exit, but we can't
+     * because SMF will restart us and/or report an error to the
+     * administrator. But there's nothing an administrator can do. So
+     * print out a message to syslog and silently pause forever.
+     */
+    for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
+        if ((ksp->ks_type == KSTAT_TYPE_NAMED) &&
+            !strcmp(ksp->ks_module, "pci_intrs")) {
+            break;
+        }
+    }
+    if (ksp == NULL) {
+        kstat_close(kc);
+        syslog(LOG_INFO, "no interrupts were found: " \
+            "your I/O bus may not yet be supported\n");
+        do {} while (!sleep(ONECPU_SLEEPTIME));
+        return 0;
+    }
+
+    if ((stat.cpus = malloc(sizeof(cpu_stat_t) * max_cpus)) == NULL) {
+        return 1;
+    }
+}
+
+
+/* Log msg at LOG_DEBUG if condition is false. Returns 1 on failure, else 0. */
+static int verify(int condition, const char *msg)
+{
+    if (!condition) {
+        syslog(LOG_DEBUG, "VERIFY: %s", msg);
+    }
+    return !condition;
+}
+
+static void load_simulator(const char *file)
+{ /* Stub: the body is empty, so nothing is loaded from file. */
+}
+
+static int getstat(kstat_ctl_t *kc, intr_stat_t *stat); /* defined later in this file */
+/*
+int generate_delta($$);
+int compress_deltas($);
+int dumpdelta($);
+
+int goodness($);
+int imbalanced($$);
+int do_reconfig($);
+
+int goodness_cpu($$); # private function
+int move_intr($$$$); # private function
+int ivecs_to_string(@); # private function
+int do_find_goal($$$$); # private function
+int find_goal($$); # private function
+int do_reconfig_cpu2cpu($$$$); # private function
+int do_reconfig_cpu($$$);
+*/
+
+
+/*
+ *
+ * What follows are the basic data-structure routines of intrd.
+ *
+ * getstat() is responsible for reading the kstats and generating a "stat" hash.
+ *
+ * generate_delta() is responsible for taking two "stat" hashes and creating
+ * a new "delta" hash that represents what has changed over time.
+ *
+ * compress_deltas() is responsible for taking a list of deltas and generating
+ * a single delta hash that encompasses all the time periods described by the
+ * deltas.
+ */
+
+
+/*
+ *
+ * getstat() is handed a kstat control handle and fills in the caller's
+ * intr_stat_t with all the fields from the kstats which we need.
+ * If it returns 0, it failed to gather the kstats, and the caller
+ * should react accordingly.
+ *
+ * getstat() is also responsible for maintaining a reasonable sleeptime.
+ *
+ * {"snaptime"} kstat's snaptime
+ * {<cpuid>} one hash reference per online cpu
+ *  ->{"tot"} == cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle}
+ *  ->{"crtime"} == cpu:<cpuid>:sys:crtime
+ *  ->{"ivecs"}
+ *     ->{<cookie#>} iterates over pci_intrs::<nexus>:cookie
+ *        ->{"time"} == pci_intrs:<ivec#>:<nexus>:time (in nsec)
+ *        ->{"pil"} == pci_intrs:<ivec#>:<nexus>:pil
+ *        ->{"crtime"} == pci_intrs:<ivec#>:<nexus>:crtime
+ *        ->{"ino"} == pci_intrs:<ivec#>:<nexus>:ino
+ *        ->{"num_ino"} == num inos of single device instance sharing this entry
+ *                         Will be > 1 on pcplusmp X86 systems for devices
+ *                         with multiple MSI interrupts.
+ *        ->{"buspath"} == pci_intrs:<ivec#>:<nexus>:buspath
+ *        ->{"name"} == pci_intrs:<ivec#>:<nexus>:name
+ *        ->{"ihs"} == pci_intrs:<ivec#>:<nexus>:ihs
+ *
+ */
+
+/*
+ * Fill *stat from the kstats in kc. Returns 1 on success, or 0 when at
+ * most one CPU is on-line (sleeptime then becomes ONECPU_SLEEPTIME) or
+ * when gathering the kstats took too large a fraction of sleeptime.
+ */
+int getstat(kstat_ctl_t *kc, intr_stat_t *stat)
+{
+    /* ks_snaptime is an hrtime_t (nanoseconds); sleeptime is in seconds. */
+    const double nsec_per_sec = 1e9;
+    cpu_stat_t *cpu_stats = stat->cpus;
+    int cpucnt = 0;
+    kstat_t *ksp;
+    double minsnap, maxsnap;
+
+    /*
+     * kstats are not generated atomically. Each kstat hierarchy will
+     * have been generated within the kernel at a different time. On a
+     * thrashing system, we may not run quickly enough in order to get
+     * coherent kstat timing information across all the kstats. To
+     * determine if this is occurring, minsnap/maxsnap are used to
+     * find the breadth between the first and last snaptime of all the
+     * kstats we access. maxsnap - minsnap roughly represents the
+     * total time taken up in getstat(). If this time approaches the
+     * time between snapshots, our results may not be useful.
+     */
+    minsnap = -1; /* snaptime is always a positive number */
+    maxsnap = minsnap;
+
+    /*
+     * Iterate over the cpu_info kstats and skip any CPU whose state is
+     * not "on-line": it isn't accepting interrupts and doesn't concern
+     * us. For the rest, read cpu:<cpuid>:sys, record its totals and
+     * crtime, and fold its snaptime into minsnap/maxsnap.
+     */
+    memset(cpu_stats, 0, sizeof(cpu_stat_t) * max_cpus);
+
+    for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
+        kstat_t *ksp_sys;
+        kstat_named_t *knp;
+        int cpu;
+        unsigned int i;
+        double snaptime;
+
+        if ((ksp->ks_type != KSTAT_TYPE_NAMED) ||
+            strcmp(ksp->ks_module, "cpu_info") ||
+            (kstat_read(kc, ksp, NULL) == -1)) {
+            continue;
+        }
+        /* The state string is the statistic's value, not its name. */
+        knp = kstat_data_lookup(ksp, "state");
+        if ((knp == NULL) || strcmp(knp->value.c, PS_ONLINE)) {
+            continue;
+        }
+        cpu = ksp->ks_instance;
+        if (cpu < 0 || cpu >= max_cpus) {
+            continue; /* would index outside stat->cpus */
+        }
+        ksp_sys = kstat_lookup(kc, "cpu", cpu, "sys");
+        if ((ksp_sys == NULL) || (kstat_read(kc, ksp_sys, NULL) == -1)) {
+            continue;
+        }
+        cpu_stats[cpu].state = P_ONLINE;
+        knp = ksp_sys->ks_data;
+        for (i = 0; i < ksp_sys->ks_ndata; i++) {
+            if (!strcmp(knp[i].name, "cpu_nsec_idle") ||
+                !strcmp(knp[i].name, "cpu_nsec_user") ||
+                !strcmp(knp[i].name, "cpu_nsec_kernel")) {
+                cpu_stats[cpu].tot += knp[i].value.ui64;
+            }
+        }
+        cpu_stats[cpu].crtime = ksp_sys->ks_crtime;
+        snaptime = ksp_sys->ks_snaptime / nsec_per_sec;
+        if (minsnap == -1 || snaptime < minsnap) {
+            minsnap = snaptime;
+        }
+        if (snaptime > maxsnap) {
+            maxsnap = snaptime;
+        }
+        cpucnt++;
+    }
+
+    if (cpucnt <= 1) {
+        sleeptime = ONECPU_SLEEPTIME;
+        return 0; /* nothing to do with 1 CPU */
+    }
+
+    /*
+     * Iterate over the ivecs. If the cpu is not on-line, ignore the
+     * ivecs mapped to it, if any. Disabled ivecs are ignored as well.
+     *
+     * TODO: record pci_intrs:{inum}:<nexus>:time, crtime, pil, ino, name
+     * and buspath; so far only snaptime is used, for minsnap/maxsnap.
+     */
+    for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
+        kstat_named_t *knp;
+        int cpu;
+        double snaptime;
+
+        if ((ksp->ks_type != KSTAT_TYPE_NAMED) ||
+            strcmp(ksp->ks_module, "pci_intrs") ||
+            (kstat_read(kc, ksp, NULL) == -1)) {
+            continue;
+        }
+        knp = kstat_data_lookup(ksp, "cpu");
+        if (knp == NULL) {
+            continue;
+        }
+        /* NOTE(review): confirm "cpu" really is a 32-bit statistic. */
+        cpu = knp->value.ui32;
+        if (cpu < 0 || cpu >= max_cpus ||
+            (cpu_stats[cpu].state != P_ONLINE)) {
+            continue;
+        }
+        knp = kstat_data_lookup(ksp, "type");
+        if ((knp == NULL) || !strcmp(knp->value.c, "disabled")) {
+            continue;
+        }
+        knp = kstat_data_lookup(ksp, "buspath");
+        if (knp == NULL) {
+            continue;
+        }
+        snaptime = ksp->ks_snaptime / nsec_per_sec;
+        if (minsnap == -1 || snaptime < minsnap) {
+            minsnap = snaptime;
+        }
+        if (snaptime > maxsnap) {
+            maxsnap = snaptime;
+        }
+    }
+
+    /*
+     * All MSI interrupts of a device instance share a single MSI address.
+     * On X86 systems with an APIC, this MSI address is interpreted as CPU
+     * routing info by the APIC. For this reason, on these platforms, all
+     * interrupts for MSI devices must be moved to the same CPU at the same
+     * time.
+     *
+     * Since all interrupts will be on the same CPU on these platforms, all
+     * interrupts can be consolidated into one ivec entry. For such devices,
+     * num_ino will be > 1 to denote that a group move is needed.
+     *
+     * TODO: loop thru all MSI devices on X86 pcplusmp systems (a nop on
+     * other systems); this consolidation is not implemented yet.
+     */
+
+    /*
+     * We define the timerange as the amount of time spent gathering the
+     * various kstats, divided by our sleeptime. If we take a lot of time
+     * to access the kstats, a delta comparing these kstats with a prior
+     * set will cover a substantially different amount of time depending
+     * upon which interrupt or CPU is being examined.
+     *
+     * timerange_toohi is the upper bound; any timerange above it is
+     * thrown out as garbage. Within the bound, we treat the stat as
+     * representing an instant in time rather than the range it actually
+     * spans, and arbitrarily choose minsnap as its snaptime.
+     */
+
+    stat->snaptime = minsnap;
+    if (((maxsnap - minsnap) / sleeptime) > timerange_toohi) {
+        return 0;
+    }
+    return 1;
+}
+