--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/lib/brand/lx/lx_brand/common/sched.c	Mon Sep 11 22:51:59 2006 -0700
@@ -0,0 +1,610 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/cred_impl.h>
+#include <sys/ucred.h>
+#include <ucred.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <errno.h>
+#include <sched.h>
+#include <strings.h>
+#include <pthread.h>
+#include <time.h>
+#include <thread.h>
+#include <alloca.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/lx_syscall.h>
+#include <sys/lx_debug.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_sched.h>
+
+/* Linux only has three valid policies, SCHED_FIFO, SCHED_RR and SCHED_OTHER */
+static int
+validate_policy(int policy)
+{
+	switch (policy) {
+		case LX_SCHED_FIFO:
+			return (SCHED_FIFO);
+
+		case LX_SCHED_RR:
+			return (SCHED_RR);
+
+		case LX_SCHED_OTHER:
+			return (SCHED_OTHER);
+
+		default:
+			lx_debug("validate_policy: illegal policy: %d", policy);
+			return (-EINVAL);
+	}
+}
+
+/*
+ * Check to see if we have the permissions to set scheduler parameters and
+ * policy, based on Linux' demand that such commands fail with errno set to
+ * EPERM if the current euid is not the euid or ruid of the process in
+ * question.
+ */
+static int
+check_schedperms(pid_t pid)
+{
+	size_t sz;
+	ucred_t *cr;
+	uid_t euid;
+
+	euid = geteuid();
+
+	if (pid == getpid()) {
+		/*
+		 * If we're the process to be checked, simply check the euid
+		 * against our ruid.
+		 */
+		if (euid != getuid())
+			return (-EPERM);
+
+		return (0);
+	}
+
+	/*
+	 * We allocate a ucred_t ourselves rather than call ucred_get(3C)
+	 * because ucred_get() calls malloc(3C), which the brand library cannot
+	 * use.  Because we allocate the space with SAFE_ALLOCA(), there's
+	 * no need to free it when we're done.
+	 */
+	sz = ucred_size();
+	cr = (ucred_t *)SAFE_ALLOCA(sz);
+
+	if (cr == NULL)
+		return (-ENOMEM);
+
+	/*
+	 * If we can't access the process' credentials, fail with errno EPERM
+	 * as the call would not have succeeded anyway.
+	 */
+	if (syscall(SYS_ucredsys, UCREDSYS_UCREDGET, pid, cr) != 0)
+		return ((errno == EACCES) ? -EPERM : -errno);
+
+	if ((euid != ucred_geteuid(cr)) && (euid != ucred_getruid(cr)))
+		return (-EPERM);
+
+	return (0);
+}
+
+static int
+ltos_sparam(int policy, struct lx_sched_param *lsp, struct sched_param *sp)
+{
+	struct lx_sched_param ls;
+	int smin = sched_get_priority_min(policy);
+	int smax = sched_get_priority_max(policy);
+
+	if (uucopy(lsp, &ls, sizeof (struct lx_sched_param)) != 0)
+		return (-errno);
+
+	bzero(sp, sizeof (struct sched_param));
+
+	/*
+	 * Linux has a fixed priority range, 0 - 99, which we need to convert to
+	 * Solaris's dynamic range. Linux considers lower numbers to be
+	 * higher priority, so we'll invert the priority within Solaris's range.
+	 *
+	 * The formula to convert between ranges is:
+	 *
+	 *	L * (smax - smin)
+	 * S =  -----------------  + smin
+	 *	  (lmax - lmin)
+	 *
+	 * where S is the Solaris equivalent of the linux priority L.
+	 *
+	 * To invert the priority, we use:
+	 * S' = smax - S + smin
+	 *
+	 * Together, these two formulas become:
+	 *
+	 *		L * (smax - smin)
+	 *   S = smax - -----------------  + 2smin
+	 *			99
+	 */
+	sp->sched_priority = smax -
+	    ((ls.lx_sched_prio * (smax - smin)) / LX_PRI_MAX) + 2*smin;
+
+	lx_debug("ltos_sparam: linux prio %d = Solaris prio %d "
+	    "(Solaris range %d,%d)\n", ls.lx_sched_prio, sp->sched_priority,
+	    smin, smax);
+
+	return (0);
+}
+
+static int
+stol_sparam(int policy, struct sched_param *sp, struct lx_sched_param *lsp)
+{
+	struct lx_sched_param ls;
+	int smin = sched_get_priority_min(policy);
+	int smax = sched_get_priority_max(policy);
+
+	if (policy == SCHED_OTHER) {
+		/*
+		 * In Linux, the only valid SCHED_OTHER scheduler priority is 0
+		 */
+		ls.lx_sched_prio = 0;
+	} else {
+		/*
+		 * Convert Solaris's dynamic, inverted priority range to the
+		 * fixed Linux range of 1 - 99.
+		 *
+		 * The formula is (see above):
+		 *
+		 *	(smax - s + 2smin) * 99
+		 *  l = -----------------------
+		 *		smax - smin
+		 */
+		ls.lx_sched_prio = ((smax - sp->sched_priority + 2*smin) *
+		    LX_PRI_MAX) / (smax - smin);
+	}
+
+	lx_debug("stol_sparam: Solaris prio %d = linux prio %d "
+	    "(Solaris range %d,%d)\n", sp->sched_priority, ls.lx_sched_prio,
+	    smin, smax);
+
+	return ((uucopy(&ls, lsp, sizeof (struct lx_sched_param)) != 0)
+	    ? -errno : 0);
+}
+
+#define	BITINDEX(ind)	(ind / (sizeof (ulong_t) * 8))
+#define	BITSHIFT(ind)	(1 << (ind % (sizeof (ulong_t) * 8)))
+
+/* ARGSUSED */
+int
+lx_sched_getaffinity(uintptr_t pid, uintptr_t len, uintptr_t maskp)
+{
+	int	sz;
+	ulong_t	*lmask, *zmask;
+	int	i;
+
+	sz = syscall(SYS_brand, B_GET_AFFINITY_MASK, pid, len, maskp);
+	if (sz == -1)
+		return (-errno);
+
+	/*
+	 * If the target LWP hasn't ever had an affinity mask set, the kernel
+	 * will return a mask of all 0's. If that is the case we must build a
+	 * default mask that has all valid bits turned on.
+	 */
+	lmask = SAFE_ALLOCA(sz);
+	zmask = SAFE_ALLOCA(sz);
+	if (lmask == NULL || zmask == NULL)
+		return (-ENOMEM);
+
+	bzero(zmask, sz);
+
+	if (uucopy((void *)maskp, lmask, sz) != 0)
+		return (-EFAULT);
+
+	if (bcmp(lmask, zmask, sz) != 0)
+		return (sz);
+
+	for (i = 0; i < sz * 8; i++) {
+		if (p_online(i, P_STATUS) != -1) {
+			lmask[BITINDEX(i)] |= BITSHIFT(i);
+		}
+	}
+
+	if (uucopy(lmask, (void *)maskp, sz) != 0)
+		return (-EFAULT);
+
+	return (sz);
+}
+
+/* ARGSUSED */
+int
+lx_sched_setaffinity(uintptr_t pid, uintptr_t len, uintptr_t maskp)
+{
+	int		ret;
+	int		sz;
+	int		i;
+	int		found;
+	ulong_t		*lmask;
+	pid_t		s_pid;
+	lwpid_t		s_tid;
+	processorid_t	cpuid;
+
+	if ((pid_t)pid < 0)
+		return (-EINVAL);
+
+	if (lx_lpid_to_spair(pid, &s_pid, &s_tid) < 0)
+		return (-ESRCH);
+
+	/*
+	 * We only support setting affinity masks for threads in
+	 * the calling process.
+	 */
+	if (s_pid != getpid())
+		return (-EPERM);
+
+	/*
+	 * First, get the minimum bitmask size from the kernel.
+	 */
+	sz = syscall(SYS_brand, B_GET_AFFINITY_MASK, 0, 0, 0);
+	if (sz == -1)
+		return (-errno);
+
+	lmask = SAFE_ALLOCA(sz);
+	if (lmask == NULL)
+		return (-ENOMEM);
+
+	if (uucopy((void *)maskp, lmask, sz) != 0)
+		return (-EFAULT);
+
+	/*
+	 * Make sure the mask contains at least one processor that is
+	 * physically on the system. Reduce the user's mask to the set of
+	 * physically present CPUs. Keep track of how many valid
+	 * bits are set in the user's mask.
+	 */
+
+	for (found = 0, i = 0; i < sz * 8; i++) {
+		if (p_online(i, P_STATUS) == -1) {
+			/*
+			 * This CPU doesn't exist, so clear this bit from
+			 * the user's mask.
+			 */
+			lmask[BITINDEX(i)] &= ~BITSHIFT(i);
+			continue;
+		}
+
+		if ((lmask[BITINDEX(i)] & BITSHIFT(i)) == BITSHIFT(i)) {
+			found++;
+			cpuid = i;
+		}
+	}
+
+	if (found == 0) {
+		lx_debug("\tlx_sched_setaffinity: mask has no present CPUs\n");
+		return (-EINVAL);
+	}
+
+	/*
+	 * If only one bit is set, bind the thread to that procesor;
+	 * otherwise, clear the binding.
+	 */
+	if (found == 1) {
+		lx_debug("\tlx_sched_setaffinity: binding thread %d to cpu%d\n",
+		    s_tid, cpuid);
+		if (processor_bind(P_LWPID, s_tid, cpuid, NULL) != 0)
+			/*
+			 * It could be that the requested processor is offline,
+			 * so we'll just abandon our good-natured attempt to
+			 * bind to it.
+			 */
+			lx_debug("couldn't bind LWP %d to cpu %d: %s\n", s_tid,
+			    cpuid, strerror(errno));
+	} else {
+		lx_debug("\tlx_sched_setaffinity: clearing thr %d binding\n",
+		    s_tid);
+		if (processor_bind(P_LWPID, s_tid, PBIND_NONE, NULL) != 0) {
+			lx_debug("couldn't clear CPU binding for LWP %d: %s\n",
+			    s_tid, strerror(errno));
+		}
+	}
+
+	/*
+	 * Finally, ask the kernel to make a note of our current (though fairly
+	 * meaningless) affinity mask.
+	 */
+	ret = syscall(SYS_brand, B_SET_AFFINITY_MASK, pid, sz, lmask);
+
+	return ((ret == 0) ? 0 : -errno);
+}
+
+int
+lx_sched_getparam(uintptr_t pid, uintptr_t param)
+{
+	int	policy, ret;
+	pid_t	s_pid;
+	lwpid_t	s_tid;
+
+	struct sched_param sp;
+
+	if (((pid_t)pid < 0) || (param == NULL))
+		return (-EINVAL);
+
+	if (lx_lpid_to_spair((pid_t)pid, &s_pid, &s_tid) < 0)
+		return (-ESRCH);
+
+	/*
+	 * If we're attempting to get information on our own process, we can
+	 * get data on a per-thread basis; if not, punt and use the specified
+	 * pid.
+	 */
+	if (s_pid == getpid()) {
+		if ((ret = pthread_getschedparam(s_tid, &policy, &sp)) != 0)
+		    return (-ret);
+	} else {
+		if (sched_getparam(s_pid, &sp) == -1)
+			return (-errno);
+
+		if ((policy = sched_getscheduler(s_pid)) < 0)
+			return (-errno);
+	}
+
+	return (stol_sparam(policy, &sp, (struct lx_sched_param *)param));
+}
+
+int
+lx_sched_setparam(uintptr_t pid, uintptr_t param)
+{
+	int	err, policy;
+	pid_t	s_pid;
+	lwpid_t	s_tid;
+	struct lx_sched_param lp;
+	struct sched_param sp;
+
+	if (((pid_t)pid < 0) || (param == NULL))
+		return (-EINVAL);
+
+	if (lx_lpid_to_spair((pid_t)pid, &s_pid, &s_tid) < 0)
+		return (-ESRCH);
+
+	if (s_pid == getpid()) {
+		struct sched_param dummy;
+
+		if ((err = pthread_getschedparam(s_tid, &policy, &dummy)) != 0)
+			return (-err);
+	} else
+		if ((policy = sched_getscheduler(s_pid)) < 0)
+			return (-errno);
+
+	lx_debug("sched_setparam(): current policy %d", policy);
+
+	if (uucopy((void *)param, &lp, sizeof (lp)) != 0)
+		return (-errno);
+
+	/*
+	 * In Linux, the only valid SCHED_OTHER scheduler priority is 0
+	 */
+	if ((policy == SCHED_OTHER) && (lp.lx_sched_prio != 0))
+		return (-EINVAL);
+
+	if ((err = ltos_sparam(policy, (struct lx_sched_param *)&lp,
+	    &sp)) != 0)
+		return (err);
+
+	/*
+	 * Check if we're allowed to change the scheduler for the process.
+	 *
+	 * If we're operating on a thread, we can't just call
+	 * pthread_setschedparam() because as all threads reside within a
+	 * single Solaris process, Solaris will allow the modification
+	 *
+	 * If we're operating on a process, we can't just call sched_setparam()
+	 * because Solaris will allow the call to succeed if the scheduler
+	 * parameters do not differ from those being installed, but Linux wants
+	 * the call to fail.
+	 */
+	if ((err = check_schedperms(s_pid)) != 0)
+		return (err);
+
+	if (s_pid == getpid())
+		return (((err = pthread_setschedparam(s_tid, policy, &sp)) != 0)
+		    ? -err : 0);
+
+	return ((sched_setparam(s_pid, &sp) == -1) ? -errno : 0);
+}
+
+int
+lx_sched_rr_get_interval(uintptr_t pid, uintptr_t timespec)
+{
+	struct timespec ts;
+	pid_t	s_pid;
+
+	if ((pid_t)pid < 0)
+		return (-EINVAL);
+
+	if (lx_lpid_to_spid((pid_t)pid, &s_pid) < 0)
+		return (-ESRCH);
+
+	if (uucopy((struct timespec *)timespec, &ts,
+	    sizeof (struct timespec)) != 0)
+		return (-errno);
+
+	return ((sched_rr_get_interval(s_pid, &ts) == -1) ? -errno : 0);
+}
+
+int
+lx_sched_getscheduler(uintptr_t pid)
+{
+	int	policy, rv;
+	pid_t	s_pid;
+	lwpid_t	s_tid;
+
+	if ((pid_t)pid < 0)
+		return (-EINVAL);
+
+	if (lx_lpid_to_spair((pid_t)pid, &s_pid, &s_tid) < 0)
+		return (-ESRCH);
+
+	if (s_pid == getpid()) {
+		struct sched_param dummy;
+
+		if ((rv = pthread_getschedparam(s_tid, &policy, &dummy)) != 0)
+			return (-rv);
+	} else
+		if ((policy = sched_getscheduler(s_pid)) < 0)
+			return (-errno);
+
+	/*
+	 * Linux only supports certain policies; avoid confusing apps with
+	 * alien policies.
+	 */
+	switch (policy) {
+	case SCHED_FIFO:
+		return (LX_SCHED_FIFO);
+	case SCHED_OTHER:
+		return (LX_SCHED_OTHER);
+	case SCHED_RR:
+		return (LX_SCHED_RR);
+	default:
+		break;
+	}
+
+	return (LX_SCHED_OTHER);
+}
+
+int
+lx_sched_setscheduler(uintptr_t pid, uintptr_t policy, uintptr_t param)
+{
+	int	rt_pol;
+	int	rv;
+	pid_t	s_pid;
+	lwpid_t	s_tid;
+	struct lx_sched_param lp;
+
+	struct sched_param sp;
+
+	if (((pid_t)pid < 0) || (param == NULL))
+		return (-EINVAL);
+
+	if ((rt_pol = validate_policy((int)policy)) < 0)
+		return (rt_pol);
+
+	if ((rv = ltos_sparam(policy, (struct lx_sched_param *)param,
+	    &sp)) != 0)
+		return (rv);
+
+	if (uucopy((void *)param, &lp, sizeof (lp)) != 0)
+		return (-errno);
+
+	/*
+	 * In Linux, the only valid SCHED_OTHER scheduler priority is 0
+	 */
+	if ((rt_pol == LX_SCHED_OTHER) && (lp.lx_sched_prio != 0))
+		return (-EINVAL);
+
+	if (lx_lpid_to_spair((pid_t)pid, &s_pid, &s_tid) < 0)
+		return (-ESRCH);
+
+	/*
+	 * Check if we're allowed to change the scheduler for the process.
+	 *
+	 * If we're operating on a thread, we can't just call
+	 * pthread_setschedparam() because as all threads reside within a
+	 * single Solaris process, Solaris will allow the modification.
+	 *
+	 * If we're operating on a process, we can't just call
+	 * sched_setscheduler() because Solaris will allow the call to succeed
+	 * if the scheduler and scheduler parameters do not differ from those
+	 * being installed, but Linux wants the call to fail.
+	 */
+	if ((rv = check_schedperms(s_pid)) != 0)
+		return (rv);
+
+	if (s_pid == getpid()) {
+		struct sched_param param;
+		int pol;
+
+		if ((pol = sched_getscheduler(s_pid)) != 0)
+			return (-errno);
+
+		/*
+		 * sched_setscheduler() returns the previous scheduling policy
+		 * on success, so call pthread_getschedparam() to get the
+		 * current thread's scheduling policy and return that if the
+		 * call to pthread_setschedparam() succeeds.
+		 */
+		if ((rv = pthread_getschedparam(s_tid, &pol, &param)) != 0)
+			return (-rv);
+
+		return (((rv = pthread_setschedparam(s_tid, rt_pol, &sp)) != 0)
+		    ? -rv : pol);
+	}
+
+	return (((rv = sched_setscheduler(s_pid, rt_pol, &sp)) == -1)
+	    ? -errno : rv);
+}
+
+int
+lx_sched_get_priority_min(uintptr_t policy)
+{
+	/*
+	 * In Linux, the only valid SCHED_OTHER scheduler priority is 0.
+	 * Linux scheduling priorities are not alterable, so there is no
+	 * Solaris translation necessary.
+	 */
+	switch (policy) {
+	case LX_SCHED_FIFO:
+	case LX_SCHED_RR:
+		return (LX_SCHED_PRIORITY_MIN_RRFIFO);
+	case LX_SCHED_OTHER:
+		return (LX_SCHED_PRIORITY_MIN_OTHER);
+	default:
+		break;
+	}
+	return (-EINVAL);
+}
+
+int
+lx_sched_get_priority_max(uintptr_t policy)
+{
+	/*
+	 * In Linux, the only valid SCHED_OTHER scheduler priority is 0
+	 * Linux scheduling priorities are not alterable, so there is no
+	 * Solaris translation necessary.
+	 */
+	switch (policy) {
+	case LX_SCHED_FIFO:
+	case LX_SCHED_RR:
+		return (LX_SCHED_PRIORITY_MAX_RRFIFO);
+	case LX_SCHED_OTHER:
+		return (LX_SCHED_PRIORITY_MAX_OTHER);
+	default:
+		break;
+	}
+	return (-EINVAL);
+}