6533678 dtrace incorrectly resumes execution in the middle of 2-byte int 3 instructions
authorsethg
Fri, 30 Mar 2007 19:27:19 -0700
changeset 3939 8f689b97125d
parent 3938 670947f6c3f6
child 3940 5517729c4306
6533678 dtrace incorrectly resumes execution in the middle of 2-byte int 3 instructions 6533954 kmdb destroys usermode GSBASE when entering 6534035 dtrace_user_probe fails to consider alternate code selectors when checking for int3 6534061 amd64 DEBUG kernel's __SAVE_REGS should save the {gs,fs}base registers 6534200 syscall rewriting code is not thread-safe 6534218 Incorrect ASSERT logic in prnldt induces panic 6534277 syscall rewriting code in trap() should be aware of alternate segment selectors
usr/src/uts/i86pc/os/dtrace_subr.c
usr/src/uts/i86pc/os/machdep.c
usr/src/uts/i86pc/os/trap.c
usr/src/uts/i86pc/sys/machsystm.h
usr/src/uts/intel/amd64/ml/mach_offsets.in
usr/src/uts/intel/amd64/sys/privregs.h
usr/src/uts/intel/fs/proc/prmachdep.c
usr/src/uts/intel/kdi/amd64/kdi_asm.s
--- a/usr/src/uts/i86pc/os/dtrace_subr.c	Fri Mar 30 17:01:13 2007 -0700
+++ b/usr/src/uts/i86pc/os/dtrace_subr.c	Fri Mar 30 19:27:19 2007 -0700
@@ -240,7 +240,8 @@
 		rp->r_pc = npc;
 
 	} else if (rp->r_trapno == T_BPTFLT) {
-		uint8_t instr;
+		uint8_t instr, instr2;
+		caddr_t linearpc;
 		rwp = &CPU->cpu_ft_lock;
 
 		/*
@@ -258,14 +259,25 @@
 		}
 		rw_exit(rwp);
 
+		if (dtrace_linear_pc(rp, p, &linearpc) != 0) {
+			trap(rp, addr, cpuid);
+			return;
+		}
+
 		/*
 		 * If the instruction that caused the breakpoint trap doesn't
 		 * look like an int 3 anymore, it may be that this tracepoint
 		 * was removed just after the user thread executed it. In
 		 * that case, return to user land to retry the instuction.
+		 * Note that we assume the length of the instruction to retry
+		 * is 1 byte because that's the length of FASTTRAP_INSTR.
+		 * We check for r_pc > 0 and > 2 so that we don't have to
+		 * deal with segment wraparound.
 		 */
-		if (fuword8((void *)(rp->r_pc - 1), &instr) == 0 &&
-		    instr != FASTTRAP_INSTR) {
+		if (rp->r_pc > 0 && fuword8(linearpc - 1, &instr) == 0 &&
+		    instr != FASTTRAP_INSTR &&
+		    (instr != 3 || (rp->r_pc >= 2 &&
+		    (fuword8(linearpc - 2, &instr2) != 0 || instr2 != 0xCD)))) {
 			rp->r_pc--;
 			return;
 		}
--- a/usr/src/uts/i86pc/os/machdep.c	Fri Mar 30 17:01:13 2007 -0700
+++ b/usr/src/uts/i86pc/os/machdep.c	Fri Mar 30 19:27:19 2007 -0700
@@ -29,6 +29,7 @@
 #include <sys/types.h>
 #include <sys/t_lock.h>
 #include <sys/param.h>
+#include <sys/segments.h>
 #include <sys/sysmacros.h>
 #include <sys/signal.h>
 #include <sys/systm.h>
@@ -1019,3 +1020,100 @@
 {
 	return (0);
 }
+
+/*
+ * Calculates a linear address, given the CS selector and PC values
+ * by looking up the %cs selector process's LDT or the CPU's GDT.
+ * proc->p_ldtlock must be held across this call.
+ */
+int
+linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp)
+{
+	user_desc_t	*descrp;
+	caddr_t		baseaddr;
+	uint16_t	idx = SELTOIDX(rp->r_cs);
+
+	ASSERT(rp->r_cs <= 0xFFFF);
+	ASSERT(MUTEX_HELD(&p->p_ldtlock));
+
+	if (SELISLDT(rp->r_cs)) {
+		/*
+		 * Currently 64 bit processes cannot have private LDTs.
+		 */
+		ASSERT(p->p_model != DATAMODEL_LP64);
+
+		if (p->p_ldt == NULL)
+			return (-1);
+
+		descrp = &p->p_ldt[idx];
+		baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
+
+		/*
+		 * Calculate the linear address (wraparound is not only ok,
+		 * it's expected behavior).  The cast to uint32_t is because
+		 * LDT selectors are only allowed in 32-bit processes.
+		 */
+		*linearp = (caddr_t)(uintptr_t)(uint32_t)((uintptr_t)baseaddr +
+		    rp->r_pc);
+	} else {
+#ifdef DEBUG
+		descrp = &CPU->cpu_gdt[idx];
+		baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
+		/* GDT-based descriptors' base addresses should always be 0 */
+		ASSERT(baseaddr == 0);
+#endif
+		*linearp = (caddr_t)(uintptr_t)rp->r_pc;
+	}
+
+	return (0);
+}
+
+/*
+ * The implementation of dtrace_linear_pc is similar to the that of
+ * linear_pc, above, but here we acquire p_ldtlock before accessing
+ * p_ldt.  This implementation is used by the pid provider; we prefix
+ * it with "dtrace_" to avoid inducing spurious tracing events.
+ */
+int
+dtrace_linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp)
+{
+	user_desc_t	*descrp;
+	caddr_t		baseaddr;
+	uint16_t	idx = SELTOIDX(rp->r_cs);
+
+	ASSERT(rp->r_cs <= 0xFFFF);
+
+	if (SELISLDT(rp->r_cs)) {
+		/*
+		 * Currently 64 bit processes cannot have private LDTs.
+		 */
+		ASSERT(p->p_model != DATAMODEL_LP64);
+
+		mutex_enter(&p->p_ldtlock);
+		if (p->p_ldt == NULL) {
+			mutex_exit(&p->p_ldtlock);
+			return (-1);
+		}
+		descrp = &p->p_ldt[idx];
+		baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
+		mutex_exit(&p->p_ldtlock);
+
+		/*
+		 * Calculate the linear address (wraparound is not only ok,
+		 * it's expected behavior).  The cast to uint32_t is because
+		 * LDT selectors are only allowed in 32-bit processes.
+		 */
+		*linearp = (caddr_t)(uintptr_t)(uint32_t)((uintptr_t)baseaddr +
+		    rp->r_pc);
+	} else {
+#ifdef DEBUG
+		descrp = &CPU->cpu_gdt[idx];
+		baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
+		/* GDT-based descriptors' base addresses should always be 0 */
+		ASSERT(baseaddr == 0);
+#endif
+		*linearp = (caddr_t)(uintptr_t)rp->r_pc;
+	}
+
+	return (0);
+}
--- a/usr/src/uts/i86pc/os/trap.c	Fri Mar 30 17:01:13 2007 -0700
+++ b/usr/src/uts/i86pc/os/trap.c	Fri Mar 30 19:27:19 2007 -0700
@@ -132,6 +132,9 @@
 
 #define	TRAP_TYPES	(sizeof (trap_type) / sizeof (trap_type[0]))
 
+#define	SLOW_SCALL_SIZE	2
+#define	FAST_SCALL_SIZE	2
+
 int tudebug = 0;
 int tudebugbpt = 0;
 int tudebugfpe = 0;
@@ -206,8 +209,6 @@
  * int <vector> is two bytes: 0xCD <vector>
  */
 
-#define	SLOW_SCALL_SIZE	2
-
 static int
 rewrite_syscall(caddr_t pc)
 {
@@ -227,28 +228,88 @@
  *
  * sysenter is two bytes: 0x0F 0x34
  * syscall is two bytes:  0x0F 0x05
+ * int $T_SYSCALLINT is two bytes: 0xCD 0x91
  */
 
-#define	FAST_SCALL_SIZE	2
-
 static int
-instr_is_fast_syscall(caddr_t pc, int which)
+instr_is_other_syscall(caddr_t pc, int which)
 {
 	uchar_t instr[FAST_SCALL_SIZE];
 
-	ASSERT(which == X86_SEP || which == X86_ASYSC);
+	ASSERT(which == X86_SEP || which == X86_ASYSC || which == 0xCD);
 
-	if (copyin_nowatch(pc, (caddr_t)instr, FAST_SCALL_SIZE) != 0 ||
-	    instr[0] != 0x0F)
+	if (copyin_nowatch(pc, (caddr_t)instr, FAST_SCALL_SIZE) != 0)
 		return (0);
 
-	if ((which == X86_SEP && instr[1] == 0x34) ||
-	    (which == X86_ASYSC && instr[1] == 0x05))
-		return (1);
+	switch (which) {
+	case X86_SEP:
+		if (instr[0] == 0x0F && instr[1] == 0x34)
+			return (1);
+		break;
+	case X86_ASYSC:
+		if (instr[0] == 0x0F && instr[1] == 0x05)
+			return (1);
+		break;
+	case 0xCD:
+		if (instr[0] == 0xCD && instr[1] == T_SYSCALLINT)
+			return (1);
+		break;
+	}
 
 	return (0);
 }
 
+static const char *
+syscall_insn_string(int syscall_insn)
+{
+	switch (syscall_insn) {
+	case X86_SEP:
+		return ("sysenter");
+	case X86_ASYSC:
+		return ("syscall");
+	case 0xCD:
+		return ("int");
+	default:
+		return ("Unknown");
+	}
+}
+
+static int
+ldt_rewrite_syscall(struct regs *rp, proc_t *p, int syscall_insn)
+{
+	caddr_t	linearpc;
+	int return_code = 0;
+
+	mutex_enter(&p->p_ldtlock);	/* Must be held across linear_pc() */
+
+	if (linear_pc(rp, p, &linearpc) == 0) {
+
+		/*
+		 * If another thread beat us here, it already changed
+		 * this site to the slower (int) syscall instruction.
+		 */
+		if (instr_is_other_syscall(linearpc, 0xCD)) {
+			return_code = 1;
+		} else if (instr_is_other_syscall(linearpc, syscall_insn)) {
+
+			if (rewrite_syscall(linearpc) == 0) {
+				return_code = 1;
+			}
+#ifdef DEBUG
+			else
+				cmn_err(CE_WARN, "failed to rewrite %s "
+				    "instruction in process %d",
+				    syscall_insn_string(syscall_insn),
+				    p->p_pid);
+#endif /* DEBUG */
+		}
+	}
+
+	mutex_exit(&p->p_ldtlock);	/* Must be held across linear_pc() */
+
+	return (return_code);
+}
+
 /*
  * Test to see if the instruction at pc is a system call instruction.
  *
@@ -260,7 +321,7 @@
 #define	LCALLSIZE	7
 
 static int
-instr_is_syscall(caddr_t pc)
+instr_is_lcall_syscall(caddr_t pc)
 {
 	uchar_t instr[LCALLSIZE];
 
@@ -704,7 +765,7 @@
 		 * trap gate sequence.  We just have to adjust the pc.
 		 */
 		if (watchpage && addr == (caddr_t)rp->r_sp &&
-		    rw == S_READ && instr_is_syscall((caddr_t)rp->r_pc)) {
+		    rw == S_READ && instr_is_lcall_syscall((caddr_t)rp->r_pc)) {
 			extern void watch_syscall(void);
 
 			rp->r_pc += LCALLSIZE;
@@ -828,16 +889,8 @@
 		 * be to emulate that particular instruction.
 		 */
 		if (p->p_ldt != NULL &&
-		    instr_is_fast_syscall((caddr_t)rp->r_pc, X86_ASYSC)) {
-			if (rewrite_syscall((caddr_t)rp->r_pc) == 0)
-				goto out;
-#ifdef DEBUG
-			else
-				cmn_err(CE_WARN, "failed to rewrite syscall "
-				    "instruction in process %d",
-				    curthread->t_procp->p_pid);
-#endif /* DEBUG */
-		}
+		    ldt_rewrite_syscall(rp, p, X86_ASYSC))
+			goto out;
 
 #ifdef __amd64
 		/*
@@ -1114,7 +1167,7 @@
 #ifdef _SYSCALL32_IMPL
 		if (p->p_model != DATAMODEL_NATIVE) {
 #endif /* _SYSCALL32_IMPL */
-		if (instr_is_syscall((caddr_t)rp->r_pc)) {
+		if (instr_is_lcall_syscall((caddr_t)rp->r_pc)) {
 			if (type == T_SEGFLT + USER)
 				ASSERT(p->p_ldt != NULL);
 
@@ -1161,16 +1214,9 @@
 		 * this will be to emulate that particular instruction.
 		 */
 		if (p->p_ldt != NULL &&
-		    instr_is_fast_syscall((caddr_t)rp->r_pc, X86_SEP)) {
-			if (rewrite_syscall((caddr_t)rp->r_pc) == 0)
-				goto out;
-#ifdef DEBUG
-			else
-				cmn_err(CE_WARN, "failed to rewrite sysenter "
-				    "instruction in process %d",
-				    curthread->t_procp->p_pid);
-#endif /* DEBUG */
-		}
+		    ldt_rewrite_syscall(rp, p, X86_SEP))
+			goto out;
+
 		/*FALLTHROUGH*/
 
 	case T_BOUNDFLT + USER:	/* bound fault */
--- a/usr/src/uts/i86pc/sys/machsystm.h	Fri Mar 30 17:01:13 2007 -0700
+++ b/usr/src/uts/i86pc/sys/machsystm.h	Fri Mar 30 19:27:19 2007 -0700
@@ -121,6 +121,8 @@
 extern void memlist_add(uint64_t, uint64_t, struct memlist *,
     struct memlist **);
 extern page_t *page_get_physical(uintptr_t);
+extern int linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp);
+extern int dtrace_linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp);
 
 #endif /* _KERNEL */
 
--- a/usr/src/uts/intel/amd64/ml/mach_offsets.in	Fri Mar 30 17:01:13 2007 -0700
+++ b/usr/src/uts/intel/amd64/ml/mach_offsets.in	Fri Mar 30 19:27:19 2007 -0700
@@ -77,6 +77,10 @@
 	r_r13	REGOFF_R13
 	r_r14	REGOFF_R14
 	r_r15	REGOFF_R15
+\#if DEBUG
+	__r_fsbase	REGOFF_FSBASE
+	__r_gsbase	REGOFF_GSBASE
+\#endif
 	r_ds	REGOFF_DS
 	r_es	REGOFF_ES
 	r_fs	REGOFF_FS
--- a/usr/src/uts/intel/amd64/sys/privregs.h	Fri Mar 30 17:01:13 2007 -0700
+++ b/usr/src/uts/intel/amd64/sys/privregs.h	Fri Mar 30 19:27:19 2007 -0700
@@ -78,15 +78,11 @@
 	greg_t	r_r15;		/* callee-saved */
 
 	/*
-	 * XX64
-	 * We used to sample fsbase and gsbase on every exception
-	 * with expensive rdmsr's. Yet this was only useful at
-	 * best for debugging during amd64 bringup. We should take
-	 * these away but for now simply rename them to avoid any
-	 * flag days.
+	 * fsbase and gsbase are sampled on every exception in DEBUG kernels
+	 * only.  They remain in the non-DEBUG kernel to avoid any flag days.
 	 */
-	greg_t	__r_fsbase;	/* XX64 no longer used by the kernel */
-	greg_t	__r_gsbase;	/* XX64 no longer used by the kernel */
+	greg_t	__r_fsbase;	/* no longer used in non-DEBUG builds */
+	greg_t	__r_gsbase;	/* no longer used in non-DEBUG builds */
 	greg_t	r_ds;
 	greg_t	r_es;
 	greg_t	r_fs;		/* %fs is *never* used by the kernel */
@@ -123,6 +119,20 @@
 #include <sys/machprivregs.h>
 #include <sys/pcb.h>
 
+#ifdef DEBUG
+#define	__SAVE_BASES				\
+	movl    $MSR_AMD_FSBASE, %ecx;          \
+	rdmsr;                                  \
+	movl    %eax, REGOFF_FSBASE(%rsp);      \
+	movl    %edx, REGOFF_FSBASE+4(%rsp);    \
+	movl    $MSR_AMD_GSBASE, %ecx;          \
+	rdmsr;                                  \
+	movl    %eax, REGOFF_GSBASE(%rsp);      \
+	movl    %edx, REGOFF_GSBASE+4(%rsp)
+#else
+#define	__SAVE_BASES
+#endif
+
 /*
  * Create a struct regs on the stack suitable for an
  * interrupt trap.
@@ -157,7 +167,8 @@
 	movw	%es, %cx;			\
 	movq	%rcx, REGOFF_ES(%rsp);		\
 	movw	%ds, %cx;			\
-	movq	%rcx, REGOFF_DS(%rsp)
+	movq	%rcx, REGOFF_DS(%rsp);		\
+	__SAVE_BASES
 
 #define	__RESTORE_REGS				\
 	movq	REGOFF_RDI(%rsp),	%rdi;	\
--- a/usr/src/uts/intel/fs/proc/prmachdep.c	Fri Mar 30 17:01:13 2007 -0700
+++ b/usr/src/uts/intel/fs/proc/prmachdep.c	Fri Mar 30 19:27:19 2007 -0700
@@ -508,10 +508,9 @@
 	ASSERT(MUTEX_HELD(&p->p_ldtlock));
 
 	/*
-	 * Currently 64 bit processes cannot have a private ldt.
+	 * Currently 64 bit processes cannot have private LDTs.
 	 */
-	ASSERT(get_udatamodel() != DATAMODEL_LP64 || p->p_ldt == NULL);
-
+	ASSERT(p->p_model != DATAMODEL_LP64 || p->p_ldt == NULL);
 
 	if (p->p_ldt == NULL)
 		return (0);
--- a/usr/src/uts/intel/kdi/amd64/kdi_asm.s	Fri Mar 30 17:01:13 2007 -0700
+++ b/usr/src/uts/intel/kdi/amd64/kdi_asm.s	Fri Mar 30 19:27:19 2007 -0700
@@ -104,10 +104,20 @@
 	movw	%fs, %ax;				\
 	movq	%rax, REG_OFF(KDIREG_FS)(base);		\
 	movw	%gs, %ax;				\
-	movq	%rax, REG_OFF(KDIREG_GS)(base)
+	movq	%rax, REG_OFF(KDIREG_GS)(base);		\
+	movl	$MSR_AMD_GSBASE, %ecx;			\
+	rdmsr;						\
+	shlq	$32, %rdx;				\
+	orq	%rax, %rdx;				\
+	movq	%rdx, REG_OFF(KDIREG_GSBASE)(base)
 
 #define	KDI_RESTORE_REGS(base) \
 	movq	base, %rdi;				\
+	movq	REG_OFF(KDIREG_GSBASE)(%rdi), %rdx;	\
+	movq	%rdx, %rax;				\
+	shrq	$32, %rdx;				\
+	movl	$MSR_AMD_GSBASE, %ecx;			\
+	wrmsr;						\
 	movq	REG_OFF(KDIREG_ES)(%rdi), %rax;		\
 	movw	%ax, %es;				\
 	movq	REG_OFF(KDIREG_DS)(%rdi), %rax;		\
@@ -193,7 +203,9 @@
 	 * Switch to the kernel's GSBASE.  Neither GSBASE nor the ill-named
 	 * KGSBASE can be trusted, as the kernel may or may not have already
 	 * done a swapgs.  All is not lost, as the kernel can divine the correct
-	 * value for us.
+	 * value for us.  Note that the previous GSBASE is saved in the
+	 * KDI_SAVE_REGS macro to prevent a usermode process's GSBASE from being
+	 * blown away.
 	 */
 	subq	$10, %rsp
 	sgdt	(%rsp)