git: f0bc4ed144fc - main - kinst: Initial revision

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Tue, 11 Oct 2022 22:35:35 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=f0bc4ed144fc59b6f72d90c46b91ca803d3b29ce

commit f0bc4ed144fc59b6f72d90c46b91ca803d3b29ce
Author:     Christos Margiolis <christos@FreeBSD.org>
AuthorDate: 2022-10-11 15:28:17 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2022-10-11 22:19:08 +0000

    kinst: Initial revision
    
    This is a new DTrace provider which allows arbitrary kernel instructions
    to be traced.  Currently it is implemented only for amd64.
    
    kinst probes are created on demand by libdtrace, and there is a probe
    for each kernel instruction.  Probes are named
    kinst:<module>:<function>:<offset>, where "offset" is the offset of the
    target instruction relative to the beginning of the function.  Omitting
    "offset" causes all instructions in the function to be traced.
    
    kinst works similarly to FBT in that it places a breakpoint on the
    target instruction and hooks into the kernel breakpoint handler.
    Because kinst has to be able to trace arbitrary instructions, it does
    not emulate most of them in software but rather causes the traced thread
    to execute a copy of the instruction before returning to the original
    code.
    
    The provider is quite low-level and, as-is, will mostly be useful to
    kernel developers.  However, it provides a great deal of visibility into
    kernel code execution and could be used as a building block for
    higher-level tooling which can in some sense translate between C sources
    and generated machine code.  In particular, the "regs" variable recently
    added to D allows the CPU's register file to be accessed from kinst
    probes.
    
    kinst is experimental and should not be used on production systems for
    now.
    
    In collaboration with:  markj
    Sponsored by:           Google, Inc. (GSoC 2022)
    MFC after:              3 months
    Differential Revision:  https://reviews.freebsd.org/D36851
---
 sys/cddl/dev/dtrace/dtrace_cddl.h    |   2 +
 sys/cddl/dev/kinst/amd64/kinst_isa.c | 550 +++++++++++++++++++++++++++++++++++
 sys/cddl/dev/kinst/amd64/kinst_isa.h |  45 +++
 sys/cddl/dev/kinst/kinst.c           | 233 +++++++++++++++
 sys/cddl/dev/kinst/kinst.h           |  71 +++++
 sys/cddl/dev/kinst/trampoline.c      | 303 +++++++++++++++++++
 sys/modules/dtrace/Makefile          |   1 +
 sys/modules/dtrace/kinst/Makefile    |  17 ++
 8 files changed, 1222 insertions(+)

diff --git a/sys/cddl/dev/dtrace/dtrace_cddl.h b/sys/cddl/dev/dtrace/dtrace_cddl.h
index 08b6f80d4bae..b2397d621355 100644
--- a/sys/cddl/dev/dtrace/dtrace_cddl.h
+++ b/sys/cddl/dev/dtrace/dtrace_cddl.h
@@ -88,6 +88,7 @@ typedef struct kdtrace_thread {
 	void		*td_systrace_args; /* syscall probe arguments. */
 	uint64_t	td_fasttrap_tp_gen; /* Tracepoint hash table gen. */
 	struct trapframe *td_dtrace_trapframe; /* Trap frame from invop. */
+	void		*td_kinst;
 } kdtrace_thread_t;
 
 /*
@@ -117,6 +118,7 @@ typedef struct kdtrace_thread {
 #define	t_dtrace_systrace_args	td_dtrace->td_systrace_args
 #define	t_fasttrap_tp_gen	td_dtrace->td_fasttrap_tp_gen
 #define	t_dtrace_trapframe	td_dtrace->td_dtrace_trapframe
+#define	t_kinst		td_dtrace->td_kinst
 #define	p_dtrace_helpers	p_dtrace->p_dtrace_helpers
 #define	p_dtrace_count	p_dtrace->p_dtrace_count
 #define	p_dtrace_probes	p_dtrace->p_dtrace_probes
diff --git a/sys/cddl/dev/kinst/amd64/kinst_isa.c b/sys/cddl/dev/kinst/amd64/kinst_isa.c
new file mode 100644
index 000000000000..6d8d5d521617
--- /dev/null
+++ b/sys/cddl/dev/kinst/amd64/kinst_isa.c
@@ -0,0 +1,550 @@
+/*
+ * SPDX-License-Identifier: CDDL 1.0
+ *
+ * Copyright 2022 Christos Margiolis <christos@FreeBSD.org>
+ * Copyright 2022 Mark Johnston <markj@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+
+#include <sys/dtrace.h>
+#include <cddl/dev/dtrace/dtrace_cddl.h>
+#include <dis_tables.h>
+
+#include "kinst.h"
+
+#define KINST_PUSHL_RBP		0x55
+#define KINST_STI		0xfb
+#define KINST_POPF		0x9d
+
+#define KINST_MODRM_MOD(b)	(((b) & 0xc0) >> 6)
+#define KINST_MODRM_REG(b)	(((b) & 0x38) >> 3)
+#define KINST_MODRM_RM(b)	((b) & 0x07)
+
+#define KINST_SIB_SCALE(s)	(((s) & 0xc0) >> 6)
+#define KINST_SIB_INDEX(s)	(((s) & 0x38) >> 3)
+#define KINST_SIB_BASE(s)	(((s) & 0x07) >> 0)
+
+#define KINST_REX_W(r)		(((r) & 0x08) >> 3)
+#define KINST_REX_R(r)		(((r) & 0x04) >> 2)
+#define KINST_REX_X(r)		(((r) & 0x02) >> 1)
+#define KINST_REX_B(r)		(((r) & 0x01) >> 0)
+
+#define KINST_F_CALL		0x0001	/* instruction is a "call" */
+#define KINST_F_DIRECT_CALL	0x0002	/* instruction is a direct call */
+#define KINST_F_RIPREL		0x0004	/* instruction is position-dependent */
+#define KINST_F_JMP		0x0008	/* instruction is a direct or indirect jmp */
+#define KINST_F_MOD_DIRECT	0x0010	/* operand is not a memory address */
+
+/*
+ * Translate a 4-bit register number, as encoded in a ModR/M or SIB field and
+ * extended by a REX bit, into an index into a struct trapframe viewed as an
+ * array of register_t.  Any value outside 0-15 is a programming error and
+ * panics.
+ */
+static int
+kinst_regoff(int reg)
+{
+#define	_TF_SLOT(r)	\
+	(offsetof(struct trapframe, tf_ ## r) / sizeof(register_t))
+	static const int slots[] = {
+		_TF_SLOT(rax),		/*  0 */
+		_TF_SLOT(rcx),		/*  1 */
+		_TF_SLOT(rdx),		/*  2 */
+		_TF_SLOT(rbx),		/*  3 */
+		_TF_SLOT(rsp),		/*  4: SIB when mod != 3 */
+		_TF_SLOT(rbp),		/*  5 */
+		_TF_SLOT(rsi),		/*  6 */
+		_TF_SLOT(rdi),		/*  7 */
+		_TF_SLOT(r8),		/*  8: REX extension bit is set */
+		_TF_SLOT(r9),		/*  9 */
+		_TF_SLOT(r10),		/* 10 */
+		_TF_SLOT(r11),		/* 11 */
+		_TF_SLOT(r12),		/* 12 */
+		_TF_SLOT(r13),		/* 13 */
+		_TF_SLOT(r14),		/* 14 */
+		_TF_SLOT(r15),		/* 15 */
+	};
+#undef _TF_SLOT
+
+	if (reg < 0 || reg >= (int)(sizeof(slots) / sizeof(slots[0])))
+		panic("%s: unhandled register index %d", __func__, reg);
+	return (slots[reg]);
+}
+
+/*
+ * Fetch the saved value of register "reg" from the trapframe.  A register
+ * number of -1 stands for "no register" and yields 0.
+ */
+static uint64_t
+kinst_regval(struct trapframe *frame, int reg)
+{
+	register_t *saved;
+
+	if (reg == -1)
+		return (0);
+
+	/* The trapframe is indexed as a flat array of register_t slots. */
+	saved = (register_t *)frame;
+	return (saved[kinst_regoff(reg)]);
+}
+
+/*
+ * Compute the 32-bit displacement which, relative to "dst", refers to the
+ * same location as the probe's displacement does relative to its patchpoint.
+ */
+static uint32_t
+kinst_riprel_disp(struct kinst_probe *kp, void *dst)
+{
+	intptr_t where, delta;
+
+	where = (intptr_t)kp->kp_patchpoint + kp->kp_md.disp;
+	delta = where - (intptr_t)dst;
+	return ((uint32_t)delta);
+}
+
+/*
+ * Write probe "kp"'s instruction template into the trampoline "tramp",
+ * followed by a jump back to the instruction after the probed one.
+ */
+static void
+kinst_trampoline_populate(struct kinst_probe *kp, uint8_t *tramp)
+{
+	/* "jmp *0(%rip)": jump through the eight bytes that follow it. */
+	static const uint8_t jmp_rip[] = { 0xff, 0x25, 0x00, 0x00, 0x00, 0x00 };
+	struct kinst_probe_md *kpmd;
+	uint8_t *cursor, *resume;
+	uint32_t reldisp;
+
+	kpmd = &kp->kp_md;
+
+	memcpy(tramp, kpmd->template, kpmd->tinstlen);
+	if ((kpmd->flags & KINST_F_RIPREL) != 0) {
+		/* Patch in a displacement relative to the trampoline. */
+		reldisp = kinst_riprel_disp(kp, tramp);
+		memcpy(&tramp[kpmd->dispoff], &reldisp, sizeof(uint32_t));
+	}
+
+	/*
+	 * Append the position-independent jump (six bytes) and then the
+	 * absolute address of the instruction following the traced one
+	 * (eight bytes), which is where the jump lands.
+	 */
+	cursor = &tramp[kpmd->tinstlen];
+	memcpy(cursor, jmp_rip, sizeof(jmp_rip));
+	cursor += sizeof(jmp_rip);
+
+	resume = kp->kp_patchpoint + kpmd->instlen;
+	memcpy(cursor, &resume, sizeof(uintptr_t));
+}
+
+/*
+ * Breakpoint hook installed with dtrace_invop_add().  If "addr" is the
+ * patchpoint of a kinst probe, fire the probe and arrange for the original
+ * instruction to take effect: call instructions are emulated by editing the
+ * trapframe, and everything else is executed from the current thread's
+ * trampoline.
+ *
+ * Returns 0 when no kinst probe is installed at "addr", DTRACE_INVOP_CALL for
+ * an emulated call, and DTRACE_INVOP_NOP otherwise.
+ */
+int
+kinst_invop(uintptr_t addr, struct trapframe *frame, uintptr_t scratch)
+{
+	struct kinst_probe_md *kpmd;
+	struct kinst_probe *kp;
+	solaris_cpu_t *cpu;
+	uintptr_t *sp;
+	register_t base;
+	uint8_t *tramp;
+
+	sp = (uintptr_t *)frame->tf_rsp;
+	cpu = &solaris_cpu[curcpu];
+
+	LIST_FOREACH(kp, KINST_GETPROBE(addr), kp_hashnext) {
+		if ((uintptr_t)kp->kp_patchpoint == addr)
+			break;
+	}
+	if (kp == NULL)
+		return (0);
+
+	/* The word at the top of the stack is read with faults suppressed. */
+	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+	cpu->cpu_dtrace_caller = sp[0];
+	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
+	dtrace_probe(kp->kp_id, 0, 0, 0, 0, 0);
+	cpu->cpu_dtrace_caller = 0;
+
+	kpmd = &kp->kp_md;
+	if ((kpmd->flags & KINST_F_CALL) == 0) {
+		/* Not a call: run the instruction out of the trampoline. */
+		tramp = curthread->t_kinst;
+		if (tramp != NULL) {
+			kinst_trampoline_populate(kp, tramp);
+			frame->tf_rip = (register_t)tramp;
+		} else {
+			/*
+			 * A trampoline allocation failed, so this probe is
+			 * effectively disabled.  Restore the original
+			 * instruction.
+			 *
+			 * We can't safely print anything here, but the
+			 * trampoline allocator should have left a breadcrumb in
+			 * the dmesg.
+			 */
+			kinst_patch_tracepoint(kp, kp->kp_savedval);
+			frame->tf_rip = (register_t)kp->kp_patchpoint;
+		}
+		return (DTRACE_INVOP_NOP);
+	}
+
+	/*
+	 * Emulate the call.  dtrace_invop_start() reserves space on the stack
+	 * to store the return address of the call instruction.
+	 */
+	*(uintptr_t *)scratch = (uintptr_t)(kp->kp_patchpoint + kpmd->instlen);
+
+	if ((kpmd->flags & KINST_F_DIRECT_CALL) != 0) {
+		frame->tf_rip = (uintptr_t)(kp->kp_patchpoint +
+		    kpmd->disp + kpmd->instlen);
+		return (DTRACE_INVOP_CALL);
+	}
+
+	if (kpmd->reg1 != -1 || kpmd->reg2 != -1) {
+		/* indirect through base + (index << scale) */
+		base = kinst_regval(frame, kpmd->reg1) +
+		    (kinst_regval(frame, kpmd->reg2) << kpmd->scale);
+	} else {
+		/*
+		 * rip-relative: address of the next instruction, assuming
+		 * tf_rip points just past the one-byte breakpoint.
+		 */
+		base = frame->tf_rip - 1 + kpmd->instlen;
+	}
+
+	if ((kpmd->flags & KINST_F_MOD_DIRECT) == 0) {
+		/* The operand is in memory; load the branch target from it. */
+		frame->tf_rip = *(uintptr_t *)(base + kpmd->disp);
+	} else {
+		frame->tf_rip = base + kpmd->disp;
+	}
+	return (DTRACE_INVOP_CALL);
+}
+
+/*
+ * Store "val" at the probe's patchpoint.  The write is done with interrupts
+ * disabled and with write protection lifted via disable_wp(); both are put
+ * back afterwards, in reverse order.
+ */
+void
+kinst_patch_tracepoint(struct kinst_probe *kp, kinst_patchval_t val)
+{
+	register_t saved_intr;
+	int saved_wp;
+
+	saved_intr = intr_disable();
+	saved_wp = disable_wp();
+
+	*kp->kp_patchpoint = val;
+
+	restore_wp(saved_wp);
+	intr_restore(saved_intr);
+}
+
+/*
+ * Record an 8-bit displacement, sign-extended to 64 bits, in the probe.
+ */
+static void
+kinst_set_disp8(struct kinst_probe *kp, uint8_t byte)
+{
+	int8_t sdisp;
+
+	sdisp = (int8_t)byte;
+	kp->kp_md.disp = (int64_t)sdisp;
+}
+
+/*
+ * Record the 32-bit displacement stored at "bytes", sign-extended to 64 bits,
+ * in the probe.  memcpy() is used because "bytes" need not be aligned.
+ */
+static void
+kinst_set_disp32(struct kinst_probe *kp, uint8_t *bytes)
+{
+	int32_t raw;
+
+	memcpy(&raw, bytes, sizeof(raw));
+	kp->kp_md.disp = (int64_t)raw;
+}
+
+/*
+ * Byte-fetch callback for the disassembler.  "p" points to a cursor
+ * (uint8_t *); return the byte under the cursor and advance it by one.
+ */
+static int
+kinst_dis_get_byte(void *p)
+{
+	uint8_t **cursor;
+	uint8_t byte;
+
+	cursor = p;
+	byte = **cursor;
+	*cursor += 1;
+
+	return (byte);
+}
+
+/*
+ * Set up all of the state needed to faithfully execute a probed instruction.
+ *
+ * In the simple case, we copy the instruction unmodified to a per-thread
+ * trampoline, wherein it is followed by a jump back to the original code.
+ * - Instructions can have %rip as an operand:
+ *   - with %rip-relative addressing encoded in ModR/M, or
+ *   - implicitly as a part of the instruction definition (jmp, call).
+ * - Call instructions (which may be %rip-relative) need to push the correct
+ *   return address onto the stack.
+ *
+ * Call instructions are simple enough to be emulated in software, so we simply
+ * do not use the trampoline mechanism in that case.  kinst_invop() will compute
+ * the branch target using the address info computed here (register operands and
+ * displacement).
+ *
+ * %rip-relative operands encoded using the ModR/M byte always use a 32-bit
+ * displacement; when populating the trampoline the displacement is adjusted to
+ * be relative to the trampoline address.  Trampolines are always allocated
+ * above KERNBASE for this reason.
+ *
+ * For other %rip-relative operands (just jumps) we take the same approach.
+ * Instructions which specify an 8-bit displacement must be rewritten to use a
+ * 32-bit displacement.
+ *
+ * Flags are only ever OR-ed into kp->kp_md.flags here, so the caller must pass
+ * a probe whose machine-dependent state starts out zeroed.
+ *
+ * Returns 0 on success, or EINVAL if the instruction cannot be disassembled or
+ * uses opcode 0xe3, which is not supported.  "*instr" is advanced by the
+ * disassembler's byte-fetch callback as the instruction is consumed.
+ */
+static int
+kinst_instr_dissect(struct kinst_probe *kp, uint8_t **instr)
+{
+	struct kinst_probe_md *kpmd;
+	dis86_t d86;
+	uint8_t *bytes, modrm, rex;
+	int dispoff, i, ilen, opcidx;
+
+	kpmd = &kp->kp_md;
+
+	d86.d86_data = instr;
+	d86.d86_get_byte = kinst_dis_get_byte;
+	d86.d86_check_func = NULL;
+	if (dtrace_disx86(&d86, SIZE64) != 0) {
+		KINST_LOG("failed to disassemble instruction at: %p", *instr);
+		return (EINVAL);
+	}
+	bytes = d86.d86_bytes;
+	kpmd->instlen = kpmd->tinstlen = d86.d86_len;
+
+	/*
+	 * Skip over prefixes, save REX.
+	 */
+	rex = 0;
+	for (i = 0; i < kpmd->instlen; i++) {
+		switch (bytes[i]) {
+		case 0xf0 ... 0xf3:
+			/* group 1 */
+			continue;
+		case 0x26:
+		case 0x2e:
+		case 0x36:
+		case 0x3e:
+		case 0x64:
+		case 0x65:
+			/* group 2 */
+			continue;
+		case 0x66:
+			/* group 3 */
+			continue;
+		case 0x67:
+			/* group 4 */
+			continue;
+		case 0x40 ... 0x4f:
+			/* REX */
+			rex = bytes[i];
+			continue;
+		}
+		break;
+	}
+	KASSERT(i < kpmd->instlen,
+	    ("%s: failed to disassemble instruction at %p", __func__, bytes));
+	opcidx = i;
+
+	/*
+	 * Identify instructions of interest by opcode: calls and jumps.
+	 * Extract displacements.
+	 */
+	dispoff = -1;
+	switch (bytes[opcidx]) {
+	case 0x0f:
+		switch (bytes[opcidx + 1]) {
+		case 0x80 ... 0x8f:
+			/* conditional jmp near */
+			kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
+			dispoff = opcidx + 2;
+			kinst_set_disp32(kp, &bytes[dispoff]);
+			break;
+		}
+		break;
+	case 0xe3:
+		/*
+		 * There is no straightforward way to translate this instruction
+		 * to use a 32-bit displacement.  Fortunately, it is rarely
+		 * used.
+		 */
+		return (EINVAL);
+	case 0x70 ... 0x7f:
+		/* conditional jmp short */
+		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
+		dispoff = opcidx + 1;
+		kinst_set_disp8(kp, bytes[dispoff]);
+		break;
+	case 0xe9:
+		/* unconditional jmp near */
+		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
+		dispoff = opcidx + 1;
+		kinst_set_disp32(kp, &bytes[dispoff]);
+		break;
+	case 0xeb:
+		/* unconditional jmp short */
+		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
+		dispoff = opcidx + 1;
+		kinst_set_disp8(kp, bytes[dispoff]);
+		break;
+	case 0xe8:
+	case 0x9a:
+		/* direct call */
+		kpmd->flags |= KINST_F_CALL | KINST_F_DIRECT_CALL;
+		dispoff = opcidx + 1;
+		kinst_set_disp32(kp, &bytes[dispoff]);
+		break;
+	case 0xff:
+		KASSERT(d86.d86_got_modrm,
+		    ("no ModR/M byte for instr at %p", *instr - kpmd->instlen));
+		switch (KINST_MODRM_REG(bytes[d86.d86_rmindex])) {
+		case 0x02:
+		case 0x03:
+			/* indirect call */
+			kpmd->flags |= KINST_F_CALL;
+			break;
+		case 0x04:
+		case 0x05:
+			/* indirect jump */
+			kpmd->flags |= KINST_F_JMP;
+			break;
+		}
+	}
+
+	/*
+	 * If there's a ModR/M byte, we need to check it to see if the operand
+	 * is %rip-relative, and rewrite the displacement if so.  If not, we
+	 * might still have to extract operand info if this is a call
+	 * instruction.
+	 *
+	 * NOTE(review): in the SIB decoding below, an index field of 4 without
+	 * REX.X architecturally means "no index register", and a base field of
+	 * 5 means "no base, disp32 follows" only when mod == 0 (with mod 1 or
+	 * 2 it selects %rbp/%r13).  Neither case appears to be distinguished
+	 * here; confirm against the address computation in kinst_invop().
+	 */
+	if (d86.d86_got_modrm) {
+		uint8_t mod, rm, sib;
+
+		kpmd->reg1 = kpmd->reg2 = -1;
+
+		modrm = bytes[d86.d86_rmindex];
+		mod = KINST_MODRM_MOD(modrm);
+		rm = KINST_MODRM_RM(modrm);
+		if (mod == 0 && rm == 5) {
+			kpmd->flags |= KINST_F_RIPREL;
+			dispoff = d86.d86_rmindex + 1;
+			kinst_set_disp32(kp, &bytes[dispoff]);
+		} else if ((kpmd->flags & KINST_F_CALL) != 0) {
+			bool havesib;
+
+			havesib = (mod != 3 && rm == 4);
+			dispoff = d86.d86_rmindex + (havesib ? 2 : 1);
+			if (mod == 1)
+				kinst_set_disp8(kp, bytes[dispoff]);
+			else if (mod == 2)
+				kinst_set_disp32(kp, &bytes[dispoff]);
+			else if (mod == 3)
+				kpmd->flags |= KINST_F_MOD_DIRECT;
+
+			if (havesib) {
+				sib = bytes[d86.d86_rmindex + 1];
+				if (KINST_SIB_BASE(sib) != 5) {
+					kpmd->reg1 = KINST_SIB_BASE(sib) |
+					    (KINST_REX_B(rex) << 3);
+				}
+				kpmd->scale = KINST_SIB_SCALE(sib);
+				kpmd->reg2 = KINST_SIB_INDEX(sib) |
+				    (KINST_REX_X(rex) << 3);
+			} else {
+				kpmd->reg1 = rm | (KINST_REX_B(rex) << 3);
+			}
+		}
+	}
+
+	/*
+	 * Calls are emulated in software; once operands are decoded we have
+	 * nothing else to do.
+	 */
+	if ((kpmd->flags & KINST_F_CALL) != 0)
+		return (0);
+
+	/*
+	 * Allocate and populate an instruction trampoline template.
+	 *
+	 * Position-independent instructions can simply be copied, but
+	 * position-dependent instructions require some surgery: jump
+	 * instructions with an 8-bit displacement need to be converted to use a
+	 * 32-bit displacement, and the adjusted displacement needs to be
+	 * computed.
+	 *
+	 * In every position-dependent case the displacement bytes of the
+	 * template are left unset here; kinst_trampoline_populate() fills them
+	 * in at kpmd->dispoff.
+	 */
+	ilen = kpmd->instlen;
+	if ((kpmd->flags & KINST_F_RIPREL) != 0) {
+		if ((kpmd->flags & KINST_F_JMP) == 0 ||
+		    bytes[opcidx] == 0x0f ||
+		    bytes[opcidx] == 0xe9 ||
+		    bytes[opcidx] == 0xff) {
+			memcpy(kpmd->template, bytes, dispoff);
+			memcpy(&kpmd->template[dispoff + 4],
+			    &bytes[dispoff + 4], ilen - (dispoff + 4));
+			kpmd->dispoff = dispoff;
+		} else if (bytes[opcidx] == 0xeb) {
+			memcpy(kpmd->template, bytes, opcidx);
+			kpmd->template[opcidx] = 0xe9;
+			kpmd->dispoff = opcidx + 1;
+
+			/* Instruction length changes from 2 to 5. */
+			kpmd->tinstlen = 5;
+			kpmd->disp -= 3;
+		} else if (bytes[opcidx] >= 0x70 && bytes[opcidx] <= 0x7f)  {
+			memcpy(kpmd->template, bytes, opcidx);
+			kpmd->template[opcidx] = 0x0f;
+			kpmd->template[opcidx + 1] = bytes[opcidx] + 0x10;
+			kpmd->dispoff = opcidx + 2;
+
+			/* Instruction length changes from 2 to 6. */
+			kpmd->tinstlen = 6;
+			kpmd->disp -= 4;
+		} else {
+			panic("unhandled opcode %#x", bytes[opcidx]);
+		}
+	} else {
+		memcpy(kpmd->template, bytes, ilen);
+	}
+
+	return (0);
+}
+
+/*
+ * Callback passed to linker_file_function_listall(): create kinst probes for
+ * the function described by "symval" if its name matches the request in
+ * "opaque" (a dtrace_kinst_probedesc_t).  A probe is created for the
+ * instruction at offset kpd_off, or for every instruction in the function
+ * when kpd_off is -1.
+ *
+ * Returns 0 on success or when the function is skipped, ENOMEM if more than
+ * KINST_PROBETAB_MAX probes would be created by this call, or the error
+ * reported by kinst_instr_dissect().
+ */
+int
+kinst_make_probe(linker_file_t lf, int symindx, linker_symval_t *symval,
+    void *opaque)
+{
+	struct kinst_probe *kp;
+	dtrace_kinst_probedesc_t *pd;
+	const char *func;
+	int error, n, off;
+	uint8_t *instr, *limit;
+
+	pd = opaque;
+	func = symval->name;
+	/* trap_check() is explicitly excluded from instrumentation. */
+	if (strcmp(func, pd->kpd_func) != 0 || strcmp(func, "trap_check") == 0)
+		return (0);
+
+	instr = (uint8_t *)symval->value;
+	limit = (uint8_t *)symval->value + symval->size;
+	if (instr >= limit)
+		return (0);
+
+	/*
+	 * Ignore functions not beginning with the usual function prologue.
+	 * These might correspond to assembly routines with which we should not
+	 * meddle.
+	 */
+	if (*instr != KINST_PUSHL_RBP)
+		return (0);
+
+	n = 0;
+	while (instr < limit) {
+		off = (int)(instr - (uint8_t *)symval->value);
+		if (pd->kpd_off != -1 && off != pd->kpd_off) {
+			/* Not the requested instruction; step over it. */
+			instr += dtrace_instr_size(instr);
+			continue;
+		}
+
+		/*
+		 * Prevent separate dtrace(1) instances from creating copies of
+		 * the same probe.
+		 */
+		LIST_FOREACH(kp, KINST_GETPROBE(instr), kp_hashnext) {
+			if (strcmp(kp->kp_func, func) == 0 &&
+			    strtol(kp->kp_name, NULL, 10) == off)
+				return (0);
+		}
+		if (++n > KINST_PROBETAB_MAX) {
+			KINST_LOG("probe list full: %d entries", n);
+			return (ENOMEM);
+		}
+		kp = malloc(sizeof(struct kinst_probe), M_KINST,
+		    M_WAITOK | M_ZERO);
+		kp->kp_func = func;
+		snprintf(kp->kp_name, sizeof(kp->kp_name), "%d", off);
+		kp->kp_savedval = *instr;
+		kp->kp_patchval = KINST_PATCHVAL;
+		kp->kp_patchpoint = instr;
+
+		/* On success this also advances "instr" past the instruction. */
+		error = kinst_instr_dissect(kp, &instr);
+		if (error != 0) {
+			/*
+			 * The probe has not been registered or hashed yet, so
+			 * nothing else references it; release it rather than
+			 * leaking it.
+			 */
+			free(kp, M_KINST);
+			return (error);
+		}
+
+		kinst_probe_create(kp, lf);
+	}
+
+	return (0);
+}
diff --git a/sys/cddl/dev/kinst/amd64/kinst_isa.h b/sys/cddl/dev/kinst/amd64/kinst_isa.h
new file mode 100644
index 000000000000..4c6387b8cb50
--- /dev/null
+++ b/sys/cddl/dev/kinst/amd64/kinst_isa.h
@@ -0,0 +1,45 @@
+/*
+ * SPDX-License-Identifier: CDDL 1.0
+ *
+ * Copyright 2022 Christos Margiolis <christos@FreeBSD.org>
+ * Copyright 2022 Mark Johnston <markj@FreeBSD.org>
+ */
+
+#ifndef _KINST_ISA_H_
+#define _KINST_ISA_H_
+
+#include <sys/types.h>
+
+#define KINST_PATCHVAL		0xcc
+
+/*
+ * Each trampoline is 32 bytes long and contains [instruction, jmp]. Since we
+ * have 2 instructions stored in the trampoline, and each of them can take up
+ * to 16 bytes, 32 bytes is enough to cover even the worst case scenario.
+ */
+#define	KINST_TRAMP_SIZE	32
+#define	KINST_TRAMPCHUNK_SIZE	PAGE_SIZE
+
+/*
+ * Fill the trampolines with breakpoint instructions so that the kernel will
+ * crash cleanly if things somehow go wrong.
+ */
+#define	KINST_TRAMP_INIT(t, s)	memset((t), KINST_PATCHVAL, (s))
+
+typedef uint8_t kinst_patchval_t;
+
+struct kinst_probe_md {
+	int			flags;		/* KINST_F_* flags */
+	int			instlen;	/* original instr len */
+	int			tinstlen;	/* trampoline instr len */
+	uint8_t			template[16];	/* copied into thread tramps */
+	int			dispoff;	/* offset of rip displacement */
+
+	/* operands to "call" instruction branch target */
+	int			reg1;		/* base register, -1 if none */
+	int			reg2;		/* index register, -1 if none */
+	int			scale;		/* left shift applied to reg2 */
+	int64_t			disp;		/* sign-extended displacement */
+};
+
+#endif /* _KINST_ISA_H_ */
diff --git a/sys/cddl/dev/kinst/kinst.c b/sys/cddl/dev/kinst/kinst.c
new file mode 100644
index 000000000000..a7d04e927fa7
--- /dev/null
+++ b/sys/cddl/dev/kinst/kinst.c
@@ -0,0 +1,233 @@
+/*
+ * SPDX-License-Identifier: CDDL 1.0
+ *
+ * Copyright 2022 Christos Margiolis <christos@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/linker.h>
+#include <sys/module.h>
+
+#include <sys/dtrace.h>
+
+#include "kinst.h"
+
+MALLOC_DEFINE(M_KINST, "kinst", "Kernel Instruction Tracing");
+
+static d_open_t		kinst_open;
+static d_close_t	kinst_close;
+static d_ioctl_t	kinst_ioctl;
+
+static void	kinst_provide_module(void *, modctl_t *);
+static void	kinst_getargdesc(void *, dtrace_id_t, void *,
+		    dtrace_argdesc_t *);
+static void	kinst_destroy(void *, dtrace_id_t, void *);
+static void	kinst_enable(void *, dtrace_id_t, void *);
+static void	kinst_disable(void *, dtrace_id_t, void *);
+static int	kinst_load(void *);
+static int	kinst_unload(void *);
+static int	kinst_modevent(module_t, int, void *);
+
+static dtrace_pattr_t kinst_attr = {
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
+};
+
+static dtrace_pops_t kinst_pops = {
+	.dtps_provide		= NULL,
+	.dtps_provide_module	= kinst_provide_module,
+	.dtps_enable		= kinst_enable,
+	.dtps_disable		= kinst_disable,
+	.dtps_suspend		= NULL,
+	.dtps_resume		= NULL,
+	.dtps_getargdesc	= kinst_getargdesc,
+	.dtps_getargval		= NULL,
+	.dtps_usermode		= NULL,
+	.dtps_destroy		= kinst_destroy
+};
+
+static struct cdevsw kinst_cdevsw = {
+	.d_name			= "kinst",
+	.d_version		= D_VERSION,
+	.d_flags		= D_TRACKCLOSE,
+	.d_open			= kinst_open,
+	.d_close		= kinst_close,
+	.d_ioctl		= kinst_ioctl,
+};
+
+static dtrace_provider_id_t	kinst_id;
+struct kinst_probe_list	*kinst_probetab;
+static struct cdev	*kinst_cdev;
+
+/*
+ * Register "kp" with DTrace as a probe named <file>:<function>:<offset> and
+ * add it to the hash bucket for its patchpoint so kinst_invop() can find it.
+ */
+void
+kinst_probe_create(struct kinst_probe *kp, linker_file_t lf)
+{
+	struct kinst_probe_list *bucket;
+	dtrace_id_t id;
+
+	id = dtrace_probe_create(kinst_id, lf->filename, kp->kp_func,
+	    kp->kp_name, 3, kp);
+	kp->kp_id = id;
+
+	bucket = KINST_GETPROBE(kp->kp_patchpoint);
+	LIST_INSERT_HEAD(bucket, kp, kp_hashnext);
+}
+
+/*
+ * Open handler for the kinst device.  Nothing is set up per open; it always
+ * succeeds.
+ */
+static int
+kinst_open(struct cdev *dev __unused, int oflags __unused, int devtype __unused,
+    struct thread *td __unused)
+{
+	return (0);
+}
+
+/*
+ * Close handler for the kinst device.  Calls dtrace_condense() on the kinst
+ * provider -- presumably so that probes no longer in use are destroyed; confirm
+ * against the DTrace framework.  Always returns 0.
+ */
+static int
+kinst_close(struct cdev *dev __unused, int fflag __unused, int devtype __unused,
+    struct thread *td __unused)
+{
+	dtrace_condense(kinst_id);
+	return (0);
+}
+
+/*
+ * linker_file_foreach() callback: if the linker file "lf" matches the module
+ * requested in "arg" (a dtrace_kinst_probedesc_t), try to create probes in
+ * it.  An empty module name matches every file.
+ */
+static int
+kinst_linker_file_cb(linker_file_t lf, void *arg)
+{
+	dtrace_kinst_probedesc_t *pd;
+
+	pd = arg;
+	if (pd->kpd_mod[0] == '\0' || strcmp(pd->kpd_mod, lf->filename) == 0) {
+		/*
+		 * Invoke kinst_make_probe() once for each function symbol in
+		 * the module "lf".
+		 */
+		return (linker_file_function_listall(lf, kinst_make_probe,
+		    arg));
+	}
+	return (0);
+}
+
+/*
+ * ioctl handler for the kinst device.  KINSTIOC_MAKEPROBE creates the probes
+ * described by the dtrace_kinst_probedesc_t at "addr"; every other command
+ * fails with ENOTTY.
+ */
+static int
+kinst_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t addr,
+    int flags __unused, struct thread *td __unused)
+{
+	dtrace_kinst_probedesc_t *pd;
+
+	if (cmd != KINSTIOC_MAKEPROBE)
+		return (ENOTTY);
+
+	/* Force NUL termination of the names before they are compared. */
+	pd = (dtrace_kinst_probedesc_t *)addr;
+	pd->kpd_func[sizeof(pd->kpd_func) - 1] = '\0';
+	pd->kpd_mod[sizeof(pd->kpd_mod) - 1] = '\0';
+
+	/* Loop over all functions in the kernel and loaded modules. */
+	return (linker_file_foreach(kinst_linker_file_cb, pd));
+}
+
+/*
+ * dtps_provide_module hook.  Intentionally empty: kinst probes are created
+ * through the KINSTIOC_MAKEPROBE ioctl instead.
+ */
+static void
+kinst_provide_module(void *arg, modctl_t *lf)
+{
+}
+
+/*
+ * dtps_getargdesc hook.  Reports DTRACE_ARGNONE for every argument index.
+ */
+static void
+kinst_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc)
+{
+	desc->dtargd_ndx = DTRACE_ARGNONE;
+}
+
+/*
+ * dtps_destroy hook: unlink the probe "parg" from its hash bucket and free
+ * it.
+ */
+static void
+kinst_destroy(void *arg, dtrace_id_t id, void *parg)
+{
+	struct kinst_probe *kp;
+
+	kp = parg;
+	LIST_REMOVE(kp, kp_hashnext);
+	free(kp, M_KINST);
+}
+
+/*
+ * dtps_enable hook: write the probe's breakpoint byte over the first byte of
+ * the target instruction.
+ */
+static void
+kinst_enable(void *arg, dtrace_id_t id, void *parg)
+{
+	struct kinst_probe *kp;
+
+	kp = parg;
+	kinst_patch_tracepoint(kp, kp->kp_patchval);
+}
+
+/*
+ * dtps_disable hook: put back the byte that was saved from the patchpoint
+ * when the probe was created.
+ */
+static void
+kinst_disable(void *arg, dtrace_id_t id, void *parg)
+{
+	struct kinst_probe *kp;
+
+	kp = parg;
+	kinst_patch_tracepoint(kp, kp->kp_savedval);
+}
+
+/*
+ * Provider initialization, run via SYSINIT: set up the trampoline allocator,
+ * register with DTrace, allocate the probe hash table, create the
+ * /dev/dtrace/kinst device and install the breakpoint hook.
+ *
+ * Returns 0 on success, or the error from trampoline setup or provider
+ * registration (undoing the trampoline setup in the latter case).
+ */
+static int
+kinst_load(void *dummy)
+{
+	int bucket, error;
+
+	error = kinst_trampoline_init();
+	if (error == 0) {
+		error = dtrace_register("kinst", &kinst_attr, DTRACE_PRIV_USER,
+		    NULL, &kinst_pops, NULL, &kinst_id);
+		if (error != 0)
+			kinst_trampoline_deinit();
+	}
+	if (error != 0)
+		return (error);
+
+	kinst_probetab = malloc(KINST_PROBETAB_MAX *
+	    sizeof(struct kinst_probe_list), M_KINST, M_WAITOK | M_ZERO);
+	for (bucket = 0; bucket < KINST_PROBETAB_MAX; bucket++)
+		LIST_INIT(&kinst_probetab[bucket]);
+
+	kinst_cdev = make_dev(&kinst_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+	    "dtrace/kinst");
+	dtrace_invop_add(kinst_invop);
+
+	return (0);
+}
+
+/*
+ * Provider teardown, run via SYSUNINIT: release the trampolines, remove the
+ * breakpoint hook and the device node, unregister from DTrace and free the
+ * probe hash table.
+ *
+ * Returns the result of dtrace_unregister().
+ */
+static int
+kinst_unload(void *dummy)
+{
+	int error;
+
+	kinst_trampoline_deinit();
+	dtrace_invop_remove(kinst_invop);
+	destroy_dev(kinst_cdev);
+
+	/*
+	 * Unregister before freeing the hash table.  Unregistering is expected
+	 * to call kinst_destroy() for each remaining probe, and LIST_REMOVE()
+	 * there writes through a back-pointer that may point at a bucket head
+	 * inside kinst_probetab; freeing the table first would make that a
+	 * use-after-free.
+	 */
+	error = dtrace_unregister(kinst_id);
+	if (error == 0) {
+		/*
+		 * Only release the table once no probe can reference it any
+		 * more; if unregistering failed, probes may still be linked
+		 * into it.
+		 */
+		free(kinst_probetab, M_KINST);
+	}
+
+	return (error);
+}
+
+/*
+ * Module event handler.  Logs a warning on load, accepts unload and shutdown
+ * without doing anything, and rejects every other event with EOPNOTSUPP.
+ * The actual setup and teardown are done by the SYSINIT/SYSUNINIT hooks.
+ */
+static int
+kinst_modevent(module_t mod __unused, int type, void *data __unused)
+{
+	if (type == MOD_LOAD) {
+		KINST_LOG(
+		    "kinst: This provider is experimental, exercise caution");
+		return (0);
+	}
+	if (type == MOD_UNLOAD || type == MOD_SHUTDOWN)
+		return (0);
+
+	return (EOPNOTSUPP);
+}
+
+SYSINIT(kinst_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, kinst_load, NULL);
+SYSUNINIT(kinst_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, kinst_unload,
+    NULL);
+
+DEV_MODULE(kinst, kinst_modevent, NULL);
+MODULE_VERSION(kinst, 1);
+MODULE_DEPEND(kinst, dtrace, 1, 1, 1);
+MODULE_DEPEND(kinst, opensolaris, 1, 1, 1);
diff --git a/sys/cddl/dev/kinst/kinst.h b/sys/cddl/dev/kinst/kinst.h
new file mode 100644
index 000000000000..ea1a5b50004f
--- /dev/null
+++ b/sys/cddl/dev/kinst/kinst.h
@@ -0,0 +1,71 @@
+/*
+ * SPDX-License-Identifier: CDDL 1.0
+ *
+ * Copyright 2022 Christos Margiolis <christos@FreeBSD.org>
+ */
+
+#ifndef _KINST_H_
+#define _KINST_H_
+
+#include <sys/dtrace.h>
+
+typedef struct {
+	char	kpd_func[DTRACE_FUNCNAMELEN];	/* function name to match */
+	char	kpd_mod[DTRACE_MODNAMELEN];	/* module name; "" matches any */
+	int	kpd_off;	/* instruction offset, or -1 for all of them */
+} dtrace_kinst_probedesc_t;
+
+#define KINSTIOC_MAKEPROBE	_IOW('k', 1, dtrace_kinst_probedesc_t)
+
+#ifdef _KERNEL
+
+#include <sys/queue.h>
+
+#include "kinst_isa.h"
+
+struct kinst_probe {
+	LIST_ENTRY(kinst_probe)	kp_hashnext;	/* hash bucket linkage */
+	const char		*kp_func;	/* probed function's name */
+	char			kp_name[16];	/* offset, in decimal */
+	dtrace_id_t		kp_id;		/* DTrace probe ID */
+	kinst_patchval_t	kp_patchval;	/* breakpoint byte */
+	kinst_patchval_t	kp_savedval;	/* original first byte */
+	kinst_patchval_t	*kp_patchpoint;	/* probed instruction */
+
+	struct kinst_probe_md	kp_md;		/* machine-dependent state */
+};
+
+LIST_HEAD(kinst_probe_list, kinst_probe);
+
+extern struct kinst_probe_list	*kinst_probetab;
+
+#define KINST_PROBETAB_MAX	0x8000	/* 32k */
+#define KINST_ADDR2NDX(addr)	(((uintptr_t)(addr)) & (KINST_PROBETAB_MAX - 1))
+#define KINST_GETPROBE(i) 	(&kinst_probetab[KINST_ADDR2NDX(i)])
+
+struct linker_file;
+struct linker_symval;
+
+int	kinst_invop(uintptr_t, struct trapframe *, uintptr_t);
+int	kinst_make_probe(struct linker_file *, int, struct linker_symval *,
+	    void *);
+void	kinst_patch_tracepoint(struct kinst_probe *, kinst_patchval_t);
+void	kinst_probe_create(struct kinst_probe *, struct linker_file *);
+
+int	kinst_trampoline_init(void);
+int	kinst_trampoline_deinit(void);
+uint8_t	*kinst_trampoline_alloc(int);
+void	kinst_trampoline_dealloc(uint8_t *);
+
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_KINST);
*** 354 LINES SKIPPED ***