svn commit: r210079 - in stable/8/sys: amd64/amd64 amd64/include i386/i386 i386/include

John Baldwin jhb at FreeBSD.org
Wed Jul 14 21:10:15 UTC 2010


Author: jhb
Date: Wed Jul 14 21:10:14 2010
New Revision: 210079
URL: http://svn.freebsd.org/changeset/base/210079

Log:
  MFC 208507,208556,208621:
  Add support for corrected machine check interrupts.  CMCI is a new local
  APIC interrupt that fires when a threshold of corrected machine check
  events is reached.  CMCI also includes a count of events when reporting
  corrected errors in the bank's status register.  Note that individual
  banks may or may not support CMCI.  If they do, each bank includes its own
  threshold register that determines when the interrupt fires.  Currently
  the code uses a very simple strategy where it doubles the threshold on
  each interrupt until it succeeds in throttling the interrupt to occur
  only once a minute (this interval can be tuned via sysctl).  The threshold
  is also adjusted on each hourly poll which will lower the threshold once
  events stop occurring.

Modified:
  stable/8/sys/amd64/amd64/apic_vector.S
  stable/8/sys/amd64/amd64/local_apic.c
  stable/8/sys/amd64/amd64/machdep.c
  stable/8/sys/amd64/amd64/mca.c
  stable/8/sys/amd64/include/apicreg.h
  stable/8/sys/amd64/include/apicvar.h
  stable/8/sys/amd64/include/mca.h
  stable/8/sys/amd64/include/pcpu.h
  stable/8/sys/amd64/include/specialreg.h
  stable/8/sys/i386/i386/apic_vector.s
  stable/8/sys/i386/i386/local_apic.c
  stable/8/sys/i386/i386/machdep.c
  stable/8/sys/i386/i386/mca.c
  stable/8/sys/i386/include/apicreg.h
  stable/8/sys/i386/include/apicvar.h
  stable/8/sys/i386/include/mca.h
  stable/8/sys/i386/include/pcpu.h
  stable/8/sys/i386/include/specialreg.h
Directory Properties:
  stable/8/sys/   (props changed)
  stable/8/sys/amd64/include/xen/   (props changed)
  stable/8/sys/cddl/contrib/opensolaris/   (props changed)
  stable/8/sys/contrib/dev/acpica/   (props changed)
  stable/8/sys/contrib/pf/   (props changed)
  stable/8/sys/dev/xen/xenpci/   (props changed)

Modified: stable/8/sys/amd64/amd64/apic_vector.S
==============================================================================
--- stable/8/sys/amd64/amd64/apic_vector.S	Wed Jul 14 20:55:45 2010	(r210078)
+++ stable/8/sys/amd64/amd64/apic_vector.S	Wed Jul 14 21:10:14 2010	(r210079)
@@ -105,6 +105,18 @@ IDTVEC(timerint)
 	jmp	doreti
 
 /*
+ * Local APIC CMCI handler.
+ */
+	.text
+	SUPERALIGN_TEXT
+IDTVEC(cmcint)
+	PUSH_FRAME
+	FAKE_MCOUNT(TF_RIP(%rsp))
+	call	lapic_handle_cmc
+	MEXITCOUNT
+	jmp	doreti
+
+/*
  * Local APIC error interrupt handler.
  */
 	.text

Modified: stable/8/sys/amd64/amd64/local_apic.c
==============================================================================
--- stable/8/sys/amd64/amd64/local_apic.c	Wed Jul 14 20:55:45 2010	(r210078)
+++ stable/8/sys/amd64/amd64/local_apic.c	Wed Jul 14 21:10:14 2010	(r210079)
@@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$");
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <machine/apicvar.h>
+#include <machine/mca.h>
 #include <machine/md_var.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
@@ -123,6 +124,7 @@ static struct lvt lvts[LVT_MAX + 1] = {
 	{ 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT },	/* Error */
 	{ 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 },	/* PMC */
 	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT },	/* Thermal */
+	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT },	/* CMCI */
 };
 
 static inthand_t *ioint_handlers[] = {
@@ -227,6 +229,9 @@ lapic_init(vm_paddr_t addr)
 	setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_SYSIGT, SEL_KPL, 0);
 
 	/* XXX: Thermal interrupt */
+
+	/* Local APIC CMCI. */
+	setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_SYSIGT, SEL_KPL, 0);
 }
 
 /*
@@ -252,7 +257,7 @@ lapic_create(u_int apic_id, int boot_cpu
 	 */
 	lapics[apic_id].la_present = 1;
 	lapics[apic_id].la_id = apic_id;
-	for (i = 0; i < LVT_MAX; i++) {
+	for (i = 0; i <= LVT_MAX; i++) {
 		lapics[apic_id].la_lvts[i] = lvts[i];
 		lapics[apic_id].la_lvts[i].lvt_active = 0;
 	}
@@ -282,6 +287,7 @@ lapic_dump(const char* str)
 	printf("  timer: 0x%08x therm: 0x%08x err: 0x%08x pmc: 0x%08x\n",
 	    lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error,
 	    lapic->lvt_pcint);
+	printf("   cmci: 0x%08x\n", lapic->lvt_cmci);
 }
 
 void
@@ -333,6 +339,10 @@ lapic_setup(int boot)
 
 	/* XXX: Thermal LVT */
 
+	/* Program the CMCI LVT entry if present. */
+	if (maxlvt >= LVT_CMCI)
+		lapic->lvt_cmci = lvt_mode(la, LVT_CMCI, lapic->lvt_cmci);
+	    
 	intr_restore(eflags);
 }
 
@@ -857,6 +867,34 @@ lapic_timer_enable_intr(void)
 }
 
 void
+lapic_handle_cmc(void)
+{
+
+	lapic_eoi();
+	cmc_intr();
+}
+
+/*
+ * Called from the mca_init() to activate the CMC interrupt if this CPU is
+ * responsible for monitoring any MC banks for CMC events.  Since mca_init()
+ * is called prior to lapic_setup() during boot, this just needs to unmask
+ * this CPU's LVT_CMCI entry.
+ */
+void
+lapic_enable_cmc(void)
+{
+	u_int apic_id;
+
+	apic_id = PCPU_GET(apic_id);
+	KASSERT(lapics[apic_id].la_present,
+	    ("%s: missing APIC %u", __func__, apic_id));
+	lapics[apic_id].la_lvts[LVT_CMCI].lvt_masked = 0;
+	lapics[apic_id].la_lvts[LVT_CMCI].lvt_active = 1;
+	if (bootverbose)
+		printf("lapic%u: CMCI unmasked\n", apic_id);
+}
+
+void
 lapic_handle_error(void)
 {
 	u_int32_t esr;

Modified: stable/8/sys/amd64/amd64/machdep.c
==============================================================================
--- stable/8/sys/amd64/amd64/machdep.c	Wed Jul 14 20:55:45 2010	(r210078)
+++ stable/8/sys/amd64/amd64/machdep.c	Wed Jul 14 21:10:14 2010	(r210079)
@@ -283,7 +283,6 @@ cpu_startup(dummy)
 	vm_pager_bufferinit();
 
 	cpu_setregs();
-	mca_init();
 }
 
 /*

Modified: stable/8/sys/amd64/amd64/mca.c
==============================================================================
--- stable/8/sys/amd64/amd64/mca.c	Wed Jul 14 20:55:45 2010	(r210078)
+++ stable/8/sys/amd64/amd64/mca.c	Wed Jul 14 21:10:14 2010	(r210079)
@@ -33,6 +33,8 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
@@ -43,11 +45,29 @@ __FBSDID("$FreeBSD$");
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
 #include <machine/cputypes.h>
 #include <machine/mca.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 
+/* Modes for mca_scan() */
+enum scan_mode {
+	POLLED,
+	MCE,
+	CMCI,
+};
+
+/*
+ * State maintained for each monitored MCx bank to control the
+ * corrected machine check interrupt threshold.
+ */
+struct cmc_state {
+	int	max_threshold;
+	int	last_intr;
+};
+
 struct mca_internal {
 	struct mca_record rec;
 	int		logged;
@@ -79,19 +99,22 @@ static struct callout mca_timer;
 static int mca_ticks = 3600;	/* Check hourly by default. */
 static struct task mca_task;
 static struct mtx mca_lock;
+static struct cmc_state **cmc_state;	/* Indexed by cpuid, bank */
+static int cmc_banks;
+static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */
 
 static int
-sysctl_mca_ticks(SYSCTL_HANDLER_ARGS)
+sysctl_positive_int(SYSCTL_HANDLER_ARGS)
 {
 	int error, value;
 
-	value = mca_ticks;
+	value = *(int *)arg1;
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if (error || req->newptr == NULL)
 		return (error);
 	if (value <= 0)
 		return (EINVAL);
-	mca_ticks = value;
+	*(int *)arg1 = value;
 	return (0);
 }
 
@@ -401,31 +424,112 @@ mca_record_entry(const struct mca_record
 }
 
 /*
+ * Update the interrupt threshold for a CMCI.  The strategy is to use
+ * a low trigger that interrupts as soon as the first event occurs.
+ * However, if a steady stream of events arrive, the threshold is
+ * increased until the interrupts are throttled to once every
+ * cmc_throttle seconds or the periodic scan.  If a periodic scan
+ * finds that the threshold is too high, it is lowered.
+ */
+static void
+cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
+{
+	struct cmc_state *cc;
+	uint64_t ctl;
+	u_int delta;
+	int count, limit;
+
+	/* Fetch the current limit for this bank. */
+	cc = &cmc_state[PCPU_GET(cpuid)][bank];
+	ctl = rdmsr(MSR_MC_CTL2(bank));
+	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+	delta = (u_int)(ticks - cc->last_intr);
+
+	/*
+	 * If an interrupt was received less than cmc_throttle seconds
+	 * since the previous interrupt and the count from the current
+	 * event is greater than or equal to the current threshold,
+	 * double the threshold up to the max.
+	 */
+	if (mode == CMCI && valid) {
+		limit = ctl & MC_CTL2_THRESHOLD;
+		if (delta < cmc_throttle && count >= limit &&
+		    limit < cc->max_threshold) {
+			limit = min(limit << 1, cc->max_threshold);
+			ctl &= ~MC_CTL2_THRESHOLD;
+			ctl |= limit;
+			wrmsr(MSR_MC_CTL2(bank), limit);
+		}
+		cc->last_intr = ticks;
+		return;
+	}
+
+	/*
+	 * When the banks are polled, check to see if the threshold
+	 * should be lowered.
+	 */
+	if (mode != POLLED)
+		return;
+
+	/* If a CMCI occured recently, do nothing for now. */
+	if (delta < cmc_throttle)
+		return;
+
+	/*
+	 * Compute a new limit based on the average rate of events per
+	 * cmc_throttle seconds since the last interrupt.
+	 */
+	if (valid) {
+		count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+		limit = count * cmc_throttle / delta;
+		if (limit <= 0)
+			limit = 1;
+		else if (limit > cc->max_threshold)
+			limit = cc->max_threshold;
+	} else
+		limit = 1;
+	if ((ctl & MC_CTL2_THRESHOLD) != limit) {
+		ctl &= ~MC_CTL2_THRESHOLD;
+		ctl |= limit;
+		wrmsr(MSR_MC_CTL2(bank), limit);
+	}
+}
+
+/*
  * This scans all the machine check banks of the current CPU to see if
  * there are any machine checks.  Any non-recoverable errors are
  * reported immediately via mca_log().  The current thread must be
- * pinned when this is called.  The 'mcip' parameter indicates if we
- * are being called from the MC exception handler.  In that case this
- * function returns true if the system is restartable.  Otherwise, it
- * returns a count of the number of valid MC records found.
+ * pinned when this is called.  The 'mode' parameter indicates if we
+ * are being called from the MC exception handler, the CMCI handler,
+ * or the periodic poller.  In the MC exception case this function
+ * returns true if the system is restartable.  Otherwise, it returns a
+ * count of the number of valid MC records found.
  */
 static int
-mca_scan(int mcip)
+mca_scan(enum scan_mode mode)
 {
 	struct mca_record rec;
 	uint64_t mcg_cap, ucmask;
-	int count, i, recoverable;
+	int count, i, recoverable, valid;
 
 	count = 0;
 	recoverable = 1;
 	ucmask = MC_STATUS_UC | MC_STATUS_PCC;
 
 	/* When handling a MCE#, treat the OVER flag as non-restartable. */
-	if (mcip)
+	if (mode == MCE)
 		ucmask |= MC_STATUS_OVER;
 	mcg_cap = rdmsr(MSR_MCG_CAP);
 	for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
-		if (mca_check_status(i, &rec)) {
+		/*
+		 * For a CMCI, only check banks this CPU is
+		 * responsible for.
+		 */
+		if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
+			continue;
+
+		valid = mca_check_status(i, &rec);
+		if (valid) {
 			count++;
 			if (rec.mr_status & ucmask) {
 				recoverable = 0;
@@ -433,8 +537,15 @@ mca_scan(int mcip)
 			}
 			mca_record_entry(&rec);
 		}
+	
+		/*
+		 * If this is a bank this CPU monitors via CMCI,
+		 * update the threshold.
+		 */
+		if (PCPU_GET(cmci_mask) & (1 << i))
+			cmci_update(mode, i, valid, &rec);
 	}
-	return (mcip ? recoverable : count);
+	return (mode == MCE ? recoverable : count);
 }
 
 /*
@@ -457,7 +568,7 @@ mca_scan_cpus(void *context, int pending
 			continue;
 		sched_bind(td, cpu);
 		thread_unlock(td);
-		count += mca_scan(0);
+		count += mca_scan(POLLED);
 		thread_lock(td);
 		sched_unbind(td);
 	}
@@ -511,7 +622,24 @@ mca_startup(void *dummy)
 SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
 
 static void
-mca_setup(void)
+cmci_setup(uint64_t mcg_cap)
+{
+	int i;
+
+	cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state **),
+	    M_MCA, M_WAITOK);
+	cmc_banks = mcg_cap & MCG_CAP_COUNT;
+	for (i = 0; i <= mp_maxid; i++)
+		cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks,
+		    M_MCA, M_WAITOK | M_ZERO);
+	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
+	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+	    &cmc_throttle, 0, sysctl_positive_int, "I",
+	    "Interval in seconds to throttle corrected MC interrupts");
+}
+
+static void
+mca_setup(uint64_t mcg_cap)
 {
 
 	mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
@@ -522,13 +650,62 @@ mca_setup(void)
 	    "count", CTLFLAG_RD, &mca_count, 0, "Record count");
 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 	    "interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks,
-	    0, sysctl_mca_ticks, "I",
+	    0, sysctl_positive_int, "I",
 	    "Periodic interval in seconds to scan for machine checks");
 	SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 	    "records", CTLFLAG_RD, sysctl_mca_records, "Machine check records");
 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 	    "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
 	    sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
+	if (mcg_cap & MCG_CAP_CMCI_P)
+		cmci_setup(mcg_cap);
+}
+
+/*
+ * See if we should monitor CMCI for this bank.  If CMCI_EN is already
+ * set in MC_CTL2, then another CPU is responsible for this bank, so
+ * ignore it.  If CMCI_EN returns zero after being set, then this bank
+ * does not support CMCI_EN.  If this CPU sets CMCI_EN, then it should
+ * now monitor this bank.
+ */
+static void
+cmci_monitor(int i)
+{
+	struct cmc_state *cc;
+	uint64_t ctl;
+
+	KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
+
+	ctl = rdmsr(MSR_MC_CTL2(i));
+	if (ctl & MC_CTL2_CMCI_EN)
+		/* Already monitored by another CPU. */
+		return;
+
+	/* Set the threshold to one event for now. */
+	ctl &= ~MC_CTL2_THRESHOLD;
+	ctl |= MC_CTL2_CMCI_EN | 1;
+	wrmsr(MSR_MC_CTL2(i), ctl);
+	ctl = rdmsr(MSR_MC_CTL2(i));
+	if (!(ctl & MC_CTL2_CMCI_EN))
+		/* This bank does not support CMCI. */
+		return;
+
+	cc = &cmc_state[PCPU_GET(cpuid)][i];
+
+	/* Determine maximum threshold. */
+	ctl &= ~MC_CTL2_THRESHOLD;
+	ctl |= 0x7fff;
+	wrmsr(MSR_MC_CTL2(i), ctl);
+	ctl = rdmsr(MSR_MC_CTL2(i));
+	cc->max_threshold = ctl & MC_CTL2_THRESHOLD;
+
+	/* Start off with a threshold of 1. */
+	ctl &= ~MC_CTL2_THRESHOLD;
+	ctl |= 1;
+	wrmsr(MSR_MC_CTL2(i), ctl);
+
+	/* Mark this bank as monitored. */
+	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
 }
 
 /* Must be executed on each CPU. */
@@ -554,14 +731,14 @@ mca_init(void)
 		workaround_erratum383 = 1;
 
 	if (cpu_feature & CPUID_MCA) {
-		if (PCPU_GET(cpuid) == 0)
-			mca_setup();
+		PCPU_SET(cmci_mask, 0);
 
-		sched_pin();
 		mcg_cap = rdmsr(MSR_MCG_CAP);
 		if (mcg_cap & MCG_CAP_CTL_P)
 			/* Enable MCA features. */
 			wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
+		if (PCPU_GET(cpuid) == 0)
+			mca_setup(mcg_cap);
 
 		/*
 		 * Disable logging of level one TLB parity (L1TP) errors by
@@ -597,15 +774,34 @@ mca_init(void)
 
 			if (!skip)
 				wrmsr(MSR_MC_CTL(i), ctl);
+
+			if (mcg_cap & MCG_CAP_CMCI_P)
+				cmci_monitor(i);
+
 			/* Clear all errors. */
 			wrmsr(MSR_MC_STATUS(i), 0);
 		}
-		sched_unpin();
+
+		if (PCPU_GET(cmci_mask) != 0)
+			lapic_enable_cmc();
 	}
 
 	load_cr4(rcr4() | CR4_MCE);
 }
 
+/*
+ * The machine check registers for the BSP cannot be initialized until
+ * the local APIC is initialized.  This happens at SI_SUB_CPU,
+ * SI_ORDER_SECOND.
+ */
+static void
+mca_init_bsp(void *arg __unused)
+{
+
+	mca_init();
+}
+SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL);
+
 /* Called when a machine check exception fires. */
 int
 mca_intr(void)
@@ -624,7 +820,7 @@ mca_intr(void)
 	}
 
 	/* Scan the banks and check for any non-recoverable errors. */
-	recoverable = mca_scan(1);
+	recoverable = mca_scan(MCE);
 	mcg_status = rdmsr(MSR_MCG_STATUS);
 	if (!(mcg_status & MCG_STATUS_RIPV))
 		recoverable = 0;
@@ -633,3 +829,31 @@ mca_intr(void)
 	wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
 	return (recoverable);
 }
+
+/* Called for a CMCI (correctable machine check interrupt). */
+void
+cmc_intr(void)
+{
+	struct mca_internal *mca;
+	int count;
+
+	/*
+	 * Serialize MCA bank scanning to prevent collisions from
+	 * sibling threads.
+	 */
+	count = mca_scan(CMCI);
+
+	/* If we found anything, log them to the console. */
+	if (count != 0) {
+		mtx_lock_spin(&mca_lock);
+		STAILQ_FOREACH(mca, &mca_records, link) {
+			if (!mca->logged) {
+				mca->logged = 1;
+				mtx_unlock_spin(&mca_lock);
+				mca_log(&mca->rec);
+				mtx_lock_spin(&mca_lock);
+			}
+		}
+		mtx_unlock_spin(&mca_lock);
+	}
+}

Modified: stable/8/sys/amd64/include/apicreg.h
==============================================================================
--- stable/8/sys/amd64/include/apicreg.h	Wed Jul 14 20:55:45 2010	(r210078)
+++ stable/8/sys/amd64/include/apicreg.h	Wed Jul 14 21:10:14 2010	(r210079)
@@ -89,7 +89,7 @@
  * 2C0             Reserved
  * 2D0             Reserved
  * 2E0             Reserved
- * 2F0             Reserved
+ * 2F0             Local Vector Table (CMCI)       R/W
  * 300 ICR_LOW     Interrupt Command Reg. (0-31)   R/W
  * 310 ICR_HI      Interrupt Command Reg. (32-63)  R/W
  * 320             Local Vector Table (Timer)      R/W
@@ -172,7 +172,7 @@ struct LAPIC {
 	/* reserved */		PAD4;
 	/* reserved */		PAD4;
 	/* reserved */		PAD4;
-	/* reserved */		PAD4;
+	u_int32_t lvt_cmci;	PAD3;
 	u_int32_t icr_lo;	PAD3;
 	u_int32_t icr_hi;	PAD3;
 	u_int32_t lvt_timer;	PAD3;

Modified: stable/8/sys/amd64/include/apicvar.h
==============================================================================
--- stable/8/sys/amd64/include/apicvar.h	Wed Jul 14 20:55:45 2010	(r210078)
+++ stable/8/sys/amd64/include/apicvar.h	Wed Jul 14 21:10:14 2010	(r210079)
@@ -108,8 +108,9 @@
 #define	APIC_LOCAL_INTS	240
 #define	APIC_ERROR_INT	APIC_LOCAL_INTS
 #define	APIC_THERMAL_INT (APIC_LOCAL_INTS + 1)
+#define	APIC_CMC_INT	(APIC_LOCAL_INTS + 2)
 
-#define	APIC_IPI_INTS	(APIC_LOCAL_INTS + 2)
+#define	APIC_IPI_INTS	(APIC_LOCAL_INTS + 3)
 #define	IPI_RENDEZVOUS	(APIC_IPI_INTS)		/* Inter-CPU rendezvous. */
 #define	IPI_INVLTLB	(APIC_IPI_INTS + 1)	/* TLB Shootdown IPIs */
 #define	IPI_INVLPG	(APIC_IPI_INTS + 2)
@@ -143,7 +144,8 @@
 #define	LVT_ERROR	3
 #define	LVT_PMC		4
 #define	LVT_THERMAL	5
-#define	LVT_MAX		LVT_THERMAL
+#define	LVT_CMCI	6
+#define	LVT_MAX		LVT_CMCI
 
 #ifndef LOCORE
 
@@ -179,8 +181,8 @@ struct apic_enumerator {
 inthand_t
 	IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3),
 	IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6),
-	IDTVEC(apic_isr7), IDTVEC(errorint), IDTVEC(spuriousint),
-	IDTVEC(timerint);
+	IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint),
+	IDTVEC(spuriousint), IDTVEC(timerint);
 
 extern vm_paddr_t lapic_paddr;
 extern int apic_cpuids[];
@@ -210,6 +212,7 @@ void	lapic_create(u_int apic_id, int boo
 void	lapic_disable(void);
 void	lapic_disable_pmc(void);
 void	lapic_dump(const char *str);
+void	lapic_enable_cmc(void);
 int	lapic_enable_pmc(void);
 void	lapic_eoi(void);
 int	lapic_id(void);
@@ -218,6 +221,7 @@ int	lapic_intr_pending(u_int vector);
 void	lapic_ipi_raw(register_t icrlo, u_int dest);
 void	lapic_ipi_vectored(u_int vector, int dest);
 int	lapic_ipi_wait(int delay);
+void	lapic_handle_cmc(void);
 void	lapic_handle_error(void);
 void	lapic_handle_intr(int vector, struct trapframe *frame);
 void	lapic_handle_timer(struct trapframe *frame);

Modified: stable/8/sys/amd64/include/mca.h
==============================================================================
--- stable/8/sys/amd64/include/mca.h	Wed Jul 14 20:55:45 2010	(r210078)
+++ stable/8/sys/amd64/include/mca.h	Wed Jul 14 21:10:14 2010	(r210079)
@@ -46,6 +46,7 @@ struct mca_record {
 
 #ifdef _KERNEL
 
+void	cmc_intr(void);
 void	mca_init(void);
 int	mca_intr(void);
 

Modified: stable/8/sys/amd64/include/pcpu.h
==============================================================================
--- stable/8/sys/amd64/include/pcpu.h	Wed Jul 14 20:55:45 2010	(r210078)
+++ stable/8/sys/amd64/include/pcpu.h	Wed Jul 14 21:10:14 2010	(r210079)
@@ -75,7 +75,8 @@
 	/* Pointer to the CPU LDT descriptor */				\
 	struct system_segment_descriptor *pc_ldt;			\
 	/* Pointer to the CPU TSS descriptor */				\
-	struct system_segment_descriptor *pc_tss			\
+	struct system_segment_descriptor *pc_tss;			\
+	u_int	pc_cmci_mask		/* MCx banks for CMCI */	\
 	PCPU_XEN_FIELDS
 
 #ifdef _KERNEL

Modified: stable/8/sys/amd64/include/specialreg.h
==============================================================================
--- stable/8/sys/amd64/include/specialreg.h	Wed Jul 14 20:55:45 2010	(r210078)
+++ stable/8/sys/amd64/include/specialreg.h	Wed Jul 14 21:10:14 2010	(r210079)
@@ -385,7 +385,7 @@
 #define	MC_STATUS_VAL		0x8000000000000000
 #define	MC_MISC_RA_LSB		0x000000000000003f	/* If MCG_CAP_SER_P */
 #define	MC_MISC_ADDRESS_MODE	0x00000000000001c0	/* If MCG_CAP_SER_P */
-#define	MC_CTL2_THRESHOLD	0x0000000000003fff
+#define	MC_CTL2_THRESHOLD	0x0000000000007fff
 #define	MC_CTL2_CMCI_EN		0x0000000040000000
 
 /*

Modified: stable/8/sys/i386/i386/apic_vector.s
==============================================================================
--- stable/8/sys/i386/i386/apic_vector.s	Wed Jul 14 20:55:45 2010	(r210078)
+++ stable/8/sys/i386/i386/apic_vector.s	Wed Jul 14 21:10:14 2010	(r210079)
@@ -113,6 +113,19 @@ IDTVEC(timerint)
 	jmp	doreti
 
 /*
+ * Local APIC CMCI handler.
+ */
+	.text
+	SUPERALIGN_TEXT
+IDTVEC(cmcint)
+	PUSH_FRAME
+	SET_KERNEL_SREGS
+	FAKE_MCOUNT(TF_EIP(%esp))
+	call	lapic_handle_cmc
+	MEXITCOUNT
+	jmp	doreti
+
+/*
  * Local APIC error interrupt handler.
  */
 	.text

Modified: stable/8/sys/i386/i386/local_apic.c
==============================================================================
--- stable/8/sys/i386/i386/local_apic.c	Wed Jul 14 20:55:45 2010	(r210078)
+++ stable/8/sys/i386/i386/local_apic.c	Wed Jul 14 21:10:14 2010	(r210079)
@@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$");
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <machine/apicvar.h>
+#include <machine/mca.h>
 #include <machine/md_var.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
@@ -124,6 +125,7 @@ static struct lvt lvts[LVT_MAX + 1] = {
 	{ 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT },	/* Error */
 	{ 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 },	/* PMC */
 	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT },	/* Thermal */
+	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT },	/* CMCI */
 };
 
 static inthand_t *ioint_handlers[] = {
@@ -231,6 +233,10 @@ lapic_init(vm_paddr_t addr)
 	    GSEL(GCODE_SEL, SEL_KPL));
 
 	/* XXX: Thermal interrupt */
+
+	/* Local APIC CMCI. */
+	setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_SYS386TGT, SEL_KPL,
+	    GSEL(GCODE_SEL, SEL_KPL));
 }
 
 /*
@@ -256,7 +262,7 @@ lapic_create(u_int apic_id, int boot_cpu
 	 */
 	lapics[apic_id].la_present = 1;
 	lapics[apic_id].la_id = apic_id;
-	for (i = 0; i < LVT_MAX; i++) {
+	for (i = 0; i <= LVT_MAX; i++) {
 		lapics[apic_id].la_lvts[i] = lvts[i];
 		lapics[apic_id].la_lvts[i].lvt_active = 0;
 	}
@@ -286,6 +292,7 @@ lapic_dump(const char* str)
 	printf("  timer: 0x%08x therm: 0x%08x err: 0x%08x pmc: 0x%08x\n",
 	    lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error,
 	    lapic->lvt_pcint);
+	printf("   cmci: 0x%08x\n", lapic->lvt_cmci);
 }
 
 void
@@ -337,6 +344,10 @@ lapic_setup(int boot)
 
 	/* XXX: Thermal LVT */
 
+	/* Program the CMCI LVT entry if present. */
+	if (maxlvt >= LVT_CMCI)
+		lapic->lvt_cmci = lvt_mode(la, LVT_CMCI, lapic->lvt_cmci);
+	    
 	intr_restore(eflags);
 }
 
@@ -858,6 +869,34 @@ lapic_timer_enable_intr(void)
 }
 
 void
+lapic_handle_cmc(void)
+{
+
+	lapic_eoi();
+	cmc_intr();
+}
+
+/*
+ * Called from the mca_init() to activate the CMC interrupt if this CPU is
+ * responsible for monitoring any MC banks for CMC events.  Since mca_init()
+ * is called prior to lapic_setup() during boot, this just needs to unmask
+ * this CPU's LVT_CMCI entry.
+ */
+void
+lapic_enable_cmc(void)
+{
+	u_int apic_id;
+
+	apic_id = PCPU_GET(apic_id);
+	KASSERT(lapics[apic_id].la_present,
+	    ("%s: missing APIC %u", __func__, apic_id));
+	lapics[apic_id].la_lvts[LVT_CMCI].lvt_masked = 0;
+	lapics[apic_id].la_lvts[LVT_CMCI].lvt_active = 1;
+	if (bootverbose)
+		printf("lapic%u: CMCI unmasked\n", apic_id);
+}
+
+void
 lapic_handle_error(void)
 {
 	u_int32_t esr;

Modified: stable/8/sys/i386/i386/machdep.c
==============================================================================
--- stable/8/sys/i386/i386/machdep.c	Wed Jul 14 20:55:45 2010	(r210078)
+++ stable/8/sys/i386/i386/machdep.c	Wed Jul 14 21:10:14 2010	(r210079)
@@ -328,7 +328,6 @@ cpu_startup(dummy)
 #ifndef XEN
 	cpu_setregs();
 #endif
-	mca_init();
 }
 
 /*

Modified: stable/8/sys/i386/i386/mca.c
==============================================================================
--- stable/8/sys/i386/i386/mca.c	Wed Jul 14 20:55:45 2010	(r210078)
+++ stable/8/sys/i386/i386/mca.c	Wed Jul 14 21:10:14 2010	(r210079)
@@ -32,7 +32,11 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_apic.h"
+
 #include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
@@ -43,11 +47,31 @@ __FBSDID("$FreeBSD$");
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
 #include <machine/cputypes.h>
 #include <machine/mca.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 
+/* Modes for mca_scan() */
+enum scan_mode {
+	POLLED,
+	MCE,
+	CMCI,
+};
+
+#ifdef DEV_APIC
+/*
+ * State maintained for each monitored MCx bank to control the
+ * corrected machine check interrupt threshold.
+ */
+struct cmc_state {
+	int	max_threshold;
+	int	last_intr;
+};
+#endif
+
 struct mca_internal {
 	struct mca_record rec;
 	int		logged;
@@ -80,18 +104,24 @@ static int mca_ticks = 3600;	/* Check ho
 static struct task mca_task;
 static struct mtx mca_lock;
 
+#ifdef DEV_APIC
+static struct cmc_state **cmc_state;	/* Indexed by cpuid, bank */
+static int cmc_banks;
+static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */
+#endif
+
 static int
-sysctl_mca_ticks(SYSCTL_HANDLER_ARGS)
+sysctl_positive_int(SYSCTL_HANDLER_ARGS)
 {
 	int error, value;
 
-	value = mca_ticks;
+	value = *(int *)arg1;
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if (error || req->newptr == NULL)
 		return (error);
 	if (value <= 0)
 		return (EINVAL);
-	mca_ticks = value;
+	*(int *)arg1 = value;
 	return (0);
 }
 
@@ -400,32 +430,117 @@ mca_record_entry(const struct mca_record
 	mtx_unlock_spin(&mca_lock);
 }
 
+#ifdef DEV_APIC
+/*
+ * Update the interrupt threshold for a CMCI.  The strategy is to use
+ * a low trigger that interrupts as soon as the first event occurs.
+ * However, if a steady stream of events arrive, the threshold is
+ * increased until the interrupts are throttled to once every
+ * cmc_throttle seconds or the periodic scan.  If a periodic scan
+ * finds that the threshold is too high, it is lowered.
+ */
+static void
+cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
+{
+	struct cmc_state *cc;
+	uint64_t ctl;
+	u_int delta;
+	int count, limit;
+
+	/* Fetch the current limit for this bank. */
+	cc = &cmc_state[PCPU_GET(cpuid)][bank];
+	ctl = rdmsr(MSR_MC_CTL2(bank));
+	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+	delta = (u_int)(ticks - cc->last_intr);
+
+	/*
+	 * If an interrupt was received less than cmc_throttle seconds
+	 * since the previous interrupt and the count from the current
+	 * event is greater than or equal to the current threshold,
+	 * double the threshold up to the max.
+	 */
+	if (mode == CMCI && valid) {
+		limit = ctl & MC_CTL2_THRESHOLD;
+		if (delta < cmc_throttle && count >= limit &&
+		    limit < cc->max_threshold) {
+			limit = min(limit << 1, cc->max_threshold);
+			ctl &= ~MC_CTL2_THRESHOLD;
+			ctl |= limit;
+			wrmsr(MSR_MC_CTL2(bank), limit);
+		}
+		cc->last_intr = ticks;
+		return;
+	}
+
+	/*
+	 * When the banks are polled, check to see if the threshold
+	 * should be lowered.
+	 */
+	if (mode != POLLED)
+		return;
+
+	/* If a CMCI occured recently, do nothing for now. */
+	if (delta < cmc_throttle)
+		return;
+
+	/*
+	 * Compute a new limit based on the average rate of events per
+	 * cmc_throttle seconds since the last interrupt.
+	 */
+	if (valid) {
+		count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+		limit = count * cmc_throttle / delta;
+		if (limit <= 0)
+			limit = 1;
+		else if (limit > cc->max_threshold)
+			limit = cc->max_threshold;
+	} else
+		limit = 1;
+	if ((ctl & MC_CTL2_THRESHOLD) != limit) {
+		ctl &= ~MC_CTL2_THRESHOLD;
+		ctl |= limit;
+		wrmsr(MSR_MC_CTL2(bank), limit);
+	}
+}
+#endif
+
 /*
  * This scans all the machine check banks of the current CPU to see if
  * there are any machine checks.  Any non-recoverable errors are
  * reported immediately via mca_log().  The current thread must be
- * pinned when this is called.  The 'mcip' parameter indicates if we
- * are being called from the MC exception handler.  In that case this
- * function returns true if the system is restartable.  Otherwise, it
- * returns a count of the number of valid MC records found.
+ * pinned when this is called.  The 'mode' parameter indicates if we
+ * are being called from the MC exception handler, the CMCI handler,
+ * or the periodic poller.  In the MC exception case this function
+ * returns true if the system is restartable.  Otherwise, it returns a
+ * count of the number of valid MC records found.
  */
 static int
-mca_scan(int mcip)
+mca_scan(enum scan_mode mode)
 {
 	struct mca_record rec;
 	uint64_t mcg_cap, ucmask;
-	int count, i, recoverable;
+	int count, i, recoverable, valid;
 
 	count = 0;
 	recoverable = 1;
 	ucmask = MC_STATUS_UC | MC_STATUS_PCC;
 
 	/* When handling a MCE#, treat the OVER flag as non-restartable. */
-	if (mcip)
+	if (mode == MCE)
 		ucmask |= MC_STATUS_OVER;
 	mcg_cap = rdmsr(MSR_MCG_CAP);
 	for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
-		if (mca_check_status(i, &rec)) {
+#ifdef DEV_APIC
+		/*
+		 * For a CMCI, only check banks this CPU is
+		 * responsible for.
+		 */
+		if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
+			continue;
+#endif
+
+		valid = mca_check_status(i, &rec);
+		if (valid) {
 			count++;
 			if (rec.mr_status & ucmask) {
 				recoverable = 0;
@@ -433,8 +548,17 @@ mca_scan(int mcip)
 			}
 			mca_record_entry(&rec);
 		}
+	
+#ifdef DEV_APIC
+		/*
+		 * If this is a bank this CPU monitors via CMCI,
+		 * update the threshold.
+		 */
+		if (PCPU_GET(cmci_mask) & (1 << i))
+			cmci_update(mode, i, valid, &rec);
+#endif
 	}
-	return (mcip ? recoverable : count);
+	return (mode == MCE ? recoverable : count);
 }
 
 /*
@@ -457,7 +581,7 @@ mca_scan_cpus(void *context, int pending
 			continue;
 		sched_bind(td, cpu);
 		thread_unlock(td);
-		count += mca_scan(0);
+		count += mca_scan(POLLED);
 		thread_lock(td);
 		sched_unbind(td);
 	}
@@ -510,8 +634,27 @@ mca_startup(void *dummy)
 }
 SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
 
+#ifdef DEV_APIC
 static void
-mca_setup(void)
+cmci_setup(uint64_t mcg_cap)
+{
+	int i;
+
+	cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state **),
+	    M_MCA, M_WAITOK);
+	cmc_banks = mcg_cap & MCG_CAP_COUNT;
+	for (i = 0; i <= mp_maxid; i++)
+		cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks,
+		    M_MCA, M_WAITOK | M_ZERO);
+	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
+	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+	    &cmc_throttle, 0, sysctl_positive_int, "I",
+	    "Interval in seconds to throttle corrected MC interrupts");
+}
+#endif
+

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-stable-8 mailing list