svn commit: r206089 - in head: lib/libpmc sys/amd64/include sys/conf sys/dev/hwpmc sys/i386/include sys/modules/hwpmc sys/sys

Fri Apr 2 13:23:50 UTC 2010

Author: fabient
Date: Fri Apr  2 13:23:49 2010
New Revision: 206089
URL: http://svn.freebsd.org/changeset/base/206089

Log:
  - Support for uncore counting events: one fixed PMC with the uncore
    domain clock, 8 programmable PMC.
  - Westmere based CPU (Xeon 5600, Corei7 980X) support.
  - New man pages with events list for core and uncore.
  - Updated Corei7 events with Intel 253669-033US December 2009 doc.
    There is some removed events in the documentation, they have been
    kept in the code but documented in the man page as obsolete.
  - Offcore response events can be setup with rsp token.
  
  Sponsored by: NETASQ

Added:
  head/lib/libpmc/pmc.corei7.3   (contents, props changed)
  head/lib/libpmc/pmc.corei7uc.3   (contents, props changed)
  head/lib/libpmc/pmc.ucf.3   (contents, props changed)
  head/lib/libpmc/pmc.westmere.3   (contents, props changed)
  head/lib/libpmc/pmc.westmereuc.3   (contents, props changed)
  head/sys/dev/hwpmc/hwpmc_uncore.c   (contents, props changed)
  head/sys/dev/hwpmc/hwpmc_uncore.h   (contents, props changed)
Modified:
  head/lib/libpmc/Makefile
  head/lib/libpmc/libpmc.c
  head/sys/amd64/include/pmc_mdep.h
  head/sys/conf/files.amd64
  head/sys/conf/files.i386
  head/sys/conf/files.pc98
  head/sys/dev/hwpmc/hwpmc_core.c
  head/sys/dev/hwpmc/hwpmc_core.h
  head/sys/dev/hwpmc/hwpmc_intel.c
  head/sys/dev/hwpmc/pmc_events.h
  head/sys/i386/include/pmc_mdep.h
  head/sys/modules/hwpmc/Makefile
  head/sys/sys/pmc.h

Modified: head/lib/libpmc/Makefile
==============================================================================

--- head/lib/libpmc/Makefile	Fri Apr  2 11:07:55 2010	(r206088)
+++ head/lib/libpmc/Makefile	Fri Apr  2 13:23:49 2010	(r206089)
@@ -27,11 +27,16 @@ MAN+=	pmc.atom.3
 MAN+=	pmc.core.3
 MAN+=	pmc.core2.3
 MAN+=	pmc.iaf.3
+MAN+=	pmc.ucf.3
 MAN+=	pmc.k7.3
 MAN+=	pmc.k8.3
 MAN+=	pmc.p4.3
 MAN+=	pmc.p5.3
 MAN+=	pmc.p6.3
+MAN+=	pmc.corei7.3
+MAN+=	pmc.corei7uc.3
+MAN+=	pmc.westmere.3
+MAN+=	pmc.westmereuc.3
 MAN+=	pmc.tsc.3
 .elif ${MACHINE_ARCH} == "arm" && ${CPUTYPE} == "xscale"
 MAN+=	pmc.xscale.3

Modified: head/lib/libpmc/libpmc.c
==============================================================================
--- head/lib/libpmc/libpmc.c	Fri Apr  2 11:07:55 2010	(r206088)
+++ head/lib/libpmc/libpmc.c	Fri Apr  2 13:23:49 2010	(r206089)
@@ -54,6 +54,10 @@ static int iaf_allocate_pmc(enum pmc_eve
     struct pmc_op_pmcallocate *_pmc_config);
 static int iap_allocate_pmc(enum pmc_event _pe, char *_ctrspec,
     struct pmc_op_pmcallocate *_pmc_config);
+static int ucf_allocate_pmc(enum pmc_event _pe, char *_ctrspec,
+    struct pmc_op_pmcallocate *_pmc_config);
+static int ucp_allocate_pmc(enum pmc_event _pe, char *_ctrspec,
+    struct pmc_op_pmcallocate *_pmc_config);
 static int k8_allocate_pmc(enum pmc_event _pe, char *_ctrspec,
     struct pmc_op_pmcallocate *_pmc_config);
 static int p4_allocate_pmc(enum pmc_event _pe, char *_ctrspec,
@@ -144,6 +148,7 @@ PMC_CLASSDEP_TABLE(p5, P5);
 PMC_CLASSDEP_TABLE(p6, P6);
 PMC_CLASSDEP_TABLE(xscale, XSCALE);
 PMC_CLASSDEP_TABLE(mips24k, MIPS24K);
+PMC_CLASSDEP_TABLE(ucf, UCF);
 
 #undef	__PMC_EV_ALIAS
 #define	__PMC_EV_ALIAS(N,CODE) 	{ N, PMC_EV_##CODE },
@@ -169,6 +174,21 @@ static const struct pmc_event_descr core
 	__PMC_EV_ALIAS_COREI7()
 };
 
+static const struct pmc_event_descr westmere_event_table[] =
+{
+	__PMC_EV_ALIAS_WESTMERE()
+};
+
+static const struct pmc_event_descr corei7uc_event_table[] =
+{
+	__PMC_EV_ALIAS_COREI7UC()
+};
+
+static const struct pmc_event_descr westmereuc_event_table[] =
+{
+	__PMC_EV_ALIAS_WESTMEREUC()
+};
+
 /*
  * PMC_MDEP_TABLE(NAME, PRIMARYCLASS, ADDITIONAL_CLASSES...)
  *
@@ -182,7 +202,8 @@ static const struct pmc_event_descr core
 PMC_MDEP_TABLE(atom, IAP, PMC_CLASS_IAF, PMC_CLASS_TSC);
 PMC_MDEP_TABLE(core, IAP, PMC_CLASS_TSC);
 PMC_MDEP_TABLE(core2, IAP, PMC_CLASS_IAF, PMC_CLASS_TSC);
-PMC_MDEP_TABLE(corei7, IAP, PMC_CLASS_IAF, PMC_CLASS_TSC);
+PMC_MDEP_TABLE(corei7, IAP, PMC_CLASS_IAF, PMC_CLASS_TSC, PMC_CLASS_UCF, PMC_CLASS_UCP);
+PMC_MDEP_TABLE(westmere, IAP, PMC_CLASS_IAF, PMC_CLASS_TSC, PMC_CLASS_UCF, PMC_CLASS_UCP);
 PMC_MDEP_TABLE(k7, K7, PMC_CLASS_TSC);
 PMC_MDEP_TABLE(k8, K8, PMC_CLASS_TSC);
 PMC_MDEP_TABLE(p4, P4, PMC_CLASS_TSC);
@@ -215,6 +236,10 @@ PMC_CLASS_TABLE_DESC(atom, IAP, atom, ia
 PMC_CLASS_TABLE_DESC(core, IAP, core, iap);
 PMC_CLASS_TABLE_DESC(core2, IAP, core2, iap);
 PMC_CLASS_TABLE_DESC(corei7, IAP, corei7, iap);
+PMC_CLASS_TABLE_DESC(westmere, IAP, westmere, iap);
+PMC_CLASS_TABLE_DESC(ucf, UCF, ucf, ucf);
+PMC_CLASS_TABLE_DESC(corei7uc, UCP, corei7uc, ucp);
+PMC_CLASS_TABLE_DESC(westmereuc, UCP, westmereuc, ucp);
 #endif
 #if	defined(__i386__)
 PMC_CLASS_TABLE_DESC(k7, K7, k7, k7);
@@ -302,7 +327,7 @@ struct pmc_masks {
 	const uint32_t	pm_value;
 };
 #define	PMCMASK(N,V)	{ .pm_name = #N, .pm_value = (V) }
-#define	NULLMASK	PMCMASK(NULL,0)
+#define	NULLMASK	{ .pm_name = NULL }
 
 #if defined(__amd64__) || defined(__i386__)
 static int
@@ -495,6 +520,8 @@ static struct pmc_event_alias core2_alia
 #define	atom_aliases_without_iaf	core2_aliases_without_iaf
 #define corei7_aliases			core2_aliases
 #define corei7_aliases_without_iaf	core2_aliases_without_iaf
+#define westmere_aliases		core2_aliases
+#define westmere_aliases_without_iaf	core2_aliases_without_iaf
 
 #define	IAF_KW_OS		"os"
 #define	IAF_KW_USR		"usr"
@@ -545,6 +572,7 @@ iaf_allocate_pmc(enum pmc_event pe, char
 #define	IAP_KW_SNOOPTYPE	"snooptype"
 #define	IAP_KW_TRANSITION	"trans"
 #define	IAP_KW_USR		"usr"
+#define	IAP_KW_RSP		"rsp"
 
 static struct pmc_masks iap_core_mask[] = {
 	PMCMASK(all,	(0x3 << 14)),
@@ -592,19 +620,38 @@ static struct pmc_masks iap_transition_m
 	NULLMASK
 };
 
+static struct pmc_masks iap_rsp_mask[] = {
+	PMCMASK(DMND_DATA_RD,		(1 <<  0)),
+	PMCMASK(DMND_RFO,		(1 <<  1)),
+	PMCMASK(DMND_IFETCH,		(1 <<  2)),
+	PMCMASK(WB,			(1 <<  3)),
+	PMCMASK(PF_DATA_RD,		(1 <<  4)),
+	PMCMASK(PF_RFO,			(1 <<  5)),
+	PMCMASK(PF_IFETCH,		(1 <<  6)),
+	PMCMASK(OTHER,			(1 <<  7)),
+	PMCMASK(UNCORE_HIT,		(1 <<  8)),
+	PMCMASK(OTHER_CORE_HIT_SNP,	(1 <<  9)),
+	PMCMASK(OTHER_CORE_HITM,	(1 << 10)),
+	PMCMASK(REMOTE_CACHE_FWD,	(1 << 12)),
+	PMCMASK(REMOTE_DRAM,		(1 << 13)),
+	PMCMASK(LOCAL_DRAM,		(1 << 14)),
+	PMCMASK(NON_DRAM,		(1 << 15)),
+	NULLMASK
+};
+
 static int
 iap_allocate_pmc(enum pmc_event pe, char *ctrspec,
     struct pmc_op_pmcallocate *pmc_config)
 {
 	char *e, *p, *q;
-	uint32_t cachestate, evmask;
+	uint32_t cachestate, evmask, rsp;
 	int count, n;
 
 	pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE |
 	    PMC_CAP_QUALIFIER);
 	pmc_config->pm_md.pm_iap.pm_iap_config = 0;
 
-	cachestate = evmask = 0;
+	cachestate = evmask = rsp = 0;
 
 	/* Parse additional modifiers if present */
 	while ((p = strsep(&ctrspec, ",")) != NULL) {
@@ -651,8 +698,7 @@ iap_allocate_pmc(enum pmc_event pe, char
 				return (-1);
 		} else if (cpu_info.pm_cputype == PMC_CPU_INTEL_ATOM ||
 		    cpu_info.pm_cputype == PMC_CPU_INTEL_CORE2 ||
-		    cpu_info.pm_cputype == PMC_CPU_INTEL_CORE2EXTREME ||
-		    cpu_info.pm_cputype == PMC_CPU_INTEL_COREI7) {
+		    cpu_info.pm_cputype == PMC_CPU_INTEL_CORE2EXTREME) {
 			if (KWPREFIXMATCH(p, IAP_KW_SNOOPRESPONSE "=")) {
 				n = pmc_parse_mask(iap_snoopresponse_mask, p,
 				    &evmask);
@@ -661,6 +707,12 @@ iap_allocate_pmc(enum pmc_event pe, char
 				    &evmask);
 			} else
 				return (-1);
+		} else if (cpu_info.pm_cputype == PMC_CPU_INTEL_COREI7 ||
+		    cpu_info.pm_cputype == PMC_CPU_INTEL_WESTMERE) {
+			if (KWPREFIXMATCH(p, IAP_KW_RSP "=")) {
+				n = pmc_parse_mask(iap_rsp_mask, p, &rsp);
+			} else
+				return (-1);
 		} else
 			return (-1);
 
@@ -693,6 +745,69 @@ iap_allocate_pmc(enum pmc_event pe, char
 	}
 
 	pmc_config->pm_md.pm_iap.pm_iap_config |= cachestate;
+	pmc_config->pm_md.pm_iap.pm_iap_rsp = rsp;
+
+	return (0);
+}
+
+/*
+ * Intel Uncore.
+ */
+
+static int
+ucf_allocate_pmc(enum pmc_event pe, char *ctrspec,
+    struct pmc_op_pmcallocate *pmc_config)
+{
+	(void) pe;
+	(void) ctrspec;
+
+	pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE);
+	pmc_config->pm_md.pm_ucf.pm_ucf_flags = 0;
+
+	return (0);
+}
+
+#define	UCP_KW_CMASK		"cmask"
+#define	UCP_KW_EDGE		"edge"
+#define	UCP_KW_INV		"inv"
+
+static int
+ucp_allocate_pmc(enum pmc_event pe, char *ctrspec,
+    struct pmc_op_pmcallocate *pmc_config)
+{
+	char *e, *p, *q;
+	int count, n;
+
+	(void) pe;
+
+	pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE |
+	    PMC_CAP_QUALIFIER);
+	pmc_config->pm_md.pm_ucp.pm_ucp_config = 0;
+
+	/* Parse additional modifiers if present */
+	while ((p = strsep(&ctrspec, ",")) != NULL) {
+
+		n = 0;
+		if (KWPREFIXMATCH(p, UCP_KW_CMASK "=")) {
+			q = strchr(p, '=');
+			if (*++q == '\0') /* skip '=' */
+				return (-1);
+			count = strtol(q, &e, 0);
+			if (e == q || *e != '\0')
+				return (-1);
+			pmc_config->pm_caps |= PMC_CAP_THRESHOLD;
+			pmc_config->pm_md.pm_ucp.pm_ucp_config |=
+			    UCP_CMASK(count);
+		} else if (KWMATCH(p, UCP_KW_EDGE)) {
+			pmc_config->pm_caps |= PMC_CAP_EDGE;
+		} else if (KWMATCH(p, UCP_KW_INV)) {
+			pmc_config->pm_caps |= PMC_CAP_INVERT;
+		} else
+			return (-1);
+
+		if (n < 0)	/* Parsing failed. */
+			return (-1);
+	}
 
 	return (0);
 }
@@ -2392,6 +2507,31 @@ pmc_event_names_of_class(enum pmc_class 
 			ev = corei7_event_table;
 			count = PMC_EVENT_TABLE_SIZE(corei7);
 			break;
+		case PMC_CPU_INTEL_WESTMERE:
+			ev = westmere_event_table;
+			count = PMC_EVENT_TABLE_SIZE(westmere);
+			break;
+		}
+		break;
+	case PMC_CLASS_UCF:
+		ev = ucf_event_table;
+		count = PMC_EVENT_TABLE_SIZE(ucf);
+		break;
+	case PMC_CLASS_UCP:
+		/*
+		 * Return the most appropriate set of event name
+		 * spellings for the current CPU.
+		 */
+		switch (cpu_info.pm_cputype) {
+		default:
+		case PMC_CPU_INTEL_COREI7:
+			ev = corei7uc_event_table;
+			count = PMC_EVENT_TABLE_SIZE(corei7uc);
+			break;
+		case PMC_CPU_INTEL_WESTMERE:
+			ev = westmereuc_event_table;
+			count = PMC_EVENT_TABLE_SIZE(westmereuc);
+			break;
 		}
 		break;
 	case PMC_CLASS_TSC:
@@ -2605,8 +2745,15 @@ pmc_init(void)
 		PMC_MDEP_INIT_INTEL_V2(core2);
 		break;
 	case PMC_CPU_INTEL_COREI7:
+		pmc_class_table[n++] = &ucf_class_table_descr;
+		pmc_class_table[n++] = &corei7uc_class_table_descr;
 		PMC_MDEP_INIT_INTEL_V2(corei7);
 		break;
+	case PMC_CPU_INTEL_WESTMERE:
+		pmc_class_table[n++] = &ucf_class_table_descr;
+		pmc_class_table[n++] = &westmereuc_class_table_descr;
+		PMC_MDEP_INIT_INTEL_V2(westmere);
+		break;
 	case PMC_CPU_INTEL_PIV:
 		PMC_MDEP_INIT(p4);
 		pmc_class_table[n] = &p4_class_table_descr;
@@ -2719,10 +2866,30 @@ _pmc_name_of_event(enum pmc_event pe, en
 			ev = corei7_event_table;
 			evfence = corei7_event_table + PMC_EVENT_TABLE_SIZE(corei7);
 			break;
+		case PMC_CPU_INTEL_WESTMERE:
+			ev = westmere_event_table;
+			evfence = westmere_event_table + PMC_EVENT_TABLE_SIZE(westmere);
+			break;
+		default:	/* Unknown CPU type. */
+			break;
+		}
+	} else if (pe >= PMC_EV_UCF_FIRST && pe <= PMC_EV_UCF_LAST) {
+		ev = ucf_event_table;
+		evfence = ucf_event_table + PMC_EVENT_TABLE_SIZE(ucf);
+	} else if (pe >= PMC_EV_UCP_FIRST && pe <= PMC_EV_UCP_LAST) {
+		switch (cpu) {
+		case PMC_CPU_INTEL_COREI7:
+			ev = corei7uc_event_table;
+			evfence = corei7uc_event_table + PMC_EVENT_TABLE_SIZE(corei7uc);
+			break;
+		case PMC_CPU_INTEL_WESTMERE:
+			ev = westmereuc_event_table;
+			evfence = westmereuc_event_table + PMC_EVENT_TABLE_SIZE(westmereuc);
+			break;
 		default:	/* Unknown CPU type. */
 			break;
 		}
-	} if (pe >= PMC_EV_K7_FIRST && pe <= PMC_EV_K7_LAST) {
+	} else if (pe >= PMC_EV_K7_FIRST && pe <= PMC_EV_K7_LAST) {
 		ev = k7_event_table;
 		evfence = k7_event_table + PMC_EVENT_TABLE_SIZE(k7);
 	} else if (pe >= PMC_EV_K8_FIRST && pe <= PMC_EV_K8_LAST) {

Added: head/lib/libpmc/pmc.corei7.3
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/lib/libpmc/pmc.corei7.3	Fri Apr  2 13:23:49 2010	(r206089)
@@ -0,0 +1,1581 @@
+.\" Copyright (c) 2010 Fabien Thomas.  All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" This software is provided by Joseph Koshy ``as is'' and
+.\" any express or implied warranties, including, but not limited to, the
+.\" implied warranties of merchantability and fitness for a particular purpose
+.\" are disclaimed.  in no event shall Joseph Koshy be liable
+.\" for any direct, indirect, incidental, special, exemplary, or consequential
+.\" damages (including, but not limited to, procurement of substitute goods
+.\" or services; loss of use, data, or profits; or business interruption)
+.\" however caused and on any theory of liability, whether in contract, strict
+.\" liability, or tort (including negligence or otherwise) arising in any way
+.\" out of the use of this software, even if advised of the possibility of
+.\" such damage.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd March 24, 2010
+.Os
+.Dt PMC.COREI7 3
+.Sh NAME
+.Nm pmc.corei7
+.Nd measurement events for
+.Tn Intel
+.Tn Core i7 and Xeon 5500
+family CPUs
+.Sh LIBRARY
+.Lb libpmc
+.Sh SYNOPSIS
+.In pmc.h
+.Sh DESCRIPTION
+.Tn Intel
+.Tn "Core i7"
+CPUs contain PMCs conforming to version 2 of the
+.Tn Intel
+performance measurement architecture.
+These CPUs may contain up to three classes of PMCs:
+.Bl -tag -width "Li PMC_CLASS_IAP"
+.It Li PMC_CLASS_IAF
+Fixed-function counters that count only one hardware event per counter.
+.It Li PMC_CLASS_IAP
+Programmable counters that may be configured to count one of a defined
+set of hardware events.
+.El
+.Pp
+The number of PMCs available in each class and their widths need to be
+determined at run time by calling
+.Xr pmc_cpuinfo 3 .
+.Pp
+Intel Core i7 and Xeon 5500 PMCs are documented in
+.Rs
+.%B "Intel(R) 64 and IA-32 Architectures Software Developes Manual"
+.%T "Volume 3B: System Programming Guide, Part 2"
+.%N "Order Number: 253669-033US"
+.%D December 2009
+.%Q "Intel Corporation"
+.Re
+.Ss COREI7 AND XEON 5500 FIXED FUNCTION PMCS
+These PMCs and their supported events are documented in
+.Xr pmc.iaf 3 .
+Not all CPUs in this family implement fixed-function counters.
+.Ss COREI7 AND XEON 5500 PROGRAMMABLE PMCS
+The programmable PMCs support the following capabilities:
+.Bl -column "PMC_CAP_INTERRUPT" "Support"
+.It Em Capability Ta Em Support
+.It PMC_CAP_CASCADE Ta \&No
+.It PMC_CAP_EDGE Ta Yes
+.It PMC_CAP_INTERRUPT Ta Yes
+.It PMC_CAP_INVERT Ta Yes
+.It PMC_CAP_READ Ta Yes
+.It PMC_CAP_PRECISE Ta \&No
+.It PMC_CAP_SYSTEM Ta Yes
+.It PMC_CAP_TAGGING Ta \&No
+.It PMC_CAP_THRESHOLD Ta Yes
+.It PMC_CAP_USER Ta Yes
+.It PMC_CAP_WRITE Ta Yes
+.El
+.Ss Event Qualifiers
+Event specifiers for these PMCs support the following common
+qualifiers:
+.Bl -tag -width indent
+.It Li rsp= Ns Ar value
+Configure the Off-core Response bits.
+.Bl -tag -width indent
+.It Li DMND_DATA_RD
+Counts the number of demand and DCU prefetch data reads of full
+and partial cachelines as well as demand data page table entry
+cacheline reads. Does not count L2 data read prefetches or
+instruction fetches.
+.It Li DMND_RFO
+Counts the number of demand and DCU prefetch reads for ownership
+(RFO) requests generated by a write to data cacheline. Does not
+count L2 RFO.
+.It Li DMND_IFETCH
+Counts the number of demand and DCU prefetch instruction cacheline
+reads. Does not count L2 code read prefetches.
+WB
+Counts the number of writeback (modified to exclusive) transactions.
+.It Li PF_DATA_RD
+Counts the number of data cacheline reads generated by L2 prefetchers.
+.It Li PF_RFO
+Counts the number of RFO requests generated by L2 prefetchers.
+.It Li PF_IFETCH
+Counts the number of code reads generated by L2 prefetchers.
+.It Li OTHER
+Counts one of the following transaction types, including L3 invalidate,
+I/O, full or partial writes, WC or non-temporal stores, CLFLUSH, Fences,
+lock, unlock, split lock.
+.It Li UNCORE_HIT
+L3 Hit: local or remote home requests that hit L3 cache in the uncore
+with no coherency actions required (snooping).
+.It Li OTHER_CORE_HIT_SNP
+L3 Hit: local or remote home requests that hit L3 cache in the uncore
+and was serviced by another core with a cross core snoop where no modified
+copies were found (clean).
+.It Li OTHER_CORE_HITM
+L3 Hit: local or remote home requests that hit L3 cache in the uncore
+and was serviced by another core with a cross core snoop where modified
+copies were found (HITM).
+.It Li REMOTE_CACHE_FWD
+L3 Miss: local homed requests that missed the L3 cache and was serviced
+by forwarded data following a cross package snoop where no modified
+copies found. (Remote home requests are not counted)
+.It Li REMOTE_DRAM
+L3 Miss: remote home requests that missed the L3 cache and were serviced
+by remote DRAM.
+.It Li LOCAL_DRAM
+L3 Miss: local home requests that missed the L3 cache and were serviced
+by local DRAM.
+.It Li NON_DRAM
+Non-DRAM requests that were serviced by IOH.
+.El
+.It Li cmask= Ns Ar value
+Configure the PMC to increment only if the number of configured
+events measured in a cycle is greater than or equal to
+.Ar value .
+.It Li edge
+Configure the PMC to count the number of de-asserted to asserted
+transitions of the conditions expressed by the other qualifiers.
+If specified, the counter will increment only once whenever a
+condition becomes true, irrespective of the number of clocks during
+which the condition remains true.
+.It Li inv
+Invert the sense of comparison when the
+.Dq Li cmask
+qualifier is present, making the counter increment when the number of
+events per cycle is less than the value specified by the
+.Dq Li cmask
+qualifier.
+.It Li os
+Configure the PMC to count events happening at processor privilege
+level 0.
+.It Li usr
+Configure the PMC to count events occurring at privilege levels 1, 2
+or 3.
+.El
+.Pp
+If neither of the
+.Dq Li os
+or
+.Dq Li usr
+qualifiers are specified, the default is to enable both.
+.Ss Event Specifiers (Programmable PMCs)
+Core i7 and Xeon 5500 programmable PMCs support the following events:
+.Bl -tag -width indent
+.It Li SB_DRAIN.ANY
+.Pq Event 04H , Umask 07H
+Counts the number of store buffer drains.
+.It Li STORE_BLOCKS.AT_RET
+.Pq Event 06H , Umask 04H
+Counts number of loads delayed with at-Retirement block code. The following
+loads need to be executed at retirement and wait for all senior stores on
+the same thread to be drained: load splitting across 4K boundary (page
+split), load accessing uncacheable (UC or USWC) memory, load lock, and load
+with page table in UC or USWC memory region.
+.It Li STORE_BLOCKS.L1D_BLOCK
+.Pq Event 06H , Umask 08H
+Cacheable loads delayed with L1D block code
+.It Li PARTIAL_ADDRESS_ALIAS
+.Pq Event 07H , Umask 01H
+Counts false dependency due to partial address aliasing
+.It Li DTLB_LOAD_MISSES.ANY
+.Pq Event 08H , Umask 01H
+Counts all load misses that cause a page walk
+.It Li DTLB_LOAD_MISSES.WALK_COMPLETED
+.Pq Event 08H , Umask 02H
+Counts number of completed page walks due to load miss in the STLB.
+.It Li DTLB_LOAD_MISSES.STLB_HIT
+.Pq Event 08H , Umask 10H
+Number of cache load STLB hits
+.It Li DTLB_LOAD_MISSES.PDE_MISS
+.Pq Event 08H , Umask 20H
+Number of DTLB cache load misses where the low part of the linear to
+physical address translation was missed.
+.It Li DTLB_LOAD_MISSES.PDP_MISS
+.Pq Event 08H , Umask 40H
+Number of DTLB cache load misses where the high part of the linear to
+physical address translation was missed.
+.It Li DTLB_LOAD_MISSES.LARGE_WALK_COMPLETED
+.Pq Event 08H , Umask 80H
+Counts number of completed large page walks due to load miss in the STLB.
+.It Li MEM_INST_RETIRED.LOADS
+.Pq Event 0BH , Umask 01H
+Counts the number of instructions with an architecturally-visible store
+retired on the architected path.
+In conjunction with ld_lat facility
+.It Li MEM_INST_RETIRED.STORES
+.Pq Event 0BH , Umask 02H
+Counts the number of instructions with an architecturally-visible store
+retired on the architected path.
+In conjunction with ld_lat facility
+.It Li MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD
+.Pq Event 0BH , Umask 10H
+Counts the number of instructions exceeding the latency specified with
+ld_lat facility.
+In conjunction with ld_lat facility
+.It Li MEM_STORE_RETIRED.DTLB_MISS
+.Pq Event 0CH , Umask 01H
+The event counts the number of retired stores that missed the DTLB. The DTLB
+miss is not counted if the store operation causes a fault. Does not counter
+prefetches. Counts both primary and secondary misses to the TLB
+.It Li UOPS_ISSUED.ANY
+.Pq Event 0EH , Umask 01H
+Counts the number of Uops issued by the Register Allocation Table to the
+Reservation Station, i.e. the UOPs issued from the front end to the back
+end.
+.It Li UOPS_ISSUED.STALLED_CYCLES
+.Pq Event 0EH , Umask 01H
+Counts the number of cycles no Uops issued by the Register Allocation Table
+to the Reservation Station, i.e. the UOPs issued from the front end to the
+back end.
+set invert=1, cmask = 1
+.It Li UOPS_ISSUED.FUSED
+.Pq Event 0EH , Umask 02H
+Counts the number of fused Uops that were issued from the Register
+Allocation Table to the Reservation Station.
+.It Li MEM_UNCORE_RETIRED.L3_DATA_MISS_UNKNOWN
+.Pq Event 0FH , Umask 01H
+Counts number of memory load instructions retired where the memory reference
+missed L3 and data source is unknown.
+Available only for CPUID signature 06_2EH
+.It Li MEM_UNCORE_RETIRED.OTHER_CORE_L2_HITM
+.Pq Event 0FH , Umask 02H
+Counts number of memory load instructions retired where the memory reference
+hit modified data in a sibling core residing on the same socket.
+.It Li MEM_UNCORE_RETIRED.REMOTE_CACHE_LOCAL_HOME_HIT
+.Pq Event 0FH , Umask 08H
+Counts number of memory load instructions retired where the memory reference
+missed the L1, L2 and L3 caches and HIT in a remote socket's cache. Only
+counts locally homed lines.
+.It Li MEM_UNCORE_RETIRED.REMOTE_DRAM
+.Pq Event 0FH , Umask 10H
+Counts number of memory load instructions retired where the memory reference
+missed the L1, L2 and L3 caches and was remotely homed. This includes both
+DRAM access and HITM in a remote socket's cache for remotely homed lines.
+.It Li MEM_UNCORE_RETIRED.LOCAL_DRAM
+.Pq Event 0FH , Umask 20H
+Counts number of memory load instructions retired where the memory reference
+missed the L1, L2 and L3 caches and required a local socket memory
+reference. This includes locally homed cachelines that were in a modified
+state in another socket.
+.It Li MEM_UNCORE_RETIRED.UNCACHEABLE
+.Pq Event 0FH , Umask 80H
+Counts number of memory load instructions retired where the memory reference
+missed the L1, L2 and L3 caches and to perform I/O.
+Available only for CPUID signature 06_2EH
+.It Li FP_COMP_OPS_EXE.X87
+.Pq Event 10H , Umask 01H
+Counts the number of FP Computational Uops Executed. The number of FADD,
+FSUB, FCOM, FMULs, integer MULsand IMULs, FDIVs, FPREMs, FSQRTS, integer
+DIVs, and IDIVs. This event does not distinguish an FADD used in the middle
+of a transcendental flow from a separate FADD instruction.
+.It Li FP_COMP_OPS_EXE.MMX
+.Pq Event 10H , Umask 02H
+Counts number of MMX Uops executed.
+.It Li FP_COMP_OPS_EXE.SSE_FP
+.Pq Event 10H , Umask 04H
+Counts number of SSE and SSE2 FP uops executed.
+.It Li FP_COMP_OPS_EXE.SSE2_INTEGER
+.Pq Event 10H , Umask 08H
+Counts number of SSE2 integer uops executed.
+.It Li FP_COMP_OPS_EXE.SSE_FP_PACKED
+.Pq Event 10H , Umask 10H
+Counts number of SSE FP packed uops executed.
+.It Li FP_COMP_OPS_EXE.SSE_FP_SCALAR
+.Pq Event 10H , Umask 20H
+Counts number of SSE FP scalar uops executed.
+.It Li FP_COMP_OPS_EXE.SSE_SINGLE_PRECISION
+.Pq Event 10H , Umask 40H
+Counts number of SSE* FP single precision uops executed.
+.It Li FP_COMP_OPS_EXE.SSE_DOUBLE_PRECISION
+.Pq Event 10H , Umask 80H
+Counts number of SSE* FP double precision uops executed.
+.It Li SIMD_INT_128.PACKED_MPY
+.Pq Event 12H , Umask 01H
+Counts number of 128 bit SIMD integer multiply operations.
+.It Li SIMD_INT_128.PACKED_SHIFT
+.Pq Event 12H , Umask 02H
+Counts number of 128 bit SIMD integer shift operations.
+.It Li SIMD_INT_128.PACK
+.Pq Event 12H , Umask 04H
+Counts number of 128 bit SIMD integer pack operations.
+.It Li SIMD_INT_128.UNPACK
+.Pq Event 12H , Umask 08H
+Counts number of 128 bit SIMD integer unpack operations.
+.It Li SIMD_INT_128.PACKED_LOGICAL
+.Pq Event 12H , Umask 10H
+Counts number of 128 bit SIMD integer logical operations.
+.It Li SIMD_INT_128.PACKED_ARITH
+.Pq Event 12H , Umask 20H
+Counts number of 128 bit SIMD integer arithmetic operations.
+.It Li SIMD_INT_128.SHUFFLE_MOVE
+.Pq Event 12H , Umask 40H
+Counts number of 128 bit SIMD integer shuffle and move operations.
+.It Li LOAD_DISPATCH.RS
+.Pq Event 13H , Umask 01H
+Counts number of loads dispatched from the Reservation Station that bypass
+the Memory Order Buffer.
+.It Li LOAD_DISPATCH.RS_DELAYED
+.Pq Event 13H , Umask 02H
+Counts the number of delayed RS dispatches at the stage latch. If an RS
+dispatch can not bypass to LB, it has another chance to dispatch from the
+one-cycle delayed staging latch before it is written into the LB.
+.It Li LOAD_DISPATCH.MOB
+.Pq Event 13H , Umask 04H
+Counts the number of loads dispatched from the Reservation Station to the
+Memory Order Buffer.
+.It Li LOAD_DISPATCH.ANY
+.Pq Event 13H , Umask 07H
+Counts all loads dispatched from the Reservation Station.
+.It Li ARITH.CYCLES_DIV_BUSY
+.Pq Event 14H , Umask 01H
+Counts the number of cycles the divider is busy executing divide or square
+root operations. The divide can be integer, X87 or Streaming SIMD Extensions
+(SSE). The square root operation can be either X87 or SSE.
+Set 'edge =1, invert=1, cmask=1' to count the number of divides.
+Count may be incorrect When SMT is on.
+.It Li ARITH.MUL
+.Pq Event 14H , Umask 02H
+Counts the number of multiply operations executed. This includes integer as
+well as floating point multiply operations but excludes DPPS mul and MPSAD.
+Count may be incorrect When SMT is on
+.It Li INST_QUEUE_WRITES
+.Pq Event 17H , Umask 01H
+Counts the number of instructions written into the instruction queue every
+cycle.
+.It Li INST_DECODED.DEC0
+.Pq Event 18H , Umask 01H
+Counts number of instructions that require decoder 0 to be decoded. Usually,
+this means that the instruction maps to more than 1 uop
+.It Li TWO_UOP_INSTS_DECODED
+.Pq Event 19H , Umask 01H
+An instruction that generates two uops was decoded
+.It Li INST_QUEUE_WRITE_CYCLES
+.Pq Event 1EH , Umask 01H
+This event counts the number of cycles during which instructions are written
+to the instruction queue. Dividing this counter by the number of
+instructions written to the instruction queue (INST_QUEUE_WRITES) yields the
+average number of instructions decoded each cycle. If this number is less
+than four and the pipe stalls, this indicates that the decoder is failing to
+decode enough instructions per cycle to sustain the 4-wide pipeline.
+If SSE* instructions that are 6 bytes or longer arrive one after another,
+then front end throughput may limit execution speed. In such case,
+.It Li LSD_OVERFLOW
+.Pq Event 20H , Umask 01H
+Counts number of loops that cant stream from the instruction queue.
+.It Li L2_RQSTS.LD_HIT
+.Pq Event 24H , Umask 01H
+Counts number of loads that hit the L2 cache. L2 loads include both L1D
+demand misses as well as L1D prefetches. L2 loads can be rejected for
+various reasons. Only non rejected loads are counted.
+.It Li L2_RQSTS.LD_MISS
+.Pq Event 24H , Umask 02H
+Counts the number of loads that miss the L2 cache. L2 loads include both L1D
+demand misses as well as L1D prefetches.
+.It Li L2_RQSTS.LOADS
+.Pq Event 24H , Umask 03H
+Counts all L2 load requests. L2 loads include both L1D demand misses as well
+as L1D prefetches.
+.It Li L2_RQSTS.RFO_HIT
+.Pq Event 24H , Umask 04H
+Counts the number of store RFO requests that hit the L2 cache. L2 RFO
+requests include both L1D demand RFO misses as well as L1D RFO prefetches.
+Count includes WC memory requests, where the data is not fetched but the
+permission to write the line is required.
+.It Li L2_RQSTS.RFO_MISS
+.Pq Event 24H , Umask 08H
+Counts the number of store RFO requests that miss the L2 cache. L2 RFO
+requests include both L1D demand RFO misses as well as L1D RFO prefetches.
+.It Li L2_RQSTS.RFOS
+.Pq Event 24H , Umask 0CH
+Counts all L2 store RFO requests. L2 RFO requests include both L1D demand
+RFO misses as well as L1D RFO prefetches.
+.It Li L2_RQSTS.IFETCH_HIT
+.Pq Event 24H , Umask 10H
+Counts number of instruction fetches that hit the L2 cache. L2 instruction
+fetches include both L1I demand misses as well as L1I instruction
+prefetches.
+.It Li L2_RQSTS.IFETCH_MISS
+.Pq Event 24H , Umask 20H
+Counts number of instruction fetches that miss the L2 cache. L2 instruction
+fetches include both L1I demand misses as well as L1I instruction
+prefetches.
+.It Li L2_RQSTS.IFETCHES
+.Pq Event 24H , Umask 30H
+Counts all instruction fetches. L2 instruction fetches include both L1I
+demand misses as well as L1I instruction prefetches.
+.It Li L2_RQSTS.PREFETCH_HIT
+.Pq Event 24H , Umask 40H
+Counts L2 prefetch hits for both code and data.
+.It Li L2_RQSTS.PREFETCH_MISS
+.Pq Event 24H , Umask 80H
+Counts L2 prefetch misses for both code and data.
+.It Li L2_RQSTS.PREFETCHES
+.Pq Event 24H , Umask C0H
+Counts all L2 prefetches for both code and data.
+.It Li L2_RQSTS.MISS
+.Pq Event 24H , Umask AAH
+Counts all L2 misses for both code and data.
+.It Li L2_RQSTS.REFERENCES
+.Pq Event 24H , Umask FFH
+Counts all L2 requests for both code and data.
+.It Li L2_DATA_RQSTS.DEMAND.I_STATE
+.Pq Event 26H , Umask 01H
+Counts number of L2 data demand loads where the cache line to be loaded is
+in the I (invalid) state, i.e. a cache miss. L2 demand loads are both L1D
+demand misses and L1D prefetches.
+.It Li L2_DATA_RQSTS.DEMAND.S_STATE
+.Pq Event 26H , Umask 02H
+Counts number of L2 data demand loads where the cache line to be loaded is
+in the S (shared) state. L2 demand loads are both L1D demand misses and L1D
+prefetches.
+.It Li L2_DATA_RQSTS.DEMAND.E_STATE
+.Pq Event 26H , Umask 04H
+Counts number of L2 data demand loads where the cache line to be loaded is
+in the E (exclusive) state. L2 demand loads are both L1D demand misses and
+L1D prefetches.
+.It Li L2_DATA_RQSTS.DEMAND.M_STATE
+.Pq Event 26H , Umask 08H
+Counts number of L2 data demand loads where the cache line to be loaded is
+in the M (modified) state. L2 demand loads are both L1D demand misses and
+L1D prefetches.
+.It Li L2_DATA_RQSTS.DEMAND.MESI
+.Pq Event 26H , Umask 0FH
+Counts all L2 data demand requests. L2 demand loads are both L1D demand
+misses and L1D prefetches.
+.It Li L2_DATA_RQSTS.PREFETCH.I_STATE
+.Pq Event 26H , Umask 10H
+Counts number of L2 prefetch data loads where the cache line to be loaded is
+in the I (invalid) state, i.e. a cache miss.
+.It Li L2_DATA_RQSTS.PREFETCH.S_STATE
+.Pq Event 26H , Umask 20H
+Counts number of L2 prefetch data loads where the cache line to be loaded is
+in the S (shared) state. A prefetch RFO will miss on an S state line, while
+a prefetch read will hit on an S state line.
+.It Li L2_DATA_RQSTS.PREFETCH.E_STATE
+.Pq Event 26H , Umask 40H
+Counts number of L2 prefetch data loads where the cache line to be loaded is
+in the E (exclusive) state.
+.It Li L2_DATA_RQSTS.PREFETCH.M_STATE
+.Pq Event 26H , Umask 80H
+Counts number of L2 prefetch data loads where the cache line to be loaded is
+in the M (modified) state.
+.It Li L2_DATA_RQSTS.PREFETCH.MESI
+.Pq Event 26H , Umask F0H
+Counts all L2 prefetch requests.
+.It Li L2_DATA_RQSTS.ANY
+.Pq Event 26H , Umask FFH
+Counts all L2 data requests.
+.It Li L2_WRITE.RFO.I_STATE
+.Pq Event 27H , Umask 01H
+Counts number of L2 demand store RFO requests where the cache line to be
+loaded is in the I (invalid) state, i.e, a cache miss. The L1D prefetcher
+does not issue a RFO prefetch.
+This is a demand RFO request
+.It Li L2_WRITE.RFO.S_STATE
+.Pq Event 27H , Umask 02H
+Counts number of L2 store RFO requests where the cache line to be loaded is
+in the S (shared) state. The L1D prefetcher does not issue a RFO prefetch,.
+This is a demand RFO request
+.It Li L2_WRITE.RFO.M_STATE
+.Pq Event 27H , Umask 08H
+Counts number of L2 store RFO requests where the cache line to be loaded is
+in the M (modified) state. The L1D prefetcher does not issue a RFO prefetch.
+This is a demand RFO request
+.It Li L2_WRITE.RFO.HIT
+.Pq Event 27H , Umask 0EH
+Counts number of L2 store RFO requests where the cache line to be loaded is
+in either the S, E or M states. The L1D prefetcher does not issue a RFO
+prefetch.
+This is a demand RFO request
+.It Li L2_WRITE.RFO.MESI
+.Pq Event 27H , Umask 0FH
+Counts all L2 store RFO requests.The L1D prefetcher does not issue a RFO
+prefetch.
+This is a demand RFO request
+.It Li L2_WRITE.LOCK.I_STATE
+.Pq Event 27H , Umask 10H
+Counts number of L2 demand lock RFO requests where the cache line to be
+loaded is in the I (invalid) state, i.e. a cache miss.
+.It Li L2_WRITE.LOCK.S_STATE
+.Pq Event 27H , Umask 20H
+Counts number of L2 lock RFO requests where the cache line to be loaded is
+in the S (shared) state.
+.It Li L2_WRITE.LOCK.E_STATE
+.Pq Event 27H , Umask 40H
+Counts number of L2 demand lock RFO requests where the cache line to be
+loaded is in the E (exclusive) state.
+.It Li L2_WRITE.LOCK.M_STATE
+.Pq Event 27H , Umask 80H
+Counts number of L2 demand lock RFO requests where the cache line to be
+loaded is in the M (modified) state.
+.It Li L2_WRITE.LOCK.HIT
+.Pq Event 27H , Umask E0H
+Counts number of L2 demand lock RFO requests where the cache line to be
+loaded is in either the S, E, or M state.
+.It Li L2_WRITE.LOCK.MESI
+.Pq Event 27H , Umask F0H
+Counts all L2 demand lock RFO requests.
+.It Li L1D_WB_L2.I_STATE
+.Pq Event 28H , Umask 01H
+Counts number of L1 writebacks to the L2 where the cache line to be written
+is in the I (invalid) state, i.e. a cache miss.
+.It Li L1D_WB_L2.S_STATE
+.Pq Event 28H , Umask 02H
+Counts number of L1 writebacks to the L2 where the cache line to be written
+is in the S state.
+.It Li L1D_WB_L2.E_STATE
+.Pq Event 28H , Umask 04H
+Counts number of L1 writebacks to the L2 where the cache line to be written
+is in the E (exclusive) state.
+.It Li L1D_WB_L2.M_STATE
+.Pq Event 28H , Umask 08H
+Counts number of L1 writebacks to the L2 where the cache line to be written
+is in the M (modified) state.
+.It Li L1D_WB_L2.MESI
+.Pq Event 28H , Umask 0FH
+Counts all L1 writebacks to the L2.
+.It Li L3_LAT_CACHE.REFERENCE
+.Pq Event 2EH , Umask 4FH
+This event counts requests originating from the core that reference a cache
+line in the last level cache. The event count includes speculative traffic
+but excludes cache line fills due to a L2 hardware-prefetch. Because cache
+hierarchy, cache sizes and other implementation-specific characteristics;
+value comparison to estimate performance differences is not recommended.
+see Table A-1
+.It Li L3_LAT_CACHE.MISS
+.Pq Event 2EH , Umask 41H
+This event counts each cache miss condition for references to the last level
+cache. The event count may include speculative traffic but excludes cache
+line fills due to L2 hardware-prefetches. Because cache hierarchy, cache
+sizes and other implementation-specific characteristics; value comparison to
+estimate performance differences is not recommended.
+see Table A-1
+.It Li CPU_CLK_UNHALTED.THREAD_P
+.Pq Event 3CH , Umask 00H
+Counts the number of thread cycles while the thread is not in a halt state.
+The thread enters the halt state when it is running the HLT instruction. The
+core frequency may change from time to time due to power or thermal
+throttling.
+see Table A-1
+.It Li CPU_CLK_UNHALTED.REF_P
+.Pq Event 3CH , Umask 01H
+Increments at the frequency of TSC when not halted.
+see Table A-1
+.It Li L1D_CACHE_LD.I_STATE
+.Pq Event 40H , Umask 01H
+Counts L1 data cache read requests where the cache line to be loaded is in
+the I (invalid) state, i.e. the read request missed the cache.
+Counter 0, 1 only
+.It Li L1D_CACHE_LD.S_STATE
+.Pq Event 40H , Umask 02H
+Counts L1 data cache read requests where the cache line to be loaded is in
+the S (shared) state.
+Counter 0, 1 only
+.It Li L1D_CACHE_LD.E_STATE
+.Pq Event 40H , Umask 04H
+Counts L1 data cache read requests where the cache line to be loaded is in
+the E (exclusive) state.
+Counter 0, 1 only
+.It Li L1D_CACHE_LD.M_STATE
+.Pq Event 40H , Umask 08H
+Counts L1 data cache read requests where the cache line to be loaded is in
+the M (modified) state.
+Counter 0, 1 only
+.It Li L1D_CACHE_LD.MESI
+.Pq Event 40H , Umask 0FH
+Counts L1 data cache read requests.
+Counter 0, 1 only
+.It Li L1D_CACHE_ST.S_STATE
+.Pq Event 41H , Umask 02H
+Counts L1 data cache store RFO requests where the cache line to be loaded is
+in the S (shared) state.
+Counter 0, 1 only
+.It Li L1D_CACHE_ST.E_STATE
+.Pq Event 41H , Umask 04H
+Counts L1 data cache store RFO requests where the cache line to be loaded is
+in the E (exclusive) state.
+Counter 0, 1 only
+.It Li L1D_CACHE_ST.M_STATE
+.Pq Event 41H , Umask 08H
+Counts L1 data cache store RFO requests where cache line to be loaded is in
+the M (modified) state.
+Counter 0, 1 only
+.It Li L1D_CACHE_LOCK.HIT
+.Pq Event 42H , Umask 01H
+Counts retired load locks that hit in the L1 data cache or hit in an already
+allocated fill buffer.	The lock portion of the load lock transaction must
+hit in the L1D.
+The initial load will pull the lock into the L1 data cache. Counter 0, 1
+only
+.It Li L1D_CACHE_LOCK.S_STATE
+.Pq Event 42H , Umask 02H
+Counts L1 data cache retired load locks that hit the target cache line in
+the shared state.
+Counter 0, 1 only
+.It Li L1D_CACHE_LOCK.E_STATE
+.Pq Event 42H , Umask 04H
+Counts L1 data cache retired load locks that hit the target cache line in
+the exclusive state.
+Counter 0, 1 only
+.It Li L1D_CACHE_LOCK.M_STATE
+.Pq Event 42H , Umask 08H
+Counts L1 data cache retired load locks that hit the target cache line in
+the modified state.
+Counter 0, 1 only
+.It Li L1D_ALL_REF.ANY
+.Pq Event 43H , Umask 01H
+Counts all references (uncached, speculated and retired) to the L1 data
+cache, including all loads and stores with any memory types. The event
+counts memory accesses only when they are actually performed. For example, a
+load blocked by unknown store address and later performed is only counted
+once.
+The event does not include non- memory accesses, such as I/O accesses.
+Counter 0, 1 only
+.It Li L1D_ALL_REF.CACHEABLE
+.Pq Event 43H , Umask 02H
+Counts all data reads and writes (speculated and retired) from cacheable
+memory, including locked operations.
+Counter 0, 1 only
+.It Li L1D_PEND_MISS.LOAD_BUFFERS_FULL
+.Pq Event 48H , Umask 02H
+Counts cycles of L1 data cache load fill buffers full.
+Counter 0, 1 only
+.It Li DTLB_MISSES.ANY
+.Pq Event 49H , Umask 01H
+Counts the number of misses in the STLB which causes a page walk.
+.It Li DTLB_MISSES.WALK_COMPLETED
+.Pq Event 49H , Umask 02H
+Counts number of misses in the STLB which resulted in a completed page walk.
+.It Li DTLB_MISSES.STLB_HIT
+.Pq Event 49H , Umask 10H
+Counts the number of DTLB first level misses that hit in the second level
+TLB. This event is only relevant if the core contains multiple DTLB levels.
+.It Li LOAD_HIT_PRE
+.Pq Event 4CH , Umask 01H
+Counts load operations sent to the L1 data cache while a previous SSE
+prefetch instruction to the same cache line has started prefetching but has
+not yet finished.
+.It Li L1D_PREFETCH.REQUESTS
+.Pq Event 4EH , Umask 01H
+Counts number of hardware prefetch requests dispatched out of the prefetch
+FIFO.
+.It Li L1D_PREFETCH.MISS
+.Pq Event 4EH , Umask 02H
+Counts number of hardware prefetch requests that miss the L1D. There are two

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***