svn commit: r215133 - in head: lib/libkvm sys/amd64/amd64 sys/amd64/include

Andriy Gapon avg at FreeBSD.org
Thu Nov 11 18:35:29 UTC 2010


Author: avg
Date: Thu Nov 11 18:35:28 2010
New Revision: 215133
URL: http://svn.freebsd.org/changeset/base/215133

Log:
  amd64: introduce minidump version 2
  
  After KVA space was increased to 512GB on amd64 it became impractical
  to use PTEs as entries in the minidump map of dumped pages, because the
  size of that map alone would already be 1GB.
  Instead, we now use PDEs as page map entries and employ a two-stage lookup
  in libkvm: virtual address -> PDE -> PTE -> physical address.  PTEs are
  now dumped as regular pages.  The fixed page map size is now 2MB.
  
  libkvm keeps support for accessing amd64 minidumps of version 1.
  Support for 1GB pages is added.
  
  Many thanks to Alan Cox for his guidance, numerous reviews, suggestions,
  enhancements and corrections.
  
  Reviewed by:	alc [kernel part]
  MFC after:	15 days

Modified:
  head/lib/libkvm/kvm_minidump_amd64.c
  head/sys/amd64/amd64/minidump_machdep.c
  head/sys/amd64/include/minidump.h

Modified: head/lib/libkvm/kvm_minidump_amd64.c
==============================================================================
--- head/lib/libkvm/kvm_minidump_amd64.c	Thu Nov 11 18:08:50 2010	(r215132)
+++ head/lib/libkvm/kvm_minidump_amd64.c	Thu Nov 11 18:35:28 2010	(r215133)
@@ -67,7 +67,7 @@ struct vmstate {
 	struct minidumphdr hdr;
 	void *hpt_head[HPT_SIZE];
 	uint64_t *bitmap;
-	uint64_t *ptemap;
+	uint64_t *page_map;
 };
 
 static void
@@ -127,8 +127,8 @@ _kvm_minidump_freevtop(kvm_t *kd)
 
 	if (vm->bitmap)
 		free(vm->bitmap);
-	if (vm->ptemap)
-		free(vm->ptemap);
+	if (vm->page_map)
+		free(vm->page_map);
 	free(vm);
 	kd->vmst = NULL;
 }
@@ -156,7 +156,12 @@ _kvm_minidump_initvtop(kvm_t *kd)
 		_kvm_err(kd, kd->program, "not a minidump for this platform");
 		return (-1);
 	}
-	if (vmst->hdr.version != MINIDUMP_VERSION) {
+
+	/*
+	 * NB: amd64 minidump header is binary compatible between version 1
+	 * and version 2; this may not be the case for the future versions.
+	 */
+	if (vmst->hdr.version != MINIDUMP_VERSION && vmst->hdr.version != 1) {
 		_kvm_err(kd, kd->program, "wrong minidump version. expected %d got %d",
 		    MINIDUMP_VERSION, vmst->hdr.version);
 		return (-1);
@@ -177,17 +182,17 @@ _kvm_minidump_initvtop(kvm_t *kd)
 	}
 	off += round_page(vmst->hdr.bitmapsize);
 
-	vmst->ptemap = _kvm_malloc(kd, vmst->hdr.ptesize);
-	if (vmst->ptemap == NULL) {
-		_kvm_err(kd, kd->program, "cannot allocate %d bytes for ptemap", vmst->hdr.ptesize);
+	vmst->page_map = _kvm_malloc(kd, vmst->hdr.pmapsize);
+	if (vmst->page_map == NULL) {
+		_kvm_err(kd, kd->program, "cannot allocate %d bytes for page_map", vmst->hdr.pmapsize);
 		return (-1);
 	}
-	if (pread(kd->pmfd, vmst->ptemap, vmst->hdr.ptesize, off) !=
-	    vmst->hdr.ptesize) {
-		_kvm_err(kd, kd->program, "cannot read %d bytes for ptemap", vmst->hdr.ptesize);
+	if (pread(kd->pmfd, vmst->page_map, vmst->hdr.pmapsize, off) !=
+	    vmst->hdr.pmapsize) {
+		_kvm_err(kd, kd->program, "cannot read %d bytes for page_map", vmst->hdr.pmapsize);
 		return (-1);
 	}
-	off += vmst->hdr.ptesize;
+	off += vmst->hdr.pmapsize;
 
 	/* build physical address hash table for sparse pages */
 	inithash(kd, vmst->bitmap, vmst->hdr.bitmapsize, off);
@@ -196,7 +201,7 @@ _kvm_minidump_initvtop(kvm_t *kd)
 }
 
 static int
-_kvm_minidump_vatop(kvm_t *kd, u_long va, off_t *pa)
+_kvm_minidump_vatop_v1(kvm_t *kd, u_long va, off_t *pa)
 {
 	struct vmstate *vm;
 	u_long offset;
@@ -211,7 +216,7 @@ _kvm_minidump_vatop(kvm_t *kd, u_long va
 
 	if (va >= vm->hdr.kernbase) {
 		pteindex = (va - vm->hdr.kernbase) >> PAGE_SHIFT;
-		pte = vm->ptemap[pteindex];
+		pte = vm->page_map[pteindex];
 		if (((u_long)pte & PG_V) == 0) {
 			_kvm_err(kd, kd->program, "_kvm_vatop: pte not valid");
 			goto invalid;
@@ -243,6 +248,78 @@ invalid:
 	return (0);
 }
 
+static int
+_kvm_minidump_vatop(kvm_t *kd, u_long va, off_t *pa)
+{
+	pt_entry_t pt[NPTEPG];
+	struct vmstate *vm;
+	u_long offset;
+	pd_entry_t pde;
+	pd_entry_t pte;
+	u_long pteindex;
+	u_long pdeindex;
+	int i;
+	u_long a;
+	off_t ofs;
+
+	vm = kd->vmst;
+	offset = va & PAGE_MASK;
+
+	if (va >= vm->hdr.kernbase) {
+		pdeindex = (va - vm->hdr.kernbase) >> PDRSHIFT;
+		pde = vm->page_map[pdeindex];
+		if (((u_long)pde & PG_V) == 0) {
+			_kvm_err(kd, kd->program, "_kvm_vatop: pde not valid");
+			goto invalid;
+		}
+		if ((pde & PG_PS) == 0) {
+			a = pde & PG_FRAME;
+			ofs = hpt_find(kd, a);
+			if (ofs == -1) {
+				_kvm_err(kd, kd->program, "_kvm_vatop: pt physical address 0x%lx not in minidump", a);
+				goto invalid;
+			}
+			if (pread(kd->pmfd, &pt, PAGE_SIZE, ofs) != PAGE_SIZE) {
+				_kvm_err(kd, kd->program, "cannot read %d bytes for pt", PAGE_SIZE);
+				return (-1);
+			}
+			pteindex = (va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1);
+			pte = pt[pteindex];
+			if (((u_long)pte & PG_V) == 0) {
+				_kvm_err(kd, kd->program, "_kvm_vatop: pte not valid");
+				goto invalid;
+			}
+			a = pte & PG_FRAME;
+		} else {
+			a = pde & PG_PS_FRAME;
+			a += (va & PDRMASK) ^ offset;
+		}
+		ofs = hpt_find(kd, a);
+		if (ofs == -1) {
+			_kvm_err(kd, kd->program, "_kvm_vatop: physical address 0x%lx not in minidump", a);
+			goto invalid;
+		}
+		*pa = ofs + offset;
+		return (PAGE_SIZE - offset);
+	} else if (va >= vm->hdr.dmapbase && va < vm->hdr.dmapend) {
+		a = (va - vm->hdr.dmapbase) & ~PAGE_MASK;
+		ofs = hpt_find(kd, a);
+		if (ofs == -1) {
+			_kvm_err(kd, kd->program, "_kvm_vatop: direct map address 0x%lx not in minidump", va);
+			goto invalid;
+		}
+		*pa = ofs + offset;
+		return (PAGE_SIZE - offset);
+	} else {
+		_kvm_err(kd, kd->program, "_kvm_vatop: virtual address 0x%lx not minidumped", va);
+		goto invalid;
+	}
+
+invalid:
+	_kvm_err(kd, 0, "invalid address (0x%lx)", va);
+	return (0);
+}
+
 int
 _kvm_minidump_kvatop(kvm_t *kd, u_long va, off_t *pa)
 {
@@ -251,5 +328,8 @@ _kvm_minidump_kvatop(kvm_t *kd, u_long v
 		_kvm_err(kd, 0, "kvm_kvatop called in live kernel!");
 		return (0);
 	}
-	return (_kvm_minidump_vatop(kd, va, pa));
+	if (((struct vmstate *)kd->vmst)->hdr.version == 1)
+		return (_kvm_minidump_vatop_v1(kd, va, pa));
+	else
+		return (_kvm_minidump_vatop(kd, va, pa));
 }

Modified: head/sys/amd64/amd64/minidump_machdep.c
==============================================================================
--- head/sys/amd64/amd64/minidump_machdep.c	Thu Nov 11 18:08:50 2010	(r215132)
+++ head/sys/amd64/amd64/minidump_machdep.c	Thu Nov 11 18:35:28 2010	(r215133)
@@ -167,63 +167,91 @@ blk_write(struct dumperinfo *di, char *p
 }
 
 /* A fake page table page, to avoid having to handle both 4K and 2M pages */
-static pt_entry_t fakept[NPTEPG];
+static pd_entry_t fakepd[NPDEPG];
 
 void
 minidumpsys(struct dumperinfo *di)
 {
 	uint64_t dumpsize;
-	uint32_t ptesize;
+	uint32_t pmapsize;
 	vm_offset_t va;
 	int error;
 	uint64_t bits;
 	uint64_t *pdp, *pd, *pt, pa;
-	int i, j, k, bit;
+	int i, j, k, n, bit;
+	int retry_count;
 	struct minidumphdr mdhdr;
 
+	retry_count = 0;
+ retry:
+	retry_count++;
 	counter = 0;
 	/* Walk page table pages, set bits in vm_page_dump */
-	ptesize = 0;
+	pmapsize = 0;
 	pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
 	for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + NKPT * NBPDR,
-	    kernel_vm_end); va += NBPDR) {
-		i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
+	    kernel_vm_end); ) {
 		/*
 		 * We always write a page, even if it is zero. Each
-		 * page written corresponds to 2MB of space
+		 * page written corresponds to 1GB of space
 		 */
-		ptesize += PAGE_SIZE;
-		if ((pdp[i] & PG_V) == 0)
+		pmapsize += PAGE_SIZE;
+		i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
+		if ((pdp[i] & PG_V) == 0) {
+			va += NBPDP;
 			continue;
-		pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
-		j = ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
-		if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V))  {
-			/* This is an entire 2M page. */
-			pa = pd[j] & PG_PS_FRAME;
-			for (k = 0; k < NPTEPG; k++) {
+		}
+
+		/*
+		 * 1GB page is represented as 512 2MB pages in a dump.
+		 */
+		if ((pdp[i] & PG_PS) != 0) {
+			va += NBPDP;
+			pa = pdp[i] & PG_PS_FRAME;
+			for (n = 0; n < NPDEPG * NPTEPG; n++) {
 				if (is_dumpable(pa))
 					dump_add_page(pa);
 				pa += PAGE_SIZE;
 			}
 			continue;
 		}
-		if ((pd[j] & PG_V) == PG_V) {
-			/* set bit for each valid page in this 2MB block */
-			pt = (uint64_t *)PHYS_TO_DMAP(pd[j] & PG_FRAME);
-			for (k = 0; k < NPTEPG; k++) {
-				if ((pt[k] & PG_V) == PG_V) {
-					pa = pt[k] & PG_FRAME;
+
+		pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
+		for (n = 0; n < NPDEPG; n++, va += NBPDR) {
+			j = (va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1);
+
+			if ((pd[j] & PG_V) == 0)
+				continue;
+
+			if ((pd[j] & PG_PS) != 0) {
+				/* This is an entire 2M page. */
+				pa = pd[j] & PG_PS_FRAME;
+				for (k = 0; k < NPTEPG; k++) {
 					if (is_dumpable(pa))
 						dump_add_page(pa);
+					pa += PAGE_SIZE;
 				}
+				continue;
+			}
+
+			pa = pd[j] & PG_FRAME;
+			/* set bit for this PTE page */
+			if (is_dumpable(pa))
+				dump_add_page(pa);
+			/* and for each valid page in this 2MB block */
+			pt = (uint64_t *)PHYS_TO_DMAP(pd[j] & PG_FRAME);
+			for (k = 0; k < NPTEPG; k++) {
+				if ((pt[k] & PG_V) == 0)
+					continue;
+				pa = pt[k] & PG_FRAME;
+				if (is_dumpable(pa))
+					dump_add_page(pa);
 			}
-		} else {
-			/* nothing, we're going to dump a null page */
 		}
 	}
 
 	/* Calculate dump size. */
-	dumpsize = ptesize;
+	dumpsize = pmapsize;
 	dumpsize += round_page(msgbufp->msg_size);
 	dumpsize += round_page(vm_page_dump_size);
 	for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) {
@@ -244,7 +272,7 @@ minidumpsys(struct dumperinfo *di)
 
 	/* Determine dump offset on device. */
 	if (di->mediasize < SIZEOF_METADATA + dumpsize + sizeof(kdh) * 2) {
-		error = ENOSPC;
+		error = E2BIG;
 		goto fail;
 	}
 	dumplo = di->mediaoffset + di->mediasize - dumpsize;
@@ -257,7 +285,7 @@ minidumpsys(struct dumperinfo *di)
 	mdhdr.version = MINIDUMP_VERSION;
 	mdhdr.msgbufsize = msgbufp->msg_size;
 	mdhdr.bitmapsize = vm_page_dump_size;
-	mdhdr.ptesize = ptesize;
+	mdhdr.pmapsize = pmapsize;
 	mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS;
 	mdhdr.dmapbase = DMAP_MIN_ADDRESS;
 	mdhdr.dmapend = DMAP_MAX_ADDRESS;
@@ -274,9 +302,9 @@ minidumpsys(struct dumperinfo *di)
 	dumplo += sizeof(kdh);
 
 	/* Dump my header */
-	bzero(&fakept, sizeof(fakept));
-	bcopy(&mdhdr, &fakept, sizeof(mdhdr));
-	error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
+	bzero(&fakepd, sizeof(fakepd));
+	bcopy(&mdhdr, &fakepd, sizeof(mdhdr));
+	error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
 	if (error)
 		goto fail;
 
@@ -290,55 +318,49 @@ minidumpsys(struct dumperinfo *di)
 	if (error)
 		goto fail;
 
-	/* Dump kernel page table pages */
+	/* Dump kernel page directory pages */
+	bzero(fakepd, sizeof(fakepd));
 	pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
 	for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + NKPT * NBPDR,
-	    kernel_vm_end); va += NBPDR) {
+	    kernel_vm_end); va += NBPDP) {
 		i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
+
 		/* We always write a page, even if it is zero */
 		if ((pdp[i] & PG_V) == 0) {
-			bzero(fakept, sizeof(fakept));
-			error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
+			error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
 			if (error)
 				goto fail;
-			/* flush, in case we reuse fakept in the same block */
+			/* flush, in case we reuse fakepd in the same block */
 			error = blk_flush(di);
 			if (error)
 				goto fail;
 			continue;
 		}
-		pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
-		j = ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
-		if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V))  {
-			/* This is a single 2M block. Generate a fake PTP */
-			pa = pd[j] & PG_PS_FRAME;
-			for (k = 0; k < NPTEPG; k++) {
-				fakept[k] = (pa + (k * PAGE_SIZE)) | PG_V | PG_RW | PG_A | PG_M;
-			}
-			error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
+
+		/* 1GB page is represented as 512 2MB pages in a dump */
+		if ((pdp[i] & PG_PS) != 0) {
+			/* PDPE and PDP have identical layout in this case */
+			fakepd[0] = pdp[i];
+			for (j = 1; j < NPDEPG; j++)
+				fakepd[j] = fakepd[j - 1] + NBPDR;
+			error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
 			if (error)
 				goto fail;
-			/* flush, in case we reuse fakept in the same block */
+			/* flush, in case we reuse fakepd in the same block */
 			error = blk_flush(di);
 			if (error)
 				goto fail;
+			bzero(fakepd, sizeof(fakepd));
 			continue;
 		}
-		if ((pd[j] & PG_V) == PG_V) {
-			pt = (uint64_t *)PHYS_TO_DMAP(pd[j] & PG_FRAME);
-			error = blk_write(di, (char *)pt, 0, PAGE_SIZE);
-			if (error)
-				goto fail;
-		} else {
-			bzero(fakept, sizeof(fakept));
-			error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
-			if (error)
-				goto fail;
-			/* flush, in case we reuse fakept in the same block */
-			error = blk_flush(di);
-			if (error)
-				goto fail;
-		}
+
+		pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
+		error = blk_write(di, (char *)pd, 0, PAGE_SIZE);
+		if (error)
+			goto fail;
+		error = blk_flush(di);
+		if (error)
+			goto fail;
 	}
 
 	/* Dump memory chunks */
@@ -374,12 +396,21 @@ minidumpsys(struct dumperinfo *di)
 	if (error < 0)
 		error = -error;
 
-	if (error == ECANCELED)
-		printf("\nDump aborted\n");
-	else if (error == ENOSPC)
-		printf("\nDump failed. Partition too small.\n");
+	printf("\n");
+	if (error == ENOSPC) {
+		printf("Dump map grown while dumping. ");
+		if (retry_count < 5) {
+			printf("Retrying...\n");
+			goto retry;
+		}
+		printf("Dump failed.\n");
+	}
+	else if (error == ECANCELED)
+		printf("Dump aborted\n");
+	else if (error == E2BIG)
+		printf("Dump failed. Partition too small.\n");
 	else
-		printf("\n** DUMP FAILED (ERROR %d) **\n", error);
+		printf("** DUMP FAILED (ERROR %d) **\n", error);
 }
 
 void

Modified: head/sys/amd64/include/minidump.h
==============================================================================
--- head/sys/amd64/include/minidump.h	Thu Nov 11 18:08:50 2010	(r215132)
+++ head/sys/amd64/include/minidump.h	Thu Nov 11 18:35:28 2010	(r215133)
@@ -30,14 +30,14 @@
 #define	_MACHINE_MINIDUMP_H_ 1
 
 #define	MINIDUMP_MAGIC		"minidump FreeBSD/amd64"
-#define	MINIDUMP_VERSION	1
+#define	MINIDUMP_VERSION	2
 
 struct minidumphdr {
 	char magic[24];
 	uint32_t version;
 	uint32_t msgbufsize;
 	uint32_t bitmapsize;
-	uint32_t ptesize;
+	uint32_t pmapsize;
 	uint64_t kernbase;
 	uint64_t dmapbase;
 	uint64_t dmapend;


More information about the svn-src-head mailing list