git: 772430dd6795 - main - Ensure I/O buffers in libufs(3) are 128-byte aligned.
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Fri, 17 Nov 2023 22:11:56 UTC
The branch main has been updated by mckusick:
URL: https://cgit.FreeBSD.org/src/commit/?id=772430dd67955850942d689714ab982da24257ba
commit 772430dd67955850942d689714ab982da24257ba
Author: Kirk McKusick <mckusick@FreeBSD.org>
AuthorDate: 2023-11-17 22:10:29 +0000
Commit: Kirk McKusick <mckusick@FreeBSD.org>
CommitDate: 2023-11-17 22:11:24 +0000
Ensure I/O buffers in libufs(3) are 128-byte aligned.
Various disk controllers require their buffers to be aligned to a
cache-line size (128 bytes). For buffers allocated in structures,
ensure that they are 128-byte aligned. Use aligned_alloc(3) to allocate
memory to ensure that the returned memory is 128-byte aligned.
While we are here, we replace the dynamically allocated inode buffer
with a buffer allocated in the uufsd structure just as the superblock
and cylinder group buffers do.
This can be removed if/when the kernel is fixed. Because this problem
has existed on one I/O subsystem or another since the 1990's, we
are probably stuck with dealing with it forever.
The problem most recently showed up in Azure, see:
https://reviews.freebsd.org/D41728
https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=267654
Before the fixes referenced above were applied, it was confirmed that the
changes in this commit also fixed the issue in Azure.
Reviewed-by: Warner Losh, kib
Tested-by: Souradeep Chakrabarti of Microsoft (earlier version)
PR: 267654
Differential Revision: https://reviews.freebsd.org/D41724
---
lib/libufs/Makefile | 2 +-
lib/libufs/block.c | 45 +++++++++++------------------------
lib/libufs/inode.c | 16 ++-----------
lib/libufs/libufs.h | 57 ++++++++++++++++++++++++++++++---------------
lib/libufs/sblock.c | 3 ++-
lib/libufs/type.c | 12 ++++++----
lib/libufs/ufs_disk_close.3 | 9 ++++++-
sbin/fsck_ffs/fsck.h | 15 ++++++++++++
sbin/fsck_ffs/fsutil.c | 11 ++++-----
sbin/fsck_ffs/inode.c | 3 +--
sbin/fsck_ffs/main.c | 1 -
sbin/fsck_ffs/pass5.c | 1 -
sbin/fsck_ffs/setup.c | 11 ++++-----
sbin/fsck_ffs/suj.c | 3 +--
14 files changed, 98 insertions(+), 91 deletions(-)
diff --git a/lib/libufs/Makefile b/lib/libufs/Makefile
index e37a285bb509..42d76c6061d2 100644
--- a/lib/libufs/Makefile
+++ b/lib/libufs/Makefile
@@ -2,7 +2,7 @@
PACKAGE= ufs
LIB= ufs
SHLIBDIR?= /lib
-SHLIB_MAJOR= 7
+SHLIB_MAJOR= 8
SRCS= block.c cgroup.c gsb_crc32.c inode.c sblock.c type.c ffs_subr.c
SRCS+= ffs_tables.c
diff --git a/lib/libufs/block.c b/lib/libufs/block.c
index 20002fc4df1b..e15216aefa8d 100644
--- a/lib/libufs/block.c
+++ b/lib/libufs/block.c
@@ -57,19 +57,10 @@ bread(struct uufsd *disk, ufs2_daddr_t blockno, void *data, size_t size)
ERROR(disk, NULL);
- p2 = data;
- /*
- * XXX: various disk controllers require alignment of our buffer
- * XXX: which is stricter than struct alignment.
- * XXX: Bounce the buffer if not 64 byte aligned.
- * XXX: this can be removed if/when the kernel is fixed
- */
- if (((intptr_t)data) & 0x3f) {
- p2 = malloc(size);
- if (p2 == NULL) {
- ERROR(disk, "allocate bounce buffer");
- goto fail;
- }
+ BUF_MALLOC(&p2, data, size);
+ if (p2 == NULL) {
+ ERROR(disk, "allocate bounce buffer");
+ goto fail;
}
cnt = pread(disk->d_fd, p2, size, (off_t)(blockno * disk->d_bsize));
if (cnt == -1) {
@@ -101,7 +92,7 @@ bwrite(struct uufsd *disk, ufs2_daddr_t blockno, const void *data, size_t size)
{
ssize_t cnt;
int rv;
- void *p2 = NULL;
+ void *p2;
ERROR(disk, NULL);
@@ -110,24 +101,15 @@ bwrite(struct uufsd *disk, ufs2_daddr_t blockno, const void *data, size_t size)
ERROR(disk, "failed to open disk for writing");
return (-1);
}
-
- /*
- * XXX: various disk controllers require alignment of our buffer
- * XXX: which is stricter than struct alignment.
- * XXX: Bounce the buffer if not 64 byte aligned.
- * XXX: this can be removed if/when the kernel is fixed
- */
- if (((intptr_t)data) & 0x3f) {
- p2 = malloc(size);
- if (p2 == NULL) {
- ERROR(disk, "allocate bounce buffer");
- return (-1);
- }
- memcpy(p2, data, size);
- data = p2;
+ BUF_MALLOC(&p2, data, size);
+ if (p2 == NULL) {
+ ERROR(disk, "allocate bounce buffer");
+ return (-1);
}
- cnt = pwrite(disk->d_fd, data, size, (off_t)(blockno * disk->d_bsize));
- if (p2 != NULL)
+ if (p2 != data)
+ memcpy(p2, data, size);
+ cnt = pwrite(disk->d_fd, p2, size, (off_t)(blockno * disk->d_bsize));
+ if (p2 != data)
free(p2);
if (cnt == -1) {
ERROR(disk, "write error to block device");
@@ -137,7 +119,6 @@ bwrite(struct uufsd *disk, ufs2_daddr_t blockno, const void *data, size_t size)
ERROR(disk, "short write to block device");
return (-1);
}
-
return (cnt);
}
diff --git a/lib/libufs/inode.c b/lib/libufs/inode.c
index fe34fd45b815..3865e50591de 100644
--- a/lib/libufs/inode.c
+++ b/lib/libufs/inode.c
@@ -62,18 +62,10 @@ getinode(struct uufsd *disk, union dinodep *dp, ino_t inum)
ERROR(disk, "inode number out of range");
return (-1);
}
- inoblock = disk->d_inoblock;
+ inoblock = (caddr_t)&disk->d_inos[0];
min = disk->d_inomin;
max = disk->d_inomax;
- if (inoblock == NULL) {
- inoblock = malloc(fs->fs_bsize);
- if (inoblock == NULL) {
- ERROR(disk, "unable to allocate inode block");
- return (-1);
- }
- disk->d_inoblock = inoblock;
- }
if (inum >= min && inum < max)
goto gotit;
bread(disk, fsbtodb(fs, ino_to_fsba(fs, inum)), inoblock,
@@ -107,14 +99,10 @@ putinode(struct uufsd *disk)
struct fs *fs;
fs = &disk->d_fs;
- if (disk->d_inoblock == NULL) {
- ERROR(disk, "No inode block allocated");
- return (-1);
- }
if (disk->d_ufs == 2)
ffs_update_dinode_ckhash(fs, disk->d_dp.dp2);
if (bwrite(disk, fsbtodb(fs, ino_to_fsba(&disk->d_fs, disk->d_inomin)),
- disk->d_inoblock, disk->d_fs.fs_bsize) <= 0)
+ (caddr_t)&disk->d_inos[0], disk->d_fs.fs_bsize) <= 0)
return (-1);
return (0);
}
diff --git a/lib/libufs/libufs.h b/lib/libufs/libufs.h
index 4c6242e9daef..45ac97f43c06 100644
--- a/lib/libufs/libufs.h
+++ b/lib/libufs/libufs.h
@@ -30,6 +30,13 @@
#ifndef __LIBUFS_H__
#define __LIBUFS_H__
+/*
+ * Various disk controllers require their buffers to be aligned to the size
+ * of a cache line. The LIBUFS_BUFALIGN defines the required alignment size.
+ * The alignment must be a power of 2.
+ */
+#define LIBUFS_BUFALIGN 128
+
/*
* libufs structures.
*/
@@ -42,39 +49,51 @@ union dinodep {
* userland ufs disk.
*/
struct uufsd {
- const char *d_name; /* disk name */
- int d_ufs; /* decimal UFS version */
- int d_fd; /* raw device file descriptor */
- long d_bsize; /* device bsize */
- ufs2_daddr_t d_sblock; /* superblock location */
- struct fs_summary_info *d_si; /* Superblock summary info */
- caddr_t d_inoblock; /* inode block */
- uint32_t d_inomin; /* low ino, not ino_t for ABI compat */
- uint32_t d_inomax; /* high ino, not ino_t for ABI compat */
- union dinodep d_dp; /* pointer to currently active inode */
union {
struct fs d_fs; /* filesystem information */
- char d_sb[MAXBSIZE]; /* superblock as buffer */
- } d_sbunion;
+ char d_sb[SBLOCKSIZE]; /* superblock as buffer */
+ } d_sbunion __aligned(LIBUFS_BUFALIGN);
union {
struct cg d_cg; /* cylinder group */
char d_buf[MAXBSIZE]; /* cylinder group storage */
- } d_cgunion;
- int d_ccg; /* current cylinder group */
- int d_lcg; /* last cylinder group (in d_cg) */
+ } d_cgunion __aligned(LIBUFS_BUFALIGN);
+ union {
+ union dinodep d_ino[1]; /* inode block */
+ char d_inos[MAXBSIZE]; /* inode block as buffer */
+ } d_inosunion __aligned(LIBUFS_BUFALIGN);
+ const char *d_name; /* disk name */
const char *d_error; /* human readable disk error */
+ ufs2_daddr_t d_sblock; /* superblock location */
+ struct fs_summary_info *d_si; /* Superblock summary info */
+ union dinodep d_dp; /* pointer to currently active inode */
+ ino_t d_inomin; /* low ino */
+ ino_t d_inomax; /* high ino */
off_t d_sblockloc; /* where to look for the superblock */
- int d_lookupflags; /* flags to superblock lookup */
- int d_mine; /* internal flags */
+ int64_t d_bsize; /* device bsize */
+ int64_t d_lookupflags; /* flags to superblock lookup */
+ int64_t d_mine; /* internal flags */
+ int32_t d_ccg; /* current cylinder group */
+ int32_t d_ufs; /* decimal UFS version */
+ int32_t d_fd; /* raw device file descriptor */
+ int32_t d_lcg; /* last cylinder group (in d_cg) */
+};
+#define d_inos d_inosunion.d_inos
#define d_fs d_sbunion.d_fs
-#define d_sb d_sbunion.d_sb
#define d_cg d_cgunion.d_cg
-};
/*
* libufs macros (internal, non-exported).
*/
#ifdef _LIBUFS
+/*
+ * Ensure that the buffer is aligned to the I/O subsystem requirements.
+ */
+#define BUF_MALLOC(newbufpp, data, size) { \
+ if (data != NULL && (((intptr_t)data) & (LIBUFS_BUFALIGN - 1)) == 0) \
+ *newbufpp = (void *)data; \
+ else \
+ *newbufpp = aligned_alloc(LIBUFS_BUFALIGN, size); \
+}
/*
* Trace steps through libufs, to be used at entry and erroneous return.
*/
diff --git a/lib/libufs/sblock.c b/lib/libufs/sblock.c
index 9f1d8a2485bd..59cd44de04ab 100644
--- a/lib/libufs/sblock.c
+++ b/lib/libufs/sblock.c
@@ -228,7 +228,8 @@ use_pread(void *devfd, off_t loc, void **bufp, int size)
int fd;
fd = *(int *)devfd;
- if ((*bufp = malloc(size)) == NULL)
+ BUF_MALLOC(bufp, NULL, size);
+ if (*bufp == NULL)
return (ENOSPC);
if (pread(fd, *bufp, size, loc) != size)
return (EIO);
diff --git a/lib/libufs/type.c b/lib/libufs/type.c
index 1d0c4c0200eb..99aec0a57e9a 100644
--- a/lib/libufs/type.c
+++ b/lib/libufs/type.c
@@ -61,10 +61,6 @@ ufs_disk_close(struct uufsd *disk)
ERROR(disk, NULL);
close(disk->d_fd);
disk->d_fd = -1;
- if (disk->d_inoblock != NULL) {
- free(disk->d_inoblock);
- disk->d_inoblock = NULL;
- }
if (disk->d_mine & MINE_NAME) {
free((char *)(uintptr_t)disk->d_name);
disk->d_name = NULL;
@@ -155,10 +151,16 @@ again: if ((ret = stat(name, &st)) < 0) {
return (-1);
}
+ if (((uintptr_t)disk & ~(LIBUFS_BUFALIGN - 1)) != (uintptr_t)disk) {
+ ERROR(disk, "uufsd structure must be aligned to "
+ "LIBUFS_BUFALIGN byte boundry, see ufs_disk_fillout(3)");
+ close(fd);
+ return (-1);
+ }
+
disk->d_bsize = 1;
disk->d_ccg = 0;
disk->d_fd = fd;
- disk->d_inoblock = NULL;
disk->d_inomin = 0;
disk->d_inomax = 0;
disk->d_lcg = 0;
diff --git a/lib/libufs/ufs_disk_close.3 b/lib/libufs/ufs_disk_close.3
index d0d93d05838c..f332a9bb5de9 100644
--- a/lib/libufs/ufs_disk_close.3
+++ b/lib/libufs/ufs_disk_close.3
@@ -9,7 +9,7 @@
.\"
.\" This file is in the public domain.
.\"
-.Dd June 4, 2003
+.Dd November 17, 2023
.Dt UFS_DISK_CLOSE 3
.Os
.Sh NAME
@@ -51,6 +51,13 @@ functions open a disk specified by
.Fa name
and populate the structure pointed to by
.Fa disk .
+The structure referenced by the
+.Fa disk
+pointer must be aligned to at least the alignment specified by
+.Dv LIBUFS_BUFALIGN
+that is defined in the
+.In libufs.h
+header file.
The disk is opened read-only.
The specified
.Fa name
diff --git a/sbin/fsck_ffs/fsck.h b/sbin/fsck_ffs/fsck.h
index 3b795783f39c..827336a77d67 100644
--- a/sbin/fsck_ffs/fsck.h
+++ b/sbin/fsck_ffs/fsck.h
@@ -67,6 +67,7 @@
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
+#include <libufs.h>
#include <sys/queue.h>
@@ -424,6 +425,20 @@ Malloc(size_t size)
break;
return (retval);
}
+/*
+ * Allocate a block of memory to be used as an I/O buffer.
+ * Ensure that the buffer is aligned to the I/O subsystem requirements.
+ */
+static inline void*
+Balloc(size_t size)
+{
+ void *retval;
+
+ while ((retval = aligned_alloc(LIBUFS_BUFALIGN, size)) == NULL)
+ if (flushentry() == 0)
+ break;
+ return (retval);
+}
/*
* Wrapper for calloc() that flushes the cylinder group cache to try
diff --git a/sbin/fsck_ffs/fsutil.c b/sbin/fsck_ffs/fsutil.c
index 2583e324e94c..05f83789236e 100644
--- a/sbin/fsck_ffs/fsutil.c
+++ b/sbin/fsck_ffs/fsutil.c
@@ -58,7 +58,6 @@ static const char sccsid[] = "@(#)utilities.c 8.6 (Berkeley) 5/19/95";
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
-#include <libufs.h>
#include "fsck.h"
@@ -189,7 +188,7 @@ bufinit(void)
initbarea(&failedbuf, BT_UNKNOWN);
failedbuf.b_errs = -1;
failedbuf.b_un.b_buf = NULL;
- if ((cgblk.b_un.b_buf = Malloc((unsigned int)sblock.fs_bsize)) == NULL)
+ if ((cgblk.b_un.b_buf = Balloc((unsigned int)sblock.fs_bsize)) == NULL)
errx(EEXIT, "Initial malloc(%d) failed", sblock.fs_bsize);
initbarea(&cgblk, BT_CYLGRP);
numbufs = cachelookups = cachereads = 0;
@@ -211,7 +210,7 @@ allocbuf(const char *failreason)
char *bufp;
bp = (struct bufarea *)Malloc(sizeof(struct bufarea));
- bufp = Malloc((unsigned int)sblock.fs_bsize);
+ bufp = Balloc((unsigned int)sblock.fs_bsize);
if (bp == NULL || bufp == NULL) {
errx(EEXIT, "%s", failreason);
/* NOTREACHED */
@@ -241,7 +240,7 @@ cglookup(int cg)
if ((unsigned) cg >= sblock.fs_ncg)
errx(EEXIT, "cglookup: out of range cylinder group %d", cg);
if (cgbufs == NULL) {
- cgbufs = calloc(sblock.fs_ncg, sizeof(struct bufarea));
+ cgbufs = Calloc(sblock.fs_ncg, sizeof(struct bufarea));
if (cgbufs == NULL)
errx(EEXIT, "Cannot allocate cylinder group buffers");
}
@@ -250,7 +249,7 @@ cglookup(int cg)
return (cgbp);
cgp = NULL;
if (flushtries == 0)
- cgp = Malloc((unsigned int)sblock.fs_cgsize);
+ cgp = Balloc((unsigned int)sblock.fs_cgsize);
if (cgp == NULL) {
if (sujrecovery)
errx(EEXIT,"Ran out of memory during journal recovery");
@@ -966,7 +965,7 @@ blzero(int fd, ufs2_daddr_t blk, long size)
if (fd < 0)
return;
if (zero == NULL) {
- zero = calloc(ZEROBUFSIZE, 1);
+ zero = Balloc(ZEROBUFSIZE);
if (zero == NULL)
errx(EEXIT, "cannot allocate buffer pool");
}
diff --git a/sbin/fsck_ffs/inode.c b/sbin/fsck_ffs/inode.c
index 3db8a5e5c23d..e4349ff97088 100644
--- a/sbin/fsck_ffs/inode.c
+++ b/sbin/fsck_ffs/inode.c
@@ -48,7 +48,6 @@ static const char sccsid[] = "@(#)inode.c 8.8 (Berkeley) 4/28/95";
#include <pwd.h>
#include <string.h>
#include <time.h>
-#include <libufs.h>
#include "fsck.h"
@@ -646,7 +645,7 @@ setinodebuf(int cg, ino_t inosused)
inobufsize = blkroundup(&sblock,
MAX(INOBUFSIZE, sblock.fs_bsize));
initbarea(&inobuf, BT_INODES);
- if ((inobuf.b_un.b_buf = Malloc((unsigned)inobufsize)) == NULL)
+ if ((inobuf.b_un.b_buf = Balloc((unsigned)inobufsize)) == NULL)
errx(EEXIT, "cannot allocate space for inode buffer");
}
fullcnt = inobufsize / ((sblock.fs_magic == FS_UFS1_MAGIC) ?
diff --git a/sbin/fsck_ffs/main.c b/sbin/fsck_ffs/main.c
index ee39760f4dd2..4189af1ba517 100644
--- a/sbin/fsck_ffs/main.c
+++ b/sbin/fsck_ffs/main.c
@@ -59,7 +59,6 @@ static char sccsid[] = "@(#)main.c 8.6 (Berkeley) 5/14/95";
#include <fstab.h>
#include <grp.h>
#include <inttypes.h>
-#include <libufs.h>
#include <mntopts.h>
#include <paths.h>
#include <stdint.h>
diff --git a/sbin/fsck_ffs/pass5.c b/sbin/fsck_ffs/pass5.c
index 58de6791903f..8980ba60e03a 100644
--- a/sbin/fsck_ffs/pass5.c
+++ b/sbin/fsck_ffs/pass5.c
@@ -45,7 +45,6 @@ static const char sccsid[] = "@(#)pass5.c 8.9 (Berkeley) 4/28/95";
#include <inttypes.h>
#include <limits.h>
#include <string.h>
-#include <libufs.h>
#include "fsck.h"
diff --git a/sbin/fsck_ffs/setup.c b/sbin/fsck_ffs/setup.c
index c9aa19c7eded..b3d58749015e 100644
--- a/sbin/fsck_ffs/setup.c
+++ b/sbin/fsck_ffs/setup.c
@@ -52,7 +52,6 @@ static const char sccsid[] = "@(#)setup.c 8.10 (Berkeley) 5/9/95";
#include <limits.h>
#include <stdint.h>
#include <string.h>
-#include <libufs.h>
#include "fsck.h"
@@ -214,7 +213,7 @@ setup(char *dev)
sbdirty();
}
if (snapcnt > 0 && copybuf == NULL) {
- copybuf = Malloc(sblock.fs_bsize);
+ copybuf = Balloc(sblock.fs_bsize);
if (copybuf == NULL)
errx(EEXIT, "cannot allocate space for snapshot "
"copy buffer");
@@ -501,7 +500,7 @@ sblock_init(void)
fsmodified = 0;
lfdir = 0;
initbarea(&sblk, BT_SUPERBLK);
- sblk.b_un.b_buf = Malloc(SBLOCKSIZE);
+ sblk.b_un.b_buf = Balloc(SBLOCKSIZE);
if (sblk.b_un.b_buf == NULL)
errx(EEXIT, "cannot allocate space for superblock");
dev_bsize = secsize = DEV_BSIZE;
@@ -530,7 +529,7 @@ calcsb(char *dev, int devfd, struct fs *fs)
*/
if (ioctl(devfd, DIOCGSECTORSIZE, &secsize) == -1)
return (0);
- fsrbuf = Malloc(secsize);
+ fsrbuf = Balloc(secsize);
if (fsrbuf == NULL)
errx(EEXIT, "calcsb: cannot allocate recovery buffer");
if (blread(devfd, fsrbuf,
@@ -573,7 +572,7 @@ chkrecovery(int devfd)
rdsize = sblock.fs_fsize;
if (ioctl(devfd, DIOCGSECTORSIZE, &secsize) == -1 ||
rdsize % secsize != 0 ||
- (fsrbuf = Malloc(rdsize)) == NULL ||
+ (fsrbuf = Balloc(rdsize)) == NULL ||
blread(devfd, fsrbuf, (SBLOCK_UFS2 - rdsize) / dev_bsize,
rdsize) != 0) {
free(fsrbuf);
@@ -612,7 +611,7 @@ saverecovery(int readfd, int writefd)
if (sblock.fs_magic != FS_UFS2_MAGIC ||
ioctl(readfd, DIOCGSECTORSIZE, &secsize) == -1 ||
rdsize % secsize != 0 ||
- (fsrbuf = Malloc(rdsize)) == NULL ||
+ (fsrbuf = Balloc(rdsize)) == NULL ||
blread(readfd, fsrbuf, (SBLOCK_UFS2 - rdsize) / dev_bsize,
rdsize) != 0) {
printf("RECOVERY DATA COULD NOT BE CREATED\n");
diff --git a/sbin/fsck_ffs/suj.c b/sbin/fsck_ffs/suj.c
index c66b605bd69d..e1fd54229d69 100644
--- a/sbin/fsck_ffs/suj.c
+++ b/sbin/fsck_ffs/suj.c
@@ -47,7 +47,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
-#include <libufs.h>
#include <string.h>
#include <strings.h>
#include <sysexits.h>
@@ -2274,7 +2273,7 @@ suj_add_block(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
static void
suj_read(void)
{
- uint8_t block[1 * 1024 * 1024];
+ uint8_t block[1 * 1024 * 1024] __aligned(LIBUFS_BUFALIGN);
struct suj_seg *seg;
struct jsegrec *recn;
struct jsegrec *rec;