Re: fsck segfaults on rpi3 running 13-stable (and on 14-CURRENT analyzing the same file system that resulted from the 13-STABLE crash)
Date: Sun, 12 Feb 2023 21:25:32 UTC
[With a backtrace for the fsck_ffs SIGSEGV crash and some
listing of code involved, I'm now including mckusick@FreeBSD.org <mailto:mckusick@FreeBSD.org>
in the To: . Kirk M. likely would like you to preserve the
problematical UFS file system that produces the fsck_ffs
crashes, at least for now. For Kirk M.: The below is from/for
the fsck_ffs attempted from 14-CURRENT.]
On Feb 12, 2023, at 11:53, bob prohaska <fbsd@www.zefox.net> wrote:
> On Sun, Feb 12, 2023 at 11:31:59AM -0800, Mark Millard wrote:
>>
>> I'll note that another option is to run fsck_ffs from
>> lldb in the first place.
>
> That seems more productive, yielding:
>
> root@www:~ # lldb /sbin/fsck_ffs
> (lldb) target create "/sbin/fsck_ffs"
> Current executable set to '/sbin/fsck_ffs' (aarch64).
> (lldb) run -fy
> Process 62596 launched: '/sbin/fsck_ffs' (aarch64)
> usage: fsck_ffs [-BCdEFfnpRrSyZ] [-b block] [-c level] [-m mode] filesystem ...
> Process 62596 exited with status = 1 (0x00000001)
> (lldb) q
> root@www:~ # lldb fsck_ffs
> (lldb) target create "fsck_ffs"
> Current executable set to 'fsck_ffs' (aarch64).
> (lldb) run -fy /dev/da1s2d
> Process 62609 launched: '/sbin/fsck_ffs' (aarch64)
> ** /dev/da1s2d
> ** Last Mounted on /usr
> ** Phase 1 - Check Blocks and Sizes
> 7912408300994173476 BAD I=69393345
> 4313599915630302063 BAD I=69393345
> -4473632163892877928 BAD I=69393345
> 8068741989830080453 BAD I=69393345
> 3857159125896022134 BAD I=69393345
> -4354179704011695453 BAD I=69393345
> 7611175298055105740 BAD I=69393345
> 3985638883347136889 BAD I=69393345
> -2495754894521232470 BAD I=69393345
> 7739654885841380823 BAD I=69393345
> 7912408300994173476 BAD I=69393351
> 4313599915630302063 BAD I=69393351
> -4473632163892877928 BAD I=69393351
> 8068741989830080453 BAD I=69393351
> 3857159125896022134 BAD I=69393351
> -4354179704011695453 BAD I=69393351
> 7611175298055105740 BAD I=69393351
> 3985638883347136889 BAD I=69393351
> -2495754894521232470 BAD I=69393351
> 7739654885841380823 BAD I=69393351
> 7912408300994173476 BAD I=74682090
> 4313599915630302063 BAD I=74682090
> -4473632163892877928 BAD I=74682090
> 8068741989830080453 BAD I=74682090
> 3857159125896022134 BAD I=74682090
> -4354179704011695453 BAD I=74682090
> 7611175298055105740 BAD I=74682090
> 3985638883347136889 BAD I=74682090
> -2495754894521232470 BAD I=74682090
> 7739654885841380823 BAD I=74682090
> INODE CHECK-HASH FAILED I=74999808 OWNER=1842251117 MODE=15044
> This version of LLDB has no plugin for the language "assembler". Inspection of frame variables will be limited.
> Process 62609 stopped
> * thread #1, name = 'fsck_ffs', stop reason = signal SIGSEGV: invalid address (fault address: 0x0)
> frame #0: 0x00005c6f47c3d550 libc.so.7`strnlen at strnlen.S:50
> 47 bic src, srcin, 15
> 48 mov wtmp, 0xf00f
> 49 cbz cntin, L(nomatch)
> -> 50 ld1 {vdata.16b}, [src], 16
> 51 dup vrepmask.8h, wtmp
> 52 cmeq vhas_chr.16b, vdata.16b, 0
> 53 lsl shift, srcin, 2
> (lldb) bt all
> * thread #1, name = 'fsck_ffs', stop reason = signal SIGSEGV: invalid address (fault address: 0x0)
> * frame #0: 0x00005c6f47c3d550 libc.so.7`strnlen at strnlen.S:50
> frame #1: 0x00005c6f47c08b48 libc.so.7`__vfprintf(fp=0x00005c6f47cd8b68, locale=0x00005c6f47cd84a8, fmt0="MTIME=%12.12s %4.4s ", ap=(__stack = 0x00005c6f45005c40, __gr_top = 0x00005c6f45005bd0, __vr_top = 0x00005c6f45005b90, __gr_offs = -48, __vr_offs = -128)) at vfprintf.c:854:25
> frame #2: 0x00005c6f47c0752c libc.so.7`vfprintf_l(fp=0x00005c6f47cd8b68, locale=0x00005c6f47cd84a8, fmt0="MTIME=%12.12s %4.4s ", ap=(__stack = 0x00005c6f45005c40, __gr_top = 0x00005c6f45005bd0, __vr_top = 0x00005c6f45005b90, __gr_offs = -56, __vr_offs = -128)) at vfprintf.c:285:9
> frame #3: 0x00005c6f47c09f94 libc.so.7`vfprintf(fp=<unavailable>, fmt0=<unavailable>, ap=<unavailable>) at vfprintf.c:292:9
> frame #4: 0x00005c6f47c03dc0 libc.so.7`printf(fmt=<unavailable>) at printf.c:57:8
> frame #5: 0x00005c6ec487edac fsck_ffs`prtinode(ip=<unavailable>) at inode.c:1314:2
> frame #6: 0x00005c6ec487f000 fsck_ffs`getnextinode(inumber=74999808, rebuildcg=0) at inode.c:563:3
> frame #7: 0x00005c6ec4882d5c fsck_ffs`pass1 [inlined] checkinode(inumber=74999808, idesc=0x00005c6f45005d20, rebuildcg=0) at pass1.c:254:12
> frame #8: 0x00005c6ec4882d58 fsck_ffs`pass1 at pass1.c:181:8
> frame #9: 0x00005c6ec488209c fsck_ffs`main [inlined] checkfilesys(filesys=<unavailable>) at main.c:446:2
> frame #10: 0x00005c6ec48818b0 fsck_ffs`main(argc=1, argv=0x00005c6f45006138) at main.c:210:16
> frame #11: 0x00005c6ec4877ec0 fsck_ffs`__start(argc=3, argv=0x00005c6f45006128, env=0x00005c6f45006148, cleanup=<unavailable>) at crt1_c.c:72:7
> frame #12: 0x00007813def681d8 ld-elf.so.1`.rtld_start at rtld_start.S:41
> (lldb)
>
> Does that make any sense?
It gives some context for the internal failure, for sure.
I do not see a direct NULL pointer possibility in what I
report that I looked at below. It leaves me wondering if
something has trashed some memory (stack?) content that
is involved.
The backtrace indicates a NULL pointer was dereferenced:
* thread #1, name = 'fsck_ffs', stop reason = signal SIGSEGV: invalid address (fault address: 0x0)
* frame #0: 0x00005c6f47c3d550 libc.so.7`strnlen at strnlen.S:50
The "-> 50 ld1 {vdata.16b}, [src], 16" for strnlen indicates that
the code is from (given where I have main's source):
/usr/main-src/contrib/arm-optimized-routines/string/aarch64/strnlen.S
ENTRY (__strnlen_aarch64)
PTR_ARG (0)
SIZE_ARG (1)
bic src, srcin, 15
mov wtmp, 0xf00f
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src], 16
. . .
This is via the strnlen use in the
/usr/main-src/lib/libc/stdio/vfprintf.c code below (leading white
space might not be preserved):
. . .
int
__vfprintf(FILE *fp, locale_t locale, const char *fmt0, va_list ap)
{
. . .
case 's':
if (flags & LONGINT) {
wchar_t *wcp;
if (convbuf != NULL)
free(convbuf);
if ((wcp = GETARG(wchar_t *)) == NULL)
cp = "(null)";
else {
convbuf = __wcsconv(wcp, prec);
if (convbuf == NULL) {
fp->_flags |= __SERR;
goto error;
}
cp = convbuf;
}
} else if ((cp = GETARG(char *)) == NULL)
cp = "(null)";
size = (prec >= 0) ? strnlen(cp, prec) : strlen(cp);
sign = '\0';
break;
. . .
There are multiple layers involving va_list before we
get down to printf in printf.c . I've not tried to validate
this va_list related handling.
Looking back at code that is inside fsck_ffs source files that
leads to the printf usage (and indirectly to the other libc.so
code involved) . . .
So the code around /usr/main-src/sbin/fsck_ffs/inode.c:1314 looks
like: (leading white space might not be preserved)
void
prtinode(struct inode *ip)
{
char *p;
union dinode *dp;
struct passwd *pw;
time_t t;
dp = ip->i_dp;
printf(" I=%lu ", (u_long)ip->i_number);
if (ip->i_number < UFS_ROOTINO || ip->i_number > maxino)
return;
printf(" OWNER=");
if ((pw = getpwuid((int)DIP(dp, di_uid))) != NULL)
printf("%s ", pw->pw_name);
else
printf("%u ", (unsigned)DIP(dp, di_uid));
printf("MODE=%o\n", DIP(dp, di_mode));
if (preen)
printf("%s: ", cdevname);
printf("SIZE=%ju ", (uintmax_t)DIP(dp, di_size));
t = DIP(dp, di_mtime);
p = ctime(&t);
printf("MTIME=%12.12s %4.4s ", &p[4], &p[20]);
}
That, in turned, was called via:
( /usr/main-src/sbin/fsck_ffs/inode.c )
. . .
union dinode *
getnextinode(ino_t inumber, int rebuildcg)
{
. . .
dp = (union dinode *)nextinop;
if (sblock.fs_magic == FS_UFS1_MAGIC)
nextinop += sizeof(struct ufs1_dinode);
else
nextinop += sizeof(struct ufs2_dinode);
if ((ckhashadd & CK_INODE) != 0) {
ffs_update_dinode_ckhash(&sblock, (struct ufs2_dinode *)dp);
dirty(&inobuf);
}
if (ffs_verify_dinode_ckhash(&sblock, (struct ufs2_dinode *)dp) != 0) {
pwarn("INODE CHECK-HASH FAILED");
ip.i_bp = NULL;
ip.i_dp = dp;
ip.i_number = inumber;
prtinode(&ip);
if (preen || reply("FIX") != 0) {
if (preen)
printf(" (FIXED)\n");
ffs_update_dinode_ckhash(&sblock,
(struct ufs2_dinode *)dp);
dirty(&inobuf);
}
}
. . .
In turn:
( /usr/main-src/sbin/fsck_ffs/pass1.c )
. . .
static int
checkinode(ino_t inumber, struct inodesc *idesc, int rebuildcg)
{
. . .
if ((dp = getnextinode(inumber, rebuildcg)) == NULL) {
pfatal("INVALID INODE");
goto unknown;
}
. . .
In turn (same file):
void
pass1(void)
{
. . .
/*
* Scan the allocated inodes.
*/
setinodebuf(c, inosused);
for (i = 0; i < inosused; i++, inumber++) {
if (inumber < UFS_ROOTINO) {
(void)getnextinode(inumber, rebuildcg);
continue;
}
/*
* NULL return indicates probable end of allocated
* inodes during cylinder group rebuild attempt.
* We always keep trying until we get to the minimum
* valid number for this cylinder group.
*/
if (checkinode(inumber, &idesc, rebuildcg) == 0 &&
i > cgp->cg_initediblk)
break;
}
. . .
With that I stop.
So far, I've not identified how the NULL pointer showed up
that ended up being dereferenced. It does not look likely
that I will identify such.
===
Mark Millard
marklmi at yahoo.com