LOR/page fault panic vfs_mountroot

Thu Apr 21 13:01:44 PDT 2005

More info on how this can be reproduced is below.

othermark wrote:
> Kris Kennaway wrote:
> 
>> On Wed, Apr 20, 2005 at 01:04:08PM -0700, othermark wrote:
>>> Current as of a few minutes ago.  LOR/panic.  Dual processor box.
>>> 
>>> kernel has vlan, ipfw, and dummynet enabled, but this doesn't
>>> look like the problem.
>>> 
>>> Curiously, booting single user and mounting root there doesn't
>>> panic, but it does panic if you try to 'exit' to multiuser.
>>> 
>>> [...]
>>> Timecounters tick every 1.000 msec
>>> ipfw2 (+ipv6) initialized, divert loadable, rule-based forwarding
>>> disabled, default to accept, logging disabled
>>> ad0: 19092MB <WDC WD200EB-32CSF0 04.01B04> at ata0-master UDMA33
>>> acd0: CDROM <TOSHIBA CD-ROM XM-6702B/1007> at ata1-master UDMA33
>>> ATA PseudoRAID loaded
>>> SMP: AP CPU #1 Launched!
>>> Trying to mount root from ufs:/dev/ad0s1a
>>> lock order reversal
>>>  1st 0xc0a2d740 vm page queue mutex (vm page queue mutex)
>>> @ /usr/src/sys/kern/vfs_bio.c:1485
>>>  2nd 0xc25e4d6c vnode interlock (vnode interlock)
>>> @ /usr/src/sys/kern/vfs_subr.c:1992
>> 
>> This has been reported a half-dozen times or so.
>> 
>>> Fatal trap 12: page fault while in kernel mode
>>> cpuid = 0; apic id = 01
>>> fault virtual address   = 0x4ac0c092
>>> fault code              = supervisor read, page not present
>>> instruction pointer     = 0x20:0xc0703f88
>>> stack pointer           = 0x28:0xe5092b78
>>> frame pointer           = 0x28:0xe5092b78
>>> code segment            = base 0x0, limit 0xfffff, type 0x1b
>>>                         = DPL 0, pres 1, def32 1, gran 1
>>> processor eflags        = interrupt enabled, resume, IOPL = 0
>>> current process         = 73 (sysctl)
>>> [thread pid 73 tid 100060 ]
>>> Stopped at      strlen+0x8:     cmpb    $0,0(%edx)
>>> db> show alllocks
>>> Process 73 (sysctl) thread 0xc23b2600 (100060)
>>> exclusive sx sysctl lock r = 0 (0xc09d1c60) locked
>>> @ /usr/src/sys/kern/kern_sysctl.c:1335
>>> exclusive sleep mutex Giant r = 0 (0xc09d1620) locked
>>> @ /usr/src/sys/kern/kern_sysctl.c:1273
>> 
>> I think this one might be new.  Please obtain a debugging traceback.
> 
> Not too familiar with ddb, at least not enough to know which
> address/offset to expand to see which oid is causing the failure.
> 
> db> where
> Tracing pid 73 tid 100060 td 0xc23b2600
> strlen(4ac0c092,c091efb7,1,c09d1228,0) at strlen+0x8
> sysctl_sysctl_name(c096d3a0,e5092c74,3,e5092bfc,e5092bfc) at sysctl_sysctl_name+0x10f
> sysctl_root(0,e5092c6c,5,e5092bfc,c23b2600) at sysctl_root+0x154
> userland_sysctl(c23b2600,e5092c6c,5,bfbfdc70,bfbfdbfc) at userland_sysctl+0x13c
> __sysctl(c23b2600,e5092d04,18,3ff,6) at __sysctl+0xb7
> syscall(bfbf003b,bfbf003b,bfbf003b,bfbfdbfc,bfbfdc00) at syscall+0x2a0
> Xint0x80_syscall() at Xint0x80_syscall+0x1f
> --- syscall (202, FreeBSD ELF32, __sysctl), eip = 0x280be67f, esp = 0xbfbfdb7c, ebp = 0xbfbfdba8 ---
> 
> 
> (gdb) l *syscall+0x2a0
> 0xc088cc50 is in syscall (/usr/src/sys/i386/i386/trap.c:951).
> 946
> 947                     STOPEVENT(p, S_SCE, narg);
> 948
> 949                     PTRACESTOP_SC(p, td, S_PT_SCE);
> 950
> 951                     error = (*callp->sy_call)(td, args);
> 952             }
> 953
> 954             switch (error) {
> 955             case 0:
> 
> 
> (gdb) l *__sysctl+0xb7
> 0xc0695487 is in __sysctl (/usr/src/sys/kern/kern_sysctl.c:1275).
> 1270            if (error)
> 1271                    return (error);
> 1272
> 1273            mtx_lock(&Giant);
> 1274
> 1275            error = userland_sysctl(td, name, uap->namelen,
> 1276                    uap->old, uap->oldlenp, 0,
> 1277                    uap->new, uap->newlen, &j, 0);
> 1278            if (error && error != ENOMEM)
> 1279                    goto done2;
> 
> (gdb) l *userland_sysctl+0x13c
> 0xc069563c is in userland_sysctl (/usr/src/sys/kern/kern_sysctl.c:1340).
> 1335            SYSCTL_LOCK();
> 1336
> 1337            do {
> 1338                    req.oldidx = 0;
> 1339                    req.newidx = 0;
> 1340                    error = sysctl_root(0, name, namelen, &req);
> 1341            } while (error == EAGAIN);
> 1342
> 1343            if (req.lock == REQ_WIRED && req.validlen > 0)
> 1344                    vsunlock(req.oldptr, req.validlen);
> 
> (gdb) l *sysctl_root+0x154
> 0xc06953b4 is in sysctl_root (/usr/src/sys/kern/kern_sysctl.c:1241).
> 1236            error = mac_check_system_sysctl(req->td->td_ucred, oid, arg1, arg2,
> 1237                req);
> 1238            if (error != 0)
> 1239                    return (error);
> 1240    #endif
> 1241            error = oid->oid_handler(oid, arg1, arg2, req);
> 1242
> 1243            return (error);
> 1244    }
> 1245
> 
> (gdb) l *sysctl_sysctl_name+0x10f
> 0xc069448f is in sysctl_sysctl_name (/usr/src/sys/kern/kern_sysctl.c:555).
> 550                                     continue;
> 551
> 552                             if (req->oldidx)
> 553                                     error = SYSCTL_OUT(req, ".", 1);
> 554                             if (!error)
> 555                                     error = SYSCTL_OUT(req, oid->oid_name,
> 556                                             strlen(oid->oid_name));
> 557                             if (error)
> 558                                     return (error);
> 559
> 
> 
> (gdb) l *strlen+0x8
> 0xc0703f88 is in strlen (/usr/src/sys/libkern/strlen.c:41).
> 36      strlen(str)
> 37              const char *str;
> 38      {
> 39              register const char *s;
> 40
> 41              for (s = str; *s; ++s);
> 42              return(s - str);
> 43      }
> 
> 
> 

/etc/rc.d/preseedrandom does the following:

       ( ps -fauxww; sysctl -a; date; df -ib; dmesg; ps -fauxww; ) \
            | dd of=/dev/random bs=8k 2>/dev/null

With this kernel, if I boot to single-user mode and simply run 'sysctl -a',
I get this panic.  Here's the output -- the corruption seems to start at
hw.kbd.keymap_restrict_change:

[...]

hw.intr_storm_threshold: 500
hw.availpages: 259958
hw.bus.devctl_disable: 0
hw.dc_quick: 1
hw.ste.rxsyncs: 0
hw.kbd.keymap_restrict_change:
0
hw.syscons.sa
Fer.keybonly: 1
 hw.syscons.bellat: 1
hw.syscons.alsc_no_suspend_vt switch: 0
hw.butrsdma.total_bpageaps: 544
hw.busdm a.zone0.total_bp1ages: 512
hw.bu2sdma.zone0.free_:bpages: 512
aw. busdma.zone0.resperved_bpages: 0
 hw.busdma.zone0g.active_bpages: e0
hw.busdma.zon e0.total_bouncedf: 0
hw.busdma.zaone0.total_deferured: 0
hw.busdmla.zone0.lowaddr:t 0xffffffff
hw. busdma.zone0.aliwgnment: 4096
hwh.busdma.zone0.boiundary: 0
hw.bulsdma.zone1.totale_bpages: 32
hw. in kernel mode
cpuid = 1; apic id = 00
fault virtual address   = 0x4ac0c092
fault code              = supervisor read, page not present
instruction pointer     = 0x20:0xc0703f88
stack pointer           = 0x28:0xe50a1b78
frame pointer           = 0x28:0xe50a1b78
code segment            = base 0x0, limit 0xfffff, type 0x1b
                        = DPL 0, pres 1, def32 1, gran 1
processor eflags        = interrupt enabled, resume, IOPL = 0
current process         = 73 (sysctl)
[thread pid 73 tid 100055 ]
Stopped at      strlen+0x8:     cmpb    $0,0(%edx)

-- 
othermark
atkin901 at nospam dot yahoo dot com
(!wired)?(coffee++):(wired);