help w/panic under heavy load - 5.4

Edwin edwin at verolan.com
Fri Jul 22 21:54:09 GMT 2005


Hi Giorgos,

I'm sorry - I have so many kernels I was trying - I believe I overwrote that particular
kernel/kernel.debug set - so I created a new kernel as a baseline with the same options
per my notes - and included the output from the crash below.

It does crash in the same fashion, and the KGDB output shows an MTU value of the same
bogus kind (-1056788992 this time vs. -1056787456 before).

I also patched ip_fastfwd.c w/ your patch - still a crash - still the same type of bogus
mtu value - a few lines from kgdb are included at the end of this message.

Thanks again,
-Edwin



Here are the variables you were asking about, from this crash:

(kgdb) f 13
#13 0xc06933c1 in ip_fastforward (m=0xc12e6c00) at /usr/src/sys/netinet/ip_fastfwd.c:572
572                             if (ip_fragment(ip, &m, mtu, ifp->if_hwassist,
(kgdb) p ro.ro_rt->rt_rmx
$1 = {rmx_mtu = 1500, rmx_expire = 333905919, rmx_pksent = 3868}
(kgdb) p ifp
$2 = (struct ifnet *) 0xc0f91800
(kgdb) p *ifp
$3 = {if_softc = 0xc0f91800, if_link = {tqe_next = 0xc0f90000, tqe_prev = 0xc08ebe84}, 
  if_xname = "sis0", '\0' <repeats 11 times>, if_dname = 0xc0f2ec2c "sis", if_dunit = 0, 
  if_addrhead = {tqh_first = 0xc0ec0000, tqh_last = 0xc1040460}, if_klist = {
    kl_lock = 0xc08e5a40, kl_list = {slh_first = 0x0}}, if_pcount = 0, if_carp = 0x0, 
  if_bpf = 0x0, if_index = 1, if_timer = 5, if_nvlans = 0, if_flags = 34883, 
  if_capabilities = 72, if_capenable = 72, if_linkmib = 0x0, if_linkmiblen = 0, 
  if_data = {ifi_type = 6 '\006', ifi_physical = 0 '\0', ifi_addrlen = 6 '\006', 
    ifi_hdrlen = 18 '\022', ifi_link_state = 2 '\002', ifi_recvquota = 0 '\0', 
    ifi_xmitquota = 0 '\0', ifi_datalen = 80 'P', ifi_mtu = 1500, ifi_metric = 0, 
    ifi_baudrate = 10000000, ifi_ipackets = 50, ifi_ierrors = 0, ifi_opackets = 3914, 
    ifi_oerrors = 0, ifi_collisions = 0, ifi_ibytes = 6146, ifi_obytes = 213356, 
    ifi_imcasts = 40, ifi_omcasts = 29, ifi_iqdrops = 0, ifi_noproto = 0, 
    ifi_hwassist = 0, ifi_epoch = 0, ifi_lastchange = {tv_sec = 0, tv_usec = 0}}, 
  if_multiaddrs = {tqh_first = 0xc0fab3e0, tqh_last = 0xc0fabcc0}, if_amcount = 0, 
  if_output = 0xc0671e04 <ether_output>, if_input = 0xc0672598 <ether_input>, 
  if_start = 0xc0713c10 <sis_start>, if_ioctl = 0xc071497c <sis_ioctl>, 
  if_watchdog = 0xc0714b04 <sis_watchdog>, if_init = 0xc0713f60 <sis_init>, 
  if_resolvemulti = 0xc0672e48 <ether_resolvemulti>, if_spare1 = 0x0, if_spare2 = 0x0, 
  if_spare3 = 0x0, if_spare_flags1 = 0, if_spare_flags2 = 0, if_snd = {ifq_head = 0x0, 
    ifq_tail = 0x0, ifq_len = 0, ifq_maxlen = 127, ifq_drops = 0, ifq_mtx = {
      mtx_object = {lo_class = 0xc0880b3c, lo_name = 0xc0f9180c "sis0", 
        lo_type = 0xc0829304 "if send queue", lo_flags = 196608, lo_list = {
          tqe_next = 0x0, tqe_prev = 0x0}, lo_witness = 0x0}, mtx_lock = 4, 
      mtx_recurse = 0}, ifq_drv_head = 0x0, ifq_drv_tail = 0x0, ifq_drv_len = 0, 
    ifq_drv_maxlen = 127, altq_type = 0, altq_flags = 1, altq_disc = 0x0, 
    altq_ifp = 0xc0f91800, altq_enqueue = 0, altq_dequeue = 0, altq_request = 0, 
    altq_clfier = 0x0, altq_classify = 0, altq_tbr = 0x0, altq_cdnr = 0x0}, 
  if_broadcastaddr = 0xc07db600 "\377\377\377\377\377\377", lltables = 0x0, if_label = 0x0, 
  if_prefixhead = {tqh_first = 0x0, tqh_last = 0xc0f91968}, if_afdata = {
    0x0 <repeats 28 times>, 0xc0faaab0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, 
  if_afdata_initialized = 1, if_afdata_mtx = {mtx_object = {lo_class = 0xc0880b3c, 
      lo_name = 0xc08292f4 "if_afdata", lo_type = 0xc08292f4 "if_afdata", 
      lo_flags = 196608, lo_list = {tqe_next = 0x0, tqe_prev = 0x0}, lo_witness = 0x0}, 
    mtx_lock = 4, mtx_recurse = 0}, if_starttask = {ta_link = {stqe_next = 0x0}, 
    ta_pending = 0, ta_priority = 0, ta_func = 0xc06711c0 <if_start_deferred>, 
    ta_context = 0xc0f91800}}
(kgdb) 


For reference going forward - this kernel was named D1-0722, and I'm keeping
cross-references so that the matching kernels/debug kernels/cores are saved together.


new kernel crash - all options compiled, sysctl ipff=1, polling not enabled


fb54c# panic: m_copym, offset > size of mbuf chain
KDB: enter: panic
[thread pid 21 tid 100015 ]
Stopped at      kdb_enter+0x2b: nop
db> where
Tracing pid 21 tid 100015 td 0xc0ecc780
kdb_enter(c0821a6a) at kdb_enter+0x2b
panic(c0826049,0,c076b79c,c102ae00,100) at panic+0xbb
m_copym(0,5dc,5c8,1,14) at m_copym+0x60
ip_fragment(c12f700e,c76bfc6c,5dc,0,1) at ip_fragment+0x214
ip_fastforward(c12e6c00) at ip_fastforward+0x6ed
ether_demux(c0f90000,c12e6c00,3c,c0f8a8d8,a) at ether_demux+0x259
ether_input(c0f90000,c12e6c00,c0f902d0,0,c08336ab) at ether_input+0x25d
sis_rxeof(c0f90000) at sis_rxeof+0x1ab
sis_intr(c0f90000) at sis_intr+0xf3
ithread_loop(c0ec6880,c76bfd48,c0ec6880,c060030c,0) at ithread_loop+0x124
fork_exit(c060030c,c0ec6880,c76bfd48) at fork_exit+0xa4
fork_trampoline() at fork_trampoline+0x8
--- trap 0x1, eip = 0, esp = 0xc76bfd7c, ebp = 0 ---
db> 

(kgdb) where
#0  doadump () at pcpu.h:159
#1  0xc04611f6 in db_fncall (dummy1=0, dummy2=0, dummy3=43, dummy4=0xc76bf9f4 "(�k�")
    at /usr/src/sys/ddb/db_command.c:531
#2  0xc0461004 in db_command (last_cmdp=0xc08c9264, cmd_table=0x0, 
    aux_cmd_tablep=0xc08483b8, aux_cmd_tablep_end=0xc08483d4)
    at /usr/src/sys/ddb/db_command.c:349
#3  0xc04610cc in db_command_loop () at /usr/src/sys/ddb/db_command.c:455
#4  0xc0462c51 in db_trap (type=3, code=0) at /usr/src/sys/ddb/db_main.c:221
#5  0xc0627af2 in kdb_trap (type=3, code=0, tf=0xc76bfb30)
    at /usr/src/sys/kern/subr_kdb.c:468
#6  0xc07b6394 in trap (frame=
      {tf_fs = -949288936, tf_es = -1067319280, tf_ds = -1065222128, tf_edi = 1,
       tf_esi = -1065197495, tf_ebp = -949224592, tf_isp = -949224612,
       tf_ebx = -949224548, tf_edx = 0, tf_ecx = -1060921344, tf_eax = 18,
       tf_trapno = 3, tf_err = 0, tf_eip = -1067288461, tf_cs = -1065222136,
       tf_eflags = 658, tf_esp = -949224560, tf_ss = -1067376657})
    at /usr/src/sys/i386/i386/trap.c:584
#7  0xc07a69ca in calltrap () at /usr/src/sys/i386/i386/exception.s:140
#8  0xc76b0018 in ?? ()
#9  0xc0620010 in schedcpu () at /usr/src/sys/kern/sched_4bsd.c:461
#10 0xc0611fef in panic (fmt=0xc0820008 "default")
    at /usr/src/sys/kern/kern_shutdown.c:550
#11 0xc0641a2c in m_copym (m=0x0, off0=1500, len=1480, wait=1)
    at /usr/src/sys/kern/uipc_mbuf.c:385
#12 0xc069b694 in ip_fragment (ip=0xc12f700e, m_frag=0xc76bfc6c, mtu=-1056788992, 
    if_hwassist_flags=0, sw_csum=1) at /usr/src/sys/netinet/ip_output.c:967
#13 0xc06933c1 in ip_fastforward (m=0xc12e6c00) at /usr/src/sys/netinet/ip_fastfwd.c:572
#14 0xc0672a59 in ether_demux (ifp=0xc0f90000, m=0xc12e6c00)
    at /usr/src/sys/net/if_ethersubr.c:770
#15 0xc06727f5 in ether_input (ifp=0xc0f90000, m=0xc12e6c00)
    at /usr/src/sys/net/if_ethersubr.c:631
#16 0xc0713507 in sis_rxeof (sc=0xc0f90000) at /usr/src/sys/pci/if_sis.c:1636
#17 0xc071398f in sis_intr (arg=0xc0f90000) at /usr/src/sys/pci/if_sis.c:1841
#18 0xc0600430 in ithread_loop (arg=0xc0ec6880) at /usr/src/sys/kern/kern_intr.c:547
#19 0xc05ff8a4 in fork_exit (callout=0xc060030c <ithread_loop>, arg=0xc0ec6880, 
    frame=0xc76bfd48) at /usr/src/sys/kern/kern_fork.c:791
#20 0xc07a6a2c in fork_trampoline () at /usr/src/sys/i386/i386/exception.s:209
(kgdb) 


[**** your patch applied ***]

#12 0xc069b690 in ip_fragment (ip=0xc11d580e, m_frag=0xc76bfc6c, mtu=-1056791808, 
    if_hwassist_flags=0, sw_csum=1) at /usr/src/sys/netinet/ip_output.c:967
#13 0xc06933bf in ip_fastforward (m=0xc11ae300) at /usr/src/sys/netinet/ip_fastfwd.c:586
#14 0xc0672a59 in ether_demux (ifp=0xc0f90000, m=0xc11ae300)
    at /usr/src/sys/net/if_ethersubr.c:770
#15 0xc06727f5 in ether_input (ifp=0xc0f90000, m=0xc11ae300)
    at /usr/src/sys/net/if_ethersubr.c:631



Giorgos Keramidas (keramida at freebsd.org) wrote:
> On 2005-07-20 11:41, Edwin <edwin at verolan.com> wrote:
> > I'm trying to understand the particulars about this - I get the null pointer
> > part, but as to ip_fragment - it's fragmenting mbufs to handle ip packets
> > during switching? and its failing trying to copy data past the end of the
> > chain?
> 
> ip_fastfwd() thinks that it should fragment the packet because it somehow
> calculates a bogus ``mtu'' value.  See the mtu value in frame 12 of the stack
> trace below.
> 
> > mbsd05# kgdb kernel.debug /tmp/crash/vmcore.3
> > [GDB will not be able to debug user-mode threads: /usr/lib/libthread_db.so: Undefined symbol "ps_pglobal_lookup"]
> > GNU gdb 6.1.1 [FreeBSD]
> > Copyright 2004 Free Software Foundation, Inc.
> > GDB is free software, covered by the GNU General Public License, and you are

[... deleted ...]

> > #11 0xc0641a2c in m_copym (m=0x0, off0=1500, len=1480, wait=1)
> >     at /usr/src/sys/kern/uipc_mbuf.c:385
> > #12 0xc069b694 in ip_fragment (ip=0xc11bd80e, m_frag=0xc76bfc6c, mtu=-1056787456, 
> >     if_hwassist_flags=0, sw_csum=1) at /usr/src/sys/netinet/ip_output.c:967
> 
> The ``mtu'' is an extremely small integer value, which is definitely a problem
> here.  Somehow, ip_fastforward() calculates a very wrong value for the ``mtu''.
> 
> > 6933c1 in ip_fastforward (m=0xc11ab100) at /usr/src/sys/netinet/ip_fastfwd.c:572
> 
> If you have this particular crash dump, can you show me a dump of the
> ``ro.ro_rt->rt_rmx'' and the ``ifp'' structure that ip_fastforward() is using?
> 
> One of these two seems to have an invalid mtu value.


More information about the freebsd-hackers mailing list