[follow-up] FreeBSD/amd64 r195146 to r195848, fatal trap 12 under
network load
Kamigishi Rei
spambox at haruhiism.net
Tue Jul 28 14:24:55 UTC 2009
Hello, hope you're having a nice day,
Revisions mentioned are those which were tested by me; r195849+ has the
corruption padded somewhere else so it might produce a panic with a
different set of options. For reference, my test kernel uses a GENERIC
config from May 09 snapshot without WITNESS and with IPFIREWALL,
IPFIREWALL_DEFAULT_TO_ACCEPT and DEVICE_POLLING enabled.
If anyone experiences fatal traps under network load and has the
kernel compiled with "options INVARIANTS", here's a patch to check
whether you're suffering from the memory corruption in netisr's DPCPU
area. I'm pretty interested in the backtraces this panic() call will
produce.
Please note: with this patch, your system - if affected by the
aforementioned problem - will trap *almost immediately* after the
corruption happens, while a non-patched system can survive that for a
bit if, by the time v is assigned mtx_lock's value in
_mtx_sleep_flags(), the value has already been fixed by a concurrent
thread.
I highly recommend having a backup 'normal kernel'.
DO NOT apply this patch on a system you can't access via local (and/or
IPMI/LOM) or serial console.
For systems without INVARIANTS (although I'm not sure if the issue
affects non-INVARIANTS builds) you can replace
KASSERT(!(((foo & 0x8000000000000000)==0x0) && (foo != MTX_UNOWNED)),("mi_switch: DPCPU sanity checks: netisr workstream mutex nws_mtx contains an invalid pointer %llx in mtx_lock; this will lead to a page fault (cpuid: %u). Terminating.\n", ((long long unsigned)foo), (mycpuid)));
with
if(((foo & 0x8000000000000000)==0x0) && (foo != MTX_UNOWNED)) panic("mi_switch: DPCPU sanity checks: netisr workstream mutex nws_mtx contains an invalid pointer %llx in mtx_lock; this will lead to a page fault (cpuid: %u). Terminating.\n", (long long unsigned)foo, mycpuid);
You can also replace the panic() call with a simple printf(), and it
will just print out the warning message. That won't really help in
debugging the cause, although it will still tell you whether you're
getting a fatal trap because of the aforementioned corruption.
I would be really grateful if someone could help me debug this issue,
namely, point me at how I can trace the problem to the thread or system
call which could have messed with the memory area in question.
--
Kamigishi Rei
KREI-RIPE
-------------- next part --------------
Index: sys/kern/kern_synch.c
===================================================================
--- sys/kern/kern_synch.c (revision 195848)
+++ sys/kern/kern_synch.c (working copy)
@@ -71,6 +71,8 @@
#include <vm/pmap.h>
#endif
+#include <net/netisr.h>
+
#define KTDSTATE(td) \
(((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \
((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \
@@ -391,7 +393,18 @@
uint64_t runtime, new_switchtime;
struct thread *td;
struct proc *p;
+ struct netisr_workstream *nwsp;
+ uintptr_t foo;
+ unsigned int mycpuid;
+ for (mycpuid = 0; mycpuid < mp_maxid; mycpuid++) {
+ nwsp = DPCPU_ID_PTR(mycpuid, nws);
+ if (mtx_initialized(&(nwsp->nws_mtx))) {
+ foo = nwsp->nws_mtx.mtx_lock;
+ KASSERT(!(((foo & 0x8000000000000000)==0x0) && (foo != MTX_UNOWNED)),("mi_switch: DPCPU sanity checks: netisr workstream mutex nws_mtx contains an invalid pointer %llx in mtx_lock; this will lead to a page fault (cpuid: %u). Terminating.\n", ((long long unsigned)foo), (mycpuid)));
+ }
+ }
+
td = curthread; /* XXX */
THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
p = td->td_proc; /* XXX */
Index: sys/net/netisr.c
===================================================================
--- sys/net/netisr.c (revision 195848)
+++ sys/net/netisr.c (working copy)
@@ -203,8 +203,6 @@
u_int np_policy; /* Work placement policy. */
};
-#define NETISR_MAXPROT 16 /* Compile-time limit. */
-
/*
* The np array describes all registered protocols, indexed by protocol
* number.
@@ -212,53 +210,6 @@
static struct netisr_proto np[NETISR_MAXPROT];
/*
- * Protocol-specific work for each workstream is described by struct
- * netisr_work. Each work descriptor consists of an mbuf queue and
- * statistics.
- */
-struct netisr_work {
- /*
- * Packet queue, linked by m_nextpkt.
- */
- struct mbuf *nw_head;
- struct mbuf *nw_tail;
- u_int nw_len;
- u_int nw_qlimit;
- u_int nw_watermark;
-
- /*
- * Statistics -- written unlocked, but mostly from curcpu.
- */
- u_int64_t nw_dispatched; /* Number of direct dispatches. */
- u_int64_t nw_hybrid_dispatched; /* "" hybrid dispatches. */
- u_int64_t nw_qdrops; /* "" drops. */
- u_int64_t nw_queued; /* "" enqueues. */
- u_int64_t nw_handled; /* "" handled in worker. */
-};
-
-/*
- * Workstreams hold a set of ordered work across each protocol, and are
- * described by netisr_workstream. Each workstream is associated with a
- * worker thread, which in turn is pinned to a CPU. Work associated with a
- * workstream can be processd in other threads during direct dispatch;
- * concurrent processing is prevented by the NWS_RUNNING flag, which
- * indicates that a thread is already processing the work queue.
- */
-struct netisr_workstream {
- struct intr_event *nws_intr_event; /* Handler for stream. */
- void *nws_swi_cookie; /* swi(9) cookie for stream. */
- struct mtx nws_mtx; /* Synchronize work. */
- u_int nws_cpu; /* CPU pinning. */
- u_int nws_flags; /* Wakeup flags. */
- u_int nws_pendingbits; /* Scheduled protocols. */
-
- /*
- * Each protocol has per-workstream data.
- */
- struct netisr_work nws_work[NETISR_MAXPROT];
-} __aligned(CACHE_LINE_SIZE);
-
-/*
* Per-CPU workstream data.
*/
DPCPU_DEFINE(struct netisr_workstream, nws);
Index: sys/net/netisr.h
===================================================================
--- sys/net/netisr.h (revision 195848)
+++ sys/net/netisr.h (working copy)
@@ -111,6 +111,60 @@
};
/*
+ * Protocol-specific work for each workstream is described by struct
+ * netisr_work. Each work descriptor consists of an mbuf queue and
+ * statistics.
+ */
+struct netisr_work {
+ /*
+ * Packet queue, linked by m_nextpkt.
+ */
+ struct mbuf *nw_head;
+ struct mbuf *nw_tail;
+ u_int nw_len;
+ u_int nw_qlimit;
+ u_int nw_watermark;
+
+ /*
+ * Statistics -- written unlocked, but mostly from curcpu.
+ */
+ u_int64_t nw_dispatched; /* Number of direct dispatches. */
+ u_int64_t nw_hybrid_dispatched; /* "" hybrid dispatches. */
+ u_int64_t nw_qdrops; /* "" drops. */
+ u_int64_t nw_queued; /* "" enqueues. */
+ u_int64_t nw_handled; /* "" handled in worker. */
+};
+
+#define NETISR_MAXPROT 16 /* Compile-time limit. */
+
+/*
+ * Workstreams hold a set of ordered work across each protocol, and are
+ * described by netisr_workstream. Each workstream is associated with a
+ * worker thread, which in turn is pinned to a CPU. Work associated with a
+ * workstream can be processd in other threads during direct dispatch;
+ * concurrent processing is prevented by the NWS_RUNNING flag, which
+ * indicates that a thread is already processing the work queue.
+ */
+struct netisr_workstream {
+ struct intr_event *nws_intr_event; /* Handler for stream. */
+ void *nws_swi_cookie; /* swi(9) cookie for stream. */
+ struct mtx nws_mtx; /* Synchronize work. */
+ u_int nws_cpu; /* CPU pinning. */
+ u_int nws_flags; /* Wakeup flags. */
+ u_int nws_pendingbits; /* Scheduled protocols. */
+
+ /*
+ * Each protocol has per-workstream data.
+ */
+ struct netisr_work nws_work[NETISR_MAXPROT];
+} __aligned(CACHE_LINE_SIZE);
+
+/*
+ * Declare per-CPU workstream data globally
+ */
+DPCPU_DECLARE(struct netisr_workstream, nws);
+
+/*
* Register, unregister, and other netisr handler management functions.
*/
void netisr_clearqdrops(const struct netisr_handler *nhp);
More information about the freebsd-current
mailing list