Commit 367705+367706 causes a panic

Kristof Provost kp at FreeBSD.org
Fri Nov 20 14:53:51 UTC 2020


I still can’t reproduce that panic.

Does it happen immediately after you start a vnet jail?

Does it also happen with a GENERIC kernel?

Regards,
Kristof

On 20 Nov 2020, at 14:53, Peter Blok wrote:

> The panic with ipsec code in the backtrace was already very strange. I 
> was using IPsec, but only on one interface totally separate from the 
> members of the bridge as well as the bridge itself. The jails were not 
> doing any ipsec either. Note that panic was a while ago and it was 
> after the 1st bridge epochification was done on stable-12 which was 
> later backed out.
>
> Today the system is no longer using ipsec, but it is still compiled 
> in. I can remove it if need be for a test
>
>
> src.conf
> WITHOUT_KERBEROS=yes
> WITHOUT_GSSAPI=yes
> WITHOUT_SENDMAIL=true
> WITHOUT_MAILWRAPPER=true
> WITHOUT_DMAGENT=true
> WITHOUT_GAMES=true
> WITHOUT_IPFILTER=true
> WITHOUT_UNBOUND=true
> WITHOUT_PROFILE=true
> WITHOUT_ATM=true
> WITHOUT_BSNMP=true
> #WITHOUT_CROSS_COMPILER=true
> WITHOUT_DEBUG_FILES=true
> WITHOUT_DICT=true
> WITHOUT_FLOPPY=true
> WITHOUT_HTML=true
> WITHOUT_HYPERV=true
> WITHOUT_NDIS=true
> WITHOUT_NIS=true
> WITHOUT_PPP=true
> WITHOUT_TALK=true
> WITHOUT_TESTS=true
> WITHOUT_WIRELESS=true
> #WITHOUT_LIB32=true
> WITHOUT_LPR=true
>
> make.conf
> KERNCONF=BHYVE
> MODULES_OVERRIDE=opensolaris dtrace zfs vmm nmdm if_bridge bridgestp 
> if_vxlan pflog libmchain libiconv smbfs linux linux64 linux_common 
> linuxkpi linprocfs linsysfs ext2fs
> DEFAULT_VERSIONS+=perl5=5.30 mysql=5.7 python=3.8 python3=3.8
> OPTIONS_UNSET=DOCS NLS MANPAGES
>
> BHYVE
> cpu		HAMMER
> ident		BHYVE
>
> makeoptions	DEBUG=-g		# Build kernel with gdb(1) debug symbols
> makeoptions	WITH_CTF=1		# Run ctfconvert(1) for DTrace support
>
> options		CAMDEBUG
>
> options 	SCHED_ULE		# ULE scheduler
> options 	PREEMPTION		# Enable kernel thread preemption
> options 	INET			# InterNETworking
> options 	INET6			# IPv6 communications protocols
> options		IPSEC
> options 	TCP_OFFLOAD		# TCP offload
> options		TCP_RFC7413		# TCP FASTOPEN
> options 	SCTP			# Stream Control Transmission Protocol
> options 	FFS			# Berkeley Fast Filesystem
> options 	SOFTUPDATES		# Enable FFS soft updates support
> options 	UFS_ACL			# Support for access control lists
> options 	UFS_DIRHASH		# Improve performance on big directories
> options 	UFS_GJOURNAL		# Enable gjournal-based UFS journaling
> options 	QUOTA			# Enable disk quotas for UFS
> options		SUIDDIR
> options 	NFSCL			# Network Filesystem Client
> options 	NFSD			# Network Filesystem Server
> options 	NFSLOCKD		# Network Lock Manager
> options 	MSDOSFS			# MSDOS Filesystem
> options 	CD9660			# ISO 9660 Filesystem
> options 	FUSEFS
> options		NULLFS			# NULL filesystem
> options		UNIONFS
> options		FDESCFS			# File descriptor filesystem
> options 	PROCFS			# Process filesystem (requires PSEUDOFS)
> options 	PSEUDOFS		# Pseudo-filesystem framework
> options 	GEOM_PART_GPT		# GUID Partition Tables.
> options 	GEOM_RAID		# Soft RAID functionality.
> options 	GEOM_LABEL		# Provides labelization
> options 	GEOM_ELI		# Disk encryption.
> options 	COMPAT_FREEBSD32	# Compatible with i386 binaries
> options 	COMPAT_FREEBSD4		# Compatible with FreeBSD4
> options 	COMPAT_FREEBSD5		# Compatible with FreeBSD5
> options 	COMPAT_FREEBSD6		# Compatible with FreeBSD6
> options 	COMPAT_FREEBSD7		# Compatible with FreeBSD7
> options 	COMPAT_FREEBSD9		# Compatible with FreeBSD9
> options 	COMPAT_FREEBSD10	# Compatible with FreeBSD10
> options 	COMPAT_FREEBSD11	# Compatible with FreeBSD11
> options 	SCSI_DELAY=5000		# Delay (in ms) before probing SCSI
> options 	KTRACE			# ktrace(1) support
> options 	STACK			# stack(9) support
> options 	SYSVSHM			# SYSV-style shared memory
> options 	SYSVMSG			# SYSV-style message queues
> options 	SYSVSEM			# SYSV-style semaphores
> options 	_KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions
> options 	PRINTF_BUFR_SIZE=128	# Prevent printf output being interspersed.
> options 	KBD_INSTALL_CDEV	# install a CDEV entry in /dev
> options 	HWPMC_HOOKS		# Necessary kernel hooks for hwpmc(4)
> options 	AUDIT			# Security event auditing
> options 	CAPABILITY_MODE		# Capsicum capability mode
> options 	CAPABILITIES		# Capsicum capabilities
> options 	MAC			# TrustedBSD MAC Framework
> options 	MAC_PORTACL
> options 	MAC_NTPD
> options 	KDTRACE_FRAME		# Ensure frames are compiled in
> options 	KDTRACE_HOOKS		# Kernel DTrace hooks
> options 	DDB_CTF			# Kernel ELF linker loads CTF data
> options 	INCLUDE_CONFIG_FILE	# Include this file in kernel
>
> # Debugging support.  Always need this:
> options 	KDB			# Enable kernel debugger support.
> options 	KDB_TRACE		# Print a stack trace for a panic.
> options 	KDB_UNATTENDED
>
> # Make an SMP-capable kernel by default
> options 	SMP			# Symmetric MultiProcessor Kernel
> options 	EARLY_AP_STARTUP
>
> # CPU frequency control
> device		cpufreq
> device		cpuctl
> device		coretemp
>
> # Bus support.
> device		acpi
> options 	ACPI_DMAR
> device		pci
> options		PCI_IOV			# PCI SR-IOV support
>
> device		iicbus
> device		iicbb
>
> device		iic
> device		ic
> device		iicsmb
>
> device		ichsmb
> device		smbus
> device		smb
>
> #device		jedec_dimm
>
> # ATA controllers
> device		ahci			# AHCI-compatible SATA controllers
> device		mvs			# Marvell 88SX50XX/88SX60XX/88SX70XX/SoC SATA
>
> # SCSI Controllers
> device		mps			# LSI-Logic MPT-Fusion 2
>
> # ATA/SCSI peripherals
> device		scbus			# SCSI bus (required for ATA/SCSI)
> device		da			# Direct Access (disks)
> device		cd			# CD
> device		pass			# Passthrough device (direct ATA/SCSI access)
> device		ses			# Enclosure Services (SES and SAF-TE)
> device		sg
>
> device		cfiscsi
> device		ctl			# CAM Target Layer
> device		iscsi
>
> # atkbdc0 controls both the keyboard and the PS/2 mouse
> device		atkbdc			# AT keyboard controller
> device		atkbd			# AT keyboard
> device		psm			# PS/2 mouse
>
> device		kbdmux			# keyboard multiplexer
>
> # vt is the new video console driver
> device		vt
> device		vt_vga
> device		vt_efifb
>
> # Serial (COM) ports
> device		uart			# Generic UART driver
>
> # PCI/PCI-X/PCIe Ethernet NICs that use iflib infrastructure
> device		iflib
> device		em			# Intel PRO/1000 Gigabit Ethernet Family
> device		ix			# Intel PRO/10GbE PCIE PF Ethernet
>
> # Network stack virtualization.
> options		VIMAGE
>
> # Pseudo devices.
> device		crypto
> device		cryptodev
> device		loop			# Network loopback
> device		random			# Entropy device
> device		padlock_rng		# VIA Padlock RNG
> device		rdrand_rng		# Intel Bull Mountain RNG
> device		ipmi
> device		smbios
> device		vpd
> device		aesni			# AES-NI OpenCrypto module
> device		ether			# Ethernet support
> device		lagg
> device		vlan			# 802.1Q VLAN support
> device		tuntap			# Packet tunnel.
> device		md			# Memory "disks"
> device		gif			# IPv6 and IPv4 tunneling
> device		firmware		# firmware assist module
>
> device		pf
> #device		pflog
> #device		pfsync
>
> # The `bpf' device enables the Berkeley Packet Filter.
> # Be aware of the administrative consequences of enabling this!
> # Note that 'bpf' is required for DHCP.
> device		bpf			# Berkeley packet filter
>
> # The `epair' device implements a virtual back-to-back connected Ethernet
> # like interface pair.
> device		epair
>
> # USB support
> options 	USB_DEBUG		# enable debug msgs
> device		uhci			# UHCI PCI->USB interface
> device		ohci			# OHCI PCI->USB interface
> device		ehci			# EHCI PCI->USB interface (USB 2.0)
> device		xhci			# XHCI PCI->USB interface (USB 3.0)
> device		usb			# USB Bus (required)
> device		uhid
> device		ukbd			# Keyboard
> device		umass			# Disks/Mass storage - Requires scbus and da
> device		ums
>
> device		filemon
>
> device		if_bridge
>
>> On 20 Nov 2020, at 12:53, Kristof Provost <kp at FreeBSD.org> wrote:
>>
>> Can you share your kernel config file (and src.conf / make.conf if 
>> they exist)?
>>
>> This second panic is in the IPSec code. My current thinking is that 
>> your kernel config is triggering a bug that’s manifesting in 
>> multiple places, but not actually caused by those places.
>>
>> I’d like to be able to reproduce it so we can debug it.
>>
>> Best regards,
>> Kristof
>>
>> On 20 Nov 2020, at 12:02, Peter Blok wrote:
>>> Hi Kristof,
>>>
>>> This is 12-stable. With the previous bridge epochification that was 
>>> backed out my config had a panic too.
>>>
>>> I don’t have any local modifications. I did a clean rebuild after 
>>> removing /usr/obj/usr
>>>
>>> My kernel is custom - I only have zfs.ko, opensolaris.ko, vmm.ko and 
>>> nmdm.ko as modules. Everything else is statically linked. I have 
>>> removed all drivers not needed for the hardware at hand.
>>>
>>> My bridge is between two vlans from the same trunk and the jail 
>>> epair devices as well as the bhyve tap devices.
>>>
>>> The panic happens when the jails are starting.
>>>
>>> I can try to narrow it down over the weekend and make the crash dump 
>>> available for analysis.
>>>
>>> Previously I had the following crash with 363492
>>>
>>> kernel trap 12 with interrupts disabled
>>>
>>>
>>> Fatal trap 12: page fault while in kernel mode
>>> cpuid = 2; apic id = 02
>>> fault virtual address	= 0xffffffff00000410
>>> fault code		= supervisor read data, page not present
>>> instruction pointer	= 0x20:0xffffffff80692326
>>> stack pointer	        = 0x28:0xfffffe00c06097b0
>>> frame pointer	        = 0x28:0xfffffe00c06097f0
>>> code segment		= base 0x0, limit 0xfffff, type 0x1b
>>> 			= DPL 0, pres 1, long 1, def32 0, gran 1
>>> processor eflags	= resume, IOPL = 0
>>> current process		= 2030 (ifconfig)
>>> trap number		= 12
>>> panic: page fault
>>> cpuid = 2
>>> time = 1595683412
>>> KDB: stack backtrace:
>>> #0 0xffffffff80698165 at kdb_backtrace+0x65
>>> #1 0xffffffff8064d67b at vpanic+0x17b
>>> #2 0xffffffff8064d4f3 at panic+0x43
>>> #3 0xffffffff809cc311 at trap_fatal+0x391
>>> #4 0xffffffff809cc36f at trap_pfault+0x4f
>>> #5 0xffffffff809cb9b6 at trap+0x286
>>> #6 0xffffffff809a5b28 at calltrap+0x8
>>> #7 0xffffffff803677fd at ck_epoch_synchronize_wait+0x8d
>>> #8 0xffffffff8069213a at epoch_wait_preempt+0xaa
>>> #9 0xffffffff807615b7 at ipsec_ioctl+0x3a7
>>> #10 0xffffffff8075274f at ifioctl+0x47f
>>> #11 0xffffffff806b5ea7 at kern_ioctl+0x2b7
>>> #12 0xffffffff806b5b4a at sys_ioctl+0xfa
>>> #13 0xffffffff809ccec7 at amd64_syscall+0x387
>>> #14 0xffffffff809a6450 at fast_syscall_common+0x101
>>>
>>>
>>>
>>>
>>>> On 20 Nov 2020, at 11:30, Kristof Provost <kp at FreeBSD.org> wrote:
>>>>
>>>> On 20 Nov 2020, at 11:18, peter.blok at bsd4all.org 
>>>> <mailto:peter.blok at bsd4all.org> wrote:
>>>>> I’m afraid the last Epoch fix for bridge is not solving the 
>>>>> problem ( or perhaps creates a new ).
>>>>>
>>>> We’re talking about the stable/12 branch, right?
>>>>
>>>>> This seems to happen when the jail epair is added to the bridge.
>>>>>
>>>> There must be something more to it than that. I’ve run the bridge 
>>>> tests on stable/12 without issue, and this is a problem we didn’t 
>>>> see when the bridge epochification initially went into stable/12.
>>>>
>>>> Do you have a custom kernel config? Other patches? What exact 
>>>> commands do you run to trigger the panic?
>>>>
>>>>> kernel trap 12 with interrupts disabled
>>>>>
>>>>>
>>>>> Fatal trap 12: page fault while in kernel mode
>>>>> cpuid = 6; apic id = 06
>>>>> fault virtual address	= 0xc10
>>>>> fault code		= supervisor read data, page not present
>>>>> instruction pointer	= 0x20:0xffffffff80695e76
>>>>> stack pointer	        = 0x28:0xfffffe00bf14e6e0
>>>>> frame pointer	        = 0x28:0xfffffe00bf14e720
>>>>> code segment		= base 0x0, limit 0xfffff, type 0x1b
>>>>> 			= DPL 0, pres 1, long 1, def32 0, gran 1
>>>>> processor eflags	= resume, IOPL = 0
>>>>> current process		= 1686 (jail)
>>>>> trap number		= 12
>>>>> panic: page fault
>>>>> cpuid = 6
>>>>> time = 1605811310
>>>>> KDB: stack backtrace:
>>>>> #0 0xffffffff8069bb85 at kdb_backtrace+0x65
>>>>> #1 0xffffffff80650a4b at vpanic+0x17b
>>>>> #2 0xffffffff806508c3 at panic+0x43
>>>>> #3 0xffffffff809d0351 at trap_fatal+0x391
>>>>> #4 0xffffffff809d03af at trap_pfault+0x4f
>>>>> #5 0xffffffff809cf9f6 at trap+0x286
>>>>> #6 0xffffffff809a98c8 at calltrap+0x8
>>>>> #7 0xffffffff80368a8d at ck_epoch_synchronize_wait+0x8d
>>>>> #8 0xffffffff80695c8a at epoch_wait_preempt+0xaa
>>>>> #9 0xffffffff80757d40 at vnet_if_init+0x120
>>>>> #10 0xffffffff8078c994 at vnet_alloc+0x114
>>>>> #11 0xffffffff8061e3f7 at kern_jail_set+0x1bb7
>>>>> #12 0xffffffff80620190 at sys_jail_set+0x40
>>>>> #13 0xffffffff809d0f07 at amd64_syscall+0x387
>>>>> #14 0xffffffff809aa1ee at fast_syscall_common+0xf8
>>>>
>>>> This panic is rather odd. This isn’t even the bridge code. This 
>>>> is during initial creation of the vnet. I don’t really see how 
>>>> this could even trigger panics.
>>>> That panic looks as if something corrupted the net_epoch_preempt, 
>>>> by overwriting the epoch->e_epoch. The bridge patches only access 
>>>> this variable through the well-established functions and macros. I 
>>>> see no obvious way that they could corrupt it.
>>>>
>>>> Best regards,
>>>> Kristof
>>
>>
>> _______________________________________________
>> freebsd-stable at freebsd.org mailing list
>> https://lists.freebsd.org/mailman/listinfo/freebsd-stable
>> To unsubscribe, send any mail to 
>> "freebsd-stable-unsubscribe at freebsd.org"


More information about the freebsd-stable mailing list