svn commit: r362294 - head/sys/dev/sound/pci/hda

Sun Jun 21 21:59:54 UTC 2020

On Thu, Jun 18, 2020 at 06:12:06AM +0000, Andriy Gapon wrote:
> Author: avg
> Date: Thu Jun 18 06:12:06 2020
> New Revision: 362294
> URL: https://svnweb.freebsd.org/changeset/base/362294
> 
> Log:
>   hdac_intr_handler: keep working until global interrupt status clears
>   
>   It is plausible that the hardware interrupts a host only when GIS goes
>   from zero to one.  GIS is formed by OR-ing multiple hardware statuses,
>   so it's possible that a previously cleared status gets set again while
>   another status has not been cleared yet.  Thus, there will be no new
>   interrupt as GIS always stayed set.  If we don't re-examine GIS then we
>   can leave it set and never get another interrupt again.
>   
>   Without this change I frequently saw a problem where snd_hda would stop
>   working.  Setting dev.hdac.1.polling=1 would bring it back to life and
>   afterwards I could set polling back to zero.  Sometimes the problem
>   started right after a boot, sometimes it happened after resuming from
>   S3, frequently it would occur when sound output and input are active
>   concurrently (such as during conferencing).  I looked at HDAC_INTSTS
>   while the sound was not working and I saw that both HDAC_INTSTS_GIS and
>   HDAC_INTSTS_CIS were set, but there were no interrupts.
>   
>   I have collected some statistics over a period of several days about how
>   many loops (calls to hdac_one_intr) the new code did for a single
>   interrupt:
>   +--------+--------------+
>   |Loops   |Times Happened|
>   +--------+--------------+
>   |0       |301           |
>   |1       |12857746      |
>   |2       |280           |
>   |3       |2             |
>   |4+      |0             |
>   +--------+--------------+
>   I believe that previously the sound would get stuck each time we had to loop
>   more than once.
>   
>   The tested hardware is:
>   hdac1: <AMD (0x15e3) HDA Controller> mem 0xfe680000-0xfe687fff at device 0.6 on pci4
>   hdacc1: <Realtek ALC269 HDA CODEC> at cad 0 on hdac1
>   
>   No objections:	mav
>   MFC after:	5 weeks
>   Differential Revision: https://reviews.freebsd.org/D25128
> 
> Modified:
>   head/sys/dev/sound/pci/hda/hdac.c
> 
> Modified: head/sys/dev/sound/pci/hda/hdac.c
> ==============================================================================
> --- head/sys/dev/sound/pci/hda/hdac.c	Wed Jun 17 23:41:04 2020	(r362293)
> +++ head/sys/dev/sound/pci/hda/hdac.c	Thu Jun 18 06:12:06 2020	(r362294)
> @@ -303,30 +303,13 @@ hdac_config_fetch(struct hdac_softc *sc, uint32_t *on,
>  	}
>  }
>  
> -/****************************************************************************
> - * void hdac_intr_handler(void *)
> - *
> - * Interrupt handler. Processes interrupts received from the hdac.
> - ****************************************************************************/
>  static void
> -hdac_intr_handler(void *context)
> +hdac_one_intr(struct hdac_softc *sc, uint32_t intsts)
>  {
> -	struct hdac_softc *sc;
>  	device_t dev;
> -	uint32_t intsts;
>  	uint8_t rirbsts;
>  	int i;
>  
> -	sc = (struct hdac_softc *)context;
> -	hdac_lock(sc);
> -
> -	/* Do we have anything to do? */
> -	intsts = HDAC_READ_4(&sc->mem, HDAC_INTSTS);
> -	if ((intsts & HDAC_INTSTS_GIS) == 0) {
> -		hdac_unlock(sc);
> -		return;
> -	}
> -
>  	/* Was this a controller interrupt? */
>  	if (intsts & HDAC_INTSTS_CIS) {
>  		rirbsts = HDAC_READ_1(&sc->mem, HDAC_RIRBSTS);
> @@ -346,16 +329,45 @@ hdac_intr_handler(void *context)
>  			if ((intsts & (1 << i)) == 0)
>  				continue;
>  			HDAC_WRITE_1(&sc->mem, (i << 5) + HDAC_SDSTS,
> -			    HDAC_SDSTS_DESE | HDAC_SDSTS_FIFOE | HDAC_SDSTS_BCIS );
> +			    HDAC_SDSTS_DESE | HDAC_SDSTS_FIFOE | HDAC_SDSTS_BCIS);
>  			if ((dev = sc->streams[i].dev) != NULL) {
>  				HDAC_STREAM_INTR(dev,
>  				    sc->streams[i].dir, sc->streams[i].stream);
>  			}
>  		}
>  	}
> +}
>  
> -	HDAC_WRITE_4(&sc->mem, HDAC_INTSTS, intsts);
> -	hdac_unlock(sc);
> +/****************************************************************************
> + * void hdac_intr_handler(void *)
> + *
> + * Interrupt handler. Processes interrupts received from the hdac.
> + ****************************************************************************/
> +static void
> +hdac_intr_handler(void *context)
> +{
> +	struct hdac_softc *sc;
> +	uint32_t intsts;
> +
> +	sc = (struct hdac_softc *)context;
> +
> +	/*
> +	 * Loop until HDAC_INTSTS_GIS gets clear.
> +	 * It is plausible that hardware interrupts a host only when GIS goes
> +	 * from zero to one.  GIS is formed by OR-ing multiple hardware
> +	 * statuses, so it's possible that a previously cleared status gets set
> +	 * again while another status has not been cleared yet.  Thus, there
> +	 * will be no new interrupt as GIS always stayed set.  If we don't
> +	 * re-examine GIS then we can leave it set and never get an interrupt
> +	 * again.
> +	 */
> +	intsts = HDAC_READ_4(&sc->mem, HDAC_INTSTS);
> +	while ((intsts & HDAC_INTSTS_GIS) != 0) {
> +		hdac_lock(sc);
> +		hdac_one_intr(sc, intsts);
> +		hdac_unlock(sc);
> +		intsts = HDAC_READ_4(&sc->mem, HDAC_INTSTS);
> +	}
>  }
>  
>  static void
This commit (or rather, a merge of this commit to stable/12) causes an issue
on my Apollo Lake machine.  Look:
hdac0 at pci0:0:14:0:   class=0x040300 card=0xa1821458 chip=0x5a988086 rev=0x0b hdr=0x00
    vendor     = 'Intel Corporation'
    device     = 'Celeron N3350/Pentium N4200/Atom E3900 Series Audio Cluster'
    class      = multimedia
    subclass   = HDA
    bar   [10] = type Memory, range 64, base 0x91410000, size 16384, enabled
    bar   [20] = type Memory, range 64, base 0x91000000, size 1048576, enabled
    cap 01[50] = powerspec 3  supports D0 D3  current D0
    cap 09[80] = vendor (length 20) Intel cap 15 version 0
    cap 05[60] = MSI supports 1 message, 64 bit

DMAP base is 0xfffff80000000000.
tom% sudo /usr/local/bin/kgdb ~/work/DEV/svn/RELENG_12/sys/amd64/compile/TOM/kernel.full /dev/mem
GNU gdb (GDB) 9.2 [GDB v9.2 for FreeBSD]
Reading symbols from /usr/home/kostik/work/DEV/svn/RELENG_12/sys/amd64/compile/TOM/kernel.full...
...
sched_switch (td=0xfffff8029d516000, newtd=0xfffff800025f4000,
    flags=<optimized out>) at ../../../kern/sched_ule.c:2143
2143                    cpuid = PCPU_GET(cpuid);
(kgdb) p/x *(unsigned int *)(0xfffff80000000000+0x91410000+0x24) INTSTS
$1 = 0xc0000000
This causes the first interrupt handler to loop forever.

And this is why:
(kgdb) p/x *(unsigned char *)(0xfffff80000000000+0x91410000+0x5d) RIRBSTS
$3 = 0x0
(kgdb) p/x *(unsigned char *)(0xfffff80000000000+0x91410000+0x4d) CORBSTS
$4 = 0x0
(kgdb) p/x *(unsigned short *)(0xfffff80000000000+0x91410000+0xe) WAKESTS
$5 = 0x5
So the SDIN State Change status bit is set, and nothing in the driver
clears it, it seems.

More, the EDS specifies that another reason for INTSTS CIS bit set may
be the CORB Memory Error Indication (CMEI), and again it seems that driver
never clears the reason.

So as is the change perhaps helps for some situations, but also
practically makes some systems to loose core in tight loop with interrupt
priority, which causes weird machine misbehavior like unkillable processes
and never finishing epochs.