Arm64 stack issues (was Re: FreeBSD status for/on ODroid-C2?)

Mark Millard markmi at dsl-only.net
Sun Feb 5 09:12:33 UTC 2017


[Top post of a new result.]

Using lldb to look at the memory for the stack around
sh failure points shows some apparently fixed structure.
Example:

. . . junk values . . .
0xffffffffe4d0: 0x0000000000000078 0x637261612f737470
0xffffffffe4e0: 0x00000000004345c8 0x0000000000434000
0xffffffffe4f0: 0x0000000000434000 0x0000000040a903e0
0xffffffffe500: 0x0000ffffffffe540 0x000000004054cd94
0xffffffffe510: 0x696d6b72616d2f6c 0x0000000000000000
0xffffffffe520: 0x0000000000000000 0x0000000000000000
0xffffffffe530: 0x0000000000000000 0xe8021690dc1f70b8
0xffffffffe540: 0x00000000004345c8 0x0000000000434000
0xffffffffe550: 0x0000000000434000 0x000000000000000f
0xffffffffe560: 0x0000ffffffffe5a0 0x000000000041aef0
0xffffffffe570: 0x0000000000434c38 0x732f7273752f3a6e
0xffffffffe580: 0x0000000000000001 0x0000000000000005
0xffffffffe590: 0x0000000040a33180 0x0000000000000000
0xffffffffe5a0: 0x0000ffffffffc5c0 0x000000000040f490
. . .  junk values . . .

where "register read" showed:

        sp = 0x0000ffffffffe600

(The distance and direction to the last non-junk line
from the reported sp in each example is the same.)
Looking around that 0x000000000040f490:

    0x40f48c: 0x97fffc74   bl     0x40e65c                  ; freejob at jobs.c:463
    0x40f490: 0x9100c294   add    x20, x20, #0x30           ; =0x30 

It is the same address and code in each case.

Sometimes the junk values are all zeros over sizable
distances. Sometimes the sizable areas seem to have
random data.

/usr/src/bin/sh/jobs.c 's freejob is:

static void
freejob(struct job *jp)
{
        struct procstat *ps;
        int i;

        INTOFF;
        if (bgjob == jp)
                bgjob = NULL;
        for (i = jp->nprocs, ps = jp->ps ; --i >= 0 ; ps++) {
                if (ps->cmd != nullstr)
                        ckfree(ps->cmd);
        }
        if (jp->ps != &jp->ps0)
                ckfree(jp->ps);
        jp->used = 0;
#if JOBS
        deljob(jp);
#endif
        INTON;
}

/usr/src/bin/sh/error.h defines INTOFF and INTON:

#define EXINT 0         /* SIGINT received */
#define EXERROR 1       /* a generic error */
#define EXEXEC 2        /* command execution failed */
#define EXEXIT 3        /* call exitshell(exitstatus) */

. . .

extern struct jmploc *handler;
extern volatile sig_atomic_t exception;

. . .

extern volatile sig_atomic_t suppressint;
extern volatile sig_atomic_t intpending;

#define INTOFF suppressint++
#define INTON { if (--suppressint == 0 && intpending) onint(); }
#define is_int_on() suppressint
#define SETINTON(s) suppressint = (s)
#define FORCEINTON {suppressint = 0; if (intpending) onint();}
#define SET_PENDING_INT intpending = 1
#define CLEAR_PENDING_INT intpending = 0
#define int_pending() intpending

void exraise(int) __dead2;
void onint(void) __dead2;

/usr/src/bin/sh/error.c has:

void
exraise(int e)
{
        INTOFF;
        if (handler == NULL)
                abort();
        exception = e;
        longjmp(handler->loc, 1);
}
. . .
void
onint(void)
{
        sigset_t sigs;

        intpending = 0;
        sigemptyset(&sigs);
        sigprocmask(SIG_SETMASK, &sigs, NULL);

        /*
         * This doesn't seem to be needed, since main() emits a newline.
         */
#if 0
        if (tcgetpgrp(0) == getpid())
                write(STDERR_FILENO, "\n", 1);
#endif
        if (rootshell && iflag)
                exraise(EXINT);
        else {
                signal(SIGINT, SIG_DFL);
                kill(getpid(), SIGINT);
                _exit(128 + SIGINT);
        }
}

# grep setjmp /usr/src/bin/sh/*
/usr/src/bin/sh/TOUR:so I implement it using setjmp and longjmp.  The global variable
/usr/src/bin/sh/error.h:#include <setjmp.h>
/usr/src/bin/sh/error.h: * BSD setjmp saves the signal mask, which violates ANSI C and takes time,
/usr/src/bin/sh/error.h: * so we use _setjmp instead.
/usr/src/bin/sh/error.h:#define setjmp(jmploc)	_setjmp(jmploc)
/usr/src/bin/sh/eval.c:	if (setjmp(jmploc.loc)) {
/usr/src/bin/sh/eval.c:	if (setjmp(jmploc.loc))
/usr/src/bin/sh/eval.c:		if (setjmp(jmploc.loc)) {
/usr/src/bin/sh/eval.c:		if (setjmp(jmploc.loc)) {
/usr/src/bin/sh/eval.c:		if (setjmp(jmploc.loc)) {
/usr/src/bin/sh/histedit.c:		if (setjmp(jmploc.loc)) {
/usr/src/bin/sh/jobs.c:		if (setjmp(jmploc.loc))
/usr/src/bin/sh/main.c: * commands.  The setjmp call sets up the location to jump to when an
/usr/src/bin/sh/main.c:	if (setjmp(main_handler.loc)) {
/usr/src/bin/sh/parser.c:	if (setjmp(jmploc.loc)) {
/usr/src/bin/sh/parser.c:	if (!setjmp(jmploc.loc)) {
/usr/src/bin/sh/trap.c:	if (!setjmp(loc1.loc)) {
/usr/src/bin/sh/trap.c:	if (!setjmp(loc2.loc)) {
/usr/src/bin/sh/var.c:	if (setjmp(jmploc.loc))



Other notes:

As a personal investigation I've temporarily changed to using
something not fully generic but based on gic-400 specifics:

# svnlite diff /usr/src/sys/arm/arm/gic.c
Index: /usr/src/sys/arm/arm/gic.c
===================================================================
--- /usr/src/sys/arm/arm/gic.c  (revision 312982)
+++ /usr/src/sys/arm/arm/gic.c  (working copy)
@@ -672,9 +672,13 @@
 
        if (irq >= sc->nirqs) {
 #ifdef GIC_DEBUG_SPURIOUS
+#define EXPECTED_SPURIOUS_IRQ 1023
+           if (irq != EXPECTED_SPURIOUS_IRQ) {
                device_printf(sc->gic_dev,
-                   "Spurious interrupt detected: last irq: %d on CPU%d\n",
+                   "Spurious interrupt %d detected of %d: last irq: %d on CPU%d\n",
+                   irq, sc->nirqs,
                    sc->last_irq[PCPU_GET(cpuid)], PCPU_GET(cpuid));
+            }
 #endif
                return (FILTER_HANDLED);
        }
@@ -720,6 +724,16 @@
        if (irq < sc->nirqs)
                goto dispatch_irq;
 
+       if (irq != EXPECTED_SPURIOUS_IRQ) {
+#undef EXPECTED_SPURIOUS_IRQ
+#ifdef GIC_DEBUG_SPURIOUS
+               device_printf(sc->gic_dev,
+                   "Spurious end interrupt %d detected of %d: last irq: %d on CPU%d\n",
+                   irq, sc->nirqs,
+                   sc->last_irq[PCPU_GET(cpuid)], PCPU_GET(cpuid));
+#endif
+       }
+
        return (FILTER_HANDLED);
 }
 

The result was that no notices of Spurious interrupts have been generated:
All of the odd interrupts were the special 1023 value.

[As far as I could tell from the code, the configuration is such that
1022 should not be generated --and it was not. 1020 and 1021 are
reserved and should not be generated.]





===
Mark Millard
markmi at dsl-only.net

On 2017-Feb-2, at 12:37 AM, Mark Millard <markmi at dsl-only.net> wrote:

I'm out of my element here but I will note one difference
between what I read in the likes of:

http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf

and what I see in /usr/src/sys/arm/arm/gic.c 's arm_gic_intr .


The written description says (quoting 10.6 The Generic Interrupt
Controller and 10.6.3 Interrupt Handling):

> Interrupts can either be edge-triggered (considered to be asserted when the GIC detects a rising edge on the relevant input, and to remain asserted until cleared) or level-sensitive (considered to be asserted only when the relevant input to the GIC is HIGH).
> 
> . . .
> 
> The priority and list of cores to which an interrupt can be delivered to are all configured in the Distributor. An interrupt asserted to the Distributor by a peripheral is in the Pending state (or Active and Pending if it was already Active). The Distributor determines the highest priority pending interrupt that can be delivered to a core and forwards that to the CPU interface of the core. At the CPU interface, the interrupt is in turn signaled to the core, at which point the core takes the FIQ or IRQ exception.
> 
> The core executes the exception handler in response. The handler must query the interrupt ID from a CPU interface register and begin servicing the interrupt source. When finished, the handler must write to a CPU interface register to report the end of processing.
> 
> 	• For a given interrupt the typical sequence is:
> 
> 		• Inactive -> Pending
> When the interrupt is asserted by the peripheral.
> 
> 		• Pending -> Active
> When the handler acknowledges the interrupt.
> 
> 		• Active -> Inactive
> When the handle[r] has finished dealing with the interrupt.
> 
> . . .
> 
> The top-level interrupt handler reads the Interrupt Acknowledge Register from the CPU Interface block to obtain the interrupt ID.
> 
> As well as returning the interrupt ID, the read causes the interrupt to be marked as active in the Distributor. Once the interrupt ID is known (identifying the interrupt source), the top-level handler can now dispatch a device-specific handler to service the interrupt.
> 
> When the device-specific handler finishes execution, the top-level handler writes the same interrupt ID to the End of Interrupt (EoI) register in the CPU Interface block, indicating the end of interrupt processing.


So that wording indicates that the write to GICC_EOIR should
be after the dispatched activity (after "servicing"), not
before. I did not find anything indicating that edge-triggered
vs. level triggered would be different for this, for example.
(But being unfamiliar I could have missed something.)


In two cases below the code has the write to the GICC_EOIR
before the dispatch (so before the servicing activity),
possibly allowing another interrupt during or even before
the dispatched activity (say if the state for the irq is
active-and-pending at the time of the GICC_EOIR write or
if there is a lower priority interrupt pending at that
time):


>        if (irq <= GIC_LAST_SGI) {
> #ifdef SMP
>                /* Call EOI for all IPI before dispatch. */
>                gic_c_write_4(sc, GICC_EOIR, irq_active_reg);
>                intr_ipi_dispatch(sgi_to_ipi[gi->gi_irq], tf);
>                goto next_irq;
> #else
> . . .
> #endif
>        }
> 
> . . .
>        if ((gi->gi_flags & GI_FLAG_EARLY_EOI) == GI_FLAG_EARLY_EOI)
>                gic_c_write_4(sc, GICC_EOIR, irq_active_reg);
> 
>        if (intr_isrc_dispatch(&gi->gi_isrc, tf) != 0) {
>                gic_irq_mask(sc, irq);
>                if ((gi->gi_flags & GI_FLAG_EARLY_EOI) != GI_FLAG_EARLY_EOI)
>                        gic_c_write_4(sc, GICC_EOIR, irq_active_reg);
>                device_printf(sc->gic_dev, "Stray irq %u disabled\n", irq);
>        }
> 
> next_irq:
. . .



Note: GI_FLAG_EARLY_EOI was set for edge triggered:

>        /* For MSI/MSI-X we should have already configured these */
>        if ((gi->gi_flags & GI_FLAG_MSI) == 0) {
>                if (pol == INTR_POLARITY_CONFORM)
>                        pol = INTR_POLARITY_LOW;        /* just pick some */
>                if (trig == INTR_TRIGGER_CONFORM)
>                        trig = INTR_TRIGGER_EDGE;       /* just pick some */
> 
>                gi->gi_pol = pol;
>                gi->gi_trig = trig;
> 
>                /* Edge triggered interrupts need an early EOI sent */
>                if (gi->gi_pol == INTR_TRIGGER_EDGE)
>                        gi->gi_flags |= GI_FLAG_EARLY_EOI;
>        }



===
Mark Millard
markmi at dsl-only.net


On 2017-Feb-1, at 7:07 PM, Mark Millard <markmi at dsl-only.net> wrote:

> I temporarily modified the Spurious-interrupt-detected notice to also report
> irq and sc->nirqs :
> 
> 
> . . .
> #define gic_c_read_4(_sc, _reg)         \
>   bus_space_read_4((_sc)->gic_c_bst, (_sc)->gic_c_bsh, (_reg))
> 
> . . .
> int
> arm_gic_intr(void *arg)
> {
>       struct arm_gic_softc *sc = arg;
>       struct gic_irqsrc *gi;
>       uint32_t irq_active_reg, irq;
>       struct trapframe *tf;
> 
>       irq_active_reg = gic_c_read_4(sc, GICC_IAR);
>       irq = irq_active_reg & 0x3FF;
> 
>       /*
> . . .
>        */
> 
>       if (irq >= sc->nirqs) {
> #ifdef GIC_DEBUG_SPURIOUS
>               device_printf(sc->gic_dev,
>                   "Spurious interrupt %d detected of %d: last irq: %d on CPU%d\n",
>                   irq, sc->nirqs,
>                   sc->last_irq[PCPU_GET(cpuid)], PCPU_GET(cpuid));
> #endif
>               return (FILTER_HANDLED);
>       }
> . . .
> 
> 
> The result was irq==1023 and sc->nirqs==224 in every message
> that I've seen so far. 1023==0x3FF .
> 
> Looking around I found in:
> 
> http://www.cl.cam.ac.uk/research/srg/han/ACS-P35/zynq/arm_gic_architecture_specification.pdf
> 
> the following on various reasons why 1023 would show up (quoting):
> 
> 
> 
> 	• A processor reads the GICC_IAR and obtains the interrupt ID 1023, indicating a spurious interrupt. The processor can return from its interrupt service routine without writing to its GICC_EOIR.
> 
> The spurious interrupt ID indicates that the original interrupt is no longer pending, typically because another target processor is handling it.
> 
> . . .
> 
> The GIC architecture reserves interrupt ID numbers 1020-1023 for special purposes. In a GICv1 implementation that does not implement the GIC Security Extensions, the only one of these used is ID 1023. This value is returned to a processor, in response to an interrupt acknowledge, if there is no pending interrupt with sufficient priority for it to be signaled to the processor. It is described as a response to a spurious interrupt.
> 
> Note
> 
> A race condition can cause a spurious interrupt. For example, a spurious interrupt can occur if a processor writes a 1 to a field in an GICD_ICENABLERn that corresponds to a pending interrupt after the CPU interface has signaled the interrupt to the processor and the processor has recognized the interrupt, but before the processor has read from the GICC_IAR.
> 
> . . .
> 
> 	• If a read of the GICC_IAR does not match the security of the interrupt, the GICC_IAR read does not acknowledge any interrupt and returns the value:
> 
> 		• 1022 for a Secure read when the highest priority interrupt is Non-secure
> 
> 		• 1023 for a Non-secure read when the highest priority interrupt is Secure.
> . . .
> 
> A read of the GICC_IAR returns the interrupt ID of the highest priority pending interrupt for the CPU interface. The read returns a spurious interrupt ID of 1023 if any of the following apply:
> 
> 	• forwarding of interrupts by the Distributor to the CPU interface is disabled
> 
> 	• signaling of interrupts by the CPU interface to the connected processor is disabled
> 
> 	• no pending interrupt on the CPU interface has sufficient priority for the interface to signal it to the processor.
> 
> 
> 	• The following sequence of events is an example of when the GIC returns an interrupt ID of 1023, and shows how reads of the GICC_IAR can be timing critical:
> 
> 1. A peripheral asserts a level-sensitive interrupt.
> 
> 2. The interrupt has sufficient priority and therefore the GIC signals it to a targeted processor.
> 
> 3. The peripheral deasserts the interrupt. Because there is no other pending interrupt of sufficient priority, the GIC deasserts the interrupt request to the processor.
> 
> 4. Before it has recognized the deassertion of the interrupt request from stage 3, the targeted processor reads the GICC_IAR. Because there is no interrupt with sufficient priority to signal to the processor, the GIC returns the spurious ID value of 1023.
> 
> 
> The determination of the returned interrupt ID is more complex if the GIC supports interrupt grouping
> 
> . . .
> 
> 
> Interrupt signaling of the required interrupt group by CPU interface disabled
> 
> 
> 
> ===
> Mark Millard
> markmi at dsl-only.net

_______________________________________________
freebsd-arm at freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/freebsd-arm
To unsubscribe, send any mail to "freebsd-arm-unsubscribe at freebsd.org"

_______________________________________________
freebsd-arm at freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/freebsd-arm
To unsubscribe, send any mail to "freebsd-arm-unsubscribe at freebsd.org"



More information about the freebsd-arm mailing list