Project

General

Profile

Bug #12738

Panic in vlapic_callout_handler

Added by Patrick Mooney 8 months ago. Updated 8 months ago.

Status:
Closed
Priority:
Normal
Category:
kernel
Start date:
Due date:
% Done:

100%

Estimated time:
Difficulty:
Medium
Tags:
bhyve
Gerrit CR:

Description

Upstreaming OS-8076 from SmartOS:

Customer reported a panic:

panic message: vlapic callout at 959897.26e71f8da24c1409, expected at 95a897.26e61a1a432ef5b3

From @mike.gerdts

> $C
fffffcc269588a70 vpanic()
fffffcc269588ab0 0xfffffffff8469c98()
fffffcc269588ad0 vmm_glue_callout_handler+0x2f(fffffe2636f07350)
fffffcc269588b70 cyclic_softint+0xfd(fffffe235e4be000, 0)
fffffcc269588b80 cbe_low_level+0x14()
fffffcc269588bd0 av_dispatch_softvect+0x78(2)
fffffcc269588c00 apix_dispatch_softint+0x35(0, 0)
fffffcc26b7641c0 switch_sp_and_call+0x13()
fffffcc26b764210 apix_do_softint+0x6c(fffffffff792f5a7)
fffffcc26b764270 0xfffffe235e4be000()
fffffcc26b764560 tsc_scalehrtime+0x47(fffffffffb864a9b)
fffffcc26b7645a0 0xfffffe2bd9d365c0()
fffffcc26b7645d0 savectx+0x3d(fffffe25ff7a2780)
fffffcc26b7645e0 resume+0x70()
fffffcc26b764610 swtch+0x141()
fffffcc26b764630 preempt+0xec()
fffffcc26b764660 kpreempt+0x97(ffffffff)
fffffcc26b764670 critical_exit+0x42()
fffffcc26b764760 vm_run+0x285(fffffe238f22e000, fffffcc26b764ae0)
fffffcc26b764c20 vmmdev_do_ioctl+0x775(fffffe37a82d5480, c0907601, fffffbffea5ffe00, 202003, fffffe260890e8a0, fffffcc26b764ea8)
fffffcc26b764cc0 vmm_ioctl+0x12f(13100000002, c0907601, fffffbffea5ffe00, 202003, fffffe260890e8a0, fffffcc26b764ea8)
fffffcc26b764d00 cdev_ioctl+0x39(13100000002, c0907601, fffffbffea5ffe00, 202003, fffffe260890e8a0, fffffcc26b764ea8)
fffffcc26b764d50 spec_ioctl+0x60(fffffe28f04dc080, c0907601, fffffbffea5ffe00, 202003, fffffe260890e8a0, fffffcc26b764ea8, 0)
fffffcc26b764de0 fop_ioctl+0x55(fffffe28f04dc080, c0907601, fffffbffea5ffe00, 202003, fffffe260890e8a0, fffffcc26b764ea8, 0)
fffffcc26b764f00 ioctl+0x9b(3, c0907601, fffffbffea5ffe00)
fffffcc26b764f10 sys_syscall+0x19f()
742 static void
 743 vlapic_callout_handler(void *arg)
 744 {
...
 762         if (vlapic_periodic_timer(vlapic)) {
 763                 binuptime(&btnow);
 764 #ifdef  __FreeBSD__
... (just a format string difference from below)
 769 #else
 770                 KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=),
 771                     ("vlapic callout at %lx.%lx, expected at %lx.%lx",
 772                     btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec,
 773                     vlapic->timer_fire_bt.frac));
 774 #endif
 775
 776                 /*
 777                  * Compute the delta between when the timer was supposed to
 778                  * fire and the present time.
 779                  */
 780                 bt = btnow;
 781                 bintime_sub(&bt, &vlapic->timer_fire_bt);
 782
 783                 rem_sbt = bttosbt(vlapic->timer_period_bt);
 784                 if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) {
 785                         /*
 786                          * Adjust the time until the next countdown downward
 787                          * to account for the lost time.
 788                          */
 789                         rem_sbt -= bttosbt(bt);
 790                 } else {
 791                         /*
 792                          * If the delta is greater than the timer period then
 793                          * just reset our time base instead of trying to catch
 794                          * up.
 795                          */
 796                         vlapic->timer_fire_bt = btnow;
 797                         VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu " 
 798                             "usecs, period is %lu usecs - resetting time base",
 799                             bttosbt(bt) / SBT_1US,
 800                             bttosbt(vlapic->timer_period_bt) / SBT_1US);
 801                 }

@john.levon noticed that the seconds portion was off by 4096. We'll come to that.
Unfortunately rdi is somewhere in the middle of the panic string. I found a promising pointer in %r12.

> 0xfffffe2636f07300::print -at struct vlapic
fffffe2636f07300 struct vlapic {
    fffffe2636f07300 struct vm *vm = 0xfffffe238f22e000
    fffffe2636f07308 int vcpuid = 0x1
    fffffe2636f07310 struct LAPIC *apic_page = 0xffffffc3378bd000
    fffffe2636f07318 struct vlapic_ops ops = {
        fffffe2636f07318 int (*)() set_intr_ready = vmx_set_intr_ready
        fffffe2636f07320 int (*)() pending_intr = vmx_pending_intr
        fffffe2636f07328 void (*)() intr_accepted = vmx_intr_accepted
        fffffe2636f07330 void (*)() post_intr = vmx_post_intr
        fffffe2636f07338 void (*)() set_tmr = vmx_set_tmr
        fffffe2636f07340 void (*)() enable_x2apic_mode = vmx_enable_x2apic_mode
    }
    fffffe2636f07348 uint32_t esr_pending = 0
    fffffe2636f0734c int esr_firing = 0
    fffffe2636f07350 struct callout callout = {
        fffffe2636f07350 cyclic_id_t c_cyc_id = 0xfffffe957204baf8
        fffffe2636f07358 int c_flags = 0
        fffffe2636f07360 void (*)() c_func = vlapic_callout_handler
        fffffe2636f07368 void *c_arg = 0xfffffe2636f07300
    }
    fffffe2636f07370 struct bintime timer_fire_bt = {
        fffffe2636f07370 ulong_t sec = 0x95a897
        fffffe2636f07378 uint64_t frac = 0x26e61a1a432ef5b3
    }
    fffffe2636f07380 struct bintime timer_freq_bt = {
        fffffe2636f07380 ulong_t sec = 0
        fffffe2636f07388 uint64_t frac = 0x100000000000
    }
    fffffe2636f07390 struct bintime timer_period_bt = {
        fffffe2636f07390 ulong_t sec = 0xfff
        fffffe2636f07398 uint64_t frac = 0xfffff00000000000
    }
    fffffe2636f073a0 struct mtx timer_mtx = {
        fffffe2636f073a0 kmutex_t m = {
            fffffe2636f073a0 void *[1] _opaque = [ 0xfffffcc269588c20 ]
        }
    }
    fffffe2636f073a8 uint8_t [17] isrvec_stk = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]
    fffffe2636f073bc int isrvec_stk_top = 0
    fffffe2636f073c0 uint64_t msr_apicbase = 0xfee00800
    fffffe2636f073c8 enum boot_state boot_state = 0x2 (BS_RUNNING)
    fffffe2636f073cc uint32_t svr_last = 0x1df
    fffffe2636f073d0 uint32_t [7] lvt_last = [ 0x100d8, 0x184d8, 0x200fd, 0xe2, 0xfe, 0x10000, 0x10000 ]
}

That looks pretty legit - it even points to itself and is holding the mutex in that structure.
In vlapic_callout_handler() there is:

 803                 bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);

Notice that timer_period_bt.t_sec is 4095 and t_frac is such that bintime_add() is likely to add another second. I'm not sure what to make of that.

Digging further w/ @mike.zeller,

    fffffe2636f07350 struct callout callout = {
        fffffe2636f07350 cyclic_id_t c_cyc_id = 0xfffffe957204baf8
        fffffe2636f07358 int c_flags = 0
        fffffe2636f07360 void (*)() c_func = vlapic_callout_handler
        fffffe2636f07368 void *c_arg = 0xfffffe2636f07300
    }
> 0xfffffe957204baf8::cycid
            ADDR  CPU NDX           CYCLIC HANDLER
fffffe957204baf8   30  30 fffffe36415c14b0 vmm_glue_callout_handler
> fffffe36415c14b0::cyclic
            ADDR LEVL  PEND FLAGS            FIRE USECINT HANDLER
fffffe36415c14b0  low     1  0000  22d85857ebb35e 9223372036854775 vmm_glue_callout_handler
> panic_hrtime/J
panic_hrtime:
panic_hrtime:   22d49eab4c3689
> 22d85857ebb35e-22d49eab4c3689
                3b9ac9f7cd5
> 3b9ac9f7cd5%0t1000000000=D
                4095

So the cyclic appears to have fired too early, though that seems incredibly unlikely.

Having familiarized ourself with the cyclic code, the FreeBSD callout shim, and the vlapic code, and a generous amount of head scratching, a plausible scenario emerged:

First, a few facts about the cyclic system and the FreeBSD callout comparability shim layer:

- All cyclics used by the FreeBSD callout shim always have a period of CY_INFINITY, that is, they are always one-shot cyclics. The FreeBSD callout API does not have the notion of a periodic timer, so there is no way to pass along such an intent through the FreeBSD callout shim to the cyclic subsystem (which does). As a result, consumers of the FBSD shim interface must reprogram the callout on each invocation to affect a periodic timer.
- All cyclic used by the FreeBSD callout shim use low-level cyclics.
- When a low-level cyclic fires, it's cy_pend value is incremented, and is queued to run in the cyclic softint handler.
- After enqueing in the softint handler, a cyclic's expiration time is updated. For a periodic cyclic this is the new time to fire, for a one shot timer, this is set to CY_INFINITY.
- When the cyclic softint processes a low level cyclic, it decrements the cyclic's cy_pend value after the handler has returned.

From this, looking at the above cyclic, we see that the cyclic interval (USECINT) is CY_INFINITY, but the expiration time FIRE is not CY_INFINITY, and we have one event pending. From the above facts, the only way this can occur is if the cyclic fires and then it is reprogrammed while the handler is still running.

Now looking at the FreeBSD callout shim code, the functions are the most interesting:

static void
vmm_glue_callout_handler(void *arg)
{
    struct callout *c = arg;

    c->c_flags &= ~CALLOUT_PENDING;
    if (c->c_flags & CALLOUT_ACTIVE) {
        (c->c_func)(c->c_arg);
    }
}

...

int
vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, sbintime_t pr,
    void (*func)(void *), void *arg, int flags)
{
    hrtime_t target = sbttohrtime(sbt);

    ASSERT(c->c_cyc_id != CYCLIC_NONE);

    c->c_func = func;
    c->c_arg = arg;
    c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING);

    if (flags & C_ABSOLUTE) {
        cyclic_reprogram(c->c_cyc_id, target);
    } else {
        cyclic_reprogram(c->c_cyc_id, target + gethrtime());
    }

    return (0);
}

And from vlapic.c:

static void
vlapic_callout_handler(void *arg)
{
    struct vlapic *vlapic;
    struct bintime bt, btnow;
    sbintime_t rem_sbt;

    vlapic = arg;

    VLAPIC_TIMER_LOCK(vlapic);
    if (callout_pending(&vlapic->callout))    /* callout was reset */
        goto done;

    if (!callout_active(&vlapic->callout))    /* callout was stopped */
        goto done;

    callout_deactivate(&vlapic->callout);

    vlapic_fire_timer(vlapic);

    if (vlapic_periodic_timer(vlapic)) {
        binuptime(&btnow);
#ifdef    __FreeBSD__
        KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=),
            ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx",
            btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec,
            vlapic->timer_fire_bt.frac));
#else
        KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=),
            ("vlapic callout at %lx.%lx, expected at %lx.%lx",
            btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec,
            vlapic->timer_fire_bt.frac));
#endif

        /*
         * Compute the delta between when the timer was supposed to
         * fire and the present time.
         */
        bt = btnow;
        bintime_sub(&bt, &vlapic->timer_fire_bt);

        rem_sbt = bttosbt(vlapic->timer_period_bt);
        if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) {
            /*
             * Adjust the time until the next countdown downward
             * to account for the lost time.
             */
            rem_sbt -= bttosbt(bt);
        } else {
            /*
             * If the delta is greater than the timer period then
             * just reset our time base instead of trying to catch
             * up.
             */
            vlapic->timer_fire_bt = btnow;
            VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu " 
                "usecs, period is %lu usecs - resetting time base",
                bttosbt(bt) / SBT_1US,
                bttosbt(vlapic->timer_period_bt) / SBT_1US);
        }

        bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
        callout_reset_sbt(&vlapic->callout, rem_sbt, 0,
            vlapic_callout_handler, vlapic, 0);
    }
done:
    VLAPIC_TIMER_UNLOCK(vlapic);
}

void
vlapic_icrtmr_write_handler(struct vlapic *vlapic)
{
    struct LAPIC *lapic;
    sbintime_t sbt;
    uint32_t icr_timer;

    VLAPIC_TIMER_LOCK(vlapic);

    lapic = vlapic->apic_page;
    icr_timer = lapic->icr_timer;

    vlapic->timer_period_bt = vlapic->timer_freq_bt;
    bintime_mul(&vlapic->timer_period_bt, icr_timer);

    if (icr_timer != 0) {
        binuptime(&vlapic->timer_fire_bt);
        bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);

        sbt = bttosbt(vlapic->timer_period_bt);
        callout_reset_sbt(&vlapic->callout, sbt, 0,
            vlapic_callout_handler, vlapic, 0);
    } else
        callout_stop(&vlapic->callout);

    VLAPIC_TIMER_UNLOCK(vlapic);
}

(vlapic_icrtmr_write_handler is what runs when a guest reprograms the vlapic to reset the callout timer).

Note that whenever the vlapic timer is reprogrammed, timer_fire_bt is first set to the new value, then the vlapic's callout is reprogrammed with the new expiration time.

We noticed that while both calls to callout_reset_sbt happen to occur with the vlapic lock held, c_flags is being modified by vmm_glue_callout_handler without any locks held! That means the following scenario is possible:

vlapic timer lock acquired
vlapic timer_fire_bt is set to new expiration time
callout_reset_sbt is called to program callout
CALLOUT_PENDING is set by callout_reset_sbt
        vlapic cyclic fires
        vmm_glue_callout_handler clears CALLOUT_PENDING
        vlapic_callout_handler is called
        vlapic_callout_handler blocks waiting for the vlapic timer lock
callout cyclic is reprogrammed
vlapic timer lock is released
        vlapic checks CALLOUT_PENDING flag, sees that it is clear and proceeds
        vlapic generates interrupt to vcpu
        vlapic_callout_handler sees _new_ expiration time, thinks it has fired early and panics

An initial attempt to recreate the panic:

#!/usr/sbin/dtrace -ws

vmm_glue_callout_reset_sbt:entry { self->trace = 1 }
vmm_glue_callout_reset_sbt:return { self->trace = 0 }
cyclic_reprogram:entry /self->trace/ { chill(1000) }

Initial attempt hasn't been successful so far. Part of the difficulity is figuring out a guest that triggers the necessary behavior. So far I have noticed that with at least some versions of Linux, it only uses the APIC timer (and reprograms it) during the boot process and doesn't seem to use the timer functionality afterwards – at least I haven't found a way to reliably force it to use that after boot.

#1

Updated by Patrick Mooney 8 months ago

As noted in the SmartOS ticket, reproducing this issue proved difficult. Only one of their customers reported seeing it, and infrequently enough that they didn't press for a fix. The proposed solution looks like it should address the assumed circumstances which lead to the panics. It did not make the situation worse running existing bhyve workloads.

#2

Updated by Electric Monk 8 months ago

  • Status changed from New to Closed
  • % Done changed from 0 to 100

git commit ee8ae3fa63afd7fd57d5e63676a991af0fb8d887

commit  ee8ae3fa63afd7fd57d5e63676a991af0fb8d887
Author: Patrick Mooney <pmooney@pfmooney.com>
Date:   2020-05-19T16:05:00.000Z

    12738 Panic in vlapic_callout_handler
    Reviewed by: Hans Rosenfeld <hans.rosenfeld@joyent.com>
    Reviewed by: Jason King <jason.king@joyent.com>
    Reviewed by: Mike Zeller <mike.zeller@joyent.com>
    Approved by: Dan McDonald <danmcd@joyent.com>

Also available in: Atom PDF