Project

General

Profile

Actions

Bug #3934

closed

project reference count leak in tasksys_settaskid()

Added by Robert Mustacchi almost 8 years ago. Updated almost 8 years ago.

Status:
Resolved
Priority:
High
Category:
kernel
Start date:
2013-07-28
Due date:
% Done:

100%

Estimated time:
Difficulty:
Medium
Tags:
Gerrit CR:

Description

We have had quite a few dumps dying on seemingly random data corruption in the VM subsystem – always due to a mysterious 0x31 where there should be a 0. Here is a typical stack trace:

ffffff005e9fea40 swap_anon+0x7c()
ffffff005e9feaa0 swap_getphysname+0xc6()
ffffff005e9feb30 swap_dispose+0x5a()
ffffff005e9febc0 fop_dispose+0xde()
ffffff005e9fec00 anon_decref+0x14a()
ffffff005e9fec60 anon_free+0x86()
ffffff005e9feca0 segvn_free+0x121()
ffffff005e9fecc0 seg_free+0x34()
ffffff005e9fed60 segvn_unmap+0xcce()
ffffff005e9fedc0 as_free+0x117()
ffffff005e9fedf0 relvm+0xdd()
ffffff005e9fee80 proc_exit+0x4b4()
ffffff005e9feea0 exit+0x15()
ffffff005e9feec0 rexit+0x1c()
ffffff005e9fef10 _sys_sysenter_post_swapgs+0x149()

Fortunately, several recently have had a much more explicit failure mode:

ffffff005c0c5a30 cap_project_usage_walker+0x3d()
ffffff005c0c5a70 cap_walk+0x3f()
ffffff005c0c5a80 caps_update+0x23()
ffffff005c0c5af0 clock+0x3ab()
ffffff005c0c5ba0 cyclic_softint+0xdc()
ffffff005c0c5bb0 cbe_softclock+0x1a()
ffffff005c0c5bf0 av_dispatch_softvect+0x5f()
ffffff005c0c5c20 dispatch_softint+0x34()
ffffff005c005a30 switch_sp_and_call+0x13()
ffffff005c005a60 dosoftint+0x59()
ffffff005c005ab0 do_interrupt+0x114()
ffffff005c005ac0 _interrupt+0xba()
ffffff005c005bb0 i86_mwait+0xd()
ffffff005c005c00 cpu_idle_mwait+0xf1()
ffffff005c005c20 idle+0x114()
ffffff005c005c30 thread_start+8()

In this case, we're dying because we're dereferencing the cap_zone from cpucap_t, and then looking at its zone_cpucap member – but the cap_zone is garbage:

> ffffff0ea0e54298::print -t cpucap_t
cpucap_t {
    list_node_t cap_link = {
        struct list_node *list_next = 0xffffffff45ca1148
        struct list_node *list_prev = 0xffffff1000bf1820
    }
    struct kproject *cap_project = 0xffffffffb019fdc8
    struct zone *cap_zone = 0xffffff0d4bc569c0
    waitq_t cap_waitq = {
        disp_lock_t wq_lock = 0
        kthread_t *wq_first = 0
        int wq_count = 0
        boolean_t wq_blocked = 0 (0)
    }
    kstat_t *cap_kstat = 0xffffffff682f1660
    int64_t cap_gen = 0
    hrtime_t cap_chk_value = 0x3b9aca00
    hrtime_t cap_value = 0x3b9aca00
    hrtime_t cap_usage = 0x31
    hrtime_t cap_base = 0
    u_longlong_t cap_burst_limit = 0
    u_longlong_t cap_bursting = 0
    disp_lock_t cap_usagelock = 0
    hrtime_t cap_maxusage = 0x41b6c44e
    u_longlong_t cap_below = 0xbdeae2f
    u_longlong_t cap_above = 0x206e
    u_longlong_t cap_above_base = 0
}
> 0xffffff0d4bc569c0::print zone_t zone_nodename zone_cpucap
zone_nodename = 0x100000004
zone_cpucap = 0x808b4f8fee8910d

Interestingly, the cap_project's has the same value for the zone:

> 0xffffffffb019fdc8::print -t struct kproject      
struct kproject {
    projid_t kpj_id = 0x453
    zoneid_t kpj_zoneid = 0x7
    struct zone *kpj_zone = 0xffffff0d4bc569c0
    uint_t kpj_count = 0x7
    uint32_t kpj_shares = 0xa
    rctl_set_t *kpj_rctls = 0xffffff0d7f1f8470
    struct kproject *kpj_prev = 0xffffffff287c5498
    struct kproject *kpj_next = 0xffffff0f6e628190
    kproject_data_t kpj_data = {
        rctl_qty_t kpd_shmmax = 0
        ipc_rqty_t kpd_ipc = {
            rctl_qty_t ipcq_shmmni = 0
            rctl_qty_t ipcq_semmni = 0
            rctl_qty_t ipcq_msgmni = 0
        }
        rctl_qty_t kpd_locked_mem = 0
        rctl_qty_t kpd_locked_mem_ctl = 0xffffffffffffffff
        rctl_qty_t kpd_contract = 0
        kmutex_t kpd_crypto_lock = {
            void *[1] _opaque = [ 0 ]
        }
        rctl_qty_t kpd_crypto_mem = 0
        rctl_qty_t kpd_crypto_mem_ctl = 0x2ffb85c00
        kstat_t *kpd_lockedmem_kstat = 0xffffffff682f1770
        kstat_t *kpd_nprocs_kstat = 0xffffff0e7b064440
    }
    kmutex_t kpj_poolbind = {
        void *[1] _opaque = [ 0 ]
    }
    rctl_qty_t kpj_nlwps = 0
    rctl_qty_t kpj_nlwps_ctl = 0x19
    rctl_qty_t kpj_ntasks = 0
    rctl_qty_t kpj_ntasks_ctl = 0x7fffffff
    struct cpucap *kpj_cpucap = 0xffffff0ea0e54298
    struct klpd_reg *kpj_klpd = 0
    rctl_qty_t kpj_nprocs = 0
    rctl_qty_t kpj_nprocs_ctl = 0x7fffffff
}

The project seems otherwise intact. Now, the project does not take a hold on its zone, but it shouldn't really need to – as long as a task exists in the zone, the zone cannot be destroyed. However, note in the above that there are no tasks, no processes and no LWPs associated with this project – btu the kpj_count is 7. This points to kpj_count (project_hold()/project_rele()) leak as a possible culprit: if a project hold count is leaked and the task associated with it exits, the underlying zone could be destroyed with a cpucap_t still pointing to it.

A little digging reveals:

/*
 * taskid_t tasksys_settaskid(projid_t projid, uint_t flags);
 *
 * Overview
 *   Place the calling process in a new task if sufficiently privileged.  If the
 *   present task is finalized, the process may not create a new task.
 *
 * Return values
 *   0 on success, errno on failure.
 */
static long
tasksys_settaskid(projid_t projid, uint_t flags)
{
        proc_t *p = ttoproc(curthread);
        kproject_t *oldpj;
        kproject_t *kpj;
        task_t *tk, *oldtk;
        rctl_entity_p_t e;
        zone_t *zone;
        int rctlfail = 0;

        ...

        kpj = project_hold_by_id(projid, p->p_zone, PROJECT_HOLD_INSERT);
        e.rcep_p.proj = kpj;
        e.rcep_t = RCENTITY_PROJECT;
        ...
        if (kpj->kpj_nlwps + p->p_lwpcnt > kpj->kpj_nlwps_ctl)
                if (rctl_test_entity(rc_project_nlwps, kpj->kpj_rctls, p, &e,
                    p->p_lwpcnt, 0) & RCT_DENY)
                        rctlfail = 1;

        if (kpj->kpj_ntasks + 1 > kpj->kpj_ntasks_ctl)
                if (rctl_test_entity(rc_project_ntasks, kpj->kpj_rctls, p, &e,
                    1, 0) & RCT_DENY)
                        rctlfail = 1;

        if (kpj != proj0p && kpj->kpj_nprocs + 1 > kpj->kpj_nprocs_ctl)
                if (rctl_test_entity(rc_project_nprocs, kpj->kpj_rctls, p, &e,
                    1, 0) & RCT_DENY)
                        rctlfail = 1;
        if (kpj->kpj_data.kpd_locked_mem + p->p_locked_mem >
            kpj->kpj_data.kpd_locked_mem_ctl)
                if (rctl_test_entity(rc_project_locked_mem, kpj->kpj_rctls, p,
                    &e, p->p_locked_mem, 0) & RCT_DENY)
                        rctlfail = 1;

        mutex_enter(&(kpj->kpj_data.kpd_crypto_lock));
        if (kpj->kpj_data.kpd_crypto_mem + p->p_crypto_mem >
            kpj->kpj_data.kpd_crypto_mem_ctl)
                if (rctl_test_entity(rc_project_crypto_mem, kpj->kpj_rctls, p,
                    &e, p->p_crypto_mem, 0) & RCT_DENY)
                        rctlfail = 1;

        if (rctlfail) {
                mutex_exit(&(kpj->kpj_data.kpd_crypto_lock));
                mutex_exit(&zone->zone_mem_lock);
                mutex_exit(&zone->zone_nlwps_lock);
                if (curthread != p->p_agenttp)
                        continuelwps(p);
                mutex_exit(&p->p_lock);
                return (set_errno(EAGAIN));
        }
        ...

This code is incorrect in that if rctlfail is set, it will exit without releasing its hold on the project. This problem can be easily demonstrated by adding this line to /etc/project in a non-global zone:

doogle:100:Mr. Doogle:*:*:project.max-tasks=(privileged,1,deny);project.cpu-cap=(privileged,50,deny)

Then, launch a new task:

$ newtask -p doogle bash
$

In another terminal, attempt this again:

$ newtask -p doogle bash
newtask: resource control limit has been reached
$

At this point, a reference to the kproject_t has been leaked; rebooting the zone will cause it to have a reference the stale zone_t. At that point, the machine is a time bomb; depending on how garbage lands, the machine will either die in cap_project_usage_walker() (as above) or elsewhere on memory corruption when cap_project_usage_walker() manages to write its usage (often 0x31 in the dumps seen) to random memory.

The fix is simple; the rctlfail failure path for tasksys_settaskid() needs to release its hold on the project.

Actions #1

Updated by Robert Mustacchi almost 8 years ago

  • Status changed from New to Resolved
  • % Done changed from 90 to 100

Resolved in 74bf729631d9843cabb29019f16ac648de4aaa80.

Actions

Also available in: Atom PDF