Bug #3934
project reference count leak in tasksys_settaskid()
Description
We have had quite a few dumps dying on seemingly random data corruption in the VM subsystem – always due to a mysterious 0x31 where there should be a 0. Here is a typical stack trace:
ffffff005e9fea40 swap_anon+0x7c()
ffffff005e9feaa0 swap_getphysname+0xc6()
ffffff005e9feb30 swap_dispose+0x5a()
ffffff005e9febc0 fop_dispose+0xde()
ffffff005e9fec00 anon_decref+0x14a()
ffffff005e9fec60 anon_free+0x86()
ffffff005e9feca0 segvn_free+0x121()
ffffff005e9fecc0 seg_free+0x34()
ffffff005e9fed60 segvn_unmap+0xcce()
ffffff005e9fedc0 as_free+0x117()
ffffff005e9fedf0 relvm+0xdd()
ffffff005e9fee80 proc_exit+0x4b4()
ffffff005e9feea0 exit+0x15()
ffffff005e9feec0 rexit+0x1c()
ffffff005e9fef10 _sys_sysenter_post_swapgs+0x149()
Fortunately, several recently have had a much more explicit failure mode:
ffffff005c0c5a30 cap_project_usage_walker+0x3d()
ffffff005c0c5a70 cap_walk+0x3f()
ffffff005c0c5a80 caps_update+0x23()
ffffff005c0c5af0 clock+0x3ab()
ffffff005c0c5ba0 cyclic_softint+0xdc()
ffffff005c0c5bb0 cbe_softclock+0x1a()
ffffff005c0c5bf0 av_dispatch_softvect+0x5f()
ffffff005c0c5c20 dispatch_softint+0x34()
ffffff005c005a30 switch_sp_and_call+0x13()
ffffff005c005a60 dosoftint+0x59()
ffffff005c005ab0 do_interrupt+0x114()
ffffff005c005ac0 _interrupt+0xba()
ffffff005c005bb0 i86_mwait+0xd()
ffffff005c005c00 cpu_idle_mwait+0xf1()
ffffff005c005c20 idle+0x114()
ffffff005c005c30 thread_start+8()
In this case, we're dying because we're dereferencing the cap_zone from the cpucap_t and then looking at its zone_cpucap member, but the cap_zone is garbage:
> ffffff0ea0e54298::print -t cpucap_t
cpucap_t {
    list_node_t cap_link = {
        struct list_node *list_next = 0xffffffff45ca1148
        struct list_node *list_prev = 0xffffff1000bf1820
    }
    struct kproject *cap_project = 0xffffffffb019fdc8
    struct zone *cap_zone = 0xffffff0d4bc569c0
    waitq_t cap_waitq = {
        disp_lock_t wq_lock = 0
        kthread_t *wq_first = 0
        int wq_count = 0
        boolean_t wq_blocked = 0 (0)
    }
    kstat_t *cap_kstat = 0xffffffff682f1660
    int64_t cap_gen = 0
    hrtime_t cap_chk_value = 0x3b9aca00
    hrtime_t cap_value = 0x3b9aca00
    hrtime_t cap_usage = 0x31
    hrtime_t cap_base = 0
    u_longlong_t cap_burst_limit = 0
    u_longlong_t cap_bursting = 0
    disp_lock_t cap_usagelock = 0
    hrtime_t cap_maxusage = 0x41b6c44e
    u_longlong_t cap_below = 0xbdeae2f
    u_longlong_t cap_above = 0x206e
    u_longlong_t cap_above_base = 0
}
> 0xffffff0d4bc569c0::print zone_t zone_nodename zone_cpucap
zone_nodename = 0x100000004
zone_cpucap = 0x808b4f8fee8910d
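For what it's worth, the pointer chase that the clock handler performs can be condensed into a single command using ::print's member-chain syntax (this was not part of the original analysis; the address is the cpucap_t above):

> ffffff0ea0e54298::print -t cpucap_t cap_zone->zone_cpucap

This follows cap_zone and then zone_cpucap in one step and yields the same garbage value shown above.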
Interestingly, the cap_project has the same value for its zone:
> 0xffffffffb019fdc8::print -t struct kproject
struct kproject {
    projid_t kpj_id = 0x453
    zoneid_t kpj_zoneid = 0x7
    struct zone *kpj_zone = 0xffffff0d4bc569c0
    uint_t kpj_count = 0x7
    uint32_t kpj_shares = 0xa
    rctl_set_t *kpj_rctls = 0xffffff0d7f1f8470
    struct kproject *kpj_prev = 0xffffffff287c5498
    struct kproject *kpj_next = 0xffffff0f6e628190
    kproject_data_t kpj_data = {
        rctl_qty_t kpd_shmmax = 0
        ipc_rqty_t kpd_ipc = {
            rctl_qty_t ipcq_shmmni = 0
            rctl_qty_t ipcq_semmni = 0
            rctl_qty_t ipcq_msgmni = 0
        }
        rctl_qty_t kpd_locked_mem = 0
        rctl_qty_t kpd_locked_mem_ctl = 0xffffffffffffffff
        rctl_qty_t kpd_contract = 0
        kmutex_t kpd_crypto_lock = {
            void *[1] _opaque = [ 0 ]
        }
        rctl_qty_t kpd_crypto_mem = 0
        rctl_qty_t kpd_crypto_mem_ctl = 0x2ffb85c00
        kstat_t *kpd_lockedmem_kstat = 0xffffffff682f1770
        kstat_t *kpd_nprocs_kstat = 0xffffff0e7b064440
    }
    kmutex_t kpj_poolbind = {
        void *[1] _opaque = [ 0 ]
    }
    rctl_qty_t kpj_nlwps = 0
    rctl_qty_t kpj_nlwps_ctl = 0x19
    rctl_qty_t kpj_ntasks = 0
    rctl_qty_t kpj_ntasks_ctl = 0x7fffffff
    struct cpucap *kpj_cpucap = 0xffffff0ea0e54298
    struct klpd_reg *kpj_klpd = 0
    rctl_qty_t kpj_nprocs = 0
    rctl_qty_t kpj_nprocs_ctl = 0x7fffffff
}
The project seems otherwise intact. Now, the project does not take a hold on its zone, but it shouldn't really need to: as long as a task exists in the zone, the zone cannot be destroyed. However, note in the above that there are no tasks, no processes and no LWPs associated with this project, yet kpj_count is 7. This points to a kpj_count (project_hold()/project_rele()) leak as a possible culprit: if a project hold is leaked and the task associated with it exits, the underlying zone could be destroyed with a cpucap_t still pointing to it.
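The inconsistency is easier to see if we pull just the relevant counters out of the kproject dump above (a condensed view of the same data, nothing new):

> 0xffffffffb019fdc8::print -t struct kproject kpj_count kpj_ntasks kpj_nprocs kpj_nlwps
uint_t kpj_count = 0x7
rctl_qty_t kpj_ntasks = 0
rctl_qty_t kpj_nprocs = 0
rctl_qty_t kpj_nlwps = 0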
A little digging reveals:
/*
 * taskid_t tasksys_settaskid(projid_t projid, uint_t flags);
 *
 * Overview
 *   Place the calling process in a new task if sufficiently privileged.  If the
 *   present task is finalized, the process may not create a new task.
 *
 * Return values
 *   0 on success, errno on failure.
 */
static long
tasksys_settaskid(projid_t projid, uint_t flags)
{
	proc_t *p = ttoproc(curthread);
	kproject_t *oldpj;
	kproject_t *kpj;
	task_t *tk, *oldtk;
	rctl_entity_p_t e;
	zone_t *zone;
	int rctlfail = 0;

	...

	kpj = project_hold_by_id(projid, p->p_zone, PROJECT_HOLD_INSERT);
	e.rcep_p.proj = kpj;
	e.rcep_t = RCENTITY_PROJECT;

	...

	if (kpj->kpj_nlwps + p->p_lwpcnt > kpj->kpj_nlwps_ctl)
		if (rctl_test_entity(rc_project_nlwps, kpj->kpj_rctls, p, &e,
		    p->p_lwpcnt, 0) & RCT_DENY)
			rctlfail = 1;

	if (kpj->kpj_ntasks + 1 > kpj->kpj_ntasks_ctl)
		if (rctl_test_entity(rc_project_ntasks, kpj->kpj_rctls, p, &e,
		    1, 0) & RCT_DENY)
			rctlfail = 1;

	if (kpj != proj0p && kpj->kpj_nprocs + 1 > kpj->kpj_nprocs_ctl)
		if (rctl_test_entity(rc_project_nprocs, kpj->kpj_rctls, p, &e,
		    1, 0) & RCT_DENY)
			rctlfail = 1;

	if (kpj->kpj_data.kpd_locked_mem + p->p_locked_mem >
	    kpj->kpj_data.kpd_locked_mem_ctl)
		if (rctl_test_entity(rc_project_locked_mem, kpj->kpj_rctls, p,
		    &e, p->p_locked_mem, 0) & RCT_DENY)
			rctlfail = 1;

	mutex_enter(&(kpj->kpj_data.kpd_crypto_lock));
	if (kpj->kpj_data.kpd_crypto_mem + p->p_crypto_mem >
	    kpj->kpj_data.kpd_crypto_mem_ctl)
		if (rctl_test_entity(rc_project_crypto_mem, kpj->kpj_rctls,
		    p, &e, p->p_crypto_mem, 0) & RCT_DENY)
			rctlfail = 1;

	if (rctlfail) {
		mutex_exit(&(kpj->kpj_data.kpd_crypto_lock));
		mutex_exit(&zone->zone_mem_lock);
		mutex_exit(&zone->zone_nlwps_lock);
		if (curthread != p->p_agenttp)
			continuelwps(p);
		mutex_exit(&p->p_lock);
		return (set_errno(EAGAIN));
	}

	...
This code is incorrect: if rctlfail is set, it returns without releasing its hold on the project. The problem can be easily demonstrated by adding this line to /etc/project in a non-global zone:
doogle:100:Mr. Doogle:*:*:project.max-tasks=(privileged,1,deny);project.cpu-cap=(privileged,50,deny)
Then, launch a new task:
$ newtask -p doogle bash
$
In another terminal, attempt this again:
$ newtask -p doogle bash
newtask: resource control limit has been reached
$
At this point, a reference to the kproject_t has been leaked; rebooting the zone will leave that kproject_t (and its cpucap_t) pointing at the stale zone_t. At that point, the machine is a time bomb: depending on how the garbage lands, it will either die in cap_project_usage_walker() (as above) or die elsewhere on memory corruption when cap_project_usage_walker() manages to write its usage (often the 0x31 seen in the dumps) to random memory.
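The leak can also be watched on a live system before any zone reboot: each failed newtask should leave the project's hold count one higher, and it never comes back down. A hypothetical mdb session (the pid is illustrative, standing in for the still-running doogle shell):

# mdb -k
> 0t12345::pid2proc | ::print proc_t p_task->tk_proj->kpj_count

Repeating the failing newtask and re-running the ::print should show kpj_count climbing even though the number of tasks, processes and LWPs in the project never changes.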
The fix is simple; the rctlfail failure path for tasksys_settaskid() needs to release its hold on the project.
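A sketch of the shape of that change in the rctlfail block quoted above (the exact placement relative to the lock drops is an assumption; see the resolving commit below for the actual diff):

	if (rctlfail) {
		mutex_exit(&(kpj->kpj_data.kpd_crypto_lock));
		mutex_exit(&zone->zone_mem_lock);
		mutex_exit(&zone->zone_nlwps_lock);
		if (curthread != p->p_agenttp)
			continuelwps(p);
		mutex_exit(&p->p_lock);
		/* Drop the hold taken by project_hold_by_id() before bailing out. */
		project_rele(kpj);
		return (set_errno(EAGAIN));
	}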
Updated by Robert Mustacchi over 7 years ago
- Status changed from New to Resolved
- % Done changed from 90 to 100
Resolved in 74bf729631d9843cabb29019f16ac648de4aaa80.