Bug #13827
openmntfs, doors assuming final VOP_HOLD always belongs to a file_t, leaks ensue
Description
Currently, mntclose() in mntfs does this:
static int
mntclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
        mntnode_t *mnp = VTOM(vp);

        /* Clean up any locks or shares held by the current process */
        cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
        cleanshares(vp, ttoproc(curthread)->p_pid);

        if (count > 1)
                return (0);
        if (vp->v_count == 1) {
                rw_enter(&mnp->mnt_contents, RW_WRITER);
                mntfs_freesnap(mnp, &mnp->mnt_read);
                mntfs_freesnap(mnp, &mnp->mnt_ioctl);
                rw_exit(&mnp->mnt_contents);
                atomic_dec_32(&MTOD(mnp)->mnt_nopen);
        }
        return (0);
}
This vp->v_count == 1 condition is trying to detect the final vop_close of a vnode (mntfs allocates a vnode per open of the /etc/mnttab file). As written, however, this logic assumes that the last extant VOP_HOLD released on a given vnode will always be the one belonging to the file_t, released due to a close().
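For background, the reason this assumption usually holds is the shape of the generic close path: the file_t's hold on the vnode is only dropped after VOP_CLOSE returns. Very roughly -- this is a schematic sketch of the closef() flow, not a verbatim quote of it:

        /*
         * Schematic only: the real closef() does more bookkeeping, but the
         * ordering that matters here is VOP_CLOSE() before VN_RELE().
         */
        vp = fp->f_vnode;
        error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL);
        VN_RELE(vp);    /* drops the file_t's hold on the vnode */

So, when nobody else has taken a hold of their own, the file_t's hold is still counted at VOP_CLOSE time, and v_count == 1 really does mean "this is the last close".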
This assumption, however, turns out not to be a safe one.
In several places in the LX brand code we do roughly this:
        if ((fp = getf(fd)) == NULL) {
                return (set_errno(EBADF));
        }
        vp = fp->f_vnode;
        VN_HOLD(vp);
        releasef(fd);

        /* some other work... */

        VN_RELE(vp);
Anywhere logic like this appears, it can race against close() and lead to the final call to vop_close happening with count = 1 (since it is the last reference to the file_t which is closing) but v_count > 1 (since the extra VN_HOLD is still outstanding until the other, racing syscall returns). This violates the assumption that mntclose() has made, and in this case leads to leaks -- both of the mntfs snapshot structs and of counts in the mnt_nopen atomic counter.
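Concretely, the losing interleaving looks something like this (schematic; thread A is the code taking the transient hold, e.g. an LX syscall, and thread B is performing the last close() of the fd):

  thread A (transient VN_HOLD)            thread B (last close())
  ----------------------------            -----------------------
  fp = getf(fd)
  vp = fp->f_vnode
  VN_HOLD(vp)       /* v_count: 1 -> 2 */
  releasef(fd)
                                          closef()    /* last file_t reference */
                                            VOP_CLOSE(..., count = 1, ...)
                                              mntclose(): count == 1 but
                                              v_count == 2, so the snapshots
                                              and mnt_nopen are left alone
                                            VN_RELE(vp) /* v_count: 2 -> 1 */
  /* ... other work ... */
  VN_RELE(vp)       /* last hold released; the cleanup
                       mntclose() skipped never happens */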
This then leads to mntfs refusing to unmount and blocking zone shutdown in the down state (which is how we originally encountered this bug).
Lest you, dear reader, think that perhaps this is simply an LX bug, logic in this vein also appears elsewhere in the kernel: in Roger's proc (see prgetattr handling PR_FDINFO), and I have unconfirmed reports of it elsewhere (the logic can be pretty annoying to untangle, so it's sometimes hard to be 100% sure whether the race is possible or not). The proc one has been there since time immemorial.
As well as mntfs, there is code in other virtual filesystems that do vnode-per-open which makes this same assumption -- you can see an example in door_close -- and possibly also some real filesystems that could be affected even by, e.g., an fstat, since they persist their vnodes. I definitely believe the doorfs code to be incorrect in the same way as mntfs, and I strongly suspect a number of the other places are incorrect as well.
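For what it's worth, one way a vnode-per-open filesystem can avoid leaning on v_count in its VOP_CLOSE routine at all is to defer the per-open teardown to its VOP_INACTIVE routine, which only runs once the last hold really has been released, regardless of who held it. This is only a sketch to illustrate the shape of that approach -- the myfs_*/mynode_t names are made up, and this is not the actual fix committed for this bug:

/*
 * Hypothetical sketch; the myfs_* and mynode_t names are illustrative, not
 * anything in illumos.  Because each open gets its own vnode, VOP_INACTIVE
 * fires exactly once per open, after the final VN_RELE, whether the final
 * hold belonged to the file_t or to some other kernel code doing a
 * transient VN_HOLD/VN_RELE.
 */
static void
myfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
        mynode_t *np = vp->v_data;

        /* The per-open cleanup that mntclose()-style code tries to do early. */
        myfs_free_percall_state(np);
        atomic_dec_32(&np->myn_fs->myfs_nopen);

        vn_free(vp);
        kmem_free(np, sizeof (*np));
}

The trade-off is that the per-open state then lives until the last hold goes away rather than until the last close(), but a transient hold can no longer cause it to leak.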
Tricks to help reproduce are in the comments.
Updated by Alex Wilson about 1 year ago
Reproducer (by racing close() against lx_fchmod / lx_fchown):
test.c:
#include <stdlib.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <signal.h>
#include <sys/random.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/file.h>
#include <pthread.h>

enum fdblock_state {
        FDB_NONCLOSE_READY = (1<<0),
        FDB_CLOSE_READY = (1<<1),
        FDB_NONCLOSE_RUNNING = (1<<2),
        FDB_CLOSE_RUNNING = (1<<3)
};

/* A batch of fds plus the state machine coordinating the two worker threads. */
struct fdblock {
        pthread_mutex_t fb_mtx;
        unsigned int fb_state;
        pthread_cond_t fb_statechange;
        size_t fb_len;
        int *fb_fds;
};

/* Hammers every fd in the block with fchmod/fchown (the transient-hold side). */
void *
fdblock_nonclose_thread(void *arg)
{
        struct fdblock *fb = (struct fdblock *)arg;
        unsigned int i;

        pthread_mutex_lock(&fb->fb_mtx);
        while (1) {
                while (!(fb->fb_state & FDB_NONCLOSE_READY))
                        pthread_cond_wait(&fb->fb_statechange, &fb->fb_mtx);
                fb->fb_state &= ~FDB_NONCLOSE_READY;
                fb->fb_state |= FDB_NONCLOSE_RUNNING;
                pthread_cond_broadcast(&fb->fb_statechange);
                pthread_mutex_unlock(&fb->fb_mtx);

                for (i = 0; i < fb->fb_len; ++i) {
                        int fd = fb->fb_fds[i];
                        fchmod(fd, 0660);
                        fchown(fd, 0, 0);
                }

                pthread_mutex_lock(&fb->fb_mtx);
                fb->fb_state &= ~FDB_NONCLOSE_RUNNING;
                pthread_cond_broadcast(&fb->fb_statechange);
        }
}

/* Closes every fd in the block (the racing close() side). */
void *
fdblock_close_thread(void *arg)
{
        struct fdblock *fb = (struct fdblock *)arg;
        unsigned int i;

        pthread_mutex_lock(&fb->fb_mtx);
        while (1) {
                while (!(fb->fb_state & FDB_CLOSE_READY))
                        pthread_cond_wait(&fb->fb_statechange, &fb->fb_mtx);
                fb->fb_state &= ~FDB_CLOSE_READY;
                fb->fb_state |= FDB_CLOSE_RUNNING;
                pthread_cond_broadcast(&fb->fb_statechange);
                pthread_mutex_unlock(&fb->fb_mtx);

                for (i = 0; i < fb->fb_len; ++i) {
                        int fd = fb->fb_fds[i];
                        close(fd);
                }

                pthread_mutex_lock(&fb->fb_mtx);
                fb->fb_state &= ~FDB_CLOSE_RUNNING;
                pthread_cond_broadcast(&fb->fb_statechange);
        }
}

int
main(int argc, char *argv[])
{
        struct fdblock *fb;
        unsigned int i;
        char buf[32];
        pthread_t noncloseth, closeth;

        fb = calloc(sizeof (*fb), 1);
        pthread_mutex_init(&fb->fb_mtx, NULL);
        pthread_cond_init(&fb->fb_statechange, NULL);
        fb->fb_len = 10000;
        fb->fb_fds = calloc(sizeof (int), 10000);

        pthread_mutex_lock(&fb->fb_mtx);
        pthread_create(&noncloseth, NULL, fdblock_nonclose_thread, fb);
        pthread_create(&closeth, NULL, fdblock_close_thread, fb);

        while (1) {
                /* Open (and read) a fresh batch of /etc/mnttab fds. */
                for (i = 0; i < fb->fb_len; ++i) {
                        fb->fb_fds[i] = open("/etc/mnttab", O_RDONLY);
                        read(fb->fb_fds[i], buf, sizeof (buf));
                }

                /* Release the fchmod/fchown thread, then the close thread. */
                fb->fb_state = FDB_NONCLOSE_READY;
                pthread_cond_broadcast(&fb->fb_statechange);
                pthread_mutex_unlock(&fb->fb_mtx);

                pthread_mutex_lock(&fb->fb_mtx);
                fb->fb_state |= FDB_CLOSE_READY;
                pthread_cond_broadcast(&fb->fb_statechange);
                pthread_mutex_unlock(&fb->fb_mtx);

                /* Wait for both workers to start and then finish this batch. */
                pthread_mutex_lock(&fb->fb_mtx);
                while (!(fb->fb_state & (FDB_CLOSE_RUNNING | FDB_NONCLOSE_RUNNING)))
                        pthread_cond_wait(&fb->fb_statechange, &fb->fb_mtx);
                while (fb->fb_state & (FDB_CLOSE_RUNNING | FDB_NONCLOSE_RUNNING))
                        pthread_cond_wait(&fb->fb_statechange, &fb->fb_mtx);
        }

        return (0);
}
Compile on lx with gcc -o test test.c -pthread, then run dtrace:
dtrace -w \
    -n 'fbt::mntinactive:entry { self->mnp = (mntnode_t*)args[0]->v_data; }' \
    -n 'fbt::mntinactive:entry /self->mnp->mnt_read.mnts_nmnts != 0 || self->mnp->mnt_ioctl.mnts_nmnts != 0/ { printf("zone=%s exec=%s pid=%d", zonename, execname, pid); }' \
    -n 'fbt::lx_vn_chmod:entry { chill(1000); }'
and start ./test. Once you have output, you have leaks. You can repro this without the chill() as well, but it can take quite a while.