Bug #6156
openSpurious "Unable to set CLSET_NODELAYONERR"
0%
Description
In spite of #4463's fix, I still occasionally see errors mentioned above. I have one theory, as extra-commented in refresh_nlm_rpc():
138 ret = clnt_tli_kcreate(&hostp->nh_knc, &hostp->nh_addr, 139 NLM_PROG, rpcp->nr_vers, 0, NLM_RPC_RETRIES, 140 CRED(), &rpcp->nr_handle); 141 142 /* 143 * Set the client's CLSET_NODELAYONERR option to true. The 144 * RPC clnt_call interface creates an artificial delay for 145 * certain call errors in order to prevent RPC consumers 146 * from getting into tight retry loops. Since this function is 147 * called by the NLM service routines we would like to avoid 148 * this artificial delay when possible. We do not retry if the 149 * NULL request fails so it is safe for us to turn this option 150 * on. 151 */ /* XXX KEBE ASKS what if ret is not zero? */ 152 if (clnt_control(rpcp->nr_handle, CLSET_NODELAYONERR, 153 (char *)&clset) == FALSE) { 154 NLM_ERR("Unable to set CLSET_NODELAYONERR\n");
Reading further, we should kernel panic if ret is not zero, bcause rpcp->nr_handle would still be NULL. So that in and of itself is a problem too.
Regardless, I'm still seeing this, so either I'm incredibly lucky about clnt_control()'s side-effects, or for some other reason. I'm running dtrace to catch one. Maybe just the return-check alone is worth a fix...
Related issues
Updated by Dan McDonald almost 7 years ago
I ran this dtrace script:
#!/usr/sbin/dtrace -Cs refresh_nlm_rpc:entry /((nlm_rpc_t *)arg1)->nr_handle == NULL/ { self->trace = 1; self->hostp = arg0; self->handle = arg1; } clnt_tli_kcreate:return /self->trace == 1/ { self->kcreate_ret = arg1; } cmn_err:entry /self->trace == 1/ { printf("kcreate returned %d and CLSET_NODELAYONERR returned false\n", self->kcreate_ret); printf("args of refresh_nlm_rpc() follow:\n"); print(*((struct nlm_host *)self->hostp)); print(*((nlm_rpc_t *)self->handle)); stack(); } refresh_nlm_rpc:return /self->trace == 1/ { self->trace = 0; }
And got a piece of output:
kcreate returned 0 and CLSET_NODELAYONERR returned false args of refresh_nlm_rpc() follow: struct nlm_host { kmutex_t nh_lock = { void *[1] _opaque = [ 0 ] } volatile uint_t nh_refs = 0x1 struct nh_link = { struct nlm_host *tqe_next = 0 struct nlm_host **tqe_prev = 0 } avl_node_t nh_by_addr = { struct avl_node *[2] avl_child = [ 0, 0 ] uintptr_t avl_pcb = 0x1 } char *nh_name = 0xffffff09144031c8 char *nh_netid = 0xffffff09b124bae8 struct knetconfig nh_knc = { unsigned int knc_semantics = 0x1 caddr_t knc_protofmly = 0xfffffffff843274c caddr_t knc_proto = 0xfffffffff8432751 dev_t knc_rdev = 0x10000000000 unsigned int [8] knc_unused = [ 0, 0, 0, 0, 0, 0, 0, 0 ] } struct netbuf nh_addr = { unsigned int maxlen = 0x10 unsigned int len = 0x10 char *buf = 0xffffff0ba154c3b0 } sysid_t nh_sysid = 0x9 int32_t nh_state = 0 clock_t nh_idle_timeout = 0 uint8_t nh_flags = 0 kcondvar_t nh_recl_cv = { ushort_t _opaque = 0 } kcondvar_t nh_rpcb_cv = { ushort_t _opaque = 0 } enum clnt_stat nh_rpcb_ustat = RPC_SUCCESS enum nlm_rpcb_state nh_rpcb_state = NRPCB_UPDATED struct nlm_rpch_list nh_rpchc = { struct nlm_rpc *tqh_first = 0 struct nlm_rpc **tqh_last = 0xffffff090fec6930 } mod_hash_t *nh_vholds_by_vp = 0xffffff09ee8b6500 struct nlm_vhold_list nh_vholds_list = { struct nlm_vhold *tqh_first = 0 struct nlm_vhold **tqh_last = 0xffffff090fec6948 } struct nlm_shres *nh_shrlist = 0 kthread_t *nh_reclaimer = 0 }nlm_rpc_t { CLIENT *nr_handle = 0xffffff0b308dd700 rpcvers_t nr_vers = 0x4 struct nr_link = { struct nlm_rpc *tqe_next = 0 struct nlm_rpc **tqe_prev = 0 } } klmmod`refresh_nlm_rpc+0xf8 klmmod`nlm_host_get_rpc+0xc6 klmmod`nlm_do_unlock+0xe1 klmmod`nlm4_unlock_msg_4_svc+0x4a klmmod`nlm_dispatch+0xe6 klmmod`nlm_prog_4+0x34 rpcmod`svc_getreq+0x1c1 rpcmod`svc_run+0xe0 rpcmod`svc_do_run+0x8e nfs`nfssys+0xf1 unix`_sys_sysenter_post_swapgs+0x149
An NLM expert may be able to tell what's going on immediately, but I will have to get this script to be more verbose. Clues welcome, but I think I can manage with a bit of time.
Updated by Marcel Telka almost 7 years ago
- Related to Bug #4463: NLM fails wrongly on clnt_control(). added
Updated by Dan McDonald almost 7 years ago
I've updated the DTrace script, and see more useful (hostname redacted) output:
#!/usr/sbin/dtrace -Cs refresh_nlm_rpc:entry /((nlm_rpc_t *)arg1)->nr_handle == NULL/ { self->trace = 1; self->hostp = arg0; self->handle = arg1; } clnt_tli_kcreate:return /self->trace == 1/ { self->kcreate_ret = arg1; } cmn_err:entry /self->trace == 1/ { printf("\n\nkcreate returned %d and CLSET_NODELAYONERR returned false\n" , self->kcreate_ret); printf("args of refresh_nlm_rpc() follow:\n"); print(*((struct nlm_host *)self->hostp)); printf("\n"); print(*((nlm_rpc_t *)self->handle)); printf("\n"); print(*((CLIENT *)((nlm_rpc_t *)self->handle)->nr_handle)); printf("\n"); print(*((struct client_ops *) ((CLIENT *)((nlm_rpc_t *)self->handle)->nr_handle)->cl_ops)); printf("\n"); printf("Host name %s ; netid %s\n", stringof(((struct nlm_host *)self->hostp)->nh_name), stringof(((struct nlm_host *)self->hostp)->nh_netid)); printf("\n"); printf("\n"); stack(); } refresh_nlm_rpc:return /self->trace == 1/ { self->trace = 0; }
dtrace: script '/export/home/danmcd/clset-nodelayonerr.d' matched 4 probes CPU ID FUNCTION:NAME 0 25166 cmn_err:entry kcreate returned 0 and CLSET_NODELAYONERR returned false args of refresh_nlm_rpc() follow: struct nlm_host { kmutex_t nh_lock = { void *[1] _opaque = [ 0 ] } volatile uint_t nh_refs = 0x1 struct nh_link = { struct nlm_host *tqe_next = 0 struct nlm_host **tqe_prev = 0 } avl_node_t nh_by_addr = { struct avl_node *[2] avl_child = [ 0, 0 ] uintptr_t avl_pcb = 0x1 } char *nh_name = 0xffffff0ba154c8c0 char *nh_netid = 0xffffff0ebf43ee30 struct knetconfig nh_knc = { unsigned int knc_semantics = 0x1 caddr_t knc_protofmly = 0xfffffffff843274c caddr_t knc_proto = 0xfffffffff8432751 dev_t knc_rdev = 0x10000000000 unsigned int [8] knc_unused = [ 0, 0, 0, 0, 0, 0, 0, 0 ] } struct netbuf nh_addr = { unsigned int maxlen = 0x10 unsigned int len = 0x10 char *buf = 0xffffff0913f0a1c0 } sysid_t nh_sysid = 0xb int32_t nh_state = 0 clock_t nh_idle_timeout = 0 uint8_t nh_flags = 0 kcondvar_t nh_recl_cv = { ushort_t _opaque = 0 } kcondvar_t nh_rpcb_cv = { ushort_t _opaque = 0 } enum clnt_stat nh_rpcb_ustat = RPC_SUCCESS enum nlm_rpcb_state nh_rpcb_state = NRPCB_UPDATED struct nlm_rpch_list nh_rpchc = { struct nlm_rpc *tqh_first = 0 struct nlm_rpc **tqh_last = 0xffffff090fec6750 } mod_hash_t *nh_vholds_by_vp = 0xffffff09f0236180 struct nlm_vhold_list nh_vholds_list = { struct nlm_vhold *tqh_first = 0 struct nlm_vhold **tqh_last = 0xffffff090fec6768 } struct nlm_shres *nh_shrlist = 0 kthread_t *nh_reclaimer = 0 } nlm_rpc_t { CLIENT *nr_handle = 0xffffff091f016680 rpcvers_t nr_vers = 0x4 struct nr_link = { struct nlm_rpc *tqe_next = 0 struct nlm_rpc **tqe_prev = 0 } } CLIENT { AUTH *cl_auth = 0xffffff0915864a70 struct clnt_ops *cl_ops = 0xfffffffffbd301e0 caddr_t cl_private = 0xffffff091f016680 bool_t cl_nosignal = 0 } struct client_ops <forward decl> Host name <hostname-no-domain> ; netid udp klmmod`refresh_nlm_rpc+0xf8 klmmod`nlm_host_get_rpc+0xc6 klmmod`nlm_do_unlock+0xe1 klmmod`nlm4_unlock_msg_4_svc+0x4a klmmod`nlm_dispatch+0xe6 klmmod`nlm_prog_4+0x34 rpcmod`svc_getreq+0x1c1 rpcmod`svc_run+0xe0 rpcmod`svc_do_run+0x8e nfs`nfssys+0xf1 unix`_sys_sysenter_post_swapgs+0x149
It looks like we're doing CLSET_NODELAYONERR with a UDP session. Should we be so chatty in that case?