Project

General

Profile

Bug #6156

Spurious "Unable to set CLSET_NODELAYONERR"

Added by Dan McDonald about 5 years ago. Updated about 5 years ago.

Status:
New
Priority:
Normal
Assignee:
Category:
nfs - NFS server and client
Start date:
2015-08-22
Due date:
% Done:

0%

Estimated time:
Difficulty:
Bite-size
Tags:
needs-triage
Gerrit CR:

Description

In spite of #4463's fix, I still occasionally see errors mentioned above. I have one theory, as extra-commented in refresh_nlm_rpc():

138        ret = clnt_tli_kcreate(&hostp->nh_knc, &hostp->nh_addr,
139            NLM_PROG, rpcp->nr_vers, 0, NLM_RPC_RETRIES,
140            CRED(), &rpcp->nr_handle);
141
142        /*
143         * Set the client's CLSET_NODELAYONERR option to true. The
144         * RPC clnt_call interface creates an artificial delay for
145         * certain call errors in order to prevent RPC consumers
146         * from getting into tight retry loops. Since this function is
147         * called by the NLM service routines we would like to avoid
148         * this artificial delay when possible. We do not retry if the
149         * NULL request fails so it is safe for us to turn this option
150         * on.
151         */

              /* XXX KEBE ASKS what if ret is not zero? */

152        if (clnt_control(rpcp->nr_handle, CLSET_NODELAYONERR,
153            (char *)&clset) == FALSE) {
154            NLM_ERR("Unable to set CLSET_NODELAYONERR\n");

Reading further, we should kernel panic if ret is not zero, bcause rpcp->nr_handle would still be NULL. So that in and of itself is a problem too.

Regardless, I'm still seeing this, so either I'm incredibly lucky about clnt_control()'s side-effects, or for some other reason. I'm running dtrace to catch one. Maybe just the return-check alone is worth a fix...


Related issues

Related to illumos gate - Bug #4463: NLM fails wrongly on clnt_control().Resolved2014-01-13

Actions

History

#1

Updated by Dan McDonald about 5 years ago

I ran this dtrace script:

#!/usr/sbin/dtrace -Cs

refresh_nlm_rpc:entry
/((nlm_rpc_t *)arg1)->nr_handle == NULL/
{
        self->trace = 1;
        self->hostp = arg0;
        self->handle = arg1;
}

clnt_tli_kcreate:return
/self->trace == 1/
{
        self->kcreate_ret = arg1;
}

cmn_err:entry
/self->trace == 1/
{
        printf("kcreate returned %d and CLSET_NODELAYONERR returned false\n",
           self->kcreate_ret);
        printf("args of refresh_nlm_rpc() follow:\n");
        print(*((struct nlm_host *)self->hostp));
        print(*((nlm_rpc_t *)self->handle));
        stack();
}

refresh_nlm_rpc:return
/self->trace == 1/
{
        self->trace = 0;
}

And got a piece of output:

kcreate returned 0 and CLSET_NODELAYONERR returned false
args of refresh_nlm_rpc() follow:
struct nlm_host {
    kmutex_t nh_lock = {
        void *[1] _opaque = [ 0 ]
    }
    volatile uint_t nh_refs = 0x1
    struct nh_link = {
        struct nlm_host *tqe_next = 0
        struct nlm_host **tqe_prev = 0
    }
    avl_node_t nh_by_addr = {
        struct avl_node *[2] avl_child = [ 0, 0 ]
        uintptr_t avl_pcb = 0x1
    }
    char *nh_name = 0xffffff09144031c8
    char *nh_netid = 0xffffff09b124bae8
    struct knetconfig nh_knc = {
        unsigned int knc_semantics = 0x1
        caddr_t knc_protofmly = 0xfffffffff843274c
        caddr_t knc_proto = 0xfffffffff8432751
        dev_t knc_rdev = 0x10000000000
        unsigned int [8] knc_unused = [ 0, 0, 0, 0, 0, 0, 0, 0 ]
    }
    struct netbuf nh_addr = {
        unsigned int maxlen = 0x10
        unsigned int len = 0x10
        char *buf = 0xffffff0ba154c3b0
    }
    sysid_t nh_sysid = 0x9
    int32_t nh_state = 0
    clock_t nh_idle_timeout = 0
    uint8_t nh_flags = 0
    kcondvar_t nh_recl_cv = {
        ushort_t _opaque = 0
    }
    kcondvar_t nh_rpcb_cv = {
        ushort_t _opaque = 0
    }
    enum clnt_stat nh_rpcb_ustat = RPC_SUCCESS
    enum nlm_rpcb_state nh_rpcb_state = NRPCB_UPDATED
    struct nlm_rpch_list nh_rpchc = {
        struct nlm_rpc *tqh_first = 0
        struct nlm_rpc **tqh_last = 0xffffff090fec6930
    }
    mod_hash_t *nh_vholds_by_vp = 0xffffff09ee8b6500
    struct nlm_vhold_list nh_vholds_list = {
        struct nlm_vhold *tqh_first = 0
        struct nlm_vhold **tqh_last = 0xffffff090fec6948
    }
    struct nlm_shres *nh_shrlist = 0
    kthread_t *nh_reclaimer = 0
}nlm_rpc_t {
    CLIENT *nr_handle = 0xffffff0b308dd700
    rpcvers_t nr_vers = 0x4
    struct nr_link = {
        struct nlm_rpc *tqe_next = 0
        struct nlm_rpc **tqe_prev = 0
    }
}
              klmmod`refresh_nlm_rpc+0xf8
              klmmod`nlm_host_get_rpc+0xc6
              klmmod`nlm_do_unlock+0xe1
              klmmod`nlm4_unlock_msg_4_svc+0x4a
              klmmod`nlm_dispatch+0xe6
              klmmod`nlm_prog_4+0x34
              rpcmod`svc_getreq+0x1c1
              rpcmod`svc_run+0xe0
              rpcmod`svc_do_run+0x8e
              nfs`nfssys+0xf1
              unix`_sys_sysenter_post_swapgs+0x149

An NLM expert may be able to tell what's going on immediately, but I will have to get this script to be more verbose. Clues welcome, but I think I can manage with a bit of time.

#2

Updated by Marcel Telka about 5 years ago

  • Related to Bug #4463: NLM fails wrongly on clnt_control(). added
#3

Updated by Dan McDonald about 5 years ago

I've updated the DTrace script, and see more useful (hostname redacted) output:

#!/usr/sbin/dtrace -Cs

refresh_nlm_rpc:entry
/((nlm_rpc_t *)arg1)->nr_handle == NULL/
{
        self->trace = 1;
        self->hostp = arg0;
        self->handle = arg1;
}

clnt_tli_kcreate:return
/self->trace == 1/
{
        self->kcreate_ret = arg1;
}

cmn_err:entry
/self->trace == 1/
{
        printf("\n\nkcreate returned %d and CLSET_NODELAYONERR returned false\n" 
, self->kcreate_ret);
        printf("args of refresh_nlm_rpc() follow:\n");
        print(*((struct nlm_host *)self->hostp));
        printf("\n");
        print(*((nlm_rpc_t *)self->handle));
        printf("\n");
        print(*((CLIENT *)((nlm_rpc_t *)self->handle)->nr_handle));
        printf("\n");
        print(*((struct client_ops *)
            ((CLIENT *)((nlm_rpc_t *)self->handle)->nr_handle)->cl_ops));
        printf("\n");
        printf("Host name %s ; netid %s\n",
            stringof(((struct nlm_host *)self->hostp)->nh_name),
            stringof(((struct nlm_host *)self->hostp)->nh_netid));
        printf("\n");
        printf("\n");
        stack();
}

refresh_nlm_rpc:return
/self->trace == 1/
{
        self->trace = 0;
}

dtrace: script '/export/home/danmcd/clset-nodelayonerr.d' matched 4 probes
CPU     ID                    FUNCTION:NAME
  0  25166                    cmn_err:entry 

kcreate returned 0 and CLSET_NODELAYONERR returned false
args of refresh_nlm_rpc() follow:
struct nlm_host {
    kmutex_t nh_lock = {
        void *[1] _opaque = [ 0 ]
    }
    volatile uint_t nh_refs = 0x1
    struct nh_link = {
        struct nlm_host *tqe_next = 0
        struct nlm_host **tqe_prev = 0
    }
    avl_node_t nh_by_addr = {
        struct avl_node *[2] avl_child = [ 0, 0 ]
        uintptr_t avl_pcb = 0x1
    }
    char *nh_name = 0xffffff0ba154c8c0
    char *nh_netid = 0xffffff0ebf43ee30
    struct knetconfig nh_knc = {
        unsigned int knc_semantics = 0x1
        caddr_t knc_protofmly = 0xfffffffff843274c
        caddr_t knc_proto = 0xfffffffff8432751
        dev_t knc_rdev = 0x10000000000
        unsigned int [8] knc_unused = [ 0, 0, 0, 0, 0, 0, 0, 0 ]
    }
    struct netbuf nh_addr = {
        unsigned int maxlen = 0x10
        unsigned int len = 0x10
        char *buf = 0xffffff0913f0a1c0
    }
    sysid_t nh_sysid = 0xb
    int32_t nh_state = 0
    clock_t nh_idle_timeout = 0
    uint8_t nh_flags = 0
    kcondvar_t nh_recl_cv = {
        ushort_t _opaque = 0
    }
    kcondvar_t nh_rpcb_cv = {
        ushort_t _opaque = 0
    }
    enum clnt_stat nh_rpcb_ustat = RPC_SUCCESS
    enum nlm_rpcb_state nh_rpcb_state = NRPCB_UPDATED
    struct nlm_rpch_list nh_rpchc = {
        struct nlm_rpc *tqh_first = 0
        struct nlm_rpc **tqh_last = 0xffffff090fec6750
    }
    mod_hash_t *nh_vholds_by_vp = 0xffffff09f0236180
    struct nlm_vhold_list nh_vholds_list = {
        struct nlm_vhold *tqh_first = 0
        struct nlm_vhold **tqh_last = 0xffffff090fec6768
    }
    struct nlm_shres *nh_shrlist = 0
    kthread_t *nh_reclaimer = 0
}
nlm_rpc_t {
    CLIENT *nr_handle = 0xffffff091f016680
    rpcvers_t nr_vers = 0x4
    struct nr_link = {
        struct nlm_rpc *tqe_next = 0
        struct nlm_rpc **tqe_prev = 0
    }
}
CLIENT {
    AUTH *cl_auth = 0xffffff0915864a70
    struct clnt_ops *cl_ops = 0xfffffffffbd301e0
    caddr_t cl_private = 0xffffff091f016680
    bool_t cl_nosignal = 0
}
struct client_ops <forward decl>

Host name <hostname-no-domain> ; netid udp

              klmmod`refresh_nlm_rpc+0xf8
              klmmod`nlm_host_get_rpc+0xc6
              klmmod`nlm_do_unlock+0xe1
              klmmod`nlm4_unlock_msg_4_svc+0x4a
              klmmod`nlm_dispatch+0xe6
              klmmod`nlm_prog_4+0x34
              rpcmod`svc_getreq+0x1c1
              rpcmod`svc_run+0xe0
              rpcmod`svc_do_run+0x8e
              nfs`nfssys+0xf1
              unix`_sys_sysenter_post_swapgs+0x149

It looks like we're doing CLSET_NODELAYONERR with a UDP session. Should we be so chatty in that case?

Also available in: Atom PDF