Udiff clnt_rdma.c
--- /webrev/webrev/usr/src/uts/common/rpc/clnt_rdma.c-  Mon Aug 14 13:12:10 2006
+++ clnt_rdma.c Thu Aug 10 14:22:04 2006
@@ -29,10 +29,23 @@
  * Portions of this source code were derived from Berkeley
  * 4.3 BSD under license from the Regents of the University of
  * California.
  */
 
+ /* Copyright (c) 2006, The Ohio State University. All rights reserved.
+  *
+  * Portions of this source code were developed by the team members of
+  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
+  * headed by Professor Dhabaleswar K. (DK) Panda.
+  *
+  * Acknowledgements to contributions from developers:
+  *   Ranjit Noronha: noronha@cse.ohio-state.edu
+  *   Lei Chai      : chail@cse.ohio-state.edu
+  *   Weikuan Yu    : yuw@cse.ohio-state.edu
+  *
+  */
+
 #pragma ident  "@(#)clnt_rdma.c        1.10    05/07/26 SMI"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/user.h>
@@ -54,12 +67,55 @@
 #include <rpc/xdr.h>
 #include <rpc/auth.h>
 #include <rpc/clnt.h>
 #include <rpc/rpc_msg.h>
 #include <rpc/rpc_rdma.h>
+#include <nfs/nfs.h>
 
+#define CLNT_CREDIT_LOW (5)
 
+xdrproc_t x_READ3args = NULL_xdrproc_t;
+xdrproc_t x_READ3res = NULL_xdrproc_t;
+xdrproc_t x_READ3vres = NULL_xdrproc_t;	/* matched against the caller's result xdrproc */
+xdrproc_t x_READ3uiores = NULL_xdrproc_t;	/* matched against the caller's result xdrproc */
+/* Initial credit value encoded into the RPC/RDMA call header. */
+static uint32_t rdma_bufs_rqst = RDMA_BUFS_RQST;
+/* NOTE(review): presumably debug-output switches; all start at 0. */
+int rdma_wlist_verbose_debug = 0;
+int rdma_wlist_memreg_debug = 0;
+int rdma_wlist_clnt_debug = 0;
+int rdma_wlist_svc_debug = 0;
+int rdma_wlist_xdr_debug = 0;
+int rdma_wlist_pglck_debug = 0;
+int credit_control_debug = 0;
+int rdma_long_reply_debug = 0;
+int rdma_xdr_long_reply_debug = 0;
+/* Zero-initialized clist; copied over a new entry in clnt_read3args_make_wlist(). */
+struct clist empty_cl = {0};
+/* Forward declarations of the file-local helpers. */
+static void clnt_read3args_make_wlist(caddr_t, struct clist **, xdrproc_t, uint_t *);
+static int clnt_compose_rpcmsg(CLIENT *, rpcproc_t, rdma_buf_t *,
+                                      XDR *, xdrproc_t, caddr_t);
+static int  clnt_compose_rdma_header(CONN *, CLIENT *, rdma_buf_t *,
+                                    XDR **, uint_t *);
+static int clnt_setup_rlist(CONN *, XDR *, struct clist **);
+static int clnt_setup_wlist(CONN *, rpcproc_t, struct clist **,
+                                   caddr_t, xdrproc_t, XDR *);
+static int clnt_setup_long_reply(CONN *, rpcproc_t, struct clist *, 
+               XDR *, bool_t *);
+#ifdef DYNAMIC_CREDIT_CONTROL
+static void clnt_compute_credit(CONN *, uint32_t *);
+#endif
+static void clnt_check_credit(CONN *);
+static void clnt_return_credit(CONN *);
+static int clnt_decode_long_reply(CONN *, rpcproc_t, struct clist *,
+               struct clist *, XDR *, XDR **, struct clist *, 
+               struct clist *, uint_t,uint_t);
+
+static void clnt_update_credit(CONN *,uint32_t);
+static void check_dereg_wlist(CONN *, struct clist *);
+
 static enum clnt_stat clnt_rdma_kcallit(CLIENT *, rpcproc_t, xdrproc_t,
     caddr_t, xdrproc_t, caddr_t, struct timeval);
 static void    clnt_rdma_kabort(CLIENT *);
 static void    clnt_rdma_kerror(CLIENT *, struct rpc_err *);
 static bool_t  clnt_rdma_kfreeres(CLIENT *, xdrproc_t, caddr_t);
@@ -83,10 +139,12 @@
 
 /*
  * The size of the preserialized RPC header information.
  */
 #define        CKU_HDRSIZE     20
+#define CLNT_RDMA_SUCCESS 0
+#define CLNT_RDMA_FAIL -99
 
 /*
  * Per RPC RDMA endpoint details
  */
 typedef struct cku_private {
@@ -283,10 +341,270 @@
        p->cku_addr.len = raddr->len;
        bcopy(raddr->buf, p->cku_addr.buf, raddr->len);
        h->cl_ops = &rdma_clnt_ops;
 }
 
+/*
+ * Serialize the RPC call message (header, procedure number, credentials
+ * and arguments) through xdrs and record the encoded size in cku_outsz.
+ * Returns CLNT_RDMA_SUCCESS, or CLNT_RDMA_FAIL if an encoding step fails.
+ */
+static int clnt_compose_rpcmsg(CLIENT *h, rpcproc_t procnum,
+                              rdma_buf_t *rpcmsg, XDR *xdrs,
+                              xdrproc_t xdr_args, caddr_t argsp)
+{
+    cku_private_t *priv = htop(h);
+    uint32_t *procp;
+    bool_t encoded;
+
+    if (h->cl_auth->ah_cred.oa_flavor == RPCSEC_GSS) {
+            /*
+             * RPCSEC_GSS: place the procedure number right after the
+             * preserialized header, patch the xid into its first word
+             * and hand header plus stream (rewound to 0) to AUTH_WRAP.
+             */
+            procp = (uint32_t *)&priv->cku_rpchdr[CKU_HDRSIZE];
+            IXDR_PUT_U_INT32(procp, procnum);
+            (*(uint32_t *)(&priv->cku_rpchdr[0])) = priv->cku_xid;
+            XDR_SETPOS(xdrs, 0);
+            /* Serialize the procedure number and the arguments. */
+            encoded = AUTH_WRAP(h->cl_auth, (caddr_t)priv->cku_rpchdr,
+                CKU_HDRSIZE+4, xdrs, NULL, NULL) &&
+                (*xdr_args)(xdrs, argsp);
+            /*
+             * If a new buffer had to be allocated while encoding,
+             * update the addr and len.  This happens whether or not
+             * the encode succeeded.
+             */
+            if (rpcmsg->addr != xdrs->x_base) {
+                    rpcmsg->addr = xdrs->x_base;
+                    rpcmsg->len = xdr_getbufsize(xdrs);
+            }
+            if (!encoded) {
+                    cmn_err(CE_WARN,"Failed to serialize procedure number and the arguments.\n");
+                    return CLNT_RDMA_FAIL;
+            }
+            priv->cku_outsz = XDR_GETPOS(xdrs);
+            return CLNT_RDMA_SUCCESS;
+    }
+
+    /* Copy in the preserialized RPC header information. */
+    bcopy(priv->cku_rpchdr, rpcmsg->addr, CKU_HDRSIZE);
+
+    /* The transaction id is the first thing in the output buffer. */
+    /* LINTED pointer alignment */
+    (*(uint32_t *)(rpcmsg->addr)) = priv->cku_xid;
+    /* Skip the preserialized part, then serialize the dynamic fields. */
+    XDR_SETPOS(xdrs, CKU_HDRSIZE);
+    encoded = XDR_PUTINT32(xdrs, (int32_t *)&procnum) &&
+        AUTH_MARSHALL(h->cl_auth, xdrs, priv->cku_cred) &&
+        (*xdr_args)(xdrs, argsp);
+    if (!encoded) {
+            cmn_err(CE_WARN,"Failed to serialize dynamic arguments\n");
+            return CLNT_RDMA_FAIL;
+    }
+    priv->cku_outsz = XDR_GETPOS(xdrs);
+    return CLNT_RDMA_SUCCESS;
+}
+
+/*
+ * Allocate a SEND_BUFFER and XDR-encode the RPC/RDMA header fields (xid,
+ * version, requested credits, op); *xdrs is set to the stream used.
+ */
+static int clnt_compose_rdma_header(CONN *conn, CLIENT *h, rdma_buf_t *clmsg,
+                                    XDR **xdrs, uint_t *op)
+{
+        cku_private_t *priv = htop(h);
+        uint_t rdma_vers = RPCRDMA_VERS;
+        uint32_t credits = rdma_bufs_rqst;
+        XDR *hdr_xdrs = &priv->cku_outxdr;
+
+        clmsg->type = SEND_BUFFER;
+#ifdef DYNAMIC_CREDIT_CONTROL
+        clnt_compute_credit(conn, &credits);
+#endif
+        if (RDMA_BUF_ALLOC(conn, clmsg))
+                return CLNT_RDMA_FAIL;
+
+        /* The xid is stored raw as the first word; XDR resumes after it. */
+        *xdrs = hdr_xdrs;
+        xdrmem_create(hdr_xdrs, clmsg->addr, clmsg->len, XDR_ENCODE);
+        (*(uint32_t *)clmsg->addr) = priv->cku_xid;
+        XDR_SETPOS(hdr_xdrs, sizeof (uint32_t));
+        (void) xdr_u_int(hdr_xdrs, &rdma_vers);
+        (void) xdr_u_int(hdr_xdrs, &credits);
+        (void) xdr_u_int(hdr_xdrs, op);
+        return CLNT_RDMA_SUCCESS;
+}
+
+/*
+ * Register the chunk list *cl (when there is one) with clist_register and
+ * encode it into xdrs; the encode result itself is not checked.
+ */
+static int clnt_setup_rlist(CONN *conn, XDR *xdrs, struct clist **cl)
+{
+        /* A NULL list skips registration but is still passed to the encoder. */
+        if (*cl != NULL &&
+            clist_register(conn, *cl, 1) != RDMA_SUCCESS)
+                return CLNT_RDMA_FAIL;
+
+        (void) xdr_do_clist(xdrs, cl);
+        return CLNT_RDMA_SUCCESS;
+}
+
+/* Build, register and encode the write list (built only for NFSv3 READ).
+ * On failure the list is released here and *rpccall_wlist is set to NULL. */
+static int clnt_setup_wlist(CONN *conn, rpcproc_t procnum,
+       struct clist **rpccall_wlist, caddr_t resultsp,
+       xdrproc_t xdr_results, XDR *xdrs)
+{
+       struct clist *wl = NULL;        /* stays NULL when no chunks are built */
+       uint_t num_segment = 0;
+
+       if (procnum == NFSPROC3_READ)
+               clnt_read3args_make_wlist(resultsp, &wl, xdr_results, &num_segment);
+       if (wl == NULL || clist_register(conn, wl, 0) == RDMA_SUCCESS) {
+               *rpccall_wlist = wl;
+               if (xdr_encode_wlist(xdrs, wl, num_segment))
+                       return CLNT_RDMA_SUCCESS;
+               if (wl != NULL)         /* undo the registration made above */
+                       (void) clist_deregister(conn, wl, 0);
+       }
+       clist_free(wl);
+       *rpccall_wlist = NULL;
+       return CLNT_RDMA_FAIL;
+}
+
+/*
+ * For READDIR, READDIRPLUS and READLINK, obtain a LONG_REPLY_LEN buffer,
+ * describe it in lrc_clist and register it.  *exists is TRUE only when
+ * such a buffer was registered; xdrs is not used here.
+ */
+static int clnt_setup_long_reply(CONN *conn, rpcproc_t procnum,
+               struct clist *lrc_clist,
+               XDR *xdrs, bool_t *exists)
+{
+       int status;
+       caddr_t addr;
+#ifdef SERVER_REG_CACHE
+       rib_lrc_entry_t *long_reply_buf = NULL;
+#endif
+
+       *exists = FALSE;
+       lrc_clist->c_daddr = NULL;
+#ifdef RPC_RDMA_INLINE
+       if (lrc_clist->c_len < rdma_minchunk)
+               return CLNT_RDMA_SUCCESS;
+#endif
+       if (procnum != NFSPROC3_READDIR &&
+           procnum != NFSPROC3_READDIRPLUS &&
+           procnum != NFSPROC3_READLINK)
+               return CLNT_RDMA_SUCCESS;
+
+#ifdef SERVER_REG_CACHE
+       long_reply_buf = RDMA_GET_SERVER_CACHE_BUF(conn, LONG_REPLY_LEN);
+       bzero(long_reply_buf->lrc_buf, LONG_REPLY_LEN);
+       lrc_clist->c_daddr        = (uint64)long_reply_buf->lrc_buf;
+       lrc_clist->long_reply_buf = (uint64)long_reply_buf;
+       lrc_clist->c_dmemhandle   = long_reply_buf->lrc_mhandle;
+#else
+       addr = kmem_alloc(LONG_REPLY_LEN, KM_SLEEP);
+       bzero(addr, LONG_REPLY_LEN);
+       lrc_clist->c_daddr        = (uint64)addr;
+       lrc_clist->long_reply_buf = NULL;
+#endif
+       lrc_clist->c_len  = LONG_REPLY_LEN;
+       lrc_clist->c_next = NULL;
+       status = clist_register(conn, lrc_clist, 0);
+       if (status == 0) {
+               *exists = TRUE;
+               return CLNT_RDMA_SUCCESS;
+       }
+       cmn_err(CE_WARN, "clnt_setup_long_reply: cannot register buffer");
+#ifdef SERVER_REG_CACHE
+       RDMA_FREE_SERVER_CACHE_BUF(conn, (rib_lrc_entry_t *)long_reply_buf);
+#else
+       kmem_free((void*)addr, (size_t)LONG_REPLY_LEN);
+#endif
+       lrc_clist->c_daddr = NULL;
+       return CLNT_RDMA_FAIL;
+}
+
+/*
+ * Build the write chunk list for an NFSv3 READ from the caller's result
+ * structure: one clist entry per iovec when xr is x_READ3uiores, a single
+ * entry covering data_val when xr is x_READ3vres, and no list for any
+ * other xr.  *num_segment is written only in the two READ3 branches.
+ */
+static void
+clnt_read3args_make_wlist(caddr_t replyp, struct clist **rpccall_wlist,
+                          xdrproc_t xr, uint_t *num_segment)
+{
+       READ3uiores *ures = (READ3uiores *)replyp;
+       READ3vres   *vres = (READ3vres *)replyp;
+       struct clist *rwl = NULL, *prev = NULL;
+       int i, total_length;
+
+       *rpccall_wlist = NULL;
+
+#ifdef RPC_RDMA_INLINE
+       /* Results shorter than rdma_minchunk get no write list at all. */
+       if (xr == x_READ3uiores) {
+               total_length = 0;
+               for (i = 0; i < ures->uiop->uio_iovcnt; i++) {
+                       total_length += ures->uiop->uio_iov[i].iov_len;
+               }
+       } else {
+               total_length = vres->data.data_len;
+       }
+
+       if (total_length < rdma_minchunk)
+               return;
+#endif
+
+       /* XXX: fake a chunk threshold for the combined length for now */
+       if (xr == x_READ3uiores) {
+               *num_segment = ures->uiop->uio_iovcnt;
+               for (i = 0; i < ures->uiop->uio_iovcnt; i++) {
+                       rwl = (struct clist *)kmem_zalloc(sizeof (struct clist),
+                                       KM_SLEEP);
+                       rwl->c_len = ures->uiop->uio_iov[i].iov_len;
+                       rwl->c_daddr = (uint64)(ures->uiop->uio_iov[i].iov_base);
+                       /*
+                        * For a userspace address, put the address space
+                        * pointer in the clist.  Otherwise nothing is needed:
+                        * kmem_zalloc() already zeroed the entry.
+                        */
+                       if (ures->uiop->uio_segflg == UIO_USERSPACE) {
+                               rwl->c_adspc = ttoproc(curthread)->p_as;
+                       } else {
+                               rwl->c_dpplist = (page_t **)NULL;
+                       }
+
+                       /* Append to the tail; the first entry heads the list. */
+                       if (prev == NULL)
+                               *rpccall_wlist = rwl;
+                       else
+                               prev->c_next = rwl;
+                       prev = rwl;
+               }
+               /*
+                * The last entry's c_next is already NULL from kmem_zalloc();
+                * storing through rwl here would dereference NULL when
+                * uio_iovcnt is 0.
+                */
+       } else if (xr == x_READ3vres) {
+               *num_segment = 1;
+               rwl = (struct clist *)kmem_zalloc(sizeof (struct clist),
+                               KM_SLEEP);
+               *rwl = empty_cl;
+               rwl->c_len = vres->data.data_len;
+               rwl->c_daddr = (uint64)(vres->data.data_val);
+               *rpccall_wlist = rwl;
+       }
+}
+
 /* ARGSUSED */
 static enum clnt_stat
 clnt_rdma_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
     caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, struct timeval wait)
 {
@@ -294,20 +612,30 @@
        int     status;
        XDR     *xdrs;
        XDR     *cxdrp = NULL, callxdr; /* for xdrrdma encoding the RPC call */
        XDR     *rxdrp = NULL, replxdr; /* for xdrrdma decoding the RPC reply */
        struct rpc_msg  reply_msg;
-       struct clist *sendlist, *recvlist = NULL;
-       struct clist *cl = NULL, *cle = NULL;
+       struct clist *sendlist = NULL, *recvlist = NULL;
+       struct clist *cl = NULL, *cle = NULL, *rdma_reply = NULL;
        uint_t vers, op;
        uint_t off;
        uint32_t xid;
+       uint32_t seg_array_len;
        CONN *conn = NULL;
-       rdma_buf_t clmsg, rpcmsg, longmsg, rpcreply;
+       rdma_buf_t clmsg = {0}, rpcmsg = {0};
        int msglen;
        clock_t ticks;
+       bool_t wlist_exists_reply  = FALSE;
+       bool_t long_reply_buf_exists = FALSE;
 
+        struct clist *rpccall_wlist = NULL, *rpcreply_wlist = NULL, 
+                    long_reply_clist ={0};
+        rpccall_read_t read_type;
+        rpccall_write_t write_type;
+        uint32_t rdma_credit = rdma_bufs_rqst;
+        struct clist long_reply_buf_clist = {0};
+
        RCSTAT_INCR(rccalls);
        /*
         * Get unique xid
         */
        if (p->cku_xid == 0)
@@ -361,10 +689,13 @@
                        break;
                }
 
                return (p->cku_err.re_status);
        }
+
+        clnt_check_credit(conn);
+
        /*
         * Get the size of the rpc call message. Need this
         * to determine if the rpc call message will fit in
         * the pre-allocated RDMA buffers. If the rpc call
         * message length is greater that the pre-allocated
@@ -372,173 +703,87 @@
         * buffer is allocated and registered for the Long
         * RPC call.
         */
        xdrs = &callxdr;
        msglen = CKU_HDRSIZE + BYTES_PER_XDR_UNIT;
+
        if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
                msglen += xdrrdma_authsize(h->cl_auth, p->cku_cred,
                                rdma_minchunk);
                msglen += xdrrdma_sizeof(xdr_args, argsp, rdma_minchunk);
 
-               if (msglen > RPC_MSG_SZ) {
-
-                       /*
-                        * Long RPC. Allocate one time use custom buffer.
-                        */
-                       rpcmsg.type = CHUNK_BUFFER;
-                       rpcmsg.addr = kmem_zalloc(msglen, KM_SLEEP);
-                       cle = kmem_zalloc(sizeof (*cle), KM_SLEEP);
-                       cle->c_xdroff = 0;
-                       cle->c_len  = rpcmsg.len = msglen;
-                       cle->c_saddr = (uint64)(uintptr_t)rpcmsg.addr;
-                       cle->c_next = NULL;
-                       xdrrdma_create(xdrs, rpcmsg.addr, msglen,
-                           rdma_minchunk, cle, XDR_ENCODE, NULL);
-                       cxdrp = xdrs;
-                       op = RDMA_NOMSG;
+               if (msglen > RPC_MSG_SZ) 
+                    read_type = RPCCALL_RCHUNK;
+               else 
+                    read_type = RPCCALL_NORCHUNK;
                } else {
                        /*
-                        * Get a pre-allocated buffer for rpc call
-                        */
-                       rpcmsg.type = SEND_BUFFER;
-                       if (RDMA_BUF_ALLOC(conn, &rpcmsg)) {
-                               p->cku_err.re_status = RPC_CANTSEND;
-                               p->cku_err.re_errno = EIO;
-                               RCSTAT_INCR(rcnomem);
-                               cmn_err(CE_WARN,
-                                   "clnt_rdma_kcallit: no buffers!");
-                               goto done;
-                       }
-                       xdrrdma_create(xdrs, rpcmsg.addr, rpcmsg.len,
-                           rdma_minchunk, NULL, XDR_ENCODE, NULL);
-                       cxdrp = xdrs;
-                       op = RDMA_MSG;
-               }
-       } else {
-               /*
                 * For RPCSEC_GSS since we cannot accurately presize the
                 * buffer required for encoding, we assume that its going
                 * to be a Long RPC to start with. We also create the
                 * the XDR stream with min_chunk set to 0 which instructs
                 * the XDR layer to not chunk the incoming byte stream.
                 */
 
                msglen += 2 * MAX_AUTH_BYTES + 2 * sizeof (struct opaque_auth);
-               msglen += xdr_sizeof(xdr_args, argsp);
+                msglen += xdrrdma_sizeof(xdr_args, argsp, rdma_minchunk);
 
-               /*
-                * Long RPC. Allocate one time use custom buffer.
-                */
-               longmsg.type = CHUNK_BUFFER;
-               longmsg.addr = kmem_zalloc(msglen, KM_SLEEP);
-               cle = kmem_zalloc(sizeof (*cle), KM_SLEEP);
-               cle->c_xdroff = 0;
-               cle->c_len  = longmsg.len = msglen;
-               cle->c_saddr = (uint64)(uintptr_t)longmsg.addr;
-               cle->c_next = NULL;
-               xdrrdma_create(xdrs, longmsg.addr, msglen, 0, cle,
-                   XDR_ENCODE, NULL);
-               cxdrp = xdrs;
-               op = RDMA_NOMSG;
+                if (msglen > RPC_MSG_SZ)
+                    read_type = RPCCALL_RCHUNK;
+                else
+                    read_type = RPCCALL_NORCHUNK;
        }
 
-       if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
-               /*
-                * Copy in the preserialized RPC header
-                * information.
-                */
-               bcopy(p->cku_rpchdr, rpcmsg.addr, CKU_HDRSIZE);
+        if (read_type == RPCCALL_NORCHUNK) {
 
-               /*
-                * transaction id is the 1st thing in the output
-                * buffer.
-                */
-               /* LINTED pointer alignment */
-               (*(uint32_t *)(rpcmsg.addr)) = p->cku_xid;
-
-               /* Skip the preserialized stuff. */
-               XDR_SETPOS(xdrs, CKU_HDRSIZE);
-
-               /* Serialize dynamic stuff into the output buffer. */
-               if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
-                   (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
-                   (!(*xdr_args)(xdrs, argsp))) {
-                       rdma_buf_free(conn, &rpcmsg);
-                       if (cle)
-                               clist_free(cle);
-                       p->cku_err.re_status = RPC_CANTENCODEARGS;
-                       p->cku_err.re_errno = EIO;
-                       cmn_err(CE_WARN,
-       "clnt_rdma_kcallit: XDR_PUTINT32/AUTH_MARSHAL/xdr_args failed");
+                rpcmsg.type = SEND_BUFFER;
+                if (RDMA_BUF_ALLOC(conn, &rpcmsg)) {
+                        cmn_err(CE_WARN, "clnt_rdma_kcallit: no buffers!");
                        goto done;
                }
-               p->cku_outsz = XDR_GETPOS(xdrs);
        } else {
-               uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[CKU_HDRSIZE];
-               IXDR_PUT_U_INT32(uproc, procnum);
-               (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;
-               XDR_SETPOS(xdrs, 0);
-
-               /* Serialize the procedure number and the arguments. */
-               if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr,
-                   CKU_HDRSIZE+4, xdrs, xdr_args, argsp)) {
-                       if (longmsg.addr != xdrs->x_base) {
-                               longmsg.addr = xdrs->x_base;
-                               longmsg.len = xdr_getbufsize(xdrs);
+#ifdef SERVER_REG_CACHE
+               rib_lrc_entry_t *long_reply_buf = NULL;
+#endif
+                rpcmsg.type = CHUNK_BUFFER;
+#ifdef SERVER_REG_CACHE
+                long_reply_buf = RDMA_GET_SERVER_CACHE_BUF(conn, msglen);
+                rpcmsg.addr =  long_reply_buf->lrc_buf;
+#else
+                rpcmsg.addr = kmem_zalloc(msglen, KM_SLEEP);
+#endif
+                cle = (struct clist *)kmem_zalloc(sizeof (struct clist),
+                                                  KM_SLEEP);
+                cle->c_xdroff = 0;
+                cle->c_len    = rpcmsg.len = msglen;
+                cle->c_saddr  = (uint64)(uintptr_t)rpcmsg.addr;
+                cle->c_next   = NULL;
+#ifdef SERVER_REG_CACHE
+               cle->long_reply_buf  = (uint64)long_reply_buf;
+#endif
                        }
-                       rdma_buf_free(conn, &longmsg);
+
+        op = cle ? RDMA_NOMSG : RDMA_MSG;
+        cxdrp = xdrs;
+        xdrrdma_create(xdrs, rpcmsg.addr, (cle ? msglen : rpcmsg.len),
+                       rdma_minchunk, cle, XDR_ENCODE, NULL);
+        
+        status = clnt_compose_rpcmsg(h, procnum, &rpcmsg, xdrs, xdr_args, argsp);
+        if (status != CLNT_RDMA_SUCCESS) {
+            rdma_buf_free(conn, &rpcmsg);
                        clist_free(cle);
                        p->cku_err.re_status = RPC_CANTENCODEARGS;
                        p->cku_err.re_errno = EIO;
                        cmn_err(CE_WARN,
-               "clnt_rdma_kcallit: AUTH_WRAP failed");
+                    "clnt_rdma_kcallit: clnt_compose_rpcmsg failed");
                        goto done;
                }
-               /*
-                * If we had to allocate a new buffer while encoding
-                * then update the addr and len.
-                */
-               if (longmsg.addr != xdrs->x_base) {
-                       longmsg.addr = xdrs->x_base;
-                       longmsg.len = xdr_getbufsize(xdrs);
-               }
 
-               /*
-                * If it so happens that the encoded message is after all
-                * not long enough to be a Long RPC then allocate a
-                * SEND_BUFFER and copy the encoded message into it.
+        /*  Read chunklist (a linked list of N elements,
+         *  position P (same P for all chunks of same arg!)):
+         *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
                 */
-               p->cku_outsz = XDR_GETPOS(xdrs);
-               if (p->cku_outsz > RPC_MSG_SZ) {
-                       rpcmsg.type = CHUNK_BUFFER;
-                       rpcmsg.addr = longmsg.addr;
-                       rpcmsg.len = longmsg.len;
-               } else {
-                       clist_free(cle);
-                       XDR_DESTROY(cxdrp);
-                       cxdrp = NULL;
-                       /*
-                        * Get a pre-allocated buffer for rpc call
-                        */
-                       rpcmsg.type = SEND_BUFFER;
-                       if (RDMA_BUF_ALLOC(conn, &rpcmsg)) {
-                               p->cku_err.re_status = RPC_CANTSEND;
-                               p->cku_err.re_errno = EIO;
-                               RCSTAT_INCR(rcnomem);
-                               cmn_err(CE_WARN,
-                                   "clnt_rdma_kcallit: no buffers!");
-                               rdma_buf_free(conn, &longmsg);
-                               goto done;
-                       }
-                       bcopy(longmsg.addr, rpcmsg.addr, p->cku_outsz);
-                       xdrrdma_create(xdrs, rpcmsg.addr, p->cku_outsz, 0,
-                           NULL, XDR_ENCODE, NULL);
-                       cxdrp = xdrs;
-                       rdma_buf_free(conn, &longmsg);
-                       op = RDMA_MSG;
-               }
-       }
 
        cl = xdrrdma_clist(xdrs);
 
        /*
         * Update the chunk size information for the Long RPC msg.
@@ -545,64 +790,70 @@
         */
        if (cl && op == RDMA_NOMSG)
                cl->c_len = p->cku_outsz;
 
        /*
-        * Set up the RDMA chunk message
+         * Prepare the header for the RDMA chunk
         */
-       vers = RPCRDMA_VERS;
-       clmsg.type = SEND_BUFFER;
-       if (RDMA_BUF_ALLOC(conn, &clmsg)) {
+        status = clnt_compose_rdma_header(conn, h, &clmsg, &xdrs, &op);
+        if (status != CLNT_RDMA_SUCCESS) {
                p->cku_err.re_status = RPC_CANTSEND;
                p->cku_err.re_errno = EIO;
                rdma_buf_free(conn, &rpcmsg);
+                clist_free(cle);
                RCSTAT_INCR(rcnomem);
                cmn_err(CE_WARN, "clnt_rdma_kcallit: no free buffers!!");
                goto done;
        }
-       xdrs = &p->cku_outxdr;
-       xdrmem_create(xdrs, clmsg.addr, clmsg.len, XDR_ENCODE);
-       /*
-        * Treat xid as opaque (xid is the first entity
-        * in the rpc rdma message).
-        */
-       (*(uint32_t *)clmsg.addr) = p->cku_xid;
-       /* Skip xid and set the xdr position accordingly. */
-       XDR_SETPOS(xdrs, sizeof (uint32_t));
-       (void) xdr_u_int(xdrs, &vers);
-       (void) xdr_u_int(xdrs, &op);
 
-       /*
-        * Now XDR the chunk list
-        */
-       if (cl != NULL) {
+        status = clnt_setup_rlist(conn, xdrs, &cl);
+        if (status != CLNT_RDMA_SUCCESS) {
+           cmn_err(CE_WARN, "clnt_rdma_kcallit: clist register failed");
+           rdma_buf_free(conn, &clmsg);
+           rdma_buf_free(conn, &rpcmsg);
+           clist_free(cl);
+           p->cku_err.re_status = RPC_CANTSEND;
+           p->cku_err.re_errno = EIO;
+           goto done;
+       }
 
-               /*
-                * Register the chunks in the list
+        /* Setup write chunk list for NFS3 READ operation 
+         * Other operations will have a NULL wlist 
                 */
-               status = clist_register(conn, cl, 1);
-               if (status != RDMA_SUCCESS) {
-                       cmn_err(CE_WARN,
-               "clnt_rdma_kcallit: clist register failed");
+        status = clnt_setup_wlist(conn, procnum, &rpccall_wlist, 
+                                 resultsp, xdr_results, xdrs);
+        if (status != CLNT_RDMA_SUCCESS) {
                        rdma_buf_free(conn, &clmsg);
                        rdma_buf_free(conn, &rpcmsg);
                        clist_free(cl);
                        p->cku_err.re_status = RPC_CANTSEND;
                        p->cku_err.re_errno = EIO;
                        goto done;
                }
 
+        status = clnt_setup_long_reply(conn, procnum, &long_reply_buf_clist, 
+                       xdrs, &long_reply_buf_exists);
+        if (status != CLNT_RDMA_SUCCESS) {
+              rdma_buf_free(conn, &clmsg);
+              rdma_buf_free(conn, &rpcmsg);
+              clist_free(cl);
+              p->cku_err.re_status = RPC_CANTSEND;
+              p->cku_err.re_errno = EIO;
+              goto done;
        }
-       (void) xdr_do_clist(xdrs, &cl);
 
        /*
+         * XDR encode the RDMA_REPLY write chunk
+         */
+       seg_array_len = (long_reply_buf_exists ? 1:0);
+        (void) xdr_encode_reply_wchunk(xdrs, &long_reply_buf_clist, seg_array_len);
+       /*
         * Start with the RDMA header and clist (if any)
         */
        sendlist = NULL;
        clist_add(&sendlist, 0, XDR_GETPOS(xdrs), &clmsg.handle,
                clmsg.addr, NULL, NULL);
-
        /*
         * Put the RPC call message in the send list if small RPC
         */
        if (op == RDMA_MSG) {
                clist_add(&sendlist, 0, p->cku_outsz, &rpcmsg.handle,
@@ -646,11 +897,15 @@
        }
 
        /*
         * Send the call message to the server
         */
+#if defined (CLNT_INTERRUPT_COAL)
+       status = RDMA_SEND_BL(conn, sendlist, p->cku_xid);
+#else
        status = RDMA_SEND(conn, sendlist, p->cku_xid);
+#endif
        if (status != RDMA_SUCCESS) {
                if (cl) {
                        (void) clist_deregister(conn, cl, 1);
                        clist_free(cl);
                        /*
@@ -672,20 +927,16 @@
                clmsg.addr = NULL;
                if (rpcmsg.type == SEND_BUFFER)
                        rpcmsg.addr = NULL;
        }
        clist_free(sendlist);
-#ifdef DEBUG
-if (rdma_clnt_debug) {
-               printf("clnt_rdma_kcallit: send request xid %u\n", p->cku_xid);
-       }
-#endif
 
        /*
         * Recv rpc reply
         */
        status = RDMA_RECV(conn, &recvlist, p->cku_xid);
+        clnt_return_credit(conn);
 
        /*
         * Deregister chunks sent. Do this only after the reply
         * is received as that is a sure indication that the
         * remote end has completed RDMA of the chunks.
@@ -704,16 +955,10 @@
 
        /*
         * Now check recv status
         */
        if (status != 0) {
-#ifdef DEBUG
-               if (rdma_clnt_debug)
-                       cmn_err(CE_NOTE,
-                           "clnt_rdma_kcallit: reply failed %u status %d",
-                           p->cku_xid, status);
-#endif
                if (status == RDMA_INTR) {
                        p->cku_err.re_status = RPC_INTR;
                        p->cku_err.re_errno = EINTR;
                        RCSTAT_INCR(rcintrs);
                } else if (status == RPC_TIMEDOUT) {
@@ -724,14 +969,10 @@
                        p->cku_err.re_status = RPC_CANTRECV;
                        p->cku_err.re_errno = EIO;
                }
                goto done;
        }
-#ifdef DEBUG
-       if (rdma_clnt_debug)
-               printf("clnt_rdma_kcallit: got response xid %u\n", p->cku_xid);
-#endif
        /*
         * Process the reply message.
         *
         * First the chunk list (if any)
         */
@@ -744,92 +985,74 @@
         */
        xid = *(uint32_t *)(uintptr_t)recvlist->c_saddr;
        /* Skip xid and set the xdr position accordingly. */
        XDR_SETPOS(xdrs, sizeof (uint32_t));
        (void) xdr_u_int(xdrs, &vers);
+       (void) xdr_u_int(xdrs, &rdma_credit); 
        (void) xdr_u_int(xdrs, &op);
        (void) xdr_do_clist(xdrs, &cl);
-       off = xdr_getpos(xdrs);
+        clnt_update_credit(conn, rdma_credit); 
+         wlist_exists_reply = FALSE;
+         if (! xdr_decode_wlist(xdrs, &rpcreply_wlist, &wlist_exists_reply)) {
+                 cmn_err(CE_NOTE,
+                         "clnt_rdma_kcallit: xdr_decode_wlist failed");
+                 /* XXX: what should we fail with here -- EIO? */
+         }
+#ifdef RPC_RDMA_INLINE
+         if (xdr_results == x_READ3vres) {
+                 ((READ3vres *)resultsp)->wlist = NULL;
+         } else if (xdr_results == x_READ3uiores) {
+                 ((READ3uiores *)resultsp)->wlist = NULL;
+         }
+#endif
 
-       /*
-        * Now the RPC reply message itself. If the reply
-        * came as a chunk item, then RDMA the reply over.
-        */
-       xdrs = &replxdr;
-       if (cl && op == RDMA_NOMSG) {
-               struct clist            *cle = cl;
+         if (procnum == NFSPROC3_READ) {
 
-               rpcreply.type = CHUNK_BUFFER;
-               rpcreply.addr = kmem_alloc(cle->c_len, KM_SLEEP);
-               rpcreply.len = cle->c_len;
-               cle->c_daddr = (uint64)(uintptr_t)rpcreply.addr;
-               cl = cl->c_next;
-               cle->c_next = NULL;
+                 check_dereg_wlist(conn, rpccall_wlist);
 
-               /*
-                * Register the rpc reply chunk destination
-                */
-               status = clist_register(conn, cle, 0);
-               if (status) {
-                       rdma_buf_free(conn, &rpcreply);
-                       clist_free(cle);
-                       p->cku_err.re_status = RPC_CANTDECODERES;
-                       p->cku_err.re_errno = EIO;
-                       cmn_err(CE_WARN,
-                           "clnt_rdma_kcallit: clist_register failed");
-                       goto rdma_done;
+                 if (wlist_exists_reply) {
+                         if (xdr_results == x_READ3vres) {
+                                 ((READ3vres *)resultsp)->wlist =
+                                        rpcreply_wlist;
+                                 ((READ3vres *)resultsp)->wlist_len =
+                                         rpcreply_wlist->c_len;
+                         } else if (xdr_results == x_READ3uiores) {
+                                 ((READ3uiores *)resultsp)->wlist =
+                                        rpcreply_wlist;
+                                 ((READ3uiores *)resultsp)->wlist_len =
+                                         rpcreply_wlist->c_len;
+                         } else {
+                                 cmn_err(CE_NOTE,
+                                         "unknown READ3 xdr decode fnp=%p",
+                                         (void *)xdr_results);
                }
-
-               /*
-                * Now read rpc reply in
-                */
-#ifdef DEBUG
-       if (rdma_clnt_debug)
-               printf("clnt_rdma_kcallit: read chunk, len %d, xid %u, \
-                       reply xid %u\n", cle->c_len, p->cku_xid, xid);
-#endif
-               status = RDMA_READ(conn, cle, WAIT);
-               if (status) {
-                       (void) clist_deregister(conn, cle, 0);
-                       rdma_buf_free(conn, &rpcreply);
-                       clist_free(cle);
-                       p->cku_err.re_status = RPC_CANTDECODERES;
-                       p->cku_err.re_errno = EIO;
-                       cmn_err(CE_WARN,
-                               "clnt_rdma_kcallit: RDMA_READ failed");
-                       goto rdma_done;
                }
-
-               /*
-                * sync the memory for dma
-                */
-               status = clist_syncmem(conn, cle, 0);
-               if (status != RDMA_SUCCESS) {
-                       (void) clist_deregister(conn, cle, 0);
-                       rdma_buf_free(conn, &rpcreply);
-                       clist_free(cle);
-                       p->cku_err.re_status = RPC_CANTDECODERES;
-                       p->cku_err.re_errno = EIO;
-                       goto rdma_done;
+         } else {
+                 if(wlist_exists_reply)
+                         cmn_err(CE_NOTE,
+                                 "clnt_rdma_kcallit: received wlist for "
+                                 "non-READ3 call.  reply xdr decode fnp=%p",
+                                 (void *)xdr_results);
                }
 
                /*
-                * Deregister the Long RPC chunk
+          * The server shouldn't have sent a RDMA_SEND that
+          * the client needs to RDMA_WRITE a reply back to
+          * the server.  So silently ignoring what the
+          * server returns in the rdma_reply section of the
+          * header.
                 */
-               (void) clist_deregister(conn, cle, 0);
-               clist_free(cle);
-               xdrrdma_create(xdrs, rpcreply.addr, rpcreply.len, 0, cl,
-                       XDR_DECODE, conn);
-               rxdrp = xdrs;
-       } else {
-               rpcreply.addr = NULL;
-               xdrrdma_create(xdrs,
-                   (caddr_t)(uintptr_t)(recvlist->c_saddr + off),
-                   recvlist->c_len - off, 0, cl, XDR_DECODE, conn);
-               rxdrp = xdrs;
-       }
+        (void) xdr_decode_reply_wchunk(xdrs, &rdma_reply,conn);
+       off = xdr_getpos(xdrs);
 
+        xdrs = &replxdr;
+        if (clnt_decode_long_reply(conn, procnum, &long_reply_buf_clist,
+                                 rdma_reply, xdrs, &rxdrp,
+                                 cl, recvlist, op, off) != CLNT_RDMA_SUCCESS) 
+       {
+               goto done;
+       }       
        reply_msg.rm_direction = REPLY;
        reply_msg.rm_reply.rp_stat = MSG_ACCEPTED;
        reply_msg.acpted_rply.ar_stat = SUCCESS;
        reply_msg.acpted_rply.ar_verf = _null_auth;
        /*
@@ -910,99 +1133,143 @@
        }
 
        /*
         * If rpc reply is in a chunk, free it now.
         */
-       if (rpcreply.addr != NULL)
-               rdma_buf_free(conn, &rpcreply);
-
-rdma_done:
-       if ((cl != NULL) || (op == RDMA_NOMSG)) {
-               rdma_buf_t      donemsg;
-
-               /*
-                * Free the list holding the chunk info
-                */
-               if (cl) {
-                       clist_free(cl);
-                       cl = NULL;
-               }
-
-               /*
-                * Tell the server that the reads are done
-                */
-               donemsg.type = SEND_BUFFER;
-               if (RDMA_BUF_ALLOC(conn, &donemsg)) {
-                       p->cku_err.re_status = RPC_CANTSEND;
-                       p->cku_err.re_errno = EIO;
-                       RCSTAT_INCR(rcnomem);
-                       cmn_err(CE_WARN, "clnt_rdma_kcallit: no free buffer");
-                       goto done;
-               }
-               xdrs = &p->cku_outxdr;
-               xdrmem_create(xdrs, donemsg.addr, donemsg.len, XDR_ENCODE);
-               vers = RPCRDMA_VERS;
-               op = RDMA_DONE;
-
-               /*
-                * Treat xid as opaque (xid is the first entity
-                * in the rpc rdma message).
-                */
-               (*(uint32_t *)donemsg.addr) = p->cku_xid;
-               /* Skip xid and set the xdr position accordingly. */
-               XDR_SETPOS(xdrs, sizeof (uint32_t));
-               if (!xdr_u_int(xdrs, &vers) ||
-                   !xdr_u_int(xdrs, &op)) {
-                       cmn_err(CE_WARN,
-                               "clnt_rdma_kcallit: xdr_u_int failed");
-                       rdma_buf_free(conn, &donemsg);
-                       goto done;
-               }
-
-               sendlist = NULL;
-               clist_add(&sendlist, 0, XDR_GETPOS(xdrs), &donemsg.handle,
-                       donemsg.addr, NULL, NULL);
-
-               status = RDMA_SEND(conn, sendlist, p->cku_xid);
-               if (status != RDMA_SUCCESS) {
-                       cmn_err(CE_WARN,
-                               "clnt_rdma_kcallit: RDMA_SEND failed xid %u",
-                                       p->cku_xid);
-               }
-#ifdef DEBUG
-               else {
-               if (rdma_clnt_debug)
-                       printf("clnt_rdma_kcallit: sent RDMA_DONE xid %u\n",
-                               p->cku_xid);
-               }
+done:
+       if (long_reply_buf_exists){
+               (void) clist_deregister(conn, &long_reply_buf_clist, 0);
+#ifndef SERVER_REG_CACHE
+               kmem_free((void *)long_reply_buf_clist.c_daddr,
+                               (size_t)long_reply_buf_clist.c_len);
+#else
+         RDMA_FREE_SERVER_CACHE_BUF(conn, (rib_lrc_entry_t *)long_reply_buf_clist.long_reply_buf);
 #endif
-               clist_free(sendlist);
        }
-
-done:
        if (cxdrp)
                XDR_DESTROY(cxdrp);
        if (rxdrp) {
                (void) xdr_rpc_free_verifier(rxdrp, &reply_msg);
                XDR_DESTROY(rxdrp);
        }
 
        if (recvlist) {
-               rdma_buf_t      recvmsg;
-
+               rdma_buf_t      recvmsg = {0};
                recvmsg.addr = (caddr_t)(uintptr_t)recvlist->c_saddr;
                recvmsg.type = RECV_BUFFER;
                RDMA_BUF_FREE(conn, &recvmsg);
                clist_free(recvlist);
        }
+#if (!defined(ASYNC_CLIENT_DEREG))
+       if(rpccall_wlist){
+               kmem_free(rpccall_wlist, sizeof(clist));
+       }
+#endif
+
        RDMA_REL_CONN(conn);
        if (p->cku_err.re_status != RPC_SUCCESS) {
                RCSTAT_INCR(rcbadcalls);
        }
        return (p->cku_err.re_status);
 }
 
+/*
+ * Build the XDR decode stream for an RPC reply and return it in *rxdrp:
+ * over the long-reply buffer for RDMA_NOMSG, else inline at recvlist + off.
+ * NOTE(review): both CE_NOTE paths return CLNT_RDMA_SUCCESS with no stream
+ * created -- confirm the caller copes with that.
+ */
+static int
+clnt_decode_long_reply(CONN *conn, rpcproc_t procnum,
+    struct clist *long_reply_buf_clist, struct clist *rdma_reply, XDR *xdrs,
+    XDR **rxdrp, struct clist *cl, struct clist *recvlist, uint_t op,
+    uint_t off)
+{
+       if (RDMA_NOMSG == op && long_reply_buf_clist->c_daddr) {
+               if (procnum == NFSPROC3_READDIR ||
+                   procnum == NFSPROC3_READDIRPLUS ||
+                   procnum == NFSPROC3_READLINK) {
+                       xdrmem_destroy(xdrs);
+                       xdrrdma_create(xdrs,
+                           (caddr_t)(uintptr_t)long_reply_buf_clist->c_daddr,
+                           rdma_reply->c_len, 0, NULL, XDR_DECODE, conn);
+                       *rxdrp = xdrs;
+               } else {
+                       cmn_err(CE_NOTE, "clnt_rdma_kcallit: "
+                           "wchunk buffer for wrong nfs proc");
+                       xdrmem_destroy(xdrs);
+                       *rxdrp = NULL;
+               }
+       } else if (cl && RDMA_NOMSG == op) {
+               cmn_err(CE_NOTE, "clnt_rdma_kcallit: "
+                   "Server sent a READ list in the RPC Reply");
+               xdrmem_destroy(xdrs);
+       } else {
+               xdrmem_destroy(xdrs);
+               xdrrdma_create(xdrs,
+                   (caddr_t)(uintptr_t)(recvlist->c_saddr + off),
+                   recvlist->c_len - off, 0, cl, XDR_DECODE, conn);
+               *rxdrp = xdrs;
+       }
+       return (CLNT_RDMA_SUCCESS);
+}
+
+#ifdef DYNAMIC_CREDIT_CONTROL
+/* Request more credits once headroom is low; add, not subtract, to avoid wrap. */
+static void clnt_compute_credit(CONN *conn, uint32_t *rdma_credit)
+{
+       rdma_clnt_cred_ctrl_t *cc = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
+       mutex_enter(&conn->c_lock);
+       if (cc->clnt_cc_granted_ops < cc->clnt_cc_in_flight_ops + CLNT_CREDIT_LOW)
+               *rdma_credit = rdma_bufs_rqst + cc->clnt_cc_in_flight_ops / 2;
+       mutex_exit(&conn->c_lock);
+}
+#endif
+
+/* Give back one in-flight slot on conn and wake a thread waiting for credit. */
+static void clnt_return_credit(CONN *conn)
+{
+       rdma_clnt_cred_ctrl_t *cc = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
+       mutex_enter(&conn->c_lock);
+       cc->clnt_cc_in_flight_ops -= 1;
+       cv_signal(&cc->clnt_cc_cv);
+       mutex_exit(&conn->c_lock);
+}
+
+/*
+ * Record rdma_credit as the connection's granted buffer count for credit
+ * control; clnt_check_credit() blocks new operations against this value.
+ */
+static void clnt_update_credit(CONN *conn, uint32_t rdma_credit)
+{
+        rdma_clnt_cred_ctrl_t *cc = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
+        mutex_enter(&conn->c_lock);
+        cc->clnt_cc_granted_ops = rdma_credit;
+        mutex_exit(&conn->c_lock);
+}
+
+/*
+ * Claim one in-flight slot on conn, sleeping on clnt_cc_cv while every
+ * granted slot is in use.  A granted count of zero (no granted value
+ * received yet) imposes no limit.
+ */
+static void clnt_check_credit(CONN *conn)
+{
+        rdma_clnt_cred_ctrl_t *cc = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
+        mutex_enter(&conn->c_lock);
+        for (;;) {
+                /* No grant recorded yet, or a slot is free: proceed. */
+                if (cc->clnt_cc_granted_ops == 0 ||
+                    cc->clnt_cc_in_flight_ops < cc->clnt_cc_granted_ops)
+                        break;
+                /* Maxed out: block until clnt_return_credit() signals. */
+                cv_wait(&cc->clnt_cc_cv, &conn->c_lock);
+        }
+        cc->clnt_cc_in_flight_ops += 1;
+        mutex_exit(&conn->c_lock);
+}
+
 /* ARGSUSED */
 static void
 clnt_rdma_kabort(CLIENT *h)
 {
 }
@@ -1051,10 +1318,14 @@
        struct knetconfig *knc;
        char *pf, *p;
        rdma_stat status;
        int error = 0;
 
+       mutex_enter(&rdma_modload_lock);
+       error = rdma_modload();
+       mutex_exit(&rdma_modload_lock);
+
        if (!INGLOBALZONE(curproc))
                return (-1);
        /*
         * modload the RDMA plugins if not already done.
         */
@@ -1100,6 +1371,30 @@
                }
                rp = rp->r_next;
        }
        rw_exit(&rdma_lock);
        return (-1);
+}
+
+/*
+ * Deregister wlist rwc, or hand it to INSERT_QUEUE under ASYNC_CLIENT_DEREG.
+ * No-op when rwc is NULL, has a zero c_dmemhandle.mrc_rmr, or zero length.
+ */
+static void
+check_dereg_wlist(CONN *conn, clist *rwc)
+{
+#if !defined(ASYNC_CLIENT_DEREG)
+       int status;
+#endif
+
+       if (rwc == NULL || !rwc->c_dmemhandle.mrc_rmr || !rwc->c_len)
+               return;
+#if defined(ASYNC_CLIENT_DEREG)
+       /* Add in an entry to rqueue */
+       INSERT_QUEUE(conn, rwc);
+#else
+       status = clist_deregister(conn, rwc, FALSE);
+       if (status != RDMA_SUCCESS) {
+               cmn_err(CE_NOTE, "dereg_wlist failed. status=%d", status);
+       }
+#endif
+}