Cdiff clnt_rdma.c
*** /webrev/webrev/usr/src/uts/common/rpc/clnt_rdma.c-  Mon Aug 14 13:12:10 2006
--- clnt_rdma.c Thu Aug 10 14:22:04 2006

*** 29,38 **** --- 29,51 ---- * Portions of this source code were derived from Berkeley * 4.3 BSD under license from the Regents of the University of * California. */ + /* Copyright (c) 2006, The Ohio State University. All rights reserved. + * + * Portions of this source code were developed by the team members of + * The Ohio State University's Network-Based Computing Laboratory (NBCL), + * headed by Professor Dhabaleswar K. (DK) Panda. + * + * Acknowledgements to contributions from developers: + * Ranjit Noronha: noronha@cse.ohio-state.edu + * Lei Chai : chail@cse.ohio-state.edu + * Weikuan Yu : yuw@cse.ohio-state.edu + * + */ + #pragma ident "@(#)clnt_rdma.c 1.10 05/07/26 SMI" #include <sys/param.h> #include <sys/types.h> #include <sys/user.h>
*** 54,65 **** --- 67,121 ---- #include <rpc/xdr.h> #include <rpc/auth.h> #include <rpc/clnt.h> #include <rpc/rpc_msg.h> #include <rpc/rpc_rdma.h> + #include <nfs/nfs.h> + #define CLNT_CREDIT_LOW (5) + xdrproc_t x_READ3args = NULL_xdrproc_t; + xdrproc_t x_READ3res = NULL_xdrproc_t; + xdrproc_t x_READ3vres = NULL_xdrproc_t; + xdrproc_t x_READ3uiores = NULL_xdrproc_t; + + static uint32_t rdma_bufs_rqst = RDMA_BUFS_RQST; + + int rdma_wlist_verbose_debug = 0; + int rdma_wlist_memreg_debug = 0; + int rdma_wlist_clnt_debug = 0; + int rdma_wlist_svc_debug = 0; + int rdma_wlist_xdr_debug = 0; + int rdma_wlist_pglck_debug = 0; + int credit_control_debug = 0; + int rdma_long_reply_debug = 0; + int rdma_xdr_long_reply_debug = 0; + + struct clist empty_cl = {0}; + + static void clnt_read3args_make_wlist(caddr_t, struct clist **, xdrproc_t, uint_t *); + static int clnt_compose_rpcmsg(CLIENT *, rpcproc_t, rdma_buf_t *, + XDR *, xdrproc_t, caddr_t); + static int clnt_compose_rdma_header(CONN *, CLIENT *, rdma_buf_t *, + XDR **, uint_t *); + static int clnt_setup_rlist(CONN *, XDR *, struct clist **); + static int clnt_setup_wlist(CONN *, rpcproc_t, struct clist **, + caddr_t, xdrproc_t, XDR *); + static int clnt_setup_long_reply(CONN *, rpcproc_t, struct clist *, + XDR *, bool_t *); + #ifdef DYNAMIC_CREDIT_CONTROL + static void clnt_compute_credit(CONN *, uint32_t *); + #endif + static void clnt_check_credit(CONN *); + static void clnt_return_credit(CONN *); + static int clnt_decode_long_reply(CONN *, rpcproc_t, struct clist *, + struct clist *, XDR *, XDR **, struct clist *, + struct clist *, uint_t,uint_t); + + static void clnt_update_credit(CONN *,uint32_t); + static void check_dereg_wlist(CONN *, struct clist *); + static enum clnt_stat clnt_rdma_kcallit(CLIENT *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, caddr_t, struct timeval); static void clnt_rdma_kabort(CLIENT *); static void clnt_rdma_kerror(CLIENT *, struct rpc_err *); static bool_t clnt_rdma_kfreeres(CLIENT *, 
xdrproc_t, caddr_t);
*** 83,92 **** --- 139,150 ---- /* * The size of the preserialized RPC header information. */ #define CKU_HDRSIZE 20 + #define CLNT_RDMA_SUCCESS 0 + #define CLNT_RDMA_FAIL -99 /* * Per RPC RDMA endpoint details */ typedef struct cku_private {
*** 283,292 **** --- 341,610 ---- p->cku_addr.len = raddr->len; bcopy(raddr->buf, p->cku_addr.buf, raddr->len); h->cl_ops = &rdma_clnt_ops; }
+
+ /*
+  * clnt_compose_rpcmsg()
+  *
+  * Encode the RPC call message through xdrs and record the encoded
+  * length in the endpoint's cku_outsz.
+  *
+  * Non-RPCSEC_GSS flavors: the preserialized header (cku_rpchdr,
+  * CKU_HDRSIZE bytes) is copied to rpcmsg->addr, its first word is
+  * overwritten with the xid, and the procedure number, credentials
+  * (AUTH_MARSHALL) and arguments (xdr_args) are appended after it.
+  *
+  * RPCSEC_GSS: the procedure number is stored behind the header inside
+  * cku_rpchdr, the stream is rewound to offset 0 and AUTH_WRAP is given
+  * CKU_HDRSIZE+4 bytes with no argument encoder; the arguments are then
+  * encoded by a separate xdr_args call.  If the stream's base no longer
+  * matches rpcmsg->addr, rpcmsg is updated to describe the stream's
+  * buffer, on the failure path as well as on success.
+  *
+  * NOTE(review): the code this hunk replaces handed xdr_args/argsp to
+  * AUTH_WRAP itself; encoding the arguments outside AUTH_WRAP may bypass
+  * GSS integrity/privacy wrapping of the arguments -- confirm.
+  *
+  * Returns CLNT_RDMA_SUCCESS, or CLNT_RDMA_FAIL if any encode step fails.
+  */
+ static int clnt_compose_rpcmsg(CLIENT *h, rpcproc_t procnum,
+     rdma_buf_t *rpcmsg, XDR *xdrs,
+     xdrproc_t xdr_args, caddr_t argsp)
+ {
+     cku_private_t *p = htop(h);
+
+     if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
+         /*
+          * Copy in the preserialized RPC header
+          * information.
+          */
+         bcopy(p->cku_rpchdr, rpcmsg->addr, CKU_HDRSIZE);
+
+         /*
+          * transaction id is the 1st thing in the output
+          * buffer.
+          */
+         /* LINTED pointer alignment */
+         (*(uint32_t *)(rpcmsg->addr)) = p->cku_xid;
+
+         /* Skip the preserialized stuff. */
+         XDR_SETPOS(xdrs, CKU_HDRSIZE);
+
+         /* Serialize dynamic stuff into the output buffer. */
+         if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
+             (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
+             (!(*xdr_args)(xdrs, argsp))) {
+             cmn_err(CE_WARN,"Failed to serialize dynamic arguments\n");
+             return CLNT_RDMA_FAIL;
+         }
+         p->cku_outsz = XDR_GETPOS(xdrs);
+     } else {
+         uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[CKU_HDRSIZE];
+         IXDR_PUT_U_INT32(uproc, procnum);
+         (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;
+         XDR_SETPOS(xdrs, 0);
+
+         /* Serialize the procedure number and the arguments. */
+         if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr,
+             CKU_HDRSIZE+4, xdrs, NULL, NULL) ||
+             !(*xdr_args)(xdrs, argsp)) {
+             if (rpcmsg->addr != xdrs->x_base) {
+                 rpcmsg->addr = xdrs->x_base;
+                 rpcmsg->len = xdr_getbufsize(xdrs);
+             }
+             cmn_err(CE_WARN,"Failed to serialize procedure number and the arguments.\n");
+             return CLNT_RDMA_FAIL;
+         }
+         /*
+          * If we had to allocate a new buffer while encoding
+          * then update the addr and len.
+          */
+         if (rpcmsg->addr != xdrs->x_base) {
+             rpcmsg->addr = xdrs->x_base;
+             rpcmsg->len = xdr_getbufsize(xdrs);
+         }
+
+         p->cku_outsz = XDR_GETPOS(xdrs);
+     }
+
+     return CLNT_RDMA_SUCCESS;
+ }
+
+ /*
+  * clnt_compose_rdma_header()
+  *
+  * Allocate a SEND_BUFFER into clmsg and start the RPC/RDMA header in it:
+  * the xid is stored raw as the first 32-bit word, followed by the XDR
+  * encoded version (RPCRDMA_VERS), the credit request and *op.  The
+  * credit request starts from rdma_bufs_rqst and, when built with
+  * DYNAMIC_CREDIT_CONTROL, may be adjusted by clnt_compute_credit().
+  *
+  * On return *xdrs points at the endpoint's cku_outxdr, positioned after
+  * the fields above so the caller can append the chunk lists.
+  *
+  * Returns CLNT_RDMA_FAIL only when the buffer cannot be allocated; the
+  * results of the three xdr_u_int() calls are deliberately ignored.
+  */
+ static int clnt_compose_rdma_header(CONN *conn, CLIENT *h, rdma_buf_t *clmsg,
+     XDR **xdrs, uint_t *op)
+ {
+     cku_private_t *p = htop(h);
+     uint_t vers;
+     uint32_t rdma_credit = rdma_bufs_rqst;
+
+     vers = RPCRDMA_VERS;
+     clmsg->type = SEND_BUFFER;
+
+ #ifdef DYNAMIC_CREDIT_CONTROL
+     clnt_compute_credit(conn, &rdma_credit);
+ #endif
+
+     if (RDMA_BUF_ALLOC(conn, clmsg)) {
+         return CLNT_RDMA_FAIL;
+     }
+
+     *xdrs = &p->cku_outxdr;
+     xdrmem_create(*xdrs, clmsg->addr, clmsg->len, XDR_ENCODE);
+
+     /* The xid is treated as opaque: written raw, then skipped over. */
+     (*(uint32_t *)clmsg->addr) = p->cku_xid;
+     XDR_SETPOS(*xdrs, sizeof (uint32_t));
+     (void) xdr_u_int(*xdrs, &vers);
+     (void) xdr_u_int(*xdrs, &rdma_credit);
+     (void) xdr_u_int(*xdrs, op);
+
+     return CLNT_RDMA_SUCCESS;
+ }
+
+ /*
+  * clnt_setup_rlist()
+  *
+  * Register the chunks of the read list *cl (when there is one) with
+  * clist_register(conn, *cl, 1) and XDR the list into the header stream.
+  * Returns CLNT_RDMA_FAIL if registration fails; the result of
+  * xdr_do_clist() is ignored.
+  */
+ static int clnt_setup_rlist(CONN *conn, XDR *xdrs, struct clist **cl)
+ {
+     int ret;
+
+     if (*cl != NULL) {
+         ret = clist_register(conn, *cl, 1);
+         if (ret != RDMA_SUCCESS) {
+             return CLNT_RDMA_FAIL;
+         }
+     }
+     (void) xdr_do_clist(xdrs, cl);
+
+     return CLNT_RDMA_SUCCESS;
+ }
+
+ /*
+  * clnt_setup_wlist()
+  *
+  * Build, register and encode the write chunk list for the call.  Only
+  * NFSPROC3_READ gets a list (derived from the caller's result buffers
+  * by clnt_read3args_make_wlist()); every other procedure encodes a
+  * NULL list with zero segments.
+  *
+  * On failure the list, if any, is left in *rpccall_wlist; this function
+  * does not free it.
+  */
+ static int clnt_setup_wlist(CONN *conn, rpcproc_t procnum,
+     struct clist **rpccall_wlist, caddr_t resultsp,
+     xdrproc_t xdr_results, XDR *xdrs)
+ {
+     int status;
+     uint_t num_segment = 0;
+
+     if (procnum == NFSPROC3_READ) {
+         clnt_read3args_make_wlist(resultsp, rpccall_wlist,
+             xdr_results, &num_segment);
+         status = clist_register(conn, *rpccall_wlist, 0);
+         if (status != RDMA_SUCCESS)
+             return CLNT_RDMA_FAIL;
+     } else {
+         *rpccall_wlist = NULL;
+     }
+
+     if (! xdr_encode_wlist(xdrs, *rpccall_wlist, num_segment))
+         return CLNT_RDMA_FAIL;
+
+     return CLNT_RDMA_SUCCESS;
+ }
+
+ /*
+  * clnt_setup_long_reply()
+  *
+  * For NFSPROC3_READDIR, NFSPROC3_READDIRPLUS and NFSPROC3_READLINK,
+  * obtain a zeroed LONG_REPLY_LEN byte buffer (kmem_alloc, or the
+  * registration cache when built with SERVER_REG_CACHE), describe it in
+  * lrc_clist and register it with clist_register(conn, lrc_clist, 0).
+  * *exists is set to TRUE only when a buffer was set up; for all other
+  * procedures lrc_clist->c_daddr is cleared and *exists stays FALSE.
+  *
+  * If registration fails the buffer is released again and
+  * CLNT_RDMA_FAIL is returned.  The xdrs argument is not used here.
+  *
+  * NOTE(review): with RPC_RDMA_INLINE the incoming lrc_clist->c_len is
+  * compared against rdma_minchunk before this function ever sets it; the
+  * visible caller passes a zero-initialised clist, so this looks like it
+  * returns early whenever rdma_minchunk is positive -- confirm.
+  */
+ static int clnt_setup_long_reply(CONN *conn, rpcproc_t procnum,
+     struct clist *lrc_clist,
+     XDR *xdrs, bool_t *exists)
+ {
+     int status;
+     caddr_t addr;
+ #ifdef SERVER_REG_CACHE
+     rib_lrc_entry_t *long_reply_buf = NULL;
+ #endif
+     *exists = FALSE;
+     lrc_clist->c_daddr = NULL;
+
+ #ifdef RPC_RDMA_INLINE
+     if (lrc_clist->c_len < rdma_minchunk)
+         return CLNT_RDMA_SUCCESS;
+ #endif
+
+     if (procnum == NFSPROC3_READDIR ||
+         procnum == NFSPROC3_READDIRPLUS ||
+         procnum == NFSPROC3_READLINK) {
+ #ifndef SERVER_REG_CACHE
+         addr = kmem_alloc(LONG_REPLY_LEN, KM_SLEEP);
+         bzero(addr, LONG_REPLY_LEN);
+         lrc_clist->c_daddr = (uint64)addr;
+         lrc_clist->c_len = LONG_REPLY_LEN;
+         lrc_clist->c_next = NULL;
+         lrc_clist->long_reply_buf = NULL;
+         status = clist_register(conn, lrc_clist, 0);
+ #else
+         long_reply_buf = RDMA_GET_SERVER_CACHE_BUF(conn, LONG_REPLY_LEN);
+         bzero(long_reply_buf->lrc_buf, LONG_REPLY_LEN);
+         lrc_clist->c_daddr = (uint64)long_reply_buf->lrc_buf;
+         lrc_clist->c_len = LONG_REPLY_LEN;
+         lrc_clist->c_next = NULL;
+         lrc_clist->long_reply_buf = (uint64)long_reply_buf;
+         lrc_clist->c_dmemhandle = long_reply_buf->lrc_mhandle;
+         status = clist_register(conn, lrc_clist, 0);
+ #endif
+         if(status) {
+             cmn_err(CE_WARN, "clnt_setup_long_reply: cannot register buffer");
+ #ifndef SERVER_REG_CACHE
+             kmem_free((void*)addr, (size_t)LONG_REPLY_LEN);
+ #else
+             RDMA_FREE_SERVER_CACHE_BUF(conn, (rib_lrc_entry_t *)long_reply_buf);
+
+ #endif
+             lrc_clist->c_daddr = NULL;
+             return CLNT_RDMA_FAIL;
+         }
+         *exists = TRUE;
+     }
+
+     return CLNT_RDMA_SUCCESS;
+ }
+
+ /*
+  * clnt_read3args_make_wlist()
+  *
+  * Build the write chunk list describing where READ3 data should land.
+  * replyp is interpreted according to the result decoder xr:
+  *
+  *   x_READ3uiores - one clist entry per iovec of the uio, linked in
+  *                   iovec order; *num_segment is set to uio_iovcnt.
+  *                   For UIO_USERSPACE the current process' address
+  *                   space is recorded in c_adspc.
+  *   x_READ3vres   - a single entry covering data_val/data_len;
+  *                   *num_segment is set to 1.
+  *   anything else - no list is built and *num_segment is not touched.
+  *
+  * The head of the list (or NULL) is returned through rpccall_wlist.
+  * With RPC_RDMA_INLINE, no list is built when the total length is
+  * below rdma_minchunk.
+  *
+  * Entries come from kmem_zalloc(), so c_next of the last entry is
+  * already NULL; the tail is not written after the loop, which also
+  * keeps a uio with zero iovecs from dereferencing a NULL entry.
+  */
+ static void
+ clnt_read3args_make_wlist(caddr_t replyp, struct clist **rpccall_wlist,
+     xdrproc_t xr, uint_t *num_segment)
+ {
+     READ3uiores *ures = (READ3uiores *)replyp;
+     READ3vres *vres = (READ3vres *)replyp;
+     struct clist *rwl = NULL, *prev = NULL;
+     int i;
+ #ifdef RPC_RDMA_INLINE
+     int total_length;
+ #endif
+
+     *rpccall_wlist = NULL;
+
+ #ifdef RPC_RDMA_INLINE
+     if (xr == x_READ3uiores) {
+         total_length = 0;
+         for (i = 0; i < ures->uiop->uio_iovcnt; i++) {
+             total_length += ures->uiop->uio_iov[i].iov_len;
+         }
+     } else {
+         total_length = vres->data.data_len;
+     }
+
+     if (total_length < rdma_minchunk)
+         return;
+ #endif
+
+     /* XXX: fake a chunk threshold for the combined length for now */
+     if (xr == x_READ3uiores) {
+         *num_segment = ures->uiop->uio_iovcnt;
+         for (i = 0; i < ures->uiop->uio_iovcnt; i++) {
+             rwl = (struct clist *)kmem_zalloc(sizeof (struct clist),
+                 KM_SLEEP);
+
+             rwl->c_len = ures->uiop->uio_iov[i].iov_len;
+             rwl->c_daddr =
+                 (uint64)(uintptr_t)(ures->uiop->uio_iov[i].iov_base);
+             /*
+              * if userspace address, put adspace ptr in clist.
+              * If not, then do nothing since it's already
+              * set to NULL (from kmem_zalloc)
+              */
+             if (ures->uiop->uio_segflg == UIO_USERSPACE) {
+                 rwl->c_adspc = ttoproc(curthread)->p_as;
+             } else {
+                 rwl->c_dpplist = (page_t **)NULL;
+             }
+
+             /* Append: the first entry becomes the list head. */
+             if (prev != NULL)
+                 prev->c_next = rwl;
+             else
+                 *rpccall_wlist = rwl;
+             prev = rwl;
+         }
+     } else if (xr == x_READ3vres) {
+         *num_segment = 1;
+         rwl = (struct clist *)kmem_zalloc(sizeof (struct clist),
+             KM_SLEEP);
+         *rwl = empty_cl;
+
+         rwl->c_len = vres->data.data_len;
+         rwl->c_daddr = (uint64)(uintptr_t)(vres->data.data_val);
+
+         if (*rpccall_wlist == NULL)
+             *rpccall_wlist = rwl;
+     } else {
+         /*cmn_err(CE_NOTE, "read3args_make_wlist: non READ3xr=%p",
+           (void *)xr);*/
+     }
+ }
+
  /* ARGSUSED */ static enum clnt_stat clnt_rdma_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args, caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, struct timeval wait) {
*** 294,313 **** int status; XDR *xdrs; XDR *cxdrp = NULL, callxdr; /* for xdrrdma encoding the RPC call */ XDR *rxdrp = NULL, replxdr; /* for xdrrdma decoding the RPC reply */ struct rpc_msg reply_msg; ! struct clist *sendlist, *recvlist = NULL; ! struct clist *cl = NULL, *cle = NULL; uint_t vers, op; uint_t off; uint32_t xid; CONN *conn = NULL; ! rdma_buf_t clmsg, rpcmsg, longmsg, rpcreply; int msglen; clock_t ticks; RCSTAT_INCR(rccalls); /* * Get unique xid */ if (p->cku_xid == 0) --- 612,641 ---- int status; XDR *xdrs; XDR *cxdrp = NULL, callxdr; /* for xdrrdma encoding the RPC call */ XDR *rxdrp = NULL, replxdr; /* for xdrrdma decoding the RPC reply */ struct rpc_msg reply_msg; ! struct clist *sendlist = NULL, *recvlist = NULL; ! struct clist *cl = NULL, *cle = NULL, *rdma_reply = NULL; uint_t vers, op; uint_t off; uint32_t xid; + uint32_t seg_array_len; CONN *conn = NULL; ! rdma_buf_t clmsg = {0}, rpcmsg = {0}; int msglen; clock_t ticks; + bool_t wlist_exists_reply = FALSE; + bool_t long_reply_buf_exists = FALSE; + struct clist *rpccall_wlist = NULL, *rpcreply_wlist = NULL, + long_reply_clist ={0}; + rpccall_read_t read_type; + rpccall_write_t write_type; + uint32_t rdma_credit = rdma_bufs_rqst; + struct clist long_reply_buf_clist = {0}; + RCSTAT_INCR(rccalls); /* * Get unique xid */ if (p->cku_xid == 0)
*** 361,370 **** --- 689,701 ---- break; } return (p->cku_err.re_status); } + + clnt_check_credit(conn); + /* * Get the size of the rpc call message. Need this * to determine if the rpc call message will fit in * the pre-allocated RDMA buffers. If the rpc call * message length is greater that the pre-allocated
*** 372,544 **** * buffer is allocated and registered for the Long * RPC call. */ xdrs = &callxdr; msglen = CKU_HDRSIZE + BYTES_PER_XDR_UNIT; if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) { msglen += xdrrdma_authsize(h->cl_auth, p->cku_cred, rdma_minchunk); msglen += xdrrdma_sizeof(xdr_args, argsp, rdma_minchunk); ! if (msglen > RPC_MSG_SZ) { ! ! /* ! * Long RPC. Allocate one time use custom buffer. ! */ ! rpcmsg.type = CHUNK_BUFFER; ! rpcmsg.addr = kmem_zalloc(msglen, KM_SLEEP); ! cle = kmem_zalloc(sizeof (*cle), KM_SLEEP); ! cle->c_xdroff = 0; ! cle->c_len = rpcmsg.len = msglen; ! cle->c_saddr = (uint64)(uintptr_t)rpcmsg.addr; ! cle->c_next = NULL; ! xdrrdma_create(xdrs, rpcmsg.addr, msglen, ! rdma_minchunk, cle, XDR_ENCODE, NULL); ! cxdrp = xdrs; ! op = RDMA_NOMSG; } else { /* - * Get a pre-allocated buffer for rpc call - */ - rpcmsg.type = SEND_BUFFER; - if (RDMA_BUF_ALLOC(conn, &rpcmsg)) { - p->cku_err.re_status = RPC_CANTSEND; - p->cku_err.re_errno = EIO; - RCSTAT_INCR(rcnomem); - cmn_err(CE_WARN, - "clnt_rdma_kcallit: no buffers!"); - goto done; - } - xdrrdma_create(xdrs, rpcmsg.addr, rpcmsg.len, - rdma_minchunk, NULL, XDR_ENCODE, NULL); - cxdrp = xdrs; - op = RDMA_MSG; - } - } else { - /* * For RPCSEC_GSS since we cannot accurately presize the * buffer required for encoding, we assume that its going * to be a Long RPC to start with. We also create the * the XDR stream with min_chunk set to 0 which instructs * the XDR layer to not chunk the incoming byte stream. */ msglen += 2 * MAX_AUTH_BYTES + 2 * sizeof (struct opaque_auth); ! msglen += xdr_sizeof(xdr_args, argsp); ! /* ! * Long RPC. Allocate one time use custom buffer. ! */ ! longmsg.type = CHUNK_BUFFER; ! longmsg.addr = kmem_zalloc(msglen, KM_SLEEP); ! cle = kmem_zalloc(sizeof (*cle), KM_SLEEP); ! cle->c_xdroff = 0; ! cle->c_len = longmsg.len = msglen; ! cle->c_saddr = (uint64)(uintptr_t)longmsg.addr; ! cle->c_next = NULL; ! xdrrdma_create(xdrs, longmsg.addr, msglen, 0, cle, ! XDR_ENCODE, NULL); ! 
cxdrp = xdrs; ! op = RDMA_NOMSG; } ! if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) { ! /* ! * Copy in the preserialized RPC header ! * information. ! */ ! bcopy(p->cku_rpchdr, rpcmsg.addr, CKU_HDRSIZE); ! /* ! * transaction id is the 1st thing in the output ! * buffer. ! */ ! /* LINTED pointer alignment */ ! (*(uint32_t *)(rpcmsg.addr)) = p->cku_xid; ! ! /* Skip the preserialized stuff. */ ! XDR_SETPOS(xdrs, CKU_HDRSIZE); ! ! /* Serialize dynamic stuff into the output buffer. */ ! if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) || ! (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) || ! (!(*xdr_args)(xdrs, argsp))) { ! rdma_buf_free(conn, &rpcmsg); ! if (cle) ! clist_free(cle); ! p->cku_err.re_status = RPC_CANTENCODEARGS; ! p->cku_err.re_errno = EIO; ! cmn_err(CE_WARN, ! "clnt_rdma_kcallit: XDR_PUTINT32/AUTH_MARSHAL/xdr_args failed"); goto done; } - p->cku_outsz = XDR_GETPOS(xdrs); } else { ! uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[CKU_HDRSIZE]; ! IXDR_PUT_U_INT32(uproc, procnum); ! (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid; ! XDR_SETPOS(xdrs, 0); ! ! /* Serialize the procedure number and the arguments. */ ! if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr, ! CKU_HDRSIZE+4, xdrs, xdr_args, argsp)) { ! if (longmsg.addr != xdrs->x_base) { ! longmsg.addr = xdrs->x_base; ! longmsg.len = xdr_getbufsize(xdrs); } ! rdma_buf_free(conn, &longmsg); clist_free(cle); p->cku_err.re_status = RPC_CANTENCODEARGS; p->cku_err.re_errno = EIO; cmn_err(CE_WARN, ! "clnt_rdma_kcallit: AUTH_WRAP failed"); goto done; } - /* - * If we had to allocate a new buffer while encoding - * then update the addr and len. - */ - if (longmsg.addr != xdrs->x_base) { - longmsg.addr = xdrs->x_base; - longmsg.len = xdr_getbufsize(xdrs); - } ! /* ! * If it so happens that the encoded message is after all ! * not long enough to be a Long RPC then allocate a ! * SEND_BUFFER and copy the encoded message into it. 
*/ - p->cku_outsz = XDR_GETPOS(xdrs); - if (p->cku_outsz > RPC_MSG_SZ) { - rpcmsg.type = CHUNK_BUFFER; - rpcmsg.addr = longmsg.addr; - rpcmsg.len = longmsg.len; - } else { - clist_free(cle); - XDR_DESTROY(cxdrp); - cxdrp = NULL; - /* - * Get a pre-allocated buffer for rpc call - */ - rpcmsg.type = SEND_BUFFER; - if (RDMA_BUF_ALLOC(conn, &rpcmsg)) { - p->cku_err.re_status = RPC_CANTSEND; - p->cku_err.re_errno = EIO; - RCSTAT_INCR(rcnomem); - cmn_err(CE_WARN, - "clnt_rdma_kcallit: no buffers!"); - rdma_buf_free(conn, &longmsg); - goto done; - } - bcopy(longmsg.addr, rpcmsg.addr, p->cku_outsz); - xdrrdma_create(xdrs, rpcmsg.addr, p->cku_outsz, 0, - NULL, XDR_ENCODE, NULL); - cxdrp = xdrs; - rdma_buf_free(conn, &longmsg); - op = RDMA_MSG; - } - } cl = xdrrdma_clist(xdrs); /* * Update the chunk size information for the Long RPC msg. --- 703,789 ---- * buffer is allocated and registered for the Long * RPC call. */ xdrs = &callxdr; msglen = CKU_HDRSIZE + BYTES_PER_XDR_UNIT; + if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) { msglen += xdrrdma_authsize(h->cl_auth, p->cku_cred, rdma_minchunk); msglen += xdrrdma_sizeof(xdr_args, argsp, rdma_minchunk); ! if (msglen > RPC_MSG_SZ) ! read_type = RPCCALL_RCHUNK; ! else ! read_type = RPCCALL_NORCHUNK; } else { /* * For RPCSEC_GSS since we cannot accurately presize the * buffer required for encoding, we assume that its going * to be a Long RPC to start with. We also create the * the XDR stream with min_chunk set to 0 which instructs * the XDR layer to not chunk the incoming byte stream. */ msglen += 2 * MAX_AUTH_BYTES + 2 * sizeof (struct opaque_auth); ! msglen += xdrrdma_sizeof(xdr_args, argsp, rdma_minchunk); ! if (msglen > RPC_MSG_SZ) ! read_type = RPCCALL_RCHUNK; ! else ! read_type = RPCCALL_NORCHUNK; } ! if (read_type == RPCCALL_NORCHUNK) { ! rpcmsg.type = SEND_BUFFER; ! if (RDMA_BUF_ALLOC(conn, &rpcmsg)) { ! cmn_err(CE_WARN, "clnt_rdma_kcallit: no buffers!"); goto done; } } else { ! #ifdef SERVER_REG_CACHE ! 
rib_lrc_entry_t *long_reply_buf = NULL; ! #endif ! rpcmsg.type = CHUNK_BUFFER; ! #ifdef SERVER_REG_CACHE ! long_reply_buf = RDMA_GET_SERVER_CACHE_BUF(conn, msglen); ! rpcmsg.addr = long_reply_buf->lrc_buf; ! #else ! rpcmsg.addr = kmem_zalloc(msglen, KM_SLEEP); ! #endif ! cle = (struct clist *)kmem_zalloc(sizeof (struct clist), ! KM_SLEEP); ! cle->c_xdroff = 0; ! cle->c_len = rpcmsg.len = msglen; ! cle->c_saddr = (uint64)(uintptr_t)rpcmsg.addr; ! cle->c_next = NULL; ! #ifdef SERVER_REG_CACHE ! cle->long_reply_buf = (uint64)long_reply_buf; ! #endif } ! ! op = cle ? RDMA_NOMSG : RDMA_MSG; ! cxdrp = xdrs; ! xdrrdma_create(xdrs, rpcmsg.addr, (cle ? msglen : rpcmsg.len), ! rdma_minchunk, cle, XDR_ENCODE, NULL); ! ! status = clnt_compose_rpcmsg(h, procnum, &rpcmsg, xdrs, xdr_args, argsp); ! if (status != CLNT_RDMA_SUCCESS) { ! rdma_buf_free(conn, &rpcmsg); clist_free(cle); p->cku_err.re_status = RPC_CANTENCODEARGS; p->cku_err.re_errno = EIO; cmn_err(CE_WARN, ! "clnt_rdma_kcallit: clnt_compose_rpcmsg failed"); goto done; } ! /* Read chunklist (a linked list of N elements, ! * position P (same P for all chunks of same arg!): ! * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 */ cl = xdrrdma_clist(xdrs); /* * Update the chunk size information for the Long RPC msg.
*** 545,608 **** */ if (cl && op == RDMA_NOMSG) cl->c_len = p->cku_outsz; /* ! * Set up the RDMA chunk message */ ! vers = RPCRDMA_VERS; ! clmsg.type = SEND_BUFFER; ! if (RDMA_BUF_ALLOC(conn, &clmsg)) { p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; rdma_buf_free(conn, &rpcmsg); RCSTAT_INCR(rcnomem); cmn_err(CE_WARN, "clnt_rdma_kcallit: no free buffers!!"); goto done; } - xdrs = &p->cku_outxdr; - xdrmem_create(xdrs, clmsg.addr, clmsg.len, XDR_ENCODE); - /* - * Treat xid as opaque (xid is the first entity - * in the rpc rdma message). - */ - (*(uint32_t *)clmsg.addr) = p->cku_xid; - /* Skip xid and set the xdr position accordingly. */ - XDR_SETPOS(xdrs, sizeof (uint32_t)); - (void) xdr_u_int(xdrs, &vers); - (void) xdr_u_int(xdrs, &op); ! /* ! * Now XDR the chunk list ! */ ! if (cl != NULL) { ! /* ! * Register the chunks in the list */ ! status = clist_register(conn, cl, 1); ! if (status != RDMA_SUCCESS) { ! cmn_err(CE_WARN, ! "clnt_rdma_kcallit: clist register failed"); rdma_buf_free(conn, &clmsg); rdma_buf_free(conn, &rpcmsg); clist_free(cl); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; goto done; } } - (void) xdr_do_clist(xdrs, &cl); /* * Start with the RDMA header and clist (if any) */ sendlist = NULL; clist_add(&sendlist, 0, XDR_GETPOS(xdrs), &clmsg.handle, clmsg.addr, NULL, NULL); - /* * Put the RPC call message in the send list if small RPC */ if (op == RDMA_MSG) { clist_add(&sendlist, 0, p->cku_outsz, &rpcmsg.handle, --- 790,859 ---- */ if (cl && op == RDMA_NOMSG) cl->c_len = p->cku_outsz; /* ! * Prepare the header for the RDMA chunk */ ! status = clnt_compose_rdma_header(conn, h, &clmsg, &xdrs, &op); ! if (status != CLNT_RDMA_SUCCESS) { p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; rdma_buf_free(conn, &rpcmsg); + clist_free(cle); RCSTAT_INCR(rcnomem); cmn_err(CE_WARN, "clnt_rdma_kcallit: no free buffers!!"); goto done; } ! status = clnt_setup_rlist(conn, xdrs, &cl); ! if (status != CLNT_RDMA_SUCCESS) { ! 
cmn_err(CE_WARN, "clnt_rdma_kcallit: clist register failed"); ! rdma_buf_free(conn, &clmsg); ! rdma_buf_free(conn, &rpcmsg); ! clist_free(cl); ! p->cku_err.re_status = RPC_CANTSEND; ! p->cku_err.re_errno = EIO; ! goto done; ! } ! /* Setup write chunk list for NFS3 READ operation ! * Other operations will have a NULL wlist */ ! status = clnt_setup_wlist(conn, procnum, &rpccall_wlist, ! resultsp, xdr_results, xdrs); ! if (status != CLNT_RDMA_SUCCESS) { rdma_buf_free(conn, &clmsg); rdma_buf_free(conn, &rpcmsg); clist_free(cl); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; goto done; } + status = clnt_setup_long_reply(conn, procnum, &long_reply_buf_clist, + xdrs, &long_reply_buf_exists); + if (status != CLNT_RDMA_SUCCESS) { + rdma_buf_free(conn, &clmsg); + rdma_buf_free(conn, &rpcmsg); + clist_free(cl); + p->cku_err.re_status = RPC_CANTSEND; + p->cku_err.re_errno = EIO; + goto done; } /* + * XDR encode the RDMA_REPLY write chunk + */ + seg_array_len = (long_reply_buf_exists ? 1:0); + (void) xdr_encode_reply_wchunk(xdrs, &long_reply_buf_clist, seg_array_len); + /* * Start with the RDMA header and clist (if any) */ sendlist = NULL; clist_add(&sendlist, 0, XDR_GETPOS(xdrs), &clmsg.handle, clmsg.addr, NULL, NULL); /* * Put the RPC call message in the send list if small RPC */ if (op == RDMA_MSG) { clist_add(&sendlist, 0, p->cku_outsz, &rpcmsg.handle,
*** 646,656 **** --- 897,911 ---- } /* * Send the call message to the server */ + #if defined (CLNT_INTERRUPT_COAL) + status = RDMA_SEND_BL(conn, sendlist, p->cku_xid); + #else status = RDMA_SEND(conn, sendlist, p->cku_xid); + #endif if (status != RDMA_SUCCESS) { if (cl) { (void) clist_deregister(conn, cl, 1); clist_free(cl); /*
*** 672,691 **** clmsg.addr = NULL; if (rpcmsg.type == SEND_BUFFER) rpcmsg.addr = NULL; } clist_free(sendlist); - #ifdef DEBUG - if (rdma_clnt_debug) { - printf("clnt_rdma_kcallit: send request xid %u\n", p->cku_xid); - } - #endif /* * Recv rpc reply */ status = RDMA_RECV(conn, &recvlist, p->cku_xid); /* * Deregister chunks sent. Do this only after the reply * is received as that is a sure indication that the * remote end has completed RDMA of the chunks. --- 927,942 ---- clmsg.addr = NULL; if (rpcmsg.type == SEND_BUFFER) rpcmsg.addr = NULL; } clist_free(sendlist); /* * Recv rpc reply */ status = RDMA_RECV(conn, &recvlist, p->cku_xid); + clnt_return_credit(conn); /* * Deregister chunks sent. Do this only after the reply * is received as that is a sure indication that the * remote end has completed RDMA of the chunks.
*** 704,719 **** /* * Now check recv status */ if (status != 0) { - #ifdef DEBUG - if (rdma_clnt_debug) - cmn_err(CE_NOTE, - "clnt_rdma_kcallit: reply failed %u status %d", - p->cku_xid, status); - #endif if (status == RDMA_INTR) { p->cku_err.re_status = RPC_INTR; p->cku_err.re_errno = EINTR; RCSTAT_INCR(rcintrs); } else if (status == RPC_TIMEDOUT) { --- 955,964 ----
*** 724,737 **** p->cku_err.re_status = RPC_CANTRECV; p->cku_err.re_errno = EIO; } goto done; } - #ifdef DEBUG - if (rdma_clnt_debug) - printf("clnt_rdma_kcallit: got response xid %u\n", p->cku_xid); - #endif /* * Process the reply message. * * First the chunk list (if any) */ --- 969,978 ----
*** 744,835 **** */ xid = *(uint32_t *)(uintptr_t)recvlist->c_saddr; /* Skip xid and set the xdr position accordingly. */ XDR_SETPOS(xdrs, sizeof (uint32_t)); (void) xdr_u_int(xdrs, &vers); (void) xdr_u_int(xdrs, &op); (void) xdr_do_clist(xdrs, &cl); ! off = xdr_getpos(xdrs); ! /* ! * Now the RPC reply message itself. If the reply ! * came as a chunk item, then RDMA the reply over. ! */ ! xdrs = &replxdr; ! if (cl && op == RDMA_NOMSG) { ! struct clist *cle = cl; ! rpcreply.type = CHUNK_BUFFER; ! rpcreply.addr = kmem_alloc(cle->c_len, KM_SLEEP); ! rpcreply.len = cle->c_len; ! cle->c_daddr = (uint64)(uintptr_t)rpcreply.addr; ! cl = cl->c_next; ! cle->c_next = NULL; ! /* ! * Register the rpc reply chunk destination ! */ ! status = clist_register(conn, cle, 0); ! if (status) { ! rdma_buf_free(conn, &rpcreply); ! clist_free(cle); ! p->cku_err.re_status = RPC_CANTDECODERES; ! p->cku_err.re_errno = EIO; ! cmn_err(CE_WARN, ! "clnt_rdma_kcallit: clist_register failed"); ! goto rdma_done; } - - /* - * Now read rpc reply in - */ - #ifdef DEBUG - if (rdma_clnt_debug) - printf("clnt_rdma_kcallit: read chunk, len %d, xid %u, \ - reply xid %u\n", cle->c_len, p->cku_xid, xid); - #endif - status = RDMA_READ(conn, cle, WAIT); - if (status) { - (void) clist_deregister(conn, cle, 0); - rdma_buf_free(conn, &rpcreply); - clist_free(cle); - p->cku_err.re_status = RPC_CANTDECODERES; - p->cku_err.re_errno = EIO; - cmn_err(CE_WARN, - "clnt_rdma_kcallit: RDMA_READ failed"); - goto rdma_done; } ! ! /* ! * sync the memory for dma ! */ ! status = clist_syncmem(conn, cle, 0); ! if (status != RDMA_SUCCESS) { ! (void) clist_deregister(conn, cle, 0); ! rdma_buf_free(conn, &rpcreply); ! clist_free(cle); ! p->cku_err.re_status = RPC_CANTDECODERES; ! p->cku_err.re_errno = EIO; ! goto rdma_done; } /* ! * Deregister the Long RPC chunk */ ! (void) clist_deregister(conn, cle, 0); ! clist_free(cle); ! xdrrdma_create(xdrs, rpcreply.addr, rpcreply.len, 0, cl, ! XDR_DECODE, conn); ! rxdrp = xdrs; ! } else { ! 
rpcreply.addr = NULL; ! xdrrdma_create(xdrs, ! (caddr_t)(uintptr_t)(recvlist->c_saddr + off), ! recvlist->c_len - off, 0, cl, XDR_DECODE, conn); ! rxdrp = xdrs; ! } reply_msg.rm_direction = REPLY; reply_msg.rm_reply.rp_stat = MSG_ACCEPTED; reply_msg.acpted_rply.ar_stat = SUCCESS; reply_msg.acpted_rply.ar_verf = _null_auth; /* --- 985,1058 ---- */ xid = *(uint32_t *)(uintptr_t)recvlist->c_saddr; /* Skip xid and set the xdr position accordingly. */ XDR_SETPOS(xdrs, sizeof (uint32_t)); (void) xdr_u_int(xdrs, &vers); + (void) xdr_u_int(xdrs, &rdma_credit); (void) xdr_u_int(xdrs, &op); (void) xdr_do_clist(xdrs, &cl); ! clnt_update_credit(conn, rdma_credit); ! wlist_exists_reply = FALSE; ! if (! xdr_decode_wlist(xdrs, &rpcreply_wlist, &wlist_exists_reply)) { ! cmn_err(CE_NOTE, ! "clnt_rdma_kcallit: xdr_decode_wlist failed"); ! /* XXX: what should we fail with here -- EIO? */ ! } ! #ifdef RPC_RDMA_INLINE ! if (xdr_results == x_READ3vres) { ! ((READ3vres *)resultsp)->wlist = NULL; ! } else if (xdr_results == x_READ3uiores) { ! ((READ3uiores *)resultsp)->wlist = NULL; ! } ! #endif ! if (procnum == NFSPROC3_READ) { ! check_dereg_wlist(conn, rpccall_wlist); ! if (wlist_exists_reply) { ! if (xdr_results == x_READ3vres) { ! ((READ3vres *)resultsp)->wlist = ! rpcreply_wlist; ! ((READ3vres *)resultsp)->wlist_len = ! rpcreply_wlist->c_len; ! } else if (xdr_results == x_READ3uiores) { ! ((READ3uiores *)resultsp)->wlist = ! rpcreply_wlist; ! ((READ3uiores *)resultsp)->wlist_len = ! rpcreply_wlist->c_len; ! } else { ! cmn_err(CE_NOTE, ! "unknown READ3 xdr decode fnp=%p", ! (void *)xdr_results); } } ! } else { ! if(wlist_exists_reply) ! cmn_err(CE_NOTE, ! "clnt_rdma_kcallit: received wlist for " ! "non-READ3 call. reply xdr decode fnp=%p", ! (void *)xdr_results); } /* ! * The server shouldn't have sent a RDMA_SEND that ! * the client needs to RDMA_WRITE a reply back to ! * the server. So silently ignoring what the ! * server returns in the rdma_reply section of the ! * header. */ ! 
(void) xdr_decode_reply_wchunk(xdrs, &rdma_reply,conn); ! off = xdr_getpos(xdrs); + xdrs = &replxdr; + if (clnt_decode_long_reply(conn, procnum, &long_reply_buf_clist, + rdma_reply, xdrs, &rxdrp, + cl, recvlist, op, off) != CLNT_RDMA_SUCCESS) + { + goto done; + } reply_msg.rm_direction = REPLY; reply_msg.rm_reply.rp_stat = MSG_ACCEPTED; reply_msg.acpted_rply.ar_stat = SUCCESS; reply_msg.acpted_rply.ar_verf = _null_auth; /*
*** 910,1008 **** } /* * If rpc reply is in a chunk, free it now. */ ! if (rpcreply.addr != NULL) ! rdma_buf_free(conn, &rpcreply); ! ! rdma_done: ! if ((cl != NULL) || (op == RDMA_NOMSG)) { ! rdma_buf_t donemsg; ! ! /* ! * Free the list holding the chunk info ! */ ! if (cl) { ! clist_free(cl); ! cl = NULL; ! } ! ! /* ! * Tell the server that the reads are done ! */ ! donemsg.type = SEND_BUFFER; ! if (RDMA_BUF_ALLOC(conn, &donemsg)) { ! p->cku_err.re_status = RPC_CANTSEND; ! p->cku_err.re_errno = EIO; ! RCSTAT_INCR(rcnomem); ! cmn_err(CE_WARN, "clnt_rdma_kcallit: no free buffer"); ! goto done; ! } ! xdrs = &p->cku_outxdr; ! xdrmem_create(xdrs, donemsg.addr, donemsg.len, XDR_ENCODE); ! vers = RPCRDMA_VERS; ! op = RDMA_DONE; ! ! /* ! * Treat xid as opaque (xid is the first entity ! * in the rpc rdma message). ! */ ! (*(uint32_t *)donemsg.addr) = p->cku_xid; ! /* Skip xid and set the xdr position accordingly. */ ! XDR_SETPOS(xdrs, sizeof (uint32_t)); ! if (!xdr_u_int(xdrs, &vers) || ! !xdr_u_int(xdrs, &op)) { ! cmn_err(CE_WARN, ! "clnt_rdma_kcallit: xdr_u_int failed"); ! rdma_buf_free(conn, &donemsg); ! goto done; ! } ! ! sendlist = NULL; ! clist_add(&sendlist, 0, XDR_GETPOS(xdrs), &donemsg.handle, ! donemsg.addr, NULL, NULL); ! ! status = RDMA_SEND(conn, sendlist, p->cku_xid); ! if (status != RDMA_SUCCESS) { ! cmn_err(CE_WARN, ! "clnt_rdma_kcallit: RDMA_SEND failed xid %u", ! p->cku_xid); ! } ! #ifdef DEBUG ! else { ! if (rdma_clnt_debug) ! printf("clnt_rdma_kcallit: sent RDMA_DONE xid %u\n", ! p->cku_xid); ! } #endif - clist_free(sendlist); } - - done: if (cxdrp) XDR_DESTROY(cxdrp); if (rxdrp) { (void) xdr_rpc_free_verifier(rxdrp, &reply_msg); XDR_DESTROY(rxdrp); } if (recvlist) { ! rdma_buf_t recvmsg; ! 
recvmsg.addr = (caddr_t)(uintptr_t)recvlist->c_saddr; recvmsg.type = RECV_BUFFER; RDMA_BUF_FREE(conn, &recvmsg); clist_free(recvlist); } RDMA_REL_CONN(conn); if (p->cku_err.re_status != RPC_SUCCESS) { RCSTAT_INCR(rcbadcalls); } return (p->cku_err.re_status); } /* ARGSUSED */ static void clnt_rdma_kabort(CLIENT *h) { } --- 1133,1275 ---- } /* * If rpc reply is in a chunk, free it now. */ ! done: ! if (long_reply_buf_exists){ ! (void) clist_deregister(conn, &long_reply_buf_clist, 0); ! #ifndef SERVER_REG_CACHE ! kmem_free((void *)long_reply_buf_clist.c_daddr, ! (size_t)long_reply_buf_clist.c_len); ! #else ! RDMA_FREE_SERVER_CACHE_BUF(conn, (rib_lrc_entry_t *)long_reply_buf_clist.long_reply_buf); #endif } if (cxdrp) XDR_DESTROY(cxdrp); if (rxdrp) { (void) xdr_rpc_free_verifier(rxdrp, &reply_msg); XDR_DESTROY(rxdrp); } if (recvlist) { ! rdma_buf_t recvmsg = {0}; recvmsg.addr = (caddr_t)(uintptr_t)recvlist->c_saddr; recvmsg.type = RECV_BUFFER; RDMA_BUF_FREE(conn, &recvmsg); clist_free(recvlist); } + #if (!defined(ASYNC_CLIENT_DEREG)) + if(rpccall_wlist){ + kmem_free(rpccall_wlist, sizeof(clist)); + } + #endif + RDMA_REL_CONN(conn); if (p->cku_err.re_status != RPC_SUCCESS) { RCSTAT_INCR(rcbadcalls); } return (p->cku_err.re_status); }
+
+ /*
+  * clnt_decode_long_reply()
+  *
+  * Set up the XDR stream (xdrs) from which the RPC reply is decoded and
+  * hand it back through *rxdrp:
+  *
+  *  - op is RDMA_NOMSG and a long reply buffer was posted
+  *    (long_reply_buf_clist->c_daddr set): for READDIR, READDIRPLUS and
+  *    READLINK decode from that buffer, using the length reported in the
+  *    reply chunk (rdma_reply->c_len).  For any other procedure a note
+  *    is logged and *rxdrp is set to NULL.
+  *  - op is RDMA_NOMSG with a read list (cl) but no long reply buffer:
+  *    a note is logged and *rxdrp is left untouched.
+  *  - otherwise: decode inline from the receive buffer, starting "off"
+  *    bytes in, with cl as the chunk list.
+  *
+  * Returns CLNT_RDMA_FAIL when a long reply is indicated but no reply
+  * chunk was decoded (rdma_reply is NULL), since the reply length is
+  * then unknown; CLNT_RDMA_SUCCESS in every other case.
+  *
+  * NOTE(review): the two logged cases still return CLNT_RDMA_SUCCESS
+  * without producing a decode stream -- confirm the caller copes.
+  */
+ static int clnt_decode_long_reply(CONN *conn, rpcproc_t procnum,
+     struct clist *long_reply_buf_clist,
+     struct clist *rdma_reply, XDR *xdrs,
+     XDR **rxdrp, struct clist *cl,
+     struct clist *recvlist,
+     uint_t op,uint_t off)
+ {
+     if ( RDMA_NOMSG == op && long_reply_buf_clist->c_daddr) {
+         if (procnum == NFSPROC3_READDIR ||
+             procnum == NFSPROC3_READDIRPLUS ||
+             procnum == NFSPROC3_READLINK) {
+             /* The reply length comes from the reply chunk. */
+             if (rdma_reply == NULL) {
+                 cmn_err(CE_NOTE, "clnt_rdma_kcallit: "
+                     "long reply without a reply chunk");
+                 *rxdrp = NULL;
+                 return CLNT_RDMA_FAIL;
+             }
+             xdrmem_destroy(xdrs);
+             xdrrdma_create(xdrs,
+                 (caddr_t)long_reply_buf_clist->c_daddr,
+                 rdma_reply->c_len,
+                 0,
+                 NULL,
+                 XDR_DECODE,
+                 conn);
+
+             *rxdrp = xdrs;
+         } else {
+             cmn_err(CE_NOTE, "clnt_rdma_kcallit: "
+                 "wchunk buffer for wrong nfs proc");
+             xdrmem_destroy(xdrs);
+             *rxdrp = NULL;
+         }
+     } else if (cl && RDMA_NOMSG == op) {
+         cmn_err(CE_NOTE, "clnt_rdma_kcallit: "
+             "Server sent a READ list in the RPC Reply");
+         xdrmem_destroy(xdrs);
+     } else {
+         xdrmem_destroy(xdrs);
+         xdrrdma_create(xdrs,
+             (caddr_t)(uintptr_t)(recvlist->c_saddr + off),
+             recvlist->c_len - off, 0, cl, XDR_DECODE, conn);
+         *rxdrp = xdrs;
+     }
+     return CLNT_RDMA_SUCCESS;
+ }
+
+ #ifdef DYNAMIC_CREDIT_CONTROL
+ /*
+  * clnt_compute_credit()
+  *
+  * Under conn->c_lock: when the gap between granted and in-flight
+  * operations is below CLNT_CREDIT_LOW, raise the credit request to
+  * rdma_bufs_rqst plus half of the in-flight count.  Otherwise
+  * *rdma_credit is left as the caller set it.
+  */
+ static void clnt_compute_credit(CONN *conn, uint32_t *rdma_credit)
+ {
+     rdma_clnt_cred_ctrl_t *cc_info = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
+
+     mutex_enter(&conn->c_lock);
+     if(cc_info->clnt_cc_granted_ops - cc_info->clnt_cc_in_flight_ops < CLNT_CREDIT_LOW)
+         *rdma_credit = rdma_bufs_rqst + cc_info->clnt_cc_in_flight_ops / 2;
+     mutex_exit(&conn->c_lock);
+ }
+ #endif
+
+ /*
+  * clnt_return_credit()
+  *
+  * Counterpart of clnt_check_credit(): drop the in-flight count by one
+  * and signal clnt_cc_cv, all under conn->c_lock.
+  */
+ static void clnt_return_credit(CONN *conn)
+ {
+     rdma_clnt_cred_ctrl_t *cc_info = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
+
+     mutex_enter(&conn->c_lock);
+     cc_info->clnt_cc_in_flight_ops--;
+     cv_signal(&cc_info->clnt_cc_cv);
+     mutex_exit(&conn->c_lock);
+ }
+
+ /*
+  * clnt_update_credit()
+  *
+  * Record rdma_credit as the granted operation count for this
+  * connection, under conn->c_lock.
+  */
+ static void clnt_update_credit(CONN *conn, uint32_t rdma_credit)
+ {
+     rdma_clnt_cred_ctrl_t *cc_info = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
+
+     /*
+      * Get the granted number of buffers for credit control.
+      */
+     mutex_enter(&conn->c_lock);
+     cc_info->clnt_cc_granted_ops = rdma_credit;
+     mutex_exit(&conn->c_lock);
+ }
+
+ /*
+  * clnt_check_credit()
+  *
+  * Account for one more in-flight operation.  While a grant has been
+  * recorded (clnt_cc_granted_ops != 0) and the in-flight count has
+  * reached it, wait on clnt_cc_cv; the in-flight count is incremented
+  * once the loop exits.  A grant of 0 never blocks.
+  */
+ static void clnt_check_credit(CONN *conn)
+ {
+     rdma_clnt_cred_ctrl_t *cc_info = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
+
+     /*
+      * Make sure we are not going over our allowed buffer use
+      * (and make sure we have gotten a granted value before).
+      */
+     mutex_enter(&conn->c_lock);
+     while (cc_info->clnt_cc_in_flight_ops >= cc_info->clnt_cc_granted_ops
+         && cc_info->clnt_cc_granted_ops != 0) {
+         /*
+          * Client has maxed out its granted buffers due to
+          * credit control. Current handling is to block and wait.
+          */
+         cv_wait(&cc_info->clnt_cc_cv, &conn->c_lock);
+     }
+     cc_info->clnt_cc_in_flight_ops++;
+     mutex_exit(&conn->c_lock);
+ }
+
  /* ARGSUSED */ static void clnt_rdma_kabort(CLIENT *h) { }
*** 1051,1060 **** --- 1318,1331 ---- struct knetconfig *knc; char *pf, *p; rdma_stat status; int error = 0; + mutex_enter(&rdma_modload_lock); + error = rdma_modload(); + mutex_exit(&rdma_modload_lock); + if (!INGLOBALZONE(curproc)) return (-1); /* * modload the RDMA plugins if not already done. */
*** 1100,1105 **** --- 1371,1400 ---- } rp = rp->r_next; } rw_exit(&rdma_lock); return (-1); + }
+
+ /*
+  * check_dereg_wlist()
+  *
+  * Release the registration of a write chunk list entry once the reply
+  * has arrived.  Nothing is done for a NULL list, or for an entry with
+  * no memory region handle (c_dmemhandle.mrc_rmr) or zero length.
+  *
+  * With ASYNC_CLIENT_DEREG the entry is queued via INSERT_QUEUE();
+  * otherwise it is deregistered synchronously with clist_deregister()
+  * and a failure is only logged.
+  */
+ static void
+ check_dereg_wlist(CONN *conn, clist *rwc)
+ {
+     if (rwc == NULL)
+         return;
+
+     if (rwc->c_dmemhandle.mrc_rmr && rwc->c_len) {
+ #if defined(ASYNC_CLIENT_DEREG)
+         /* Add in an entry to rqueue */
+         INSERT_QUEUE(conn, rwc);
+ #else
+         int status;
+
+         status = clist_deregister(conn, rwc, FALSE);
+         if (status != RDMA_SUCCESS) {
+             cmn_err(CE_NOTE, "dereg_wlist failed."
+                 "status=%d", status);
+         }
+ #endif
+     }
  }