Udiff clnt_rdma.c
--- /webrev/webrev/usr/src/uts/common/rpc/clnt_rdma.c- Mon Aug 14 13:12:10 2006
+++ clnt_rdma.c Thu Aug 10 14:22:04 2006
@@ -29,10 +29,23 @@
* Portions of this source code were derived from Berkeley
* 4.3 BSD under license from the Regents of the University of
* California.
*/
+ /* Copyright (c) 2006, The Ohio State University. All rights reserved.
+ *
+ * Portions of this source code is developed by the team members of
+ * The Ohio State University's Network-Based Computing Laboratory (NBCL),
+ * headed by Professor Dhabaleswar K. (DK) Panda.
+ *
+ * Acknowledgements to contributions from developers:
+ * Ranjit Noronha: noronha@cse.ohio-state.edu
+ * Lei Chai : chail@cse.ohio-state.edu
+ * Weikuan Yu : yuw@cse.ohio-state.edu
+ *
+ */
+
#pragma ident "@(#)clnt_rdma.c 1.10 05/07/26 SMI"
#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
@@ -54,12 +67,55 @@
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>
#include <rpc/rpc_msg.h>
#include <rpc/rpc_rdma.h>
+#include <nfs/nfs.h>
+/*
+ * Low-water mark for unused credits: clnt_compute_credit() asks for more
+ * credits once (granted - in flight) drops below this value.
+ */
+#define CLNT_CREDIT_LOW (5)
+/*
+ * XDR routine handles for the NFSv3 READ argument/result layouts. They start
+ * out as NULL_xdrproc_t; x_READ3vres and x_READ3uiores are compared against
+ * the caller's xdr_results to decide how the results buffer is laid out.
+ * NOTE(review): presumably assigned elsewhere (by the NFS code?) -- confirm.
+ */
+xdrproc_t x_READ3args = NULL_xdrproc_t;
+xdrproc_t x_READ3res = NULL_xdrproc_t;
+xdrproc_t x_READ3vres = NULL_xdrproc_t;
+xdrproc_t x_READ3uiores = NULL_xdrproc_t;
+
+/* Default credit request value encoded into each RPC/RDMA call header. */
+static uint32_t rdma_bufs_rqst = RDMA_BUFS_RQST;
+
+/*
+ * Debug switches, all off by default. None of them is referenced in the
+ * part of this file visible in this change -- TODO confirm their users.
+ */
+int rdma_wlist_verbose_debug = 0;
+int rdma_wlist_memreg_debug = 0;
+int rdma_wlist_clnt_debug = 0;
+int rdma_wlist_svc_debug = 0;
+int rdma_wlist_xdr_debug = 0;
+int rdma_wlist_pglck_debug = 0;
+int credit_control_debug = 0;
+int rdma_long_reply_debug = 0;
+int rdma_xdr_long_reply_debug = 0;
+
+/* All-zero chunk list entry, copied over a new entry to reset it. */
+struct clist empty_cl = {0};
+
+/* Forward declarations of the helpers used by clnt_rdma_kcallit(). */
+static void clnt_read3args_make_wlist(caddr_t, struct clist **, xdrproc_t, uint_t *);
+static int clnt_compose_rpcmsg(CLIENT *, rpcproc_t, rdma_buf_t *,
+ XDR *, xdrproc_t, caddr_t);
+static int clnt_compose_rdma_header(CONN *, CLIENT *, rdma_buf_t *,
+ XDR **, uint_t *);
+static int clnt_setup_rlist(CONN *, XDR *, struct clist **);
+static int clnt_setup_wlist(CONN *, rpcproc_t, struct clist **,
+ caddr_t, xdrproc_t, XDR *);
+static int clnt_setup_long_reply(CONN *, rpcproc_t, struct clist *,
+ XDR *, bool_t *);
+/* Only built when dynamic credit control is compiled in. */
+#ifdef DYNAMIC_CREDIT_CONTROL
+static void clnt_compute_credit(CONN *, uint32_t *);
+#endif
+static void clnt_check_credit(CONN *);
+static void clnt_return_credit(CONN *);
+static int clnt_decode_long_reply(CONN *, rpcproc_t, struct clist *,
+ struct clist *, XDR *, XDR **, struct clist *,
+ struct clist *, uint_t,uint_t);
+
+static void clnt_update_credit(CONN *,uint32_t);
+static void check_dereg_wlist(CONN *, struct clist *);
+
static enum clnt_stat clnt_rdma_kcallit(CLIENT *, rpcproc_t, xdrproc_t,
caddr_t, xdrproc_t, caddr_t, struct timeval);
static void clnt_rdma_kabort(CLIENT *);
static void clnt_rdma_kerror(CLIENT *, struct rpc_err *);
static bool_t clnt_rdma_kfreeres(CLIENT *, xdrproc_t, caddr_t);
@@ -83,10 +139,12 @@
/*
* The size of the preserialized RPC header information.
*/
#define CKU_HDRSIZE 20
+/*
+ * Status codes returned by the static clnt_* helpers in this file.
+ * The negative value is parenthesized so the macro expands safely inside
+ * any expression (e.g. "x - CLNT_RDMA_FAIL").
+ */
+#define CLNT_RDMA_SUCCESS 0
+#define CLNT_RDMA_FAIL (-99)
/*
* Per RPC RDMA endpoint details
*/
typedef struct cku_private {
@@ -283,10 +341,270 @@
p->cku_addr.len = raddr->len;
bcopy(raddr->buf, p->cku_addr.buf, raddr->len);
h->cl_ops = &rdma_clnt_ops;
}
+/*
+ * clnt_compose_rpcmsg: serialize the RPC call message into rpcmsg via xdrs.
+ *
+ * For flavors other than RPCSEC_GSS the CKU_HDRSIZE-byte preserialized
+ * header is copied into rpcmsg, the xid is stamped over its first word and
+ * the procedure number, credentials and arguments are encoded after it.
+ * For RPCSEC_GSS the procedure number is stored right after the header in
+ * p->cku_rpchdr, the header is handed to AUTH_WRAP and the arguments are
+ * then encoded with xdr_args; rpcmsg is re-pointed at the stream's buffer
+ * if that buffer changed during encoding.
+ *
+ * On success the encoded size is saved in p->cku_outsz.
+ * Returns CLNT_RDMA_SUCCESS, or CLNT_RDMA_FAIL if any encode step fails.
+ */
+static int clnt_compose_rpcmsg(CLIENT *h, rpcproc_t procnum,
+ rdma_buf_t *rpcmsg, XDR *xdrs,
+ xdrproc_t xdr_args, caddr_t argsp)
+{
+ cku_private_t *p = htop(h);
+
+ if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
+ /*
+ * Copy in the preserialized RPC header
+ * information.
+ */
+ bcopy(p->cku_rpchdr, rpcmsg->addr, CKU_HDRSIZE);
+
+ /*
+ * transaction id is the 1st thing in the output
+ * buffer.
+ */
+ /* LINTED pointer alignment */
+ (*(uint32_t *)(rpcmsg->addr)) = p->cku_xid;
+
+ /* Skip the preserialized stuff. */
+ XDR_SETPOS(xdrs, CKU_HDRSIZE);
+
+ /* Serialize dynamic stuff into the output buffer. */
+ if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
+ (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
+ (!(*xdr_args)(xdrs, argsp))) {
+ cmn_err(CE_WARN,"Failed to serialize dynamic arguments\n");
+ return CLNT_RDMA_FAIL;
+ }
+ p->cku_outsz = XDR_GETPOS(xdrs);
+ } else {
+ /* Procedure number goes right behind the preserialized header. */
+ uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[CKU_HDRSIZE];
+ IXDR_PUT_U_INT32(uproc, procnum);
+ /* The xid is the first word of the header. */
+ (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;
+ XDR_SETPOS(xdrs, 0);
+
+ /* Serialize the procedure number and the arguments. */
+ /*
+ * NOTE(review): the code this replaces passed xdr_args/argsp to
+ * AUTH_WRAP; here AUTH_WRAP gets NULL for both and the arguments
+ * are encoded by a separate xdr_args call. Confirm that AUTH_WRAP
+ * accepts a NULL routine and that the arguments still receive the
+ * intended RPCSEC_GSS protection.
+ */
+ if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr,
+ CKU_HDRSIZE+4, xdrs, NULL, NULL) ||
+ !(*xdr_args)(xdrs, argsp)) {
+ /*
+ * Keep rpcmsg in step with the stream's buffer even on
+ * failure, since the caller frees rpcmsg.
+ */
+ if (rpcmsg->addr != xdrs->x_base) {
+ rpcmsg->addr = xdrs->x_base;
+ rpcmsg->len = xdr_getbufsize(xdrs);
+ }
+ cmn_err(CE_WARN,"Failed to serialize procedure number and the arguments.\n");
+ return CLNT_RDMA_FAIL;
+ }
+ /*
+ * If we had to allocate a new buffer while encoding
+ * then update the addr and len.
+ */
+ if (rpcmsg->addr != xdrs->x_base) {
+ rpcmsg->addr = xdrs->x_base;
+ rpcmsg->len = xdr_getbufsize(xdrs);
+ }
+
+ p->cku_outsz = XDR_GETPOS(xdrs);
+ }
+
+ return CLNT_RDMA_SUCCESS;
+}
+
+/*
+ * clnt_compose_rdma_header: allocate a send buffer for the RPC/RDMA header
+ * and encode its fixed part into it.
+ *
+ * clmsg is typed SEND_BUFFER and filled in by RDMA_BUF_ALLOC. *xdrs is set
+ * to the handle's cku_outxdr stream, created over that buffer. The header
+ * starts with the xid (stored raw), followed by the protocol version, the
+ * credit request and *op, each encoded with xdr_u_int.
+ *
+ * Returns CLNT_RDMA_FAIL if no buffer could be allocated (in which case
+ * *xdrs is left untouched), CLNT_RDMA_SUCCESS otherwise.
+ */
+static int
+clnt_compose_rdma_header(CONN *conn, CLIENT *h, rdma_buf_t *clmsg,
+    XDR **xdrs, uint_t *op)
+{
+	cku_private_t *priv = htop(h);
+	XDR *hdr_xdrs;
+	uint_t rdma_vers = RPCRDMA_VERS;
+	uint32_t credit_rqst = rdma_bufs_rqst;
+
+	clmsg->type = SEND_BUFFER;
+
+#ifdef DYNAMIC_CREDIT_CONTROL
+	/* May raise the request above the default. */
+	clnt_compute_credit(conn, &credit_rqst);
+#endif
+
+	if (RDMA_BUF_ALLOC(conn, clmsg))
+		return CLNT_RDMA_FAIL;
+
+	hdr_xdrs = &priv->cku_outxdr;
+	*xdrs = hdr_xdrs;
+	xdrmem_create(hdr_xdrs, clmsg->addr, clmsg->len, XDR_ENCODE);
+
+	/* The xid is written as-is; encoding resumes right after it. */
+	(*(uint32_t *)clmsg->addr) = priv->cku_xid;
+	XDR_SETPOS(hdr_xdrs, sizeof (uint32_t));
+
+	(void) xdr_u_int(hdr_xdrs, &rdma_vers);
+	(void) xdr_u_int(hdr_xdrs, &credit_rqst);
+	(void) xdr_u_int(hdr_xdrs, op);
+
+	return CLNT_RDMA_SUCCESS;
+}
+
+/*
+ * clnt_setup_rlist: register the chunks of the read list (if there is one)
+ * and encode the list into the header stream with xdr_do_clist.
+ *
+ * Returns CLNT_RDMA_FAIL when registration fails (nothing is encoded in
+ * that case), CLNT_RDMA_SUCCESS otherwise.
+ */
+static int
+clnt_setup_rlist(CONN *conn, XDR *xdrs, struct clist **cl)
+{
+	/* An empty list needs no registration but is still encoded. */
+	if (*cl != NULL && clist_register(conn, *cl, 1) != RDMA_SUCCESS)
+		return CLNT_RDMA_FAIL;
+
+	(void) xdr_do_clist(xdrs, cl);
+	return CLNT_RDMA_SUCCESS;
+}
+
+/*
+ * clnt_setup_wlist: build, register and encode the write chunk list of a
+ * call.
+ *
+ * Only NFSPROC3_READ gets a write list, built from the caller's results
+ * buffer by clnt_read3args_make_wlist(); every other procedure encodes a
+ * NULL list with zero segments.
+ *
+ * Returns CLNT_RDMA_FAIL if registration or encoding fails,
+ * CLNT_RDMA_SUCCESS otherwise.
+ */
+static int
+clnt_setup_wlist(CONN *conn, rpcproc_t procnum,
+    struct clist **rpccall_wlist, caddr_t resultsp,
+    xdrproc_t xdr_results, XDR *xdrs)
+{
+	uint_t nsegs = 0;
+
+	if (procnum != NFSPROC3_READ) {
+		*rpccall_wlist = NULL;
+	} else {
+		clnt_read3args_make_wlist(resultsp, rpccall_wlist,
+		    xdr_results, &nsegs);
+		if (clist_register(conn, *rpccall_wlist, 0) != RDMA_SUCCESS)
+			return CLNT_RDMA_FAIL;
+	}
+
+	if (!xdr_encode_wlist(xdrs, *rpccall_wlist, nsegs))
+		return CLNT_RDMA_FAIL;
+
+	return CLNT_RDMA_SUCCESS;
+}
+
+/*
+ * clnt_setup_long_reply: prepare a buffer that can receive a long reply.
+ *
+ * For NFSPROC3_READDIR, NFSPROC3_READDIRPLUS and NFSPROC3_READLINK a zeroed
+ * buffer of LONG_REPLY_LEN bytes is obtained (kmem_alloc, or a cache buffer
+ * when SERVER_REG_CACHE is defined), described in lrc_clist and registered
+ * with clist_register. *exists is set to TRUE only when that succeeded; for
+ * all other procedures it stays FALSE and lrc_clist->c_daddr stays NULL.
+ *
+ * The xdrs argument is not used by this function.
+ *
+ * Returns CLNT_RDMA_FAIL if registration fails (the buffer is released
+ * again), CLNT_RDMA_SUCCESS otherwise.
+ */
+static int clnt_setup_long_reply(CONN *conn, rpcproc_t procnum,
+ struct clist *lrc_clist,
+ XDR *xdrs, bool_t *exists)
+{
+ int status;
+ caddr_t addr;
+#ifdef SERVER_REG_CACHE
+ rib_lrc_entry_t *long_reply_buf = NULL;
+#endif
+ /* Start from "no long reply buffer". */
+ *exists = FALSE;
+ lrc_clist->c_daddr = NULL;
+
+#ifdef RPC_RDMA_INLINE
+ /* Skip the buffer entirely when the recorded length is below the chunk threshold. */
+ if (lrc_clist->c_len < rdma_minchunk)
+ return CLNT_RDMA_SUCCESS;
+#endif
+
+ if (procnum == NFSPROC3_READDIR ||
+ procnum == NFSPROC3_READDIRPLUS ||
+ procnum == NFSPROC3_READLINK) {
+#ifndef SERVER_REG_CACHE
+ /* One-off buffer; freed again by the caller once the call is done. */
+ addr = kmem_alloc(LONG_REPLY_LEN, KM_SLEEP);
+ bzero(addr, LONG_REPLY_LEN);
+ lrc_clist->c_daddr = (uint64)addr;
+ lrc_clist->c_len = LONG_REPLY_LEN;
+ lrc_clist->c_next = NULL;
+ lrc_clist->long_reply_buf = NULL;
+ status = clist_register(conn, lrc_clist, 0);
+#else
+ /*
+ * Buffer taken from the registration cache; its memory handle is
+ * copied into the clist entry.
+ * NOTE(review): the result of RDMA_GET_SERVER_CACHE_BUF is used
+ * without a NULL check -- confirm it cannot fail.
+ */
+ long_reply_buf = RDMA_GET_SERVER_CACHE_BUF(conn, LONG_REPLY_LEN);
+ bzero(long_reply_buf->lrc_buf, LONG_REPLY_LEN);
+ lrc_clist->c_daddr = (uint64)long_reply_buf->lrc_buf;
+ lrc_clist->c_len = LONG_REPLY_LEN;
+ lrc_clist->c_next = NULL;
+ lrc_clist->long_reply_buf = (uint64)long_reply_buf;
+ lrc_clist->c_dmemhandle = long_reply_buf->lrc_mhandle;
+ status = clist_register(conn, lrc_clist, 0);
+#endif
+ if(status) {
+ /* Registration failed: give the buffer back and report failure. */
+ cmn_err(CE_WARN, "clnt_setup_long_reply: cannot register buffer");
+#ifndef SERVER_REG_CACHE
+ kmem_free((void*)addr, (size_t)LONG_REPLY_LEN);
+#else
+ RDMA_FREE_SERVER_CACHE_BUF(conn, (rib_lrc_entry_t *)long_reply_buf);
+
+#endif
+ lrc_clist->c_daddr = NULL;
+ return CLNT_RDMA_FAIL;
+ }
+ *exists = TRUE;
+ }
+
+ return CLNT_RDMA_SUCCESS;
+}
+
+/*
+ * clnt_read3args_make_wlist: build the write chunk list describing where
+ * the data of an NFSv3 READ reply is to be placed.
+ *
+ * replyp         results buffer of the call, interpreted as READ3uiores or
+ *                READ3vres depending on xr
+ * rpccall_wlist  out: head of the newly allocated list, or NULL if no
+ *                chunk was created
+ * xr             XDR results routine of the call; selects the layout
+ * num_segment    out: number of list entries; only written when a list is
+ *                built for a recognized xr
+ *
+ * For x_READ3uiores one entry is created per iovec of the uio; for
+ * x_READ3vres a single entry covers data_val/data_len. Any other xr yields
+ * an empty list. Entries are allocated with kmem_zalloc and owned by the
+ * caller.
+ */
+static void
+clnt_read3args_make_wlist(caddr_t replyp, struct clist **rpccall_wlist,
+    xdrproc_t xr, uint_t *num_segment)
+{
+	READ3uiores *ures = (READ3uiores *)replyp;
+	READ3vres *vres = (READ3vres *)replyp;
+	struct clist *rwl = NULL, *prev = NULL;
+	int i;
+#ifdef RPC_RDMA_INLINE
+	int total_length;
+#endif
+
+	*rpccall_wlist = NULL;
+
+#ifdef RPC_RDMA_INLINE
+	/* Replies shorter than the chunk threshold are sent inline. */
+	if (xr == x_READ3uiores) {
+		total_length = 0;
+		for (i = 0; i < ures->uiop->uio_iovcnt; i++) {
+			total_length += ures->uiop->uio_iov[i].iov_len;
+		}
+	} else {
+		total_length = vres->data.data_len;
+	}
+
+	if (total_length < rdma_minchunk)
+		return;
+#endif
+
+	/* XXX: fake a chunk threshold for the combined length for now */
+	if (xr == x_READ3uiores) {
+		*num_segment = ures->uiop->uio_iovcnt;
+		for (i = 0; i < ures->uiop->uio_iovcnt; i++) {
+			rwl = kmem_zalloc(sizeof (struct clist), KM_SLEEP);
+
+			rwl->c_len = ures->uiop->uio_iov[i].iov_len;
+			rwl->c_daddr = (uint64)(ures->uiop->uio_iov[i].iov_base);
+			/*
+			 * For a user-space destination record the address
+			 * space of the current process in the entry.
+			 * Otherwise the page list pointer is cleared (the
+			 * entry is already zero-filled by kmem_zalloc).
+			 */
+			if (ures->uiop->uio_segflg == UIO_USERSPACE) {
+				rwl->c_adspc = ttoproc(curthread)->p_as;
+			} else {
+				rwl->c_dpplist = (page_t **)NULL;
+			}
+
+			/* Append: the first entry becomes the list head. */
+			if (prev == NULL)
+				*rpccall_wlist = rwl;
+			else
+				prev->c_next = rwl;
+			prev = rwl;
+		}
+		/*
+		 * The last entry's c_next is already NULL from kmem_zalloc.
+		 * No entry is touched here, so a uio with zero iovecs
+		 * yields an empty list instead of a NULL dereference.
+		 */
+	} else if (xr == x_READ3vres) {
+		*num_segment = 1;
+		rwl = kmem_zalloc(sizeof (struct clist), KM_SLEEP);
+		*rwl = empty_cl;
+
+		rwl->c_len = vres->data.data_len;
+		rwl->c_daddr = (uint64)(vres->data.data_val);
+
+		*rpccall_wlist = rwl;
+	}
+	/* Any other xr: no write list, and *num_segment is left untouched. */
+}
+
/* ARGSUSED */
static enum clnt_stat
clnt_rdma_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, struct timeval wait)
{
@@ -294,20 +612,30 @@
int status;
XDR *xdrs;
XDR *cxdrp = NULL, callxdr; /* for xdrrdma encoding the RPC call */
XDR *rxdrp = NULL, replxdr; /* for xdrrdma decoding the RPC reply */
struct rpc_msg reply_msg;
- struct clist *sendlist, *recvlist = NULL;
- struct clist *cl = NULL, *cle = NULL;
+ struct clist *sendlist = NULL, *recvlist = NULL;
+ struct clist *cl = NULL, *cle = NULL, *rdma_reply = NULL;
uint_t vers, op;
uint_t off;
uint32_t xid;
+ uint32_t seg_array_len;
CONN *conn = NULL;
- rdma_buf_t clmsg, rpcmsg, longmsg, rpcreply;
+ rdma_buf_t clmsg = {0}, rpcmsg = {0};
int msglen;
clock_t ticks;
+ bool_t wlist_exists_reply = FALSE;
+ bool_t long_reply_buf_exists = FALSE;
+ struct clist *rpccall_wlist = NULL, *rpcreply_wlist = NULL,
+ long_reply_clist ={0};
+ rpccall_read_t read_type;
+ rpccall_write_t write_type;
+ uint32_t rdma_credit = rdma_bufs_rqst;
+ struct clist long_reply_buf_clist = {0};
+
RCSTAT_INCR(rccalls);
/*
* Get unique xid
*/
if (p->cku_xid == 0)
@@ -361,10 +689,13 @@
break;
}
return (p->cku_err.re_status);
}
+
+ clnt_check_credit(conn);
+
/*
* Get the size of the rpc call message. Need this
* to determine if the rpc call message will fit in
* the pre-allocated RDMA buffers. If the rpc call
* message length is greater that the pre-allocated
@@ -372,173 +703,87 @@
* buffer is allocated and registered for the Long
* RPC call.
*/
xdrs = &callxdr;
msglen = CKU_HDRSIZE + BYTES_PER_XDR_UNIT;
+
if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
msglen += xdrrdma_authsize(h->cl_auth, p->cku_cred,
rdma_minchunk);
msglen += xdrrdma_sizeof(xdr_args, argsp, rdma_minchunk);
- if (msglen > RPC_MSG_SZ) {
-
- /*
- * Long RPC. Allocate one time use custom buffer.
- */
- rpcmsg.type = CHUNK_BUFFER;
- rpcmsg.addr = kmem_zalloc(msglen, KM_SLEEP);
- cle = kmem_zalloc(sizeof (*cle), KM_SLEEP);
- cle->c_xdroff = 0;
- cle->c_len = rpcmsg.len = msglen;
- cle->c_saddr = (uint64)(uintptr_t)rpcmsg.addr;
- cle->c_next = NULL;
- xdrrdma_create(xdrs, rpcmsg.addr, msglen,
- rdma_minchunk, cle, XDR_ENCODE, NULL);
- cxdrp = xdrs;
- op = RDMA_NOMSG;
+ if (msglen > RPC_MSG_SZ)
+ read_type = RPCCALL_RCHUNK;
+ else
+ read_type = RPCCALL_NORCHUNK;
} else {
/*
- * Get a pre-allocated buffer for rpc call
- */
- rpcmsg.type = SEND_BUFFER;
- if (RDMA_BUF_ALLOC(conn, &rpcmsg)) {
- p->cku_err.re_status = RPC_CANTSEND;
- p->cku_err.re_errno = EIO;
- RCSTAT_INCR(rcnomem);
- cmn_err(CE_WARN,
- "clnt_rdma_kcallit: no buffers!");
- goto done;
- }
- xdrrdma_create(xdrs, rpcmsg.addr, rpcmsg.len,
- rdma_minchunk, NULL, XDR_ENCODE, NULL);
- cxdrp = xdrs;
- op = RDMA_MSG;
- }
- } else {
- /*
* For RPCSEC_GSS since we cannot accurately presize the
* buffer required for encoding, we assume that its going
* to be a Long RPC to start with. We also create the
* the XDR stream with min_chunk set to 0 which instructs
* the XDR layer to not chunk the incoming byte stream.
*/
msglen += 2 * MAX_AUTH_BYTES + 2 * sizeof (struct opaque_auth);
- msglen += xdr_sizeof(xdr_args, argsp);
+ msglen += xdrrdma_sizeof(xdr_args, argsp, rdma_minchunk);
- /*
- * Long RPC. Allocate one time use custom buffer.
- */
- longmsg.type = CHUNK_BUFFER;
- longmsg.addr = kmem_zalloc(msglen, KM_SLEEP);
- cle = kmem_zalloc(sizeof (*cle), KM_SLEEP);
- cle->c_xdroff = 0;
- cle->c_len = longmsg.len = msglen;
- cle->c_saddr = (uint64)(uintptr_t)longmsg.addr;
- cle->c_next = NULL;
- xdrrdma_create(xdrs, longmsg.addr, msglen, 0, cle,
- XDR_ENCODE, NULL);
- cxdrp = xdrs;
- op = RDMA_NOMSG;
+ if (msglen > RPC_MSG_SZ)
+ read_type = RPCCALL_RCHUNK;
+ else
+ read_type = RPCCALL_NORCHUNK;
}
- if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
- /*
- * Copy in the preserialized RPC header
- * information.
- */
- bcopy(p->cku_rpchdr, rpcmsg.addr, CKU_HDRSIZE);
+ if (read_type == RPCCALL_NORCHUNK) {
- /*
- * transaction id is the 1st thing in the output
- * buffer.
- */
- /* LINTED pointer alignment */
- (*(uint32_t *)(rpcmsg.addr)) = p->cku_xid;
-
- /* Skip the preserialized stuff. */
- XDR_SETPOS(xdrs, CKU_HDRSIZE);
-
- /* Serialize dynamic stuff into the output buffer. */
- if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
- (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
- (!(*xdr_args)(xdrs, argsp))) {
- rdma_buf_free(conn, &rpcmsg);
- if (cle)
- clist_free(cle);
- p->cku_err.re_status = RPC_CANTENCODEARGS;
- p->cku_err.re_errno = EIO;
- cmn_err(CE_WARN,
- "clnt_rdma_kcallit: XDR_PUTINT32/AUTH_MARSHAL/xdr_args failed");
+ rpcmsg.type = SEND_BUFFER;
+ if (RDMA_BUF_ALLOC(conn, &rpcmsg)) {
+ cmn_err(CE_WARN, "clnt_rdma_kcallit: no buffers!");
goto done;
}
- p->cku_outsz = XDR_GETPOS(xdrs);
} else {
- uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[CKU_HDRSIZE];
- IXDR_PUT_U_INT32(uproc, procnum);
- (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;
- XDR_SETPOS(xdrs, 0);
-
- /* Serialize the procedure number and the arguments. */
- if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr,
- CKU_HDRSIZE+4, xdrs, xdr_args, argsp)) {
- if (longmsg.addr != xdrs->x_base) {
- longmsg.addr = xdrs->x_base;
- longmsg.len = xdr_getbufsize(xdrs);
+#ifdef SERVER_REG_CACHE
+ rib_lrc_entry_t *long_reply_buf = NULL;
+#endif
+ rpcmsg.type = CHUNK_BUFFER;
+#ifdef SERVER_REG_CACHE
+ long_reply_buf = RDMA_GET_SERVER_CACHE_BUF(conn, msglen);
+ rpcmsg.addr = long_reply_buf->lrc_buf;
+#else
+ rpcmsg.addr = kmem_zalloc(msglen, KM_SLEEP);
+#endif
+ cle = (struct clist *)kmem_zalloc(sizeof (struct clist),
+ KM_SLEEP);
+ cle->c_xdroff = 0;
+ cle->c_len = rpcmsg.len = msglen;
+ cle->c_saddr = (uint64)(uintptr_t)rpcmsg.addr;
+ cle->c_next = NULL;
+#ifdef SERVER_REG_CACHE
+ cle->long_reply_buf = (uint64)long_reply_buf;
+#endif
}
- rdma_buf_free(conn, &longmsg);
+
+ op = cle ? RDMA_NOMSG : RDMA_MSG;
+ cxdrp = xdrs;
+ xdrrdma_create(xdrs, rpcmsg.addr, (cle ? msglen : rpcmsg.len),
+ rdma_minchunk, cle, XDR_ENCODE, NULL);
+
+ status = clnt_compose_rpcmsg(h, procnum, &rpcmsg, xdrs, xdr_args, argsp);
+ if (status != CLNT_RDMA_SUCCESS) {
+ rdma_buf_free(conn, &rpcmsg);
clist_free(cle);
p->cku_err.re_status = RPC_CANTENCODEARGS;
p->cku_err.re_errno = EIO;
cmn_err(CE_WARN,
- "clnt_rdma_kcallit: AUTH_WRAP failed");
+ "clnt_rdma_kcallit: clnt_compose_rpcmsg failed");
goto done;
}
- /*
- * If we had to allocate a new buffer while encoding
- * then update the addr and len.
- */
- if (longmsg.addr != xdrs->x_base) {
- longmsg.addr = xdrs->x_base;
- longmsg.len = xdr_getbufsize(xdrs);
- }
- /*
- * If it so happens that the encoded message is after all
- * not long enough to be a Long RPC then allocate a
- * SEND_BUFFER and copy the encoded message into it.
+ /* Read chunklist (a linked list of N elements,
+ * position P (same P for all chunks of same arg!):
+ * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
*/
- p->cku_outsz = XDR_GETPOS(xdrs);
- if (p->cku_outsz > RPC_MSG_SZ) {
- rpcmsg.type = CHUNK_BUFFER;
- rpcmsg.addr = longmsg.addr;
- rpcmsg.len = longmsg.len;
- } else {
- clist_free(cle);
- XDR_DESTROY(cxdrp);
- cxdrp = NULL;
- /*
- * Get a pre-allocated buffer for rpc call
- */
- rpcmsg.type = SEND_BUFFER;
- if (RDMA_BUF_ALLOC(conn, &rpcmsg)) {
- p->cku_err.re_status = RPC_CANTSEND;
- p->cku_err.re_errno = EIO;
- RCSTAT_INCR(rcnomem);
- cmn_err(CE_WARN,
- "clnt_rdma_kcallit: no buffers!");
- rdma_buf_free(conn, &longmsg);
- goto done;
- }
- bcopy(longmsg.addr, rpcmsg.addr, p->cku_outsz);
- xdrrdma_create(xdrs, rpcmsg.addr, p->cku_outsz, 0,
- NULL, XDR_ENCODE, NULL);
- cxdrp = xdrs;
- rdma_buf_free(conn, &longmsg);
- op = RDMA_MSG;
- }
- }
cl = xdrrdma_clist(xdrs);
/*
* Update the chunk size information for the Long RPC msg.
@@ -545,64 +790,70 @@
*/
if (cl && op == RDMA_NOMSG)
cl->c_len = p->cku_outsz;
/*
- * Set up the RDMA chunk message
+ * Prepare the header for the RDMA chunk
*/
- vers = RPCRDMA_VERS;
- clmsg.type = SEND_BUFFER;
- if (RDMA_BUF_ALLOC(conn, &clmsg)) {
+ status = clnt_compose_rdma_header(conn, h, &clmsg, &xdrs, &op);
+ if (status != CLNT_RDMA_SUCCESS) {
p->cku_err.re_status = RPC_CANTSEND;
p->cku_err.re_errno = EIO;
rdma_buf_free(conn, &rpcmsg);
+ clist_free(cle);
RCSTAT_INCR(rcnomem);
cmn_err(CE_WARN, "clnt_rdma_kcallit: no free buffers!!");
goto done;
}
- xdrs = &p->cku_outxdr;
- xdrmem_create(xdrs, clmsg.addr, clmsg.len, XDR_ENCODE);
- /*
- * Treat xid as opaque (xid is the first entity
- * in the rpc rdma message).
- */
- (*(uint32_t *)clmsg.addr) = p->cku_xid;
- /* Skip xid and set the xdr position accordingly. */
- XDR_SETPOS(xdrs, sizeof (uint32_t));
- (void) xdr_u_int(xdrs, &vers);
- (void) xdr_u_int(xdrs, &op);
- /*
- * Now XDR the chunk list
- */
- if (cl != NULL) {
+ status = clnt_setup_rlist(conn, xdrs, &cl);
+ if (status != CLNT_RDMA_SUCCESS) {
+ cmn_err(CE_WARN, "clnt_rdma_kcallit: clist register failed");
+ rdma_buf_free(conn, &clmsg);
+ rdma_buf_free(conn, &rpcmsg);
+ clist_free(cl);
+ p->cku_err.re_status = RPC_CANTSEND;
+ p->cku_err.re_errno = EIO;
+ goto done;
+ }
- /*
- * Register the chunks in the list
+ /* Setup write chunk list for NFS3 READ operation
+ * Other operations will have a NULL wlist
*/
- status = clist_register(conn, cl, 1);
- if (status != RDMA_SUCCESS) {
- cmn_err(CE_WARN,
- "clnt_rdma_kcallit: clist register failed");
+ status = clnt_setup_wlist(conn, procnum, &rpccall_wlist,
+ resultsp, xdr_results, xdrs);
+ if (status != CLNT_RDMA_SUCCESS) {
rdma_buf_free(conn, &clmsg);
rdma_buf_free(conn, &rpcmsg);
clist_free(cl);
p->cku_err.re_status = RPC_CANTSEND;
p->cku_err.re_errno = EIO;
goto done;
}
+ status = clnt_setup_long_reply(conn, procnum, &long_reply_buf_clist,
+ xdrs, &long_reply_buf_exists);
+ if (status != CLNT_RDMA_SUCCESS) {
+ rdma_buf_free(conn, &clmsg);
+ rdma_buf_free(conn, &rpcmsg);
+ clist_free(cl);
+ p->cku_err.re_status = RPC_CANTSEND;
+ p->cku_err.re_errno = EIO;
+ goto done;
}
- (void) xdr_do_clist(xdrs, &cl);
/*
+ * XDR encode the RDMA_REPLY write chunk
+ */
+ seg_array_len = (long_reply_buf_exists ? 1:0);
+ (void) xdr_encode_reply_wchunk(xdrs, &long_reply_buf_clist, seg_array_len);
+ /*
* Start with the RDMA header and clist (if any)
*/
sendlist = NULL;
clist_add(&sendlist, 0, XDR_GETPOS(xdrs), &clmsg.handle,
clmsg.addr, NULL, NULL);
-
/*
* Put the RPC call message in the send list if small RPC
*/
if (op == RDMA_MSG) {
clist_add(&sendlist, 0, p->cku_outsz, &rpcmsg.handle,
@@ -646,11 +897,15 @@
}
/*
* Send the call message to the server
*/
+#if defined (CLNT_INTERRUPT_COAL)
+ status = RDMA_SEND_BL(conn, sendlist, p->cku_xid);
+#else
status = RDMA_SEND(conn, sendlist, p->cku_xid);
+#endif
if (status != RDMA_SUCCESS) {
if (cl) {
(void) clist_deregister(conn, cl, 1);
clist_free(cl);
/*
@@ -672,20 +927,16 @@
clmsg.addr = NULL;
if (rpcmsg.type == SEND_BUFFER)
rpcmsg.addr = NULL;
}
clist_free(sendlist);
-#ifdef DEBUG
-if (rdma_clnt_debug) {
- printf("clnt_rdma_kcallit: send request xid %u\n", p->cku_xid);
- }
-#endif
/*
* Recv rpc reply
*/
status = RDMA_RECV(conn, &recvlist, p->cku_xid);
+ clnt_return_credit(conn);
/*
* Deregister chunks sent. Do this only after the reply
* is received as that is a sure indication that the
* remote end has completed RDMA of the chunks.
@@ -704,16 +955,10 @@
/*
* Now check recv status
*/
if (status != 0) {
-#ifdef DEBUG
- if (rdma_clnt_debug)
- cmn_err(CE_NOTE,
- "clnt_rdma_kcallit: reply failed %u status %d",
- p->cku_xid, status);
-#endif
if (status == RDMA_INTR) {
p->cku_err.re_status = RPC_INTR;
p->cku_err.re_errno = EINTR;
RCSTAT_INCR(rcintrs);
} else if (status == RPC_TIMEDOUT) {
@@ -724,14 +969,10 @@
p->cku_err.re_status = RPC_CANTRECV;
p->cku_err.re_errno = EIO;
}
goto done;
}
-#ifdef DEBUG
- if (rdma_clnt_debug)
- printf("clnt_rdma_kcallit: got response xid %u\n", p->cku_xid);
-#endif
/*
* Process the reply message.
*
* First the chunk list (if any)
*/
@@ -744,92 +985,74 @@
*/
xid = *(uint32_t *)(uintptr_t)recvlist->c_saddr;
/* Skip xid and set the xdr position accordingly. */
XDR_SETPOS(xdrs, sizeof (uint32_t));
(void) xdr_u_int(xdrs, &vers);
+ (void) xdr_u_int(xdrs, &rdma_credit);
(void) xdr_u_int(xdrs, &op);
(void) xdr_do_clist(xdrs, &cl);
- off = xdr_getpos(xdrs);
+ clnt_update_credit(conn, rdma_credit);
+ wlist_exists_reply = FALSE;
+ if (! xdr_decode_wlist(xdrs, &rpcreply_wlist, &wlist_exists_reply)) {
+ cmn_err(CE_NOTE,
+ "clnt_rdma_kcallit: xdr_decode_wlist failed");
+ /* XXX: what should we fail with here -- EIO? */
+ }
+#ifdef RPC_RDMA_INLINE
+ if (xdr_results == x_READ3vres) {
+ ((READ3vres *)resultsp)->wlist = NULL;
+ } else if (xdr_results == x_READ3uiores) {
+ ((READ3uiores *)resultsp)->wlist = NULL;
+ }
+#endif
- /*
- * Now the RPC reply message itself. If the reply
- * came as a chunk item, then RDMA the reply over.
- */
- xdrs = &replxdr;
- if (cl && op == RDMA_NOMSG) {
- struct clist *cle = cl;
+ if (procnum == NFSPROC3_READ) {
- rpcreply.type = CHUNK_BUFFER;
- rpcreply.addr = kmem_alloc(cle->c_len, KM_SLEEP);
- rpcreply.len = cle->c_len;
- cle->c_daddr = (uint64)(uintptr_t)rpcreply.addr;
- cl = cl->c_next;
- cle->c_next = NULL;
+ check_dereg_wlist(conn, rpccall_wlist);
- /*
- * Register the rpc reply chunk destination
- */
- status = clist_register(conn, cle, 0);
- if (status) {
- rdma_buf_free(conn, &rpcreply);
- clist_free(cle);
- p->cku_err.re_status = RPC_CANTDECODERES;
- p->cku_err.re_errno = EIO;
- cmn_err(CE_WARN,
- "clnt_rdma_kcallit: clist_register failed");
- goto rdma_done;
+ if (wlist_exists_reply) {
+ if (xdr_results == x_READ3vres) {
+ ((READ3vres *)resultsp)->wlist =
+ rpcreply_wlist;
+ ((READ3vres *)resultsp)->wlist_len =
+ rpcreply_wlist->c_len;
+ } else if (xdr_results == x_READ3uiores) {
+ ((READ3uiores *)resultsp)->wlist =
+ rpcreply_wlist;
+ ((READ3uiores *)resultsp)->wlist_len =
+ rpcreply_wlist->c_len;
+ } else {
+ cmn_err(CE_NOTE,
+ "unknown READ3 xdr decode fnp=%p",
+ (void *)xdr_results);
}
-
- /*
- * Now read rpc reply in
- */
-#ifdef DEBUG
- if (rdma_clnt_debug)
- printf("clnt_rdma_kcallit: read chunk, len %d, xid %u, \
- reply xid %u\n", cle->c_len, p->cku_xid, xid);
-#endif
- status = RDMA_READ(conn, cle, WAIT);
- if (status) {
- (void) clist_deregister(conn, cle, 0);
- rdma_buf_free(conn, &rpcreply);
- clist_free(cle);
- p->cku_err.re_status = RPC_CANTDECODERES;
- p->cku_err.re_errno = EIO;
- cmn_err(CE_WARN,
- "clnt_rdma_kcallit: RDMA_READ failed");
- goto rdma_done;
}
-
- /*
- * sync the memory for dma
- */
- status = clist_syncmem(conn, cle, 0);
- if (status != RDMA_SUCCESS) {
- (void) clist_deregister(conn, cle, 0);
- rdma_buf_free(conn, &rpcreply);
- clist_free(cle);
- p->cku_err.re_status = RPC_CANTDECODERES;
- p->cku_err.re_errno = EIO;
- goto rdma_done;
+ } else {
+ if(wlist_exists_reply)
+ cmn_err(CE_NOTE,
+ "clnt_rdma_kcallit: received wlist for "
+ "non-READ3 call. reply xdr decode fnp=%p",
+ (void *)xdr_results);
}
/*
- * Deregister the Long RPC chunk
+ * The server shouldn't have sent a RDMA_SEND that
+ * the client needs to RDMA_WRITE a reply back to
+ * the server. So silently ignoring what the
+ * server returns in the rdma_reply section of the
+ * header.
*/
- (void) clist_deregister(conn, cle, 0);
- clist_free(cle);
- xdrrdma_create(xdrs, rpcreply.addr, rpcreply.len, 0, cl,
- XDR_DECODE, conn);
- rxdrp = xdrs;
- } else {
- rpcreply.addr = NULL;
- xdrrdma_create(xdrs,
- (caddr_t)(uintptr_t)(recvlist->c_saddr + off),
- recvlist->c_len - off, 0, cl, XDR_DECODE, conn);
- rxdrp = xdrs;
- }
+ (void) xdr_decode_reply_wchunk(xdrs, &rdma_reply,conn);
+ off = xdr_getpos(xdrs);
+ xdrs = &replxdr;
+ if (clnt_decode_long_reply(conn, procnum, &long_reply_buf_clist,
+ rdma_reply, xdrs, &rxdrp,
+ cl, recvlist, op, off) != CLNT_RDMA_SUCCESS)
+ {
+ goto done;
+ }
reply_msg.rm_direction = REPLY;
reply_msg.rm_reply.rp_stat = MSG_ACCEPTED;
reply_msg.acpted_rply.ar_stat = SUCCESS;
reply_msg.acpted_rply.ar_verf = _null_auth;
/*
@@ -910,99 +1133,143 @@
}
/*
* If rpc reply is in a chunk, free it now.
*/
- if (rpcreply.addr != NULL)
- rdma_buf_free(conn, &rpcreply);
-
-rdma_done:
- if ((cl != NULL) || (op == RDMA_NOMSG)) {
- rdma_buf_t donemsg;
-
- /*
- * Free the list holding the chunk info
- */
- if (cl) {
- clist_free(cl);
- cl = NULL;
- }
-
- /*
- * Tell the server that the reads are done
- */
- donemsg.type = SEND_BUFFER;
- if (RDMA_BUF_ALLOC(conn, &donemsg)) {
- p->cku_err.re_status = RPC_CANTSEND;
- p->cku_err.re_errno = EIO;
- RCSTAT_INCR(rcnomem);
- cmn_err(CE_WARN, "clnt_rdma_kcallit: no free buffer");
- goto done;
- }
- xdrs = &p->cku_outxdr;
- xdrmem_create(xdrs, donemsg.addr, donemsg.len, XDR_ENCODE);
- vers = RPCRDMA_VERS;
- op = RDMA_DONE;
-
- /*
- * Treat xid as opaque (xid is the first entity
- * in the rpc rdma message).
- */
- (*(uint32_t *)donemsg.addr) = p->cku_xid;
- /* Skip xid and set the xdr position accordingly. */
- XDR_SETPOS(xdrs, sizeof (uint32_t));
- if (!xdr_u_int(xdrs, &vers) ||
- !xdr_u_int(xdrs, &op)) {
- cmn_err(CE_WARN,
- "clnt_rdma_kcallit: xdr_u_int failed");
- rdma_buf_free(conn, &donemsg);
- goto done;
- }
-
- sendlist = NULL;
- clist_add(&sendlist, 0, XDR_GETPOS(xdrs), &donemsg.handle,
- donemsg.addr, NULL, NULL);
-
- status = RDMA_SEND(conn, sendlist, p->cku_xid);
- if (status != RDMA_SUCCESS) {
- cmn_err(CE_WARN,
- "clnt_rdma_kcallit: RDMA_SEND failed xid %u",
- p->cku_xid);
- }
-#ifdef DEBUG
- else {
- if (rdma_clnt_debug)
- printf("clnt_rdma_kcallit: sent RDMA_DONE xid %u\n",
- p->cku_xid);
- }
+done:
+ if (long_reply_buf_exists){
+ (void) clist_deregister(conn, &long_reply_buf_clist, 0);
+#ifndef SERVER_REG_CACHE
+ kmem_free((void *)long_reply_buf_clist.c_daddr,
+ (size_t)long_reply_buf_clist.c_len);
+#else
+ RDMA_FREE_SERVER_CACHE_BUF(conn, (rib_lrc_entry_t *)long_reply_buf_clist.long_reply_buf);
#endif
- clist_free(sendlist);
}
-
-done:
if (cxdrp)
XDR_DESTROY(cxdrp);
if (rxdrp) {
(void) xdr_rpc_free_verifier(rxdrp, &reply_msg);
XDR_DESTROY(rxdrp);
}
if (recvlist) {
- rdma_buf_t recvmsg;
-
+ rdma_buf_t recvmsg = {0};
recvmsg.addr = (caddr_t)(uintptr_t)recvlist->c_saddr;
recvmsg.type = RECV_BUFFER;
RDMA_BUF_FREE(conn, &recvmsg);
clist_free(recvlist);
}
+#if (!defined(ASYNC_CLIENT_DEREG))
+ if(rpccall_wlist){
+ kmem_free(rpccall_wlist, sizeof(clist));
+ }
+#endif
+
RDMA_REL_CONN(conn);
if (p->cku_err.re_status != RPC_SUCCESS) {
RCSTAT_INCR(rcbadcalls);
}
return (p->cku_err.re_status);
}
+/*
+ * clnt_decode_long_reply: set up the XDR stream used to decode the RPC
+ * reply, choosing between the long reply buffer and the inline data that
+ * follows the RPC/RDMA header in the receive buffer.
+ *
+ * conn                  connection, handed to xdrrdma_create
+ * procnum               procedure of the call
+ * long_reply_buf_clist  entry describing the client's long reply buffer;
+ *                       c_daddr is zero if none was set up
+ * rdma_reply            reply chunk decoded from the reply header, may be
+ *                       NULL
+ * xdrs                  stream storage to (re)initialize
+ * rxdrp                 out: set to xdrs when a decode stream was created
+ * cl                    read chunk list decoded from the reply header
+ * recvlist              receive buffer holding the reply
+ * op                    message type from the reply header
+ * off                   offset of the RPC message inside the receive buffer
+ *
+ * Returns CLNT_RDMA_FAIL when an RDMA_NOMSG reply for a long-reply
+ * procedure carries no reply chunk, so the reply length is unknown;
+ * *rxdrp is then set to NULL. Previously that case dereferenced a NULL
+ * rdma_reply. All other paths return CLNT_RDMA_SUCCESS as before.
+ */
+static int
+clnt_decode_long_reply(CONN *conn, rpcproc_t procnum,
+    struct clist *long_reply_buf_clist,
+    struct clist *rdma_reply, XDR *xdrs,
+    XDR **rxdrp, struct clist *cl,
+    struct clist *recvlist,
+    uint_t op, uint_t off)
+{
+	if (RDMA_NOMSG == op && long_reply_buf_clist->c_daddr) {
+		if (procnum == NFSPROC3_READDIR ||
+		    procnum == NFSPROC3_READDIRPLUS ||
+		    procnum == NFSPROC3_READLINK) {
+			xdrmem_destroy(xdrs);
+			/*
+			 * The reply length comes from the reply chunk the
+			 * server returned; without one the long reply
+			 * buffer cannot be decoded.
+			 */
+			if (rdma_reply == NULL) {
+				cmn_err(CE_NOTE, "clnt_rdma_kcallit: "
+				    "RDMA_NOMSG reply without a reply chunk");
+				*rxdrp = NULL;
+				return CLNT_RDMA_FAIL;
+			}
+			xdrrdma_create(xdrs,
+			    (caddr_t)long_reply_buf_clist->c_daddr,
+			    rdma_reply->c_len,
+			    0,
+			    NULL,
+			    XDR_DECODE,
+			    conn);
+
+			*rxdrp = xdrs;
+		} else {
+			cmn_err(CE_NOTE, "clnt_rdma_kcallit: "
+			    "wchunk buffer for wrong nfs proc");
+			xdrmem_destroy(xdrs);
+			*rxdrp = NULL;
+		}
+	} else if (cl && RDMA_NOMSG == op) {
+		/* A read list in a reply is not handled; *rxdrp is untouched. */
+		cmn_err(CE_NOTE, "clnt_rdma_kcallit: "
+		    "Server sent a READ list in the RPC Reply");
+		xdrmem_destroy(xdrs);
+	} else {
+		/* Inline reply: decode straight out of the receive buffer. */
+		xdrmem_destroy(xdrs);
+		xdrrdma_create(xdrs,
+		    (caddr_t)(uintptr_t)(recvlist->c_saddr + off),
+		    recvlist->c_len - off, 0, cl, XDR_DECODE, conn);
+		*rxdrp = xdrs;
+	}
+	return CLNT_RDMA_SUCCESS;
+}
+
+#ifdef DYNAMIC_CREDIT_CONTROL
+/*
+ * clnt_compute_credit: decide how many credits to request from the server.
+ *
+ * When fewer than CLNT_CREDIT_LOW granted credits are unused, *rdma_credit
+ * is set to the default request plus half the operations in flight;
+ * otherwise it is left as passed in. The counters are read under
+ * conn->c_lock.
+ */
+static void
+clnt_compute_credit(CONN *conn, uint32_t *rdma_credit)
+{
+	rdma_clnt_cred_ctrl_t *cc = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
+
+	mutex_enter(&conn->c_lock);
+	if (cc->clnt_cc_granted_ops - cc->clnt_cc_in_flight_ops <
+	    CLNT_CREDIT_LOW) {
+		*rdma_credit = rdma_bufs_rqst +
+		    cc->clnt_cc_in_flight_ops / 2;
+	}
+	mutex_exit(&conn->c_lock);
+}
+#endif
+
+/*
+ * clnt_return_credit: account for one operation that is no longer in
+ * flight and signal the credit condition variable, all under conn->c_lock.
+ */
+static void
+clnt_return_credit(CONN *conn)
+{
+	rdma_clnt_cred_ctrl_t *cc = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
+
+	mutex_enter(&conn->c_lock);
+	cc->clnt_cc_in_flight_ops--;
+	cv_signal(&cc->clnt_cc_cv);
+	mutex_exit(&conn->c_lock);
+}
+
+/*
+ * clnt_update_credit: record the number of credits granted by the server
+ * (as decoded from a reply header) for this connection, under
+ * conn->c_lock.
+ */
+static void
+clnt_update_credit(CONN *conn, uint32_t rdma_credit)
+{
+	rdma_clnt_cred_ctrl_t *cc = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
+
+	mutex_enter(&conn->c_lock);
+	cc->clnt_cc_granted_ops = rdma_credit;
+	mutex_exit(&conn->c_lock);
+}
+
+/*
+ * clnt_check_credit: claim one credit before sending a call.
+ *
+ * While a grant has been recorded (granted count non-zero) and the
+ * operations in flight have reached it, the caller blocks on the credit
+ * condition variable. Once it may proceed, the in-flight count is bumped.
+ * Everything happens under conn->c_lock.
+ */
+static void
+clnt_check_credit(CONN *conn)
+{
+	rdma_clnt_cred_ctrl_t *cc = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
+
+	mutex_enter(&conn->c_lock);
+	for (;;) {
+		/* Still below the granted number of operations. */
+		if (cc->clnt_cc_in_flight_ops < cc->clnt_cc_granted_ops)
+			break;
+		/* No grant recorded yet: do not throttle. */
+		if (cc->clnt_cc_granted_ops == 0)
+			break;
+		/* All granted credits are in use; block and wait. */
+		cv_wait(&cc->clnt_cc_cv, &conn->c_lock);
+	}
+	cc->clnt_cc_in_flight_ops++;
+	mutex_exit(&conn->c_lock);
+}
+
/* ARGSUSED */
static void
clnt_rdma_kabort(CLIENT *h)
{
}
@@ -1051,10 +1318,14 @@
struct knetconfig *knc;
char *pf, *p;
rdma_stat status;
int error = 0;
+ mutex_enter(&rdma_modload_lock);
+ error = rdma_modload();
+ mutex_exit(&rdma_modload_lock);
+
if (!INGLOBALZONE(curproc))
return (-1);
/*
* modload the RDMA plugins if not already done.
*/
@@ -1100,6 +1371,30 @@
}
rp = rp->r_next;
}
rw_exit(&rdma_lock);
return (-1);
+}
+
+/*
+ * check_dereg_wlist: release the registration of a write chunk list.
+ *
+ * Does nothing unless the list exists and its first entry has both a
+ * memory handle (mrc_rmr) and a non-zero length. With ASYNC_CLIENT_DEREG
+ * the list is queued via INSERT_QUEUE; otherwise it is deregistered here
+ * and a failure is only logged.
+ */
+static void
+check_dereg_wlist(CONN *conn, clist *rwc)
+{
+	int status;
+
+	if (rwc == NULL)
+		return;
+	if (!rwc->c_dmemhandle.mrc_rmr || !rwc->c_len)
+		return;
+
+#if defined(ASYNC_CLIENT_DEREG)
+	/* Add in an entry to rqueue */
+	INSERT_QUEUE(conn, rwc);
+#else
+	status = clist_deregister(conn, rwc, FALSE);
+	if (status != RDMA_SUCCESS) {
+		cmn_err(CE_NOTE, "dereg_wlist failed."
+		    "status=%d", status);
+	}
+#endif
}