Udiff xdr_rdma.c
--- /webrev/webrev/usr/src/uts/common/rpc/xdr_rdma.c-   Mon Aug 14 13:12:12 2006
+++ xdr_rdma.c  Thu Aug 10 14:22:04 2006
@@ -22,10 +22,23 @@
 /*
  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
+ /* Copyright (c) 2006, The Ohio State University. All rights reserved.
+  *
+  * Portions of this source code is developed by the team members of
+  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
+  * headed by Professor Dhabaleswar K. (DK) Panda.
+  *
+  * Acknowledgements to contributions from developers:
+  *   Ranjit Noronha: noronha@cse.ohio-state.edu
+  *   Lei Chai      : chail@cse.ohio-state.edu
+  *   Weikuan Yu    : yuw@cse.ohio-state.edu
+  *
+  */
+
 #pragma ident  "@(#)xdr_rdma.c 1.4     05/06/08 SMI"
 
 /*
  * xdr_rdma.c, XDR implementation using RDMA to move large chunks
  */
@@ -41,10 +54,12 @@
 #include <rpc/rpc_sztypes.h>
 #include <rpc/rpc_rdma.h>
 
 static struct xdr_ops *xdrrdma_ops(void);
 
+/*int rdma_xdr_long_reply_debug = 0x0;*/
+
 /*
  * A chunk list entry identifies a chunk
  * of opaque data to be moved separately
  * from the rest of the RPC message.
  * xp_min_chunk = 0, is a special case for ENCODING, which means
@@ -145,25 +160,61 @@
 static bool_t
 xdrrdma_getbytes(XDR *xdrs, caddr_t addr, int len)
 {
        struct private *xdrp = (struct private *)(xdrs->x_private);
        struct clist *cle = *(xdrp->xp_cl_next);
+       struct clist *cls = *(xdrp->xp_cl_next);
        struct clist cl;
        bool_t  retval = TRUE;
+       uint32_t total_len=len;
+       uint32_t sum_len=0;
+       uint32_t total_segments=0;
+       uint32_t actual_segments=0;
+       uint32_t status;
+       uint32_t i;
+       uint32_t alen;
+       while(cle) {
+       total_segments++;
+       cle=cle->c_next;
+       }
 
+       cle = *(xdrp->xp_cl_next);
        /*
         * If there was a chunk at the current offset
         * first record the destination address and length
         * in the chunk list that came with the message, then
         * RDMA READ the chunk data.
         */
        if (cle != NULL &&
                cle->c_xdroff == (xdrp->xp_offp - xdrs->x_base)) {
-               cle->c_daddr = (uint64)(uintptr_t)addr;
-               cle->c_len  = len;
+               for(actual_segments=0; actual_segments < total_segments; actual_segments++) {
+               if(total_len <= 0) 
+                               goto mem_sync;
+               cle->c_daddr = (uint64)(uintptr_t)addr + sum_len;
+               /*cle->c_len  = len;*/
+               alen = 0;
+               if(cle->c_len > total_len) {
+                       alen       = cle->c_len;
+                       cle->c_len = total_len;
+               }
+               if(!alen)
                xdrp->xp_cl_next = &cle->c_next;
 
+               
+               sum_len += cle->c_len;
+               total_len -= cle->c_len;
+               
+               if((total_segments - actual_segments - 1) == 0 && total_len > 0 ){
+               cmn_err(CE_WARN,"Provided read chunks are too short\n");
+               retval = FALSE;
+               }
+               
+               if((total_segments - actual_segments - 1) > 0 && total_len == 0 ){
+#ifdef DEBUG
+               cmn_err(CE_NOTE,"Provided read chunks are too long [total=%d, actual=%d]\n",total_segments,actual_segments);
+#endif
+               }
                /*
                 * RDMA READ the chunk data from the remote end.
                 * First prep the destination buffer by registering
                 * it, then RDMA READ the chunk data. Since we are
                 * doing streaming memory, sync the destination buffer
@@ -176,35 +227,56 @@
                cl = *cle;
                cl.c_next = NULL;
                if (clist_register(xdrp->xp_conn, &cl, 0) != RDMA_SUCCESS) {
                        return (FALSE);
                }
+               cle->c_dmemhandle =  cl.c_dmemhandle;
+                cle->c_dsynchandle = cl.c_dsynchandle;
 
                /*
                 * Now read the chunk in
                 */
-               if (RDMA_READ(xdrp->xp_conn, &cl, WAIT) != RDMA_SUCCESS) {
+               if((total_segments - actual_segments - 1) == 0 || total_len == 0){
+               status = RDMA_READ(xdrp->xp_conn, &cl, WAIT);
+               } else {
+               status = RDMA_READ(xdrp->xp_conn, &cl, NOWAIT);
+               }
+               if (status != RDMA_SUCCESS) {
 #ifdef DEBUG
                        cmn_err(CE_WARN,
                                "xdrrdma_getbytes: RDMA_READ failed\n");
 #endif
                        retval = FALSE;
                        goto out;
                }
+               cle = cle->c_next;
+               }
+               mem_sync:
                /*
                 * sync the memory for cpu
                 */
+               cle = cls;
+               cl = *cle;
+               cl.c_next = NULL;
+               cl.c_len  = sum_len;
                if (clist_syncmem(xdrp->xp_conn, &cl, 0) != RDMA_SUCCESS) {
                        retval = FALSE;
                        goto out;
                }
-
 out:
                /*
                 * Deregister the chunks
                 */
+               cle = cls;
+               cl = *cle;
+               cl.c_next = NULL;
+               cl.c_len  = sum_len;
                (void) clist_deregister(xdrp->xp_conn, &cl, 0);
+               if(alen){
+               cle->c_saddr = (uint64)(uintptr_t)cle->c_saddr + cle->c_len;
+               cle->c_len   =  alen - cle->c_len;
+               }
                return (retval);
        }
 
        if ((xdrs->x_handy -= len) < 0)
                return (FALSE);
@@ -422,14 +494,14 @@
 xdr_clist(XDR *xdrs, clist *objp)
 {
 
        if (!xdr_uint32(xdrs, &objp->c_xdroff))
                return (FALSE);
-       if (!xdr_uint32(xdrs, &objp->c_len))
-               return (FALSE);
        if (!xdr_uint32(xdrs, &objp->c_smemhandle.mrc_rmr))
                return (FALSE);
+       if (!xdr_uint32(xdrs, &objp->c_len))
+               return (FALSE);
        if (!xdr_uint64(xdrs, &objp->c_saddr))
                return (FALSE);
        if (!xdr_pointer(xdrs, (char **)&objp->c_next, sizeof (clist),
                (xdrproc_t)xdr_clist))
                return (FALSE);
@@ -447,6 +519,300 @@
 xdr_getbufsize(XDR *xdrs)
 {
        struct private *xdrp = (struct private *)(xdrs->x_private);
 
        return ((uint_t)xdrp->xp_buf_size);
+}
+
+/*
+ * Encode a write chunk list as: 1, N, N x (rmr, c_len, c_daddr), 0.
+ * A NULL list is encoded as a single FALSE.  Returns FALSE on any XDR
+ * failure, or if the list holds fewer than num_segment entries.
+ */
+bool_t
+xdr_encode_wlist(XDR *xdrs, clist *w, uint_t num_segment)
+{
+        bool_t vfalse = FALSE, vtrue = TRUE;
+        uint_t i;
+
+        /* does a wlist exist? */
+        if (w == NULL)
+                return (xdr_bool(xdrs, &vfalse));
+
+        /* Encode N consecutive segments, 1, N, HLOO, ..., HLOO, 0 */
+        if (!xdr_bool(xdrs, &vtrue))
+                return (FALSE);
+        if (!xdr_uint32(xdrs, &num_segment))
+                return (FALSE);
+
+        for (i = 0; i < num_segment; i++) {
+                /* list shorter than advertised: fail, do not deref NULL */
+                if (w == NULL)
+                        return (FALSE);
+                if (!xdr_uint32(xdrs, &w->c_dmemhandle.mrc_rmr) ||
+                    !xdr_uint32(xdrs, &w->c_len) ||
+                    !xdr_uint64(xdrs, &w->c_daddr))
+                        return (FALSE);
+                w = w->c_next;
+        }
+        /* terminating "no more entries" discriminator */
+        return (xdr_bool(xdrs, &vfalse));
+}
+
+/*
+ * Decode a write chunk list encoded as 1, N, N x (rmr, c_len, c_daddr), 0.
+ * On success *wlist_exists says whether a list was present and, if so, *w
+ * heads a newly allocated chain.  A failure after allocation frees the chain.
+ */
+bool_t
+xdr_decode_wlist(XDR *xdrs, struct clist **w, bool_t *wlist_exists)
+{
+        struct clist *tmp;
+        bool_t more = FALSE;
+        uint32_t seg_array_len;
+        uint32_t i;
+
+        if (!xdr_bool(xdrs, &more))
+                return (FALSE);
+
+        /* is there a wlist? */
+        if (more == FALSE) {
+                *wlist_exists = FALSE;
+                return (TRUE);
+        }
+        *wlist_exists = TRUE;
+        if (!xdr_uint32(xdrs, &seg_array_len))
+                return (FALSE);
+
+        /* Zero every node so c_next is always NULL or a valid node. */
+        tmp = *w = kmem_zalloc(sizeof (struct clist), KM_SLEEP);
+        for (i = 0; i < seg_array_len; i++) {
+                if (i > 0) {
+                        tmp->c_next = kmem_zalloc(sizeof (*tmp), KM_SLEEP);
+                        tmp = tmp->c_next;
+                }
+                if (!xdr_uint32(xdrs, &tmp->c_dmemhandle.mrc_rmr) ||
+                    !xdr_uint32(xdrs, &tmp->c_len) ||
+                    !xdr_uint64(xdrs, &tmp->c_daddr))
+                        goto fail;
+        }
+        more = FALSE;
+        if (xdr_bool(xdrs, &more))      /* consume the list terminator */
+                return (TRUE);
+fail:
+        while ((tmp = *w) != NULL) {    /* free the partial chain; *w ends NULL */
+                *w = tmp->c_next;
+                kmem_free(tmp, sizeof (struct clist));
+        }
+        return (FALSE);
+}
+
+/*
+ * Decode a write chunk list (1, N, N x (rmr, len, daddr), 0) into one array
+ * of N clist entries linked through c_next, and obtain a single buffer of
+ * the summed segment lengths whose slices become each entry's c_saddr.
+ * Outputs are reset first (*wclp = NULL, *wwl = FALSE, *total_length = 0);
+ * *wclp and *total_length are only set on success.  Any failure after the
+ * array is allocated frees it, so the caller never sees a partial list.
+ */
+bool_t
+xdr_decode_wlist_new(XDR *xdrs, struct clist **wclp, bool_t *wwl,
+        uint32_t *total_length,CONN *conn)
+{
+        struct clist *first, *ncl;
+        char  *memp = NULL;
+#ifdef SERVER_REG_CACHE
+        rib_lrc_entry_t *long_reply_buf = NULL;
+#endif
+        uint32_t num_wclist;
+        uint32_t wcl_length = 0;
+        uint32_t i;
+        bool_t   more = FALSE;
+
+        *wclp = NULL;
+        *wwl = FALSE;
+        *total_length = 0;
+
+        if (!xdr_bool(xdrs, &more))
+                return (FALSE);
+        if (more == FALSE)
+                return (TRUE);
+
+        *wwl = TRUE;
+        if (!xdr_uint32(xdrs, &num_wclist)) {
+                cmn_err(CE_NOTE, "Error interpretting list length");
+                return (FALSE);
+        }
+
+        first = ncl = (struct clist *)
+                kmem_zalloc(num_wclist*sizeof(struct clist), KM_SLEEP);
+        if (!first) {
+                cmn_err(CE_NOTE, "Not able to allocate memory");
+                return (FALSE);
+        }
+
+        for (i = 0; i < num_wclist; i++, ncl++) {
+                if (!xdr_uint32(xdrs, &ncl->c_dmemhandle.mrc_rmr) ||
+                    !xdr_uint32(xdrs, &ncl->c_len) ||
+                    !xdr_uint64(xdrs, &ncl->c_daddr))
+                        goto fail;
+
+                if (ncl->c_len > MAX_SVC_XFER_SIZE) {
+                        cmn_err(CE_NOTE, "write chunk length too big");
+                        ncl->c_len = MAX_SVC_XFER_SIZE;
+                }
+                /* A wrapped 32-bit total would under-size the buffer. */
+                if (wcl_length + ncl->c_len < wcl_length)
+                        goto fail;
+                wcl_length += ncl->c_len;
+                if (i > 0)
+                        ncl[-1].c_next = ncl;
+        }
+
+        more = FALSE;
+        if (!xdr_bool(xdrs, &more))
+                goto fail;
+
+#ifdef SERVER_REG_CACHE
+        long_reply_buf = RDMA_GET_SERVER_CACHE_BUF(conn,wcl_length*sizeof(char));
+        /* Check the cache entry itself before touching its buffer. */
+        if (long_reply_buf != NULL)
+                memp = long_reply_buf->lrc_buf;
+#else
+        memp = (char *) kmem_alloc(wcl_length*sizeof(char), KM_SLEEP);
+#endif
+        if (!memp) {
+                cmn_err(CE_NOTE, "Not able to allocate memory for chunks");
+                goto fail;
+        }
+        for (i = 0, ncl = first; i < num_wclist; i++, ncl++) {
+#ifdef SERVER_REG_CACHE
+                ncl->long_reply_buf = (uint64)long_reply_buf;
+#endif
+                ncl->c_saddr = (uint64_t)(uintptr_t)memp;
+                memp += ncl->c_len;
+        }
+        *wclp = first;
+        *total_length = wcl_length;
+        return (TRUE);
+fail:
+        kmem_free((void*) first, num_wclist*sizeof(struct clist));
+        return (FALSE);
+}
+
+/*
+ * XDR decode the long reply write chunk: a boolean, then (if TRUE) a count
+ * N and N segments of (rmr, len, daddr).  On success *clist heads an array
+ * of N entries linked through c_next whose c_saddr values are consecutive
+ * slices of one buffer; *clist is untouched when no chunk is present.
+ * Fails on XDR error, a zero count, a wrapped 32-bit total length, or (in
+ * SERVER_REG_CACHE builds) a NULL cache entry; an allocated array is freed.
+ */
+bool_t
+xdr_decode_reply_wchunk(XDR *xdrs, struct clist **clist,CONN *conn)
+{
+        bool_t have_rchunk  = FALSE;
+        struct clist *first = NULL, *ncl = NULL;
+        char  *memp;
+        uint32_t num_wclist;
+        uint32_t wcl_length = 0;
+        uint32_t i;
+        rdma_buf_t long_rpc = {0};
+
+        if (!xdr_bool(xdrs, &have_rchunk))
+                return (FALSE);
+        if (have_rchunk == FALSE)
+                return (TRUE);
+
+        if (!xdr_uint32(xdrs, &num_wclist)) {
+                cmn_err(CE_NOTE, "Error interpretting list length");
+                return (FALSE);
+        }
+        if (num_wclist == 0)
+                return (FALSE);
+
+        first = ncl = (struct clist *)
+                kmem_zalloc(num_wclist*sizeof(struct clist), KM_SLEEP);
+        if (!first) {
+                cmn_err(CE_NOTE, "Not able to allocate memory");
+                return (FALSE);
+        }
+
+        for (i = 0; i < num_wclist; i++, ncl++) {
+                if (!xdr_uint32(xdrs, &ncl->c_dmemhandle.mrc_rmr) ||
+                    !xdr_uint32(xdrs, &ncl->c_len) ||
+                    !xdr_uint64(xdrs, &ncl->c_daddr))
+                        goto fail;
+
+                if (ncl->c_len > MAX_SVC_XFER_SIZE) {
+                        cmn_err(CE_NOTE, "reply chunk length too big");
+                        ncl->c_len = MAX_SVC_XFER_SIZE;
+                }
+                if (!(ncl->c_dmemhandle.mrc_rmr && (ncl->c_len > 0) &&
+                    ncl->c_daddr))
+                        cmn_err(CE_WARN,"Client sent invalid segment address\n");
+                /* A wrapped 32-bit total would under-size the buffer. */
+                if (wcl_length + ncl->c_len < wcl_length)
+                        goto fail;
+                wcl_length += ncl->c_len;
+                if (i > 0)
+                        ncl[-1].c_next = ncl;
+        }
+
+        long_rpc.type = CHUNK_BUFFER;
+#ifdef SERVER_REG_CACHE
+        long_rpc.long_reply_buf = RDMA_GET_SERVER_CACHE_BUF(conn,wcl_length);
+        /* Check the cache entry itself before touching its buffer. */
+        if (long_rpc.long_reply_buf == NULL)
+                goto fail;
+        memp = long_rpc.addr = long_rpc.long_reply_buf->lrc_buf;
+#else
+        memp = long_rpc.addr = kmem_zalloc(wcl_length, KM_SLEEP);
+#endif
+        for (i = 0, ncl = first; i < num_wclist; i++, ncl++) {
+#ifdef SERVER_REG_CACHE
+                ncl->long_reply_buf = (uint64)long_rpc.long_reply_buf;
+#endif
+                ncl->c_saddr = (uint64_t)(uintptr_t)memp;
+                memp += ncl->c_len;
+        }
+        *clist = first;
+        return (TRUE);
+fail:
+        kmem_free(first, num_wclist*sizeof(struct clist));
+        return (FALSE);
+}
+
+/*
+ * Encode the long reply write chunk.  With seg_array_len == 0 only a FALSE
+ * discriminator is written; otherwise TRUE, the segment count, and one
+ * (rmr, c_len, c_daddr) triple per entry of lrc_entry.  Returns FALSE on
+ * XDR failure or if the list has fewer than seg_array_len entries.
+ */
+bool_t
+xdr_encode_reply_wchunk(XDR *xdrs, struct clist *lrc_entry,uint32_t seg_array_len)
+{
+        bool_t long_reply_exists = (seg_array_len > 0) ? TRUE : FALSE;
+        uint32_t length;
+        uint64 offset;
+        uint32_t i;             /* unsigned, to match seg_array_len */
+
+        if (!xdr_bool(xdrs, &long_reply_exists))
+                return (FALSE);
+        if (seg_array_len == 0)
+                return (TRUE);
+        if (!xdr_uint32(xdrs, &seg_array_len))
+                return (FALSE);
+
+        for (i = 0; i < seg_array_len; i++) {
+                if (lrc_entry == NULL)
+                        return (FALSE);
+                length = lrc_entry->c_len;
+                offset = (uint64)lrc_entry->c_daddr;
+                if (!xdr_uint32(xdrs, &lrc_entry->c_dmemhandle.mrc_rmr) ||
+                    !xdr_uint32(xdrs, &length) ||
+                    !xdr_uint64(xdrs, &offset))
+                        return (FALSE);
+                lrc_entry = lrc_entry->c_next;
+        }
+        return (TRUE);
 }