Udiff rpc_rdma.h
--- /webrev/webrev/usr/src/uts/common/rpc/rpc_rdma.h-   Mon Aug 14 13:12:11 2006
+++ rpc_rdma.h  Thu Aug 10 14:05:27 2006
@@ -22,10 +22,23 @@
 /*
  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
+ /* Copyright (c) 2006, The Ohio State University. All rights reserved.
+  *
+  * Portions of this source code were developed by the team members of
+  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
+  * headed by Professor Dhabaleswar K. (DK) Panda.
+  *
+  * Acknowledgements to contributions from developers:
+  *   Ranjit Noronha: noronha@cse.ohio-state.edu
+  *   Lei Chai      : chail@cse.ohio-state.edu
+  *   Weikuan Yu    : yuw@cse.ohio-state.edu
+  *
+  */
+
 #ifndef        _RPC_RPC_RDMA_H
 #define        _RPC_RPC_RDMA_H
 
 #pragma ident  "@(#)rpc_rdma.h 1.9     05/06/08 SMI"
 
@@ -36,15 +49,31 @@
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define        RPCRDMA_VERS    0       /* Version of the RPC over RDMA protocol */
+#define        RPCRDMA_VERS    1       /* Version of the RPC over RDMA protocol */
 #define        RDMATF_VERS     1       /* Version of the API used by RPC for RDMA */
 #define        RDMATF_VERS_1   1       /* Current version of RDMATF */
 
+/* #define DYNAMIC_CREDIT_CONTROL 1 */ /* off; when defined, adds the rdma_get_resource_info op */
+#define SERVER_REG_CACHE         /* on; adds rib_lrc_entry_t, cache-buf ops, extra regmemsync arg */
+/*#define ASYNC_SERVER_DEREG */ /* off; when defined, adds the rdma_send_nw op */
+#define ASYNC_CLIENT_DEREG /* on; adds the insert_queue op and INSERT_QUEUE() */
+/*#define RPC_RDMA_INLINE  */ /* off; when defined, selects the 262144 size constants below */
+#ifdef RPC_RDMA_INLINE   /* Increase to some super-large values */
+#define        RDMA_MINCHUNK   262144          /* 256 * 1024 */
+#define        RPC_MSG_SZ      262144          /* 256 * 1024 */
+#define        RPC_CL_SZ       262144          /* 256 * 1024 */
+#define        MINCHUNK        262144          /* 256 * 1024 */
+#define        RPC_BUF_SIZE    (262144 * 2)    /* parenthesized: safe inside %, /, - expressions */
+#else
 /*
+ * RDMA chunk size
+ */
+#define        RDMA_MINCHUNK   1024
+/*
  * The size of an RPC call or reply message
  */
 #define        RPC_MSG_SZ  1024
 
 /*
@@ -59,10 +88,11 @@
 
 /*
  * Size of receive buffer
  */
 #define        RPC_BUF_SIZE    2048
+#endif
 
 #define        NOWAIT  0       /* don't wait for operation of complete */
 #define        WAIT    1       /* wait and ensure that operation is complete */
 
 /*
@@ -69,11 +99,66 @@
  * RDMA xdr buffer control and other control flags. Add new flags here,
  * set them in private structure for xdr over RDMA in xdr_rdma.c
  */
 #define        RDMA_NOCHUNK            0x1
 
+#define LONG_REPLY_LEN 65536   /* 64 * 1024 */
+
+extern int credit_control_debug;       /* declared here; definitions live outside this header */
+extern int rib_long_reply_debug;
+extern int rdma_long_reply_debug;
+extern int rdma_xdr_long_reply_debug;
+extern int rdma_wlist_xdr_debug;
+extern int rdma_wlist_clnt_debug;
+extern int rdma_wlist_svc_debug;
+extern int rdma_wlist_memreg_debug;
+extern int rdma_wlist_verbose_debug;   /* NOTE(review): these externs precede the later #ifdef _KERNEL section -- confirm intended */
+
+
+#define RDMA_BUFS_RQST          128       /* Number of buffers requested by client */
+#define RDMA_BUFS_GRANT         126       /* Number of buffers granted by server */ 
+
 /*
+ * Credit Control Structures.
+ */
+typedef enum rdma_cc_type {    /* type of the CONN c_cc_type member */
+       RDMA_CC_CLNT,           /* CONN is for a client */
+       RDMA_CC_SRV             /* CONN is for a server */
+} rdma_cc_type_t;
+
+/*
+ * Client side credit control data structure.
+ */
+typedef struct rdma_clnt_cred_ctrl {           /* embedded in CONN as rdma_conn_cred_ctrl_u.c_clnt_cc */
+        uint32_t        clnt_cc_granted_ops;    /* presumably ops granted by the server -- confirm */
+        uint32_t        clnt_cc_in_flight_ops;  /* presumably ops currently outstanding -- confirm */
+        kcondvar_t      clnt_cc_cv;             /* condvar; the lock paired with it is not shown here */
+} rdma_clnt_cred_ctrl_t;
+
+/*
+ * Server side credit control data structure.
+ */
+typedef struct rdma_srv_cred_ctrl {            /* embedded in CONN as rdma_conn_cred_ctrl_u.c_srv_cc */
+        uint32_t        srv_cc_buffers_granted; /* presumably buffers granted to the client -- confirm */
+        uint32_t        srv_cc_cur_buffers_used;        /* presumably buffers in use now -- confirm */
+        uint32_t        srv_cc_posted;          /* presumably buffers posted -- confirm */
+        uint32_t        srv_cc_max_buf_size;    /* to be determined by CCP; NOTE(review): CCP is not defined here */
+        uint32_t        srv_cc_cur_buf_size;    /* to be determined by CCP */
+} rdma_srv_cred_ctrl_t;
+
+typedef enum {                 /* not referenced in the visible part of this header */
+    RPCCALL_RCHUNK,            /* = 0; presumably: call has a read chunk -- confirm */
+    RPCCALL_NORCHUNK           /* = 1; presumably: call has no read chunk -- confirm */
+}rpccall_read_t;
+
+typedef enum {                 /* not referenced in the visible part of this header */
+    RPCCALL_WLIST,             /* = 0; presumably: call has a write list -- confirm */
+    RPCCALL_WCHUNK,            /* = 1; presumably: call has a write chunk -- confirm */
+    RPCCALL_NOWRITE            /* = 2; presumably: neither -- confirm */
+}rpccall_write_t;
+
+/*
  * Return codes from RDMA operations
  */
 typedef enum {
 
        RDMA_SUCCESS = 0,       /* successful operation */
@@ -123,17 +208,18 @@
        uint32_t        mrc_rmr;        /* Remote MR context, sent OTW */
        union {
                struct mr {
                        uint32_t        lmr;    /* Local MR context */
                        uint64_t        linfo;  /* Local memory info */
+                       uint64_t        lma;    /* Local Mem Area Hdl */
                } mr;
        } lhdl;
 };
 
 #define        mrc_lmr         lhdl.mr.lmr
 #define        mrc_linfo       lhdl.mr.linfo
-
+#define mrc_lma         lhdl.mr.lma             /* FMR : Mem Area Hdl */
 /*
  * The XDR offset value is used by the XDR
  * routine to identify the position in the
  * RPC message where the opaque object would
  * normally occur. Neither the data content
@@ -144,24 +230,52 @@
  * The remaining fields identify the chunk of data
  * on the sender.  The c_memhandle identifies a
  * registered RDMA memory region and the c_addr
  * and c_len fields identify the chunk within it.
  */
+
+#ifdef SERVER_REG_CACHE        /* NOTE: the comment just above describes struct clist, not this type */
+typedef struct rib_lrc_entry { /* type returned by the rdma_get_server_cache_buf op */
+   struct rib_lrc_entry *forw; /* link to another entry of the same type */
+   struct rib_lrc_entry *back; /* link to another entry of the same type */
+   char *lrc_buf;              /* buffer pointer */
+   
+   uint32_t lrc_len;           /* presumably the length of lrc_buf -- confirm */
+   void  *avl_node;            /* opaque here; presumably an AVL tree node -- confirm */
+   bool_t registered;          /* presumably set once lrc_mhandle is valid -- confirm */
+
+   struct mrc lrc_mhandle;     /* memory handle, same type as rdma_buf_t.handle */
+   bool_t lrc_on_freed_list;   /* flag; the freed list itself is not declared in this header */
+} rib_lrc_entry_t;
+#endif /* SERVER_REG_CACHE */
+
 struct clist {
         uint32          c_xdroff;       /* XDR offset */
         uint32          c_len;          /* Length */
         struct mrc      c_smemhandle;   /* src memory handle */
         uint64          c_ssynchandle;  /* src sync handle */
         uint64          c_saddr;        /* src address */
         struct mrc      c_dmemhandle;   /* dst memory handle */
         uint64          c_dsynchandle;  /* dst sync handle */
         uint64          c_daddr;        /* dst address */
+        struct as       *c_adspc;       /* address space for saddr/daddr */
+        page_t          **c_dpplist;    /* page list for dest vaddr */
+       uint64          long_reply_buf; /* NOTE(review): uint64 here, but rdma_buf_t.long_reply_buf is a rib_lrc_entry_t * -- confirm what this holds */
         struct clist    *c_next;        /* Next chunk */
 };
 
 typedef struct clist clist;
 
+extern struct clist empty_cl;
+
+/*
+ * FTDO: max 4 meg wlist xfer size
+ * This is defined because the rfs3_tsize service requires a
+ * svc_req struct (which we don't have in krecv).
+ */
+#define MAX_SVC_XFER_SIZE (4*1024*1024)
+
 enum rdma_proc {
        RDMA_MSG        = 0,    /* chunk list and RPC msg follow */
        RDMA_NOMSG      = 1,    /* only chunk list follows */
        RDMA_MSGP       = 2,    /* chunk list and RPC msg with padding follow */
        RDMA_DONE       = 3     /* signal completion of chunk transfer */
@@ -224,12 +338,20 @@
 #define        C_ERROR         0x10000000
 #define        C_DISCONN_PEND  0x08000000
 #define        C_REMOTE_DOWN   0x04000000
 
        uint_t          c_state;        /* state of connection */
+        rdma_cc_type_t  c_cc_type;      /* client or server, for credit cntrl */
+        union {
+                rdma_clnt_cred_ctrl_t   c_clnt_cc;
+                rdma_srv_cred_ctrl_t    c_srv_cc;
+        } rdma_conn_cred_ctrl_u;
        kmutex_t        c_lock;         /* protect c_state and c_ref fields */
        kcondvar_t      c_cv;           /* to signal when pending is done */
+#if defined (CLNT_INTERRUPT_COAL)
+        uint_t          c_count;
+#endif
 } CONN;
 
 
 /*
  * Memory management for the RDMA buffers
@@ -251,10 +373,13 @@
 typedef struct rdma_buf {
        rdma_btype      type;   /* buffer type */
        int             len;    /* length of buffer */
        caddr_t         addr;   /* buffer address */
        struct mrc      handle; /* buffer registration handle */
+#ifdef SERVER_REG_CACHE
+        rib_lrc_entry_t *long_reply_buf;        /* cache entry pointer; member exists only with SERVER_REG_CACHE */
+#endif /* SERVER_REG_CACHE */
 } rdma_buf_t;
 
 /*
  * Data transferred from plugin interrupt to svc_queuereq()
  */
@@ -277,31 +402,55 @@
        rdma_stat       (*rdma_rel_conn)(CONN *);
        /* Server side listner start and stop routines */
        void            (*rdma_svc_listen)(struct rdma_svc_data *);
        void            (*rdma_svc_stop)(struct rdma_svc_data *);
        /* Memory */
-       rdma_stat       (*rdma_regmem)(CONN *, caddr_t, uint_t, struct mrc *);
+       rdma_stat       (*rdma_regmem)(CONN *, caddr_t , caddr_t, uint_t, struct mrc *);
        rdma_stat       (*rdma_deregmem)(CONN *, caddr_t, struct mrc);
-       rdma_stat       (*rdma_regmemsync)(CONN *, caddr_t, uint_t,
+#ifdef SERVER_REG_CACHE
+       rdma_stat       (*rdma_regmemsync)(CONN *, caddr_t ,caddr_t, uint_t,
+                               struct mrc *, void **, void *);
+       rdma_stat       (*rdma_deregmemsync)(CONN *, caddr_t, struct mrc,
+                               void *, void *);
+#else
+       rdma_stat       (*rdma_regmemsync)(CONN *, caddr_t ,caddr_t, uint_t,
                                struct mrc *, void **);
        rdma_stat       (*rdma_deregmemsync)(CONN *, caddr_t, struct mrc,
                                void *);
+
+#endif
        rdma_stat       (*rdma_syncmem)(CONN *, void *, caddr_t, int, int);
        /* Buffer */
        rdma_stat       (*rdma_buf_alloc)(CONN *, rdma_buf_t *);
        void            (*rdma_buf_free)(CONN *, rdma_buf_t *);
        /* Transfer */
        rdma_stat       (*rdma_send)(CONN *, clist *, uint32_t);
+#if defined (CLNT_INTERRUPT_COAL)
+       rdma_stat       (*rdma_send_bl)(CONN *, clist *, uint32_t);
+#endif
+#if defined(ASYNC_SERVER_DEREG)
+       rdma_stat       (*rdma_send_nw)(CONN *, clist *, uint32_t, caddr_t, caddr_t , int, caddr_t ,int, int, int);
+#endif
        rdma_stat       (*rdma_send_resp)(CONN *, clist *, uint32_t);
        rdma_stat       (*rdma_clnt_recvbuf)(CONN *, clist *, uint32_t);
        rdma_stat       (*rdma_svc_recvbuf)(CONN *, clist *);
        rdma_stat       (*rdma_recv)(CONN *, clist **, uint32_t);
        /* RDMA */
        rdma_stat       (*rdma_read)(CONN *, clist *, int);
        rdma_stat       (*rdma_write)(CONN *, clist *, int);
        /* INFO */
        rdma_stat       (*rdma_getinfo)(rdma_info_t *info);
+#ifdef SERVER_REG_CACHE
+        rib_lrc_entry_t *(*rdma_get_server_cache_buf)(CONN *,uint32_t);
+        void            (*rdma_free_server_cache_buf)(CONN *, rib_lrc_entry_t *);
+#endif
+#ifdef DYNAMIC_CREDIT_CONTROL
+        void            (*rdma_get_resource_info)(CONN *, int *, int *);
+#endif
+#if defined(ASYNC_CLIENT_DEREG)
+       void    (*insert_queue)(CONN *, clist *);
+#endif
 
 } rdmaops_t;
 
 /*
  * RDMA operations.
@@ -313,24 +462,34 @@
        (*(rdma_ops)->rdma_get_conn)(addr, addr_type, handle, conn)
 
 #define        RDMA_REL_CONN(conn)     \
        (*(conn)->c_rdmamod->rdma_ops->rdma_rel_conn)(conn)
 
-#define        RDMA_REGMEM(conn, buff, len, handle)    \
-       (*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, buff, len, handle)
+#define        RDMA_REGMEM(conn, adsp, buff, len, handle)      \
+       (*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, adsp, buff, len, handle)
 
 #define        RDMA_DEREGMEM(conn, buff, handle)       \
        (*(conn)->c_rdmamod->rdma_ops->rdma_deregmem)(conn, buff, handle)
 
-#define        RDMA_REGMEMSYNC(conn, buff, len, handle, synchandle)    \
-       (*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, buff, \
+#ifdef SERVER_REG_CACHE
+#define        RDMA_REGMEMSYNC(conn, adsp, buff, len, handle, synchandle, lrc) \
+       (*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, adsp, buff, \
+           len, handle, synchandle, lrc)
+
+#define        RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle, lrc)  \
+       (*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff, \
+           handle, synchandle, lrc)
+#else
+#define        RDMA_REGMEMSYNC(conn, adsp, buff, len, handle, synchandle)      \
+       (*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, adsp, buff, \
            len, handle, synchandle)
 
 #define        RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle)       \
        (*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff, \
            handle, synchandle)
 
+#endif
 #define        RDMA_SYNCMEM(conn, handle, buff, len, direction)        \
        (*(conn)->c_rdmamod->rdma_ops->rdma_syncmem)(conn, handle, \
            buff, len, direction)
 
 #define        RDMA_BUF_ALLOC(conn, rbuf)      \
@@ -339,11 +498,24 @@
 #define        RDMA_BUF_FREE(conn, rbuf)       \
        (*(conn)->c_rdmamod->rdma_ops->rdma_buf_free)(conn, rbuf)
 
 #define        RDMA_SEND(conn, sendlist, xid)  \
        (*(conn)->c_rdmamod->rdma_ops->rdma_send)(conn, sendlist, xid)
+#if defined (CLNT_INTERRUPT_COAL)      /* calls the plugin's rdma_send_bl op; this switch is not defined in the visible header */
+#define        RDMA_SEND_BL(conn, sendlist, xid)       \
+       (*(conn)->c_rdmamod->rdma_ops->rdma_send_bl)(conn, sendlist, xid)
 
+#endif /* CLNT_INTERRUPT_COAL */
+#if defined(ASYNC_SERVER_DEREG)        /* calls the plugin's rdma_send_nw op; c..c6 are its seven trailing args, passed through unchanged */
+#define        RDMA_SEND_NW(conn, sendlist, xid, c, c1, c2, c3, c4, c5, c6) \
+       (*(conn)->c_rdmamod->rdma_ops->rdma_send_nw)(conn, sendlist, xid, c, c1, c2, c3, c4, c5, c6)
+#endif /* ASYNC_SERVER_DEREG */
+#if defined(ASYNC_CLIENT_DEREG)        /* calls the plugin's insert_queue op (CONN *, clist *); returns void */
+#define INSERT_QUEUE(conn,rwc)  \
+       (*(conn)->c_rdmamod->rdma_ops->insert_queue)(conn,rwc)
+#endif /* ASYNC_CLIENT_DEREG */
+
 #define        RDMA_SEND_RESP(conn, sendlist, xid)     \
        (*(conn)->c_rdmamod->rdma_ops->rdma_send_resp)(conn, sendlist, xid)
 
 #define        RDMA_CLNT_RECVBUF(conn, cl, xid)        \
        (*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf)(conn, cl, xid)
@@ -361,19 +533,32 @@
        (*(conn)->c_rdmamod->rdma_ops->rdma_write)(conn, cl, wait)
 
 #define        RDMA_GETINFO(rdma_mod, info)    \
        (*(rdma_mod)->rdma_ops->rdma_getinfo)(info)
 
+
+#ifdef SERVER_REG_CACHE        /* wrappers for the plugin ops that hand out / take back rib_lrc_entry_t buffers */
+#define RDMA_GET_SERVER_CACHE_BUF(conn, len)      \
+         (*(conn)->c_rdmamod->rdma_ops->rdma_get_server_cache_buf)(conn, len)
+
+#define RDMA_FREE_SERVER_CACHE_BUF(conn, buf)        \
+         (*(conn)->c_rdmamod->rdma_ops->rdma_free_server_cache_buf)(conn, buf)
+#endif /* SERVER_REG_CACHE */
+
+#ifdef DYNAMIC_CREDIT_CONTROL  /* calls the plugin's rdma_get_resource_info op (CONN *, int *, int *); returns void */
+#define RDMA_GET_RESOURCE_INFO(conn, num, avail)      \
+         (*(conn)->c_rdmamod->rdma_ops->rdma_get_resource_info)(conn, num, avail)
+#endif /* DYNAMIC_CREDIT_CONTROL */
+
 #ifdef _KERNEL
 extern rdma_registry_t *rdma_mod_head;
 extern krwlock_t rdma_lock;            /* protects rdma_mod_head list */
 extern int rdma_modloaded;             /* flag for loading RDMA plugins */
 extern int rdma_dev_available;         /* rdma device is loaded or not */
 extern kmutex_t rdma_modload_lock;     /* protects rdma_modloaded flag */
 extern uint_t rdma_minchunk;
 extern ldi_ident_t rpcmod_li;          /* needed by layed driver framework */
-
 /*
  * General RDMA routines
  */
 extern void clist_add(struct clist **clp, uint32_t xdroff, int len,
        struct mrc *shandle, caddr_t saddr,
@@ -401,10 +586,19 @@
 extern bool_t xdr_clist(XDR *, clist *);
 extern bool_t xdr_do_clist(XDR *, clist **);
 extern uint_t xdr_getbufsize(XDR *);
 unsigned int xdrrdma_sizeof(xdrproc_t func, void *data, int min_chunk);
 unsigned int xdrrdma_authsize(AUTH *auth, struct cred *cred, int min_chunk);
+/* XDR decode declarations (wlist / reply_wchunk); the definitions are outside this header. */
+extern bool_t xdr_decode_reply_wchunk(XDR *, struct clist **,CONN *conn);
+extern bool_t xdr_decode_wlist(XDR *xdrs, struct clist **, bool_t *);
+extern bool_t xdr_decode_wlist_new(XDR *xdrs, struct clist **, bool_t *,
+        uint32_t *,CONN *);
+/* Matching encode-side declarations. */
+extern bool_t xdr_encode_wlist(XDR *, clist *, uint_t); 
+extern bool_t xdr_encode_reply_wchunk(XDR *, struct clist *, uint32_t seg_array_len); 
+
 #endif /* _KERNEL */
 
 #ifdef __cplusplus
 }
 #endif