Udiff rpc_rdma.h
--- /webrev/webrev/usr/src/uts/common/rpc/rpc_rdma.h- Mon Aug 14 13:12:11 2006
+++ rpc_rdma.h Thu Aug 10 14:05:27 2006
@@ -22,10 +22,23 @@
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2006, The Ohio State University. All rights reserved.
+ *
+ * Portions of this source code were developed by the team members of
+ * The Ohio State University's Network-Based Computing Laboratory (NBCL),
+ * headed by Professor Dhabaleswar K. (DK) Panda.
+ *
+ * Acknowledgements of contributions from developers:
+ * Ranjit Noronha: noronha@cse.ohio-state.edu
+ * Lei Chai: chail@cse.ohio-state.edu
+ * Weikuan Yu: yuw@cse.ohio-state.edu
+ */
+
#ifndef _RPC_RPC_RDMA_H
#define _RPC_RPC_RDMA_H
#pragma ident "@(#)rpc_rdma.h 1.9 05/06/08 SMI"
@@ -36,15 +49,31 @@
#ifdef __cplusplus
extern "C" {
#endif
-#define RPCRDMA_VERS 0 /* Version of the RPC over RDMA protocol */
+#define RPCRDMA_VERS 1 /* Version of the RPC over RDMA protocol */
#define RDMATF_VERS 1 /* Version of the API used by RPC for RDMA */
#define RDMATF_VERS_1 1 /* Current version of RDMATF */
+/* #define DYNAMIC_CREDIT_CONTROL 1 */
+#define SERVER_REG_CACHE
+/* #define ASYNC_SERVER_DEREG */
+#define ASYNC_CLIENT_DEREG
+/* #define RPC_RDMA_INLINE */
+#ifdef RPC_RDMA_INLINE /* use much larger buffer sizes */
+#define RDMA_MINCHUNK 262144
+#define RPC_MSG_SZ 262144
+#define RPC_CL_SZ 262144
+#define MINCHUNK 262144
+#define RPC_BUF_SIZE (262144*2)
+#else
/*
+ * RDMA chunk size
+ */
+#define RDMA_MINCHUNK 1024
+/*
* The size of an RPC call or reply message
*/
#define RPC_MSG_SZ 1024
/*
@@ -59,10 +88,11 @@
/*
* Size of receive buffer
*/
#define RPC_BUF_SIZE 2048
+#endif
#define NOWAIT 0 /* don't wait for operation to complete */
#define WAIT 1 /* wait and ensure that operation is complete */
/*
@@ -69,11 +99,66 @@
* RDMA xdr buffer control and other control flags. Add new flags here,
* set them in private structure for xdr over RDMA in xdr_rdma.c
*/
#define RDMA_NOCHUNK 0x1
+#define LONG_REPLY_LEN 65536
+
+extern int credit_control_debug;
+extern int rib_long_reply_debug;
+extern int rdma_long_reply_debug;
+extern int rdma_xdr_long_reply_debug;
+extern int rdma_wlist_xdr_debug;
+extern int rdma_wlist_clnt_debug;
+extern int rdma_wlist_svc_debug;
+extern int rdma_wlist_memreg_debug;
+extern int rdma_wlist_verbose_debug;
+
+
+#define RDMA_BUFS_RQST 128 /* Num bufs requested by client */
+#define RDMA_BUFS_GRANT 126 /* Num bufs granted by server */
+
/*
+ * Credit Control Structures.
+ */
+typedef enum rdma_cc_type {
+ RDMA_CC_CLNT, /* CONN is for a client */
+ RDMA_CC_SRV /* CONN is for a server */
+} rdma_cc_type_t;
+
+/*
+ * Client side credit control data structure.
+ */
+typedef struct rdma_clnt_cred_ctrl {
+ uint32_t clnt_cc_granted_ops; /* ops the server has granted */
+ uint32_t clnt_cc_in_flight_ops; /* ops currently outstanding */
+ kcondvar_t clnt_cc_cv; /* signaled when credits free up */
+} rdma_clnt_cred_ctrl_t;
+
+/*
+ * Server side credit control data structure.
+ */
+typedef struct rdma_srv_cred_ctrl {
+ uint32_t srv_cc_buffers_granted;
+ uint32_t srv_cc_cur_buffers_used;
+ uint32_t srv_cc_posted;
+ uint32_t srv_cc_max_buf_size; /* to be determined by CCP */
+ uint32_t srv_cc_cur_buf_size; /* to be determined by CCP */
+} rdma_srv_cred_ctrl_t;
+
+typedef enum {
+ RPCCALL_RCHUNK,
+ RPCCALL_NORCHUNK
+} rpccall_read_t;
+
+typedef enum {
+ RPCCALL_WLIST,
+ RPCCALL_WCHUNK,
+ RPCCALL_NOWRITE
+} rpccall_write_t;
+
+/*
* Return codes from RDMA operations
*/
typedef enum {
RDMA_SUCCESS = 0, /* successful operation */
@@ -123,17 +208,18 @@
uint32_t mrc_rmr; /* Remote MR context, sent OTW */
union {
struct mr {
uint32_t lmr; /* Local MR context */
uint64_t linfo; /* Local memory info */
+ uint64_t lma; /* Local Mem Area Hdl */
} mr;
} lhdl;
};
#define mrc_lmr lhdl.mr.lmr
#define mrc_linfo lhdl.mr.linfo
-
+#define mrc_lma lhdl.mr.lma /* FMR: Mem Area Hdl */
/*
* The XDR offset value is used by the XDR
* routine to identify the position in the
* RPC message where the opaque object would
* normally occur. Neither the data content
@@ -144,24 +230,52 @@
* The remaining fields identify the chunk of data
* on the sender. The c_memhandle identifies a
* registered RDMA memory region and the c_addr
* and c_len fields identify the chunk within it.
*/
+
+#ifdef SERVER_REG_CACHE
+typedef struct rib_lrc_entry {
+ struct rib_lrc_entry *forw;
+ struct rib_lrc_entry *back;
+ char *lrc_buf;
+
+ uint32_t lrc_len;
+ void *avl_node;
+ bool_t registered;
+
+ struct mrc lrc_mhandle;
+ bool_t lrc_on_freed_list;
+} rib_lrc_entry_t;
+#endif
+
struct clist {
uint32 c_xdroff; /* XDR offset */
uint32 c_len; /* Length */
struct mrc c_smemhandle; /* src memory handle */
uint64 c_ssynchandle; /* src sync handle */
uint64 c_saddr; /* src address */
struct mrc c_dmemhandle; /* dst memory handle */
uint64 c_dsynchandle; /* dst sync handle */
uint64 c_daddr; /* dst address */
+ struct as *c_adspc; /* address space for saddr/daddr */
+ page_t **c_dpplist; /* page list for dest vaddr */
+ uint64 long_reply_buf; /* long-reply buffer, if any */
struct clist *c_next; /* Next chunk */
};
typedef struct clist clist;
+extern struct clist empty_cl;
+
+/*
+ * FTDO: maximum 4 MB wlist transfer size.
+ * This is defined because the rfs3_tsize service requires the
+ * svc_req struct, which we don't have in krecv.
+ */
+#define MAX_SVC_XFER_SIZE (4*1024*1024)
+
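The clist structure above forms a singly linked chunk list; each element names a registered source and destination region, and the new c_adspc, c_dpplist and long_reply_buf fields ride along per chunk. Below is a minimal sketch of hand-chaining two source chunks, assuming <sys/kmem.h>; real callers go through clist_add() (whose full signature is outside this hunk), and the buffers and handles here are placeholders.

/*
 * Illustrative sketch only: chain two source chunks. Fields not set
 * here (dest handles, c_adspc, c_dpplist, long_reply_buf) stay zeroed.
 */
static clist *
sample_two_chunk_clist(caddr_t buf0, caddr_t buf1, uint32 len,
    struct mrc *hdl0, struct mrc *hdl1)
{
	clist *cl = kmem_zalloc(2 * sizeof (clist), KM_SLEEP);

	cl[0].c_xdroff = 0;
	cl[0].c_len = len;
	cl[0].c_smemhandle = *hdl0;
	cl[0].c_saddr = (uint64)(uintptr_t)buf0;
	cl[0].c_next = &cl[1];

	cl[1].c_xdroff = 0;
	cl[1].c_len = len;
	cl[1].c_smemhandle = *hdl1;
	cl[1].c_saddr = (uint64)(uintptr_t)buf1;
	cl[1].c_next = NULL;

	return (cl);
}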
enum rdma_proc {
RDMA_MSG = 0, /* chunk list and RPC msg follow */
RDMA_NOMSG = 1, /* only chunk list follows */
RDMA_MSGP = 2, /* chunk list and RPC msg with padding follow */
RDMA_DONE = 3 /* signal completion of chunk transfer */
@@ -224,12 +338,20 @@
#define C_ERROR 0x10000000
#define C_DISCONN_PEND 0x08000000
#define C_REMOTE_DOWN 0x04000000
uint_t c_state; /* state of connection */
+ rdma_cc_type_t c_cc_type; /* client or server, for credit control */
+ union {
+ rdma_clnt_cred_ctrl_t c_clnt_cc;
+ rdma_srv_cred_ctrl_t c_srv_cc;
+ } rdma_conn_cred_ctrl_u;
kmutex_t c_lock; /* protect c_state and c_ref fields */
kcondvar_t c_cv; /* to signal when pending is done */
+#if defined(CLNT_INTERRUPT_COAL)
+ uint_t c_count; /* used for client interrupt coalescing */
+#endif
} CONN;
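The per-connection credit-control union added to CONN above carries the client-side counters and condition variable declared earlier. A minimal sketch of how a client sender might throttle itself against the server's grant follows; the struct and field names come from this header, but the helper functions, and the assumption that c_lock can cover the credit fields, are illustrative only.

/*
 * Illustrative sketch only: block until the server-granted credit
 * allows another RPC, then account for it as in flight. Assumes
 * c_lock also protects the credit-control union (an assumption).
 */
static void
clnt_cc_acquire_credit(CONN *conn)
{
	rdma_clnt_cred_ctrl_t *cc = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;

	mutex_enter(&conn->c_lock);
	while (cc->clnt_cc_in_flight_ops >= cc->clnt_cc_granted_ops)
		cv_wait(&cc->clnt_cc_cv, &conn->c_lock);
	cc->clnt_cc_in_flight_ops++;
	mutex_exit(&conn->c_lock);
}

/* On reply or error, the matching release drops the count and wakes a waiter. */
static void
clnt_cc_release_credit(CONN *conn)
{
	rdma_clnt_cred_ctrl_t *cc = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;

	mutex_enter(&conn->c_lock);
	cc->clnt_cc_in_flight_ops--;
	cv_signal(&cc->clnt_cc_cv);
	mutex_exit(&conn->c_lock);
}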
/*
* Memory management for the RDMA buffers
@@ -251,10 +373,13 @@
typedef struct rdma_buf {
rdma_btype type; /* buffer type */
int len; /* length of buffer */
caddr_t addr; /* buffer address */
struct mrc handle; /* buffer registration handle */
+#ifdef SERVER_REG_CACHE
+ rib_lrc_entry_t *long_reply_buf; /* registration-cache entry, if any */
+#endif
} rdma_buf_t;
/*
* Data transferred from plugin interrupt to svc_queuereq()
*/
@@ -277,31 +402,55 @@
rdma_stat (*rdma_rel_conn)(CONN *);
/* Server side listener start and stop routines */
void (*rdma_svc_listen)(struct rdma_svc_data *);
void (*rdma_svc_stop)(struct rdma_svc_data *);
/* Memory */
- rdma_stat (*rdma_regmem)(CONN *, caddr_t, uint_t, struct mrc *);
+ rdma_stat (*rdma_regmem)(CONN *, caddr_t, caddr_t, uint_t, struct mrc *);
rdma_stat (*rdma_deregmem)(CONN *, caddr_t, struct mrc);
- rdma_stat (*rdma_regmemsync)(CONN *, caddr_t, uint_t,
+#ifdef SERVER_REG_CACHE
+ rdma_stat (*rdma_regmemsync)(CONN *, caddr_t, caddr_t, uint_t,
+ struct mrc *, void **, void *);
+ rdma_stat (*rdma_deregmemsync)(CONN *, caddr_t, struct mrc,
+ void *, void *);
+#else
+ rdma_stat (*rdma_regmemsync)(CONN *, caddr_t, caddr_t, uint_t,
struct mrc *, void **);
rdma_stat (*rdma_deregmemsync)(CONN *, caddr_t, struct mrc,
void *);
+
+#endif
rdma_stat (*rdma_syncmem)(CONN *, void *, caddr_t, int, int);
/* Buffer */
rdma_stat (*rdma_buf_alloc)(CONN *, rdma_buf_t *);
void (*rdma_buf_free)(CONN *, rdma_buf_t *);
/* Transfer */
rdma_stat (*rdma_send)(CONN *, clist *, uint32_t);
+#if defined(CLNT_INTERRUPT_COAL)
+ rdma_stat (*rdma_send_bl)(CONN *, clist *, uint32_t);
+#endif
+#if defined(ASYNC_SERVER_DEREG)
+ rdma_stat (*rdma_send_nw)(CONN *, clist *, uint32_t, caddr_t, caddr_t, int, caddr_t, int, int, int);
+#endif
rdma_stat (*rdma_send_resp)(CONN *, clist *, uint32_t);
rdma_stat (*rdma_clnt_recvbuf)(CONN *, clist *, uint32_t);
rdma_stat (*rdma_svc_recvbuf)(CONN *, clist *);
rdma_stat (*rdma_recv)(CONN *, clist **, uint32_t);
/* RDMA */
rdma_stat (*rdma_read)(CONN *, clist *, int);
rdma_stat (*rdma_write)(CONN *, clist *, int);
/* INFO */
rdma_stat (*rdma_getinfo)(rdma_info_t *info);
+#ifdef SERVER_REG_CACHE
+ rib_lrc_entry_t *(*rdma_get_server_cache_buf)(CONN *, uint32_t);
+ void (*rdma_free_server_cache_buf)(CONN *, rib_lrc_entry_t *);
+#endif
+#ifdef DYNAMIC_CREDIT_CONTROL
+ void (*rdma_get_resource_info)(CONN *, int *, int *);
+#endif
+#if defined(ASYNC_CLIENT_DEREG)
+ void (*insert_queue)(CONN *, clist *);
+#endif
} rdmaops_t;
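rdmaops_t is the dispatch table an RDMA transport plugin supplies; the wrapper macros that follow indirect through it from the CONN. A partial sketch of how a plugin might wire it up is given below; the rib_* entry points are hypothetical placeholders (only the member names and signatures come from this header), and the conditionally compiled members would be set under the same #ifdefs.

/* Illustrative sketch only: hypothetical plugin entry points. */
static rdma_stat rib_rel_conn(CONN *);
static rdma_stat rib_send(CONN *, clist *, uint32_t);
static rdma_stat rib_recv(CONN *, clist **, uint32_t);
static rdma_stat rib_read(CONN *, clist *, int);
static rdma_stat rib_write(CONN *, clist *, int);

static rdmaops_t rib_ops;

static void
rib_init_ops(void)
{
	rib_ops.rdma_rel_conn = rib_rel_conn;
	rib_ops.rdma_send = rib_send;
	rib_ops.rdma_recv = rib_recv;
	rib_ops.rdma_read = rib_read;
	rib_ops.rdma_write = rib_write;
	/* remaining members (memory, buffer, listen/stop, cache and */
	/* credit-control hooks) are filled in the same way */
}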
/*
* RDMA operations.
@@ -313,24 +462,34 @@
(*(rdma_ops)->rdma_get_conn)(addr, addr_type, handle, conn)
#define RDMA_REL_CONN(conn) \
(*(conn)->c_rdmamod->rdma_ops->rdma_rel_conn)(conn)
-#define RDMA_REGMEM(conn, buff, len, handle) \
- (*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, buff, len, handle)
+#define RDMA_REGMEM(conn, adsp, buff, len, handle) \
+ (*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, adsp, buff, len, handle)
#define RDMA_DEREGMEM(conn, buff, handle) \
(*(conn)->c_rdmamod->rdma_ops->rdma_deregmem)(conn, buff, handle)
-#define RDMA_REGMEMSYNC(conn, buff, len, handle, synchandle) \
- (*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, buff, \
+#ifdef SERVER_REG_CACHE
+#define RDMA_REGMEMSYNC(conn, adsp, buff, len, handle, synchandle, lrc) \
+ (*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, adsp, buff, \
+ len, handle, synchandle, lrc)
+
+#define RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle, lrc) \
+ (*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff, \
+ handle, synchandle, lrc)
+#else
+#define RDMA_REGMEMSYNC(conn, adsp, buff, len, handle, synchandle) \
+ (*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, adsp, buff, \
len, handle, synchandle)
#define RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle) \
(*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff, \
handle, synchandle)
+#endif
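With these changes the registration macros take an address-space argument, and the SERVER_REG_CACHE build appends a cache-entry argument. A hedged sketch of the two call shapes follows; treating adsp as a struct as pointer cast to caddr_t and lrc as the registration-cache entry are assumptions drawn from the surrounding declarations, not a description of the actual callers.

/*
 * Illustrative sketch only: the two RDMA_REGMEMSYNC call shapes.
 * adsp-as-struct-as and lrc-as-cache-entry are assumptions.
 */
static rdma_stat
sample_regmemsync(CONN *conn, struct as *adsp, caddr_t buf, uint_t len,
    struct mrc *mhdl, void **shdl, void *lrc)
{
#ifdef SERVER_REG_CACHE
	return (RDMA_REGMEMSYNC(conn, (caddr_t)adsp, buf, len,
	    mhdl, shdl, lrc));
#else
	/* lrc is unused in the non-cache build */
	return (RDMA_REGMEMSYNC(conn, (caddr_t)adsp, buf, len,
	    mhdl, shdl));
#endif
}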
#define RDMA_SYNCMEM(conn, handle, buff, len, direction) \
(*(conn)->c_rdmamod->rdma_ops->rdma_syncmem)(conn, handle, \
buff, len, direction)
#define RDMA_BUF_ALLOC(conn, rbuf) \
@@ -339,11 +498,24 @@
#define RDMA_BUF_FREE(conn, rbuf) \
(*(conn)->c_rdmamod->rdma_ops->rdma_buf_free)(conn, rbuf)
#define RDMA_SEND(conn, sendlist, xid) \
(*(conn)->c_rdmamod->rdma_ops->rdma_send)(conn, sendlist, xid)
+#if defined(CLNT_INTERRUPT_COAL)
+#define RDMA_SEND_BL(conn, sendlist, xid) \
+ (*(conn)->c_rdmamod->rdma_ops->rdma_send_bl)(conn, sendlist, xid)
+#endif
+#if defined(ASYNC_SERVER_DEREG)
+#define RDMA_SEND_NW(conn, sendlist, xid, c, c1, c2, c3, c4, c5, c6) \
+ (*(conn)->c_rdmamod->rdma_ops->rdma_send_nw)(conn, sendlist, xid, c, c1, c2, c3, c4, c5, c6)
+#endif
+#if defined(ASYNC_CLIENT_DEREG)
+#define INSERT_QUEUE(conn, rwc) \
+ (*(conn)->c_rdmamod->rdma_ops->insert_queue)(conn, rwc)
+#endif
+
#define RDMA_SEND_RESP(conn, sendlist, xid) \
(*(conn)->c_rdmamod->rdma_ops->rdma_send_resp)(conn, sendlist, xid)
#define RDMA_CLNT_RECVBUF(conn, cl, xid) \
(*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf)(conn, cl, xid)
@@ -361,19 +533,32 @@
(*(conn)->c_rdmamod->rdma_ops->rdma_write)(conn, cl, wait)
#define RDMA_GETINFO(rdma_mod, info) \
(*(rdma_mod)->rdma_ops->rdma_getinfo)(info)
+
+#ifdef SERVER_REG_CACHE
+#define RDMA_GET_SERVER_CACHE_BUF(conn, len) \
+ (*(conn)->c_rdmamod->rdma_ops->rdma_get_server_cache_buf)(conn, len)
+
+#define RDMA_FREE_SERVER_CACHE_BUF(conn, buf) \
+ (*(conn)->c_rdmamod->rdma_ops->rdma_free_server_cache_buf)(conn, buf)
+#endif
+
+#ifdef DYNAMIC_CREDIT_CONTROL
+#define RDMA_GET_RESOURCE_INFO(conn, num, avail) \
+ (*(conn)->c_rdmamod->rdma_ops->rdma_get_resource_info)(conn, num, avail)
+#endif
+
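In SERVER_REG_CACHE builds, RDMA_GET_SERVER_CACHE_BUF and RDMA_FREE_SERVER_CACHE_BUF let the server reuse pre-registered long-reply buffers rather than registering memory on every large reply. The sketch below shows one plausible way such an entry could be attached to an rdma_buf_t; the helper functions and field mapping are illustrative assumptions, and only the macro, type, and member names come from this header.

#ifdef SERVER_REG_CACHE
/* Illustrative sketch only: borrow and later return a cached, pre-registered buffer. */
static rib_lrc_entry_t *
sample_get_cache_buf(CONN *conn, uint32_t len, rdma_buf_t *rbuf)
{
	rib_lrc_entry_t *lrc = RDMA_GET_SERVER_CACHE_BUF(conn, len);

	if (lrc != NULL) {
		rbuf->addr = lrc->lrc_buf;
		rbuf->len = (int)lrc->lrc_len;
		rbuf->handle = lrc->lrc_mhandle;
		rbuf->long_reply_buf = lrc; /* remembered for the free path */
	}
	return (lrc);
}

static void
sample_put_cache_buf(CONN *conn, rdma_buf_t *rbuf)
{
	if (rbuf->long_reply_buf != NULL) {
		RDMA_FREE_SERVER_CACHE_BUF(conn, rbuf->long_reply_buf);
		rbuf->long_reply_buf = NULL;
	}
}
#endif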
#ifdef _KERNEL
extern rdma_registry_t *rdma_mod_head;
extern krwlock_t rdma_lock; /* protects rdma_mod_head list */
extern int rdma_modloaded; /* flag for loading RDMA plugins */
extern int rdma_dev_available; /* rdma device is loaded or not */
extern kmutex_t rdma_modload_lock; /* protects rdma_modloaded flag */
extern uint_t rdma_minchunk;
extern ldi_ident_t rpcmod_li; /* needed by layered driver framework */
-
/*
* General RDMA routines
*/
extern void clist_add(struct clist **clp, uint32_t xdroff, int len,
struct mrc *shandle, caddr_t saddr,
@@ -401,10 +586,19 @@
extern bool_t xdr_clist(XDR *, clist *);
extern bool_t xdr_do_clist(XDR *, clist **);
extern uint_t xdr_getbufsize(XDR *);
unsigned int xdrrdma_sizeof(xdrproc_t func, void *data, int min_chunk);
unsigned int xdrrdma_authsize(AUTH *auth, struct cred *cred, int min_chunk);
+
+extern bool_t xdr_decode_reply_wchunk(XDR *, struct clist **, CONN *);
+extern bool_t xdr_decode_wlist(XDR *, struct clist **, bool_t *);
+extern bool_t xdr_decode_wlist_new(XDR *, struct clist **, bool_t *,
+ uint32_t *, CONN *);
+
+extern bool_t xdr_encode_wlist(XDR *, clist *, uint_t);
+extern bool_t xdr_encode_reply_wchunk(XDR *, struct clist *, uint32_t);
+
#endif /* _KERNEL */
#ifdef __cplusplus
}
#endif