Cdiff rpc_rdma.h
*** /webrev/webrev/usr/src/uts/common/rpc/rpc_rdma.h-   Mon Aug 14 13:12:11 2006
--- rpc_rdma.h  Thu Aug 10 14:05:27 2006

*** 22,31 **** --- 22,44 ---- /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ + /* Copyright (c) 2006, The Ohio State University. All rights reserved. + * + * Portions of this source code were developed by the team members of + * The Ohio State University's Network-Based Computing Laboratory (NBCL), + * headed by Professor Dhabaleswar K. (DK) Panda. + * + * Acknowledgements for contributions from developers: + * Ranjit Noronha: noronha@cse.ohio-state.edu + * Lei Chai : chail@cse.ohio-state.edu + * Weikuan Yu : yuw@cse.ohio-state.edu + * + */ + #ifndef _RPC_RPC_RDMA_H #define _RPC_RPC_RDMA_H #pragma ident "@(#)rpc_rdma.h 1.9 05/06/08 SMI"
*** 36,50 **** #ifdef __cplusplus extern "C" { #endif ! #define RPCRDMA_VERS 0 /* Version of the RPC over RDMA protocol */ #define RDMATF_VERS 1 /* Version of the API used by RPC for RDMA */ #define RDMATF_VERS_1 1 /* Current version of RDMATF */ /* * The size of an RPC call or reply message */ #define RPC_MSG_SZ 1024 /* --- 49,79 ---- #ifdef __cplusplus extern "C" { #endif ! #define RPCRDMA_VERS 1 /* Version of the RPC over RDMA protocol */ #define RDMATF_VERS 1 /* Version of the API used by RPC for RDMA */ #define RDMATF_VERS_1 1 /* Current version of RDMATF */ + /* #define DYNAMIC_CREDIT_CONTROL 1 */ + #define SERVER_REG_CACHE + /*#define ASYNC_SERVER_DEREG */ + #define ASYNC_CLIENT_DEREG + /*#define RPC_RDMA_INLINE */ + #ifdef RPC_RDMA_INLINE /* Increase to some super-large values */ + #define RDMA_MINCHUNK 262144 + #define RPC_MSG_SZ 262144 + #define RPC_CL_SZ 262144 + #define MINCHUNK 262144 + #define RPC_BUF_SIZE 262144*2 + #else /* + * RDMA chunk size + */ + #define RDMA_MINCHUNK 1024 + /* * The size of an RPC call or reply message */ #define RPC_MSG_SZ 1024 /*
*** 59,68 **** --- 88,98 ---- /* * Size of receive buffer */ #define RPC_BUF_SIZE 2048 + #endif #define NOWAIT 0 /* don't wait for operation of complete */ #define WAIT 1 /* wait and ensure that operation is complete */ /*
*** 69,79 **** --- 99,164 ---- * RDMA xdr buffer control and other control flags. Add new flags here, * set them in private structure for xdr over RDMA in xdr_rdma.c */ #define RDMA_NOCHUNK 0x1 + #define LONG_REPLY_LEN 65536 + + extern int credit_control_debug; + extern int rib_long_reply_debug; + extern int rdma_long_reply_debug; + extern int rdma_xdr_long_reply_debug; + extern int rdma_wlist_xdr_debug; + extern int rdma_wlist_clnt_debug; + extern int rdma_wlist_svc_debug; + extern int rdma_wlist_memreg_debug; + extern int rdma_wlist_verbose_debug; + + + #define RDMA_BUFS_RQST 128 /* Num bufs requested by client */ + #define RDMA_BUFS_GRANT 126 /* Num bufs granted by server */ + /* + * Credit Control Structures. + */ + typedef enum rdma_cc_type { + RDMA_CC_CLNT, /* CONN is for a client */ + RDMA_CC_SRV /* CONN is for a server */ + } rdma_cc_type_t; + + /* + * Client side credit control data structure. + */ + typedef struct rdma_clnt_cred_ctrl { + uint32_t clnt_cc_granted_ops; + uint32_t clnt_cc_in_flight_ops; + kcondvar_t clnt_cc_cv; + } rdma_clnt_cred_ctrl_t; + + /* + * Server side credit control data structure. + */ + typedef struct rdma_srv_cred_ctrl { + uint32_t srv_cc_buffers_granted; + uint32_t srv_cc_cur_buffers_used; + uint32_t srv_cc_posted; + uint32_t srv_cc_max_buf_size; /* to be determined by CCP */ + uint32_t srv_cc_cur_buf_size; /* to be determined by CCP */ + } rdma_srv_cred_ctrl_t; + + typedef enum { + RPCCALL_RCHUNK, + RPCCALL_NORCHUNK + }rpccall_read_t; + + typedef enum { + RPCCALL_WLIST, + RPCCALL_WCHUNK, + RPCCALL_NOWRITE + }rpccall_write_t; + + /* * Return codes from RDMA operations */ typedef enum { RDMA_SUCCESS = 0, /* successful operation */
*** 123,139 **** uint32_t mrc_rmr; /* Remote MR context, sent OTW */ union { struct mr { uint32_t lmr; /* Local MR context */ uint64_t linfo; /* Local memory info */ } mr; } lhdl; }; #define mrc_lmr lhdl.mr.lmr #define mrc_linfo lhdl.mr.linfo ! /* * The XDR offset value is used by the XDR * routine to identify the position in the * RPC message where the opaque object would * normally occur. Neither the data content --- 208,225 ---- uint32_t mrc_rmr; /* Remote MR context, sent OTW */ union { struct mr { uint32_t lmr; /* Local MR context */ uint64_t linfo; /* Local memory info */ + uint64_t lma; /* Local Mem Area Hdl */ } mr; } lhdl; }; #define mrc_lmr lhdl.mr.lmr #define mrc_linfo lhdl.mr.linfo ! #define mrc_lma lhdl.mr.lma /* FMR : Mem Area Hdl */ /* * The XDR offset value is used by the XDR * routine to identify the position in the * RPC message where the opaque object would * normally occur. Neither the data content
*** 144,167 **** --- 230,281 ---- * The remaining fields identify the chunk of data * on the sender. The c_memhandle identifies a * registered RDMA memory region and the c_addr * and c_len fields identify the chunk within it. */ + + #ifdef SERVER_REG_CACHE + typedef struct rib_lrc_entry { + struct rib_lrc_entry *forw; + struct rib_lrc_entry *back; + char *lrc_buf; + + uint32_t lrc_len; + void *avl_node; + bool_t registered; + + struct mrc lrc_mhandle; + bool_t lrc_on_freed_list; + } rib_lrc_entry_t; + #endif + struct clist { uint32 c_xdroff; /* XDR offset */ uint32 c_len; /* Length */ struct mrc c_smemhandle; /* src memory handle */ uint64 c_ssynchandle; /* src sync handle */ uint64 c_saddr; /* src address */ struct mrc c_dmemhandle; /* dst memory handle */ uint64 c_dsynchandle; /* dst sync handle */ uint64 c_daddr; /* dst address */ + struct as *c_adspc; /* address space for saddr/daddr */ + page_t **c_dpplist; /* page list for dest vaddr */ + uint64 long_reply_buf; struct clist *c_next; /* Next chunk */ }; typedef struct clist clist; + extern struct clist empty_cl; + + /* + * FTDO: max 4 meg wlist xfer size + * This is defined because the rfs3_tsize service requires + * svc_req struct (which we don't have that in krecv). + */ + #define MAX_SVC_XFER_SIZE (4*1024*1024) + enum rdma_proc { RDMA_MSG = 0, /* chunk list and RPC msg follow */ RDMA_NOMSG = 1, /* only chunk list follows */ RDMA_MSGP = 2, /* chunk list and RPC msg with padding follow */ RDMA_DONE = 3 /* signal completion of chunk transfer */
*** 224,235 **** --- 338,357 ---- #define C_ERROR 0x10000000 #define C_DISCONN_PEND 0x08000000 #define C_REMOTE_DOWN 0x04000000 uint_t c_state; /* state of connection */ + rdma_cc_type_t c_cc_type; /* client or server, for credit cntrl */ + union { + rdma_clnt_cred_ctrl_t c_clnt_cc; + rdma_srv_cred_ctrl_t c_srv_cc; + } rdma_conn_cred_ctrl_u; kmutex_t c_lock; /* protect c_state and c_ref fields */ kcondvar_t c_cv; /* to signal when pending is done */ + #if defined (CLNT_INTERRUPT_COAL) + uint_t c_count; + #endif } CONN; /* * Memory management for the RDMA buffers
*** 251,260 **** --- 373,385 ---- typedef struct rdma_buf { rdma_btype type; /* buffer type */ int len; /* length of buffer */ caddr_t addr; /* buffer address */ struct mrc handle; /* buffer registration handle */ + #ifdef SERVER_REG_CACHE + rib_lrc_entry_t *long_reply_buf; + #endif } rdma_buf_t; /* * Data transferred from plugin interrupt to svc_queuereq() */
*** 277,307 **** rdma_stat (*rdma_rel_conn)(CONN *); /* Server side listner start and stop routines */ void (*rdma_svc_listen)(struct rdma_svc_data *); void (*rdma_svc_stop)(struct rdma_svc_data *); /* Memory */ ! rdma_stat (*rdma_regmem)(CONN *, caddr_t, uint_t, struct mrc *); rdma_stat (*rdma_deregmem)(CONN *, caddr_t, struct mrc); ! rdma_stat (*rdma_regmemsync)(CONN *, caddr_t, uint_t, struct mrc *, void **); rdma_stat (*rdma_deregmemsync)(CONN *, caddr_t, struct mrc, void *); rdma_stat (*rdma_syncmem)(CONN *, void *, caddr_t, int, int); /* Buffer */ rdma_stat (*rdma_buf_alloc)(CONN *, rdma_buf_t *); void (*rdma_buf_free)(CONN *, rdma_buf_t *); /* Transfer */ rdma_stat (*rdma_send)(CONN *, clist *, uint32_t); rdma_stat (*rdma_send_resp)(CONN *, clist *, uint32_t); rdma_stat (*rdma_clnt_recvbuf)(CONN *, clist *, uint32_t); rdma_stat (*rdma_svc_recvbuf)(CONN *, clist *); rdma_stat (*rdma_recv)(CONN *, clist **, uint32_t); /* RDMA */ rdma_stat (*rdma_read)(CONN *, clist *, int); rdma_stat (*rdma_write)(CONN *, clist *, int); /* INFO */ rdma_stat (*rdma_getinfo)(rdma_info_t *info); } rdmaops_t; /* * RDMA operations. --- 402,456 ---- rdma_stat (*rdma_rel_conn)(CONN *); /* Server side listner start and stop routines */ void (*rdma_svc_listen)(struct rdma_svc_data *); void (*rdma_svc_stop)(struct rdma_svc_data *); /* Memory */ ! rdma_stat (*rdma_regmem)(CONN *, caddr_t , caddr_t, uint_t, struct mrc *); rdma_stat (*rdma_deregmem)(CONN *, caddr_t, struct mrc); ! #ifdef SERVER_REG_CACHE ! rdma_stat (*rdma_regmemsync)(CONN *, caddr_t ,caddr_t, uint_t, ! struct mrc *, void **, void *); ! rdma_stat (*rdma_deregmemsync)(CONN *, caddr_t, struct mrc, ! void *, void *); ! #else ! 
rdma_stat (*rdma_regmemsync)(CONN *, caddr_t ,caddr_t, uint_t, struct mrc *, void **); rdma_stat (*rdma_deregmemsync)(CONN *, caddr_t, struct mrc, void *); + + #endif rdma_stat (*rdma_syncmem)(CONN *, void *, caddr_t, int, int); /* Buffer */ rdma_stat (*rdma_buf_alloc)(CONN *, rdma_buf_t *); void (*rdma_buf_free)(CONN *, rdma_buf_t *); /* Transfer */ rdma_stat (*rdma_send)(CONN *, clist *, uint32_t); + #if defined (CLNT_INTERRUPT_COAL) + rdma_stat (*rdma_send_bl)(CONN *, clist *, uint32_t); + #endif + #if defined(ASYNC_SERVER_DEREG) + rdma_stat (*rdma_send_nw)(CONN *, clist *, uint32_t, caddr_t, caddr_t , int, caddr_t ,int, int, int); + #endif rdma_stat (*rdma_send_resp)(CONN *, clist *, uint32_t); rdma_stat (*rdma_clnt_recvbuf)(CONN *, clist *, uint32_t); rdma_stat (*rdma_svc_recvbuf)(CONN *, clist *); rdma_stat (*rdma_recv)(CONN *, clist **, uint32_t); /* RDMA */ rdma_stat (*rdma_read)(CONN *, clist *, int); rdma_stat (*rdma_write)(CONN *, clist *, int); /* INFO */ rdma_stat (*rdma_getinfo)(rdma_info_t *info); + #ifdef SERVER_REG_CACHE + rib_lrc_entry_t *(*rdma_get_server_cache_buf)(CONN *,uint32_t); + void (*rdma_free_server_cache_buf)(CONN *, rib_lrc_entry_t *); + #endif + #ifdef DYNAMIC_CREDIT_CONTROL + void (*rdma_get_resource_info)(CONN *, int *, int *); + #endif + #if defined(ASYNC_CLIENT_DEREG) + void (*insert_queue)(CONN *, clist *); + #endif } rdmaops_t; /* * RDMA operations.
*** 313,336 **** (*(rdma_ops)->rdma_get_conn)(addr, addr_type, handle, conn) #define RDMA_REL_CONN(conn) \ (*(conn)->c_rdmamod->rdma_ops->rdma_rel_conn)(conn) ! #define RDMA_REGMEM(conn, buff, len, handle) \ ! (*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, buff, len, handle) #define RDMA_DEREGMEM(conn, buff, handle) \ (*(conn)->c_rdmamod->rdma_ops->rdma_deregmem)(conn, buff, handle) ! #define RDMA_REGMEMSYNC(conn, buff, len, handle, synchandle) \ ! (*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, buff, \ len, handle, synchandle) #define RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle) \ (*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff, \ handle, synchandle) #define RDMA_SYNCMEM(conn, handle, buff, len, direction) \ (*(conn)->c_rdmamod->rdma_ops->rdma_syncmem)(conn, handle, \ buff, len, direction) #define RDMA_BUF_ALLOC(conn, rbuf) \ --- 462,495 ---- (*(rdma_ops)->rdma_get_conn)(addr, addr_type, handle, conn) #define RDMA_REL_CONN(conn) \ (*(conn)->c_rdmamod->rdma_ops->rdma_rel_conn)(conn) ! #define RDMA_REGMEM(conn, adsp, buff, len, handle) \ ! (*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, adsp, buff, len, handle) #define RDMA_DEREGMEM(conn, buff, handle) \ (*(conn)->c_rdmamod->rdma_ops->rdma_deregmem)(conn, buff, handle) ! #ifdef SERVER_REG_CACHE ! #define RDMA_REGMEMSYNC(conn, adsp, buff, len, handle, synchandle, lrc) \ ! (*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, adsp, buff, \ ! len, handle, synchandle, lrc) ! ! #define RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle, lrc) \ ! (*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff, \ ! handle, synchandle, lrc) ! #else ! #define RDMA_REGMEMSYNC(conn, adsp, buff, len, handle, synchandle) \ ! 
(*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, adsp, buff, \ len, handle, synchandle) #define RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle) \ (*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff, \ handle, synchandle) + #endif #define RDMA_SYNCMEM(conn, handle, buff, len, direction) \ (*(conn)->c_rdmamod->rdma_ops->rdma_syncmem)(conn, handle, \ buff, len, direction) #define RDMA_BUF_ALLOC(conn, rbuf) \
*** 339,349 **** --- 498,521 ---- #define RDMA_BUF_FREE(conn, rbuf) \ (*(conn)->c_rdmamod->rdma_ops->rdma_buf_free)(conn, rbuf) #define RDMA_SEND(conn, sendlist, xid) \ (*(conn)->c_rdmamod->rdma_ops->rdma_send)(conn, sendlist, xid) + #if defined (CLNT_INTERRUPT_COAL) + #define RDMA_SEND_BL(conn, sendlist, xid) \ + (*(conn)->c_rdmamod->rdma_ops->rdma_send_bl)(conn, sendlist, xid) + #endif + #if defined(ASYNC_SERVER_DEREG) + #define RDMA_SEND_NW(conn, sendlist, xid, c, c1, c2, c3, c4, c5, c6) \ + (*(conn)->c_rdmamod->rdma_ops->rdma_send_nw)(conn, sendlist, xid, c, c1, c2, c3, c4, c5, c6) + #endif + #if defined(ASYNC_CLIENT_DEREG) + #define INSERT_QUEUE(conn,rwc) \ + (*(conn)->c_rdmamod->rdma_ops->insert_queue)(conn,rwc) + #endif + #define RDMA_SEND_RESP(conn, sendlist, xid) \ (*(conn)->c_rdmamod->rdma_ops->rdma_send_resp)(conn, sendlist, xid) #define RDMA_CLNT_RECVBUF(conn, cl, xid) \ (*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf)(conn, cl, xid)
*** 361,379 **** (*(conn)->c_rdmamod->rdma_ops->rdma_write)(conn, cl, wait) #define RDMA_GETINFO(rdma_mod, info) \ (*(rdma_mod)->rdma_ops->rdma_getinfo)(info) #ifdef _KERNEL extern rdma_registry_t *rdma_mod_head; extern krwlock_t rdma_lock; /* protects rdma_mod_head list */ extern int rdma_modloaded; /* flag for loading RDMA plugins */ extern int rdma_dev_available; /* rdma device is loaded or not */ extern kmutex_t rdma_modload_lock; /* protects rdma_modloaded flag */ extern uint_t rdma_minchunk; extern ldi_ident_t rpcmod_li; /* needed by layed driver framework */ - /* * General RDMA routines */ extern void clist_add(struct clist **clp, uint32_t xdroff, int len, struct mrc *shandle, caddr_t saddr, --- 533,564 ---- (*(conn)->c_rdmamod->rdma_ops->rdma_write)(conn, cl, wait) #define RDMA_GETINFO(rdma_mod, info) \ (*(rdma_mod)->rdma_ops->rdma_getinfo)(info) + + #ifdef SERVER_REG_CACHE + #define RDMA_GET_SERVER_CACHE_BUF(conn, len) \ + (*(conn)->c_rdmamod->rdma_ops->rdma_get_server_cache_buf)(conn, len) + + #define RDMA_FREE_SERVER_CACHE_BUF(conn, buf) \ + (*(conn)->c_rdmamod->rdma_ops->rdma_free_server_cache_buf)(conn, buf) + #endif + + #ifdef DYNAMIC_CREDIT_CONTROL + #define RDMA_GET_RESOURCE_INFO(conn, num, avail) \ + (*(conn)->c_rdmamod->rdma_ops->rdma_get_resource_info)(conn, num, avail) + #endif + #ifdef _KERNEL extern rdma_registry_t *rdma_mod_head; extern krwlock_t rdma_lock; /* protects rdma_mod_head list */ extern int rdma_modloaded; /* flag for loading RDMA plugins */ extern int rdma_dev_available; /* rdma device is loaded or not */ extern kmutex_t rdma_modload_lock; /* protects rdma_modloaded flag */ extern uint_t rdma_minchunk; extern ldi_ident_t rpcmod_li; /* needed by layed driver framework */ /* * General RDMA routines */ extern void clist_add(struct clist **clp, uint32_t xdroff, int len, struct mrc *shandle, caddr_t saddr,
*** 401,410 **** --- 586,604 ---- extern bool_t xdr_clist(XDR *, clist *); extern bool_t xdr_do_clist(XDR *, clist **); extern uint_t xdr_getbufsize(XDR *); unsigned int xdrrdma_sizeof(xdrproc_t func, void *data, int min_chunk); unsigned int xdrrdma_authsize(AUTH *auth, struct cred *cred, int min_chunk); + + extern bool_t xdr_decode_reply_wchunk(XDR *, struct clist **,CONN *conn); + extern bool_t xdr_decode_wlist(XDR *xdrs, struct clist **, bool_t *); + extern bool_t xdr_decode_wlist_new(XDR *xdrs, struct clist **, bool_t *, + uint32_t *,CONN *); + + extern bool_t xdr_encode_wlist(XDR *, clist *, uint_t); + extern bool_t xdr_encode_reply_wchunk(XDR *, struct clist *, uint32_t seg_array_len); + #endif /* _KERNEL */ #ifdef __cplusplus } #endif