Cdiff rpcib.c
*** /webrev/webrev/usr/src/uts/common/rpc/rpcib.c-      Mon Aug 14 13:12:11 2006
--- rpcib.c     Thu Aug 10 14:05:27 2006

*** 22,31 ****
--- 22,45 ----
  /*
   * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
   * Use is subject to license terms.
   */
+ 
+ /* Copyright (c) 2006, The Ohio State University. All rights reserved.
+  *
+  * Portions of this source code are developed by the team members of
+  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
+  * headed by Professor Dhabaleswar K. (DK) Panda.
+  *
+  * Acknowledgements for contributions from developers:
+  *    Ranjit Noronha: noronha@cse.ohio-state.edu
+  *    Lei Chai:       chail@cse.ohio-state.edu
+  *    Weikuan Yu:     yuw@cse.ohio-state.edu
+  *
+  */
+ 
  #pragma ident	"@(#)rpcib.c	1.29	06/01/25 SMI"
  
  /*
   * The rpcib plugin. Implements the interface for RDMATF's
   * interaction with IBTF.
*** 55,64 ****
--- 69,80 ----
  #include <sys/isa_defs.h>
  #include <sys/callb.h>
  #include <sys/sunddi.h>
  #include <sys/sunndi.h>
+ /* #define IB_FMR_SUP */
+ /* #define CLNT_POLL_CQ */
  #include <sys/ib/ibtl/ibti.h>
  #include <rpc/rpc.h>
  #include <rpc/ib.h>
  #include <sys/modctl.h>
*** 68,78 ****
--- 84,97 ----
  #include <sys/sockio.h>
  #include <sys/vnode.h>
  #include <sys/tiuser.h>
  #include <net/if.h>
  #include <sys/cred.h>
+ #include <rpc/rpc_rdma.h>
+ int num_clients = 0;
+ volatile uint32_t is_server = 0;
  
  extern char *inet_ntop(int, const void *, char *, int);
  
  /*
*** 105,114 ****
--- 124,136 ----
      CB_REV,             /* rev */
      nodev,              /* int (*cb_aread)() */
      nodev               /* int (*cb_awrite)() */
  };
+ 
+ 
+ 
  /*
   * Device options
   */
  static struct dev_ops rpcib_ops = {
      DEVO_REV,           /* devo_rev, */
*** 138,158 ****
      MODREV_1, (void *)&rib_modldrv, NULL
  };
  
  /*
   * rib_stat: private data pointer used when registering
   *    with the IBTF. It is returned to the consumer
   *    in all callbacks.
   */
  static rpcib_state_t *rib_stat = NULL;
  
! #define RNR_RETRIES 2
  #define MAX_PORTS   2
  
! int preposted_rbufs = 16;
  int send_threshold = 1;
  
  /*
   * State of the plugin.
   * ACCEPT = accepting new connections and requests.
--- 160,200 ----
      MODREV_1, (void *)&rib_modldrv, NULL
  };
  
+ #ifdef SERVER_REG_CACHE
+ typedef struct cache_struct {
+     avl_node_t      avl_link;
+     rib_lrc_entry_t r;
+     uint32_t        len;
+     uint32_t        elements;
+     kmutex_t        node_lock;
+ } cache_avl_struct_t;
+ 
+ 
+ #if 1
+ int rib_total_buffers = 0;
+ #endif
+ #endif
  
  /*
   * rib_stat: private data pointer used when registering
   *    with the IBTF. It is returned to the consumer
   *    in all callbacks.
   */
  static rpcib_state_t *rib_stat = NULL;
  
! #define RNR_RETRIES IBT_RNR_INFINITE_RETRY
  #define MAX_PORTS   2
  
! #ifdef IB_FMR_SUP
! #define IB_FMR_DIRTY_MARK 32
! #define IB_FMR_MAX_SIZE   1048576
! /* #define IB_FMR_MAX_SIZE 32768 */
! #endif
! 
! int preposted_rbufs = RDMA_BUFS_GRANT;
  int send_threshold = 1;
  
  /*
   * State of the plugin.
   * ACCEPT = accepting new connections and requests.
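Editor's note: the cache_avl_struct_t added above keys a bucket of reusable registered buffers by length; the embedded rib_lrc_entry_t `r` is the head of a circular doubly linked free list that later code manipulates with insque/remque. A minimal user-space sketch of that bucket shape, with hypothetical names (lrc_entry_t, bucket_init, bucket_put, bucket_get) standing in for the kernel pieces:

#include <stddef.h>

/* Hypothetical stand-in for rib_lrc_entry_t: a circular list node. */
typedef struct lrc_entry {
	struct lrc_entry *forw;
	struct lrc_entry *back;
	void		 *buf;
} lrc_entry_t;

/* One bucket per buffer length, as in cache_avl_struct_t. */
typedef struct cache_bucket {
	lrc_entry_t	r;		/* circular list head */
	unsigned	len;		/* buffer size this bucket serves */
	unsigned	elements;	/* free buffers currently queued */
} cache_bucket_t;

static void
bucket_init(cache_bucket_t *b, unsigned len)
{
	b->r.forw = b->r.back = &b->r;	/* empty: head points at itself */
	b->len = len;
	b->elements = 0;
}

/* insque-style insert at the head of the circular list. */
static void
bucket_put(cache_bucket_t *b, lrc_entry_t *e)
{
	e->forw = b->r.forw;
	e->back = &b->r;
	b->r.forw->back = e;
	b->r.forw = e;
	b->elements++;
}

/* remque-style removal; NULL when the bucket is empty. */
static lrc_entry_t *
bucket_get(cache_bucket_t *b)
{
	lrc_entry_t *e = b->r.forw;

	if (e == &b->r)
		return (NULL);
	e->back->forw = e->forw;
	e->forw->back = e->back;
	b->elements--;
	return (e);
}

int
main(void)
{
	cache_bucket_t b;
	lrc_entry_t e = { NULL, NULL, NULL };

	bucket_init(&b, 8192);
	bucket_put(&b, &e);
	return (bucket_get(&b) == &e ? 0 : 1);
}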
*** 167,188 ****
  /*
   * RPCIB RDMATF operations
   */
  static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
  static rdma_stat rib_disconnect(CONN *conn);
  static void rib_listen(struct rdma_svc_data *rd);
  static void rib_listen_stop(struct rdma_svc_data *rd);
! static rdma_stat rib_registermem(CONN *conn, caddr_t buf, uint_t buflen,
! 	struct mrc *buf_handle);
  static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
  	struct mrc buf_handle);
! static rdma_stat rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen,
! 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle);
  static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
  	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle);
  static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
  	caddr_t buf, int len, int cpu);
  static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
--- 209,245 ----
  /*
   * RPCIB RDMATF operations
   */
+ #if defined(MEASURE_POOL_DEPTH)
+ static void rib_posted_rbufs(uint32_t x) { return; }
+ #endif
  static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
  static rdma_stat rib_disconnect(CONN *conn);
  static void rib_listen(struct rdma_svc_data *rd);
  static void rib_listen_stop(struct rdma_svc_data *rd);
! static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf,
! 	uint_t buflen, struct mrc *buf_handle);
  static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
  	struct mrc buf_handle);
! static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
! 	caddr_t buf, uint_t buflen, struct mrc *buf_handle);
! static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
! 	struct mrc buf_handle);
! #ifdef SERVER_REG_CACHE
! static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
! 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
! 	void *lrc);
! static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
! 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
! #else
! static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
! 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle);
  static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
  	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle);
+ 
+ #endif
  static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
  	caddr_t buf, int len, int cpu);
  static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
*** 190,199 ****
--- 247,266 ----
  static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
  static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
  static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
+ #if defined(CLNT_INTERRUPT_COAL)
+ static void rib_scq_free(caddr_t);
+ static rdma_stat rib_send_bl(CONN *conn, struct clist *cl, uint32_t msgid);
+ #endif
+ #if defined(ASYNC_SERVER_DEREG)
+ static rdma_stat rib_send_nw(CONN *conn, struct clist *cl, uint32_t msgid,
+ 	caddr_t, caddr_t, int, caddr_t, int, int, int);
+ #endif
+ #if defined(ASYNC_CLIENT_DEREG)
+ static void insert_queue(CONN *conn, struct clist *rwc);
+ #endif
  static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
  static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
  static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
  static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
  static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
*** 200,209 ****
--- 267,289 ----
  static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
  static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
  static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
  static rdma_stat rib_conn_release(CONN *conn);
  static rdma_stat rib_getinfo(rdma_info_t *info);
+ #ifdef DYNAMIC_CREDIT_CONTROL
+ void rib_get_resource_info(CONN *, int *, int *);
+ #endif
+ 
+ #ifdef SERVER_REG_CACHE
+ static rib_lrc_entry_t *rib_get_server_cache_buf(CONN *conn, uint32_t len);
+ static void rib_free_server_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
+ static void rib_destroy_cache(rib_hca_t *hca);
+ static void rib_server_side_cache_reclaim(void *argp);
+ static int avl_compare(const void *t1, const void *t2);
+ #endif
+ 
  static rdma_stat rib_register_ats(rib_hca_t *);
  static void rib_deregister_ats();
  static void rib_stop_services(rib_hca_t *);
  
  /*
*** 213,224 ****
  int get_interfaces(TIUSER *tiptr, int *num);
  int find_addrs(TIUSER *tiptr, char **addrs, int num_ifs);
  int get_ibd_ipaddr(rpcib_ibd_insts_t *);
  rpcib_ats_t *get_ibd_entry(ib_gid_t *, ib_pkey_t, rpcib_ibd_insts_t *);
  void rib_get_ibd_insts(rpcib_ibd_insts_t *);
  
! 
  /*
   * RDMA operations the RPCIB module exports
   */
  static rdmaops_t rib_ops = {
      rib_reachable,
--- 293,319 ----
  int get_interfaces(TIUSER *tiptr, int *num);
  int find_addrs(TIUSER *tiptr, char **addrs, int num_ifs);
  int get_ibd_ipaddr(rpcib_ibd_insts_t *);
  rpcib_ats_t *get_ibd_entry(ib_gid_t *, ib_pkey_t, rpcib_ibd_insts_t *);
  void rib_get_ibd_insts(rpcib_ibd_insts_t *);
  
+ #if defined(ASYNC_SERVER_DEREG) || defined(ASYNC_CLIENT_DEREG)
+ static int clist_deregister1(CONN *, struct clist *, bool_t);
+ #endif
  
! #if defined(ASYNC_CLIENT_DEREG)
! typedef struct async_dereg {
! 	struct async_dereg	*forw;
! 	struct async_dereg	*back;
! 	CONN			c_conn;
! 	struct clist		c_clist;
! } ASYNC;
! static void async_dereg_thread(caddr_t arg);
! extern pri_t minclsyspri;	/* priority for taskq */
! static ASYNC rqueue;
! static kmutex_t at_mutex;
! static kcondvar_t at_cond;
! #endif
  /*
   * RDMA operations the RPCIB module exports
   */
  static rdmaops_t rib_ops = {
      rib_reachable,
*** 232,248 ****
      rib_deregistermemsync,
      rib_syncmem,
      rib_reg_buf_alloc,
      rib_reg_buf_free,
      rib_send,
      rib_send_resp,
      rib_post_resp,
      rib_post_recv,
      rib_recv,
      rib_read,
      rib_write,
!     rib_getinfo
  };
  
  /*
   * RDMATF RPCIB plugin details
   */
--- 327,359 ----
      rib_deregistermemsync,
      rib_syncmem,
      rib_reg_buf_alloc,
      rib_reg_buf_free,
      rib_send,
+ #if defined(CLNT_INTERRUPT_COAL)
+     rib_send_bl,
+ #endif
+ #if defined(ASYNC_SERVER_DEREG)
+     rib_send_nw,
+ #endif
      rib_send_resp,
      rib_post_resp,
      rib_post_recv,
      rib_recv,
      rib_read,
      rib_write,
!     rib_getinfo,
! #ifdef SERVER_REG_CACHE
!     rib_get_server_cache_buf,
!     rib_free_server_cache_buf,
! #endif
! #ifdef DYNAMIC_CREDIT_CONTROL
!     rib_get_resource_info,
! #endif
! #if defined(ASYNC_CLIENT_DEREG)
!     insert_queue,
! #endif
  };
  
  /*
   * RDMATF RPCIB plugin details
   */
*** 258,269 ****
  static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
  static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
  static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
  static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
  static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
! static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
  	ibt_mr_hdl_t *, ibt_mr_desc_t *);
  static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *);
  static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
  	rib_qp_t **);
  static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
  	rib_qp_t **);
--- 369,386 ----
  static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
  static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
  static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
  static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
  static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
! #ifdef IB_FMR_SUP
! static rdma_stat rib_reg_mem_fmr(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
! 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_ma_hdl_t *, ibt_pmr_desc_t *);
! #endif
! static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
! 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
+ static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
+ 	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
  static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *);
  static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
  	rib_qp_t **);
  static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
  	rib_qp_t **);
*** 314,325 ****
   * debugging in rpcib kernel module.
   * Set it to values greater that 1 to control
   * the amount of debugging messages required.
   */
  int rib_debug = 0;
  
! 
  static int ats_running = 0;
  int
  _init(void)
  {
      int error;
--- 431,446 ----
   * debugging in rpcib kernel module.
   * Set it to values greater that 1 to control
   * the amount of debugging messages required.
   */
  int rib_debug = 0;
  
! #if defined(CLNT_POLL_CQ)
! int max_poll_count = 500;
! #endif
  static int ats_running = 0;
+ 
+ 
  int
  _init(void)
  {
      int error;
*** 571,580 ****
--- 692,702 ----
  static rdma_stat rib_rem_replylist(rib_qp_t *);
  static int rib_remreply(rib_qp_t *, struct reply *);
  static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
  static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
+ 
  /*
   * One CQ pair per HCA
   */
  static rdma_stat rib_create_cq(rib_hca_t *hca, uint32_t cq_size,
  	ibt_cq_handler_t cq_handler,
*** 631,641 ****
      rdma_stat	status;
      ibt_hca_portinfo_t	*pinfop;
      ibt_pd_flags_t	pd_flags = IBT_PD_NO_FLAGS;
      uint_t	size, cq_size;
      int		i;
  
! 
      ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
      if (ribstat->hcas == NULL)
          ribstat->hcas = kmem_zalloc(ribstat->hca_count *
              sizeof (rib_hca_t), KM_SLEEP);
--- 753,766 ----
      rdma_stat	status;
      ibt_hca_portinfo_t	*pinfop;
      ibt_pd_flags_t	pd_flags = IBT_PD_NO_FLAGS;
      uint_t	size, cq_size;
      int		i;
! #ifdef IB_FMR_SUP
!     ibt_fmr_pool_attr_t	fmr_attr;
!     uint_t		h_page_sz;
! #endif
  
      ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
      if (ribstat->hcas == NULL)
          ribstat->hcas = kmem_zalloc(ribstat->hca_count *
              sizeof (rib_hca_t), KM_SLEEP);
*** 744,754 ****
--- 869,943 ----
      if (hca->send_pool == NULL) {
          cmn_err(CE_WARN, "open_hcas: send buf pool failed\n");
          rib_rbufpool_destroy(hca, RECV_BUFFER);
          goto fail3;
      }
+ #ifdef IB_FMR_SUP
+     /* Global FMR POOL */
+     bzero(&fmr_attr, sizeof (ibt_fmr_pool_attr_t));
+     h_page_sz = hca->hca_attrs.hca_page_sz * 1024;
+ 
+     fmr_attr.fmr_max_pages_per_fmr =
+         (IB_FMR_MAX_SIZE / h_page_sz) + 2;
+     fmr_attr.fmr_pool_size = MAX_BUFS * 2;
+     fmr_attr.fmr_dirty_watermark = IB_FMR_DIRTY_MARK;
+     fmr_attr.fmr_page_sz = h_page_sz;
+     fmr_attr.fmr_cache = B_FALSE;
+     fmr_attr.fmr_flags = IBT_MR_SLEEP |
+         IBT_MR_ENABLE_LOCAL_WRITE |
+         IBT_MR_ENABLE_REMOTE_READ |
+         IBT_MR_ENABLE_REMOTE_WRITE;
+     fmr_attr.fmr_func_hdlr = NULL;
+ 
+     if (rib_debug > 1) {
+         cmn_err(CE_NOTE, "open_hcas: ibt_create_fmr_pool:");
+         cmn_err(CE_NOTE, "fmr_page_sz %d, fmr_pool_sz %d, "
+             "max_pages_per_fmr %d", fmr_attr.fmr_page_sz,
+             fmr_attr.fmr_pool_size,
+             fmr_attr.fmr_max_pages_per_fmr);
+     }
+ 
+     ibt_status = ibt_create_fmr_pool(hca->hca_hdl, hca->pd_hdl,
+         &fmr_attr, &hca->fmr_pool);
+     if (ibt_status != IBT_SUCCESS) {
+         cmn_err(CE_WARN, "open_hcas: Global FMR pool creation "
+             "failed: %d\n", ibt_status);
+         rib_rbufpool_destroy(hca, RECV_BUFFER);
+         rib_rbufpool_destroy(hca, SEND_BUFFER);
+         goto fail3;
+     }
+ #endif
+ #ifdef SERVER_REG_CACHE
+     cmn_err(CE_NOTE, "Registration Cache enabled\n");
+     {
+         hca->server_side_cache =
+             kmem_cache_create("rib_server_side_cache",
+                 sizeof (cache_avl_struct_t), 0,
+                 NULL,
+                 NULL,
+                 rib_server_side_cache_reclaim,
+                 hca, NULL, 0);
+         avl_create(&hca->avl_tree,
+             avl_compare,
+             sizeof (cache_avl_struct_t),
+             offsetof(cache_avl_struct_t, avl_link));
+         /* mutex_init(&hca->avl_lock, NULL, MUTEX_DEFAULT, NULL); */
+         rw_init(&hca->avl_rw_lock, NULL, RW_DRIVER, hca->iblock);
+         hca->avl_init = TRUE;
+     }
+ #endif
+ 
+ #if defined(ASYNC_CLIENT_DEREG)
+     rqueue.forw = rqueue.back = &rqueue;
+     mutex_init(&at_mutex, NULL, MUTEX_DEFAULT, NULL);
+     cv_init(&at_cond, NULL, CV_DEFAULT, NULL);
+     (void) thread_create(NULL, 0, async_dereg_thread, NULL, 0, &p0,
+         TS_RUN, minclsyspri);
+ #endif
  /*
   * Initialize the registered service list and
   * the lock
   */
  hca->service_list = NULL;
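Editor's note: avl_create() needs the byte offset of the embedded avl_node_t inside the cached structure; offsetof() (used in the hunk above in place of the earlier hand-rolled `(uint_t)&node.avl_link - (uint_t)&node` arithmetic) expresses that directly and avoids pointer truncation on 64-bit kernels. A small stand-alone illustration; the types here are stand-ins, not the Solaris definitions:

#include <stddef.h>
#include <stdio.h>

/* Stand-ins for avl_node_t and the cache node; names are illustrative. */
typedef struct avl_node {
	struct avl_node *left, *right, *parent;
} avl_node_t;

typedef struct cache_struct {
	avl_node_t	avl_link;	/* embedded tree linkage */
	unsigned	len;
	unsigned	elements;
} cache_avl_struct_t;

int
main(void)
{
	/*
	 * The tree code converts between "node" and "containing object"
	 * using this offset, so it must name the embedded member exactly.
	 */
	size_t off = offsetof(cache_avl_struct_t, avl_link);

	printf("avl_link lives %zu bytes into cache_avl_struct_t\n", off);
	return (0);
}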
*** 886,895 ****
--- 1075,1162 ----
              }
          }
      }
  }
  
+ #if defined(CLNT_INTERRUPT_COAL)
+ static void
+ rib_scq_free(caddr_t widd)
+ {
+     struct send_wid	*wd = (struct send_wid *)widd;
+     ibt_status_t	ibt_status;
+     ibt_wc_t		wc;
+     int			i;
+     CONN		*conn = qptoc(wd->qp);
+ 
+     wc.wc_status = RDMA_SUCCESS;
+     mutex_enter(&wd->sendwait_lock);
+     switch (wc.wc_status) {
+     case IBT_WC_SUCCESS:
+         wd->status = RDMA_SUCCESS;
+         break;
+     case IBT_WC_WR_FLUSHED_ERR:
+         wd->status = RDMA_FAILED;
+         break;
+     default:
+ /*
+  *    RC Send Q Error Code		Local state	Remote State
+  *    ====================		===========	============
+  *    IBT_WC_BAD_RESPONSE_ERR		ERROR		None
+  *    IBT_WC_LOCAL_LEN_ERR		ERROR		None
+  *    IBT_WC_LOCAL_CHAN_OP_ERR		ERROR		None
+  *    IBT_WC_LOCAL_PROTECT_ERR		ERROR		None
+  *    IBT_WC_MEM_WIN_BIND_ERR		ERROR		None
+  *    IBT_WC_REMOTE_INVALID_REQ_ERR	ERROR		ERROR
+  *    IBT_WC_REMOTE_ACCESS_ERR		ERROR		ERROR
+  *    IBT_WC_REMOTE_OP_ERR		ERROR		ERROR
+  *    IBT_WC_RNR_NAK_TIMEOUT_ERR	ERROR		None
+  *    IBT_WC_TRANS_TIMEOUT_ERR		ERROR		None
+  *    IBT_WC_WR_FLUSHED_ERR		None		None
+  */
+ #ifdef DEBUG
+         if (rib_debug > 1) {
+             if (wc.wc_status != IBT_WC_SUCCESS) {
+                 cmn_err(CE_NOTE, "rib_clnt_scq_handler: "
+                     "WR completed in error, wc.wc_status:%d, "
+                     "wc_id:%llx\n", wc.wc_status,
+                     (longlong_t)wc.wc_id);
+             }
+         }
+ #endif
+         /*
+          * Channel in error state. Set connection to
+          * ERROR and cleanup will happen either from
+          * conn_release or from rib_conn_get
+          */
+         wd->status = RDMA_FAILED;
+         mutex_enter(&conn->c_lock);
+         if (conn->c_state != C_DISCONN_PEND)
+             conn->c_state = C_ERROR;
+         mutex_exit(&conn->c_lock);
+         break;
+     }
+     if (wd->cv_sig == 1) {
+         /*
+          * Notify poster
+          */
+         cmn_err(CE_NOTE, "Some error\n");
+         cv_signal(&wd->wait_cv);
+         mutex_exit(&wd->sendwait_lock);
+     } else {
+         /*
+          * Poster not waiting for notification.
+          * Free the send buffers and send_wid
+          */
+         for (i = 0; i < wd->nsbufs; i++) {
+             rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
+                 (void *)(uintptr_t)wd->sbufaddr[i]);
+         }
+         mutex_exit(&wd->sendwait_lock);
+         (void) rib_free_sendwait(wd);
+     }
+ }
+ #endif
+ 
  /* ARGSUSED */
  static void
  rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
  {
      ibt_status_t	ibt_status;
*** 919,929 ****
              wc.wc_status, (longlong_t)wc.wc_id);
          }
  #endif
          if (wc.wc_id != NULL) {	/* XXX NULL possible ???? */
              struct send_wid *wd =
                  (struct send_wid *)(uintptr_t)wc.wc_id;
! 
              mutex_enter(&wd->sendwait_lock);
              if (wd->cv_sig == 1) {
                  /*
                   * Update completion status and notify poster
                   */
--- 1186,1217 ----
              wc.wc_status, (longlong_t)wc.wc_id);
          }
  #endif
          if (wc.wc_id != NULL) {	/* XXX NULL possible ???? */
              struct send_wid *wd =
                  (struct send_wid *)(uintptr_t)wc.wc_id;
! #ifdef ASYNC_SERVER_DEREG
!             if (wd->c1) {
!                 (void) clist_deregister1((CONN *)wd->c,
!                     (struct clist *)wd->c1, TRUE);
! #ifdef SERVER_REG_CACHE
!                 RDMA_FREE_SERVER_CACHE_BUF((CONN *)wd->c,
!                     (rib_lrc_entry_t *)
!                     (((struct clist *)wd->c1)->long_reply_buf));
! #else
!                 if (wd->c1 && wd->l1)
!                     kmem_free((void *)
!                         ((struct clist *)wd->c1)->c_saddr, wd->l1);
! #endif
!                 kmem_free((void *)(wd->c1),
!                     wd->wl * sizeof (struct clist));
!             }
!             if (wd->c2) {
!                 (void) clist_deregister1((CONN *)wd->c,
!                     (struct clist *)wd->c2, TRUE);
! #ifdef SERVER_REG_CACHE
!                 RDMA_FREE_SERVER_CACHE_BUF((CONN *)wd->c,
!                     (rib_lrc_entry_t *)
!                     (((struct clist *)wd->c2)->long_reply_buf));
! #else
!                 if (wd->l2)
!                     kmem_free((void *)
!                         ((struct clist *)wd->c2)->c_saddr, wd->l2);
! #endif
!                 kmem_free((void *)(wd->c2),
!                     wd->rl * sizeof (struct clist));
!             }
! #endif
              mutex_enter(&wd->sendwait_lock);
              if (wd->cv_sig == 1) {
                  /*
                   * Update completion status and notify poster
                   */
*** 958,988 ****
  {
      rib_qp_t	*qp;
      ibt_status_t	ibt_status;
      ibt_wc_t	wc;
      struct recv_wid	*rwid;
  
      /*
       * Re-enable cq notify here to avoid missing any
       * completion queue notification.
       */
      (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
  
      ibt_status = IBT_SUCCESS;
      while (ibt_status != IBT_CQ_EMPTY) {
          bzero(&wc, sizeof (wc));
          ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
          if (ibt_status != IBT_SUCCESS)
              return;
! 
          rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
          qp = rwid->qp;
          if (wc.wc_status == IBT_WC_SUCCESS) {
              XDR	inxdrs, *xdrs;
              uint_t	xid, vers, op, find_xid = 0;
              struct reply	*r;
              CONN *conn = qptoc(qp);
  
              xdrs = &inxdrs;
              xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
                  wc.wc_bytes_xfer, XDR_DECODE);
          /*
--- 1246,1302 ----
  {
      rib_qp_t	*qp;
      ibt_status_t	ibt_status;
      ibt_wc_t	wc;
      struct recv_wid	*rwid;
+ #if defined(CLNT_POLL_CQ)
+     uint32_t	count = 0;
+ #endif
  
      /*
       * Re-enable cq notify here to avoid missing any
       * completion queue notification.
       */
+ #if !defined(CLNT_POLL_CQ)
      (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
+ #endif
  
      ibt_status = IBT_SUCCESS;
      while (ibt_status != IBT_CQ_EMPTY) {
+ #if defined(CLNT_POLL_CQ)
+ poll_cq_again:
+ #endif
          bzero(&wc, sizeof (wc));
          ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
+ #if defined(CLNT_POLL_CQ)
+         if (ibt_status == IBT_CQ_EMPTY) {
+             count++;
+             if (count == max_poll_count) {
+                 (void) ibt_enable_cq_notify(cq_hdl,
+                     IBT_NEXT_COMPLETION);
+                 return;
+             }
+             goto poll_cq_again;
+         }
+ #endif
          if (ibt_status != IBT_SUCCESS)
+ #if defined(CLNT_POLL_CQ)
+         {
+             (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
+ #endif
              return;
! #if defined(CLNT_POLL_CQ)
!         }
!         count = 0;
! #endif
          rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
          qp = rwid->qp;
          if (wc.wc_status == IBT_WC_SUCCESS) {
              XDR	inxdrs, *xdrs;
              uint_t	xid, vers, op, find_xid = 0;
              struct reply	*r;
              CONN *conn = qptoc(qp);
+             uint32_t rdma_credit = 0;
  
              xdrs = &inxdrs;
              xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
                  wc.wc_bytes_xfer, XDR_DECODE);
          /*
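Editor's note: under CLNT_POLL_CQ the handler above spins on the CQ and only re-arms the completion interrupt after max_poll_count consecutive empty polls, trading CPU for latency. A compilable user-space sketch of that budgeted-polling shape; poll_one(), enable_interrupt() and process_completion() are hypothetical stand-ins for ibt_poll_cq() and friends:

#include <stdbool.h>
#include <stdio.h>

#define MAX_POLL_COUNT 500	/* empty polls tolerated before re-arming */

static bool poll_one(void);
static void enable_interrupt(void);
static void process_completion(void);

static void
cq_handler(void)
{
	int empty_polls = 0;

	for (;;) {
		if (poll_one()) {
			empty_polls = 0;	/* reset the budget on progress */
			process_completion();
			continue;
		}
		if (++empty_polls == MAX_POLL_COUNT) {
			/* Give up spinning; fall back to interrupts. */
			enable_interrupt();
			return;
		}
	}
}

/* Fake event source: deliver three completions, then stay empty. */
static int budget_left = 3;
static bool poll_one(void) { return (budget_left-- > 0); }
static void enable_interrupt(void) { puts("re-armed notify"); }
static void process_completion(void) { puts("completion"); }

int
main(void)
{
	cq_handler();
	return (0);
}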
*** 991,1000 ****
--- 1305,1315 ----
           */
          xid = *(uint32_t *)(uintptr_t)rwid->addr;
          /* Skip xid and set the xdr position accordingly. */
          XDR_SETPOS(xdrs, sizeof (uint32_t));
          (void) xdr_u_int(xdrs, &vers);
+         (void) xdr_u_int(xdrs, &rdma_credit);
          (void) xdr_u_int(xdrs, &op);
          XDR_DESTROY(xdrs);
          if (vers != RPCRDMA_VERS) {
              /*
               * Invalid RPC/RDMA version. Cannot interoperate.
*** 1108,1124 ****
--- 1423,1443 ----
      s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
      qp = s_recvp->qp;
      conn = qptoc(qp);
      mutex_enter(&qp->posted_rbufs_lock);
      qp->n_posted_rbufs--;
+ #if defined(MEASURE_POOL_DEPTH)
+     rib_posted_rbufs(preposted_rbufs - qp->n_posted_rbufs);
+ #endif
      if (qp->n_posted_rbufs == 0)
          cv_signal(&qp->posted_rbufs_cv);
      mutex_exit(&qp->posted_rbufs_lock);
  
      if (wc.wc_status == IBT_WC_SUCCESS) {
          XDR	inxdrs, *xdrs;
          uint_t	xid, vers, op;
+         uint32_t rdma_credit;
  
          xdrs = &inxdrs;
          /* s_recvp->vaddr stores data */
          xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
              wc.wc_bytes_xfer, XDR_DECODE);
*** 1129,1138 ****
--- 1448,1458 ----
           */
          xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
          /* Skip xid and set the xdr position accordingly. */
          XDR_SETPOS(xdrs, sizeof (uint32_t));
          if (!xdr_u_int(xdrs, &vers) ||
+             !xdr_u_int(xdrs, &rdma_credit) ||
              !xdr_u_int(xdrs, &op)) {
              rib_rbuf_free(conn, RECV_BUFFER,
                  (void *)(uintptr_t)s_recvp->vaddr);
              XDR_DESTROY(xdrs);
  #ifdef DEBUG
*** 1338,1347 ****
--- 1658,1668 ----
  static rdma_stat
  rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
  {
      rib_qp_t	*kqp = NULL;
      CONN	*conn;
+     rdma_clnt_cred_ctrl_t *cc_info;
  
      ASSERT(qp != NULL);
      *qp = NULL;
  
      kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
*** 1365,1374 ****
--- 1686,1710 ----
      mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
      mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
      mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
      cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
      mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
+ #if defined(CLNT_INTERRUPT_COAL)
+     kqp->rdmaconn.c_count = 0;
+     conn->c_count = 0;
+     bzero(&kqp->wd, sizeof (struct send_wid));
+     kqp->wd.forw = kqp->wd.back = &kqp->wd;
+ #endif
+     /*
+      * Initialize the client credit control
+      * portion of the rdmaconn struct.
+      */
+     kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
+     cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
+     cc_info->clnt_cc_granted_ops = 0;
+     cc_info->clnt_cc_in_flight_ops = 0;
+     cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
  
      *qp = kqp;
      return (RDMA_SUCCESS);
  }
*** 1378,1387 ****
--- 1714,1724 ----
  {
      rib_qp_t	*kqp = NULL;
      ibt_chan_sizes_t	chan_sizes;
      ibt_rc_chan_alloc_args_t	qp_attr;
      ibt_status_t	ibt_status;
+     rdma_srv_cred_ctrl_t *cc_info;
  
      ASSERT(qp != NULL);
      *qp = NULL;
  
      kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
*** 1439,1449 ****
--- 1776,1799 ----
      /*
       * Set the private data area to qp to be used in callbacks
       */
      ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
      kqp->rdmaconn.c_state = C_CONNECTED;
+ 
+     /*
+      * Initialize the server credit control
+      * portion of the rdmaconn struct.
+      */
+     kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
+     cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
+     cc_info->srv_cc_buffers_granted = preposted_rbufs;
+     cc_info->srv_cc_cur_buffers_used = 0;
+     cc_info->srv_cc_posted = preposted_rbufs;
+ 
      *qp = kqp;
+ 
+     num_clients++;
      return (RDMA_SUCCESS);
  fail:
      if (kqp)
          kmem_free(kqp, sizeof (rib_qp_t));
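Editor's note: both channel-create paths seed an RPC/RDMA credit-control record. The server advertises as many credits as it has pre-posted receive buffers; the client tracks granted versus in-flight calls and must stall when the grant is exhausted. A small self-contained sketch of that bookkeeping; the field and function names are illustrative, not the rdma_conn definitions:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PREPOSTED_RBUFS 34	/* stands in for the RDMA_BUFS_GRANT value */

typedef struct srv_cred {
	uint32_t granted;	/* credits advertised to the peer */
	uint32_t used;		/* receive buffers currently consumed */
	uint32_t posted;	/* receive buffers actually posted */
} srv_cred_t;

typedef struct clnt_cred {
	uint32_t granted;	/* credits the server last granted us */
	uint32_t in_flight;	/* calls sent but not yet answered */
} clnt_cred_t;

/* Client side: may we issue another call right now? */
static int
clnt_can_send(const clnt_cred_t *cc)
{
	return (cc->in_flight < cc->granted);
}

int
main(void)
{
	srv_cred_t sc = { PREPOSTED_RBUFS, 0, PREPOSTED_RBUFS };
	clnt_cred_t cc = { 0, 0 };

	cc.granted = sc.granted;	/* grant rides in the rdma_credit word */
	assert(clnt_can_send(&cc));
	cc.in_flight++;			/* one call outstanding */
	printf("in flight %u of %u granted\n", cc.in_flight, cc.granted);
	return (0);
}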
*** 1722,1733 ****
      qp_attr.rc_flags = IBT_WR_SIGNALED;
  
      chan_args.oc_path = path;
      chan_args.oc_cm_handler = rib_clnt_cm_handler;
      chan_args.oc_cm_clnt_private = (void *)rib_stat;
!     chan_args.oc_rdma_ra_out = 1;
!     chan_args.oc_rdma_ra_in = 1;
      chan_args.oc_path_retry_cnt = 2;
      chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
  
  refresh:
      rw_enter(&hca->state_lock, RW_READER);
--- 2072,2083 ----
      qp_attr.rc_flags = IBT_WR_SIGNALED;
  
      chan_args.oc_path = path;
      chan_args.oc_cm_handler = rib_clnt_cm_handler;
      chan_args.oc_cm_clnt_private = (void *)rib_stat;
!     chan_args.oc_rdma_ra_out = 4;
!     chan_args.oc_rdma_ra_in = 4;
      chan_args.oc_path_retry_cnt = 2;
      chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
  
  refresh:
      rw_enter(&hca->state_lock, RW_READER);
*** 1900,1909 ****
--- 2250,2269 ----
          kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
      }
      if (conn->c_laddr.buf != NULL) {
          kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
      }
+ 
+     /*
+      * Credit control cleanup.
+      */
+     if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
+         rdma_clnt_cred_ctrl_t *cc_info;
+         cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
+         cv_destroy(&cc_info->clnt_cc_cv);
+     }
+ 
      kmem_free(qp, sizeof (rib_qp_t));
  
      /*
       * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
       * then the hca is no longer being used.
*** 1925,1937 ****
          }
          rw_exit(&hca->srv_conn_list.conn_lock);
      }
      rw_exit(&hca->state_lock);
      }
+ 
+     num_clients--;
      return (RDMA_SUCCESS);
  }
  
+ #ifdef DYNAMIC_CREDIT_CONTROL
+ void
+ rib_get_resource_info(CONN *conn, int *current_clients, int *avail_bufs)
+ {
+     rib_qp_t	*qp = ctoqp(conn);
+     rib_hca_t	*hca = qp->hca;
+     rib_bufpool_t	*rbp = NULL;
+     bufpool_t	*bp;
+ 
+     is_server = 1;
+     rbp = hca->recv_pool;
+ 
+     if (rbp == NULL)
+         *avail_bufs = 0;
+     else {
+         bp = rbp->bpool;
+         *avail_bufs = bp->buffree;
+     }
+ 
+     *current_clients = num_clients;
+ }
+ #endif
+ 
  /*
   * Wait for send completion notification. Only on receiving a
   * notification be it a successful or error completion, free the
   * send_wid.
   */
--- 2285,2321 ----
*** 2062,2073 ****
   * Send buffers are freed here only in case of error in posting
   * on QP. If the post succeeded, the send buffers are freed upon
   * send completion in rib_sendwait() or in the scq_handler.
   */
  rdma_stat
  rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
! 	int send_sig, int cv_sig)
  {
      struct send_wid	*wdesc;
      struct clist	*clp;
      ibt_status_t	ibt_status = IBT_SUCCESS;
      rdma_stat		ret = RDMA_SUCCESS;
--- 2446,2462 ----
   * Send buffers are freed here only in case of error in posting
   * on QP. If the post succeeded, the send buffers are freed upon
   * send completion in rib_sendwait() or in the scq_handler.
   */
  rdma_stat
+ #if defined(ASYNC_SERVER_DEREG)
  rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
! 	int send_sig, int cv_sig, caddr_t c, caddr_t c1, int l1, caddr_t c2,
! 	int l2, int l3, int l4)
! #else
! rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
! 	int send_sig, int cv_sig, caddr_t *swid)
! #endif
  {
      struct send_wid	*wdesc;
      struct clist	*clp;
      ibt_status_t	ibt_status = IBT_SUCCESS;
      rdma_stat		ret = RDMA_SUCCESS;
*** 2100,2114 ****
--- 2489,2514 ----
      if (send_sig) {
          /* Set SEND_SIGNAL flag. */
          tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
          wdesc = rib_init_sendwait(msgid, cv_sig, qp);
+         *swid = (caddr_t)wdesc;
      } else {
          tx_wr.wr_flags = IBT_WR_NO_FLAGS;
          wdesc = rib_init_sendwait(msgid, 0, qp);
+         *swid = (caddr_t)wdesc;
      }
      wdesc->nsbufs = nds;
+ #if defined(ASYNC_SERVER_DEREG)
+     wdesc->c = c;
+     wdesc->c1 = c1;
+     wdesc->c2 = c2;
+     wdesc->l1 = l1;
+     wdesc->l2 = l2;
+     wdesc->wl = l3;
+     wdesc->rl = l4;
+ #endif
      for (i = 0; i < nds; i++) {
          wdesc->sbufaddr[i] = sgl[i].ds_va;
      }
  
      tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
*** 2161,2181 ****
      }
  
      return (RDMA_SUCCESS);
  }
  
  rdma_stat
  rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
  {
      rdma_stat	ret;
  
      /* send-wait & cv_signal */
!     ret = rib_send_and_wait(conn, cl, msgid, 1, 1);
      return (ret);
  }
  
! /*
   * Server interface (svc_rdma_ksend).
   * Send RPC reply and wait for RDMA_DONE.
   */
  rdma_stat
--- 2561,2632 ----
      }
  
      return (RDMA_SUCCESS);
  }
  
+ #if defined(CLNT_INTERRUPT_COAL)
+ rdma_stat
+ rib_send_bl(CONN *conn, struct clist *cl, uint32_t msgid)
+ {
+     rdma_stat	ret;
+     struct send_wid	*sd, dlist;
+     rib_qp_t	*qp = ctoqp(conn);
+     caddr_t	wd;
+ 
+     mutex_enter(&conn->c_lock);
+     if ((conn->c_count + 1) >= (preposted_rbufs / 2)) {
+         conn->c_count = 0;
+         dlist.forw = dlist.back = &dlist;
+         while (qp->wd.forw != &qp->wd) {
+             sd = qp->wd.forw;
+             remque(sd);
+             insque(sd, &dlist);
+         }
+         mutex_exit(&conn->c_lock);
+         ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
+         while (dlist.forw != &dlist) {
+             sd = dlist.forw;
+             remque(dlist.forw);
+             rib_scq_free((caddr_t)sd);
+         }
+     } else {
+         mutex_exit(&conn->c_lock);
+         wd = 0;
+         ret = rib_send_and_wait(conn, cl, msgid, 0, 0, &wd);
+         mutex_enter(&conn->c_lock);
+         conn->c_count++;
+         insque(wd, &qp->wd);
+         mutex_exit(&conn->c_lock);
+     }
+     return (ret);
+ }
+ #endif
+ 
  rdma_stat
  rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
  {
      rdma_stat	ret;
+     caddr_t	wd;
  
      /* send-wait & cv_signal */
+ #if defined(ASYNC_SERVER_DEREG)
+     ret = rib_send_and_wait(conn, cl, msgid, 1, 1, 0, 0, 0, 0, 0, 0, 0, &wd);
+ #else
!     ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
+ #endif
      return (ret);
  }
  
+ #if defined(ASYNC_SERVER_DEREG)
+ rdma_stat
+ rib_send_nw(CONN *conn, struct clist *cl, uint32_t msgid, caddr_t c,
+     caddr_t c1, int c2, caddr_t c3, int c4, int c5, int c6)
+ {
+     rdma_stat	ret;
+     caddr_t	*wid;
+ 
+     /* send-wait & cv_signal */
!     ret = rib_send_and_wait(conn, cl, msgid, 1, 0, c, c1, c2, c3, c4, c5,
!         c6, wid);
+     return (ret);
+ }
! #endif
+ 
  /*
   * Server interface (svc_rdma_ksend).
   * Send RPC reply and wait for RDMA_DONE.
   */
  rdma_stat
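Editor's note: rib_send_bl() requests a signaled completion only once per half-window of sends and parks the unsignaled work descriptors on a per-QP queue, reaping them when the next signaled send completes. A user-space sketch of that interrupt-coalescing pattern; WINDOW, post_send() and reap_deferred() are hypothetical names, not the driver's:

#include <stdio.h>

#define WINDOW 8	/* signal every WINDOW-th send, like preposted_rbufs/2 */

static int pending;	/* unsignaled sends since the last signaled one */

/* Hypothetical transport hooks. */
static void
post_send(int signaled)
{
	printf("send %s\n", signaled ? "signaled" : "silent");
}

static void
reap_deferred(void)
{
	printf("reaped %d deferred descriptors\n", pending);
	pending = 0;
}

static void
send_coalesced(void)
{
	if (pending + 1 >= WINDOW) {
		post_send(1);	/* ask for one interrupt for the batch */
		reap_deferred();
	} else {
		post_send(0);	/* suppress the completion interrupt */
		pending++;
	}
}

int
main(void)
{
	for (int i = 0; i < 20; i++)
		send_coalesced();
	return (0);
}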
*** 2182,2198 ****
  rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
  {
      rdma_stat	ret = RDMA_SUCCESS;
      struct rdma_done_list *rd;
      clock_t	timout, cv_wait_ret;
      rib_qp_t	*qp = ctoqp(conn);
  
      mutex_enter(&qp->rdlist_lock);
      rd = rdma_done_add(qp, msgid);
  
      /* No cv_signal (whether send-wait or no-send-wait) */
!     ret = rib_send_and_wait(conn, cl, msgid, 1, 0);
      if (ret != RDMA_SUCCESS) {
  #ifdef DEBUG
          cmn_err(CE_WARN, "rib_send_resp: send_and_wait "
              "failed, msgid %u, qp %p", msgid, (void *)qp);
  #endif
--- 2633,2654 ----
  rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
  {
      rdma_stat	ret = RDMA_SUCCESS;
      struct rdma_done_list *rd;
      clock_t	timout, cv_wait_ret;
+     caddr_t	wid;
      rib_qp_t	*qp = ctoqp(conn);
  
      mutex_enter(&qp->rdlist_lock);
      rd = rdma_done_add(qp, msgid);
  
      /* No cv_signal (whether send-wait or no-send-wait) */
! #if defined(ASYNC_SERVER_DEREG)
!     ret = rib_send_and_wait(conn, cl, msgid, 1, 0, 0, 0, 0, 0, 0, 0, 0, &wid);
! #else
!     ret = rib_send_and_wait(conn, cl, msgid, 1, 0, &wid);
! #endif
      if (ret != RDMA_SUCCESS) {
  #ifdef DEBUG
          cmn_err(CE_WARN, "rib_send_resp: send_and_wait "
              "failed, msgid %u, qp %p", msgid, (void *)qp);
  #endif
*** 2496,2506 ****
   */
  rdma_stat
  rib_write(CONN *conn, struct clist *cl, int wait)
  {
      ibt_send_wr_t	tx_wr;
-     int		nds;
      int		cv_sig;
      ibt_wr_ds_t	sgl[DSEG_MAX];
      struct send_wid	*wdesc;
      ibt_status_t	ibt_status;
      rdma_stat		ret = RDMA_SUCCESS;
--- 2952,2961 ----
*** 2509,2538 ****
      if (cl == NULL) {
          cmn_err(CE_WARN, "rib_write: NULL clist\n");
          return (RDMA_FAILED);
      }
  
      bzero(&tx_wr, sizeof (ibt_send_wr_t));
-     /*
-      * Remote address is at the head chunk item in list.
-      */
      tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_daddr;
      tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */
  
-     nds = 0;
-     while (cl != NULL) {
-         if (nds >= DSEG_MAX) {
-             cmn_err(CE_WARN, "rib_write: DSEG_MAX too small!");
-             return (RDMA_FAILED);
-         }
-         sgl[nds].ds_va = cl->c_saddr;
-         sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
-         sgl[nds].ds_len = cl->c_len;
-         cl = cl->c_next;
-         nds++;
-     }
- 
      if (wait) {
          tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
          cv_sig = 1;
      } else {
          tx_wr.wr_flags = IBT_WR_NO_FLAGS;
--- 2964,2983 ----
      if (cl == NULL) {
          cmn_err(CE_WARN, "rib_write: NULL clist\n");
          return (RDMA_FAILED);
      }
  
+     while ((cl != NULL)) {
+         if (cl->c_len > 0) {
      bzero(&tx_wr, sizeof (ibt_send_wr_t));
      tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_daddr;
      tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */
+     sgl[0].ds_va = cl->c_saddr;
+     sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
+     sgl[0].ds_len = cl->c_len;
  
      if (wait) {
          tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
          cv_sig = 1;
      } else {
          tx_wr.wr_flags = IBT_WR_NO_FLAGS;
*** 2541,2551 ****
      wdesc = rib_init_sendwait(0, cv_sig, qp);
      tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
      tx_wr.wr_opcode = IBT_WRC_RDMAW;
      tx_wr.wr_trans = IBT_RC_SRV;
!     tx_wr.wr_nds = nds;
      tx_wr.wr_sgl = sgl;
  
      mutex_enter(&conn->c_lock);
      if (conn->c_state & C_CONNECTED) {
          ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
--- 2986,2996 ----
      wdesc = rib_init_sendwait(0, cv_sig, qp);
      tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
      tx_wr.wr_opcode = IBT_WRC_RDMAW;
      tx_wr.wr_trans = IBT_RC_SRV;
!     tx_wr.wr_nds = 1;
      tx_wr.wr_sgl = sgl;
  
      mutex_enter(&conn->c_lock);
      if (conn->c_state & C_CONNECTED) {
          ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
*** 2565,2574 ****
--- 3010,3022 ----
          ret = rib_sendwait(qp, wdesc);
          if (ret != 0) {
              return (ret);
          }
      }
+         }
+         cl = cl->c_next;
+     }
      return (RDMA_SUCCESS);
  }
  
  /*
   * RDMA Read a buffer from the remote address.
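Editor's note: the rewritten rib_write() trades the old scatter/gather post (up to DSEG_MAX chunks in one work request) for one single-SGE RDMA write per chunk, skipping zero-length entries; each chunk now carries its own destination address and rkey. A minimal sketch of that per-chunk loop shape over a linked chunk list, in plain C with hypothetical types:

#include <stddef.h>
#include <stdio.h>

/* Simplified chunk-list element, loosely modeled on struct clist. */
struct chunk {
	struct chunk	*next;
	unsigned long	dst;	/* remote destination "address" */
	size_t		len;
};

/* Hypothetical one-SGE write; real code fills an ibt_send_wr_t per chunk. */
static int
post_one_write(const struct chunk *c)
{
	printf("RDMA-write %zu bytes to 0x%lx\n", c->len, c->dst);
	return (0);
}

static int
write_chunks(struct chunk *cl)
{
	for (; cl != NULL; cl = cl->next) {
		if (cl->len == 0)
			continue;	/* nothing to move for this chunk */
		if (post_one_write(cl) != 0)
			return (-1);
	}
	return (0);
}

int
main(void)
{
	struct chunk b = { NULL, 0x2000, 0 };
	struct chunk a = { &b, 0x1000, 512 };

	return (write_chunks(&a));
}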
*** 2683,2693 ****
      rpcib_state_t	*ribstat;
      rib_hca_t	*hca;
      rdma_stat	status = RDMA_SUCCESS;
      int		i;
      struct clist	cl;
!     rdma_buf_t	rdbuf;
      void	*buf = NULL;
      ibt_cm_req_rcv_t	cm_req_rcv;
      CONN	*conn;
      ibt_status_t	ibt_status;
      ibt_ar_t	ar_query, ar_result;
--- 3131,3141 ----
      rpcib_state_t	*ribstat;
      rib_hca_t	*hca;
      rdma_stat	status = RDMA_SUCCESS;
      int		i;
      struct clist	cl;
!     rdma_buf_t	rdbuf = {0};
      void	*buf = NULL;
      ibt_cm_req_rcv_t	cm_req_rcv;
      CONN	*conn;
      ibt_status_t	ibt_status;
      ibt_ar_t	ar_query, ar_result;
*** 2768,2779 ****
          }
      }
  #endif
  
      ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
!     ret_args->cm_ret.rep.cm_rdma_ra_out = 1;
!     ret_args->cm_ret.rep.cm_rdma_ra_in = 1;
      ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
  
      /*
       * Pre-posts RECV buffers
       */
--- 3216,3227 ----
          }
      }
  #endif
  
      ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
!     ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
!     ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
      ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
  
      /*
       * Pre-posts RECV buffers
       */
*** 3693,3736 ****
      return (0);
  }
  
  rdma_stat
! rib_registermem(CONN *conn, caddr_t buf, uint_t buflen,
  	struct mrc *buf_handle)
  {
      ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
      ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
      rdma_stat		status;
      rib_hca_t		*hca = (ctoqp(conn))->hca;
  
      /*
       * Note: ALL buffer pools use the same memory type RDMARW.
       */
!     status = rib_reg_mem(hca, buf, buflen, 0, &mr_hdl, &mr_desc);
      if (status == RDMA_SUCCESS) {
          buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
          buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
          buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
      } else {
          buf_handle->mrc_linfo = NULL;
          buf_handle->mrc_lmr = 0;
          buf_handle->mrc_rmr = 0;
      }
      return (status);
  }
  
  static rdma_stat
! rib_reg_mem(rib_hca_t *hca, caddr_t buf, uint_t size, ibt_mr_flags_t spec,
  	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
  {
      ibt_mr_attr_t	mem_attr;
      ibt_status_t	ibt_status;
- 
      mem_attr.mr_vaddr = (uintptr_t)buf;
      mem_attr.mr_len = (ib_msglen_t)size;
!     mem_attr.mr_as = NULL;
      mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
          IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
          IBT_MR_ENABLE_WINDOW_BIND | spec;
  
      rw_enter(&hca->state_lock, RW_READER);
--- 4141,4289 ----
      return (0);
  }
  
  rdma_stat
! rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
  	struct mrc *buf_handle)
  {
      ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
+ #ifdef IB_FMR_SUP
+     ibt_pmr_desc_t	pmr_desc;	/* vaddr, lkey, rkey */
+     ibt_ma_hdl_t	ma_hdl = NULL;
+ #endif
      ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
      rdma_stat		status;
      rib_hca_t		*hca = (ctoqp(conn))->hca;
  
      /*
       * Note: ALL buffer pools use the same memory type RDMARW.
       */
! #ifdef IB_FMR_SUP
!     status = rib_reg_mem_fmr(hca, adsp, buf, buflen, 0, &mr_hdl, &ma_hdl,
!         &pmr_desc);
      if (status == RDMA_SUCCESS) {
          buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
+         buf_handle->mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
+         buf_handle->mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
+         buf_handle->mrc_lma = (uintptr_t)ma_hdl;
+         goto ret_stat;
+     } else {
+         buf_handle->mrc_linfo = NULL;
+         buf_handle->mrc_lma = NULL;
+         buf_handle->mrc_lmr = 0;
+         buf_handle->mrc_rmr = 0;
+     }
+ #endif
+     status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
+     if (status == RDMA_SUCCESS) {
+         buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
          buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
          buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
      } else {
          buf_handle->mrc_linfo = NULL;
          buf_handle->mrc_lmr = 0;
          buf_handle->mrc_rmr = 0;
      }
+ ret_stat:
      return (status);
  }
  
+ #ifdef IB_FMR_SUP
  static rdma_stat
! rib_reg_mem_fmr(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
!     ibt_mr_flags_t spec, ibt_mr_hdl_t *mr_hdlp, ibt_ma_hdl_t *ma_hdlp,
!     ibt_pmr_desc_t *pmr_descp)
! {
!     ibt_va_attr_t	va_attr;
!     ibt_phys_buf_t	*paddr_list;
!     uint_t		paddr_list_len, num_paddr;
!     size_t		buf_sz = 0;
!     ibt_pmr_attr_t	pmr_attr;
!     ib_memlen_t		paddr_offset;
!     ibt_status_t	ibt_status;
!     uint_t		h_page_sz;
! 
!     if (adsp)
!         return (RDMA_FAILED);
! 
!     bzero(&va_attr, sizeof (ibt_va_attr_t));
!     va_attr.va_vaddr = (ib_vaddr_t)buf;
!     va_attr.va_len = size;
!     va_attr.va_as = (struct as *)(caddr_t)adsp;
!     va_attr.va_flags = IBT_VA_FMR | IBT_VA_SLEEP;
!     if (spec == IBT_MR_NONCOHERENT)
!         va_attr.va_flags |= IBT_VA_NONCOHERENT;
!     va_attr.va_phys_buf_min = va_attr.va_phys_buf_max = 0;
! 
!     h_page_sz = hca->hca_attrs.hca_page_sz * 1024;
!     paddr_list_len = (size / h_page_sz) + 2;
!     paddr_list = (ibt_phys_buf_t *)kmem_zalloc(sizeof (ibt_phys_buf_t) *
!         paddr_list_len, KM_NOSLEEP);
! 
!     if (rib_debug > 0) {
!         cmn_err(CE_NOTE, "fmr: vaddr %p, size %d paddr_list_len %d\n",
!             (void *)buf, size, paddr_list_len);
!     }
! 
!     ibt_status = ibt_map_mem_area(hca->hca_hdl, &va_attr, paddr_list_len,
!         paddr_list, &num_paddr, &buf_sz, &paddr_offset, ma_hdlp);
!     if (ibt_status != IBT_SUCCESS) {
!         cmn_err(CE_WARN, "rib_reg_mem_fmr: ibt_map_mem_area failed: "
!             "status %d", ibt_status);
!         kmem_free(paddr_list, sizeof (ibt_phys_buf_t) * paddr_list_len);
!         return (RDMA_FAILED);
!     }
! 
!     if (rib_debug > 0) {
!         cmn_err(CE_NOTE, "fmr: p_laddr %p, p_size %d, buf_sz %d, "
!             "p_ofset %llX\n", paddr_list[0].p_laddr,
!             paddr_list[0].p_size, buf_sz, paddr_offset);
!         cmn_err(CE_NOTE, "fmr: ibt_map_mem_area: ret %d, num_paddr %d, "
!             "spec %d\n", ibt_status, num_paddr, spec);
!     }
! 
!     bzero(&pmr_attr, sizeof (ibt_pmr_attr_t));
!     pmr_attr.pmr_iova = (ib_vaddr_t)buf;
!     pmr_attr.pmr_len = size;
!     pmr_attr.pmr_num_buf = num_paddr;
!     pmr_attr.pmr_buf_sz = buf_sz;
!     pmr_attr.pmr_buf_list = paddr_list;
!     pmr_attr.pmr_offset = paddr_offset;
!     pmr_attr.pmr_flags = spec;
!     pmr_attr.pmr_ma = *ma_hdlp;
! 
!     ibt_status = ibt_register_physical_fmr(hca->hca_hdl, hca->fmr_pool,
!         &pmr_attr, mr_hdlp, pmr_descp);
!     if (ibt_status != IBT_SUCCESS) {
!         cmn_err(CE_WARN, "rib_reg_mem_fmr: ibt_register_physical_fmr "
!             "failed: status %d", ibt_status);
!         (void) ibt_unmap_mem_area(hca->hca_hdl, *ma_hdlp);
!         *ma_hdlp = NULL;
!         kmem_free(paddr_list, sizeof (ibt_phys_buf_t) * paddr_list_len);
!         return (RDMA_FAILED);
!     }
! 
!     if (rib_debug > 0) {
!         cmn_err(CE_NOTE, "fmr: rkey: 0x%lX lkey: 0x%lX, iova: %p, "
!             "fmr_hdl %p\n", pmr_descp->pmd_rkey, pmr_descp->pmd_lkey,
!             pmr_descp->pmd_iova, (void *)*mr_hdlp);
!     }
! 
!     kmem_free(paddr_list, sizeof (ibt_phys_buf_t) * paddr_list_len);
! 
!     return (RDMA_SUCCESS);
! }
! #endif
! 
! static rdma_stat
! rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
!     ibt_mr_flags_t spec, ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
  {
      ibt_mr_attr_t	mem_attr;
      ibt_status_t	ibt_status;
  
      mem_attr.mr_vaddr = (uintptr_t)buf;
      mem_attr.mr_len = (ib_msglen_t)size;
!     mem_attr.mr_as = (struct as *)(caddr_t)adsp;
      mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
          IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
          IBT_MR_ENABLE_WINDOW_BIND | spec;
  
      rw_enter(&hca->state_lock, RW_READER);
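Editor's note: rib_reg_mem_fmr() sizes its physical-buffer list as size / page_size + 2 because an unaligned region can straddle one extra page at the front and one at the back. A quick user-space check of that bound; the constants are illustrative, standard C only:

#include <assert.h>
#include <stdio.h>

#define PAGE_SZ 4096UL

/* Pages an arbitrary [vaddr, vaddr+len) region can touch. */
static unsigned long
pages_spanned(unsigned long vaddr, unsigned long len)
{
	unsigned long first = vaddr / PAGE_SZ;
	unsigned long last = (vaddr + len - 1) / PAGE_SZ;

	return (last - first + 1);
}

int
main(void)
{
	unsigned long len = 10000;	/* about 2.5 pages */
	unsigned long bound = len / PAGE_SZ + 2;	/* the driver's estimate */

	/* Worst case: region starts just before a page boundary. */
	assert(pages_spanned(PAGE_SZ - 1, len) <= bound);
	printf("worst case %lu pages, bound %lu\n",
	    pages_spanned(PAGE_SZ - 1, len), bound);
	return (0);
}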
*** 3751,3808 ****
      }
      return (RDMA_SUCCESS);
  }
  
  rdma_stat
! rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen,
  	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle)
  {
      ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
      ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
      rdma_stat		status;
      rib_hca_t		*hca = (ctoqp(conn))->hca;
  
      /*
       * Non-coherent memory registration.
       */
!     status = rib_reg_mem(hca, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl,
          &mr_desc);
      if (status == RDMA_SUCCESS) {
          buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
          buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
          buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
          *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
      } else {
          buf_handle->mrc_linfo = NULL;
          buf_handle->mrc_lmr = 0;
          buf_handle->mrc_rmr = 0;
      }
      return (status);
  }
  
  /* ARGSUSED */
  rdma_stat
  rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
  {
      rib_hca_t *hca = (ctoqp(conn))->hca;
- 
      /*
       * Allow memory deregistration even if HCA is
       * getting detached. Need all outstanding
       * memory registrations to be deregistered
       * before HCA_DETACH_EVENT can be accepted.
       */
      (void) ibt_deregister_mr(hca->hca_hdl,
          (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
      return (RDMA_SUCCESS);
  }
  
  /* ARGSUSED */
  rdma_stat
  rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
  	RIB_SYNCMEM_HANDLE sync_handle)
  {
      (void) rib_deregistermem(conn, buf, buf_handle);
  
      return (RDMA_SUCCESS);
  }
--- 4304,4463 ----
      }
      return (RDMA_SUCCESS);
  }
  
  rdma_stat
! rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
! #ifdef SERVER_REG_CACHE
! 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
! #else
  	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle)
+ #endif
  {
      ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
+ #ifdef IB_FMR_SUP
+     ibt_pmr_desc_t	pmr_desc;	/* vaddr, lkey, rkey */
+     ibt_ma_hdl_t	ma_hdl = NULL;
+ #endif
+ #ifdef SERVER_REG_CACHE
+     rib_lrc_entry_t	*l;
+ #endif
      ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
      rdma_stat		status;
      rib_hca_t		*hca = (ctoqp(conn))->hca;
  
      /*
       * Non-coherent memory registration.
       */
! #ifdef SERVER_REG_CACHE
!     l = (rib_lrc_entry_t *)lrc;
!     if (l) {
!         if (l->registered) {
!             buf_handle->mrc_linfo = (uintptr_t)l->lrc_mhandle.mrc_linfo;
!             buf_handle->mrc_lmr = (uint32_t)l->lrc_mhandle.mrc_lmr;
!             buf_handle->mrc_rmr = (uint32_t)l->lrc_mhandle.mrc_rmr;
! #ifdef IB_FMR_SUP
!             buf_handle->mrc_lma = (uintptr_t)l->lrc_mhandle.mrc_lma;
! #endif
!             *sync_handle = (RIB_SYNCMEM_HANDLE)l->lrc_mhandle.mrc_linfo;
!             return (RDMA_SUCCESS);
!         } else {
!             /* Always register the whole buffer */
!             buf = (caddr_t)l->lrc_buf;
!             buflen = l->lrc_len;
!             /* cmn_err(CE_NOTE, "Register %p of length %d\n", buf, buflen); */
!         }
!     }
! #endif
! #ifdef IB_FMR_SUP
!     status = rib_reg_mem_fmr(hca, adsp, buf, buflen, IBT_MR_NONCOHERENT,
!         &mr_hdl, &ma_hdl, &pmr_desc);
!     if (status == RDMA_SUCCESS) {
!         buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
!         buf_handle->mrc_lma = (uintptr_t)ma_hdl;
!         buf_handle->mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
!         buf_handle->mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
!         *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
! #ifdef SERVER_REG_CACHE
!         if (l) {
!             l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
!             l->lrc_mhandle.mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
!             l->lrc_mhandle.mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
!             l->registered = TRUE;
!             l->lrc_mhandle.mrc_lma = (uintptr_t)ma_hdl;
!         }
! #endif
!         goto ret_stat;
!     } else {
!         if (rib_debug > 1)
!             cmn_err(CE_WARN, "fmr reg failed for buffer %p of "
!                 "length %d\n", (void *)buf, buflen);
!         buf_handle->mrc_linfo = NULL;
!         buf_handle->mrc_lma = NULL;
!         buf_handle->mrc_lmr = 0;
!         buf_handle->mrc_rmr = 0;
!     }
! #endif
!     status = rib_reg_mem(hca, adsp, buf, buflen, IBT_MR_NONCOHERENT,
          &mr_hdl, &mr_desc);
      if (status == RDMA_SUCCESS) {
+ #ifdef SERVER_REG_CACHE
+         if (l) {
+             l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
+             l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey;
+             l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey;
+             l->registered = TRUE;
+         }
+ #endif
          buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
          buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
          buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
          *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
      } else {
          buf_handle->mrc_linfo = NULL;
          buf_handle->mrc_lmr = 0;
          buf_handle->mrc_rmr = 0;
      }
+ ret_stat:
      return (status);
  }
  
  /* ARGSUSED */
  rdma_stat
  rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
  {
+     avl_index_t where = NULL;
+ #ifdef IB_FMR_SUP
+     ibt_status_t ibt_status;
+ #endif
      rib_hca_t *hca = (ctoqp(conn))->hca;
      /*
       * Allow memory deregistration even if HCA is
       * getting detached. Need all outstanding
       * memory registrations to be deregistered
       * before HCA_DETACH_EVENT can be accepted.
       */
+ #ifdef IB_FMR_SUP
+     if (buf_handle.mrc_lma) {
+         ibt_status = ibt_unmap_mem_area(hca->hca_hdl,
+             (ibt_ma_hdl_t)buf_handle.mrc_lma);
+         if (ibt_status != IBT_SUCCESS) {
+             cmn_err(CE_WARN, "rib_deregistermem: "
+                 "ibt_unmap_mem_area: %d failed", ibt_status);
+             return (RDMA_FAILED);
+         }
+ 
+         ibt_status = ibt_deregister_fmr(hca->hca_hdl,
+             (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
+         if (ibt_status != IBT_SUCCESS)
+             return (RDMA_FAILED);
+         return (RDMA_SUCCESS);
+     }
+ #endif
      (void) ibt_deregister_mr(hca->hca_hdl,
          (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
      return (RDMA_SUCCESS);
  }
  
  /* ARGSUSED */
  rdma_stat
  rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
+ #ifdef SERVER_REG_CACHE
+ 	RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
+ #else
  	RIB_SYNCMEM_HANDLE sync_handle)
+ #endif
  {
+ #ifdef SERVER_REG_CACHE
+     rib_lrc_entry_t *l;
+     l = (rib_lrc_entry_t *)lrc;
+     if (l)
+         if (l->registered)
+             return (RDMA_SUCCESS);
+ #endif
+ 
      (void) rib_deregistermem(conn, buf, buf_handle);
  
      return (RDMA_SUCCESS);
  }
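Editor's note: rib_registermemsync() turns the cache entry into a register-once fast path: the first use pays for registration and stashes the handle in the entry; later uses just copy the cached lkey/rkey out, and rib_deregistermemsync() deliberately leaves cached registrations alone. A compact sketch of that memoized-handle pattern with hypothetical types:

#include <stdio.h>

/* Cached buffer entry, loosely modeled on rib_lrc_entry_t. */
struct lrc_entry {
	int		registered;	/* handle below is valid */
	unsigned	lkey, rkey;	/* memoized registration handle */
};

static unsigned next_key = 0x100;

/* Hypothetical expensive registration; runs once per cached buffer. */
static void
register_buf(struct lrc_entry *l)
{
	l->lkey = next_key++;
	l->rkey = next_key++;
	l->registered = 1;
	puts("registered (slow path)");
}

static void
get_handle(struct lrc_entry *l, unsigned *lkey, unsigned *rkey)
{
	if (!l->registered)
		register_buf(l);	/* first use only */
	*lkey = l->lkey;		/* every later use: copy and go */
	*rkey = l->rkey;
}

int
main(void)
{
	struct lrc_entry l = { 0, 0, 0 };
	unsigned lk, rk;

	get_handle(&l, &lk, &rk);	/* slow */
	get_handle(&l, &lk, &rk);	/* fast: no second registration */
	printf("lkey 0x%x rkey 0x%x\n", lk, rk);
	return (0);
}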
*** 3877,3895 ****
          num * sizeof (void *), KM_SLEEP);
  
      mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
      bp->numelems = num;
  
      switch (ptype) {
          case SEND_BUFFER:
              mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
-             /* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */
              bp->rsize = RPC_MSG_SZ;
              break;
          case RECV_BUFFER:
              mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
-             /* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */
              bp->rsize = RPC_BUF_SIZE;
              break;
          default:
              goto fail;
      }
--- 4532,4549 ----
          num * sizeof (void *), KM_SLEEP);
  
      mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
      bp->numelems = num;
+ 
      switch (ptype) {
          case SEND_BUFFER:
              mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
              bp->rsize = RPC_MSG_SZ;
              break;
          case RECV_BUFFER:
              mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
              bp->rsize = RPC_BUF_SIZE;
              break;
          default:
              goto fail;
      }
*** 3901,3914 ****
      bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
      rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
          sizeof (ibt_mr_hdl_t), KM_SLEEP);
      rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
          sizeof (ibt_mr_desc_t), KM_SLEEP);
- 
      rw_enter(&hca->state_lock, RW_READER);
      if (hca->state != HCA_INITED) {
          rw_exit(&hca->state_lock);
          goto fail;
      }
      for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
          bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
          mem_attr.mr_vaddr = (uintptr_t)buf;
--- 4555,4568 ----
      bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
      rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
          sizeof (ibt_mr_hdl_t), KM_SLEEP);
      rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
          sizeof (ibt_mr_desc_t), KM_SLEEP);
      rw_enter(&hca->state_lock, RW_READER);
      if (hca->state != HCA_INITED) {
          rw_exit(&hca->state_lock);
+         cmn_err(CE_WARN, "hca->state != HCA_INITED");
          goto fail;
      }
      for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
          bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
          mem_attr.mr_vaddr = (uintptr_t)buf;
*** 3924,3934 ****
              rw_exit(&hca->state_lock);
              goto fail;
          }
      }
      rw_exit(&hca->state_lock);
- 
      buf = (caddr_t)bp->buf;
      for (i = 0; i < num; i++, buf += bp->rsize) {
          bp->buflist[i] = (void *)buf;
      }
      bp->buffree = num - 1;	/* no. of free buffers */
--- 4578,4587 ----
*** 4015,4025 ****
      if (rbp->mr_hdl)
          kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
      if (rbp->mr_desc)
          kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
- 
      if (bp->buf)
          kmem_free(bp->buf, bp->bufsize);
      mutex_destroy(&bp->buflock);
      kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
      kmem_free(rbp, sizeof (rib_bufpool_t));
--- 4668,4677 ----
*** 4057,4066 ****
--- 4709,4726 ----
          return (RDMA_SUCCESS);
      } else
          return (RDMA_FAILED);
  }
  
+ #if defined(MEASURE_POOL_DEPTH)
+ static void rib_recv_bufs(uint32_t x) {
+     return;
+ }
+ static void rib_send_bufs(uint32_t x) {
+     return;
+ }
+ #endif
  /*
   * Fetch a buffer of specified type.
   * Note that rdbuf->handle is mw's rkey.
   */
*** 4107,4116 ****
--- 4767,4782 ----
      for (i = bp->numelems - 1; i >= 0; i--) {
          if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
              rdbuf->handle.mrc_rmr = (uint32_t)rbp->mr_desc[i].md_rkey;
              rdbuf->handle.mrc_linfo = (uintptr_t)rbp->mr_hdl[i];
              rdbuf->handle.mrc_lmr = (uint32_t)rbp->mr_desc[i].md_lkey;
+ #if defined(MEASURE_POOL_DEPTH)
+             if (ptype == SEND_BUFFER)
+                 rib_send_bufs(MAX_BUFS - (bp->buffree + 1));
+             if (ptype == RECV_BUFFER)
+                 rib_recv_bufs(MAX_BUFS - (bp->buffree + 1));
+ #endif
              bp->buffree--;
              if (rib_debug > 1)
                  cmn_err(CE_NOTE, "rib_rbuf_alloc: %d free bufs "
                      "(type %d)\n", bp->buffree+1, ptype);
*** 4958,4967 ****
--- 5624,5636 ----
              * conn_lists are NULL, so destroy
              * buffers, close hca and be done.
              */
              rib_rbufpool_destroy(hca, RECV_BUFFER);
              rib_rbufpool_destroy(hca, SEND_BUFFER);
+ #ifdef SERVER_REG_CACHE
+             rib_destroy_cache(hca);
+ #endif
              (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
              (void) ibt_close_hca(hca->hca_hdl);
              hca->hca_hdl = NULL;
          }
          rw_exit(&hca->cl_conn_list.conn_lock);
*** 4981,4985 ****
      (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
      (void) ibt_close_hca(hca->hca_hdl);
      hca->hca_hdl = NULL;
      }
  }
--- 5650,5981 ----
      (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
      (void) ibt_close_hca(hca->hca_hdl);
      hca->hca_hdl = NULL;
      }
  }
+ 
+ #ifdef SERVER_REG_CACHE
+ 
+ static void
+ rib_server_side_cache_reclaim(void *argp)
+ {
+     cache_avl_struct_t	*rcas;
+     rib_lrc_entry_t	*rb;
+     rib_hca_t		*hca = (rib_hca_t *)argp;
+ 
+     rw_enter(&hca->avl_rw_lock, RW_WRITER);
+     rcas = avl_first(&hca->avl_tree);
+     if (rcas != NULL)
+         avl_remove(&hca->avl_tree, rcas);
+     while (rcas != NULL) {
+         while (rcas->r.forw != &rcas->r) {
+             rcas->elements--;
+             rb = rcas->r.forw;
+             remque(rb);
+             (void) rib_deregistermem_via_hca(hca,
+                 rb->lrc_buf, rb->lrc_mhandle);
+             kmem_free(rb->lrc_buf, rb->lrc_len);
+             kmem_free(rb, sizeof (rib_lrc_entry_t));
+         }
+         mutex_destroy(&rcas->node_lock);
+         kmem_cache_free(hca->server_side_cache, rcas);
+         rcas = avl_first(&hca->avl_tree);
+         if (rcas != NULL)
+             avl_remove(&hca->avl_tree, rcas);
+     }
+     rw_exit(&hca->avl_rw_lock);
+ }
+ 
+ static int
+ avl_compare(const void *t1, const void *t2)
+ {
+     if (rib_debug > 1)
+         cmn_err(CE_NOTE, "Comparing %d and %d\n",
+             ((cache_avl_struct_t *)t1)->len,
+             ((cache_avl_struct_t *)t2)->len);
+ 
+     if (((cache_avl_struct_t *)t1)->len ==
+         ((cache_avl_struct_t *)t2)->len)
+         return (0);
+ 
+     if (((cache_avl_struct_t *)t1)->len <
+         ((cache_avl_struct_t *)t2)->len)
+         return (-1);
+ 
+     return (1);
+ }
+ 
+ static void
+ rib_destroy_cache(rib_hca_t *hca)
+ {
+     hca->avl_init = FALSE;
+     kmem_cache_destroy(hca->server_side_cache);
+     avl_destroy(&hca->avl_tree);
+     rw_destroy(&hca->avl_rw_lock);
+ }
+ 
+ static rib_lrc_entry_t *
+ rib_get_server_cache_buf(CONN *conn, uint32_t len)
+ {
+     cache_avl_struct_t	cas, *rcas;
+     rib_hca_t		*hca = (ctoqp(conn))->hca;
+     rib_lrc_entry_t	*reply_buf;
+     avl_index_t		where = NULL;
+ 
+     if (!hca->avl_init)
+         goto error_alloc;
+ 
+     cas.len = len;
+     rw_enter(&hca->avl_rw_lock, RW_READER);
+     if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
+         &cas, &where)) == NULL) {
+         rw_exit(&hca->avl_rw_lock);
+         rw_enter(&hca->avl_rw_lock, RW_WRITER);
+         /* Recheck to make sure no other thread added the entry in */
+         if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
+             &cas, &where)) == NULL) {
+             /* Allocate an avl tree entry */
+             if (rib_debug > 1)
+                 cmn_err(CE_NOTE, "Allocating an avl entry "
+                     "for length %d\n", len);
+             rcas = (cache_avl_struct_t *)kmem_cache_alloc(
+                 hca->server_side_cache, KM_SLEEP);
+             bzero(rcas, sizeof (cache_avl_struct_t));
+             rcas->elements = 0;
+             rcas->r.forw = &rcas->r;
+             rcas->r.back = &rcas->r;
+             rcas->len = len;
+             mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
+             avl_insert(&hca->avl_tree, rcas, where);
+         }
+     }
+     if (rcas->elements > 0) {
+         mutex_enter(&rcas->node_lock);
+         reply_buf = rcas->r.forw;
+         remque(reply_buf);
+         rcas->elements--;
+         mutex_exit(&rcas->node_lock);
+         rw_exit(&hca->avl_rw_lock);
+         if (rib_debug > 1)
+             cmn_err(CE_NOTE, "Allocating a pre-alloced buffer "
+                 "for length %d\n", len);
+     } else {
+         rw_exit(&hca->avl_rw_lock);
+         rib_total_buffers++;
+         if (rib_debug > 1)
+             cmn_err(CE_NOTE, "Allocating a new buffer "
+                 "for length %d\n", len);
+         /* Allocate a reply_buf entry */
+         reply_buf = (rib_lrc_entry_t *)kmem_alloc(
+             sizeof (rib_lrc_entry_t), KM_SLEEP);
+         bzero(reply_buf, sizeof (rib_lrc_entry_t));
+         reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
+         reply_buf->lrc_len = len;
+         reply_buf->registered = FALSE;
+         reply_buf->avl_node = (void *)rcas;
+     }
+ 
+     return (reply_buf);
+ 
+ error_alloc:
+     reply_buf = (rib_lrc_entry_t *)kmem_alloc(sizeof (rib_lrc_entry_t),
+         KM_SLEEP);
+     bzero(reply_buf, sizeof (rib_lrc_entry_t));
+     reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
+     reply_buf->lrc_len = len;
+     reply_buf->registered = FALSE;
+     reply_buf->avl_node = NULL;
+     return (reply_buf);
+ }
+ 
+ /*
+  * Return a pre-registered buffer back to the cache (without
+  * unregistering the buffer).
+  */
+ static void
+ rib_free_server_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
+ {
+     cache_avl_struct_t	cas, *rcas;
+     avl_index_t		where = NULL;
+     rib_hca_t		*hca = (ctoqp(conn))->hca;
+ 
+     if (!reg_buf) {
+         cmn_err(CE_WARN, "Got a null reg_buf\n");
+         return;
+     }
+     if (!hca->avl_init)
+         goto error_free;
+ 
+     cas.len = reg_buf->lrc_len;
+     rw_enter(&hca->avl_rw_lock, RW_READER);
+     if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
+         &cas, &where)) == NULL) {
+         rw_exit(&hca->avl_rw_lock);
+         goto error_free;
+     } else {
+         mutex_enter(&rcas->node_lock);
+         insque(reg_buf, &rcas->r);
+         rcas->elements++;
+         mutex_exit(&rcas->node_lock);
+         rw_exit(&hca->avl_rw_lock);
+         if (rib_debug > 1)
+             cmn_err(CE_NOTE, "Returning buffer for length %d\n",
+                 reg_buf->lrc_len);
+     }
+     return;
+ 
+ error_free:
+     (void) rib_deregistermem_via_hca(hca, reg_buf->lrc_buf,
+         reg_buf->lrc_mhandle);
+     kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
+     kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
+ }
+ 
+ #endif
+ 
+ static rdma_stat
+ rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
+     uint_t buflen, struct mrc *buf_handle)
+ {
+     ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
+ #ifdef IB_FMR_SUP
+     ibt_pmr_desc_t	pmr_desc;	/* vaddr, lkey, rkey */
+     ibt_ma_hdl_t	ma_hdl = NULL;
+ #endif
+     ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
+     rdma_stat		status;
+ 
+     /*
+      * Note: ALL buffer pools use the same memory type RDMARW.
+      */
+     /*
+      * This code will not be activated on the server. We could remove
+      * the call to rib_reg_mem_fmr. But leave it in, in case the FMR
+      * bugs get fixed. The bigger question is whether we need FMR when
+      * the registered buffers are coming out of a slab cache. This needs
+      * to be evaluated.
+      */
+ #ifdef IB_FMR_SUP
+     status = rib_reg_mem_fmr(hca, adsp, buf, buflen, 0, &mr_hdl, &ma_hdl,
+         &pmr_desc);
+     if (status == RDMA_SUCCESS) {
+         buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
+         buf_handle->mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
+         buf_handle->mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
+         buf_handle->mrc_lma = (uintptr_t)ma_hdl;
+         goto ret_stat;
+     } else {
+         buf_handle->mrc_linfo = NULL;
+         buf_handle->mrc_lma = NULL;
+         buf_handle->mrc_lmr = 0;
+         buf_handle->mrc_rmr = 0;
+     }
+ #endif
+     status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
+     if (status == RDMA_SUCCESS) {
+         buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
+         buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
+         buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
+     } else {
+         buf_handle->mrc_linfo = NULL;
+         buf_handle->mrc_lmr = 0;
+         buf_handle->mrc_rmr = 0;
+     }
+ ret_stat:
+     return (status);
+ }
+ 
+ /* ARGSUSED */
+ static rdma_stat
+ rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
+     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
+ {
+     (void) rib_deregistermem_via_hca(hca, buf, buf_handle);
+ 
+     return (RDMA_SUCCESS);
+ }
+ 
+ /* ARGSUSED */
+ static rdma_stat
+ rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
+ {
+ #ifdef IB_FMR_SUP
+     ibt_status_t	ibt_status;
+ 
+     if (buf_handle.mrc_lma) {
+         ibt_status = ibt_unmap_mem_area(hca->hca_hdl,
+             (ibt_ma_hdl_t)buf_handle.mrc_lma);
+         if (ibt_status != IBT_SUCCESS) {
+             cmn_err(CE_WARN, "rib_deregistermem_via_hca: "
+                 "ibt_unmap_mem_area: %d failed", ibt_status);
+             return (RDMA_FAILED);
+         }
+         ibt_status = ibt_deregister_fmr(hca->hca_hdl,
+             (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
+         if (ibt_status != IBT_SUCCESS) {
+             cmn_err(CE_WARN, "rib_deregistermem_via_hca: "
+                 "ibt_deregister_fmr: %d failed", ibt_status);
+             return (RDMA_FAILED);
+         }
+         return (RDMA_SUCCESS);
+     }
+ #endif
+     (void) ibt_deregister_mr(hca->hca_hdl,
+         (ibt_mr_hdl_t)buf_handle.mrc_linfo);
+     return (RDMA_SUCCESS);
+ }
+ 
+ #if defined(ASYNC_SERVER_DEREG) || defined(ASYNC_CLIENT_DEREG)
+ static int
+ clist_deregister1(CONN *conn, struct clist *cl, bool_t src)
+ {
+     struct clist *c;
+ 
+     for (c = cl; c; c = c->c_next) {
+         if (src) {
+             if (c->c_smemhandle.mrc_rmr != 0) {
+                 (void) RDMA_DEREGMEMSYNC(conn,
+                     (caddr_t)(uintptr_t)c->c_saddr,
+                     c->c_smemhandle,
+ #ifdef SERVER_REG_CACHE
+                     (void *)(uintptr_t)c->c_ssynchandle,
+                     (void *)c->long_reply_buf);
+ #else
+                     (void *)(uintptr_t)c->c_ssynchandle);
+ #endif
+                 c->c_smemhandle.mrc_rmr = 0;
+                 c->c_ssynchandle = NULL;
+             }
+         } else {
+             if (c->c_dmemhandle.mrc_rmr != 0) {
+                 (void) RDMA_DEREGMEMSYNC(conn,
+                     (caddr_t)(uintptr_t)c->c_daddr,
+                     c->c_dmemhandle,
+ #ifdef SERVER_REG_CACHE
+                     (void *)(uintptr_t)c->c_dsynchandle,
+                     (void *)c->long_reply_buf);
+ #else
+                     (void *)(uintptr_t)c->c_dsynchandle);
+ #endif
+                 c->c_dmemhandle.mrc_rmr = 0;
+                 c->c_dsynchandle = NULL;
+             }
+         }
+     }
+ 
+     return (RDMA_SUCCESS);
+ }
+ #endif
+ 
+ #if defined(ASYNC_CLIENT_DEREG)
+ static void
+ async_dereg_thread(caddr_t arg)
+ {
+     ASYNC *r;
+ 
+     cmn_err(CE_WARN, "async_dereg_thread initiated\n");
+ fetch_another_entry:
+     mutex_enter(&at_mutex);
+     while ((rqueue.forw == rqueue.back) && (rqueue.forw == &rqueue))
+         cv_wait(&at_cond, &at_mutex);
+     r = rqueue.forw;
+     remque(rqueue.forw);
+     mutex_exit(&at_mutex);
+     /* Process deregistration */
+     (void) clist_deregister1(&r->c_conn, &r->c_clist, FALSE);
+     kmem_free(r, sizeof (ASYNC));
+     goto fetch_another_entry;
+ }
+ 
+ void
+ insert_queue(CONN *conn, struct clist *rwc)
+ {
+     ASYNC *r;
+ 
+     r = kmem_zalloc(sizeof (ASYNC), KM_SLEEP);
+     r->c_clist = *rwc;
+     r->c_conn = *conn;
+     mutex_enter(&at_mutex);
+     insque(r, &rqueue);
+     cv_broadcast(&at_cond);
+     mutex_exit(&at_mutex);
+ }
+ #endif
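Editor's note: the ASYNC_CLIENT_DEREG path hands completed chunk lists to a dedicated thread through a mutex/condvar-guarded circular queue, so memory deregistration cost leaves the RPC reply path. A user-space pthread sketch of the same producer/consumer shape; the names are illustrative, not the kernel API:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Circular queue node; the head uses itself as the empty sentinel. */
struct work {
	struct work *forw, *back;
	int payload;	/* stands in for the CONN/clist pair */
};

static struct work rqueue = { &rqueue, &rqueue, 0 };
static pthread_mutex_t at_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t at_cond = PTHREAD_COND_INITIALIZER;

static void *
dereg_thread(void *arg)
{
	(void)arg;
	for (;;) {
		pthread_mutex_lock(&at_mutex);
		while (rqueue.forw == &rqueue)	/* sleep while empty */
			pthread_cond_wait(&at_cond, &at_mutex);
		struct work *r = rqueue.forw;
		r->back->forw = r->forw;	/* remque */
		r->forw->back = r->back;
		pthread_mutex_unlock(&at_mutex);

		printf("deregistering item %d\n", r->payload);	/* slow work */
		free(r);
	}
	return (NULL);
}

static void
insert_queue(int payload)
{
	struct work *r = malloc(sizeof (*r));

	r->payload = payload;
	pthread_mutex_lock(&at_mutex);
	r->forw = rqueue.forw;	/* insque at head */
	r->back = &rqueue;
	rqueue.forw->back = r;
	rqueue.forw = r;
	pthread_cond_broadcast(&at_cond);
	pthread_mutex_unlock(&at_mutex);
}

int
main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, dereg_thread, NULL);
	for (int i = 0; i < 3; i++)
		insert_queue(i);
	pthread_join(tid, NULL);	/* runs forever, like the kernel thread */
	return (0);
}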