Sdiff rpcib.c


 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"@(#)rpcib.c	1.29	06/01/25 SMI"

/*
 * The rpcib plugin. Implements the interface for RDMATF's
 * interaction with IBTF.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/systm.h>
#include <sys/pathname.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/isa_defs.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>

#include <sys/ib/ibtl/ibti.h>
#include <rpc/rpc.h>
#include <rpc/ib.h>

#include <sys/modctl.h>

#include <sys/pathname.h>
#include <sys/kstr.h>
#include <sys/sockio.h>
#include <sys/vnode.h>
#include <sys/tiuser.h>
#include <net/if.h>
#include <sys/cred.h>


extern char *inet_ntop(int, const void *, char *, int);


/*
 * Prototype declarations for driver ops
 */

static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);


/* rpcib cb_ops */
static struct cb_ops rpcib_cbops = {
	nulldev,		/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	nodev,			/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* prop_op */
	NULL,			/* stream */
	D_MP,			/* cb_flag */
	CB_REV,			/* rev */
	nodev,			/* int (*cb_aread)() */
	nodev			/* int (*cb_awrite)() */
};

/*
 * Device options
 */
static struct dev_ops rpcib_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	rpcib_getinfo,		/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	rpcib_attach,		/* attach */
	rpcib_detach,		/* detach */
	nodev,			/* reset */
	&rpcib_cbops,		/* driver ops - devctl interfaces */
	NULL,			/* bus operations */
	NULL			/* power */
};

/*
 * Module linkage information.
 */

static struct modldrv rib_modldrv = {
	&mod_driverops,			/* Driver module */
	"RPCIB plugin driver, ver 1.29", /* Driver name and version */
	&rpcib_ops,			/* Driver ops */
};

static struct modlinkage rib_modlinkage = {
	MODREV_1,
	(void *)&rib_modldrv,
	NULL
};

/*
 * rib_stat: private data pointer used when registering
 *	with the IBTF. It is returned to the consumer
 *	in all callbacks.
 */
static rpcib_state_t *rib_stat = NULL;

#define	RNR_RETRIES	2
#define	MAX_PORTS	2

int preposted_rbufs = 16;
int send_threshold = 1;

/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests.
 * NO_ACCEPT = not accepting new connections and requests.
 * This should eventually move to the rpcib_state_t structure, since it
 * tells which state the plugin is in for a particular type of service,
 * such as NFS, NLM, or the v4 callback daemon. The plugin might be in
 * the accept state for one and in the no_accept state for another.
 */
int plugin_state;
kmutex_t plugin_state_lock;


/*
 * RPCIB RDMATF operations
 */
static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
static rdma_stat rib_disconnect(CONN *conn);
static void rib_listen(struct rdma_svc_data *rd);
static void rib_listen_stop(struct rdma_svc_data *rd);
static rdma_stat rib_registermem(CONN *conn, caddr_t buf, uint_t buflen,
	struct mrc *buf_handle);
static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
	struct mrc buf_handle);
static rdma_stat rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen,
	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle);
static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle);
static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
	caddr_t buf, int len, int cpu);

static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);

static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);

static void rib_rbuf_free(CONN *conn, int ptype, void *buf);

static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
static rdma_stat rib_conn_release(CONN *conn);
static rdma_stat rib_getinfo(rdma_info_t *info);
static rdma_stat rib_register_ats(rib_hca_t *);
static void rib_deregister_ats();
static void rib_stop_services(rib_hca_t *);

/*
 * RPCIB addressing operations
 */
char **get_ip_addrs(int *count);
int get_interfaces(TIUSER *tiptr, int *num);
int find_addrs(TIUSER *tiptr, char **addrs, int num_ifs);
int get_ibd_ipaddr(rpcib_ibd_insts_t *);
rpcib_ats_t *get_ibd_entry(ib_gid_t *, ib_pkey_t, rpcib_ibd_insts_t *);
void rib_get_ibd_insts(rpcib_ibd_insts_t *);


/*
 * RDMA operations the RPCIB module exports
 */
static rdmaops_t rib_ops = {
	rib_reachable,
	rib_conn_get,
	rib_conn_release,
	rib_listen,
	rib_listen_stop,
	rib_registermem,
	rib_deregistermem,
	rib_registermemsync,
	rib_deregistermemsync,
	rib_syncmem,
	rib_reg_buf_alloc,
	rib_reg_buf_free,
	rib_send,
	rib_send_resp,
	rib_post_resp,
	rib_post_recv,
	rib_recv,
	rib_read,
	rib_write,
	rib_getinfo
};

/*
 * RDMATF RPCIB plugin details
 */
static rdma_mod_t rib_mod = {
	"ibtf",		/* api name */
	RDMATF_VERS_1,
	0,
	&rib_ops,	/* rdma op vector for ibtf */
};

static rdma_stat open_hcas(rpcib_state_t *);
static rdma_stat rib_qp_init(rib_qp_t *, int);
static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
	ibt_mr_hdl_t *, ibt_mr_desc_t *);
static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *);
static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
	rib_qp_t **);
static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
	rib_qp_t **);
static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
static int rib_free_sendwait(struct send_wid *);
static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
static void rdma_done_rem_list(rib_qp_t *);
static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);

static void rib_async_handler(void *,
	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
static int rib_free_svc_recv(struct svc_recv *);
static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
static void rib_free_wid(struct recv_wid *);
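/*
 * Illustrative aside, not part of the sdiff: RDMATF never calls the
 * rib_* functions above directly; it dispatches through the rib_ops
 * vector that rib_mod carries. A minimal sketch of that dispatch
 * pattern follows. The wrapper name example_rdma_send() is
 * hypothetical, and the member names rdma_ops and rdma_send are
 * assumptions about rdma_mod_t/rdmaops_t, not confirmed by this diff.
 */
static rdma_stat
example_rdma_send(rdma_mod_t *mod, CONN *conn, struct clist *cl,
	uint32_t msgid)
{
	/* rib_mod's op vector is &rib_ops, so this lands in rib_send(). */
	return ((*mod->rdma_ops->rdma_send)(conn, cl, msgid));
}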
};

/*
 * Global structure
 */

typedef struct rpcib_s {
	dev_info_t	*rpcib_dip;
	kmutex_t	rpcib_mutex;
} rpcib_t;

rpcib_t rpcib;

/*
 * /etc/system controlled variable to control
 * debugging in the rpcib kernel module.
 * Set it to values greater than 1 to increase
 * the amount of debugging messages.
 */
int rib_debug = 0;

static int ats_running = 0;

int
_init(void)
{
	int error;

	error = mod_install((struct modlinkage *)&rib_modlinkage);
	if (error != 0) {
		/*
		 * Could not load module
		 */
		return (error);
	}
	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);

	return (0);
}

int
_fini()
{
556 " ATS service: %s", 557 to_remove->srv_name); 558 } 559 #endif 560 } 561 kmem_free(to_remove, sizeof (rib_service_t)); 562 } 563 hca->ats_list = NULL; 564 rw_exit(&hca->service_list_lock); 565 } 566 567 static void rib_rbufpool_free(rib_hca_t *, int); 568 static void rib_rbufpool_deregister(rib_hca_t *, int); 569 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 570 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 571 static rdma_stat rib_rem_replylist(rib_qp_t *); 572 static int rib_remreply(rib_qp_t *, struct reply *); 573 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 574 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 575 576 /* 577 * One CQ pair per HCA 578 */ 579 static rdma_stat 580 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 581 rib_cq_t **cqp, rpcib_state_t *ribstat) 582 { 583 rib_cq_t *cq; 584 ibt_cq_attr_t cq_attr; 585 uint32_t real_size; 586 ibt_status_t status; 587 rdma_stat error = RDMA_SUCCESS; 588 589 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 590 cq->rib_hca = hca; 591 cq_attr.cq_size = cq_size; 592 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 593 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 594 &real_size); 595 if (status != IBT_SUCCESS) {

	return (error);
fail:
	if (cq->rib_cq_hdl)
		(void) ibt_free_cq(cq->rib_cq_hdl);
	if (cq)
		kmem_free(cq, sizeof (rib_cq_t));
	return (error);
}

static rdma_stat
open_hcas(rpcib_state_t *ribstat)
{
	rib_hca_t		*hca;
	ibt_status_t		ibt_status;
	rdma_stat		status;
	ibt_hca_portinfo_t	*pinfop;
	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
	uint_t			size, cq_size;
	int			i;

	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
	if (ribstat->hcas == NULL)
		ribstat->hcas = kmem_zalloc(ribstat->hca_count *
		    sizeof (rib_hca_t), KM_SLEEP);

	/*
	 * Open a hca and setup for RDMA
	 */
	for (i = 0; i < ribstat->hca_count; i++) {
		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
		    ribstat->hca_guids[i],
		    &ribstat->hcas[i].hca_hdl);
		if (ibt_status != IBT_SUCCESS) {
			cmn_err(CE_WARN, "open_hcas: ibt_open_hca (%d) "
			    "returned %d", i, ibt_status);
			continue;
		}
		ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
		hca = &(ribstat->hcas[i]);
		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
		}

		/*
		 * Create buffer pools.
		 * Note rib_rbuf_create also allocates memory windows.
		 */
		hca->recv_pool = rib_rbufpool_create(hca,
		    RECV_BUFFER, MAX_BUFS);
		if (hca->recv_pool == NULL) {
			cmn_err(CE_WARN, "open_hcas: recv buf pool failed\n");
			goto fail3;
		}

		hca->send_pool = rib_rbufpool_create(hca,
		    SEND_BUFFER, MAX_BUFS);
		if (hca->send_pool == NULL) {
			cmn_err(CE_WARN, "open_hcas: send buf pool failed\n");
			rib_rbufpool_destroy(hca, RECV_BUFFER);
			goto fail3;
		}

		/*
		 * Initialize the registered service list and
		 * the lock
		 */
		hca->service_list = NULL;
		rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);

		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
		    hca->iblock);
		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
		    hca->iblock);
		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
		hca->inuse = TRUE;
		/*
		 * XXX One hca only. Add multi-hca functionality if needed
		 * later.
		 */
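/*
 * Illustrative aside, not part of the sdiff: the send and receive pools
 * created above are fixed-size arrays of pre-registered buffers handed
 * out LIFO from a free list (bp->buflist, with bp->buffree as the top
 * of stack; see rib_rbufpool_create() and rib_rbuf_alloc() later in
 * this listing). A minimal sketch of that free-list discipline, with
 * hypothetical names (xpool_t, xpool_get, xpool_put):
 */
typedef struct xpool {
	int	numelems;	/* total buffers in the pool */
	int	buffree;	/* index of top free slot, -1 when empty */
	void	**buflist;	/* numelems pointers into one big slab */
} xpool_t;

static void *
xpool_get(xpool_t *p)
{
	if (p->buffree < 0)
		return (NULL);			/* pool exhausted */
	return (p->buflist[p->buffree--]);	/* pop */
}

static void
xpool_put(xpool_t *p, void *buf)
{
	if (p->buffree < p->numelems - 1)
		p->buflist[++p->buffree] = buf;	/* push */
}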
				 * Notify poster
				 */
				cv_signal(&wd->wait_cv);
				mutex_exit(&wd->sendwait_lock);
			} else {
				/*
				 * Poster not waiting for notification.
				 * Free the send buffers and send_wid
				 */
				for (i = 0; i < wd->nsbufs; i++) {
					rib_rbuf_free(qptoc(wd->qp),
					    SEND_BUFFER,
					    (void *)(uintptr_t)wd->sbufaddr[i]);
				}
				mutex_exit(&wd->sendwait_lock);
				(void) rib_free_sendwait(wd);
			}
		}
	}
}

/* ARGSUSED */
static void
rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	int		i;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		/*
		 * Got a send completion
		 */
#ifdef DEBUG
		if (rib_debug > 1 && wc.wc_status != IBT_WC_SUCCESS) {
			cmn_err(CE_NOTE, "rib_svc_scq_handler: WR completed "
			    "in error, wc.wc_status:%d, wc_id:%llX",
			    wc.wc_status, (longlong_t)wc.wc_id);
		}
#endif
		if (wc.wc_id != NULL) {	/* XXX NULL possible ???? */
			struct send_wid *wd =
			    (struct send_wid *)(uintptr_t)wc.wc_id;

			mutex_enter(&wd->sendwait_lock);
			if (wd->cv_sig == 1) {
				/*
				 * Update completion status and notify poster
				 */
				if (wc.wc_status == IBT_WC_SUCCESS)
					wd->status = RDMA_SUCCESS;
				else
					wd->status = RDMA_FAILED;
				cv_signal(&wd->wait_cv);
				mutex_exit(&wd->sendwait_lock);
			} else {
				/*
				 * Poster not waiting for notification.
				 * Free the send buffers and send_wid
				 */
				for (i = 0; i < wd->nsbufs; i++) {
					rib_rbuf_free(qptoc(wd->qp),
					    SEND_BUFFER,
					    (void *)(uintptr_t)wd->sbufaddr[i]);
				}
				mutex_exit(&wd->sendwait_lock);
				(void) rib_free_sendwait(wd);
			}
		}
	}
}

/*
 * RCQ handler
 */
/* ARGSUSED */
static void
rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	rib_qp_t	*qp;
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct recv_wid	*rwid;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
		qp = rwid->qp;
		if (wc.wc_status == IBT_WC_SUCCESS) {
			XDR	inxdrs, *xdrs;
			uint_t	xid, vers, op, find_xid = 0;
			struct reply	*r;
			CONN	*conn = qptoc(qp);

			xdrs = &inxdrs;
			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
			    wc.wc_bytes_xfer, XDR_DECODE);
			/*
			 * Treat xid as opaque (xid is the first entity
			 * in the rpc rdma message).
			 */
			xid = *(uint32_t *)(uintptr_t)rwid->addr;
			/* Skip xid and set the xdr position accordingly. */
			XDR_SETPOS(xdrs, sizeof (uint32_t));
			(void) xdr_u_int(xdrs, &vers);
			(void) xdr_u_int(xdrs, &op);
			XDR_DESTROY(xdrs);
			if (vers != RPCRDMA_VERS) {
				/*
				 * Invalid RPC/RDMA version. Cannot
				 * interoperate. Set connection to ERROR
				 * state and bail out.
				 */
				mutex_enter(&conn->c_lock);
				if (conn->c_state != C_DISCONN_PEND)
					conn->c_state = C_ERROR;
				mutex_exit(&conn->c_lock);
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)rwid->addr);
				rib_free_wid(rwid);
				continue;
			}

			mutex_enter(&qp->replylist_lock);
			for (r = qp->replylist; r != NULL; r = r->next) {
				if (r->xid == xid) {
	mblk_t		*mp;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
		qp = s_recvp->qp;
		conn = qptoc(qp);
		mutex_enter(&qp->posted_rbufs_lock);
		qp->n_posted_rbufs--;
		if (qp->n_posted_rbufs == 0)
			cv_signal(&qp->posted_rbufs_cv);
		mutex_exit(&qp->posted_rbufs_lock);

		if (wc.wc_status == IBT_WC_SUCCESS) {
			XDR	inxdrs, *xdrs;
			uint_t	xid, vers, op;

			xdrs = &inxdrs;
			/* s_recvp->vaddr stores data */
			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
			    wc.wc_bytes_xfer, XDR_DECODE);

			/*
			 * Treat xid as opaque (xid is the first entity
			 * in the rpc rdma message).
			 */
			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
			/* Skip xid and set the xdr position accordingly. */
			XDR_SETPOS(xdrs, sizeof (uint32_t));
			if (!xdr_u_int(xdrs, &vers) ||
			    !xdr_u_int(xdrs, &op)) {
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				XDR_DESTROY(xdrs);
#ifdef DEBUG
				cmn_err(CE_NOTE, "rib_svc_rcq_handler: "
				    "xdr_u_int failed for qp %p, wc_id=%llx",
				    (void *)qp, (longlong_t)wc.wc_id);
#endif
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}
			XDR_DESTROY(xdrs);

			if (vers != RPCRDMA_VERS) {
				/*
				 * Invalid RPC/RDMA version.
				 * Drop rpc rdma message.
				 */
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
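/*
 * Illustrative aside, not part of the sdiff: both RCQ handlers above
 * peek at the RPC/RDMA header the same way -- the xid is read as raw
 * opaque bytes, then vers and op are XDR-decoded starting just past it.
 * A standalone sketch of that decode; peek_rpcrdma_hdr() is a
 * hypothetical helper, and the buffer is assumed to begin with
 * <xid><vers><op> exactly as in the handlers above.
 */
static int
peek_rpcrdma_hdr(caddr_t addr, uint_t len, uint32_t *xid,
	uint_t *vers, uint_t *op)
{
	XDR	x;

	*xid = *(uint32_t *)(uintptr_t)addr;	/* opaque, no byte-swap */
	xdrmem_create(&x, addr, len, XDR_DECODE);
	XDR_SETPOS(&x, sizeof (uint32_t));	/* skip the xid */
	if (!xdr_u_int(&x, vers) || !xdr_u_int(&x, op)) {
		XDR_DESTROY(&x);
		return (0);			/* truncated header */
	}
	XDR_DESTROY(&x);
	return (1);
}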
				return (RDMA_FAILED);
			}
		} else {
			mutex_exit(&rib_stat->open_hca_lock);
			return (RDMA_SUCCESS);
		}
	} else {
		*handle = NULL;
		if (rib_debug > 2)
			cmn_err(CE_WARN,
			    "rib_reachable(): ping_srv failed.\n");
		return (RDMA_FAILED);
	}
}

/* Client side qp creation */
static rdma_stat
rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
{
	rib_qp_t	*kqp = NULL;
	CONN		*conn;

	ASSERT(qp != NULL);
	*qp = NULL;

	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
	conn = qptoc(kqp);
	kqp->hca = hca;
	kqp->rdmaconn.c_rdmamod = &rib_mod;
	kqp->rdmaconn.c_private = (caddr_t)kqp;

	kqp->mode = RIB_CLIENT;
	kqp->chan_flags = IBT_BLOCKING;
	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;

	/*
	 * Initialize
	 */
	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);

	*qp = kqp;
	return (RDMA_SUCCESS);
}

/* Server side qp creation */
static rdma_stat
rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
{
	rib_qp_t	*kqp = NULL;
	ibt_chan_sizes_t	chan_sizes;
	ibt_rc_chan_alloc_args_t	qp_attr;
	ibt_status_t	ibt_status;

	ASSERT(qp != NULL);
	*qp = NULL;

	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
	kqp->hca = hca;
	kqp->port_num = port;
	kqp->rdmaconn.c_rdmamod = &rib_mod;
	kqp->rdmaconn.c_private = (caddr_t)kqp;

	/*
	 * Create the qp handle
	 */
	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
	qp_attr.rc_pd = hca->pd_hdl;
	qp_attr.rc_hca_port_num = port;
	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
		goto fail;
	}

	kqp->mode = RIB_SERVER;
	kqp->chan_flags = IBT_BLOCKING;
	kqp->q = q;	/* server ONLY */

	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
	/*
	 * Set the private data area to qp to be used in callbacks
	 */
	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
	kqp->rdmaconn.c_state = C_CONNECTED;
	*qp = kqp;
	return (RDMA_SUCCESS);
fail:
	if (kqp)
		kmem_free(kqp, sizeof (rib_qp_t));

	return (RDMA_FAILED);
}

void
rib_dump_pathrec(ibt_path_info_t *path_rec)
{
	ib_pkey_t	pkey;

	if (rib_debug > 1) {
		cmn_err(CE_NOTE, "Path Record:\n");

		cmn_err(CE_NOTE, "Source HCA GUID = %llx\n",
		    (longlong_t)path_rec->pi_hca_guid);
		cmn_err(CE_NOTE, "Dest Service ID = %llx\n",
		    (longlong_t)path_rec->pi_sid);

	(void) bzero(&chan_args, sizeof (chan_args));
	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));

	qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
	/* Alloc a RC channel */
	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
	qp_attr.rc_pd = hca->pd_hdl;
	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
	qp_attr.rc_clone_chan = NULL;
	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
	qp_attr.rc_flags = IBT_WR_SIGNALED;

	chan_args.oc_path = path;
	chan_args.oc_cm_handler = rib_clnt_cm_handler;
	chan_args.oc_cm_clnt_private = (void *)rib_stat;
	chan_args.oc_rdma_ra_out = 1;
	chan_args.oc_rdma_ra_in = 1;
	chan_args.oc_path_retry_cnt = 2;
	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;

refresh:
	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
		    IBT_ACHAN_NO_FLAGS, &qp_attr, &qp->qp_hdl,
		    &chan_sizes);
	} else {
		rw_exit(&hca->state_lock);
		return (RDMA_FAILED);
	}
	rw_exit(&hca->state_lock);

	if (ibt_status != IBT_SUCCESS) {
#ifdef DEBUG
		cmn_err(CE_WARN, "rib_conn_to_srv: alloc_rc_channel "
		    "failed, ibt_status=%d.", ibt_status);
#endif
		(void) rib_rem_replylist(qp);
	}

	cv_destroy(&qp->cb_conn_cv);
	cv_destroy(&qp->posted_rbufs_cv);
	mutex_destroy(&qp->cb_lock);

	mutex_destroy(&qp->replylist_lock);
	mutex_destroy(&qp->posted_rbufs_lock);
	mutex_destroy(&qp->rdlist_lock);

	cv_destroy(&conn->c_cv);
	mutex_destroy(&conn->c_lock);

	if (conn->c_raddr.buf != NULL) {
		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
	}
	if (conn->c_laddr.buf != NULL) {
		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
	}
	kmem_free(qp, sizeof (rib_qp_t));

	/*
	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
	 * then the hca is no longer being used.
	 */
	if (conn_list != NULL) {
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
			if (hca->srv_conn_list.conn_hd == NULL) {
				rw_enter(&hca->cl_conn_list.conn_lock,
				    RW_READER);
				if (hca->cl_conn_list.conn_hd == NULL) {
					mutex_enter(&hca->inuse_lock);
					hca->inuse = FALSE;
					cv_signal(&hca->cb_cv);
					mutex_exit(&hca->inuse_lock);
				}
				rw_exit(&hca->cl_conn_list.conn_lock);
			}
			rw_exit(&hca->srv_conn_list.conn_lock);
		}
		rw_exit(&hca->state_lock);
	}
	return (RDMA_SUCCESS);
}

/*
 * Wait for send completion notification. Only on receiving a
 * notification, be it a successful or an error completion, free the
 * send_wid.
 */
static rdma_stat
rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
{
	clock_t		timout, cv_wait_ret;
	rdma_stat	error = RDMA_SUCCESS;
	int		i;

	/*
	 * Wait for send to complete
	 */
	ASSERT(wd != NULL);
	mutex_enter(&wd->sendwait_lock);
	if (wd->status == (uint_t)SEND_WAIT) {
		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
		    ddi_get_lbolt();
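/*
 * Illustrative aside, not part of the sdiff: rib_sendwait() above and
 * rib_send_resp() below both use the classic Solaris timed-wait idiom:
 * an absolute wakeup time in lbolt ticks, with a negative
 * cv_timedwait() return meaning timeout. A minimal sketch of the
 * idiom; waiter_t and wait_done() are hypothetical names.
 */
typedef struct waiter {
	kmutex_t	w_lock;
	kcondvar_t	w_cv;
	int		w_done;
} waiter_t;

static int
wait_done(waiter_t *w, int secs)
{
	clock_t	timout = ddi_get_lbolt() + drv_usectohz(secs * 1000000);
	int	ok = 1;

	mutex_enter(&w->w_lock);
	while (!w->w_done) {
		if (cv_timedwait(&w->w_cv, &w->w_lock, timout) < 0) {
			ok = 0;		/* timed out */
			break;
		}
	}
	mutex_exit(&w->w_lock);
	return (ok);
}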

static rdma_stat
rib_rem_rep(rib_qp_t *qp, struct reply *rep)
{
	mutex_enter(&qp->replylist_lock);
	if (rep != NULL) {
		(void) rib_remreply(qp, rep);
		mutex_exit(&qp->replylist_lock);
		return (RDMA_SUCCESS);
	}
	mutex_exit(&qp->replylist_lock);
	return (RDMA_FAILED);
}

/*
 * Send buffers are freed here only in case of error in posting
 * on QP. If the post succeeded, the send buffers are freed upon
 * send completion in rib_sendwait() or in the scq_handler.
 */
rdma_stat
rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
	int send_sig, int cv_sig)
{
	struct send_wid	*wdesc;
	struct clist	*clp;
	ibt_status_t	ibt_status = IBT_SUCCESS;
	rdma_stat	ret = RDMA_SUCCESS;
	ibt_send_wr_t	tx_wr;
	int		i, nds;
	ibt_wr_ds_t	sgl[DSEG_MAX];
	uint_t		total_msg_size;
	rib_qp_t	*qp = ctoqp(conn);

	ASSERT(cl != NULL);

	bzero(&tx_wr, sizeof (ibt_send_wr_t));

	nds = 0;
	total_msg_size = 0;
	clp = cl;
	while (clp != NULL) {
		if (nds >= DSEG_MAX) {
			cmn_err(CE_WARN, "rib_send_and_wait: DSEG_MAX"
			    " too small!");
			return (RDMA_FAILED);
		}
		sgl[nds].ds_va = clp->c_saddr;
		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr;	/* lkey */
		sgl[nds].ds_len = clp->c_len;
		total_msg_size += clp->c_len;
		clp = clp->c_next;
		nds++;
	}

	if (send_sig) {
		/* Set SEND_SIGNAL flag. */
		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
	} else {
		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
		wdesc = rib_init_sendwait(msgid, 0, qp);
	}
	wdesc->nsbufs = nds;
	for (i = 0; i < nds; i++) {
		wdesc->sbufaddr[i] = sgl[i].ds_va;
	}

	tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
	tx_wr.wr_opcode = IBT_WRC_SEND;
	tx_wr.wr_trans = IBT_RC_SRV;
	tx_wr.wr_nds = nds;
	tx_wr.wr_sgl = sgl;

	mutex_enter(&conn->c_lock);
	if (conn->c_state & C_CONNECTED) {
		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
	}
	if (((conn->c_state & C_CONNECTED) == 0) ||
	    ibt_status != IBT_SUCCESS) {
		mutex_exit(&conn->c_lock);
		for (i = 0; i < nds; i++) {
			rib_rbuf_free(conn, SEND_BUFFER,
			    (void *)(uintptr_t)wdesc->sbufaddr[i]);
			 * cv_wait for send to complete.
			 * We can fail due to a timeout or signal or
			 * unsuccessful send.
			 */
			ret = rib_sendwait(qp, wdesc);
#ifdef DEBUG
			if (rib_debug > 2)
				if (ret != 0) {
					cmn_err(CE_WARN, "rib_send_and_wait: "
					    "rib_sendwait FAILED, rdma "
					    "stat=%d, wr_id %llx, qp %p!",
					    ret, (longlong_t)tx_wr.wr_id,
					    (void *)qp);
				}
#endif
			return (ret);
		}
	}

	return (RDMA_SUCCESS);
}

rdma_stat
rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
{
	rdma_stat	ret;

	/* send-wait & cv_signal */
	ret = rib_send_and_wait(conn, cl, msgid, 1, 1);

	return (ret);
}

/*
 * Server interface (svc_rdma_ksend).
 * Send RPC reply and wait for RDMA_DONE.
 */
rdma_stat
rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
{
	rdma_stat	ret = RDMA_SUCCESS;
	struct rdma_done_list	*rd;
	clock_t		timout, cv_wait_ret;
	rib_qp_t	*qp = ctoqp(conn);

	mutex_enter(&qp->rdlist_lock);
	rd = rdma_done_add(qp, msgid);

	/* No cv_signal (whether send-wait or no-send-wait) */
	ret = rib_send_and_wait(conn, cl, msgid, 1, 0);
	if (ret != RDMA_SUCCESS) {
#ifdef DEBUG
		cmn_err(CE_WARN, "rib_send_resp: send_and_wait "
		    "failed, msgid %u, qp %p", msgid, (void *)qp);
#endif
		rdma_done_rm(qp, rd);
		goto done;
	}

	/*
	 * Wait for RDMA_DONE from remote end
	 */
	timout = drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
	cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock,
	    timout);
	rdma_done_rm(qp, rd);
	if (cv_wait_ret < 0) {
#ifdef DEBUG
		if (rib_debug > 1) {
			cmn_err(CE_WARN, "rib_send_resp: RDMA_DONE not
#ifdef DEBUG
		cmn_err(CE_WARN, "rib_recv: no matching reply for "
		    "xid %u, qp %p\n", msgid, (void *)qp);
#endif
	}

	/*
	 * Done.
	 */
	mutex_exit(&qp->replylist_lock);
	return (ret);
}

/*
 * RDMA write a buffer to the remote address.
 */
rdma_stat
rib_write(CONN *conn, struct clist *cl, int wait)
{
	ibt_send_wr_t	tx_wr;
	int		nds;
	int		cv_sig;
	ibt_wr_ds_t	sgl[DSEG_MAX];
	struct send_wid	*wdesc;
	ibt_status_t	ibt_status;
	rdma_stat	ret = RDMA_SUCCESS;
	rib_qp_t	*qp = ctoqp(conn);

	if (cl == NULL) {
		cmn_err(CE_WARN, "rib_write: NULL clist\n");
		return (RDMA_FAILED);
	}

	bzero(&tx_wr, sizeof (ibt_send_wr_t));
	/*
	 * Remote address is at the head chunk item in list.
	 */
	tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_daddr;
	tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */

	nds = 0;
	while (cl != NULL) {
		if (nds >= DSEG_MAX) {
			cmn_err(CE_WARN, "rib_write: DSEG_MAX too small!");
			return (RDMA_FAILED);
		}
		sgl[nds].ds_va = cl->c_saddr;
		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr;	/* lkey */
		sgl[nds].ds_len = cl->c_len;
		cl = cl->c_next;
		nds++;
	}

	if (wait) {
		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
		cv_sig = 1;
	} else {
		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
		cv_sig = 0;
	}

	wdesc = rib_init_sendwait(0, cv_sig, qp);
	tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
	tx_wr.wr_opcode = IBT_WRC_RDMAW;
	tx_wr.wr_trans = IBT_RC_SRV;
	tx_wr.wr_nds = nds;
	tx_wr.wr_sgl = sgl;

	mutex_enter(&conn->c_lock);
	if (conn->c_state & C_CONNECTED) {
		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
	}
	if (((conn->c_state & C_CONNECTED) == 0) ||
	    ibt_status != IBT_SUCCESS) {
		mutex_exit(&conn->c_lock);
		(void) rib_free_sendwait(wdesc);
		return (RDMA_FAILED);
	}
	mutex_exit(&conn->c_lock);

	/*
	 * Wait for send to complete
	 */
	if (wait) {
		ret = rib_sendwait(qp, wdesc);
		if (ret != 0) {
			return (ret);
		}
	}
	return (RDMA_SUCCESS);
}

/*
 * RDMA Read a buffer from the remote address.
 */
rdma_stat
rib_read(CONN *conn, struct clist *cl, int wait)
{
	ibt_send_wr_t	rx_wr;
	int		nds;
	int		cv_sig;
	ibt_wr_ds_t	sgl[DSEG_MAX];	/* is 2 sufficient? */
	struct send_wid	*wdesc;
	ibt_status_t	ibt_status = IBT_SUCCESS;
	rdma_stat	ret = RDMA_SUCCESS;
	rib_qp_t	*qp = ctoqp(conn);

	if (cl == NULL) {
		cmn_err(CE_WARN, "rib_read: NULL clist\n");
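/*
 * Illustrative aside, not part of the sdiff: rib_send_and_wait(),
 * rib_write() and rib_read() all flatten a chunk list into an
 * ibt_wr_ds_t scatter/gather array with the same loop. A standalone
 * sketch of it; clist_to_sgl() is a hypothetical helper name.
 */
static int
clist_to_sgl(struct clist *cl, ibt_wr_ds_t *sgl, int max_nds)
{
	int	nds = 0;

	while (cl != NULL) {
		if (nds >= max_nds)
			return (-1);	/* too many chunks for one WR */
		sgl[nds].ds_va = cl->c_saddr;
		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr;	/* lkey */
		sgl[nds].ds_len = cl->c_len;
		cl = cl->c_next;
		nds++;
	}
	return (nds);
}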
	return (zero == 0);
}

/*
 * rib_srv_cm_handler()
 *    Connection Manager callback to handle RC connection requests.
 */
/* ARGSUSED */
static ibt_cm_status_t
rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
	ibt_cm_return_args_t *ret_args, void *priv_data,
	ibt_priv_data_len_t len)
{
	queue_t		*q;
	rib_qp_t	*qp;
	rpcib_state_t	*ribstat;
	rib_hca_t	*hca;
	rdma_stat	status = RDMA_SUCCESS;
	int		i;
	struct clist	cl;
	rdma_buf_t	rdbuf;
	void		*buf = NULL;
	ibt_cm_req_rcv_t	cm_req_rcv;
	CONN		*conn;
	ibt_status_t	ibt_status;
	ibt_ar_t	ar_query, ar_result;
	ib_gid_t	sgid;


	ASSERT(any != NULL);
	ASSERT(event != NULL);

	ribstat = (rpcib_state_t *)any;
	hca = (rib_hca_t *)ribstat->hca;
	ASSERT(hca != NULL);

	/* got a connection request */
	switch (event->cm_type) {
	case IBT_CM_EVENT_REQ_RCV:
		/*
		 * If the plugin is in the NO_ACCEPT state, bail out.
			cmn_err(CE_NOTE, "\t\t Remote QPN:%u\n",
			    cm_req_rcv.req_remote_qpn);
			cmn_err(CE_NOTE, "\t\t Remote Q_Key:%x\n",
			    cm_req_rcv.req_remote_qkey);
			cmn_err(CE_NOTE, "\t\t Local QP %p (qp_hdl=%p)\n",
			    (void *)qp, (void *)qp->qp_hdl);
		}

		if (rib_debug > 2) {
			ibt_rc_chan_query_attr_t chan_attrs;

			if (ibt_query_rc_channel(qp->qp_hdl, &chan_attrs)
			    == IBT_SUCCESS) {
				cmn_err(CE_NOTE, "rib_svc_cm_handler: "
				    "qp %p in CEP state %d\n",
				    (void *)qp, chan_attrs.rc_state);
			}
		}
#endif

		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
		ret_args->cm_ret.rep.cm_rdma_ra_out = 1;
		ret_args->cm_ret.rep.cm_rdma_ra_in = 1;
		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;

		/*
		 * Pre-posts RECV buffers
		 */
		conn = qptoc(qp);
		for (i = 0; i < preposted_rbufs; i++) {
			bzero(&rdbuf, sizeof (rdbuf));
			rdbuf.type = RECV_BUFFER;
			buf = rib_rbuf_alloc(conn, &rdbuf);
			if (buf == NULL) {
				cmn_err(CE_WARN, "rib_svc_cm_handler: "
				    "No RECV_BUFFER buf!\n");
				(void) rib_disconnect_channel(conn, NULL);
				return (IBT_CM_REJECT);
			}

			bzero(&cl, sizeof (cl));
			cl.c_saddr = (uintptr_t)rdbuf.addr;
			cl.c_len = rdbuf.len;
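/*
 * Illustrative aside, not part of the sdiff: the loop above primes the
 * accepted channel with preposted_rbufs receive buffers before the REP
 * is returned, so the peer may send as soon as the connection comes
 * up. A hedged sketch of that priming loop; prime_recv_bufs() is a
 * hypothetical name, and copying rdbuf.handle into cl.c_smemhandle is
 * an assumption about how the real loop fills in the clist.
 */
static int
prime_recv_bufs(CONN *conn, int want)
{
	rdma_buf_t	rdbuf;
	struct clist	cl;
	int		posted = 0;

	while (posted < want) {
		bzero(&rdbuf, sizeof (rdbuf));
		rdbuf.type = RECV_BUFFER;
		if (rib_rbuf_alloc(conn, &rdbuf) == NULL)
			break;				/* pool exhausted */

		bzero(&cl, sizeof (cl));
		cl.c_saddr = (uintptr_t)rdbuf.addr;
		cl.c_len = rdbuf.len;
		cl.c_smemhandle = rdbuf.handle;		/* lkey for the HCA */
		cl.c_next = NULL;
		if (rib_post_recv(conn, &cl) != RDMA_SUCCESS)
			break;
		posted++;
	}
	return (posted);
}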
		rep->prev->next = rep->next;
	}
	if (rep->next) {
		rep->next->prev = rep->prev;
	}
	if (qp->replylist == rep)
		qp->replylist = rep->next;

	cv_destroy(&rep->wait_cv);
	qp->rep_list_size--;
	if (rib_debug > 1)
		cmn_err(CE_NOTE, "rib_remreply: qp:%p, rep_list_size:%d\n",
		    (void *)qp, qp->rep_list_size);

	kmem_free(rep, sizeof (*rep));

	return (0);
}

rdma_stat
rib_registermem(CONN *conn, caddr_t buf, uint_t buflen,
	struct mrc *buf_handle)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;
	rib_hca_t	*hca = (ctoqp(conn))->hca;

	/*
	 * Note: ALL buffer pools use the same memory type RDMARW.
	 */
	status = rib_reg_mem(hca, buf, buflen, 0, &mr_hdl, &mr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	return (status);
}

static rdma_stat
rib_reg_mem(rib_hca_t *hca, caddr_t buf, uint_t size, ibt_mr_flags_t spec,
	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
{
	ibt_mr_attr_t	mem_attr;
	ibt_status_t	ibt_status;

	mem_attr.mr_vaddr = (uintptr_t)buf;
	mem_attr.mr_len = (ib_msglen_t)size;
	mem_attr.mr_as = NULL;
	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
	    IBT_MR_ENABLE_WINDOW_BIND | spec;

	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state == HCA_INITED) {
		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
		    &mem_attr, mr_hdlp, mr_descp);
		rw_exit(&hca->state_lock);
	} else {
		rw_exit(&hca->state_lock);
		return (RDMA_FAILED);
	}

	if (ibt_status != IBT_SUCCESS) {
		cmn_err(CE_WARN, "rib_reg_mem: ibt_register_mr "
		    "(spec:%d) failed for addr %llX, status %d",
		    spec, (longlong_t)mem_attr.mr_vaddr, ibt_status);
		return (RDMA_FAILED);
	}
	return (RDMA_SUCCESS);
}

rdma_stat
rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen,
	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;
	rib_hca_t	*hca = (ctoqp(conn))->hca;

	/*
	 * Non-coherent memory registration.
	 */
	status = rib_reg_mem(hca, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl,
	    &mr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	return (status);
}

/* ARGSUSED */
rdma_stat
rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
{
	rib_hca_t *hca = (ctoqp(conn))->hca;

	/*
	 * Allow memory deregistration even if HCA is
	 * getting detached. Need all outstanding
	 * memory registrations to be deregistered
	 * before HCA_DETACH_EVENT can be accepted.
	 */
	(void) ibt_deregister_mr(hca->hca_hdl,
	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
	return (RDMA_SUCCESS);
}

/* ARGSUSED */
rdma_stat
rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
	RIB_SYNCMEM_HANDLE sync_handle)
{
	(void) rib_deregistermem(conn, buf, buf_handle);

	return (RDMA_SUCCESS);
}

/* ARGSUSED */
rdma_stat
rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
	int len, int cpu)
{
	ibt_status_t	status;
	rib_hca_t	*hca = (ctoqp(conn))->hca;
	ibt_mr_sync_t	mr_segment;

	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
	mr_segment.ms_len = (ib_memlen_t)len;
	if (cpu) {
		/* make incoming data visible to memory */
		mr_segment.ms_flags = IBT_SYNC_WRITE;
}

rib_bufpool_t *
rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
{
	rib_bufpool_t	*rbp = NULL;
	bufpool_t	*bp = NULL;
	caddr_t		buf;
	ibt_mr_attr_t	mem_attr;
	ibt_status_t	ibt_status;
	int		i, j;

	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);

	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
	    num * sizeof (void *), KM_SLEEP);

	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
	bp->numelems = num;

	switch (ptype) {
	case SEND_BUFFER:
		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
		/* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */
		bp->rsize = RPC_MSG_SZ;
		break;
	case RECV_BUFFER:
		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
		/* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */
		bp->rsize = RPC_BUF_SIZE;
		break;
	default:
		goto fail;
	}

	/*
	 * Register the pool.
	 */
	bp->bufsize = num * bp->rsize;
	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
	    sizeof (ibt_mr_desc_t), KM_SLEEP);

	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_INITED) {
		rw_exit(&hca->state_lock);
		goto fail;
	}
	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
		mem_attr.mr_vaddr = (uintptr_t)buf;
		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
		mem_attr.mr_as = NULL;
		ibt_status = ibt_register_mr(hca->hca_hdl,
		    hca->pd_hdl, &mem_attr, &rbp->mr_hdl[i],
		    &rbp->mr_desc[i]);
		if (ibt_status != IBT_SUCCESS) {
			for (j = 0; j < i; j++) {
				(void) ibt_deregister_mr(hca->hca_hdl,
				    rbp->mr_hdl[j]);
			}
			rw_exit(&hca->state_lock);
			goto fail;
		}
	}
	rw_exit(&hca->state_lock);

	buf = (caddr_t)bp->buf;
	for (i = 0; i < num; i++, buf += bp->rsize) {
		bp->buflist[i] = (void *)buf;
	}
	bp->buffree = num - 1;	/* no. of free buffers */
	rbp->bpool = bp;

	return (rbp);
fail:
	if (bp) {
		if (bp->buf)
			kmem_free(bp->buf, bp->bufsize);
		kmem_free(bp, sizeof (bufpool_t) + num * sizeof (void *));
	}
	if (rbp) {
		if (rbp->mr_hdl)
			kmem_free(rbp->mr_hdl, num * sizeof (ibt_mr_hdl_t));
		if (rbp->mr_desc)
			kmem_free(rbp->mr_desc, num * sizeof (ibt_mr_desc_t));
		kmem_free(rbp, sizeof (rib_bufpool_t));
		break;
	case RECV_BUFFER:
		rbp = hca->recv_pool;
		break;
	default:
		return;
	}
	if (rbp == NULL)
		return;

	bp = rbp->bpool;

	/*
	 * Free the pool memory.
	 */
	if (rbp->mr_hdl)
		kmem_free(rbp->mr_hdl, bp->numelems * sizeof (ibt_mr_hdl_t));

	if (rbp->mr_desc)
		kmem_free(rbp->mr_desc, bp->numelems * sizeof (ibt_mr_desc_t));

	if (bp->buf)
		kmem_free(bp->buf, bp->bufsize);
	mutex_destroy(&bp->buflock);
	kmem_free(bp, sizeof (bufpool_t) + bp->numelems * sizeof (void *));
	kmem_free(rbp, sizeof (rib_bufpool_t));
}

void
rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
{
	/*
	 * Deregister the pool memory and free it.
	 */
	rib_rbufpool_deregister(hca, ptype);
	rib_rbufpool_free(hca, ptype);
}

/*
 * Fetch a buffer from the pool of type specified in rdbuf->type.
 */
static rdma_stat
rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
{

	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
	if (rdbuf->addr) {
		switch (rdbuf->type) {
		case SEND_BUFFER:
			rdbuf->len = RPC_MSG_SZ;	/* 1K */
			break;
		case RECV_BUFFER:
			rdbuf->len = RPC_BUF_SIZE;	/* 2K */
			break;
		default:
			rdbuf->len = 0;
		}
		return (RDMA_SUCCESS);
	} else
		return (RDMA_FAILED);
}


/*
 * Fetch a buffer of specified type.
 * Note that rdbuf->handle is mw's rkey.
 */
static void *
rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
{
	rib_qp_t	*qp = ctoqp(conn);
	rib_hca_t	*hca = qp->hca;
	rdma_btype	ptype = rdbuf->type;
	void		*buf;
	rib_bufpool_t	*rbp = NULL;
	bufpool_t	*bp;
	int		i;

	/*
	 * Obtain pool address based on type of pool
	 */
	switch (ptype) {
		return (NULL);

	bp = rbp->bpool;

	mutex_enter(&bp->buflock);
	if (bp->buffree < 0) {
		cmn_err(CE_WARN, "rib_rbuf_alloc: No free buffers!");
		mutex_exit(&bp->buflock);
		return (NULL);
	}

	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
	buf = bp->buflist[bp->buffree];
	rdbuf->addr = buf;
	rdbuf->len = bp->rsize;
	for (i = bp->numelems - 1; i >= 0; i--) {
		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
			rdbuf->handle.mrc_rmr =
			    (uint32_t)rbp->mr_desc[i].md_rkey;
			rdbuf->handle.mrc_linfo = (uintptr_t)rbp->mr_hdl[i];
			rdbuf->handle.mrc_lmr =
			    (uint32_t)rbp->mr_desc[i].md_lkey;
			bp->buffree--;
			if (rib_debug > 1)
				cmn_err(CE_NOTE, "rib_rbuf_alloc: %d free "
				    "bufs (type %d)\n", bp->buffree + 1,
				    ptype);

			mutex_exit(&bp->buflock);

			return (buf);
		}
	}
	cmn_err(CE_WARN, "rib_rbuf_alloc: NO matching buf %p of "
	    "type %d found!", buf, ptype);
	mutex_exit(&bp->buflock);

	return (NULL);
}

static void
rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
{

	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
	kmem_free(hca->svc_scq, sizeof (rib_cq_t));

	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	if (hca->srv_conn_list.conn_hd == NULL &&
	    hca->cl_conn_list.conn_hd == NULL) {
		/*
		 * conn_lists are NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_rbufpool_destroy(hca, RECV_BUFFER);
		rib_rbufpool_destroy(hca, SEND_BUFFER);
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
		(void) ibt_close_hca(hca->hca_hdl);
		hca->hca_hdl = NULL;
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	rw_exit(&hca->srv_conn_list.conn_lock);

	if (hca->hca_hdl != NULL) {
		mutex_enter(&hca->inuse_lock);
		while (hca->inuse)
			cv_wait(&hca->cb_cv, &hca->inuse_lock);
		mutex_exit(&hca->inuse_lock);
		/*
		 * conn_lists are now NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_rbufpool_destroy(hca, RECV_BUFFER);
		rib_rbufpool_destroy(hca, SEND_BUFFER);
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
		(void) ibt_close_hca(hca->hca_hdl);
		hca->hca_hdl = NULL;
	}
}
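/*
 * Illustrative aside, not part of the sdiff: the teardown above is a
 * two-phase handshake -- close immediately when both connection lists
 * are already empty, otherwise block on cb_cv until the last
 * disconnect drops hca->inuse. A minimal sketch of that handshake with
 * hypothetical names (resource_t, resource_release,
 * resource_destroy_when_idle):
 */
typedef struct resource {
	kmutex_t	r_lock;
	kcondvar_t	r_cv;
	int		r_inuse;
} resource_t;

/* Called by the last user: mark the resource idle and wake the waiter. */
static void
resource_release(resource_t *r)
{
	mutex_enter(&r->r_lock);
	r->r_inuse = 0;
	cv_signal(&r->r_cv);
	mutex_exit(&r->r_lock);
}

/* Teardown path: wait until the resource goes idle, then tear down. */
static void
resource_destroy_when_idle(resource_t *r)
{
	mutex_enter(&r->r_lock);
	while (r->r_inuse)
		cv_wait(&r->r_cv, &r->r_lock);
	mutex_exit(&r->r_lock);
	/* ... destroy buffer pools, free the PD, close the HCA ... */
}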


7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 28 /* Copyright (c) 2006, The Ohio State University. All rights reserved. 29 * 30 * Portions of this source code is developed by the team members of 31 * The Ohio State University's Network-Based Computing Laboratory (NBCL), 32 * headed by Professor Dhabaleswar K. (DK) Panda. 33 * 34 * Acknowledgements to contributions from developors: 35 * Ranjit Noronha: noronha@cse.ohio-state.edu 36 * Lei Chai : chail@cse.ohio-state.edu 37 * Weikuan Yu : yuw@cse.ohio-state.edu 38 * 39 */ 40 41 #pragma ident "@(#)rpcib.c 1.29 06/01/25 SMI" 42 43 /* 44 * The rpcib plugin. Implements the interface for RDMATF's 45 * interaction with IBTF. 46 */ 47 48 #include <sys/param.h> 49 #include <sys/types.h> 50 #include <sys/user.h> 51 #include <sys/systm.h> 52 #include <sys/sysmacros.h> 53 #include <sys/proc.h> 54 #include <sys/socket.h> 55 #include <sys/file.h> 56 #include <sys/stream.h> 57 #include <sys/strsubr.h> 58 #include <sys/stropts.h> 59 #include <sys/errno.h> 60 #include <sys/kmem.h> 61 #include <sys/debug.h> 62 #include <sys/systm.h> 63 #include <sys/pathname.h> 64 #include <sys/kstat.h> 65 #include <sys/t_lock.h> 66 #include <sys/ddi.h> 67 #include <sys/cmn_err.h> 68 #include <sys/time.h> 69 #include <sys/isa_defs.h> 70 #include <sys/callb.h> 71 #include <sys/sunddi.h> 72 #include <sys/sunndi.h> 73 74 /* #define IB_FMR_SUP */ 75 /* #define CLNT_POLL_CQ */ 76 #include <sys/ib/ibtl/ibti.h> 77 #include <rpc/rpc.h> 78 #include <rpc/ib.h> 79 80 #include <sys/modctl.h> 81 82 #include <sys/pathname.h> 83 #include <sys/kstr.h> 84 #include <sys/sockio.h> 85 #include <sys/vnode.h> 86 #include <sys/tiuser.h> 87 #include <net/if.h> 88 #include <sys/cred.h> 89 #include <rpc/rpc_rdma.h> 90 91 int num_clients = 0; 92 volatile uint32_t is_server = 0; 93 94 extern char *inet_ntop(int, const void *, char *, int); 95 96 97 /* 98 * Prototype declarations for driver ops 99 */ 100 101 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t); 102 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, 103 void *, void **); 104 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t); 105 106 107 /* rpcib cb_ops */ 108 static struct cb_ops rpcib_cbops = { 109 nulldev, /* open */ 110 nulldev, /* close */ 111 nodev, /* strategy */ 112 nodev, /* print */ 113 nodev, /* dump */ 114 nodev, /* read */ 115 nodev, /* write */ 116 nodev, /* ioctl */ 117 nodev, /* devmap */ 118 nodev, /* mmap */ 119 nodev, /* segmap */ 120 nochpoll, /* poll */ 121 ddi_prop_op, /* prop_op */ 122 NULL, /* stream */ 123 D_MP, /* cb_flag */ 124 CB_REV, /* rev */ 125 nodev, /* int (*cb_aread)() */ 126 nodev /* int (*cb_awrite)() */ 127 }; 128 129 130 131 132 /* 133 * Device options 134 */ 135 static struct dev_ops rpcib_ops = { 136 DEVO_REV, /* devo_rev, */ 137 0, /* refcnt */ 138 rpcib_getinfo, /* info */ 139 
nulldev, /* identify */ 140 nulldev, /* probe */ 141 rpcib_attach, /* attach */ 142 rpcib_detach, /* detach */ 143 nodev, /* reset */ 144 &rpcib_cbops, /* driver ops - devctl interfaces */ 145 NULL, /* bus operations */ 146 NULL /* power */ 147 }; 148 149 /* 150 * Module linkage information. 151 */ 152 153 static struct modldrv rib_modldrv = { 154 &mod_driverops, /* Driver module */ 155 "RPCIB plugin driver, ver 1.29", /* Driver name and version */ 156 &rpcib_ops, /* Driver ops */ 157 }; 158 159 static struct modlinkage rib_modlinkage = { 160 MODREV_1, 161 (void *)&rib_modldrv, 162 NULL 163 }; 164 165 #ifdef SERVER_REG_CACHE 166 typedef struct cache_struct { 167 avl_node_t avl_link; 168 rib_lrc_entry_t r; 169 uint32_t len; 170 uint32_t elements; 171 kmutex_t node_lock; 172 } cache_avl_struct_t; 173 174 175 #if 1 176 int rib_total_buffers = 0; 177 #endif 178 #endif 179 /* 180 * rib_stat: private data pointer used when registering 181 * with the IBTF. It is returned to the consumer 182 * in all callbacks. 183 */ 184 static rpcib_state_t *rib_stat = NULL; 185 186 #define RNR_RETRIES IBT_RNR_INFINITE_RETRY 187 #define MAX_PORTS 2 188 189 #ifdef IB_FMR_SUP 190 #define IB_FMR_DIRTY_MARK 32 191 #define IB_FMR_MAX_SIZE 1048576 192 /*#define IB_FMR_MAX_SIZE 32768 */ 193 #endif 194 195 int preposted_rbufs = RDMA_BUFS_GRANT; 196 int send_threshold = 1; 197 198 /* 199 * State of the plugin. 200 * ACCEPT = accepting new connections and requests. 201 * NO_ACCEPT = not accepting new connection and requests. 202 * This should eventually move to rpcib_state_t structure, since this 203 * will tell in which state the plugin is for a particular type of service 204 * like NFS, NLM or v4 Callback deamon. The plugin might be in accept 205 * state for one and in no_accept state for the other. 
206 */ 207 int plugin_state; 208 kmutex_t plugin_state_lock; 209 210 211 /* 212 * RPCIB RDMATF operations 213 */ 214 #if defined(MEASURE_POOL_DEPTH) 215 static void rib_posted_rbufs(uint32_t x) { return;} 216 #endif 217 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle); 218 static rdma_stat rib_disconnect(CONN *conn); 219 static void rib_listen(struct rdma_svc_data *rd); 220 static void rib_listen_stop(struct rdma_svc_data *rd); 221 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 222 struct mrc *buf_handle); 223 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf, 224 struct mrc buf_handle); 225 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, 226 caddr_t buf, uint_t buflen, struct mrc *buf_handle); 227 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, 228 struct mrc buf_handle); 229 #ifdef SERVER_REG_CACHE 230 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 231 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc); 232 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, 233 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *); 234 #else 235 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 236 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle); 237 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, 238 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle); 239 240 #endif 241 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, 242 caddr_t buf, int len, int cpu); 243 244 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf); 245 246 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf); 247 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *); 248 249 static void rib_rbuf_free(CONN *conn, int ptype, void *buf); 250 251 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid); 252 #if defined (CLNT_INTERRUPT_COAL) 253 static void rib_scq_free(caddr_t); 254 static rdma_stat rib_send_bl(CONN *conn, struct clist *cl, uint32_t msgid); 255 #endif 256 #if defined(ASYNC_SERVER_DEREG) 257 static rdma_stat rib_send_nw(CONN *conn, struct clist *cl, uint32_t msgid, caddr_t, caddr_t, int, caddr_t, int, int, int); 258 #endif 259 #if defined(ASYNC_CLIENT_DEREG) 260 static void insert_queue(CONN *conn, struct clist *rwc); 261 #endif 262 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid); 263 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid); 264 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl); 265 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid); 266 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait); 267 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait); 268 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **); 269 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **); 270 static rdma_stat rib_conn_release(CONN *conn); 271 static rdma_stat rib_getinfo(rdma_info_t *info); 272 #ifdef DYNAMIC_CREDIT_CONTROL 273 void rib_get_resource_info(CONN *, int *, int *); 274 #endif 275 276 #ifdef SERVER_REG_CACHE 277 static rib_lrc_entry_t *rib_get_server_cache_buf(CONN *conn, uint32_t len); 278 static void rib_free_server_cache_buf(CONN *conn, rib_lrc_entry_t *buf); 279 static void rib_destroy_cache(rib_hca_t *hca); 280 static void 281 
rib_server_side_cache_reclaim(void *argp); 282 static int avl_compare(const void *t1,const void *t2); 283 #endif 284 285 static rdma_stat rib_register_ats(rib_hca_t *); 286 static void rib_deregister_ats(); 287 static void rib_stop_services(rib_hca_t *); 288 289 /* 290 * RPCIB addressing operations 291 */ 292 char ** get_ip_addrs(int *count); 293 int get_interfaces(TIUSER *tiptr, int *num); 294 int find_addrs(TIUSER *tiptr, char **addrs, int num_ifs); 295 int get_ibd_ipaddr(rpcib_ibd_insts_t *); 296 rpcib_ats_t *get_ibd_entry(ib_gid_t *, ib_pkey_t, rpcib_ibd_insts_t *); 297 void rib_get_ibd_insts(rpcib_ibd_insts_t *); 298 #if defined(ASYNC_SERVER_DEREG)||defined(ASYNC_CLIENT_DEREG) 299 static int clist_deregister1(CONN *, struct clist *, bool_t ); 300 #endif 301 302 #if defined(ASYNC_CLIENT_DEREG) 303 typedef struct async_dereg { 304 struct async_dereg *forw; 305 struct async_dereg *back; 306 CONN c_conn; 307 struct clist c_clist; 308 } ASYNC; 309 static void async_dereg_thread(caddr_t arg); 310 extern pri_t minclsyspri; /* priority for taskq */ 311 static ASYNC rqueue; 312 static kmutex_t at_mutex; 313 static kcondvar_t at_cond; 314 #endif 315 /* 316 * RDMA operations the RPCIB module exports 317 */ 318 static rdmaops_t rib_ops = { 319 rib_reachable, 320 rib_conn_get, 321 rib_conn_release, 322 rib_listen, 323 rib_listen_stop, 324 rib_registermem, 325 rib_deregistermem, 326 rib_registermemsync, 327 rib_deregistermemsync, 328 rib_syncmem, 329 rib_reg_buf_alloc, 330 rib_reg_buf_free, 331 rib_send, 332 #if defined (CLNT_INTERRUPT_COAL) 333 rib_send_bl, 334 #endif 335 #if defined(ASYNC_SERVER_DEREG) 336 rib_send_nw, 337 #endif 338 rib_send_resp, 339 rib_post_resp, 340 rib_post_recv, 341 rib_recv, 342 rib_read, 343 rib_write, 344 rib_getinfo, 345 #ifdef SERVER_REG_CACHE 346 rib_get_server_cache_buf, 347 rib_free_server_cache_buf, 348 #endif 349 #ifdef DYNAMIC_CREDIT_CONTROL 350 rib_get_resource_info, 351 #endif 352 #if defined(ASYNC_CLIENT_DEREG) 353 insert_queue, 354 #endif 355 }; 356 357 /* 358 * RDMATF RPCIB plugin details 359 */ 360 static rdma_mod_t rib_mod = { 361 "ibtf", /* api name */ 362 RDMATF_VERS_1, 363 0, 364 &rib_ops, /* rdma op vector for ibtf */ 365 }; 366 367 static rdma_stat open_hcas(rpcib_state_t *); 368 static rdma_stat rib_qp_init(rib_qp_t *, int); 369 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *); 370 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *); 371 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *); 372 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *); 373 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num); 374 #ifdef IB_FMR_SUP 375 static rdma_stat rib_reg_mem_fmr(rib_hca_t *, caddr_t adsp,caddr_t, uint_t, ibt_mr_flags_t, 376 ibt_mr_hdl_t *, ibt_ma_hdl_t *, ibt_pmr_desc_t *); 377 #endif 378 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t, ibt_mr_flags_t, 379 ibt_mr_hdl_t *, ibt_mr_desc_t *); 380 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t, 381 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t); 382 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *); 383 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *, 384 rib_qp_t **); 385 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t, 386 rib_qp_t **); 387 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *); 388 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *); 389 static int rib_free_sendwait(struct send_wid *); 390 static struct 
rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid); 391 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd); 392 static void rdma_done_rem_list(rib_qp_t *); 393 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid); 394 395 static void rib_async_handler(void *, 396 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); 397 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *); 398 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *); 399 static int rib_free_svc_recv(struct svc_recv *); 400 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t); 401 static void rib_free_wid(struct recv_wid *);
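/*
 * Note on work-request ids (a sketch of the mechanism the completion
 * handlers below rely on): each posted send carries a send_wid and each
 * posted receive a recv_wid, and that pointer is stuffed into the work
 * request id, e.g.
 *
 *	tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
 *
 * The CQ handlers recover it from wc.wc_id and use it to locate the
 * waiter, the posted buffers, and the qp the completion belongs to.
 */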
416 };
417
418 /*
419  * Global structure
420  */
421
422 typedef struct rpcib_s {
423 	dev_info_t	*rpcib_dip;
424 	kmutex_t	rpcib_mutex;
425 } rpcib_t;
426
427 rpcib_t rpcib;
428
429 /*
430  * /etc/system controlled variable to control
431  * debugging in the rpcib kernel module.
432  * Set it to values greater than 1 to increase
433  * the amount of debugging messages printed.
434  */
435 int rib_debug = 0;
436 #if defined(CLNT_POLL_CQ)
437 int max_poll_count = 500;
438 #endif
439 static int ats_running = 0;
440
441
442 int
443 _init(void)
444 {
445 	int error;
446
447 	error = mod_install((struct modlinkage *)&rib_modlinkage);
448 	if (error != 0) {
449 		/*
450 		 * Could not load module
451 		 */
452 		return (error);
453 	}
454 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
455
456 	return (0);
457 }
458
459 int
460 _fini()
461 {
677 " ATS service: %s", 678 to_remove->srv_name); 679 } 680 #endif 681 } 682 kmem_free(to_remove, sizeof (rib_service_t)); 683 } 684 hca->ats_list = NULL; 685 rw_exit(&hca->service_list_lock); 686 } 687 688 static void rib_rbufpool_free(rib_hca_t *, int); 689 static void rib_rbufpool_deregister(rib_hca_t *, int); 690 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 691 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 692 static rdma_stat rib_rem_replylist(rib_qp_t *); 693 static int rib_remreply(rib_qp_t *, struct reply *); 694 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 695 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 696 697 698 /* 699 * One CQ pair per HCA 700 */ 701 static rdma_stat 702 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 703 rib_cq_t **cqp, rpcib_state_t *ribstat) 704 { 705 rib_cq_t *cq; 706 ibt_cq_attr_t cq_attr; 707 uint32_t real_size; 708 ibt_status_t status; 709 rdma_stat error = RDMA_SUCCESS; 710 711 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 712 cq->rib_hca = hca; 713 cq_attr.cq_size = cq_size; 714 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 715 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 716 &real_size); 717 if (status != IBT_SUCCESS) {
738 739 return (error); 740 fail: 741 if (cq->rib_cq_hdl) 742 (void) ibt_free_cq(cq->rib_cq_hdl); 743 if (cq) 744 kmem_free(cq, sizeof (rib_cq_t)); 745 return (error); 746 } 747 748 static rdma_stat 749 open_hcas(rpcib_state_t *ribstat) 750 { 751 rib_hca_t *hca; 752 ibt_status_t ibt_status; 753 rdma_stat status; 754 ibt_hca_portinfo_t *pinfop; 755 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 756 uint_t size, cq_size; 757 int i; 758 #ifdef IB_FMR_SUP 759 ibt_fmr_pool_attr_t fmr_attr; 760 uint_t h_page_sz; 761 #endif 762 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 763 if (ribstat->hcas == NULL) 764 ribstat->hcas = kmem_zalloc(ribstat->hca_count * 765 sizeof (rib_hca_t), KM_SLEEP); 766 767 /* 768 * Open a hca and setup for RDMA 769 */ 770 for (i = 0; i < ribstat->hca_count; i++) { 771 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 772 ribstat->hca_guids[i], 773 &ribstat->hcas[i].hca_hdl); 774 if (ibt_status != IBT_SUCCESS) { 775 cmn_err(CE_WARN, "open_hcas: ibt_open_hca (%d) " 776 "returned %d", i, ibt_status); 777 continue; 778 } 779 ribstat->hcas[i].hca_guid = ribstat->hca_guids[i]; 780 hca = &(ribstat->hcas[i]); 781 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
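	/*
	 * From here on, each HCA that opened successfully is set up
	 * (ports, PD, CQs, buffer pools); an ibt_open_hca() failure
	 * above only skips that HCA and the loop moves on to the
	 * next GUID.
	 */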
854 	}
855
856 	/*
857 	 * Create buffer pools.
858 	 * Note rib_rbufpool_create also allocates memory windows.
859 	 */
860 	hca->recv_pool = rib_rbufpool_create(hca,
861 	    RECV_BUFFER, MAX_BUFS);
862 	if (hca->recv_pool == NULL) {
863 		cmn_err(CE_WARN, "open_hcas: recv buf pool failed\n");
864 		goto fail3;
865 	}
866
867 	hca->send_pool = rib_rbufpool_create(hca,
868 	    SEND_BUFFER, MAX_BUFS);
869 	if (hca->send_pool == NULL) {
870 		cmn_err(CE_WARN, "open_hcas: send buf pool failed\n");
871 		rib_rbufpool_destroy(hca, RECV_BUFFER);
872 		goto fail3;
873 	}
874 #ifdef IB_FMR_SUP
875 	/* Global FMR POOL */
876 	bzero(&fmr_attr, sizeof (ibt_fmr_pool_attr_t));
877
878 	h_page_sz = hca->hca_attrs.hca_page_sz * 1024;
879
880 	fmr_attr.fmr_max_pages_per_fmr =
881 	    (IB_FMR_MAX_SIZE / h_page_sz) + 2;
882 	fmr_attr.fmr_pool_size = MAX_BUFS * 2;
883 	fmr_attr.fmr_dirty_watermark = IB_FMR_DIRTY_MARK;
884 	fmr_attr.fmr_page_sz = h_page_sz;
885 	fmr_attr.fmr_cache = B_FALSE;
886 	fmr_attr.fmr_flags = IBT_MR_SLEEP |
887 	    IBT_MR_ENABLE_LOCAL_WRITE |
888 	    IBT_MR_ENABLE_REMOTE_READ |
889 	    IBT_MR_ENABLE_REMOTE_WRITE;
890 	fmr_attr.fmr_func_hdlr = NULL;
891
892 	if (rib_debug > 1) {
893 		cmn_err(CE_NOTE, "open_hcas: ibt_create_fmr_pool:");
894 		cmn_err(CE_NOTE, "fmr_page_sz %d, fmr_pool_sz %d, "
895 		    "max_pages_per_fmr %d", fmr_attr.fmr_page_sz,
896 		    fmr_attr.fmr_pool_size,
897 		    fmr_attr.fmr_max_pages_per_fmr);
898 	}
899
900 	ibt_status = ibt_create_fmr_pool(hca->hca_hdl, hca->pd_hdl,
901 	    &fmr_attr, &hca->fmr_pool);
902 	if (ibt_status != IBT_SUCCESS) {
903 		cmn_err(CE_WARN, "open_hcas: Global FMR pool creation "
904 		    "failed: %d\n", ibt_status);
905 		rib_rbufpool_destroy(hca, RECV_BUFFER);
906 		rib_rbufpool_destroy(hca, SEND_BUFFER);
907 		goto fail3;
908 	}
909 #endif
910 #ifdef SERVER_REG_CACHE
911 	cmn_err(CE_NOTE, "Registration Cache enabled\n");
912 	{
914 		hca->server_side_cache =
915 		    kmem_cache_create("rib_server_side_cache",
916 			sizeof (cache_avl_struct_t), 0,
917 			NULL,
918 			NULL,
919 			rib_server_side_cache_reclaim,
920 			hca, NULL, 0);
921 		avl_create(&hca->avl_tree,
922 		    avl_compare,
923 		    sizeof (cache_avl_struct_t),
924 		    offsetof(cache_avl_struct_t, avl_link));
926 		rw_init(&hca->avl_rw_lock, NULL, RW_DRIVER, hca->iblock);
927 		hca->avl_init = TRUE;
928
929 	}
930 #endif
931
932 #if defined(ASYNC_CLIENT_DEREG)
933 	rqueue.forw = rqueue.back = &rqueue;
934 	mutex_init(&at_mutex, NULL, MUTEX_DEFAULT, NULL);
935 	cv_init(&at_cond, NULL, CV_DEFAULT, NULL);
936 	(void) thread_create(NULL, 0, async_dereg_thread, NULL, 0, &p0,
937 	    TS_RUN, minclsyspri);
938 #endif
939 	/*
940 	 * Initialize the registered service list and
941 	 * the lock
942 	 */
943 	hca->service_list = NULL;
944 	rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);
945
946 	mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
947 	cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
948 	rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
949 	    hca->iblock);
950 	rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
951 	    hca->iblock);
952 	rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
953 	mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
954 	hca->inuse = TRUE;
955 	/*
956 	 * XXX One hca only. Add multi-hca functionality if needed
957 	 * later.
958 	 */
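/*
 * Sizing note for the IB_FMR_SUP block above (illustrative figures, not
 * taken from this file): hca_page_sz is reported in KB, hence the
 * multiplication by 1024. If, say, IB_FMR_MAX_SIZE were 1 MB on an HCA
 * with 4 KB pages, fmr_max_pages_per_fmr would be
 * (1048576 / 4096) + 2 = 258; the + 2 allows for a region whose start
 * and end are not page aligned.
 */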
1060 			 * Notify poster
1061 			 */
1062 			cv_signal(&wd->wait_cv);
1063 			mutex_exit(&wd->sendwait_lock);
1064 		} else {
1065 			/*
1066 			 * Poster not waiting for notification.
1067 			 * Free the send buffers and send_wid
1068 			 */
1069 			for (i = 0; i < wd->nsbufs; i++) {
1070 				rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
1071 				    (void *)(uintptr_t)wd->sbufaddr[i]);
1072 			}
1073 			mutex_exit(&wd->sendwait_lock);
1074 			(void) rib_free_sendwait(wd);
1075 			}
1076 		}
1077 	}
1078 }
1079
1080 #if defined (CLNT_INTERRUPT_COAL)
1081 static void
1082 rib_scq_free(caddr_t widd)
1083 {
1084 	struct send_wid *wd = (struct send_wid *)widd;
1086 	ibt_wc_t wc;
1087 	int i;
1088 	CONN *conn = qptoc(wd->qp);

	/* No completion was polled for this wid; treat it as successful. */
1090 	wc.wc_status = IBT_WC_SUCCESS;
1091 	mutex_enter(&wd->sendwait_lock);
1092 	switch (wc.wc_status) {
1093 	case IBT_WC_SUCCESS:
1094 		wd->status = RDMA_SUCCESS;
1095 		break;
1096 	case IBT_WC_WR_FLUSHED_ERR:
1097 		wd->status = RDMA_FAILED;
1098 		break;
1099 	default:
1100 		/*
1101 		 * RC Send Q Error Code		Local state	Remote State
1102 		 * ====================		===========	============
1103 		 * IBT_WC_BAD_RESPONSE_ERR	ERROR		None
1104 		 * IBT_WC_LOCAL_LEN_ERR		ERROR		None
1105 		 * IBT_WC_LOCAL_CHAN_OP_ERR	ERROR		None
1106 		 * IBT_WC_LOCAL_PROTECT_ERR	ERROR		None
1107 		 * IBT_WC_MEM_WIN_BIND_ERR	ERROR		None
1108 		 * IBT_WC_REMOTE_INVALID_REQ_ERR	ERROR	ERROR
1109 		 * IBT_WC_REMOTE_ACCESS_ERR	ERROR		ERROR
1110 		 * IBT_WC_REMOTE_OP_ERR		ERROR		ERROR
1111 		 * IBT_WC_RNR_NAK_TIMEOUT_ERR	ERROR		None
1112 		 * IBT_WC_TRANS_TIMEOUT_ERR	ERROR		None
1113 		 * IBT_WC_WR_FLUSHED_ERR	None		None
1114 		 */
1115 #ifdef DEBUG
1116 		if (rib_debug > 1) {
1117 			if (wc.wc_status != IBT_WC_SUCCESS) {
1118 				cmn_err(CE_NOTE, "rib_scq_free: "
1119 				    "WR completed in error, wc.wc_status:%d, "
1120 				    "wc_id:%llx\n", wc.wc_status,
				    (longlong_t)wc.wc_id);
1121 			}
1122 		}
1123 #endif
1124 		/*
1125 		 * Channel in error state. Set connection to
1126 		 * ERROR and cleanup will happen either from
1127 		 * conn_release or from rib_conn_get
1128 		 */
1129 		wd->status = RDMA_FAILED;
1130 		mutex_enter(&conn->c_lock);
1131 		if (conn->c_state != C_DISCONN_PEND)
1132 			conn->c_state = C_ERROR;
1133 		mutex_exit(&conn->c_lock);
1134 		break;
1135 	}
1136 	if (wd->cv_sig == 1) {
1137 		/*
1138 		 * Notify poster
1139 		 */
1141 		cv_signal(&wd->wait_cv);
1142 		mutex_exit(&wd->sendwait_lock);
1143 	} else {
1144 		/*
1145 		 * Poster not waiting for notification.
1146 		 * Free the send buffers and send_wid
1147 		 */
1148 		for (i = 0; i < wd->nsbufs; i++) {
1149 			rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
1150 			    (void *)(uintptr_t)wd->sbufaddr[i]);
1151 		}
1152 		mutex_exit(&wd->sendwait_lock);
1153 		(void) rib_free_sendwait(wd);
1154 	}
1155 }
1156 #endif
1157
1158 /* ARGSUSED */
1159 static void
1160 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1161 {
1162 	ibt_status_t ibt_status;
1163 	ibt_wc_t wc;
1164 	int i;
1165
1166 	/*
1167 	 * Re-enable cq notify here to avoid missing any
1168 	 * completion queue notification.
1169 	 */
1170 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1171
1172 	ibt_status = IBT_SUCCESS;
1173 	while (ibt_status != IBT_CQ_EMPTY) {
1174 		bzero(&wc, sizeof (wc));
1175 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1176 		if (ibt_status != IBT_SUCCESS)
1177 			return;
1178
1179 		/*
1180 		 * Got a send completion
1181 		 */
1182 #ifdef DEBUG
1183 		if (rib_debug > 1 && wc.wc_status != IBT_WC_SUCCESS) {
1184 			cmn_err(CE_NOTE, "rib_svc_scq_handler: WR completed "
1185 			    "in error wc.wc_status:%d, wc_id:%llX",
1186 			    wc.wc_status, (longlong_t)wc.wc_id);
1187 		}
1188 #endif
1189 		if (wc.wc_id != 0) {	/* XXX can a zero wr_id occur? */
1190 			struct send_wid *wd =
			    (struct send_wid *)(uintptr_t)wc.wc_id;
1191 #ifdef ASYNC_SERVER_DEREG
1192 			if (wd->c1) {
1193 				(void) clist_deregister1((CONN *)wd->c,
				    (struct clist *)wd->c1, TRUE);
1194 #ifdef SERVER_REG_CACHE
1195 				RDMA_FREE_SERVER_CACHE_BUF((CONN *)wd->c,
				    (rib_lrc_entry_t *)(((struct clist *)
				    wd->c1)->long_reply_buf));
1196 #else
1197 				if (wd->l1)
1198 					kmem_free((void *)(wd->c1)->c_saddr,
					    wd->l1);
1199 #endif
1200 				kmem_free((void *)(wd->c1),
				    wd->wl * sizeof (struct clist));
1201 			}
1202 			if (wd->c2) {
1203 				(void) clist_deregister1((CONN *)wd->c,
				    (struct clist *)wd->c2, TRUE);
1204 #ifdef SERVER_REG_CACHE
1205 				RDMA_FREE_SERVER_CACHE_BUF((CONN *)wd->c,
				    (rib_lrc_entry_t *)(((struct clist *)
				    wd->c2)->long_reply_buf));
1206 #else
1207 				if (wd->l2)
1208 					kmem_free((void *)(wd->c2)->c_saddr,
					    wd->l2);
1209 #endif
1210 				kmem_free((void *)(wd->c2),
				    wd->rl * sizeof (struct clist));
1211 			}
1212 #endif
1213 			mutex_enter(&wd->sendwait_lock);
1214 			if (wd->cv_sig == 1) {
1215 				/*
1216 				 * Update completion status and notify poster
1217 				 */
1218 				if (wc.wc_status == IBT_WC_SUCCESS)
1219 					wd->status = RDMA_SUCCESS;
1220 				else
1221 					wd->status = RDMA_FAILED;
1222 				cv_signal(&wd->wait_cv);
1223 				mutex_exit(&wd->sendwait_lock);
1224 			} else {
1225 				/*
1226 				 * Poster not waiting for notification.
1227 				 * Free the send buffers and send_wid
1228 				 */
1229 				for (i = 0; i < wd->nsbufs; i++) {
1230 					rib_rbuf_free(qptoc(wd->qp),
					    SEND_BUFFER,
1231 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1232 				}
1233 				mutex_exit(&wd->sendwait_lock);
1234 				(void) rib_free_sendwait(wd);
1235 			}
1236 		}
1237 	}
1238 }
1239
1240 /*
1241  * RCQ handler
1242  */
1243 /* ARGSUSED */
1244 static void
1245 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1246 {
1247 	rib_qp_t *qp;
1248 	ibt_status_t ibt_status;
1249 	ibt_wc_t wc;
1250 	struct recv_wid *rwid;
1251 #if defined(CLNT_POLL_CQ)
1252 	uint32_t count = 0;
1253 #endif
1254
1255 	/*
1256 	 * Re-enable cq notify here to avoid missing any
1257 	 * completion queue notification.
1258 */ 1259 #if !defined(CLNT_POLL_CQ) 1260 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1261 #endif 1262 1263 ibt_status = IBT_SUCCESS; 1264 while (ibt_status != IBT_CQ_EMPTY) { 1265 #if defined(CLNT_POLL_CQ) 1266 poll_cq_again: 1267 #endif 1268 bzero(&wc, sizeof (wc)); 1269 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1270 #if defined(CLNT_POLL_CQ) 1271 if (ibt_status == IBT_CQ_EMPTY){ 1272 count ++; 1273 if(count == max_poll_count){ 1274 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1275 return; 1276 } 1277 goto poll_cq_again; 1278 } 1279 #endif 1280 if (ibt_status != IBT_SUCCESS) 1281 #if defined(CLNT_POLL_CQ) 1282 { 1283 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1284 #endif 1285 return; 1286 #if defined(CLNT_POLL_CQ) 1287 } 1288 count = 0; 1289 #endif 1290 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; 1291 qp = rwid->qp; 1292 if (wc.wc_status == IBT_WC_SUCCESS) { 1293 XDR inxdrs, *xdrs; 1294 uint_t xid, vers, op, find_xid = 0; 1295 struct reply *r; 1296 CONN *conn = qptoc(qp); 1297 uint32_t rdma_credit = 0; 1298 1299 xdrs = &inxdrs; 1300 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, 1301 wc.wc_bytes_xfer, XDR_DECODE); 1302 /* 1303 * Treat xid as opaque (xid is the first entity 1304 * in the rpc rdma message). 1305 */ 1306 xid = *(uint32_t *)(uintptr_t)rwid->addr; 1307 /* Skip xid and set the xdr position accordingly. */ 1308 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1309 (void) xdr_u_int(xdrs, &vers); 1310 (void) xdr_u_int(xdrs, &rdma_credit); 1311 (void) xdr_u_int(xdrs, &op); 1312 XDR_DESTROY(xdrs); 1313 if (vers != RPCRDMA_VERS) { 1314 /* 1315 * Invalid RPC/RDMA version. Cannot interoperate. 1316 * Set connection to ERROR state and bail out. 1317 */ 1318 mutex_enter(&conn->c_lock); 1319 if (conn->c_state != C_DISCONN_PEND) 1320 conn->c_state = C_ERROR; 1321 mutex_exit(&conn->c_lock); 1322 rib_rbuf_free(conn, RECV_BUFFER, 1323 (void *)(uintptr_t)rwid->addr); 1324 rib_free_wid(rwid); 1325 continue; 1326 } 1327 1328 mutex_enter(&qp->replylist_lock); 1329 for (r = qp->replylist; r != NULL; r = r->next) { 1330 if (r->xid == xid) {
1408 mblk_t *mp; 1409 1410 /* 1411 * Re-enable cq notify here to avoid missing any 1412 * completion queue notification. 1413 */ 1414 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1415 1416 ibt_status = IBT_SUCCESS; 1417 while (ibt_status != IBT_CQ_EMPTY) { 1418 bzero(&wc, sizeof (wc)); 1419 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1420 if (ibt_status != IBT_SUCCESS) 1421 return; 1422 1423 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id; 1424 qp = s_recvp->qp; 1425 conn = qptoc(qp); 1426 mutex_enter(&qp->posted_rbufs_lock); 1427 qp->n_posted_rbufs--; 1428 #if defined(MEASURE_POOL_DEPTH) 1429 rib_posted_rbufs(preposted_rbufs - qp->n_posted_rbufs); 1430 #endif 1431 if (qp->n_posted_rbufs == 0) 1432 cv_signal(&qp->posted_rbufs_cv); 1433 mutex_exit(&qp->posted_rbufs_lock); 1434 1435 if (wc.wc_status == IBT_WC_SUCCESS) { 1436 XDR inxdrs, *xdrs; 1437 uint_t xid, vers, op; 1438 uint32_t rdma_credit; 1439 1440 xdrs = &inxdrs; 1441 /* s_recvp->vaddr stores data */ 1442 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr, 1443 wc.wc_bytes_xfer, XDR_DECODE); 1444 1445 /* 1446 * Treat xid as opaque (xid is the first entity 1447 * in the rpc rdma message). 1448 */ 1449 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr; 1450 /* Skip xid and set the xdr position accordingly. */ 1451 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1452 if (!xdr_u_int(xdrs, &vers) || 1453 !xdr_u_int(xdrs, &rdma_credit) || 1454 !xdr_u_int(xdrs, &op)) { 1455 rib_rbuf_free(conn, RECV_BUFFER, 1456 (void *)(uintptr_t)s_recvp->vaddr); 1457 XDR_DESTROY(xdrs); 1458 #ifdef DEBUG 1459 cmn_err(CE_NOTE, "rib_svc_rcq_handler: " 1460 "xdr_u_int failed for qp %p, wc_id=%llx", 1461 (void *)qp, (longlong_t)wc.wc_id); 1462 #endif 1463 (void) rib_free_svc_recv(s_recvp); 1464 continue; 1465 } 1466 XDR_DESTROY(xdrs); 1467 1468 if (vers != RPCRDMA_VERS) { 1469 /* 1470 * Invalid RPC/RDMA version. Drop rpc rdma message. 1471 */ 1472 rib_rbuf_free(conn, RECV_BUFFER, 1473 (void *)(uintptr_t)s_recvp->vaddr);
1643 return (RDMA_FAILED); 1644 } 1645 } else { 1646 mutex_exit(&rib_stat->open_hca_lock); 1647 return (RDMA_SUCCESS); 1648 } 1649 } else { 1650 *handle = NULL; 1651 if (rib_debug > 2) 1652 cmn_err(CE_WARN, "rib_reachable(): ping_srv failed.\n"); 1653 return (RDMA_FAILED); 1654 } 1655 } 1656 1657 /* Client side qp creation */ 1658 static rdma_stat 1659 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1660 { 1661 rib_qp_t *kqp = NULL; 1662 CONN *conn; 1663 rdma_clnt_cred_ctrl_t *cc_info; 1664 1665 ASSERT(qp != NULL); 1666 *qp = NULL; 1667 1668 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1669 conn = qptoc(kqp); 1670 kqp->hca = hca; 1671 kqp->rdmaconn.c_rdmamod = &rib_mod; 1672 kqp->rdmaconn.c_private = (caddr_t)kqp; 1673 1674 kqp->mode = RIB_CLIENT; 1675 kqp->chan_flags = IBT_BLOCKING; 1676 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1677 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1678 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1679 1680 /* 1681 * Initialize 1682 */ 1683 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1684 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1685 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1686 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1687 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1688 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1689 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1690 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1691 #if defined (CLNT_INTERRUPT_COAL) 1692 kqp->rdmaconn.c_count = 0; 1693 conn->c_count = 0; 1694 bzero(&kqp->wd, sizeof(struct send_wid)); 1695 kqp->wd.forw = kqp->wd.back = &kqp->wd; 1696 #endif 1697 /* 1698 * Initialize the client credit control 1699 * portion of the rdmaconn struct. 1700 */ 1701 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; 1702 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1703 cc_info->clnt_cc_granted_ops = 0; 1704 cc_info->clnt_cc_in_flight_ops = 0; 1705 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); 1706 1707 *qp = kqp; 1708 return (RDMA_SUCCESS); 1709 } 1710 1711 /* Server side qp creation */ 1712 static rdma_stat 1713 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1714 { 1715 rib_qp_t *kqp = NULL; 1716 ibt_chan_sizes_t chan_sizes; 1717 ibt_rc_chan_alloc_args_t qp_attr; 1718 ibt_status_t ibt_status; 1719 rdma_srv_cred_ctrl_t *cc_info; 1720 1721 ASSERT(qp != NULL); 1722 *qp = NULL; 1723 1724 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1725 kqp->hca = hca; 1726 kqp->port_num = port; 1727 kqp->rdmaconn.c_rdmamod = &rib_mod; 1728 kqp->rdmaconn.c_private = (caddr_t)kqp; 1729 1730 /* 1731 * Create the qp handle 1732 */ 1733 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1734 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1735 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1736 qp_attr.rc_pd = hca->pd_hdl; 1737 qp_attr.rc_hca_port_num = port; 1738 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1739 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
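	/*
	 * cs_sq_sgl bounds the scatter/gather entries per send work
	 * request; rib_send_and_wait() refuses a clist that needs more
	 * than DSEG_MAX segments, so the two must stay in step.
	 */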
1761 goto fail; 1762 } 1763 1764 kqp->mode = RIB_SERVER; 1765 kqp->chan_flags = IBT_BLOCKING; 1766 kqp->q = q; /* server ONLY */ 1767 1768 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1769 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1770 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1771 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1772 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1773 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1774 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1775 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1776 /* 1777 * Set the private data area to qp to be used in callbacks 1778 */ 1779 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1780 kqp->rdmaconn.c_state = C_CONNECTED; 1781 1782 /* 1783 * Initialize the server credit control 1784 * portion of the rdmaconn struct. 1785 */ 1786 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV; 1787 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc; 1788 cc_info->srv_cc_buffers_granted = preposted_rbufs; 1789 cc_info->srv_cc_cur_buffers_used = 0; 1790 cc_info->srv_cc_posted = preposted_rbufs; 1791 1792 *qp = kqp; 1793 1794 num_clients++; 1795 return (RDMA_SUCCESS); 1796 fail: 1797 if (kqp) 1798 kmem_free(kqp, sizeof (rib_qp_t)); 1799 1800 return (RDMA_FAILED); 1801 } 1802 1803 void 1804 rib_dump_pathrec(ibt_path_info_t *path_rec) 1805 { 1806 ib_pkey_t pkey; 1807 1808 if (rib_debug > 1) { 1809 cmn_err(CE_NOTE, "Path Record:\n"); 1810 1811 cmn_err(CE_NOTE, "Source HCA GUID = %llx\n", 1812 (longlong_t)path_rec->pi_hca_guid); 1813 cmn_err(CE_NOTE, "Dest Service ID = %llx\n", 1814 (longlong_t)path_rec->pi_sid);
2057 2058 (void) bzero(&chan_args, sizeof (chan_args)); 2059 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 2060 2061 qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num; 2062 /* Alloc a RC channel */ 2063 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 2064 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 2065 qp_attr.rc_pd = hca->pd_hdl; 2066 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 2067 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 2068 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 2069 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 2070 qp_attr.rc_clone_chan = NULL; 2071 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 2072 qp_attr.rc_flags = IBT_WR_SIGNALED; 2073 2074 chan_args.oc_path = path; 2075 chan_args.oc_cm_handler = rib_clnt_cm_handler; 2076 chan_args.oc_cm_clnt_private = (void *)rib_stat; 2077 chan_args.oc_rdma_ra_out = 4; 2078 chan_args.oc_rdma_ra_in = 4; 2079 chan_args.oc_path_retry_cnt = 2; 2080 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 2081 2082 refresh: 2083 rw_enter(&hca->state_lock, RW_READER); 2084 if (hca->state != HCA_DETACHED) { 2085 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 2086 IBT_ACHAN_NO_FLAGS, &qp_attr, &qp->qp_hdl, 2087 &chan_sizes); 2088 } else { 2089 rw_exit(&hca->state_lock); 2090 return (RDMA_FAILED); 2091 } 2092 rw_exit(&hca->state_lock); 2093 2094 if (ibt_status != IBT_SUCCESS) { 2095 #ifdef DEBUG 2096 cmn_err(CE_WARN, "rib_conn_to_srv: alloc_rc_channel " 2097 "failed, ibt_status=%d.", ibt_status); 2098 #endif
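	/*
	 * Note: the HCA state is sampled under state_lock above so that
	 * the RC channel is never allocated on an HCA already marked
	 * HCA_DETACHED; the refresh: label marks the point from which
	 * channel allocation can be redone with a refreshed path.
	 */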
2235 (void) rib_rem_replylist(qp); 2236 } 2237 2238 cv_destroy(&qp->cb_conn_cv); 2239 cv_destroy(&qp->posted_rbufs_cv); 2240 mutex_destroy(&qp->cb_lock); 2241 2242 mutex_destroy(&qp->replylist_lock); 2243 mutex_destroy(&qp->posted_rbufs_lock); 2244 mutex_destroy(&qp->rdlist_lock); 2245 2246 cv_destroy(&conn->c_cv); 2247 mutex_destroy(&conn->c_lock); 2248 2249 if (conn->c_raddr.buf != NULL) { 2250 kmem_free(conn->c_raddr.buf, conn->c_raddr.len); 2251 } 2252 if (conn->c_laddr.buf != NULL) { 2253 kmem_free(conn->c_laddr.buf, conn->c_laddr.len); 2254 } 2255 2256 /* 2257 * Credit control cleanup. 2258 */ 2259 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { 2260 rdma_clnt_cred_ctrl_t *cc_info; 2261 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 2262 cv_destroy(&cc_info->clnt_cc_cv); 2263 } 2264 2265 kmem_free(qp, sizeof (rib_qp_t)); 2266 2267 /* 2268 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, 2269 * then the hca is no longer being used. 2270 */ 2271 if (conn_list != NULL) { 2272 rw_enter(&hca->state_lock, RW_READER); 2273 if (hca->state == HCA_DETACHED) { 2274 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 2275 if (hca->srv_conn_list.conn_hd == NULL) { 2276 rw_enter(&hca->cl_conn_list.conn_lock, 2277 RW_READER); 2278 if (hca->cl_conn_list.conn_hd == NULL) { 2279 mutex_enter(&hca->inuse_lock); 2280 hca->inuse = FALSE; 2281 cv_signal(&hca->cb_cv); 2282 mutex_exit(&hca->inuse_lock); 2283 } 2284 rw_exit(&hca->cl_conn_list.conn_lock); 2285 } 2286 rw_exit(&hca->srv_conn_list.conn_lock); 2287 } 2288 rw_exit(&hca->state_lock); 2289 } 2290 2291 num_clients--; 2292 return (RDMA_SUCCESS); 2293 } 2294 2295 #ifdef DYNAMIC_CREDIT_CONTROL 2296 void rib_get_resource_info(CONN *conn, int *current_clients, int *avail_bufs) 2297 { 2298 rib_qp_t *qp = ctoqp(conn); 2299 rib_hca_t *hca = qp->hca; 2300 rib_bufpool_t *rbp = NULL; 2301 bufpool_t *bp; 2302 2303 is_server = 1; 2304 rbp = hca->recv_pool; 2305 2306 if (rbp == NULL) 2307 *avail_bufs = 0; 2308 else { 2309 bp = rbp->bpool; 2310 *avail_bufs = bp->buffree; 2311 } 2312 2313 *current_clients = num_clients; 2314 } 2315 #endif 2316 2317 /* 2318 * Wait for send completion notification. Only on receiving a 2319 * notification be it a successful or error completion, free the 2320 * send_wid. 2321 */ 2322 static rdma_stat 2323 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 2324 { 2325 clock_t timout, cv_wait_ret; 2326 rdma_stat error = RDMA_SUCCESS; 2327 int i; 2328 2329 /* 2330 * Wait for send to complete 2331 */ 2332 ASSERT(wd != NULL); 2333 mutex_enter(&wd->sendwait_lock); 2334 if (wd->status == (uint_t)SEND_WAIT) { 2335 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 2336 ddi_get_lbolt();
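	/*
	 * SEND_WAIT_TIME is in seconds: drv_usectohz() converts the
	 * microsecond value to clock ticks, and adding ddi_get_lbolt()
	 * yields the absolute tick count used as the cv_timedwait()
	 * deadline (the same pattern rib_send_resp() uses below with
	 * REPLY_WAIT_TIME).
	 */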
2431
2432 static rdma_stat
2433 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2434 {
2435 	mutex_enter(&qp->replylist_lock);
2436 	if (rep != NULL) {
2437 		(void) rib_remreply(qp, rep);
2438 		mutex_exit(&qp->replylist_lock);
2439 		return (RDMA_SUCCESS);
2440 	}
2441 	mutex_exit(&qp->replylist_lock);
2442 	return (RDMA_FAILED);
2443 }
2444
2445 /*
2446  * Send buffers are freed here only in case of error in posting
2447  * on QP. If the post succeeded, the send buffers are freed upon
2448  * send completion in rib_sendwait() or in the scq_handler.
2449  */
2450 rdma_stat
2451 #if defined(ASYNC_SERVER_DEREG)
2452 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2453     int send_sig, int cv_sig, caddr_t c, caddr_t c1, int l1,
    caddr_t c2, int l2, int l3, int l4, caddr_t *swid)
2454 #else
2455 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2456     int send_sig, int cv_sig, caddr_t *swid)
2457 #endif
2458 {
2459 	struct send_wid *wdesc;
2460 	struct clist *clp;
2461 	ibt_status_t ibt_status = IBT_SUCCESS;
2462 	rdma_stat ret = RDMA_SUCCESS;
2463 	ibt_send_wr_t tx_wr;
2464 	int i, nds;
2465 	ibt_wr_ds_t sgl[DSEG_MAX];
2466 	uint_t total_msg_size;
2467 	rib_qp_t *qp = ctoqp(conn);
2468
2469 	ASSERT(cl != NULL);
2470
2471 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2472
2473 	nds = 0;
2474 	total_msg_size = 0;
2475 	clp = cl;
2476 	while (clp != NULL) {
2477 		if (nds >= DSEG_MAX) {
2478 			cmn_err(CE_WARN, "rib_send_and_wait: DSEG_MAX"
2479 			    " too small!");
2480 			return (RDMA_FAILED);
2481 		}
2482 		sgl[nds].ds_va = clp->c_saddr;
2483 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2484 		sgl[nds].ds_len = clp->c_len;
2485 		total_msg_size += clp->c_len;
2486 		clp = clp->c_next;
2487 		nds++;
2488 	}
2489
2490 	if (send_sig) {
2491 		/* Set SEND_SIGNAL flag. */
2492 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2493 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2494 		*swid = (caddr_t)wdesc;
2495 	} else {
2496 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2497 		wdesc = rib_init_sendwait(msgid, 0, qp);
2498 		*swid = (caddr_t)wdesc;
2499 	}
2500 	wdesc->nsbufs = nds;
2501 #if defined(ASYNC_SERVER_DEREG)
2502 	wdesc->c = c;
2503 	wdesc->c1 = c1;
2504 	wdesc->c2 = c2;
2505 	wdesc->l1 = l1;
2506 	wdesc->l2 = l2;
2507 	wdesc->wl = l3;
2508 	wdesc->rl = l4;
2509 #endif
2510 	for (i = 0; i < nds; i++) {
2511 		wdesc->sbufaddr[i] = sgl[i].ds_va;
2512 	}
2513
2514 	tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2515 	tx_wr.wr_opcode = IBT_WRC_SEND;
2516 	tx_wr.wr_trans = IBT_RC_SRV;
2517 	tx_wr.wr_nds = nds;
2518 	tx_wr.wr_sgl = sgl;
2519
2520 	mutex_enter(&conn->c_lock);
2521 	if (conn->c_state & C_CONNECTED) {
2522 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2523 	}
2524 	if (((conn->c_state & C_CONNECTED) == 0) ||
2525 	    ibt_status != IBT_SUCCESS) {
2526 		mutex_exit(&conn->c_lock);
2527 		for (i = 0; i < nds; i++) {
2528 			rib_rbuf_free(conn, SEND_BUFFER,
2529 			    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2546 		 * cv_wait for send to complete.
2547 		 * We can fail due to a timeout or signal or
2548 		 * unsuccessful send.
2549 		 */
2550 		ret = rib_sendwait(qp, wdesc);
2551 #ifdef DEBUG
2552 		if (rib_debug > 2)
2553 			if (ret != 0) {
2554 				cmn_err(CE_WARN, "rib_send_and_wait: "
2555 				    "rib_sendwait FAILED, rdma stat=%d, "
2556 				    "wr_id %llx, qp %p!", ret,
				    (longlong_t)tx_wr.wr_id, (void *)qp);
2557 			}
2558 #endif
2559 		return (ret);
2560 	}
2561 	}
2562
2563 	return (RDMA_SUCCESS);
2564 }
2565
2566 #if defined (CLNT_INTERRUPT_COAL)
2567 rdma_stat
2568 rib_send_bl(CONN *conn, struct clist *cl, uint32_t msgid)
2569 {
2570 	rdma_stat ret;
2571 	struct send_wid *sd, dlist;
2572 	rib_qp_t *qp = ctoqp(conn);
2573 	caddr_t wd;
2574 	mutex_enter(&conn->c_lock);
2575 	if ((conn->c_count + 1) >= (preposted_rbufs / 2)) {
2576 		conn->c_count = 0;
2577 		dlist.forw = dlist.back = &dlist;
2578 		while (qp->wd.forw != &qp->wd) {
2579 			sd = qp->wd.forw;
2580 			remque(sd);
2581 			insque(sd, &dlist);
2582 		}
2583 		mutex_exit(&conn->c_lock);
2584 		ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2585 		while (dlist.forw != &dlist) {
2586 			sd = dlist.forw;
2587 			remque(dlist.forw);
2588 			rib_scq_free((caddr_t)sd);
2589 		}
2590 	} else {
2591 		mutex_exit(&conn->c_lock);
2592 		wd = 0;
2593 		ret = rib_send_and_wait(conn, cl, msgid, 0, 0, &wd);
2594 		mutex_enter(&conn->c_lock);
2595 		conn->c_count++;
2596 		insque(wd, &qp->wd);
2597 		mutex_exit(&conn->c_lock);
2598 	}
2599 	return (ret);
2600 }
2601 #endif
2602
2603 rdma_stat
2604 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2605 {
2606 	rdma_stat ret;
	caddr_t wd;

2607 	/* send-wait & cv_signal */
2608 #if defined(ASYNC_SERVER_DEREG)
2609 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, 0, 0, 0, 0, 0,
	    0, 0, &wd);
2610 #else
2611 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2612 #endif
2613 	return (ret);
2614 }
2615
2616 #if defined(ASYNC_SERVER_DEREG)
2617 rdma_stat
2618 rib_send_nw(CONN *conn, struct clist *cl, uint32_t msgid, caddr_t c,
    caddr_t c1, int c2, caddr_t c3, int c4, int c5, int c6)
2619 {
2620 	rdma_stat ret;
2621 	caddr_t wid;
2622 	/* send-wait, no cv_signal */
2623 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, c, c1, c2, c3,
	    c4, c5, c6, &wid);
2624
2625 	return (ret);
2626 }
2627 #endif
2628 /*
2629  * Server interface (svc_rdma_ksend).
2630  * Send RPC reply and wait for RDMA_DONE.
2631  */
2632 rdma_stat
2633 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2634 {
2635 	rdma_stat ret = RDMA_SUCCESS;
2636 	struct rdma_done_list *rd;
2637 	clock_t timout, cv_wait_ret;
2638 	caddr_t wid;
2639 	rib_qp_t *qp = ctoqp(conn);
2640
2641 	mutex_enter(&qp->rdlist_lock);
2642 	rd = rdma_done_add(qp, msgid);
2643
2644 	/* No cv_signal (whether send-wait or no-send-wait) */
2645 #if defined(ASYNC_SERVER_DEREG)
2646 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, 0, 0, 0, 0, 0,
	    0, 0, &wid);
2647 #else
2648 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, &wid);
2649 #endif
2650 	if (ret != RDMA_SUCCESS) {
2651 #ifdef DEBUG
2652 		cmn_err(CE_WARN, "rib_send_resp: send_and_wait "
2653 		    "failed, msgid %u, qp %p", msgid, (void *)qp);
2654 #endif
2655 		rdma_done_rm(qp, rd);
2656 		goto done;
2657 	}
2658
2659 	/*
2660 	 * Wait for RDMA_DONE from remote end
2661 	 */
2662 	timout = drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2663 	cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock,
2664 	    timout);
2665 	rdma_done_rm(qp, rd);
2666 	if (cv_wait_ret < 0) {
2667 #ifdef DEBUG
2668 		if (rib_debug > 1) {
2669 			cmn_err(CE_WARN, "rib_send_resp: RDMA_DONE not"
2937 #ifdef DEBUG 2938 cmn_err(CE_WARN, "rib_recv: no matching reply for " 2939 "xid %u, qp %p\n", msgid, (void *)qp); 2940 #endif 2941 } 2942 2943 /* 2944 * Done. 2945 */ 2946 mutex_exit(&qp->replylist_lock); 2947 return (ret); 2948 } 2949 2950 /* 2951 * RDMA write a buffer to the remote address. 2952 */ 2953 rdma_stat 2954 rib_write(CONN *conn, struct clist *cl, int wait) 2955 { 2956 ibt_send_wr_t tx_wr; 2957 int cv_sig; 2958 ibt_wr_ds_t sgl[DSEG_MAX]; 2959 struct send_wid *wdesc; 2960 ibt_status_t ibt_status; 2961 rdma_stat ret = RDMA_SUCCESS; 2962 rib_qp_t *qp = ctoqp(conn); 2963 2964 if (cl == NULL) { 2965 cmn_err(CE_WARN, "rib_write: NULL clist\n"); 2966 return (RDMA_FAILED); 2967 } 2968 2969 2970 while ((cl != NULL)) { 2971 if(cl->c_len > 0){ 2972 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2973 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_daddr; 2974 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */ 2975 sgl[0].ds_va = cl->c_saddr; 2976 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2977 sgl[0].ds_len = cl->c_len; 2978 2979 if (wait) { 2980 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2981 cv_sig = 1; 2982 } else { 2983 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2984 cv_sig = 0; 2985 } 2986 2987 wdesc = rib_init_sendwait(0, cv_sig, qp); 2988 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2989 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2990 tx_wr.wr_trans = IBT_RC_SRV; 2991 tx_wr.wr_nds = 1; 2992 tx_wr.wr_sgl = sgl; 2993 2994 mutex_enter(&conn->c_lock); 2995 if (conn->c_state & C_CONNECTED) { 2996 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2997 } 2998 if (((conn->c_state & C_CONNECTED) == 0) || 2999 ibt_status != IBT_SUCCESS) { 3000 mutex_exit(&conn->c_lock); 3001 (void) rib_free_sendwait(wdesc); 3002 return (RDMA_FAILED); 3003 } 3004 mutex_exit(&conn->c_lock); 3005 3006 /* 3007 * Wait for send to complete 3008 */ 3009 if (wait) { 3010 ret = rib_sendwait(qp, wdesc); 3011 if (ret != 0) { 3012 return (ret); 3013 } 3014 } 3015 } 3016 cl = cl->c_next; 3017 } 3018 return (RDMA_SUCCESS); 3019 } 3020 3021 /* 3022 * RDMA Read a buffer from the remote address. 3023 */ 3024 rdma_stat 3025 rib_read(CONN *conn, struct clist *cl, int wait) 3026 { 3027 ibt_send_wr_t rx_wr; 3028 int nds; 3029 int cv_sig; 3030 ibt_wr_ds_t sgl[DSEG_MAX]; /* is 2 sufficient? */ 3031 struct send_wid *wdesc; 3032 ibt_status_t ibt_status = IBT_SUCCESS; 3033 rdma_stat ret = RDMA_SUCCESS; 3034 rib_qp_t *qp = ctoqp(conn); 3035 3036 if (cl == NULL) { 3037 cmn_err(CE_WARN, "rib_read: NULL clist\n");
3116 return (zero == 0); 3117 } 3118 3119 /* 3120 * rib_srv_cm_handler() 3121 * Connection Manager callback to handle RC connection requests. 3122 */ 3123 /* ARGSUSED */ 3124 static ibt_cm_status_t 3125 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 3126 ibt_cm_return_args_t *ret_args, void *priv_data, 3127 ibt_priv_data_len_t len) 3128 { 3129 queue_t *q; 3130 rib_qp_t *qp; 3131 rpcib_state_t *ribstat; 3132 rib_hca_t *hca; 3133 rdma_stat status = RDMA_SUCCESS; 3134 int i; 3135 struct clist cl; 3136 rdma_buf_t rdbuf = {0}; 3137 void *buf = NULL; 3138 ibt_cm_req_rcv_t cm_req_rcv; 3139 CONN *conn; 3140 ibt_status_t ibt_status; 3141 ibt_ar_t ar_query, ar_result; 3142 ib_gid_t sgid; 3143 3144 3145 ASSERT(any != NULL); 3146 ASSERT(event != NULL); 3147 3148 ribstat = (rpcib_state_t *)any; 3149 hca = (rib_hca_t *)ribstat->hca; 3150 ASSERT(hca != NULL); 3151 3152 /* got a connection request */ 3153 switch (event->cm_type) { 3154 case IBT_CM_EVENT_REQ_RCV: 3155 /* 3156 * If the plugin is in the NO_ACCEPT state, bail out.
3201 cmn_err(CE_NOTE, "\t\t Remote QPN:%u\n", 3202 cm_req_rcv.req_remote_qpn); 3203 cmn_err(CE_NOTE, "\t\t Remote Q_Key:%x\n", 3204 cm_req_rcv.req_remote_qkey); 3205 cmn_err(CE_NOTE, "\t\t Local QP %p (qp_hdl=%p)\n", 3206 (void *)qp, (void *)qp->qp_hdl); 3207 } 3208 3209 if (rib_debug > 2) { 3210 ibt_rc_chan_query_attr_t chan_attrs; 3211 3212 if (ibt_query_rc_channel(qp->qp_hdl, &chan_attrs) 3213 == IBT_SUCCESS) { 3214 cmn_err(CE_NOTE, "rib_svc_cm_handler: qp %p in " 3215 "CEP state %d\n", (void *)qp, chan_attrs.rc_state); 3216 } 3217 } 3218 #endif 3219 3220 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 3221 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 3222 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 3223 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 3224 3225 /* 3226 * Pre-posts RECV buffers 3227 */ 3228 conn = qptoc(qp); 3229 for (i = 0; i < preposted_rbufs; i++) { 3230 bzero(&rdbuf, sizeof (rdbuf)); 3231 rdbuf.type = RECV_BUFFER; 3232 buf = rib_rbuf_alloc(conn, &rdbuf); 3233 if (buf == NULL) { 3234 cmn_err(CE_WARN, "rib_svc_cm_handler: " 3235 "No RECV_BUFFER buf!\n"); 3236 (void) rib_disconnect_channel(conn, NULL); 3237 return (IBT_CM_REJECT); 3238 } 3239 3240 bzero(&cl, sizeof (cl)); 3241 cl.c_saddr = (uintptr_t)rdbuf.addr; 3242 cl.c_len = rdbuf.len;
4126 		rep->prev->next = rep->next;
4127 	}
4128 	if (rep->next) {
4129 		rep->next->prev = rep->prev;
4130 	}
4131 	if (qp->replylist == rep)
4132 		qp->replylist = rep->next;
4133
4134 	cv_destroy(&rep->wait_cv);
4135 	qp->rep_list_size--;
4136 	if (rib_debug > 1)
4137 		cmn_err(CE_NOTE, "rib_remreply: qp:%p, rep_list_size:%d\n",
4138 		    (void *)qp, qp->rep_list_size);
4139
4140 	kmem_free(rep, sizeof (*rep));
4141
4142 	return (0);
4143 }
4144
4145 rdma_stat
4146 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
4147     struct mrc *buf_handle)
4148 {
4149 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
4150 #ifdef IB_FMR_SUP
4151 	ibt_pmr_desc_t	pmr_desc;	/* vaddr, lkey, rkey */
4152 	ibt_ma_hdl_t	ma_hdl = NULL;
4153 #endif
4154 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
4155 	rdma_stat	status;
4156 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4157
4158 	/*
4159 	 * Note: ALL buffer pools use the same memory type RDMARW.
4160 	 */
4161 #ifdef IB_FMR_SUP
4162 	status = rib_reg_mem_fmr(hca, adsp, buf, buflen, 0, &mr_hdl, &ma_hdl,
4163 	    &pmr_desc);
4164 	if (status == RDMA_SUCCESS) {
4165 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
4166 		buf_handle->mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
4167 		buf_handle->mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
4168 		buf_handle->mrc_lma = (uintptr_t)ma_hdl;
4169 		goto ret_stat;
4170 	} else {
4171 		buf_handle->mrc_linfo = NULL;
4172 		buf_handle->mrc_lma = NULL;
4173 		buf_handle->mrc_lmr = 0;
4174 		buf_handle->mrc_rmr = 0;
4175 	}
4176 #endif
4177 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
4178 	if (status == RDMA_SUCCESS) {
4179 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
4180 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
4181 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
4182 	} else {
4183 		buf_handle->mrc_linfo = NULL;
4184 		buf_handle->mrc_lmr = 0;
4185 		buf_handle->mrc_rmr = 0;
4186 	}
4187 ret_stat:
4188 	return (status);
4189 }
4190
4191 #ifdef IB_FMR_SUP
4192 static rdma_stat
4193 rib_reg_mem_fmr(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
    ibt_mr_flags_t spec,
4194     ibt_mr_hdl_t *mr_hdlp, ibt_ma_hdl_t *ma_hdlp, ibt_pmr_desc_t *pmr_descp)
4195 {
4196 	ibt_va_attr_t va_attr;
4197 	ibt_phys_buf_t *paddr_list;
4198 	uint_t paddr_list_len, num_paddr;
4199 	size_t buf_sz = 0;
4200 	ibt_pmr_attr_t pmr_attr;
4201 	ib_memlen_t paddr_offset;
4202 	ibt_status_t ibt_status;
4203 	uint_t h_page_sz;

	/* FMR registration is not supported for user address spaces here. */
4204 	if (adsp)
4205 		return (RDMA_FAILED);
4206 	bzero(&va_attr, sizeof (ibt_va_attr_t));
4207 	va_attr.va_vaddr = (ib_vaddr_t)buf;
4208 	va_attr.va_len = size;
4209 	va_attr.va_as = (struct as *)(caddr_t)adsp;
4210 	va_attr.va_flags = IBT_VA_FMR | IBT_VA_SLEEP;
4211 	if (spec == IBT_MR_NONCOHERENT)
4212 		va_attr.va_flags |= IBT_VA_NONCOHERENT;
4213 	va_attr.va_phys_buf_min = va_attr.va_phys_buf_max = 0;
4214
4215 	h_page_sz = hca->hca_attrs.hca_page_sz * 1024;
4216 	paddr_list_len = (size / h_page_sz) + 2;
4217 	paddr_list = (ibt_phys_buf_t *)kmem_zalloc(sizeof (ibt_phys_buf_t) *
4218 	    paddr_list_len, KM_NOSLEEP);
	if (paddr_list == NULL)
		return (RDMA_FAILED);
4219
4220 	if (rib_debug > 0) {
4221 		cmn_err(CE_NOTE, "fmr: vaddr %p, size %d paddr_list_len %d \n",
4222 		    buf, size, paddr_list_len);
4223 	}
4224
4225 	ibt_status = ibt_map_mem_area(hca->hca_hdl, &va_attr, paddr_list_len,
4226 	    paddr_list, &num_paddr, &buf_sz, &paddr_offset, ma_hdlp);
4227 	if (ibt_status != IBT_SUCCESS) {
4228 		cmn_err(CE_WARN, "rib_reg_mem_fmr: ibt_map_mem_area failed: "
4229 		    "status %d", ibt_status);
4230 		kmem_free(paddr_list, sizeof (ibt_phys_buf_t) * paddr_list_len);
4231 		return (RDMA_FAILED);
4232 	}
4233
4234 	if (rib_debug > 0) {
4235
cmn_err(CE_NOTE,"fmr: p_laddr %p, p_size %d, buf_sz %d, p_ofset %llX\n", 4236 paddr_list[0].p_laddr, paddr_list[0].p_size, buf_sz, 4237 paddr_offset); 4238 cmn_err(CE_NOTE,"fmr: ibt_map_mem_area: ret %d, num_paddr %d, spec %d\n", 4239 ibt_status, num_paddr, spec); 4240 } 4241 4242 bzero(&pmr_attr, sizeof (ibt_pmr_attr_t)); 4243 pmr_attr.pmr_iova = (ib_vaddr_t)buf; 4244 pmr_attr.pmr_len = size; 4245 pmr_attr.pmr_num_buf = num_paddr; 4246 pmr_attr.pmr_buf_sz = buf_sz; 4247 pmr_attr.pmr_buf_list = paddr_list; 4248 pmr_attr.pmr_offset = paddr_offset; 4249 pmr_attr.pmr_flags = spec; 4250 pmr_attr.pmr_ma = *ma_hdlp; 4251 4252 ibt_status = ibt_register_physical_fmr(hca->hca_hdl, hca->fmr_pool, 4253 &pmr_attr, mr_hdlp, pmr_descp); 4254 if (ibt_status != IBT_SUCCESS) { 4255 cmn_err(CE_WARN, "rib_reg_mem_fmr: ibt_register_physical_fmr " 4256 "failed: status %d", ibt_status); 4257 (void) ibt_unmap_mem_area(hca->hca_hdl, *ma_hdlp); 4258 *ma_hdlp=NULL; 4259 kmem_free(paddr_list, sizeof (ibt_phys_buf_t) * paddr_list_len); 4260 return (RDMA_FAILED); 4261 } 4262 4263 if (rib_debug > 0) { 4264 cmn_err(CE_NOTE,"fmr: rkey: 0x%lX lkey: 0x%lX, iova: %p, fmr_hdl %p \n", 4265 pmr_descp->pmd_rkey, pmr_descp->pmd_lkey, 4266 pmr_descp->pmd_iova, *mr_hdlp); 4267 } 4268 4269 kmem_free(paddr_list, sizeof (ibt_phys_buf_t) * paddr_list_len); 4270 4271 return (RDMA_SUCCESS); 4272 4273 } 4274 4275 #endif 4276 static rdma_stat 4277 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, ibt_mr_flags_t spec, 4278 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 4279 { 4280 ibt_mr_attr_t mem_attr; 4281 ibt_status_t ibt_status; 4282 mem_attr.mr_vaddr = (uintptr_t)buf; 4283 mem_attr.mr_len = (ib_msglen_t)size; 4284 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 4285 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 4286 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 4287 IBT_MR_ENABLE_WINDOW_BIND | spec; 4288 4289 rw_enter(&hca->state_lock, RW_READER); 4290 if (hca->state == HCA_INITED) { 4291 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 4292 &mem_attr, mr_hdlp, mr_descp); 4293 rw_exit(&hca->state_lock); 4294 } else { 4295 rw_exit(&hca->state_lock); 4296 return (RDMA_FAILED); 4297 } 4298 4299 if (ibt_status != IBT_SUCCESS) { 4300 cmn_err(CE_WARN, "rib_reg_mem: ibt_register_mr " 4301 "(spec:%d) failed for addr %llX, status %d", 4302 spec, (longlong_t)mem_attr.mr_vaddr, ibt_status); 4303 return (RDMA_FAILED); 4304 } 4305 return (RDMA_SUCCESS); 4306 } 4307 4308 rdma_stat 4309 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 4310 #ifdef SERVER_REG_CACHE 4311 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 4312 #else 4313 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle) 4314 #endif 4315 { 4316 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 4317 #ifdef IB_FMR_SUP 4318 ibt_pmr_desc_t pmr_desc; /* vaddr, lkey, rkey */ 4319 ibt_ma_hdl_t ma_hdl = NULL; 4320 #endif 4321 #ifdef SERVER_REG_CACHE 4322 rib_lrc_entry_t *l; 4323 #endif 4324 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 4325 rdma_stat status; 4326 rib_hca_t *hca = (ctoqp(conn))->hca; 4327 4328 /* 4329 * Non-coherent memory registration. 
4330 	 */
4331 #ifdef SERVER_REG_CACHE
4332 	l = (rib_lrc_entry_t *)lrc;
4333 	if (l) {
4334 		if (l->registered) {
4335 			buf_handle->mrc_linfo =
			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
4336 			buf_handle->mrc_lmr = (uint32_t)l->lrc_mhandle.mrc_lmr;
4337 			buf_handle->mrc_rmr = (uint32_t)l->lrc_mhandle.mrc_rmr;
4338 #ifdef IB_FMR_SUP
4339 			buf_handle->mrc_lma = (uintptr_t)l->lrc_mhandle.mrc_lma;
4340 #endif
4341 			*sync_handle =
			    (RIB_SYNCMEM_HANDLE)l->lrc_mhandle.mrc_linfo;
4342 			return (RDMA_SUCCESS);
4343 		} else {
4344 			/* Always register the whole buffer */
4345 			buf = (caddr_t)l->lrc_buf;
4346 			buflen = l->lrc_len;
4348 		}
4349 	}
4350 #endif
4351 #ifdef IB_FMR_SUP
4352 	status = rib_reg_mem_fmr(hca, adsp, buf, buflen, IBT_MR_NONCOHERENT,
4353 	    &mr_hdl, &ma_hdl, &pmr_desc);
4354 	if (status == RDMA_SUCCESS) {
4355 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
4356 		buf_handle->mrc_lma = (uintptr_t)ma_hdl;
4357 		buf_handle->mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
4358 		buf_handle->mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
4359 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
4360 #ifdef SERVER_REG_CACHE
4361 		if (l) {
4362 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
4363 			l->lrc_mhandle.mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
4364 			l->lrc_mhandle.mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
4365 			l->registered = TRUE;
4366 			l->lrc_mhandle.mrc_lma = (uintptr_t)ma_hdl;
4367 		}
4368 #endif
4369 		goto ret_stat;
4370
4371 	} else {
4372 		if (rib_debug > 1)
4373 			cmn_err(CE_WARN, "fmr reg failed for buffer %p "
			    "of length %d\n", buf, buflen);
4374 		buf_handle->mrc_linfo = NULL;
4375 		buf_handle->mrc_lma = NULL;
4376 		buf_handle->mrc_lmr = 0;
4377 		buf_handle->mrc_rmr = 0;
4378 	}
4379 #endif
4380 	status = rib_reg_mem(hca, adsp, buf, buflen, IBT_MR_NONCOHERENT,
4381 	    &mr_hdl, &mr_desc);
4382 	if (status == RDMA_SUCCESS) {
4383 #ifdef SERVER_REG_CACHE
4384 		if (l) {
4385 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
4386 			l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey;
4387 			l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey;
4388 			l->registered = TRUE;
4389 		}
4390 #endif
4391 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
4392 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
4393 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
4394 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
4395 	} else {
4396 		buf_handle->mrc_linfo = NULL;
4397 		buf_handle->mrc_lmr = 0;
4398 		buf_handle->mrc_rmr = 0;
4399 	}
4400 ret_stat:
4401 	return (status);
4402 }
4403
4404 /* ARGSUSED */
4405 rdma_stat
4406 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
4407 {
4409 #ifdef IB_FMR_SUP
4410 	ibt_status_t ibt_status;
4411 #endif
4412 	rib_hca_t *hca = (ctoqp(conn))->hca;

4413 	/*
4414 	 * Allow memory deregistration even if HCA is
4415 	 * getting detached. Need all outstanding
4416 	 * memory registrations to be deregistered
4417 	 * before HCA_DETACH_EVENT can be accepted.
4418 */ 4419 #ifdef IB_FMR_SUP 4420 if(buf_handle.mrc_lma){ 4421 ibt_status = ibt_unmap_mem_area(hca->hca_hdl, 4422 (ibt_ma_hdl_t)buf_handle.mrc_lma); 4423 if (ibt_status != IBT_SUCCESS){ 4424 cmn_err(CE_WARN,"rib_deregistermem: ibt_unmap_mem_area: %d failed", 4425 ibt_status); 4426 return (RDMA_FAILED); 4427 } 4428 4429 ibt_status = ibt_deregister_fmr(hca->hca_hdl, 4430 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 4431 if (ibt_status != IBT_SUCCESS) 4432 return (RDMA_FAILED); 4433 return (RDMA_SUCCESS); 4434 } 4435 #endif 4436 (void) ibt_deregister_mr(hca->hca_hdl, 4437 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 4438 return (RDMA_SUCCESS); 4439 } 4440 4441 /* ARGSUSED */ 4442 rdma_stat 4443 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 4444 #ifdef SERVER_REG_CACHE 4445 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 4446 #else 4447 RIB_SYNCMEM_HANDLE sync_handle) 4448 #endif 4449 { 4450 #ifdef SERVER_REG_CACHE 4451 rib_lrc_entry_t *l; 4452 l = (rib_lrc_entry_t *)lrc; 4453 if(l) 4454 if(l->registered) 4455 return(RDMA_SUCCESS); 4456 #endif 4457 4458 4459 (void) rib_deregistermem(conn, buf, buf_handle); 4460 4461 return (RDMA_SUCCESS); 4462 } 4463 4464 /* ARGSUSED */ 4465 rdma_stat 4466 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 4467 int len, int cpu) 4468 { 4469 ibt_status_t status; 4470 rib_hca_t *hca = (ctoqp(conn))->hca; 4471 ibt_mr_sync_t mr_segment; 4472 4473 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 4474 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 4475 mr_segment.ms_len = (ib_memlen_t)len; 4476 if (cpu) { 4477 /* make incoming data visible to memory */ 4478 mr_segment.ms_flags = IBT_SYNC_WRITE;
4517 } 4518 4519 rib_bufpool_t * 4520 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 4521 { 4522 rib_bufpool_t *rbp = NULL; 4523 bufpool_t *bp = NULL; 4524 caddr_t buf; 4525 ibt_mr_attr_t mem_attr; 4526 ibt_status_t ibt_status; 4527 int i, j; 4528 4529 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 4530 4531 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 4532 num * sizeof (void *), KM_SLEEP); 4533 4534 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 4535 bp->numelems = num; 4536 4537 4538 switch (ptype) { 4539 case SEND_BUFFER: 4540 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 4541 bp->rsize = RPC_MSG_SZ; 4542 break; 4543 case RECV_BUFFER: 4544 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 4545 bp->rsize = RPC_BUF_SIZE; 4546 break; 4547 default: 4548 goto fail; 4549 } 4550 4551 /* 4552 * Register the pool. 4553 */ 4554 bp->bufsize = num * bp->rsize; 4555 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 4556 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 4557 sizeof (ibt_mr_hdl_t), KM_SLEEP); 4558 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 4559 sizeof (ibt_mr_desc_t), KM_SLEEP); 4560 rw_enter(&hca->state_lock, RW_READER); 4561 if (hca->state != HCA_INITED) { 4562 rw_exit(&hca->state_lock); 4563 cmn_err(CE_WARN,"hca->state != HCA_INITED"); 4564 goto fail; 4565 } 4566 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 4567 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 4568 mem_attr.mr_vaddr = (uintptr_t)buf; 4569 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 4570 mem_attr.mr_as = NULL; 4571 ibt_status = ibt_register_mr(hca->hca_hdl, 4572 hca->pd_hdl, &mem_attr, &rbp->mr_hdl[i], 4573 &rbp->mr_desc[i]); 4574 if (ibt_status != IBT_SUCCESS) { 4575 for (j = 0; j < i; j++) { 4576 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[j]); 4577 } 4578 rw_exit(&hca->state_lock); 4579 goto fail; 4580 } 4581 } 4582 rw_exit(&hca->state_lock); 4583 buf = (caddr_t)bp->buf; 4584 for (i = 0; i < num; i++, buf += bp->rsize) { 4585 bp->buflist[i] = (void *)buf; 4586 } 4587 bp->buffree = num - 1; /* no. of free buffers */ 4588 rbp->bpool = bp; 4589 4590 return (rbp); 4591 fail: 4592 if (bp) { 4593 if (bp->buf) 4594 kmem_free(bp->buf, bp->bufsize); 4595 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 4596 } 4597 if (rbp) { 4598 if (rbp->mr_hdl) 4599 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 4600 if (rbp->mr_desc) 4601 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 4602 kmem_free(rbp, sizeof (rib_bufpool_t));
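	/*
	 * The fail: path above releases only what was actually set up:
	 * the flat buffer, the bookkeeping arrays, and the wrapper
	 * structs; any MRs registered before the failing iteration were
	 * already deregistered in the j-loop.
	 */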
4653 break; 4654 case RECV_BUFFER: 4655 rbp = hca->recv_pool; 4656 break; 4657 default: 4658 return; 4659 } 4660 if (rbp == NULL) 4661 return; 4662 4663 bp = rbp->bpool; 4664 4665 /* 4666 * Free the pool memory. 4667 */ 4668 if (rbp->mr_hdl) 4669 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 4670 4671 if (rbp->mr_desc) 4672 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 4673 if (bp->buf) 4674 kmem_free(bp->buf, bp->bufsize); 4675 mutex_destroy(&bp->buflock); 4676 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 4677 kmem_free(rbp, sizeof (rib_bufpool_t)); 4678 } 4679 4680 void 4681 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 4682 { 4683 /* 4684 * Deregister the pool memory and free it. 4685 */ 4686 rib_rbufpool_deregister(hca, ptype); 4687 rib_rbufpool_free(hca, ptype); 4688 } 4689 4690 /* 4691 * Fetch a buffer from the pool of type specified in rdbuf->type. 4692 */
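/*
 * The pool free list is a simple LIFO: buflist[] holds every buffer
 * address and buffree indexes the last free slot, so an allocation
 * returns buflist[buffree] and decrements the index (see
 * rib_rbuf_alloc() below); freeing is expected to push the address
 * back and re-increment it.
 */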
4694 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4695 { 4696 4697 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 4698 if (rdbuf->addr) { 4699 switch (rdbuf->type) { 4700 case SEND_BUFFER: 4701 rdbuf->len = RPC_MSG_SZ; /* 1K */ 4702 break; 4703 case RECV_BUFFER: 4704 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 4705 break; 4706 default: 4707 rdbuf->len = 0; 4708 } 4709 return (RDMA_SUCCESS); 4710 } else 4711 return (RDMA_FAILED); 4712 } 4713 4714 #if defined(MEASURE_POOL_DEPTH) 4715 static void rib_recv_bufs(uint32_t x) { 4716 return; 4717 } 4718 static void rib_send_bufs(uint32_t x) { 4719 return; 4720 } 4721 #endif 4722 4723 /* 4724 * Fetch a buffer of specified type. 4725 * Note that rdbuf->handle is mw's rkey. 4726 */ 4727 static void * 4728 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4729 { 4730 rib_qp_t *qp = ctoqp(conn); 4731 rib_hca_t *hca = qp->hca; 4732 rdma_btype ptype = rdbuf->type; 4733 void *buf; 4734 rib_bufpool_t *rbp = NULL; 4735 bufpool_t *bp; 4736 int i; 4737 4738 /* 4739 * Obtain pool address based on type of pool 4740 */ 4741 switch (ptype) {
4752 return (NULL); 4753 4754 bp = rbp->bpool; 4755 4756 mutex_enter(&bp->buflock); 4757 if (bp->buffree < 0) { 4758 cmn_err(CE_WARN, "rib_rbuf_alloc: No free buffers!"); 4759 mutex_exit(&bp->buflock); 4760 return (NULL); 4761 } 4762 4763 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */ 4764 buf = bp->buflist[bp->buffree]; 4765 rdbuf->addr = buf; 4766 rdbuf->len = bp->rsize; 4767 for (i = bp->numelems - 1; i >= 0; i--) { 4768 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 4769 rdbuf->handle.mrc_rmr = (uint32_t)rbp->mr_desc[i].md_rkey; 4770 rdbuf->handle.mrc_linfo = (uintptr_t)rbp->mr_hdl[i]; 4771 rdbuf->handle.mrc_lmr = (uint32_t)rbp->mr_desc[i].md_lkey; 4772 #if defined(MEASURE_POOL_DEPTH) 4773 if(ptype == SEND_BUFFER) 4774 rib_send_bufs(MAX_BUFS - (bp->buffree+1)); 4775 if(ptype == RECV_BUFFER) 4776 rib_recv_bufs(MAX_BUFS - (bp->buffree+1)); 4777 #endif 4778 bp->buffree--; 4779 if (rib_debug > 1) 4780 cmn_err(CE_NOTE, "rib_rbuf_alloc: %d free bufs " 4781 "(type %d)\n", bp->buffree+1, ptype); 4782 4783 mutex_exit(&bp->buflock); 4784 4785 return (buf); 4786 } 4787 } 4788 cmn_err(CE_WARN, "rib_rbuf_alloc: NO matching buf %p of " 4789 "type %d found!", buf, ptype); 4790 mutex_exit(&bp->buflock); 4791 4792 return (NULL); 4793 } 4794 4795 static void 4796 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 4797 {
5609
5610 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
5611 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
5612 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
5613 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
5614 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
5615 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
5616 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
5617 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
5618
5619 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
5620 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
5621 	if (hca->srv_conn_list.conn_hd == NULL &&
5622 	    hca->cl_conn_list.conn_hd == NULL) {
5623 		/*
5624 		 * conn_lists are NULL, so destroy
5625 		 * buffers, close hca and be done.
5626 		 */
5627 		rib_rbufpool_destroy(hca, RECV_BUFFER);
5628 		rib_rbufpool_destroy(hca, SEND_BUFFER);
5629 #ifdef SERVER_REG_CACHE
5630 		rib_destroy_cache(hca);
5631 #endif
5632 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
5633 		(void) ibt_close_hca(hca->hca_hdl);
5634 		hca->hca_hdl = NULL;
5635 	}
5636 	rw_exit(&hca->cl_conn_list.conn_lock);
5637 	rw_exit(&hca->srv_conn_list.conn_lock);
5638
5639 	if (hca->hca_hdl != NULL) {
5640 		mutex_enter(&hca->inuse_lock);
5641 		while (hca->inuse)
5642 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
5643 		mutex_exit(&hca->inuse_lock);
5644 		/*
5645 		 * conn_lists are now NULL, so destroy
5646 		 * buffers, close hca and be done.
5647 		 */
5648 		rib_rbufpool_destroy(hca, RECV_BUFFER);
5649 		rib_rbufpool_destroy(hca, SEND_BUFFER);
5650 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
5651 		(void) ibt_close_hca(hca->hca_hdl);
5652 		hca->hca_hdl = NULL;
5653 	}
5654 }
5655
5656 #ifdef SERVER_REG_CACHE
5657
5658 static void
5659 rib_server_side_cache_reclaim(void *argp)
5660 {
5661 	cache_avl_struct_t *rcas;
5662 	rib_lrc_entry_t *rb;
5663 	rib_hca_t *hca = (rib_hca_t *)argp;
5664
5665 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
5666 	rcas = avl_first(&hca->avl_tree);
5667 	if (rcas != NULL)
5668 		avl_remove(&hca->avl_tree, rcas);
5669 	while (rcas != NULL) {
5670 		while (rcas->r.forw != &rcas->r) {
5671 			rcas->elements--;
5672 			rb = rcas->r.forw;
5673 			remque(rb);
5674 			(void) rib_deregistermem_via_hca(hca, rb->lrc_buf,
			    rb->lrc_mhandle);
5675 			kmem_free(rb->lrc_buf, rb->lrc_len);
5676 			kmem_free(rb, sizeof (rib_lrc_entry_t));
5677 		}
5678 		mutex_destroy(&rcas->node_lock);
5679 		kmem_cache_free(hca->server_side_cache, rcas);
5680 		rcas = avl_first(&hca->avl_tree);
5681 		if (rcas != NULL)
5682 			avl_remove(&hca->avl_tree, rcas);
5683 	}
5684 	rw_exit(&hca->avl_rw_lock);
5685 }
5686
5687 static int avl_compare(const void *t1, const void *t2) {
5688
5689 	if (rib_debug > 1)
5690 		cmn_err(CE_NOTE, "Comparing %d and %d\n",
		    ((cache_avl_struct_t *)t1)->len,
		    ((cache_avl_struct_t *)t2)->len);
5691 	if (((cache_avl_struct_t *)t1)->len ==
	    ((cache_avl_struct_t *)t2)->len)
5692 		return (0);
5694 	if (((cache_avl_struct_t *)t1)->len <
	    ((cache_avl_struct_t *)t2)->len)
5695 		return (-1);
5697 	return (1);
5699 }
5700
5701 static void rib_destroy_cache(rib_hca_t *hca) {
5705 	hca->avl_init = FALSE;
5706 	kmem_cache_destroy(hca->server_side_cache);
5707 	avl_destroy(&hca->avl_tree);
5708 	rw_destroy(&hca->avl_rw_lock);
5710 }
5711
5712 static rib_lrc_entry_t *
5713 rib_get_server_cache_buf(CONN *conn, uint32_t len)
5714 {
5715 	cache_avl_struct_t cas, *rcas;
5716 	rib_hca_t *hca = (ctoqp(conn))->hca;
5717 	rib_lrc_entry_t *reply_buf;
5718 	avl_index_t where = NULL;
5720 	if (!hca->avl_init)
static rib_lrc_entry_t *
rib_get_server_cache_buf(CONN *conn, uint32_t len)
{
	cache_avl_struct_t	cas, *rcas;
	rib_hca_t		*hca = (ctoqp(conn))->hca;
	rib_lrc_entry_t		*reply_buf;
	avl_index_t		where = 0;

	if (!hca->avl_init)
		goto error_alloc;

	cas.len = len;
	rw_enter(&hca->avl_rw_lock, RW_READER);
	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
	    &cas, &where)) == NULL) {
		rw_exit(&hca->avl_rw_lock);
		rw_enter(&hca->avl_rw_lock, RW_WRITER);
		/* Recheck to make sure no other thread added the entry in */
		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
		    &cas, &where)) == NULL) {
			/* Allocate an avl tree entry */
			if (rib_debug > 1)
				cmn_err(CE_NOTE,
				    "Allocating an avl entry for length %d\n",
				    len);
			rcas = kmem_cache_alloc(hca->server_side_cache,
			    KM_SLEEP);
			bzero(rcas, sizeof (cache_avl_struct_t));
			rcas->elements = 0;
			rcas->r.forw = &rcas->r;
			rcas->r.back = &rcas->r;
			rcas->len = len;
			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT,
			    NULL);
			avl_insert(&hca->avl_tree, rcas, where);
		}
	}

	if (rcas->elements > 0) {
		/* Reuse a buffer already cached for this length. */
		mutex_enter(&rcas->node_lock);
		reply_buf = rcas->r.forw;
		remque(reply_buf);
		rcas->elements--;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		if (rib_debug > 1)
			cmn_err(CE_NOTE,
			    "Allocating a pre-alloced buffer for length %d\n",
			    len);
	} else {
		rw_exit(&hca->avl_rw_lock);
		rib_total_buffers++;
		if (rib_debug > 1)
			cmn_err(CE_NOTE,
			    "Allocating a new buffer for length %d\n", len);
		/* Allocate a reply_buf entry */
		reply_buf = kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
		reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
		reply_buf->lrc_len = len;
		reply_buf->registered = FALSE;
		reply_buf->avl_node = (void *)rcas;
	}

	return (reply_buf);

error_alloc:
	/* Cache not initialized; hand back an uncached buffer. */
	reply_buf = kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
	reply_buf->lrc_len = len;
	reply_buf->registered = FALSE;
	reply_buf->avl_node = NULL;
	return (reply_buf);
}
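/*
 * Annotation (not in the original source): krwlock_t has no atomic
 * reader-to-writer upgrade, so on a lookup miss
 * rib_get_server_cache_buf() must drop the READER lock and reacquire
 * the lock as WRITER. Another thread can insert a node for the same
 * length inside that window, which is why avl_find() runs a second
 * time before avl_insert().
 */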
/*
 * Return a pre-registered buffer back to the cache (without
 * unregistering it).
 */
static void
rib_free_server_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
{
	cache_avl_struct_t	cas, *rcas;
	avl_index_t		where = 0;
	rib_hca_t		*hca = (ctoqp(conn))->hca;

	if (reg_buf == NULL) {
		cmn_err(CE_WARN,
		    "rib_free_server_cache_buf: got a null reg_buf\n");
		return;
	}

	if (!hca->avl_init)
		goto error_free;

	cas.len = reg_buf->lrc_len;
	rw_enter(&hca->avl_rw_lock, RW_READER);
	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
	    &cas, &where)) == NULL) {
		rw_exit(&hca->avl_rw_lock);
		goto error_free;
	} else {
		mutex_enter(&rcas->node_lock);
		insque(reg_buf, &rcas->r);
		rcas->elements++;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		if (rib_debug > 1)
			cmn_err(CE_NOTE, "Returning buffer for length %d\n",
			    reg_buf->lrc_len);
	}
	return;

error_free:
	/* No cache node to return it to; deregister and free the buffer. */
	rib_deregistermem_via_hca(hca, reg_buf->lrc_buf, reg_buf->lrc_mhandle);
	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
}

#endif	/* SERVER_REG_CACHE */
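/*
 * Illustrative usage of the cache above (a sketch, not code from the
 * original driver): a server reply path would typically borrow a
 * long-reply buffer, register it on first use, and hand it back once
 * the RDMA transfer completes:
 *
 *	rib_hca_t *hca = (ctoqp(conn))->hca;
 *	rib_lrc_entry_t *lrc = rib_get_server_cache_buf(conn, len);
 *
 *	if (!lrc->registered) {
 *		(void) rib_registermem_via_hca(hca, NULL, lrc->lrc_buf,
 *		    lrc->lrc_len, &lrc->lrc_mhandle);
 *		lrc->registered = TRUE;
 *	}
 *	... build and send the reply out of lrc->lrc_buf ...
 *	rib_free_server_cache_buf(conn, lrc);
 *
 * The registration survives in the cache between uses; only a miss on
 * the return path or the kmem reclaim callback deregisters the memory.
 */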
static rdma_stat
rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
    uint_t buflen, struct mrc *buf_handle)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
#ifdef IB_FMR_SUP
	ibt_pmr_desc_t	pmr_desc;	/* vaddr, lkey, rkey */
	ibt_ma_hdl_t	ma_hdl = NULL;
#endif
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;

	/*
	 * Note: ALL buffer pools use the same memory type RDMARW.
	 */
	/*
	 * The FMR path will not be activated on the server. We could
	 * remove the call to rib_reg_mem_fmr(), but leave it in, in case
	 * the FMR bugs get fixed. The bigger question is whether we need
	 * FMR at all when the registered buffers are coming out of a slab
	 * cache; that still needs to be evaluated.
	 */
#ifdef IB_FMR_SUP
	status = rib_reg_mem_fmr(hca, buf, adsp, buflen, 0, &mr_hdl, &ma_hdl,
	    &pmr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
		buf_handle->mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
		buf_handle->mrc_lma = (uintptr_t)ma_hdl;
		goto ret_stat;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lma = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
#endif
	/* Fall back to a regular memory region registration. */
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
#ifdef IB_FMR_SUP
ret_stat:
#endif
	return (status);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
    struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
{
	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);

	return (RDMA_SUCCESS);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
{
#ifdef IB_FMR_SUP
	ibt_status_t	ibt_status;

	if (buf_handle.mrc_lma) {
		/* FMR-mapped region: unmap, then deregister the FMR. */
		ibt_status = ibt_unmap_mem_area(hca->hca_hdl,
		    (ibt_ma_hdl_t)buf_handle.mrc_lma);
		if (ibt_status != IBT_SUCCESS) {
			cmn_err(CE_WARN, "rib_deregistermem_via_hca: "
			    "ibt_unmap_mem_area failed: %d", ibt_status);
			return (RDMA_FAILED);
		}
		ibt_status = ibt_deregister_fmr(hca->hca_hdl,
		    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
		if (ibt_status != IBT_SUCCESS) {
			cmn_err(CE_WARN, "rib_deregistermem_via_hca: "
			    "ibt_deregister_fmr failed: %d", ibt_status);
			return (RDMA_FAILED);
		}
		return (RDMA_SUCCESS);
	}
#endif

	(void) ibt_deregister_mr(hca->hca_hdl,
	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
	return (RDMA_SUCCESS);
}

#if defined(ASYNC_SERVER_DEREG) || defined(ASYNC_CLIENT_DEREG)
static int
clist_deregister1(CONN *conn, struct clist *cl, bool_t src)
{
	struct clist *c;

	for (c = cl; c; c = c->c_next) {
		if (src) {
			if (c->c_smemhandle.mrc_rmr != 0) {
				(void) RDMA_DEREGMEMSYNC(conn,
				    (caddr_t)(uintptr_t)c->c_saddr,
				    c->c_smemhandle,
#ifdef SERVER_REG_CACHE
				    (void *)(uintptr_t)c->c_ssynchandle,
				    (void *)c->long_reply_buf);
#else
				    (void *)(uintptr_t)c->c_ssynchandle);
#endif
				c->c_smemhandle.mrc_rmr = 0;
				c->c_ssynchandle = NULL;
			}
		} else {
			if (c->c_dmemhandle.mrc_rmr != 0) {
				(void) RDMA_DEREGMEMSYNC(conn,
				    (caddr_t)(uintptr_t)c->c_daddr,
				    c->c_dmemhandle,
#ifdef SERVER_REG_CACHE
				    (void *)(uintptr_t)c->c_dsynchandle,
				    (void *)c->long_reply_buf);
#else
				    (void *)(uintptr_t)c->c_dsynchandle);
#endif
				c->c_dmemhandle.mrc_rmr = 0;
				c->c_dsynchandle = NULL;
			}
		}
	}

	return (RDMA_SUCCESS);
}
#endif
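/*
 * Annotation (not in the original source): clist_deregister1() walks a
 * chunk list and tears down whichever side's registrations the caller
 * asks for: src == TRUE deregisters the source handles (c_smemhandle),
 * src == FALSE the destination handles (c_dmemhandle). Zeroing mrc_rmr
 * afterwards marks the entry as deregistered, so a second pass over
 * the same list is a no-op.
 */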
#if defined(ASYNC_CLIENT_DEREG)
/* ARGSUSED */
static void
async_dereg_thread(caddr_t arg)
{
	ASYNC *r;

	cmn_err(CE_WARN, "async_dereg_thread initiated\n");
	for (;;) {
		mutex_enter(&at_mutex);
		/* Sleep until the circular work queue is non-empty. */
		while ((rqueue.forw == rqueue.back) &&
		    (rqueue.forw == &rqueue))
			cv_wait(&at_cond, &at_mutex);
		r = rqueue.forw;
		remque(r);
		mutex_exit(&at_mutex);
		/* Process deregistration */
		clist_deregister1(&r->c_conn, &r->c_clist, FALSE);
		kmem_free(r, sizeof (ASYNC));
	}
}

void
insert_queue(CONN *conn, struct clist *rwc)
{
	ASYNC *r;

	r = kmem_zalloc(sizeof (ASYNC), KM_SLEEP);
	r->c_clist = *rwc;
	r->c_conn = *conn;
	mutex_enter(&at_mutex);
	insque(r, &rqueue);
	cv_broadcast(&at_cond);
	mutex_exit(&at_mutex);
}
#endif
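#if defined(ASYNC_CLIENT_DEREG)
/*
 * Illustrative sketch, not part of the original source: the async
 * deregistration machinery above would presumably be initialized once
 * (e.g. from attach) along these lines. at_mutex, at_cond and rqueue
 * are the globals used by async_dereg_thread(); the assumption here is
 * that rqueue is an ASYNC whose forw/back links form the empty
 * circular queue, as the insque()/remque() usage above implies.
 */
static void
async_dereg_init(void)
{
	mutex_init(&at_mutex, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&at_cond, NULL, CV_DEFAULT, NULL);
	rqueue.forw = rqueue.back = &rqueue;	/* empty circular queue */
	(void) thread_create(NULL, 0, async_dereg_thread, NULL, 0,
	    &p0, TS_RUN, minclsyspri);
}
#endif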