7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #pragma ident "@(#)rpcib.c 1.29 06/01/25 SMI"
28
29 /*
30 * The rpcib plugin. Implements the interface for RDMATF's
31 * interaction with IBTF.
32 */
33
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/user.h>
37 #include <sys/systm.h>
38 #include <sys/sysmacros.h>
39 #include <sys/proc.h>
40 #include <sys/socket.h>
41 #include <sys/file.h>
42 #include <sys/stream.h>
43 #include <sys/strsubr.h>
44 #include <sys/stropts.h>
45 #include <sys/errno.h>
46 #include <sys/kmem.h>
47 #include <sys/debug.h>
48 #include <sys/systm.h>
49 #include <sys/pathname.h>
50 #include <sys/kstat.h>
51 #include <sys/t_lock.h>
52 #include <sys/ddi.h>
53 #include <sys/cmn_err.h>
54 #include <sys/time.h>
55 #include <sys/isa_defs.h>
56 #include <sys/callb.h>
57 #include <sys/sunddi.h>
58 #include <sys/sunndi.h>
59
60 #include <sys/ib/ibtl/ibti.h>
61 #include <rpc/rpc.h>
62 #include <rpc/ib.h>
63
64 #include <sys/modctl.h>
65
66 #include <sys/pathname.h>
67 #include <sys/kstr.h>
68 #include <sys/sockio.h>
69 #include <sys/vnode.h>
70 #include <sys/tiuser.h>
71 #include <net/if.h>
72 #include <sys/cred.h>
73
74
75 extern char *inet_ntop(int, const void *, char *, int);
76
77
78 /*
79 * Prototype declarations for driver ops
80 */
81
82 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
83 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
84 void *, void **);
85 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
86
87
88 /* rpcib cb_ops */
89 static struct cb_ops rpcib_cbops = {
90 nulldev, /* open */
91 nulldev, /* close */
92 nodev, /* strategy */
93 nodev, /* print */
94 nodev, /* dump */
95 nodev, /* read */
96 nodev, /* write */
97 nodev, /* ioctl */
98 nodev, /* devmap */
99 nodev, /* mmap */
100 nodev, /* segmap */
101 nochpoll, /* poll */
102 ddi_prop_op, /* prop_op */
103 NULL, /* stream */
104 D_MP, /* cb_flag */
105 CB_REV, /* rev */
106 nodev, /* int (*cb_aread)() */
107 nodev /* int (*cb_awrite)() */
108 };
109
110 /*
111 * Device options
112 */
113 static struct dev_ops rpcib_ops = {
114 DEVO_REV, /* devo_rev, */
115 0, /* refcnt */
116 rpcib_getinfo, /* info */
117 nulldev, /* identify */
118 nulldev, /* probe */
119 rpcib_attach, /* attach */
120 rpcib_detach, /* detach */
121 nodev, /* reset */
122 &rpcib_cbops, /* driver ops - devctl interfaces */
123 NULL, /* bus operations */
124 NULL /* power */
125 };
126
127 /*
128 * Module linkage information.
129 */
130
131 static struct modldrv rib_modldrv = {
132 &mod_driverops, /* Driver module */
133 "RPCIB plugin driver, ver 1.29", /* Driver name and version */
134 &rpcib_ops, /* Driver ops */
135 };
136
137 static struct modlinkage rib_modlinkage = {
138 MODREV_1,
139 (void *)&rib_modldrv,
140 NULL
141 };
142
143 /*
144 * rib_stat: private data pointer used when registering
145 * with the IBTF. It is returned to the consumer
146 * in all callbacks.
147 */
148 static rpcib_state_t *rib_stat = NULL;
149
150 #define RNR_RETRIES 2
151 #define MAX_PORTS 2
152
153 int preposted_rbufs = 16;
154 int send_threshold = 1;
155
156 /*
157 * State of the plugin.
158 * ACCEPT = accepting new connections and requests.
159 * NO_ACCEPT = not accepting new connections and requests.
160 * This should eventually move to the rpcib_state_t structure, since it
161 * records which state the plugin is in for a particular type of service,
162 * such as NFS, NLM, or the v4 callback daemon. The plugin might be in
163 * the accept state for one and in the no_accept state for another.
164 */
165 int plugin_state;
166 kmutex_t plugin_state_lock;
167
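/*
 * A minimal sketch of how callers are expected to flip the state
 * under the lock (ACCEPT/NO_ACCEPT are the states named above; the
 * exact constant names are an assumption, as their definitions are
 * not shown here):
 *
 *	mutex_enter(&plugin_state_lock);
 *	plugin_state = ACCEPT;
 *	mutex_exit(&plugin_state_lock);
 */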
168
169 /*
170 * RPCIB RDMATF operations
171 */
172 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
173 static rdma_stat rib_disconnect(CONN *conn);
174 static void rib_listen(struct rdma_svc_data *rd);
175 static void rib_listen_stop(struct rdma_svc_data *rd);
176 static rdma_stat rib_registermem(CONN *conn, caddr_t buf, uint_t buflen,
177 struct mrc *buf_handle);
178 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
179 struct mrc buf_handle);
180 static rdma_stat rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen,
181 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle);
182 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
183 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle);
184 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
185 caddr_t buf, int len, int cpu);
186
187 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
188
189 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
190 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
191
192 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
193
194 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
195 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
196 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
197 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
198 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
199 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
200 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
201 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
202 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
203 static rdma_stat rib_conn_release(CONN *conn);
204 static rdma_stat rib_getinfo(rdma_info_t *info);
205 static rdma_stat rib_register_ats(rib_hca_t *);
206 static void rib_deregister_ats();
207 static void rib_stop_services(rib_hca_t *);
208
209 /*
210 * RPCIB addressing operations
211 */
212 char ** get_ip_addrs(int *count);
213 int get_interfaces(TIUSER *tiptr, int *num);
214 int find_addrs(TIUSER *tiptr, char **addrs, int num_ifs);
215 int get_ibd_ipaddr(rpcib_ibd_insts_t *);
216 rpcib_ats_t *get_ibd_entry(ib_gid_t *, ib_pkey_t, rpcib_ibd_insts_t *);
217 void rib_get_ibd_insts(rpcib_ibd_insts_t *);
218
219
220 /*
221 * RDMA operations the RPCIB module exports
222 */
223 static rdmaops_t rib_ops = {
224 rib_reachable,
225 rib_conn_get,
226 rib_conn_release,
227 rib_listen,
228 rib_listen_stop,
229 rib_registermem,
230 rib_deregistermem,
231 rib_registermemsync,
232 rib_deregistermemsync,
233 rib_syncmem,
234 rib_reg_buf_alloc,
235 rib_reg_buf_free,
236 rib_send,
237 rib_send_resp,
238 rib_post_resp,
239 rib_post_recv,
240 rib_recv,
241 rib_read,
242 rib_write,
243 rib_getinfo
244 };
245
246 /*
247 * RDMATF RPCIB plugin details
248 */
249 static rdma_mod_t rib_mod = {
250 "ibtf", /* api name */
251 RDMATF_VERS_1,
252 0,
253 &rib_ops, /* rdma op vector for ibtf */
254 };
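/*
 * rib_mod above wraps the rib_ops vector for the RDMATF framework,
 * which reaches every rib_* operation through it rather than calling
 * this module directly. A hedged sketch of the registration performed
 * at attach time (rdma_register_mod() is an assumed entry point, not
 * shown above):
 *
 *	if (rdma_register_mod(&rib_mod) != RDMA_SUCCESS)
 *		return (DDI_FAILURE);
 */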
255
256 static rdma_stat open_hcas(rpcib_state_t *);
257 static rdma_stat rib_qp_init(rib_qp_t *, int);
258 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
259 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
260 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
261 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
262 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
263 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
264 ibt_mr_hdl_t *, ibt_mr_desc_t *);
265 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *);
266 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
267 rib_qp_t **);
268 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
269 rib_qp_t **);
270 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
271 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
272 static int rib_free_sendwait(struct send_wid *);
273 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
274 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
275 static void rdma_done_rem_list(rib_qp_t *);
276 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
277
278 static void rib_async_handler(void *,
279 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
280 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
281 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
282 static int rib_free_svc_recv(struct svc_recv *);
283 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
284 static void rib_free_wid(struct recv_wid *);
299 };
300
301 /*
302 * Global structure
303 */
304
305 typedef struct rpcib_s {
306 dev_info_t *rpcib_dip;
307 kmutex_t rpcib_mutex;
308 } rpcib_t;
309
310 rpcib_t rpcib;
311
312 /*
313 * /etc/system controlled variable that governs
314 * debugging in the rpcib kernel module.
315 * Set it to a value greater than 1 to increase
316 * the verbosity of the debugging messages.
317 */
318 int rib_debug = 0;
319
320 static int ats_running = 0;
321 int
322 _init(void)
323 {
324 int error;
325
326 error = mod_install((struct modlinkage *)&rib_modlinkage);
327 if (error != 0) {
328 /*
329 * Could not load module
330 */
331 return (error);
332 }
333 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
334
335 return (0);
336 }
337
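/*
 * For completeness, the conventional _info(9E) companion to _init()
 * and _fini() would take the standard loadable-module form (a sketch,
 * assuming the usual boilerplate for rib_modlinkage):
 *
 *	int
 *	_info(struct modinfo *modinfop)
 *	{
 *		return (mod_info(&rib_modlinkage, modinfop));
 *	}
 */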
338 int
339 _fini()
340 {
556 " ATS service: %s",
557 to_remove->srv_name);
558 }
559 #endif
560 }
561 kmem_free(to_remove, sizeof (rib_service_t));
562 }
563 hca->ats_list = NULL;
564 rw_exit(&hca->service_list_lock);
565 }
566
567 static void rib_rbufpool_free(rib_hca_t *, int);
568 static void rib_rbufpool_deregister(rib_hca_t *, int);
569 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
570 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
571 static rdma_stat rib_rem_replylist(rib_qp_t *);
572 static int rib_remreply(rib_qp_t *, struct reply *);
573 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
574 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
575
576 /*
577 * One CQ pair per HCA
578 */
579 static rdma_stat
580 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
581 rib_cq_t **cqp, rpcib_state_t *ribstat)
582 {
583 rib_cq_t *cq;
584 ibt_cq_attr_t cq_attr;
585 uint32_t real_size;
586 ibt_status_t status;
587 rdma_stat error = RDMA_SUCCESS;
588
589 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
590 cq->rib_hca = hca;
591 cq_attr.cq_size = cq_size;
592 cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
593 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
594 &real_size);
595 if (status != IBT_SUCCESS) {
616
617 return (error);
618 fail:
619 if (cq->rib_cq_hdl)
620 (void) ibt_free_cq(cq->rib_cq_hdl);
621 if (cq)
622 kmem_free(cq, sizeof (rib_cq_t));
623 return (error);
624 }
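/*
 * Note: ibt_alloc_cq() may round the requested cq_size up and
 * reports the actual capacity in real_size, so consumers of the CQ
 * should trust real_size rather than the size they asked for.
 */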
625
626 static rdma_stat
627 open_hcas(rpcib_state_t *ribstat)
628 {
629 rib_hca_t *hca;
630 ibt_status_t ibt_status;
631 rdma_stat status;
632 ibt_hca_portinfo_t *pinfop;
633 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS;
634 uint_t size, cq_size;
635 int i;
636
637 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
638 if (ribstat->hcas == NULL)
639 ribstat->hcas = kmem_zalloc(ribstat->hca_count *
640 sizeof (rib_hca_t), KM_SLEEP);
641
642 /*
643 * Open an HCA and set it up for RDMA
644 */
645 for (i = 0; i < ribstat->hca_count; i++) {
646 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
647 ribstat->hca_guids[i],
648 &ribstat->hcas[i].hca_hdl);
649 if (ibt_status != IBT_SUCCESS) {
650 cmn_err(CE_WARN, "open_hcas: ibt_open_hca (%d) "
651 "returned %d", i, ibt_status);
652 continue;
653 }
654 ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
655 hca = &(ribstat->hcas[i]);
656 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
729 }
730
731 /*
732 * Create buffer pools.
733 * Note rib_rbufpool_create also allocates memory windows.
734 */
735 hca->recv_pool = rib_rbufpool_create(hca,
736 RECV_BUFFER, MAX_BUFS);
737 if (hca->recv_pool == NULL) {
738 cmn_err(CE_WARN, "open_hcas: recv buf pool failed\n");
739 goto fail3;
740 }
741
742 hca->send_pool = rib_rbufpool_create(hca,
743 SEND_BUFFER, MAX_BUFS);
744 if (hca->send_pool == NULL) {
745 cmn_err(CE_WARN, "open_hcas: send buf pool failed\n");
746 rib_rbufpool_destroy(hca, RECV_BUFFER);
747 goto fail3;
748 }
749
750 /*
751 * Initialize the registered service list and
752 * the lock
753 */
754 hca->service_list = NULL;
755 rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);
756
757 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
758 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
759 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
760 hca->iblock);
761 rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
762 hca->iblock);
763 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
764 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
765 hca->inuse = TRUE;
766 /*
767 * XXX One hca only. Add multi-hca functionality if needed
768 * later.
769 */
871 * Notify poster
872 */
873 cv_signal(&wd->wait_cv);
874 mutex_exit(&wd->sendwait_lock);
875 } else {
876 /*
877 * Poster not waiting for notification.
878 * Free the send buffers and send_wid
879 */
880 for (i = 0; i < wd->nsbufs; i++) {
881 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
882 (void *)(uintptr_t)wd->sbufaddr[i]);
883 }
884 mutex_exit(&wd->sendwait_lock);
885 (void) rib_free_sendwait(wd);
886 }
887 }
888 }
889 }
890
891 /* ARGSUSED */
892 static void
893 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
894 {
895 ibt_status_t ibt_status;
896 ibt_wc_t wc;
897 int i;
898
899 /*
900 * Re-enable cq notify here to avoid missing any
901 * completion queue notification.
902 */
903 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
904
905 ibt_status = IBT_SUCCESS;
906 while (ibt_status != IBT_CQ_EMPTY) {
907 bzero(&wc, sizeof (wc));
908 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
909 if (ibt_status != IBT_SUCCESS)
910 return;
911
912 /*
913 * Got a send completion
914 */
915 #ifdef DEBUG
916 if (rib_debug > 1 && wc.wc_status != IBT_WC_SUCCESS) {
917 cmn_err(CE_NOTE, "rib_svc_scq_handler: WR completed in error "
918 "wc.wc_status:%d, wc_id:%llX",
919 wc.wc_status, (longlong_t)wc.wc_id);
920 }
921 #endif
922 if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
923 struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
924
925 mutex_enter(&wd->sendwait_lock);
926 if (wd->cv_sig == 1) {
927 /*
928 * Update completion status and notify poster
929 */
930 if (wc.wc_status == IBT_WC_SUCCESS)
931 wd->status = RDMA_SUCCESS;
932 else
933 wd->status = RDMA_FAILED;
934 cv_signal(&wd->wait_cv);
935 mutex_exit(&wd->sendwait_lock);
936 } else {
937 /*
938 * Poster not waiting for notification.
939 * Free the send buffers and send_wid
940 */
941 for (i = 0; i < wd->nsbufs; i++) {
942 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
943 (void *)(uintptr_t)wd->sbufaddr[i]);
944 }
945 mutex_exit(&wd->sendwait_lock);
946 (void) rib_free_sendwait(wd);
947 }
948 }
949 }
950 }
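/*
 * All four CQ handlers share the same drain pattern: the CQ is
 * re-armed with ibt_enable_cq_notify() *before* polling, so a
 * completion that lands between the final ibt_poll_cq() and the
 * handler's return raises a fresh callback instead of being lost.
 */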
951
952 /*
953 * RCQ handler
954 */
955 /* ARGSUSED */
956 static void
957 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
958 {
959 rib_qp_t *qp;
960 ibt_status_t ibt_status;
961 ibt_wc_t wc;
962 struct recv_wid *rwid;
963
964 /*
965 * Re-enable cq notify here to avoid missing any
966 * completion queue notification.
967 */
968 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
969
970 ibt_status = IBT_SUCCESS;
971 while (ibt_status != IBT_CQ_EMPTY) {
972 bzero(&wc, sizeof (wc));
973 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
974 if (ibt_status != IBT_SUCCESS)
975 return;
976
977 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
978 qp = rwid->qp;
979 if (wc.wc_status == IBT_WC_SUCCESS) {
980 XDR inxdrs, *xdrs;
981 uint_t xid, vers, op, find_xid = 0;
982 struct reply *r;
983 CONN *conn = qptoc(qp);
984
985 xdrs = &inxdrs;
986 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
987 wc.wc_bytes_xfer, XDR_DECODE);
988 /*
989 * Treat xid as opaque (xid is the first entity
990 * in the rpc rdma message).
991 */
992 xid = *(uint32_t *)(uintptr_t)rwid->addr;
993 /* Skip xid and set the xdr position accordingly. */
994 XDR_SETPOS(xdrs, sizeof (uint32_t));
995 (void) xdr_u_int(xdrs, &vers);
996 (void) xdr_u_int(xdrs, &op);
997 XDR_DESTROY(xdrs);
998 if (vers != RPCRDMA_VERS) {
999 /*
1000 * Invalid RPC/RDMA version. Cannot interoperate.
1001 * Set connection to ERROR state and bail out.
1002 */
1003 mutex_enter(&conn->c_lock);
1004 if (conn->c_state != C_DISCONN_PEND)
1005 conn->c_state = C_ERROR;
1006 mutex_exit(&conn->c_lock);
1007 rib_rbuf_free(conn, RECV_BUFFER,
1008 (void *)(uintptr_t)rwid->addr);
1009 rib_free_wid(rwid);
1010 continue;
1011 }
1012
1013 mutex_enter(&qp->replylist_lock);
1014 for (r = qp->replylist; r != NULL; r = r->next) {
1015 if (r->xid == xid) {
1093 mblk_t *mp;
1094
1095 /*
1096 * Re-enable cq notify here to avoid missing any
1097 * completion queue notification.
1098 */
1099 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1100
1101 ibt_status = IBT_SUCCESS;
1102 while (ibt_status != IBT_CQ_EMPTY) {
1103 bzero(&wc, sizeof (wc));
1104 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1105 if (ibt_status != IBT_SUCCESS)
1106 return;
1107
1108 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1109 qp = s_recvp->qp;
1110 conn = qptoc(qp);
1111 mutex_enter(&qp->posted_rbufs_lock);
1112 qp->n_posted_rbufs--;
1113 if (qp->n_posted_rbufs == 0)
1114 cv_signal(&qp->posted_rbufs_cv);
1115 mutex_exit(&qp->posted_rbufs_lock);
1116
1117 if (wc.wc_status == IBT_WC_SUCCESS) {
1118 XDR inxdrs, *xdrs;
1119 uint_t xid, vers, op;
1120
1121 xdrs = &inxdrs;
1122 /* s_recvp->vaddr stores data */
1123 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1124 wc.wc_bytes_xfer, XDR_DECODE);
1125
1126 /*
1127 * Treat xid as opaque (xid is the first entity
1128 * in the rpc rdma message).
1129 */
1130 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1131 /* Skip xid and set the xdr position accordingly. */
1132 XDR_SETPOS(xdrs, sizeof (uint32_t));
1133 if (!xdr_u_int(xdrs, &vers) ||
1134 !xdr_u_int(xdrs, &op)) {
1135 rib_rbuf_free(conn, RECV_BUFFER,
1136 (void *)(uintptr_t)s_recvp->vaddr);
1137 XDR_DESTROY(xdrs);
1138 #ifdef DEBUG
1139 cmn_err(CE_NOTE, "rib_svc_rcq_handler: "
1140 "xdr_u_int failed for qp %p, wc_id=%llx",
1141 (void *)qp, (longlong_t)wc.wc_id);
1142 #endif
1143 (void) rib_free_svc_recv(s_recvp);
1144 continue;
1145 }
1146 XDR_DESTROY(xdrs);
1147
1148 if (vers != RPCRDMA_VERS) {
1149 /*
1150 * Invalid RPC/RDMA version. Drop rpc rdma message.
1151 */
1152 rib_rbuf_free(conn, RECV_BUFFER,
1153 (void *)(uintptr_t)s_recvp->vaddr);
1323 return (RDMA_FAILED);
1324 }
1325 } else {
1326 mutex_exit(&rib_stat->open_hca_lock);
1327 return (RDMA_SUCCESS);
1328 }
1329 } else {
1330 *handle = NULL;
1331 if (rib_debug > 2)
1332 cmn_err(CE_WARN, "rib_reachable(): ping_srv failed.\n");
1333 return (RDMA_FAILED);
1334 }
1335 }
1336
1337 /* Client side qp creation */
1338 static rdma_stat
1339 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1340 {
1341 rib_qp_t *kqp = NULL;
1342 CONN *conn;
1343
1344 ASSERT(qp != NULL);
1345 *qp = NULL;
1346
1347 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1348 conn = qptoc(kqp);
1349 kqp->hca = hca;
1350 kqp->rdmaconn.c_rdmamod = &rib_mod;
1351 kqp->rdmaconn.c_private = (caddr_t)kqp;
1352
1353 kqp->mode = RIB_CLIENT;
1354 kqp->chan_flags = IBT_BLOCKING;
1355 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1356 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1357 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1358
1359 /*
1360 * Initialize
1361 */
1362 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1363 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1364 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1365 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1366 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1367 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1368 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1369 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1370
1371 *qp = kqp;
1372 return (RDMA_SUCCESS);
1373 }
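/*
 * qptoc()/ctoqp() convert between a rib_qp_t and the CONN embedded
 * in it (the rdmaconn field initialized above), so one allocation
 * backs both views of a connection; c_private points back at the
 * enclosing rib_qp_t for use in the callbacks.
 */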
1374
1375 /* Server side qp creation */
1376 static rdma_stat
1377 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1378 {
1379 rib_qp_t *kqp = NULL;
1380 ibt_chan_sizes_t chan_sizes;
1381 ibt_rc_chan_alloc_args_t qp_attr;
1382 ibt_status_t ibt_status;
1383
1384 ASSERT(qp != NULL);
1385 *qp = NULL;
1386
1387 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1388 kqp->hca = hca;
1389 kqp->port_num = port;
1390 kqp->rdmaconn.c_rdmamod = &rib_mod;
1391 kqp->rdmaconn.c_private = (caddr_t)kqp;
1392
1393 /*
1394 * Create the qp handle
1395 */
1396 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1397 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1398 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1399 qp_attr.rc_pd = hca->pd_hdl;
1400 qp_attr.rc_hca_port_num = port;
1401 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1402 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1424 goto fail;
1425 }
1426
1427 kqp->mode = RIB_SERVER;
1428 kqp->chan_flags = IBT_BLOCKING;
1429 kqp->q = q; /* server ONLY */
1430
1431 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1432 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1433 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1434 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1435 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1436 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1437 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1438 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1439 /*
1440 * Set the private data area to qp to be used in callbacks
1441 */
1442 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1443 kqp->rdmaconn.c_state = C_CONNECTED;
1444 *qp = kqp;
1445 return (RDMA_SUCCESS);
1446 fail:
1447 if (kqp)
1448 kmem_free(kqp, sizeof (rib_qp_t));
1449
1450 return (RDMA_FAILED);
1451 }
1452
1453 void
1454 rib_dump_pathrec(ibt_path_info_t *path_rec)
1455 {
1456 ib_pkey_t pkey;
1457
1458 if (rib_debug > 1) {
1459 cmn_err(CE_NOTE, "Path Record:\n");
1460
1461 cmn_err(CE_NOTE, "Source HCA GUID = %llx\n",
1462 (longlong_t)path_rec->pi_hca_guid);
1463 cmn_err(CE_NOTE, "Dest Service ID = %llx\n",
1464 (longlong_t)path_rec->pi_sid);
1707
1708 (void) bzero(&chan_args, sizeof (chan_args));
1709 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1710
1711 qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
1712 /* Alloc a RC channel */
1713 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1714 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1715 qp_attr.rc_pd = hca->pd_hdl;
1716 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1717 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1718 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1719 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1720 qp_attr.rc_clone_chan = NULL;
1721 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1722 qp_attr.rc_flags = IBT_WR_SIGNALED;
1723
1724 chan_args.oc_path = path;
1725 chan_args.oc_cm_handler = rib_clnt_cm_handler;
1726 chan_args.oc_cm_clnt_private = (void *)rib_stat;
1727 chan_args.oc_rdma_ra_out = 1;
1728 chan_args.oc_rdma_ra_in = 1;
1729 chan_args.oc_path_retry_cnt = 2;
1730 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1731
1732 refresh:
1733 rw_enter(&hca->state_lock, RW_READER);
1734 if (hca->state != HCA_DETACHED) {
1735 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1736 IBT_ACHAN_NO_FLAGS, &qp_attr, &qp->qp_hdl,
1737 &chan_sizes);
1738 } else {
1739 rw_exit(&hca->state_lock);
1740 return (RDMA_FAILED);
1741 }
1742 rw_exit(&hca->state_lock);
1743
1744 if (ibt_status != IBT_SUCCESS) {
1745 #ifdef DEBUG
1746 cmn_err(CE_WARN, "rib_conn_to_srv: alloc_rc_channel "
1747 "failed, ibt_status=%d.", ibt_status);
1748 #endif
1885 (void) rib_rem_replylist(qp);
1886 }
1887
1888 cv_destroy(&qp->cb_conn_cv);
1889 cv_destroy(&qp->posted_rbufs_cv);
1890 mutex_destroy(&qp->cb_lock);
1891
1892 mutex_destroy(&qp->replylist_lock);
1893 mutex_destroy(&qp->posted_rbufs_lock);
1894 mutex_destroy(&qp->rdlist_lock);
1895
1896 cv_destroy(&conn->c_cv);
1897 mutex_destroy(&conn->c_lock);
1898
1899 if (conn->c_raddr.buf != NULL) {
1900 kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
1901 }
1902 if (conn->c_laddr.buf != NULL) {
1903 kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
1904 }
1905 kmem_free(qp, sizeof (rib_qp_t));
1906
1907 /*
1908 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
1909 * then the hca is no longer being used.
1910 */
1911 if (conn_list != NULL) {
1912 rw_enter(&hca->state_lock, RW_READER);
1913 if (hca->state == HCA_DETACHED) {
1914 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
1915 if (hca->srv_conn_list.conn_hd == NULL) {
1916 rw_enter(&hca->cl_conn_list.conn_lock,
1917 RW_READER);
1918 if (hca->cl_conn_list.conn_hd == NULL) {
1919 mutex_enter(&hca->inuse_lock);
1920 hca->inuse = FALSE;
1921 cv_signal(&hca->cb_cv);
1922 mutex_exit(&hca->inuse_lock);
1923 }
1924 rw_exit(&hca->cl_conn_list.conn_lock);
1925 }
1926 rw_exit(&hca->srv_conn_list.conn_lock);
1927 }
1928 rw_exit(&hca->state_lock);
1929 }
1930 return (RDMA_SUCCESS);
1931 }
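/*
 * The nested lock checks above are the HCA teardown handshake:
 * once the HCA is DETACHED and both connection lists have drained,
 * inuse is cleared and cb_cv is signalled so that the detach path at
 * the end of this listing, blocked in cv_wait() on cb_cv, can
 * destroy the buffer pools and close the HCA.
 */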
1932
1933 /*
1934 * Wait for a send completion notification. The send_wid is freed
1935 * only after a notification is received, whether it reports a
1936 * successful or an error completion.
1937 */
1938 static rdma_stat
1939 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
1940 {
1941 clock_t timout, cv_wait_ret;
1942 rdma_stat error = RDMA_SUCCESS;
1943 int i;
1944
1945 /*
1946 * Wait for send to complete
1947 */
1948 ASSERT(wd != NULL);
1949 mutex_enter(&wd->sendwait_lock);
1950 if (wd->status == (uint_t)SEND_WAIT) {
1951 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
1952 ddi_get_lbolt();
2047
2048 static rdma_stat
2049 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2050 {
2051 mutex_enter(&qp->replylist_lock);
2052 if (rep != NULL) {
2053 (void) rib_remreply(qp, rep);
2054 mutex_exit(&qp->replylist_lock);
2055 return (RDMA_SUCCESS);
2056 }
2057 mutex_exit(&qp->replylist_lock);
2058 return (RDMA_FAILED);
2059 }
2060
2061 /*
2062 * Send buffers are freed here only if posting to the QP fails.
2063 * If the post succeeds, the send buffers are freed upon send
2064 * completion in rib_sendwait() or in the scq_handler.
2065 */
2066 rdma_stat
2067 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2068 int send_sig, int cv_sig)
2069 {
2070 struct send_wid *wdesc;
2071 struct clist *clp;
2072 ibt_status_t ibt_status = IBT_SUCCESS;
2073 rdma_stat ret = RDMA_SUCCESS;
2074 ibt_send_wr_t tx_wr;
2075 int i, nds;
2076 ibt_wr_ds_t sgl[DSEG_MAX];
2077 uint_t total_msg_size;
2078 rib_qp_t *qp = ctoqp(conn);
2079
2080 ASSERT(cl != NULL);
2081
2082 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2083
2084 nds = 0;
2085 total_msg_size = 0;
2086 clp = cl;
2087 while (clp != NULL) {
2088 if (nds >= DSEG_MAX) {
2089 cmn_err(CE_WARN, "rib_send_and_wait: DSEG_MAX"
2090 " too small!");
2091 return (RDMA_FAILED);
2092 }
2093 sgl[nds].ds_va = clp->c_saddr;
2094 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2095 sgl[nds].ds_len = clp->c_len;
2096 total_msg_size += clp->c_len;
2097 clp = clp->c_next;
2098 nds++;
2099 }
2100
2101 if (send_sig) {
2102 /* Set SEND_SIGNAL flag. */
2103 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2104 wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2105 } else {
2106 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2107 wdesc = rib_init_sendwait(msgid, 0, qp);
2108 }
2109 wdesc->nsbufs = nds;
2110 for (i = 0; i < nds; i++) {
2111 wdesc->sbufaddr[i] = sgl[i].ds_va;
2112 }
2113
2114 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2115 tx_wr.wr_opcode = IBT_WRC_SEND;
2116 tx_wr.wr_trans = IBT_RC_SRV;
2117 tx_wr.wr_nds = nds;
2118 tx_wr.wr_sgl = sgl;
2119
2120 mutex_enter(&conn->c_lock);
2121 if (conn->c_state & C_CONNECTED) {
2122 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2123 }
2124 if (((conn->c_state & C_CONNECTED) == 0) ||
2125 ibt_status != IBT_SUCCESS) {
2126 mutex_exit(&conn->c_lock);
2127 for (i = 0; i < nds; i++) {
2128 rib_rbuf_free(conn, SEND_BUFFER,
2129 (void *)(uintptr_t)wdesc->sbufaddr[i]);
2146 * cv_wait for send to complete.
2147 * We can fail due to a timeout or signal or
2148 * unsuccessful send.
2149 */
2150 ret = rib_sendwait(qp, wdesc);
2151 #ifdef DEBUG
2152 if (rib_debug > 2)
2153 if (ret != 0) {
2154 cmn_err(CE_WARN, "rib_send_and_wait: rib_sendwait "
2155 "FAILED, rdma stat=%d, wr_id %llx, qp %p!",
2156 ret, (longlong_t)tx_wr.wr_id, (void *)qp);
2157 }
2158 #endif
2159 return (ret);
2160 }
2161 }
2162
2163 return (RDMA_SUCCESS);
2164 }
2165
2166 rdma_stat
2167 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2168 {
2169 rdma_stat ret;
2170
2171 /* send-wait & cv_signal */
2172 ret = rib_send_and_wait(conn, cl, msgid, 1, 1);
2173
2174 return (ret);
2175 }
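/*
 * Callers use two flag combinations with rib_send_and_wait():
 * rib_send() above passes (send_sig=1, cv_sig=1), a signaled send
 * with the poster blocking on wait_cv; rib_send_resp() below passes
 * (send_sig=1, cv_sig=0), a signaled send whose completion is reaped
 * in the scq handler (which also frees the send buffers) while the
 * poster instead waits for the peer's RDMA_DONE.
 */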
2176
2177 /*
2178 * Server interface (svc_rdma_ksend).
2179 * Send RPC reply and wait for RDMA_DONE.
2180 */
2181 rdma_stat
2182 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2183 {
2184 rdma_stat ret = RDMA_SUCCESS;
2185 struct rdma_done_list *rd;
2186 clock_t timout, cv_wait_ret;
2187 rib_qp_t *qp = ctoqp(conn);
2188
2189 mutex_enter(&qp->rdlist_lock);
2190 rd = rdma_done_add(qp, msgid);
2191
2192 /* No cv_signal (whether send-wait or no-send-wait) */
2193 ret = rib_send_and_wait(conn, cl, msgid, 1, 0);
2194 if (ret != RDMA_SUCCESS) {
2195 #ifdef DEBUG
2196 cmn_err(CE_WARN, "rib_send_resp: send_and_wait "
2197 "failed, msgid %u, qp %p", msgid, (void *)qp);
2198 #endif
2199 rdma_done_rm(qp, rd);
2200 goto done;
2201 }
2202
2203 /*
2204 * Wait for RDMA_DONE from remote end
2205 */
2206 timout = drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2207 cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock,
2208 timout);
2209 rdma_done_rm(qp, rd);
2210 if (cv_wait_ret < 0) {
2211 #ifdef DEBUG
2212 if (rib_debug > 1) {
2213 cmn_err(CE_WARN, "rib_send_resp: RDMA_DONE not"
2481 #ifdef DEBUG
2482 cmn_err(CE_WARN, "rib_recv: no matching reply for "
2483 "xid %u, qp %p\n", msgid, (void *)qp);
2484 #endif
2485 }
2486
2487 /*
2488 * Done.
2489 */
2490 mutex_exit(&qp->replylist_lock);
2491 return (ret);
2492 }
2493
2494 /*
2495 * RDMA write a buffer to the remote address.
2496 */
2497 rdma_stat
2498 rib_write(CONN *conn, struct clist *cl, int wait)
2499 {
2500 ibt_send_wr_t tx_wr;
2501 int nds;
2502 int cv_sig;
2503 ibt_wr_ds_t sgl[DSEG_MAX];
2504 struct send_wid *wdesc;
2505 ibt_status_t ibt_status;
2506 rdma_stat ret = RDMA_SUCCESS;
2507 rib_qp_t *qp = ctoqp(conn);
2508
2509 if (cl == NULL) {
2510 cmn_err(CE_WARN, "rib_write: NULL clist\n");
2511 return (RDMA_FAILED);
2512 }
2513
2514 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2515 /*
2516 * Remote address is at the head chunk item in list.
2517 */
2518 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_daddr;
2519 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */
2520
2521 nds = 0;
2522 while (cl != NULL) {
2523 if (nds >= DSEG_MAX) {
2524 cmn_err(CE_WARN, "rib_write: DSEG_MAX too small!");
2525 return (RDMA_FAILED);
2526 }
2527 sgl[nds].ds_va = cl->c_saddr;
2528 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2529 sgl[nds].ds_len = cl->c_len;
2530 cl = cl->c_next;
2531 nds++;
2532 }
2533
2534 if (wait) {
2535 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2536 cv_sig = 1;
2537 } else {
2538 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2539 cv_sig = 0;
2540 }
2541
2542 wdesc = rib_init_sendwait(0, cv_sig, qp);
2543 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2544 tx_wr.wr_opcode = IBT_WRC_RDMAW;
2545 tx_wr.wr_trans = IBT_RC_SRV;
2546 tx_wr.wr_nds = nds;
2547 tx_wr.wr_sgl = sgl;
2548
2549 mutex_enter(&conn->c_lock);
2550 if (conn->c_state & C_CONNECTED) {
2551 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2552 }
2553 if (((conn->c_state & C_CONNECTED) == 0) ||
2554 ibt_status != IBT_SUCCESS) {
2555 mutex_exit(&conn->c_lock);
2556 (void) rib_free_sendwait(wdesc);
2557 return (RDMA_FAILED);
2558 }
2559 mutex_exit(&conn->c_lock);
2560
2561 /*
2562 * Wait for send to complete
2563 */
2564 if (wait) {
2565 ret = rib_sendwait(qp, wdesc);
2566 if (ret != 0) {
2567 return (ret);
2568 }
2569 }
2570 return (RDMA_SUCCESS);
2571 }
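/*
 * rib_read() below mirrors this routine: the SGL construction and
 * wait logic are the same, with the work request presumably built
 * around IBT_WRC_RDMAR (the IBTF RDMA-read opcode) in place of
 * IBT_WRC_RDMAW.
 */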
2572
2573 /*
2574 * RDMA Read a buffer from the remote address.
2575 */
2576 rdma_stat
2577 rib_read(CONN *conn, struct clist *cl, int wait)
2578 {
2579 ibt_send_wr_t rx_wr;
2580 int nds;
2581 int cv_sig;
2582 ibt_wr_ds_t sgl[DSEG_MAX]; /* is 2 sufficient? */
2583 struct send_wid *wdesc;
2584 ibt_status_t ibt_status = IBT_SUCCESS;
2585 rdma_stat ret = RDMA_SUCCESS;
2586 rib_qp_t *qp = ctoqp(conn);
2587
2588 if (cl == NULL) {
2589 cmn_err(CE_WARN, "rib_read: NULL clist\n");
2668 return (zero == 0);
2669 }
2670
2671 /*
2672 * rib_srv_cm_handler()
2673 * Connection Manager callback to handle RC connection requests.
2674 */
2675 /* ARGSUSED */
2676 static ibt_cm_status_t
2677 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2678 ibt_cm_return_args_t *ret_args, void *priv_data,
2679 ibt_priv_data_len_t len)
2680 {
2681 queue_t *q;
2682 rib_qp_t *qp;
2683 rpcib_state_t *ribstat;
2684 rib_hca_t *hca;
2685 rdma_stat status = RDMA_SUCCESS;
2686 int i;
2687 struct clist cl;
2688 rdma_buf_t rdbuf;
2689 void *buf = NULL;
2690 ibt_cm_req_rcv_t cm_req_rcv;
2691 CONN *conn;
2692 ibt_status_t ibt_status;
2693 ibt_ar_t ar_query, ar_result;
2694 ib_gid_t sgid;
2695
2696
2697 ASSERT(any != NULL);
2698 ASSERT(event != NULL);
2699
2700 ribstat = (rpcib_state_t *)any;
2701 hca = (rib_hca_t *)ribstat->hca;
2702 ASSERT(hca != NULL);
2703
2704 /* got a connection request */
2705 switch (event->cm_type) {
2706 case IBT_CM_EVENT_REQ_RCV:
2707 /*
2708 * If the plugin is in the NO_ACCEPT state, bail out.
2753 cmn_err(CE_NOTE, "\t\t Remote QPN:%u\n",
2754 cm_req_rcv.req_remote_qpn);
2755 cmn_err(CE_NOTE, "\t\t Remote Q_Key:%x\n",
2756 cm_req_rcv.req_remote_qkey);
2757 cmn_err(CE_NOTE, "\t\t Local QP %p (qp_hdl=%p)\n",
2758 (void *)qp, (void *)qp->qp_hdl);
2759 }
2760
2761 if (rib_debug > 2) {
2762 ibt_rc_chan_query_attr_t chan_attrs;
2763
2764 if (ibt_query_rc_channel(qp->qp_hdl, &chan_attrs)
2765 == IBT_SUCCESS) {
2766 cmn_err(CE_NOTE, "rib_srv_cm_handler: qp %p in "
2767 "CEP state %d\n", (void *)qp, chan_attrs.rc_state);
2768 }
2769 }
2770 #endif
2771
2772 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2773 ret_args->cm_ret.rep.cm_rdma_ra_out = 1;
2774 ret_args->cm_ret.rep.cm_rdma_ra_in = 1;
2775 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2776
2777 /*
2778 * Pre-post RECV buffers
2779 */
2780 conn = qptoc(qp);
2781 for (i = 0; i < preposted_rbufs; i++) {
2782 bzero(&rdbuf, sizeof (rdbuf));
2783 rdbuf.type = RECV_BUFFER;
2784 buf = rib_rbuf_alloc(conn, &rdbuf);
2785 if (buf == NULL) {
2786 cmn_err(CE_WARN, "rib_srv_cm_handler: "
2787 "No RECV_BUFFER buf!\n");
2788 (void) rib_disconnect_channel(conn, NULL);
2789 return (IBT_CM_REJECT);
2790 }
2791
2792 bzero(&cl, sizeof (cl));
2793 cl.c_saddr = (uintptr_t)rdbuf.addr;
2794 cl.c_len = rdbuf.len;
3678 rep->prev->next = rep->next;
3679 }
3680 if (rep->next) {
3681 rep->next->prev = rep->prev;
3682 }
3683 if (qp->replylist == rep)
3684 qp->replylist = rep->next;
3685
3686 cv_destroy(&rep->wait_cv);
3687 qp->rep_list_size--;
3688 if (rib_debug > 1)
3689 cmn_err(CE_NOTE, "rib_remreply: qp:%p, rep_list_size:%d\n",
3690 (void *)qp, qp->rep_list_size);
3691
3692 kmem_free(rep, sizeof (*rep));
3693
3694 return (0);
3695 }
3696
3697 rdma_stat
3698 rib_registermem(CONN *conn, caddr_t buf, uint_t buflen,
3699 struct mrc *buf_handle)
3700 {
3701 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
3702 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
3703 rdma_stat status;
3704 rib_hca_t *hca = (ctoqp(conn))->hca;
3705
3706 /*
3707 * Note: ALL buffer pools use the same memory type RDMARW.
3708 */
3709 status = rib_reg_mem(hca, buf, buflen, 0, &mr_hdl, &mr_desc);
3710 if (status == RDMA_SUCCESS) {
3711 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3712 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3713 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3714 } else {
3715 buf_handle->mrc_linfo = NULL;
3716 buf_handle->mrc_lmr = 0;
3717 buf_handle->mrc_rmr = 0;
3718 }
3719 return (status);
3720 }
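/*
 * The mrc handle returned above packs the three values a transfer
 * needs: mrc_linfo keeps the local ibt_mr_hdl_t for later
 * deregistration, mrc_lmr carries the lkey placed in local SGL
 * entries, and mrc_rmr carries the rkey the peer uses in RDMA
 * requests.
 */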
3721
3722 static rdma_stat
3723 rib_reg_mem(rib_hca_t *hca, caddr_t buf, uint_t size, ibt_mr_flags_t spec,
3724 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3725 {
3726 ibt_mr_attr_t mem_attr;
3727 ibt_status_t ibt_status;
3728
3729 mem_attr.mr_vaddr = (uintptr_t)buf;
3730 mem_attr.mr_len = (ib_msglen_t)size;
3731 mem_attr.mr_as = NULL;
3732 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3733 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3734 IBT_MR_ENABLE_WINDOW_BIND | spec;
3735
3736 rw_enter(&hca->state_lock, RW_READER);
3737 if (hca->state == HCA_INITED) {
3738 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3739 &mem_attr, mr_hdlp, mr_descp);
3740 rw_exit(&hca->state_lock);
3741 } else {
3742 rw_exit(&hca->state_lock);
3743 return (RDMA_FAILED);
3744 }
3745
3746 if (ibt_status != IBT_SUCCESS) {
3747 cmn_err(CE_WARN, "rib_reg_mem: ibt_register_mr "
3748 "(spec:%d) failed for addr %llX, status %d",
3749 spec, (longlong_t)mem_attr.mr_vaddr, ibt_status);
3750 return (RDMA_FAILED);
3751 }
3752 return (RDMA_SUCCESS);
3753 }
3754
3755 rdma_stat
3756 rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen,
3757 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle)
3758 {
3759 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
3760 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
3761 rdma_stat status;
3762 rib_hca_t *hca = (ctoqp(conn))->hca;
3763
3764 /*
3765 * Non-coherent memory registration.
3766 */
3767 status = rib_reg_mem(hca, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl,
3768 &mr_desc);
3769 if (status == RDMA_SUCCESS) {
3770 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3771 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3772 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3773 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3774 } else {
3775 buf_handle->mrc_linfo = NULL;
3776 buf_handle->mrc_lmr = 0;
3777 buf_handle->mrc_rmr = 0;
3778 }
3779 return (status);
3780 }
3781
3782 /* ARGSUSED */
3783 rdma_stat
3784 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3785 {
3786 rib_hca_t *hca = (ctoqp(conn))->hca;
3787
3788 /*
3789 * Allow memory deregistration even if HCA is
3790 * getting detached. Need all outstanding
3791 * memory registrations to be deregistered
3792 * before HCA_DETACH_EVENT can be accepted.
3793 */
3794 (void) ibt_deregister_mr(hca->hca_hdl,
3795 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3796 return (RDMA_SUCCESS);
3797 }
3798
3799 /* ARGSUSED */
3800 rdma_stat
3801 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3802 RIB_SYNCMEM_HANDLE sync_handle)
3803 {
3804 (void) rib_deregistermem(conn, buf, buf_handle);
3805
3806 return (RDMA_SUCCESS);
3807 }
3808
3809 /* ARGSUSED */
3810 rdma_stat
3811 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3812 int len, int cpu)
3813 {
3814 ibt_status_t status;
3815 rib_hca_t *hca = (ctoqp(conn))->hca;
3816 ibt_mr_sync_t mr_segment;
3817
3818 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3819 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3820 mr_segment.ms_len = (ib_memlen_t)len;
3821 if (cpu) {
3822 /* make incoming data visible to memory */
3823 mr_segment.ms_flags = IBT_SYNC_WRITE;
3862 }
3863
3864 rib_bufpool_t *
3865 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3866 {
3867 rib_bufpool_t *rbp = NULL;
3868 bufpool_t *bp = NULL;
3869 caddr_t buf;
3870 ibt_mr_attr_t mem_attr;
3871 ibt_status_t ibt_status;
3872 int i, j;
3873
3874 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3875
3876 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3877 num * sizeof (void *), KM_SLEEP);
3878
3879 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3880 bp->numelems = num;
3881
3882 switch (ptype) {
3883 case SEND_BUFFER:
3884 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3885 /* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */
3886 bp->rsize = RPC_MSG_SZ;
3887 break;
3888 case RECV_BUFFER:
3889 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3890 /* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */
3891 bp->rsize = RPC_BUF_SIZE;
3892 break;
3893 default:
3894 goto fail;
3895 }
3896
3897 /*
3898 * Register the pool.
3899 */
3900 bp->bufsize = num * bp->rsize;
3901 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3902 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3903 sizeof (ibt_mr_hdl_t), KM_SLEEP);
3904 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3905 sizeof (ibt_mr_desc_t), KM_SLEEP);
3906
3907 rw_enter(&hca->state_lock, RW_READER);
3908 if (hca->state != HCA_INITED) {
3909 rw_exit(&hca->state_lock);
3910 goto fail;
3911 }
3912 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3913 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3914 mem_attr.mr_vaddr = (uintptr_t)buf;
3915 mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3916 mem_attr.mr_as = NULL;
3917 ibt_status = ibt_register_mr(hca->hca_hdl,
3918 hca->pd_hdl, &mem_attr, &rbp->mr_hdl[i],
3919 &rbp->mr_desc[i]);
3920 if (ibt_status != IBT_SUCCESS) {
3921 for (j = 0; j < i; j++) {
3922 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[j]);
3923 }
3924 rw_exit(&hca->state_lock);
3925 goto fail;
3926 }
3927 }
3928 rw_exit(&hca->state_lock);
3929
3930 buf = (caddr_t)bp->buf;
3931 for (i = 0; i < num; i++, buf += bp->rsize) {
3932 bp->buflist[i] = (void *)buf;
3933 }
3934 bp->buffree = num - 1; /* index of the last free buffer */
3935 rbp->bpool = bp;
3936
3937 return (rbp);
3938 fail:
3939 if (bp) {
3940 if (bp->buf)
3941 kmem_free(bp->buf, bp->bufsize);
3942 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3943 }
3944 if (rbp) {
3945 if (rbp->mr_hdl)
3946 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3947 if (rbp->mr_desc)
3948 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3949 kmem_free(rbp, sizeof (rib_bufpool_t));
4000 break;
4001 case RECV_BUFFER:
4002 rbp = hca->recv_pool;
4003 break;
4004 default:
4005 return;
4006 }
4007 if (rbp == NULL)
4008 return;
4009
4010 bp = rbp->bpool;
4011
4012 /*
4013 * Free the pool memory.
4014 */
4015 if (rbp->mr_hdl)
4016 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
4017
4018 if (rbp->mr_desc)
4019 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4020
4021 if (bp->buf)
4022 kmem_free(bp->buf, bp->bufsize);
4023 mutex_destroy(&bp->buflock);
4024 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4025 kmem_free(rbp, sizeof (rib_bufpool_t));
4026 }
4027
4028 void
4029 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4030 {
4031 /*
4032 * Deregister the pool memory and free it.
4033 */
4034 rib_rbufpool_deregister(hca, ptype);
4035 rib_rbufpool_free(hca, ptype);
4036 }
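/*
 * Pool layout recap: bp->buf is one contiguous slab of
 * numelems * rsize bytes, buflist[] addresses each rsize-sized
 * slot, and mr_hdl[]/mr_desc[] hold the per-slot registrations.
 * buffree indexes the last free slot, so the pool is exhausted
 * once buffree drops below zero.
 */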
4037
4038 /*
4039 * Fetch a buffer from the pool of type specified in rdbuf->type.
4040 */
4041 rdma_stat
4042 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4043 {
4044
4045 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4046 if (rdbuf->addr) {
4047 switch (rdbuf->type) {
4048 case SEND_BUFFER:
4049 rdbuf->len = RPC_MSG_SZ; /* 1K */
4050 break;
4051 case RECV_BUFFER:
4052 rdbuf->len = RPC_BUF_SIZE; /* 2K */
4053 break;
4054 default:
4055 rdbuf->len = 0;
4056 }
4057 return (RDMA_SUCCESS);
4058 } else
4059 return (RDMA_FAILED);
4060 }
4061
4062
4063 /*
4064 * Fetch a buffer of specified type.
4065 * Note that rdbuf->handle is mw's rkey.
4066 */
4067 static void *
4068 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4069 {
4070 rib_qp_t *qp = ctoqp(conn);
4071 rib_hca_t *hca = qp->hca;
4072 rdma_btype ptype = rdbuf->type;
4073 void *buf;
4074 rib_bufpool_t *rbp = NULL;
4075 bufpool_t *bp;
4076 int i;
4077
4078 /*
4079 * Obtain pool address based on type of pool
4080 */
4081 switch (ptype) {
4092 return (NULL);
4093
4094 bp = rbp->bpool;
4095
4096 mutex_enter(&bp->buflock);
4097 if (bp->buffree < 0) {
4098 cmn_err(CE_WARN, "rib_rbuf_alloc: No free buffers!");
4099 mutex_exit(&bp->buflock);
4100 return (NULL);
4101 }
4102
4103 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4104 buf = bp->buflist[bp->buffree];
4105 rdbuf->addr = buf;
4106 rdbuf->len = bp->rsize;
4107 for (i = bp->numelems - 1; i >= 0; i--) {
4108 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4109 rdbuf->handle.mrc_rmr = (uint32_t)rbp->mr_desc[i].md_rkey;
4110 rdbuf->handle.mrc_linfo = (uintptr_t)rbp->mr_hdl[i];
4111 rdbuf->handle.mrc_lmr = (uint32_t)rbp->mr_desc[i].md_lkey;
4112 bp->buffree--;
4113 if (rib_debug > 1)
4114 cmn_err(CE_NOTE, "rib_rbuf_alloc: %d free bufs "
4115 "(type %d)\n", bp->buffree+1, ptype);
4116
4117 mutex_exit(&bp->buflock);
4118
4119 return (buf);
4120 }
4121 }
4122 cmn_err(CE_WARN, "rib_rbuf_alloc: NO matching buf %p of "
4123 "type %d found!", buf, ptype);
4124 mutex_exit(&bp->buflock);
4125
4126 return (NULL);
4127 }
4128
4129 static void
4130 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4131 {
4943
4944 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4945 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4946 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4947 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4948 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4949 kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4950 kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4951 kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4952
4953 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4954 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4955 if (hca->srv_conn_list.conn_hd == NULL &&
4956 hca->cl_conn_list.conn_hd == NULL) {
4957 /*
4958 * conn_lists are NULL, so destroy
4959 * buffers, close hca and be done.
4960 */
4961 rib_rbufpool_destroy(hca, RECV_BUFFER);
4962 rib_rbufpool_destroy(hca, SEND_BUFFER);
4963 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4964 (void) ibt_close_hca(hca->hca_hdl);
4965 hca->hca_hdl = NULL;
4966 }
4967 rw_exit(&hca->cl_conn_list.conn_lock);
4968 rw_exit(&hca->srv_conn_list.conn_lock);
4969
4970 if (hca->hca_hdl != NULL) {
4971 mutex_enter(&hca->inuse_lock);
4972 while (hca->inuse)
4973 cv_wait(&hca->cb_cv, &hca->inuse_lock);
4974 mutex_exit(&hca->inuse_lock);
4975 /*
4976 * conn_lists are now NULL, so destroy
4977 * buffers, close hca and be done.
4978 */
4979 rib_rbufpool_destroy(hca, RECV_BUFFER);
4980 rib_rbufpool_destroy(hca, SEND_BUFFER);
4981 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4982 (void) ibt_close_hca(hca->hca_hdl);
4983 hca->hca_hdl = NULL;
4984 }
4985 }
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27
28 /* Copyright (c) 2006, The Ohio State University. All rights reserved.
29 *
30 * Portions of this source code are developed by the team members of
31 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
32 * headed by Professor Dhabaleswar K. (DK) Panda.
33 *
34 * Acknowledgements to contributions from developers:
35 * Ranjit Noronha: noronha@cse.ohio-state.edu
36 * Lei Chai : chail@cse.ohio-state.edu
37 * Weikuan Yu : yuw@cse.ohio-state.edu
38 *
39 */
40
41 #pragma ident "@(#)rpcib.c 1.29 06/01/25 SMI"
42
43 /*
44 * The rpcib plugin. Implements the interface for RDMATF's
45 * interaction with IBTF.
46 */
47
48 #include <sys/param.h>
49 #include <sys/types.h>
50 #include <sys/user.h>
51 #include <sys/systm.h>
52 #include <sys/sysmacros.h>
53 #include <sys/proc.h>
54 #include <sys/socket.h>
55 #include <sys/file.h>
56 #include <sys/stream.h>
57 #include <sys/strsubr.h>
58 #include <sys/stropts.h>
59 #include <sys/errno.h>
60 #include <sys/kmem.h>
61 #include <sys/debug.h>
62 #include <sys/systm.h>
63 #include <sys/pathname.h>
64 #include <sys/kstat.h>
65 #include <sys/t_lock.h>
66 #include <sys/ddi.h>
67 #include <sys/cmn_err.h>
68 #include <sys/time.h>
69 #include <sys/isa_defs.h>
70 #include <sys/callb.h>
71 #include <sys/sunddi.h>
72 #include <sys/sunndi.h>
73
74 /* #define IB_FMR_SUP */
75 /* #define CLNT_POLL_CQ */
76 #include <sys/ib/ibtl/ibti.h>
77 #include <rpc/rpc.h>
78 #include <rpc/ib.h>
79
80 #include <sys/modctl.h>
81
82 #include <sys/pathname.h>
83 #include <sys/kstr.h>
84 #include <sys/sockio.h>
85 #include <sys/vnode.h>
86 #include <sys/tiuser.h>
87 #include <net/if.h>
88 #include <sys/cred.h>
89 #include <rpc/rpc_rdma.h>
90
91 int num_clients = 0;
92 volatile uint32_t is_server = 0;
93
94 extern char *inet_ntop(int, const void *, char *, int);
95
96
97 /*
98 * Prototype declarations for driver ops
99 */
100
101 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
102 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
103 void *, void **);
104 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
105
106
107 /* rpcib cb_ops */
108 static struct cb_ops rpcib_cbops = {
109 nulldev, /* open */
110 nulldev, /* close */
111 nodev, /* strategy */
112 nodev, /* print */
113 nodev, /* dump */
114 nodev, /* read */
115 nodev, /* write */
116 nodev, /* ioctl */
117 nodev, /* devmap */
118 nodev, /* mmap */
119 nodev, /* segmap */
120 nochpoll, /* poll */
121 ddi_prop_op, /* prop_op */
122 NULL, /* stream */
123 D_MP, /* cb_flag */
124 CB_REV, /* rev */
125 nodev, /* int (*cb_aread)() */
126 nodev /* int (*cb_awrite)() */
127 };
128
129
130
131
132 /*
133 * Device options
134 */
135 static struct dev_ops rpcib_ops = {
136 DEVO_REV, /* devo_rev, */
137 0, /* refcnt */
138 rpcib_getinfo, /* info */
139 nulldev, /* identify */
140 nulldev, /* probe */
141 rpcib_attach, /* attach */
142 rpcib_detach, /* detach */
143 nodev, /* reset */
144 &rpcib_cbops, /* driver ops - devctl interfaces */
145 NULL, /* bus operations */
146 NULL /* power */
147 };
148
149 /*
150 * Module linkage information.
151 */
152
153 static struct modldrv rib_modldrv = {
154 &mod_driverops, /* Driver module */
155 "RPCIB plugin driver, ver 1.29", /* Driver name and version */
156 &rpcib_ops, /* Driver ops */
157 };
158
159 static struct modlinkage rib_modlinkage = {
160 MODREV_1,
161 (void *)&rib_modldrv,
162 NULL
163 };
164
165 #ifdef SERVER_REG_CACHE
166 typedef struct cache_struct {
167 avl_node_t avl_link;
168 rib_lrc_entry_t r;
169 uint32_t len;
170 uint32_t elements;
171 kmutex_t node_lock;
172 } cache_avl_struct_t;
173
174
175 #if 1
176 int rib_total_buffers = 0;
177 #endif
178 #endif
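/*
 * Sketch of the SERVER_REG_CACHE idea: each cache_avl_struct_t node
 * above carries a registered buffer (rib_lrc_entry_t) and, apparently
 * keyed by the len field via avl_compare(), lets the server reuse an
 * existing memory registration for a reply of a size it has already
 * seen instead of paying for ibt_register_mr() on every RPC.
 */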
179 /*
180 * rib_stat: private data pointer used when registering
181 * with the IBTF. It is returned to the consumer
182 * in all callbacks.
183 */
184 static rpcib_state_t *rib_stat = NULL;
185
186 #define RNR_RETRIES IBT_RNR_INFINITE_RETRY
187 #define MAX_PORTS 2
188
189 #ifdef IB_FMR_SUP
190 #define IB_FMR_DIRTY_MARK 32
191 #define IB_FMR_MAX_SIZE 1048576
192 /*#define IB_FMR_MAX_SIZE 32768 */
193 #endif
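/*
 * A hedged reading of the FMR tunables above (the FMR code itself is
 * not shown): IB_FMR_DIRTY_MARK presumably maps to the IBTF FMR
 * pool's dirty watermark, i.e. how many unmapped-but-not-yet-
 * invalidated entries may accumulate before the pool is flushed,
 * and IB_FMR_MAX_SIZE caps a single fast-registration mapping at
 * 1 MB (a 32 KB variant is left commented out).
 */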
194
195 int preposted_rbufs = RDMA_BUFS_GRANT;
196 int send_threshold = 1;
197
198 /*
199 * State of the plugin.
200 * ACCEPT = accepting new connections and requests.
201 * NO_ACCEPT = not accepting new connections and requests.
202 * This should eventually move to the rpcib_state_t structure, since it
203 * records which state the plugin is in for a particular type of service,
204 * such as NFS, NLM, or the v4 callback daemon. The plugin might be in
205 * the accept state for one and in the no_accept state for another.
206 */
207 int plugin_state;
208 kmutex_t plugin_state_lock;
209
210
211 /*
212 * RPCIB RDMATF operations
213 */
214 #if defined(MEASURE_POOL_DEPTH)
215 static void rib_posted_rbufs(uint32_t x) { return; }
216 #endif
217 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
218 static rdma_stat rib_disconnect(CONN *conn);
219 static void rib_listen(struct rdma_svc_data *rd);
220 static void rib_listen_stop(struct rdma_svc_data *rd);
221 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
222 struct mrc *buf_handle);
223 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
224 struct mrc buf_handle);
225 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
226 caddr_t buf, uint_t buflen, struct mrc *buf_handle);
227 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
228 struct mrc buf_handle);
229 #ifdef SERVER_REG_CACHE
230 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
231 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc);
232 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
233 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
234 #else
235 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
236 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle);
237 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
238 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle);
239
240 #endif
241 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
242 caddr_t buf, int len, int cpu);
243
244 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
245
246 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
247 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
248
249 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
250
251 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
252 #if defined (CLNT_INTERRUPT_COAL)
253 static void rib_scq_free(caddr_t);
254 static rdma_stat rib_send_bl(CONN *conn, struct clist *cl, uint32_t msgid);
255 #endif
256 #if defined(ASYNC_SERVER_DEREG)
257 static rdma_stat rib_send_nw(CONN *conn, struct clist *cl, uint32_t msgid, caddr_t, caddr_t, int, caddr_t, int, int, int);
258 #endif
259 #if defined(ASYNC_CLIENT_DEREG)
260 static void insert_queue(CONN *conn, struct clist *rwc);
261 #endif
262 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
263 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
264 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
265 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
266 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
267 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
268 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
269 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
270 static rdma_stat rib_conn_release(CONN *conn);
271 static rdma_stat rib_getinfo(rdma_info_t *info);
272 #ifdef DYNAMIC_CREDIT_CONTROL
273 void rib_get_resource_info(CONN *, int *, int *);
274 #endif
275
276 #ifdef SERVER_REG_CACHE
277 static rib_lrc_entry_t *rib_get_server_cache_buf(CONN *conn, uint32_t len);
278 static void rib_free_server_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
279 static void rib_destroy_cache(rib_hca_t *hca);
280 static void
281 rib_server_side_cache_reclaim(void *argp);
282 static int avl_compare(const void *t1, const void *t2);
283 #endif
284
285 static rdma_stat rib_register_ats(rib_hca_t *);
286 static void rib_deregister_ats();
287 static void rib_stop_services(rib_hca_t *);
288
289 /*
290 * RPCIB addressing operations
291 */
292 char ** get_ip_addrs(int *count);
293 int get_interfaces(TIUSER *tiptr, int *num);
294 int find_addrs(TIUSER *tiptr, char **addrs, int num_ifs);
295 int get_ibd_ipaddr(rpcib_ibd_insts_t *);
296 rpcib_ats_t *get_ibd_entry(ib_gid_t *, ib_pkey_t, rpcib_ibd_insts_t *);
297 void rib_get_ibd_insts(rpcib_ibd_insts_t *);
298 #if defined(ASYNC_SERVER_DEREG)||defined(ASYNC_CLIENT_DEREG)
299 static int clist_deregister1(CONN *, struct clist *, bool_t );
300 #endif
301
302 #if defined(ASYNC_CLIENT_DEREG)
303 typedef struct async_dereg {
304 struct async_dereg *forw;
305 struct async_dereg *back;
306 CONN c_conn;
307 struct clist c_clist;
308 } ASYNC;
309 static void async_dereg_thread(caddr_t arg);
310 extern pri_t minclsyspri; /* priority for taskq */
311 static ASYNC rqueue;
312 static kmutex_t at_mutex;
313 static kcondvar_t at_cond;
314 #endif
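/*
 * A sketch of the ASYNC_CLIENT_DEREG flow, inferred from the
 * declarations above: rqueue is a self-anchored doubly-linked ring
 * protected by at_mutex.  insert_queue() links a (CONN, clist) pair
 * onto the ring and signals at_cond; async_dereg_thread() sleeps on
 * at_cond, pulls entries off the ring, and performs the clist
 * deregistration outside the caller's context, so RPC completion is
 * not stalled behind memory-deregistration cost.
 */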
315 /*
316 * RDMA operations the RPCIB module exports
317 */
318 static rdmaops_t rib_ops = {
319 rib_reachable,
320 rib_conn_get,
321 rib_conn_release,
322 rib_listen,
323 rib_listen_stop,
324 rib_registermem,
325 rib_deregistermem,
326 rib_registermemsync,
327 rib_deregistermemsync,
328 rib_syncmem,
329 rib_reg_buf_alloc,
330 rib_reg_buf_free,
331 rib_send,
332 #if defined (CLNT_INTERRUPT_COAL)
333 rib_send_bl,
334 #endif
335 #if defined(ASYNC_SERVER_DEREG)
336 rib_send_nw,
337 #endif
338 rib_send_resp,
339 rib_post_resp,
340 rib_post_recv,
341 rib_recv,
342 rib_read,
343 rib_write,
344 rib_getinfo,
345 #ifdef SERVER_REG_CACHE
346 rib_get_server_cache_buf,
347 rib_free_server_cache_buf,
348 #endif
349 #ifdef DYNAMIC_CREDIT_CONTROL
350 rib_get_resource_info,
351 #endif
352 #if defined(ASYNC_CLIENT_DEREG)
353 insert_queue,
354 #endif
355 };
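/*
 * RDMATF consumers reach these routines through this ops vector
 * rather than by direct calls; assuming the rdma_ops field name from
 * rpc/rdma.h, the dispatch is roughly of the form (shown only as an
 * illustration):
 *
 *	status = conn->c_rdmamod->rdma_ops->rdma_send(conn, cl, xid);
 *
 * Because the initializer above changes shape under the various
 * compile-time options, rpcib and the RDMATF core must be built with
 * the same set of defines or the vector slots will not line up.
 */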
356
357 /*
358 * RDMATF RPCIB plugin details
359 */
360 static rdma_mod_t rib_mod = {
361 "ibtf", /* api name */
362 RDMATF_VERS_1,
363 0,
364 &rib_ops, /* rdma op vector for ibtf */
365 };
366
367 static rdma_stat open_hcas(rpcib_state_t *);
368 static rdma_stat rib_qp_init(rib_qp_t *, int);
369 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
370 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
371 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
372 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
373 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
374 #ifdef IB_FMR_SUP
375 static rdma_stat rib_reg_mem_fmr(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
376 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_ma_hdl_t *, ibt_pmr_desc_t *);
377 #endif
378 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
379 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
380 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
381 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
382 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *);
383 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
384 rib_qp_t **);
385 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
386 rib_qp_t **);
387 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
388 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
389 static int rib_free_sendwait(struct send_wid *);
390 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
391 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
392 static void rdma_done_rem_list(rib_qp_t *);
393 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
394
395 static void rib_async_handler(void *,
396 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
397 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
398 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
399 static int rib_free_svc_recv(struct svc_recv *);
400 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
401 static void rib_free_wid(struct recv_wid *);
416 };
417
418 /*
419 * Global structure
420 */
421
422 typedef struct rpcib_s {
423 dev_info_t *rpcib_dip;
424 kmutex_t rpcib_mutex;
425 } rpcib_t;
426
427 rpcib_t rpcib;
428
429 /*
430 * /etc/system tunable that controls debugging
431 * in the rpcib kernel module.
432 * Set it to a value greater than 1 to increase
433 * the verbosity of the debug messages.
434 */
435 int rib_debug = 0;
436 #if defined(CLNT_POLL_CQ)
437 int max_poll_count = 500;
438 #endif
439 static int ats_running = 0;
440
441
442 int
443 _init(void)
444 {
445 int error;
446
447 error = mod_install((struct modlinkage *)&rib_modlinkage);
448 if (error != 0) {
449 /*
450 * Could not load module
451 */
452 return (error);
453 }
454 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
455
456 return (0);
457 }
458
459 int
460 _fini()
461 {
677 " ATS service: %s",
678 to_remove->srv_name);
679 }
680 #endif
681 }
682 kmem_free(to_remove, sizeof (rib_service_t));
683 }
684 hca->ats_list = NULL;
685 rw_exit(&hca->service_list_lock);
686 }
687
688 static void rib_rbufpool_free(rib_hca_t *, int);
689 static void rib_rbufpool_deregister(rib_hca_t *, int);
690 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
691 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
692 static rdma_stat rib_rem_replylist(rib_qp_t *);
693 static int rib_remreply(rib_qp_t *, struct reply *);
694 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
695 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
696
697
698 /*
699 * One CQ pair per HCA
700 */
701 static rdma_stat
702 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
703 rib_cq_t **cqp, rpcib_state_t *ribstat)
704 {
705 rib_cq_t *cq;
706 ibt_cq_attr_t cq_attr;
707 uint32_t real_size;
708 ibt_status_t status;
709 rdma_stat error = RDMA_SUCCESS;
710
711 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
712 cq->rib_hca = hca;
713 cq_attr.cq_size = cq_size;
714 cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
715 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
716 &real_size);
717 if (status != IBT_SUCCESS) {
738
739 return (error);
740 fail:
741 if (cq->rib_cq_hdl)
742 (void) ibt_free_cq(cq->rib_cq_hdl);
743 if (cq)
744 kmem_free(cq, sizeof (rib_cq_t));
745 return (error);
746 }
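/*
 * Usage sketch (assumed from the one-CQ-pair-per-HCA note above):
 * open_hcas() invokes this helper four times per HCA, e.g.
 *
 *	status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
 *	    &hca->svc_rcq, ribstat);
 *
 * to populate clnt_scq/clnt_rcq and svc_scq/svc_rcq.  Note that
 * ibt_alloc_cq() may round the requested size up and reports the
 * actual depth in real_size.
 */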
747
748 static rdma_stat
749 open_hcas(rpcib_state_t *ribstat)
750 {
751 rib_hca_t *hca;
752 ibt_status_t ibt_status;
753 rdma_stat status;
754 ibt_hca_portinfo_t *pinfop;
755 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS;
756 uint_t size, cq_size;
757 int i;
758 #ifdef IB_FMR_SUP
759 ibt_fmr_pool_attr_t fmr_attr;
760 uint_t h_page_sz;
761 #endif
762 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
763 if (ribstat->hcas == NULL)
764 ribstat->hcas = kmem_zalloc(ribstat->hca_count *
765 sizeof (rib_hca_t), KM_SLEEP);
766
767 /*
768 * Open a hca and setup for RDMA
769 */
770 for (i = 0; i < ribstat->hca_count; i++) {
771 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
772 ribstat->hca_guids[i],
773 &ribstat->hcas[i].hca_hdl);
774 if (ibt_status != IBT_SUCCESS) {
775 cmn_err(CE_WARN, "open_hcas: ibt_open_hca (%d) "
776 "returned %d", i, ibt_status);
777 continue;
778 }
779 ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
780 hca = &(ribstat->hcas[i]);
781 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
854 }
855
856 /*
857 * Create buffer pools.
858 * Note rib_rbuf_create also allocates memory windows.
859 */
860 hca->recv_pool = rib_rbufpool_create(hca,
861 RECV_BUFFER, MAX_BUFS);
862 if (hca->recv_pool == NULL) {
863 cmn_err(CE_WARN, "open_hcas: recv buf pool failed\n");
864 goto fail3;
865 }
866
867 hca->send_pool = rib_rbufpool_create(hca,
868 SEND_BUFFER, MAX_BUFS);
869 if (hca->send_pool == NULL) {
870 cmn_err(CE_WARN, "open_hcas: send buf pool failed\n");
871 rib_rbufpool_destroy(hca, RECV_BUFFER);
872 goto fail3;
873 }
874 #ifdef IB_FMR_SUP
875 /* Global FMR POOL */
876 bzero(&fmr_attr, sizeof (ibt_fmr_pool_attr_t));
877
878 h_page_sz = hca->hca_attrs.hca_page_sz * 1024;
879
880 fmr_attr.fmr_max_pages_per_fmr =
881 (IB_FMR_MAX_SIZE / h_page_sz) + 2;
882 fmr_attr.fmr_pool_size = MAX_BUFS * 2;
883 fmr_attr.fmr_dirty_watermark = IB_FMR_DIRTY_MARK;
884 fmr_attr.fmr_page_sz = h_page_sz;
885 fmr_attr.fmr_cache = B_FALSE;
886 fmr_attr.fmr_flags = IBT_MR_SLEEP |
887 IBT_MR_ENABLE_LOCAL_WRITE |
888 IBT_MR_ENABLE_REMOTE_READ |
889 IBT_MR_ENABLE_REMOTE_WRITE;
890 fmr_attr.fmr_func_hdlr = NULL;
891
892 if (rib_debug > 1) {
893 cmn_err(CE_NOTE, "open_hcas: ibt_create_fmr_pool:");
894 cmn_err(CE_NOTE, "fmr_page_sz %d, fmr_pool_sz %d, "
895 "max_pages_per_fmr %d", fmr_attr.fmr_page_sz,
896 fmr_attr.fmr_pool_size,
897 fmr_attr.fmr_max_pages_per_fmr);
898 }
899
900 ibt_status = ibt_create_fmr_pool(hca->hca_hdl, hca->pd_hdl,
901 &fmr_attr, &hca->fmr_pool);
902 if (ibt_status != IBT_SUCCESS) {
903 cmn_err(CE_WARN, "open_hcas: Global FMR pool creation "
904 "failed: %d\n", ibt_status);
905 rib_rbufpool_destroy(hca, RECV_BUFFER);
906 rib_rbufpool_destroy(hca, SEND_BUFFER);
907 goto fail3;
908 }
909 #endif
910 #ifdef SERVER_REG_CACHE
911 cmn_err(CE_NOTE,"Registration Cache enabled\n");
912 {
914 hca->server_side_cache =
915 kmem_cache_create("rib_server_side_cache",
916 sizeof (cache_avl_struct_t), 0,
917 NULL,
918 NULL,
919 rib_server_side_cache_reclaim,
920 hca, NULL, 0);
921 avl_create(&hca->avl_tree,
922 avl_compare,
923 sizeof (cache_avl_struct_t),
924 offsetof(cache_avl_struct_t, avl_link));
925 /* mutex_init(&hca->avl_lock, NULL, MUTEX_DEFAULT, NULL);*/
926 rw_init(&hca->avl_rw_lock, NULL, RW_DRIVER, hca->iblock);
927 hca->avl_init = TRUE;
928
929 }
930 #endif
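/*
 * Server registration cache layout (a sketch, inferred from the code
 * above): the AVL tree is keyed on buffer length via avl_compare(),
 * so each cache_avl_struct_t node anchors a queue of reusable,
 * already-registered buffers of a single size.  avl_create() needs
 * the byte offset of the AVL linkage within the node, which is
 * exactly what offsetof(cache_avl_struct_t, avl_link) supplies.
 */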
931
932 #if defined(ASYNC_CLIENT_DEREG)
933 rqueue.forw = rqueue.back = &rqueue;
934 mutex_init(&at_mutex, NULL, MUTEX_DEFAULT, NULL);
935 cv_init(&at_cond, NULL, CV_DEFAULT, NULL);
936 (void) thread_create(NULL, 0, async_dereg_thread, NULL, 0, &p0,
937 TS_RUN, minclsyspri);
938 #endif
939 /*
940 * Initialize the registered service list and
941 * the lock
942 */
943 hca->service_list = NULL;
944 rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);
945
946 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
947 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
948 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
949 hca->iblock);
950 rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
951 hca->iblock);
952 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
953 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
954 hca->inuse = TRUE;
955 /*
956 * XXX One hca only. Add multi-hca functionality if needed
957 * later.
958 */
1060 * Notify poster
1061 */
1062 cv_signal(&wd->wait_cv);
1063 mutex_exit(&wd->sendwait_lock);
1064 } else {
1065 /*
1066 * Poster not waiting for notification.
1067 * Free the send buffers and send_wid
1068 */
1069 for (i = 0; i < wd->nsbufs; i++) {
1070 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
1071 (void *)(uintptr_t)wd->sbufaddr[i]);
1072 }
1073 mutex_exit(&wd->sendwait_lock);
1074 (void) rib_free_sendwait(wd);
1075 }
1076 }
1077 }
1078 }
1079
1080 #if defined (CLNT_INTERRUPT_COAL)
1081 static void
1082 rib_scq_free(caddr_t widd)
1083 {
1084 struct send_wid *wd = (struct send_wid *)widd;
1085 ibt_status_t ibt_status;
1086 ibt_wc_t wc;
1087 int i;
1088 CONN *conn = qptoc(wd->qp);
1089
1090 wc.wc_status = IBT_WC_SUCCESS;
1091 mutex_enter(&wd->sendwait_lock);
1092 switch (wc.wc_status) {
1093 case IBT_WC_SUCCESS:
1094 wd->status = RDMA_SUCCESS;
1095 break;
1096 case IBT_WC_WR_FLUSHED_ERR:
1097 wd->status = RDMA_FAILED;
1098 break;
1099 default:
1100 /*
1101 * RC Send Q Error Code Local state Remote State
1102 * ==================== =========== ============
1103 * IBT_WC_BAD_RESPONSE_ERR ERROR None
1104 * IBT_WC_LOCAL_LEN_ERR ERROR None
1105 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None
1106 * IBT_WC_LOCAL_PROTECT_ERR ERROR None
1107 * IBT_WC_MEM_WIN_BIND_ERR ERROR None
1108 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR
1109 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR
1110 * IBT_WC_REMOTE_OP_ERR ERROR ERROR
1111 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None
1112 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None
1113 * IBT_WC_WR_FLUSHED_ERR None None
1114 */
1115 #ifdef DEBUG
1116 if (rib_debug > 1) {
1117 if (wc.wc_status != IBT_WC_SUCCESS) {
1118 cmn_err(CE_NOTE, "rib_clnt_scq_handler: "
1119 "WR completed in error, wc.wc_status:%d, "
1120 "wc_id:%llx\n", wc.wc_status, (longlong_t)wc.wc_id);
1121 }
1122 }
1123 #endif
1124 /*
1125 * Channel in error state. Set connection to
1126 * ERROR and cleanup will happen either from
1127 * conn_release or from rib_conn_get
1128 */
1129 wd->status = RDMA_FAILED;
1130 mutex_enter(&conn->c_lock);
1131 if (conn->c_state != C_DISCONN_PEND)
1132 conn->c_state = C_ERROR;
1133 mutex_exit(&conn->c_lock);
1134 break;
1135 }
1136 if (wd->cv_sig == 1) {
1137 /*
1138 * Notify poster
1139 */
1140 cmn_err(CE_NOTE,"Some error \n");
1141 cv_signal(&wd->wait_cv);
1142 mutex_exit(&wd->sendwait_lock);
1143 } else {
1144 /*
1145 * Poster not waiting for notification.
1146 * Free the send buffers and send_wid
1147 */
1148 for (i = 0; i < wd->nsbufs; i++) {
1149 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
1150 (void *)(uintptr_t)wd->sbufaddr[i]);
1151 }
1152 mutex_exit(&wd->sendwait_lock);
1153 (void) rib_free_sendwait(wd);
1154 }
1155 }
1156 #endif
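/*
 * CLNT_INTERRUPT_COAL design note (a sketch inferred from
 * rib_scq_free() above and rib_send_bl() below): most client sends
 * are posted unsignaled and their send_wids are parked on the qp->wd
 * ring.  Roughly every preposted_rbufs/2 sends, one signaled send is
 * issued and the parked descriptors are drained through
 * rib_scq_free(), batching send-buffer reclamation instead of taking
 * a completion interrupt per request.
 */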
1157
1158 /* ARGSUSED */
1159 static void
1160 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1161 {
1162 ibt_status_t ibt_status;
1163 ibt_wc_t wc;
1164 int i;
1165
1166 /*
1167 * Re-enable cq notify here to avoid missing any
1168 * completion queue notification.
1169 */
1170 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1171
1172 ibt_status = IBT_SUCCESS;
1173 while (ibt_status != IBT_CQ_EMPTY) {
1174 bzero(&wc, sizeof (wc));
1175 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1176 if (ibt_status != IBT_SUCCESS)
1177 return;
1178
1179 /*
1180 * Got a send completion
1181 */
1182 #ifdef DEBUG
1183 if (rib_debug > 1 && wc.wc_status != IBT_WC_SUCCESS) {
1184 cmn_err(CE_NOTE, "rib_svc_scq_handler: WR completed in error "
1185 "wc.wc_status:%d, wc_id:%llX",
1186 wc.wc_status, (longlong_t)wc.wc_id);
1187 }
1188 #endif
1189 if (wc.wc_id != 0) { /* XXX 0 wr_id possible ???? */
1190 struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1191 #ifdef ASYNC_SERVER_DEREG
1192 if (wd->c1) {
1193 (void) clist_deregister1((CONN *)wd->c,
    (struct clist *)wd->c1, TRUE);
1194 #ifdef SERVER_REG_CACHE
1195 RDMA_FREE_SERVER_CACHE_BUF((CONN *)wd->c,
    (rib_lrc_entry_t *)(((struct clist *)wd->c1)->long_reply_buf));
1196 #else
1197 if (wd->l1)
1198 kmem_free((void *)(wd->c1)->c_saddr, wd->l1);
1199 #endif
1200 kmem_free((void *)(wd->c1), wd->wl * sizeof (struct clist));
1201 }
1202 if (wd->c2) {
1203 (void) clist_deregister1((CONN *)wd->c,
    (struct clist *)wd->c2, TRUE);
1204 #ifdef SERVER_REG_CACHE
1205 RDMA_FREE_SERVER_CACHE_BUF((CONN *)wd->c,
    (rib_lrc_entry_t *)(((struct clist *)wd->c2)->long_reply_buf));
1206 #else
1207 if (wd->l2)
1208 kmem_free((void *)(wd->c2)->c_saddr, wd->l2);
1209 #endif
1210 kmem_free((void *)(wd->c2), wd->rl * sizeof (struct clist));
1211 }
1212 #endif
1213 mutex_enter(&wd->sendwait_lock);
1214 if (wd->cv_sig == 1) {
1215 /*
1216 * Update completion status and notify poster
1217 */
1218 if (wc.wc_status == IBT_WC_SUCCESS)
1219 wd->status = RDMA_SUCCESS;
1220 else
1221 wd->status = RDMA_FAILED;
1222 cv_signal(&wd->wait_cv);
1223 mutex_exit(&wd->sendwait_lock);
1224 } else {
1225 /*
1226 * Poster not waiting for notification.
1227 * Free the send buffers and send_wid
1228 */
1229 for (i = 0; i < wd->nsbufs; i++) {
1230 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
1231 (void *)(uintptr_t)wd->sbufaddr[i]);
1232 }
1233 mutex_exit(&wd->sendwait_lock);
1234 (void) rib_free_sendwait(wd);
1235 }
1236 }
1237 }
1238 }
1239
1240 /*
1241 * RCQ handler
1242 */
1243 /* ARGSUSED */
1244 static void
1245 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1246 {
1247 rib_qp_t *qp;
1248 ibt_status_t ibt_status;
1249 ibt_wc_t wc;
1250 struct recv_wid *rwid;
1251 #if defined(CLNT_POLL_CQ)
1252 uint32_t count = 0;
1253 #endif
1254
1255 /*
1256 * Re-enable cq notify here to avoid missing any
1257 * completion queue notification.
1258 */
1259 #if !defined(CLNT_POLL_CQ)
1260 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1261 #endif
1262
1263 ibt_status = IBT_SUCCESS;
1264 while (ibt_status != IBT_CQ_EMPTY) {
1265 #if defined(CLNT_POLL_CQ)
1266 poll_cq_again:
1267 #endif
1268 bzero(&wc, sizeof (wc));
1269 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1270 #if defined(CLNT_POLL_CQ)
1271 if (ibt_status == IBT_CQ_EMPTY) {
1272 count++;
1273 if (count == max_poll_count) {
1274 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1275 return;
1276 }
1277 goto poll_cq_again;
1278 }
1279 #endif
1280 if (ibt_status != IBT_SUCCESS)
1281 #if defined(CLNT_POLL_CQ)
1282 {
1283 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1284 #endif
1285 return;
1286 #if defined(CLNT_POLL_CQ)
1287 }
1288 count = 0;
1289 #endif
1290 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1291 qp = rwid->qp;
1292 if (wc.wc_status == IBT_WC_SUCCESS) {
1293 XDR inxdrs, *xdrs;
1294 uint_t xid, vers, op, find_xid = 0;
1295 struct reply *r;
1296 CONN *conn = qptoc(qp);
1297 uint32_t rdma_credit = 0;
1298
1299 xdrs = &inxdrs;
1300 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1301 wc.wc_bytes_xfer, XDR_DECODE);
1302 /*
1303 * Treat xid as opaque (xid is the first entity
1304 * in the rpc rdma message).
1305 */
1306 xid = *(uint32_t *)(uintptr_t)rwid->addr;
1307 /* Skip xid and set the xdr position accordingly. */
1308 XDR_SETPOS(xdrs, sizeof (uint32_t));
1309 (void) xdr_u_int(xdrs, &vers);
1310 (void) xdr_u_int(xdrs, &rdma_credit);
1311 (void) xdr_u_int(xdrs, &op);
1312 XDR_DESTROY(xdrs);
1313 if (vers != RPCRDMA_VERS) {
1314 /*
1315 * Invalid RPC/RDMA version. Cannot interoperate.
1316 * Set connection to ERROR state and bail out.
1317 */
1318 mutex_enter(&conn->c_lock);
1319 if (conn->c_state != C_DISCONN_PEND)
1320 conn->c_state = C_ERROR;
1321 mutex_exit(&conn->c_lock);
1322 rib_rbuf_free(conn, RECV_BUFFER,
1323 (void *)(uintptr_t)rwid->addr);
1324 rib_free_wid(rwid);
1325 continue;
1326 }
1327
1328 mutex_enter(&qp->replylist_lock);
1329 for (r = qp->replylist; r != NULL; r = r->next) {
1330 if (r->xid == xid) {
1408 mblk_t *mp;
1409
1410 /*
1411 * Re-enable cq notify here to avoid missing any
1412 * completion queue notification.
1413 */
1414 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1415
1416 ibt_status = IBT_SUCCESS;
1417 while (ibt_status != IBT_CQ_EMPTY) {
1418 bzero(&wc, sizeof (wc));
1419 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1420 if (ibt_status != IBT_SUCCESS)
1421 return;
1422
1423 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1424 qp = s_recvp->qp;
1425 conn = qptoc(qp);
1426 mutex_enter(&qp->posted_rbufs_lock);
1427 qp->n_posted_rbufs--;
1428 #if defined(MEASURE_POOL_DEPTH)
1429 rib_posted_rbufs(preposted_rbufs - qp->n_posted_rbufs);
1430 #endif
1431 if (qp->n_posted_rbufs == 0)
1432 cv_signal(&qp->posted_rbufs_cv);
1433 mutex_exit(&qp->posted_rbufs_lock);
1434
1435 if (wc.wc_status == IBT_WC_SUCCESS) {
1436 XDR inxdrs, *xdrs;
1437 uint_t xid, vers, op;
1438 uint32_t rdma_credit;
1439
1440 xdrs = &inxdrs;
1441 /* s_recvp->vaddr stores data */
1442 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1443 wc.wc_bytes_xfer, XDR_DECODE);
1444
1445 /*
1446 * Treat xid as opaque (xid is the first entity
1447 * in the rpc rdma message).
1448 */
1449 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1450 /* Skip xid and set the xdr position accordingly. */
1451 XDR_SETPOS(xdrs, sizeof (uint32_t));
1452 if (!xdr_u_int(xdrs, &vers) ||
1453 !xdr_u_int(xdrs, &rdma_credit) ||
1454 !xdr_u_int(xdrs, &op)) {
1455 rib_rbuf_free(conn, RECV_BUFFER,
1456 (void *)(uintptr_t)s_recvp->vaddr);
1457 XDR_DESTROY(xdrs);
1458 #ifdef DEBUG
1459 cmn_err(CE_NOTE, "rib_svc_rcq_handler: "
1460 "xdr_u_int failed for qp %p, wc_id=%llx",
1461 (void *)qp, (longlong_t)wc.wc_id);
1462 #endif
1463 (void) rib_free_svc_recv(s_recvp);
1464 continue;
1465 }
1466 XDR_DESTROY(xdrs);
1467
1468 if (vers != RPCRDMA_VERS) {
1469 /*
1470 * Invalid RPC/RDMA version. Drop rpc rdma message.
1471 */
1472 rib_rbuf_free(conn, RECV_BUFFER,
1473 (void *)(uintptr_t)s_recvp->vaddr);
1643 return (RDMA_FAILED);
1644 }
1645 } else {
1646 mutex_exit(&rib_stat->open_hca_lock);
1647 return (RDMA_SUCCESS);
1648 }
1649 } else {
1650 *handle = NULL;
1651 if (rib_debug > 2)
1652 cmn_err(CE_WARN, "rib_reachable(): ping_srv failed.\n");
1653 return (RDMA_FAILED);
1654 }
1655 }
1656
1657 /* Client side qp creation */
1658 static rdma_stat
1659 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1660 {
1661 rib_qp_t *kqp = NULL;
1662 CONN *conn;
1663 rdma_clnt_cred_ctrl_t *cc_info;
1664
1665 ASSERT(qp != NULL);
1666 *qp = NULL;
1667
1668 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1669 conn = qptoc(kqp);
1670 kqp->hca = hca;
1671 kqp->rdmaconn.c_rdmamod = &rib_mod;
1672 kqp->rdmaconn.c_private = (caddr_t)kqp;
1673
1674 kqp->mode = RIB_CLIENT;
1675 kqp->chan_flags = IBT_BLOCKING;
1676 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1677 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1678 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1679
1680 /*
1681 * Initialize
1682 */
1683 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1684 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1685 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1686 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1687 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1688 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1689 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1690 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1691 #if defined (CLNT_INTERRUPT_COAL)
1692 kqp->rdmaconn.c_count = 0;
1693 conn->c_count = 0;
1694 bzero(&kqp->wd, sizeof(struct send_wid));
1695 kqp->wd.forw = kqp->wd.back = &kqp->wd;
1696 #endif
1697 /*
1698 * Initialize the client credit control
1699 * portion of the rdmaconn struct.
1700 */
1701 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1702 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1703 cc_info->clnt_cc_granted_ops = 0;
1704 cc_info->clnt_cc_in_flight_ops = 0;
1705 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1706
1707 *qp = kqp;
1708 return (RDMA_SUCCESS);
1709 }
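/*
 * Connection-setup sketch (assumed caller ordering): rib_conn_get()
 * builds a new client connection roughly as
 *
 *	(void) rib_ping_srv(addr_type, svcaddr, &hca);
 *	(void) rib_clnt_create_chan(hca, svcaddr, &qp);
 *	(void) rib_conn_to_srv(hca, qp, &path);
 *
 * This routine only allocates and initializes the rib_qp_t; the RC
 * channel itself is allocated later in rib_conn_to_srv().
 */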
1710
1711 /* Server side qp creation */
1712 static rdma_stat
1713 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1714 {
1715 rib_qp_t *kqp = NULL;
1716 ibt_chan_sizes_t chan_sizes;
1717 ibt_rc_chan_alloc_args_t qp_attr;
1718 ibt_status_t ibt_status;
1719 rdma_srv_cred_ctrl_t *cc_info;
1720
1721 ASSERT(qp != NULL);
1722 *qp = NULL;
1723
1724 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1725 kqp->hca = hca;
1726 kqp->port_num = port;
1727 kqp->rdmaconn.c_rdmamod = &rib_mod;
1728 kqp->rdmaconn.c_private = (caddr_t)kqp;
1729
1730 /*
1731 * Create the qp handle
1732 */
1733 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1734 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1735 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1736 qp_attr.rc_pd = hca->pd_hdl;
1737 qp_attr.rc_hca_port_num = port;
1738 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1739 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1761 goto fail;
1762 }
1763
1764 kqp->mode = RIB_SERVER;
1765 kqp->chan_flags = IBT_BLOCKING;
1766 kqp->q = q; /* server ONLY */
1767
1768 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1769 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1770 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1771 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1772 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1773 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1774 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1775 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1776 /*
1777 * Set the private data area to qp to be used in callbacks
1778 */
1779 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1780 kqp->rdmaconn.c_state = C_CONNECTED;
1781
1782 /*
1783 * Initialize the server credit control
1784 * portion of the rdmaconn struct.
1785 */
1786 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1787 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1788 cc_info->srv_cc_buffers_granted = preposted_rbufs;
1789 cc_info->srv_cc_cur_buffers_used = 0;
1790 cc_info->srv_cc_posted = preposted_rbufs;
1791
1792 *qp = kqp;
1793
1794 num_clients++;
1795 return (RDMA_SUCCESS);
1796 fail:
1797 if (kqp)
1798 kmem_free(kqp, sizeof (rib_qp_t));
1799
1800 return (RDMA_FAILED);
1801 }
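/*
 * Credit-control note (a sketch of the intended flow): the server
 * side starts out having granted preposted_rbufs credits
 * (srv_cc_buffers_granted above), while the client side in
 * rib_clnt_create_chan() starts with zero granted ops.  The
 * rdma_credit word decoded from each RPC/RDMA header in the rcq
 * handlers is what refreshes these counters at runtime.
 */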
1802
1803 void
1804 rib_dump_pathrec(ibt_path_info_t *path_rec)
1805 {
1806 ib_pkey_t pkey;
1807
1808 if (rib_debug > 1) {
1809 cmn_err(CE_NOTE, "Path Record:\n");
1810
1811 cmn_err(CE_NOTE, "Source HCA GUID = %llx\n",
1812 (longlong_t)path_rec->pi_hca_guid);
1813 cmn_err(CE_NOTE, "Dest Service ID = %llx\n",
1814 (longlong_t)path_rec->pi_sid);
2057
2058 (void) bzero(&chan_args, sizeof (chan_args));
2059 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
2060
2061 qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
2062 /* Alloc a RC channel */
2063 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
2064 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
2065 qp_attr.rc_pd = hca->pd_hdl;
2066 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
2067 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
2068 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
2069 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
2070 qp_attr.rc_clone_chan = NULL;
2071 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
2072 qp_attr.rc_flags = IBT_WR_SIGNALED;
2073
2074 chan_args.oc_path = path;
2075 chan_args.oc_cm_handler = rib_clnt_cm_handler;
2076 chan_args.oc_cm_clnt_private = (void *)rib_stat;
2077 chan_args.oc_rdma_ra_out = 4;
2078 chan_args.oc_rdma_ra_in = 4;
2079 chan_args.oc_path_retry_cnt = 2;
2080 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
2081
2082 refresh:
2083 rw_enter(&hca->state_lock, RW_READER);
2084 if (hca->state != HCA_DETACHED) {
2085 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
2086 IBT_ACHAN_NO_FLAGS, &qp_attr, &qp->qp_hdl,
2087 &chan_sizes);
2088 } else {
2089 rw_exit(&hca->state_lock);
2090 return (RDMA_FAILED);
2091 }
2092 rw_exit(&hca->state_lock);
2093
2094 if (ibt_status != IBT_SUCCESS) {
2095 #ifdef DEBUG
2096 cmn_err(CE_WARN, "rib_conn_to_srv: alloc_rc_channel "
2097 "failed, ibt_status=%d.", ibt_status);
2098 #endif
2235 (void) rib_rem_replylist(qp);
2236 }
2237
2238 cv_destroy(&qp->cb_conn_cv);
2239 cv_destroy(&qp->posted_rbufs_cv);
2240 mutex_destroy(&qp->cb_lock);
2241
2242 mutex_destroy(&qp->replylist_lock);
2243 mutex_destroy(&qp->posted_rbufs_lock);
2244 mutex_destroy(&qp->rdlist_lock);
2245
2246 cv_destroy(&conn->c_cv);
2247 mutex_destroy(&conn->c_lock);
2248
2249 if (conn->c_raddr.buf != NULL) {
2250 kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2251 }
2252 if (conn->c_laddr.buf != NULL) {
2253 kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2254 }
2255
2256 /*
2257 * Credit control cleanup.
2258 */
2259 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2260 rdma_clnt_cred_ctrl_t *cc_info;
2261 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2262 cv_destroy(&cc_info->clnt_cc_cv);
2263 }
2264
2265 kmem_free(qp, sizeof (rib_qp_t));
2266
2267 /*
2268 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2269 * then the hca is no longer being used.
2270 */
2271 if (conn_list != NULL) {
2272 rw_enter(&hca->state_lock, RW_READER);
2273 if (hca->state == HCA_DETACHED) {
2274 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2275 if (hca->srv_conn_list.conn_hd == NULL) {
2276 rw_enter(&hca->cl_conn_list.conn_lock,
2277 RW_READER);
2278 if (hca->cl_conn_list.conn_hd == NULL) {
2279 mutex_enter(&hca->inuse_lock);
2280 hca->inuse = FALSE;
2281 cv_signal(&hca->cb_cv);
2282 mutex_exit(&hca->inuse_lock);
2283 }
2284 rw_exit(&hca->cl_conn_list.conn_lock);
2285 }
2286 rw_exit(&hca->srv_conn_list.conn_lock);
2287 }
2288 rw_exit(&hca->state_lock);
2289 }
2290
2291 num_clients--;
2292 return (RDMA_SUCCESS);
2293 }
2294
2295 #ifdef DYNAMIC_CREDIT_CONTROL
2296 void rib_get_resource_info(CONN *conn, int *current_clients, int *avail_bufs)
2297 {
2298 rib_qp_t *qp = ctoqp(conn);
2299 rib_hca_t *hca = qp->hca;
2300 rib_bufpool_t *rbp = NULL;
2301 bufpool_t *bp;
2302
2303 is_server = 1;
2304 rbp = hca->recv_pool;
2305
2306 if (rbp == NULL)
2307 *avail_bufs = 0;
2308 else {
2309 bp = rbp->bpool;
2310 *avail_bufs = bp->buffree;
2311 }
2312
2313 *current_clients = num_clients;
2314 }
2315 #endif
2316
2317 /*
2318 * Wait for a send completion notification. The send_wid is
2319 * freed only after a notification is received, whether the
2320 * completion is successful or in error.
2321 */
2322 static rdma_stat
2323 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2324 {
2325 clock_t timout, cv_wait_ret;
2326 rdma_stat error = RDMA_SUCCESS;
2327 int i;
2328
2329 /*
2330 * Wait for send to complete
2331 */
2332 ASSERT(wd != NULL);
2333 mutex_enter(&wd->sendwait_lock);
2334 if (wd->status == (uint_t)SEND_WAIT) {
2335 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2336 ddi_get_lbolt();
2431
2432 static rdma_stat
2433 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2434 {
2435 mutex_enter(&qp->replylist_lock);
2436 if (rep != NULL) {
2437 (void) rib_remreply(qp, rep);
2438 mutex_exit(&qp->replylist_lock);
2439 return (RDMA_SUCCESS);
2440 }
2441 mutex_exit(&qp->replylist_lock);
2442 return (RDMA_FAILED);
2443 }
2444
2445 /*
2446 * Send buffers are freed here only in case of error in posting
2447 * on QP. If the post succeeded, the send buffers are freed upon
2448 * send completion in rib_sendwait() or in the scq_handler.
2449 */
2450 rdma_stat
2451 #if defined(ASYNC_SERVER_DEREG)
2452 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2453 int send_sig, int cv_sig, caddr_t c, caddr_t c1, int l1, caddr_t c2,
    int l2, int l3, int l4, caddr_t *swid)
2454 #else
2455 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2456 int send_sig, int cv_sig, caddr_t *swid)
2457 #endif
2458 {
2459 struct send_wid *wdesc;
2460 struct clist *clp;
2461 ibt_status_t ibt_status = IBT_SUCCESS;
2462 rdma_stat ret = RDMA_SUCCESS;
2463 ibt_send_wr_t tx_wr;
2464 int i, nds;
2465 ibt_wr_ds_t sgl[DSEG_MAX];
2466 uint_t total_msg_size;
2467 rib_qp_t *qp = ctoqp(conn);
2468
2469 ASSERT(cl != NULL);
2470
2471 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2472
2473 nds = 0;
2474 total_msg_size = 0;
2475 clp = cl;
2476 while (clp != NULL) {
2477 if (nds >= DSEG_MAX) {
2478 cmn_err(CE_WARN, "rib_send_and_wait: DSEG_MAX"
2479 " too small!");
2480 return (RDMA_FAILED);
2481 }
2482 sgl[nds].ds_va = clp->c_saddr;
2483 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2484 sgl[nds].ds_len = clp->c_len;
2485 total_msg_size += clp->c_len;
2486 clp = clp->c_next;
2487 nds++;
2488 }
2489
2490 if (send_sig) {
2491 /* Set SEND_SIGNAL flag. */
2492 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2493 wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2494 *swid = (caddr_t)wdesc;
2495 } else {
2496 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2497 wdesc = rib_init_sendwait(msgid, 0, qp);
2498 *swid = (caddr_t)wdesc;
2499 }
2500 wdesc->nsbufs = nds;
2501 #if defined(ASYNC_SERVER_DEREG)
2502 wdesc->c = c;
2503 wdesc->c1 = c1;
2504 wdesc->c2 = c2;
2505 wdesc->l1 = l1;
2506 wdesc->l2 = l2;
2507 wdesc->wl = l3;
2508 wdesc->rl = l4;
2509 #endif
2510 for (i = 0; i < nds; i++) {
2511 wdesc->sbufaddr[i] = sgl[i].ds_va;
2512 }
2513
2514 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2515 tx_wr.wr_opcode = IBT_WRC_SEND;
2516 tx_wr.wr_trans = IBT_RC_SRV;
2517 tx_wr.wr_nds = nds;
2518 tx_wr.wr_sgl = sgl;
2519
2520 mutex_enter(&conn->c_lock);
2521 if (conn->c_state & C_CONNECTED) {
2522 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2523 }
2524 if (((conn->c_state & C_CONNECTED) == 0) ||
2525 ibt_status != IBT_SUCCESS) {
2526 mutex_exit(&conn->c_lock);
2527 for (i = 0; i < nds; i++) {
2528 rib_rbuf_free(conn, SEND_BUFFER,
2529 (void *)(uintptr_t)wdesc->sbufaddr[i]);
2546 * cv_wait for send to complete.
2547 * We can fail due to a timeout or signal or
2548 * unsuccessful send.
2549 */
2550 ret = rib_sendwait(qp, wdesc);
2551 #ifdef DEBUG
2552 if (rib_debug > 2)
2553 if (ret != 0) {
2554 cmn_err(CE_WARN, "rib_send_and_wait: rib_sendwait "
2555 "FAILED, rdma stat=%d, wr_id %llx, qp %p!",
2556 ret, (longlong_t)tx_wr.wr_id, (void *)qp);
2557 }
2558 #endif
2559 return (ret);
2560 }
2561 }
2562
2563 return (RDMA_SUCCESS);
2564 }
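/*
 * Gather-list sketch: the loop above turns the clist chain into one
 * IBT_WRC_SEND work request with a single SGE per clist element.
 * For instance, an RPC header in one send buffer plus a small
 * payload in a second would yield nds == 2:
 *
 *	sgl[0] = { hdr_va,  hdr_lkey,  hdr_len  }
 *	sgl[1] = { data_va, data_lkey, data_len }
 *
 * which is why chains longer than DSEG_MAX are rejected outright.
 */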
2565
2566 #if defined (CLNT_INTERRUPT_COAL)
2567 rdma_stat
2568 rib_send_bl(CONN *conn, struct clist *cl, uint32_t msgid)
2569 {
2570 rdma_stat ret;
2571 struct send_wid *sd, dlist;
2572 rib_qp_t *qp = ctoqp(conn);
2573 caddr_t wd;
2574 mutex_enter(&conn->c_lock);
2575 if ((conn->c_count + 1) >= (preposted_rbufs / 2)) {
2576 conn->c_count = 0;
2577 dlist.forw = dlist.back = &dlist;
2578 while (qp->wd.forw != &qp->wd) {
2579 sd = qp->wd.forw;
2580 remque(sd);
2581 insque(sd, &dlist);
2582 }
2583 mutex_exit(&conn->c_lock);
2584 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2585 while (dlist.forw != &dlist) {
2586 sd = dlist.forw;
2587 remque(dlist.forw);
2588 rib_scq_free((caddr_t)sd);
2589 }
2590 } else {
2591 mutex_exit(&conn->c_lock);
2592 wd = 0;
2593 ret = rib_send_and_wait(conn, cl, msgid, 0, 0, &wd);
2594 mutex_enter(&conn->c_lock);
2595 conn->c_count++;
2596 insque(wd, &qp->wd);
2597 mutex_exit(&conn->c_lock);
2598 }
2599 return (ret);
2600 }
2601 #endif
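/*
 * rib_send_bl() is the coalescing variant of rib_send(): it defers
 * completion signaling until about half of the peer's preposted
 * receive buffers have been consumed, then flushes the deferred
 * send_wids parked on qp->wd.  The preposted_rbufs/2 threshold is a
 * heuristic trading interrupt rate against how long send buffers
 * remain tied up awaiting reclamation.
 */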
2602
2603 rdma_stat
2604 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2605 {
2606 rdma_stat ret;
caddr_t wd;
2607 /* send-wait & cv_signal */
2608 #if defined(ASYNC_SERVER_DEREG)
2609 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, 0, 0, 0, 0, 0, 0, 0, &wd);
2610 #else
2611 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2612 #endif
2613 return (ret);
2614 }
2615
2616 #if defined(ASYNC_SERVER_DEREG)
2617 rdma_stat
2618 rib_send_nw(CONN *conn, struct clist *cl, uint32_t msgid, caddr_t c, caddr_t c1, int c2, caddr_t c3, int c4, int c5, int c6)
2619 {
2620 rdma_stat ret;
2621 caddr_t wid;
2622 /* send-wait, no cv_signal */
2623 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, c, c1, c2, c3, c4, c5, c6,
    &wid);
2624
2625 return (ret);
2626 }
2627 #endif
2628 /*
2629 * Server interface (svc_rdma_ksend).
2630 * Send RPC reply and wait for RDMA_DONE.
2631 */
2632 rdma_stat
2633 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2634 {
2635 rdma_stat ret = RDMA_SUCCESS;
2636 struct rdma_done_list *rd;
2637 clock_t timout, cv_wait_ret;
2638 caddr_t wid;
2639 rib_qp_t *qp = ctoqp(conn);
2640
2641 mutex_enter(&qp->rdlist_lock);
2642 rd = rdma_done_add(qp, msgid);
2643
2644 /* No cv_signal (whether send-wait or no-send-wait) */
2645 #if defined(ASYNC_SERVER_DEREG)
2646 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, 0, 0, 0, 0, 0, 0, 0, &wid);
2647 #else
2648 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, &wid);
2649 #endif
2650 if (ret != RDMA_SUCCESS) {
2651 #ifdef DEBUG
2652 cmn_err(CE_WARN, "rib_send_resp: send_and_wait "
2653 "failed, msgid %u, qp %p", msgid, (void *)qp);
2654 #endif
2655 rdma_done_rm(qp, rd);
2656 goto done;
2657 }
2658
2659 /*
2660 * Wait for RDMA_DONE from remote end
2661 */
2662 timout = drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2663 cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock,
2664 timout);
2665 rdma_done_rm(qp, rd);
2666 if (cv_wait_ret < 0) {
2667 #ifdef DEBUG
2668 if (rib_debug > 1) {
2669 cmn_err(CE_WARN, "rib_send_resp: RDMA_DONE not"
2937 #ifdef DEBUG
2938 cmn_err(CE_WARN, "rib_recv: no matching reply for "
2939 "xid %u, qp %p\n", msgid, (void *)qp);
2940 #endif
2941 }
2942
2943 /*
2944 * Done.
2945 */
2946 mutex_exit(&qp->replylist_lock);
2947 return (ret);
2948 }
2949
2950 /*
2951 * RDMA write a buffer to the remote address.
2952 */
2953 rdma_stat
2954 rib_write(CONN *conn, struct clist *cl, int wait)
2955 {
2956 ibt_send_wr_t tx_wr;
2957 int cv_sig;
2958 ibt_wr_ds_t sgl[DSEG_MAX];
2959 struct send_wid *wdesc;
2960 ibt_status_t ibt_status;
2961 rdma_stat ret = RDMA_SUCCESS;
2962 rib_qp_t *qp = ctoqp(conn);
2963
2964 if (cl == NULL) {
2965 cmn_err(CE_WARN, "rib_write: NULL clist\n");
2966 return (RDMA_FAILED);
2967 }
2968
2969
2970 while (cl != NULL) {
2971 if (cl->c_len > 0) {
2972 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2973 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_daddr;
2974 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */
2975 sgl[0].ds_va = cl->c_saddr;
2976 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2977 sgl[0].ds_len = cl->c_len;
2978
2979 if (wait) {
2980 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2981 cv_sig = 1;
2982 } else {
2983 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2984 cv_sig = 0;
2985 }
2986
2987 wdesc = rib_init_sendwait(0, cv_sig, qp);
2988 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2989 tx_wr.wr_opcode = IBT_WRC_RDMAW;
2990 tx_wr.wr_trans = IBT_RC_SRV;
2991 tx_wr.wr_nds = 1;
2992 tx_wr.wr_sgl = sgl;
2993
2994 mutex_enter(&conn->c_lock);
2995 if (conn->c_state & C_CONNECTED) {
2996 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2997 }
2998 if (((conn->c_state & C_CONNECTED) == 0) ||
2999 ibt_status != IBT_SUCCESS) {
3000 mutex_exit(&conn->c_lock);
3001 (void) rib_free_sendwait(wdesc);
3002 return (RDMA_FAILED);
3003 }
3004 mutex_exit(&conn->c_lock);
3005
3006 /*
3007 * Wait for send to complete
3008 */
3009 if (wait) {
3010 ret = rib_sendwait(qp, wdesc);
3011 if (ret != 0) {
3012 return (ret);
3013 }
3014 }
3015 }
3016 cl = cl->c_next;
3017 }
3018 return (RDMA_SUCCESS);
3019 }
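/*
 * Note: rib_write() posts one IBT_WRC_RDMAW work request per clist
 * element (single SGE each) and skips zero-length elements.  When
 * "wait" is set the caller blocks on each element's completion in
 * turn; unwaited writes are reaped later in the send CQ handler.
 */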
3020
3021 /*
3022 * RDMA Read a buffer from the remote address.
3023 */
3024 rdma_stat
3025 rib_read(CONN *conn, struct clist *cl, int wait)
3026 {
3027 ibt_send_wr_t rx_wr;
3028 int nds;
3029 int cv_sig;
3030 ibt_wr_ds_t sgl[DSEG_MAX]; /* is 2 sufficient? */
3031 struct send_wid *wdesc;
3032 ibt_status_t ibt_status = IBT_SUCCESS;
3033 rdma_stat ret = RDMA_SUCCESS;
3034 rib_qp_t *qp = ctoqp(conn);
3035
3036 if (cl == NULL) {
3037 cmn_err(CE_WARN, "rib_read: NULL clist\n");
3116 return (zero == 0);
3117 }
3118
3119 /*
3120 * rib_srv_cm_handler()
3121 * Connection Manager callback to handle RC connection requests.
3122 */
3123 /* ARGSUSED */
3124 static ibt_cm_status_t
3125 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
3126 ibt_cm_return_args_t *ret_args, void *priv_data,
3127 ibt_priv_data_len_t len)
3128 {
3129 queue_t *q;
3130 rib_qp_t *qp;
3131 rpcib_state_t *ribstat;
3132 rib_hca_t *hca;
3133 rdma_stat status = RDMA_SUCCESS;
3134 int i;
3135 struct clist cl;
3136 rdma_buf_t rdbuf = {0};
3137 void *buf = NULL;
3138 ibt_cm_req_rcv_t cm_req_rcv;
3139 CONN *conn;
3140 ibt_status_t ibt_status;
3141 ibt_ar_t ar_query, ar_result;
3142 ib_gid_t sgid;
3143
3144
3145 ASSERT(any != NULL);
3146 ASSERT(event != NULL);
3147
3148 ribstat = (rpcib_state_t *)any;
3149 hca = (rib_hca_t *)ribstat->hca;
3150 ASSERT(hca != NULL);
3151
3152 /* got a connection request */
3153 switch (event->cm_type) {
3154 case IBT_CM_EVENT_REQ_RCV:
3155 /*
3156 * If the plugin is in the NO_ACCEPT state, bail out.
3201 cmn_err(CE_NOTE, "\t\t Remote QPN:%u\n",
3202 cm_req_rcv.req_remote_qpn);
3203 cmn_err(CE_NOTE, "\t\t Remote Q_Key:%x\n",
3204 cm_req_rcv.req_remote_qkey);
3205 cmn_err(CE_NOTE, "\t\t Local QP %p (qp_hdl=%p)\n",
3206 (void *)qp, (void *)qp->qp_hdl);
3207 }
3208
3209 if (rib_debug > 2) {
3210 ibt_rc_chan_query_attr_t chan_attrs;
3211
3212 if (ibt_query_rc_channel(qp->qp_hdl, &chan_attrs)
3213 == IBT_SUCCESS) {
3214 cmn_err(CE_NOTE, "rib_svc_cm_handler: qp %p in "
3215 "CEP state %d\n", (void *)qp, chan_attrs.rc_state);
3216 }
3217 }
3218 #endif
3219
3220 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
3221 ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
3222 ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
3223 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
3224
3225 /*
3226 * Pre-posts RECV buffers
3227 */
3228 conn = qptoc(qp);
3229 for (i = 0; i < preposted_rbufs; i++) {
3230 bzero(&rdbuf, sizeof (rdbuf));
3231 rdbuf.type = RECV_BUFFER;
3232 buf = rib_rbuf_alloc(conn, &rdbuf);
3233 if (buf == NULL) {
3234 cmn_err(CE_WARN, "rib_svc_cm_handler: "
3235 "No RECV_BUFFER buf!\n");
3236 (void) rib_disconnect_channel(conn, NULL);
3237 return (IBT_CM_REJECT);
3238 }
3239
3240 bzero(&cl, sizeof (cl));
3241 cl.c_saddr = (uintptr_t)rdbuf.addr;
3242 cl.c_len = rdbuf.len;
4126 rep->prev->next = rep->next;
4127 }
4128 if (rep->next) {
4129 rep->next->prev = rep->prev;
4130 }
4131 if (qp->replylist == rep)
4132 qp->replylist = rep->next;
4133
4134 cv_destroy(&rep->wait_cv);
4135 qp->rep_list_size--;
4136 if (rib_debug > 1)
4137 cmn_err(CE_NOTE, "rib_remreply: qp:%p, rep_list_size:%d\n",
4138 (void *)qp, qp->rep_list_size);
4139
4140 kmem_free(rep, sizeof (*rep));
4141
4142 return (0);
4143 }
4144
4145 rdma_stat
4146 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
4147 struct mrc *buf_handle)
4148 {
4149 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
4150 #ifdef IB_FMR_SUP
4151 ibt_pmr_desc_t pmr_desc; /* vaddr, lkey, rkey */
4152 ibt_ma_hdl_t ma_hdl = NULL;
4153 #endif
4154 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
4155 rdma_stat status;
4156 rib_hca_t *hca = (ctoqp(conn))->hca;
4157
4158 /*
4159 * Note: ALL buffer pools use the same memory type RDMARW.
4160 */
4161 #ifdef IB_FMR_SUP
4162 status = rib_reg_mem_fmr(hca, adsp, buf, buflen, 0, &mr_hdl, &ma_hdl,
4163 &pmr_desc);
4164 if (status == RDMA_SUCCESS) {
4165 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
4166 buf_handle->mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
4167 buf_handle->mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
4168 buf_handle->mrc_lma = (uintptr_t)ma_hdl;
4169 goto ret_stat;
4170 } else {
4171 buf_handle->mrc_linfo = NULL;
4172 buf_handle->mrc_lma = NULL;
4173 buf_handle->mrc_lmr = 0;
4174 buf_handle->mrc_rmr = 0;
4175 }
4176 #endif
4177 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
4178 if (status == RDMA_SUCCESS) {
4179 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
4180 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
4181 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
4182 } else {
4183 buf_handle->mrc_linfo = NULL;
4184 buf_handle->mrc_lmr = 0;
4185 buf_handle->mrc_rmr = 0;
4186 }
4187 ret_stat:
4188 return (status);
4189 }
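/*
 * Registration fast path (a sketch): with IB_FMR_SUP defined,
 * rib_registermem() first tries the HCA-wide FMR pool through
 * rib_reg_mem_fmr() and only falls back to a regular
 * ibt_register_mr() via rib_reg_mem() if that fails.  A non-zero
 * mrc_lma is what later tells rib_deregistermem() to undo the FMR
 * mapping rather than a plain memory registration.
 */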
4190
4191 #ifdef IB_FMR_SUP
4192 static rdma_stat
4193 rib_reg_mem_fmr(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, ibt_mr_flags_t spec,
4194 ibt_mr_hdl_t *mr_hdlp, ibt_ma_hdl_t *ma_hdlp, ibt_pmr_desc_t *pmr_descp)
4195 {
4196 ibt_va_attr_t va_attr;
4197 ibt_phys_buf_t *paddr_list;
4198 uint_t paddr_list_len, num_paddr;
4199 size_t buf_sz = 0;
4200 ibt_pmr_attr_t pmr_attr;
4201 ib_memlen_t paddr_offset;
4202 ibt_status_t ibt_status;
4203 uint_t h_page_sz;
4204 if (adsp)
4205 return (RDMA_FAILED);
4206 bzero(&va_attr, sizeof (ibt_va_attr_t));
4207 va_attr.va_vaddr = (ib_vaddr_t)buf;
4208 va_attr.va_len = size;
4209 va_attr.va_as = (struct as *)(caddr_t)adsp;
4210 va_attr.va_flags = IBT_VA_FMR | IBT_VA_SLEEP;
4211 if (spec == IBT_MR_NONCOHERENT)
4212 va_attr.va_flags |= IBT_VA_NONCOHERENT;
4213 va_attr.va_phys_buf_min = va_attr.va_phys_buf_max = 0;
4214
4215 h_page_sz = hca->hca_attrs.hca_page_sz * 1024;
4216 paddr_list_len = (size / h_page_sz) + 2;
4217 paddr_list = (ibt_phys_buf_t *)kmem_zalloc(sizeof (ibt_phys_buf_t) *
4218 paddr_list_len, KM_NOSLEEP);
if (paddr_list == NULL)
return (RDMA_FAILED);
4219
4220 if (rib_debug > 0) {
4221 cmn_err(CE_NOTE, "fmr: vaddr %p, size %d paddr_list_len %d \n",
4222 buf, size, paddr_list_len);
4223 }
4224
4225 ibt_status = ibt_map_mem_area(hca->hca_hdl, &va_attr, paddr_list_len,
4226 paddr_list, &num_paddr, &buf_sz, &paddr_offset, ma_hdlp);
4227 if (ibt_status != IBT_SUCCESS) {
4228 cmn_err(CE_WARN, "rib_reg_mem_fmr: ibt_map_mem_area failed: "
4229 "status %d", ibt_status);
4230 kmem_free(paddr_list, sizeof (ibt_phys_buf_t) * paddr_list_len);
4231 return (RDMA_FAILED);
4232 }
4233
4234 if (rib_debug > 0) {
4235 cmn_err(CE_NOTE,"fmr: p_laddr %p, p_size %d, buf_sz %d, p_ofset %llX\n",
4236 paddr_list[0].p_laddr, paddr_list[0].p_size, buf_sz,
4237 paddr_offset);
4238 cmn_err(CE_NOTE,"fmr: ibt_map_mem_area: ret %d, num_paddr %d, spec %d\n",
4239 ibt_status, num_paddr, spec);
4240 }
4241
4242 bzero(&pmr_attr, sizeof (ibt_pmr_attr_t));
4243 pmr_attr.pmr_iova = (ib_vaddr_t)buf;
4244 pmr_attr.pmr_len = size;
4245 pmr_attr.pmr_num_buf = num_paddr;
4246 pmr_attr.pmr_buf_sz = buf_sz;
4247 pmr_attr.pmr_buf_list = paddr_list;
4248 pmr_attr.pmr_offset = paddr_offset;
4249 pmr_attr.pmr_flags = spec;
4250 pmr_attr.pmr_ma = *ma_hdlp;
4251
4252 ibt_status = ibt_register_physical_fmr(hca->hca_hdl, hca->fmr_pool,
4253 &pmr_attr, mr_hdlp, pmr_descp);
4254 if (ibt_status != IBT_SUCCESS) {
4255 cmn_err(CE_WARN, "rib_reg_mem_fmr: ibt_register_physical_fmr "
4256 "failed: status %d", ibt_status);
4257 (void) ibt_unmap_mem_area(hca->hca_hdl, *ma_hdlp);
4258 *ma_hdlp = NULL;
4259 kmem_free(paddr_list, sizeof (ibt_phys_buf_t) * paddr_list_len);
4260 return (RDMA_FAILED);
4261 }
4262
4263 if (rib_debug > 0) {
4264 cmn_err(CE_NOTE,"fmr: rkey: 0x%lX lkey: 0x%lX, iova: %p, fmr_hdl %p \n",
4265 pmr_descp->pmd_rkey, pmr_descp->pmd_lkey,
4266 pmr_descp->pmd_iova, *mr_hdlp);
4267 }
4268
4269 kmem_free(paddr_list, sizeof (ibt_phys_buf_t) * paddr_list_len);
4270
4271 return (RDMA_SUCCESS);
4272
4273 }
4274
4275 #endif
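/*
 * Sizing note for rib_reg_mem_fmr(): an unaligned buffer of "size"
 * bytes can straddle at most (size / h_page_sz) + 1 HCA pages, so
 * the (size / h_page_sz) + 2 list length leaves one entry of slack.
 * For example, with 4 KB pages an unaligned 8 KB buffer touches
 * three pages and gets a four-entry paddr_list.  h_page_sz is
 * derived from hca_page_sz, which IBTF reports in KB.
 */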
4276 static rdma_stat
4277 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, ibt_mr_flags_t spec,
4278 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
4279 {
4280 ibt_mr_attr_t mem_attr;
4281 ibt_status_t ibt_status;
4282 mem_attr.mr_vaddr = (uintptr_t)buf;
4283 mem_attr.mr_len = (ib_msglen_t)size;
4284 mem_attr.mr_as = (struct as *)(caddr_t)adsp;
4285 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
4286 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
4287 IBT_MR_ENABLE_WINDOW_BIND | spec;
4288
4289 rw_enter(&hca->state_lock, RW_READER);
4290 if (hca->state == HCA_INITED) {
4291 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
4292 &mem_attr, mr_hdlp, mr_descp);
4293 rw_exit(&hca->state_lock);
4294 } else {
4295 rw_exit(&hca->state_lock);
4296 return (RDMA_FAILED);
4297 }
4298
4299 if (ibt_status != IBT_SUCCESS) {
4300 cmn_err(CE_WARN, "rib_reg_mem: ibt_register_mr "
4301 "(spec:%d) failed for addr %llX, status %d",
4302 spec, (longlong_t)mem_attr.mr_vaddr, ibt_status);
4303 return (RDMA_FAILED);
4304 }
4305 return (RDMA_SUCCESS);
4306 }
4307
4308 rdma_stat
4309 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
4310 #ifdef SERVER_REG_CACHE
4311 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
4312 #else
4313 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle)
4314 #endif
4315 {
4316 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
4317 #ifdef IB_FMR_SUP
4318 ibt_pmr_desc_t pmr_desc; /* vaddr, lkey, rkey */
4319 ibt_ma_hdl_t ma_hdl = NULL;
4320 #endif
4321 #ifdef SERVER_REG_CACHE
4322 rib_lrc_entry_t *l;
4323 #endif
4324 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
4325 rdma_stat status;
4326 rib_hca_t *hca = (ctoqp(conn))->hca;
4327
4328 /*
4329 * Non-coherent memory registration.
4330 */
4331 #ifdef SERVER_REG_CACHE
4332 l = (rib_lrc_entry_t *)lrc;
4333 if (l) {
4334 if (l->registered) {
4335 buf_handle->mrc_linfo =
    (uintptr_t)l->lrc_mhandle.mrc_linfo;
4336 buf_handle->mrc_lmr = (uint32_t)l->lrc_mhandle.mrc_lmr;
4337 buf_handle->mrc_rmr = (uint32_t)l->lrc_mhandle.mrc_rmr;
4338 #ifdef IB_FMR_SUP
4339 buf_handle->mrc_lma = (uintptr_t)l->lrc_mhandle.mrc_lma;
4340 #endif
4341 *sync_handle = (RIB_SYNCMEM_HANDLE)l->lrc_mhandle.mrc_linfo;
4342 return (RDMA_SUCCESS);
4343 } else {
4344 /* Always register the whole cached buffer */
4345 buf = (caddr_t)l->lrc_buf;
4346 buflen = l->lrc_len;
4348 }
4349 }
4350 #endif
4351 #ifdef IB_FMR_SUP
4352 status = rib_reg_mem_fmr(hca, adsp, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl,
4353 &ma_hdl, &pmr_desc);
4354 if (status == RDMA_SUCCESS) {
4355 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
4356 buf_handle->mrc_lma = (uintptr_t)ma_hdl;
4357 buf_handle->mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
4358 buf_handle->mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
4359 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
4360 #ifdef SERVER_REG_CACHE
4361 if (l) {
/* cache the FMR keys; mr_desc is not filled on this path */
4362 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
4363 l->lrc_mhandle.mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
4364 l->lrc_mhandle.mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
4365 l->registered = TRUE;
4366 l->lrc_mhandle.mrc_lma = (uintptr_t)ma_hdl;
4367 }
4368 #endif
4369 goto ret_stat;
4370
4371 } else {
4372 if (rib_debug > 1)
4373 cmn_err(CE_WARN, "fmr reg failed for buffer %p of length %d\n",
    buf, buflen);
4374 buf_handle->mrc_linfo = NULL;
4375 buf_handle->mrc_lma = NULL;
4376 buf_handle->mrc_lmr = 0;
4377 buf_handle->mrc_rmr = 0;
4378 }
4379 #endif
4380 status = rib_reg_mem(hca, adsp, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl,
4381 &mr_desc);
4382 if (status == RDMA_SUCCESS) {
4383 #ifdef SERVER_REG_CACHE
4384 if (l) {
4385 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
4386 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey;
4387 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey;
4388 l->registered = TRUE;
4389 }
4390 #endif
4391 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
4392 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
4393 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
4394 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
4395 } else {
4396 buf_handle->mrc_linfo = NULL;
4397 buf_handle->mrc_lmr = 0;
4398 buf_handle->mrc_rmr = 0;
4399 }
4400 ret_stat:
4401 return (status);
4402 }
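/*
 * Cache interaction sketch: when a SERVER_REG_CACHE entry (lrc) is
 * supplied and already registered, the routine above simply copies
 * the cached keys out and returns.  Otherwise it registers the whole
 * cached buffer, not just the requested range, and records the
 * handles in the entry so that rib_deregistermemsync() can skip the
 * deregistration and leave the mapping in place for reuse.
 */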
4403
4404 /* ARGSUSED */
4405 rdma_stat
4406 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
4407 {
4409 #ifdef IB_FMR_SUP
4410 ibt_status_t ibt_status;
4411 #endif
4412 rib_hca_t *hca = (ctoqp(conn))->hca;
4413 /*
4414 * Allow memory deregistration even if HCA is
4415 * getting detached. Need all outstanding
4416 * memory registrations to be deregistered
4417 * before HCA_DETACH_EVENT can be accepted.
4418 */
4419 #ifdef IB_FMR_SUP
4420 if (buf_handle.mrc_lma) {
4421 ibt_status = ibt_unmap_mem_area(hca->hca_hdl,
4422 (ibt_ma_hdl_t)buf_handle.mrc_lma);
4423 if (ibt_status != IBT_SUCCESS) {
4424 cmn_err(CE_WARN, "rib_deregistermem: "
    "ibt_unmap_mem_area failed: status %d",
4425 ibt_status);
4426 return (RDMA_FAILED);
4427 }
4428
4429 ibt_status = ibt_deregister_fmr(hca->hca_hdl,
4430 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
4431 if (ibt_status != IBT_SUCCESS)
4432 return (RDMA_FAILED);
4433 return (RDMA_SUCCESS);
4434 }
4435 #endif
4436 (void) ibt_deregister_mr(hca->hca_hdl,
4437 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
4438 return (RDMA_SUCCESS);
4439 }
4440
4441 /* ARGSUSED */
4442 rdma_stat
4443 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
4444 #ifdef SERVER_REG_CACHE
4445 RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
4446 #else
4447 RIB_SYNCMEM_HANDLE sync_handle)
4448 #endif
4449 {
4450 #ifdef SERVER_REG_CACHE
4451 rib_lrc_entry_t *l;
4452 l = (rib_lrc_entry_t *)lrc;
4453 if (l && l->registered)
4454 return (RDMA_SUCCESS);
4456 #endif
4457
4458
4459 (void) rib_deregistermem(conn, buf, buf_handle);
4460
4461 return (RDMA_SUCCESS);
4462 }
4463
4464 /* ARGSUSED */
4465 rdma_stat
4466 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
4467 int len, int cpu)
4468 {
4469 ibt_status_t status;
4470 rib_hca_t *hca = (ctoqp(conn))->hca;
4471 ibt_mr_sync_t mr_segment;
4472
4473 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
4474 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
4475 mr_segment.ms_len = (ib_memlen_t)len;
4476 if (cpu) {
4477 /* make incoming data visible to memory */
4478 mr_segment.ms_flags = IBT_SYNC_WRITE;
4517 }
4518
4519 rib_bufpool_t *
4520 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
4521 {
4522 rib_bufpool_t *rbp = NULL;
4523 bufpool_t *bp = NULL;
4524 caddr_t buf;
4525 ibt_mr_attr_t mem_attr;
4526 ibt_status_t ibt_status;
4527 int i, j;
4528
4529 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
4530
4531 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
4532 num * sizeof (void *), KM_SLEEP);
4533
4534 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
4535 bp->numelems = num;
4536
4537
4538 switch (ptype) {
4539 case SEND_BUFFER:
4540 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
4541 bp->rsize = RPC_MSG_SZ;
4542 break;
4543 case RECV_BUFFER:
4544 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
4545 bp->rsize = RPC_BUF_SIZE;
4546 break;
4547 default:
4548 goto fail;
4549 }
4550
4551 /*
4552 * Register the pool.
4553 */
4554 bp->bufsize = num * bp->rsize;
4555 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
4556 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
4557 sizeof (ibt_mr_hdl_t), KM_SLEEP);
4558 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
4559 sizeof (ibt_mr_desc_t), KM_SLEEP);
4560 rw_enter(&hca->state_lock, RW_READER);
4561 if (hca->state != HCA_INITED) {
4562 rw_exit(&hca->state_lock);
4563 cmn_err(CE_WARN,"hca->state != HCA_INITED");
4564 goto fail;
4565 }
4566 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
4567 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
4568 mem_attr.mr_vaddr = (uintptr_t)buf;
4569 mem_attr.mr_len = (ib_msglen_t)bp->rsize;
4570 mem_attr.mr_as = NULL;
4571 ibt_status = ibt_register_mr(hca->hca_hdl,
4572 hca->pd_hdl, &mem_attr, &rbp->mr_hdl[i],
4573 &rbp->mr_desc[i]);
4574 if (ibt_status != IBT_SUCCESS) {
4575 for (j = 0; j < i; j++) {
4576 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[j]);
4577 }
4578 rw_exit(&hca->state_lock);
4579 goto fail;
4580 }
4581 }
4582 rw_exit(&hca->state_lock);
4583 buf = (caddr_t)bp->buf;
4584 for (i = 0; i < num; i++, buf += bp->rsize) {
4585 bp->buflist[i] = (void *)buf;
4586 }
4587 bp->buffree = num - 1; /* no. of free buffers */
4588 rbp->bpool = bp;
4589
4590 return (rbp);
4591 fail:
4592 if (bp) {
4593 if (bp->buf)
4594 kmem_free(bp->buf, bp->bufsize);
4595 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
4596 }
4597 if (rbp) {
4598 if (rbp->mr_hdl)
4599 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
4600 if (rbp->mr_desc)
4601 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
4602 kmem_free(rbp, sizeof (rib_bufpool_t));
4653 break;
4654 case RECV_BUFFER:
4655 rbp = hca->recv_pool;
4656 break;
4657 default:
4658 return;
4659 }
4660 if (rbp == NULL)
4661 return;
4662
4663 bp = rbp->bpool;
4664
4665 /*
4666 * Free the pool memory.
4667 */
4668 if (rbp->mr_hdl)
4669 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
4670
4671 if (rbp->mr_desc)
4672 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4673 if (bp->buf)
4674 kmem_free(bp->buf, bp->bufsize);
4675 mutex_destroy(&bp->buflock);
4676 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4677 kmem_free(rbp, sizeof (rib_bufpool_t));
4678 }
4679
4680 void
4681 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4682 {
4683 /*
4684 * Deregister the pool memory and free it.
4685 */
4686 rib_rbufpool_deregister(hca, ptype);
4687 rib_rbufpool_free(hca, ptype);
4688 }
4689
4690 /*
4691 * Fetch a buffer from the pool of type specified in rdbuf->type.
4692 */
4694 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4695 {
4696
4697 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4698 if (rdbuf->addr) {
4699 switch (rdbuf->type) {
4700 case SEND_BUFFER:
4701 rdbuf->len = RPC_MSG_SZ; /* 1K */
4702 break;
4703 case RECV_BUFFER:
4704 rdbuf->len = RPC_BUF_SIZE; /* 2K */
4705 break;
4706 default:
4707 rdbuf->len = 0;
4708 }
4709 return (RDMA_SUCCESS);
4710 } else
4711 return (RDMA_FAILED);
4712 }
4713
4714 #if defined(MEASURE_POOL_DEPTH)
4715 static void rib_recv_bufs(uint32_t x) {
4716 return;
4717 }
4718 static void rib_send_bufs(uint32_t x) {
4719 return;
4720 }
4721 #endif
4722
4723 /*
4724 * Fetch a buffer of specified type.
4725 * Note that rdbuf->handle is mw's rkey.
4726 */
4727 static void *
4728 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4729 {
4730 rib_qp_t *qp = ctoqp(conn);
4731 rib_hca_t *hca = qp->hca;
4732 rdma_btype ptype = rdbuf->type;
4733 void *buf;
4734 rib_bufpool_t *rbp = NULL;
4735 bufpool_t *bp;
4736 int i;
4737
4738 /*
4739 * Obtain pool address based on type of pool
4740 */
4741 switch (ptype) {
4752 return (NULL);
4753
4754 bp = rbp->bpool;
4755
4756 mutex_enter(&bp->buflock);
4757 if (bp->buffree < 0) {
4758 cmn_err(CE_WARN, "rib_rbuf_alloc: No free buffers!");
4759 mutex_exit(&bp->buflock);
4760 return (NULL);
4761 }
4762
4763 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4764 buf = bp->buflist[bp->buffree];
4765 rdbuf->addr = buf;
4766 rdbuf->len = bp->rsize;
4767 for (i = bp->numelems - 1; i >= 0; i--) {
4768 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4769 rdbuf->handle.mrc_rmr = (uint32_t)rbp->mr_desc[i].md_rkey;
4770 rdbuf->handle.mrc_linfo = (uintptr_t)rbp->mr_hdl[i];
4771 rdbuf->handle.mrc_lmr = (uint32_t)rbp->mr_desc[i].md_lkey;
4772 #if defined(MEASURE_POOL_DEPTH)
4773 if (ptype == SEND_BUFFER)
4774 rib_send_bufs(MAX_BUFS - (bp->buffree + 1));
4775 if (ptype == RECV_BUFFER)
4776 rib_recv_bufs(MAX_BUFS - (bp->buffree + 1));
4777 #endif
4778 bp->buffree--;
4779 if (rib_debug > 1)
4780 cmn_err(CE_NOTE, "rib_rbuf_alloc: %d free bufs "
4781 "(type %d)\n", bp->buffree+1, ptype);
4782
4783 mutex_exit(&bp->buflock);
4784
4785 return (buf);
4786 }
4787 }
4788 cmn_err(CE_WARN, "rib_rbuf_alloc: NO matching buf %p of "
4789 "type %d found!", buf, ptype);
4790 mutex_exit(&bp->buflock);
4791
4792 return (NULL);
4793 }
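/*
 * The reverse lookup above scans mr_desc[] for the registration that
 * matches the buffer just popped off the free list, i.e. O(numelems)
 * work per allocation.  The XXXX note earlier points at the natural
 * fix: keep buf, rkey and the handles together in one per-buffer
 * record so the free-list pop alone yields all three.
 */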
4794
4795 static void
4796 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4797 {
5609
	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
	kmem_free(hca->svc_scq, sizeof (rib_cq_t));

	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	if (hca->srv_conn_list.conn_hd == NULL &&
	    hca->cl_conn_list.conn_hd == NULL) {
		/*
		 * conn_lists are NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_rbufpool_destroy(hca, RECV_BUFFER);
		rib_rbufpool_destroy(hca, SEND_BUFFER);
#ifdef SERVER_REG_CACHE
		rib_destroy_cache(hca);
#endif
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
		(void) ibt_close_hca(hca->hca_hdl);
		hca->hca_hdl = NULL;
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	rw_exit(&hca->srv_conn_list.conn_lock);

	if (hca->hca_hdl != NULL) {
		mutex_enter(&hca->inuse_lock);
		while (hca->inuse)
			cv_wait(&hca->cb_cv, &hca->inuse_lock);
		mutex_exit(&hca->inuse_lock);
		/*
		 * The conn_lists have drained while we waited; now
		 * destroy buffers, close hca and be done.
		 */
		rib_rbufpool_destroy(hca, RECV_BUFFER);
		rib_rbufpool_destroy(hca, SEND_BUFFER);
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
		(void) ibt_close_hca(hca->hca_hdl);
		hca->hca_hdl = NULL;
	}
}

#ifdef SERVER_REG_CACHE

static void
rib_server_side_cache_reclaim(void *argp)
{
	cache_avl_struct_t *rcas;
	rib_lrc_entry_t *rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	while ((rcas = avl_first(&hca->avl_tree)) != NULL) {
		avl_remove(&hca->avl_tree, rcas);
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rb = rcas->r.forw;
			remque(rb);
			rib_deregistermem_via_hca(hca, rb->lrc_buf,
			    rb->lrc_mhandle);
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		kmem_cache_free(hca->server_side_cache, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}
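
/*
 * For context, a sketch of how the cache would be created with the
 * reclaim callback above (assumed; the actual kmem_cache_create()
 * call and the cache name are outside this excerpt):
 *
 *	hca->server_side_cache = kmem_cache_create(
 *	    "rib_server_side_cache",
 *	    sizeof (cache_avl_struct_t), 0,
 *	    NULL, NULL,
 *	    rib_server_side_cache_reclaim,
 *	    hca, NULL, 0);
 *
 * Under memory pressure the kmem subsystem then invokes the reclaim
 * routine, which frees every cached (idle) registration.
 */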

static int
avl_compare(const void *t1, const void *t2)
{
	const cache_avl_struct_t *c1 = t1;
	const cache_avl_struct_t *c2 = t2;

	if (rib_debug > 1)
		cmn_err(CE_NOTE, "Comparing %d and %d\n", c1->len, c2->len);

	if (c1->len < c2->len)
		return (-1);
	if (c1->len > c2->len)
		return (1);
	return (0);
}
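
/*
 * Sketch of the matching avl_create() call (assumed; it is outside
 * this excerpt, and the avl_node_t field name "avl_link" is
 * hypothetical):
 *
 *	avl_create(&hca->avl_tree, avl_compare,
 *	    sizeof (cache_avl_struct_t),
 *	    offsetof(cache_avl_struct_t, avl_link));
 */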

static void
rib_destroy_cache(rib_hca_t *hca)
{
	hca->avl_init = FALSE;
	/*
	 * Drain all cached entries back to the kmem cache first;
	 * kmem_cache_destroy() requires that no allocations be
	 * outstanding, and avl_destroy() requires an empty tree.
	 */
	rib_server_side_cache_reclaim((void *)hca);
	kmem_cache_destroy(hca->server_side_cache);
	avl_destroy(&hca->avl_tree);
	rw_destroy(&hca->avl_rw_lock);
}

static rib_lrc_entry_t *
rib_get_server_cache_buf(CONN *conn, uint32_t len)
{
	cache_avl_struct_t cas, *rcas;
	rib_hca_t *hca = (ctoqp(conn))->hca;
	rib_lrc_entry_t *reply_buf;
	avl_index_t where = 0;

	if (!hca->avl_init)
		goto error_alloc;

	cas.len = len;
	rw_enter(&hca->avl_rw_lock, RW_READER);
	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
	    &cas, &where)) == NULL) {
		rw_exit(&hca->avl_rw_lock);
		rw_enter(&hca->avl_rw_lock, RW_WRITER);
		/* Recheck to make sure no other thread added the entry in */
		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
		    &cas, &where)) == NULL) {
			/* Allocate an avl tree entry */
			if (rib_debug > 1)
				cmn_err(CE_NOTE, "Allocating an avl entry "
				    "for length %d\n", len);
			rcas = (cache_avl_struct_t *)kmem_cache_alloc(
			    hca->server_side_cache, KM_SLEEP);
			bzero(rcas, sizeof (cache_avl_struct_t));
			rcas->elements = 0;
			rcas->r.forw = &rcas->r;
			rcas->r.back = &rcas->r;
			rcas->len = len;
			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT,
			    NULL);
			avl_insert(&hca->avl_tree, rcas, where);
		}
	}

	if (rcas->elements > 0) {
		mutex_enter(&rcas->node_lock);
		reply_buf = rcas->r.forw;
		remque(reply_buf);
		rcas->elements--;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		if (rib_debug > 1)
			cmn_err(CE_NOTE, "Allocating a pre-alloced buffer "
			    "for length %d\n", len);
	} else {
		rw_exit(&hca->avl_rw_lock);
		rib_total_buffers++;
		if (rib_debug > 1)
			cmn_err(CE_NOTE, "Allocating a new buffer "
			    "for length %d\n", len);
		/* Allocate a reply_buf entry */
		reply_buf = (rib_lrc_entry_t *)kmem_alloc(
		    sizeof (rib_lrc_entry_t), KM_SLEEP);
		bzero(reply_buf, sizeof (rib_lrc_entry_t));
		reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
		reply_buf->lrc_len = len;
		reply_buf->registered = FALSE;
		reply_buf->avl_node = (void *)rcas;
	}

	return (reply_buf);

error_alloc:
	reply_buf = (rib_lrc_entry_t *)kmem_alloc(sizeof (rib_lrc_entry_t),
	    KM_SLEEP);
	bzero(reply_buf, sizeof (rib_lrc_entry_t));
	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
	reply_buf->lrc_len = len;
	reply_buf->registered = FALSE;
	reply_buf->avl_node = NULL;
	return (reply_buf);
}

/*
 * Return a pre-registered buffer back to the cache (without
 * deregistering it).
 */

static void
rib_free_server_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
{
	cache_avl_struct_t cas, *rcas;
	avl_index_t where = 0;
	rib_hca_t *hca = (ctoqp(conn))->hca;

	if (!reg_buf) {
		cmn_err(CE_WARN, "rib_free_server_cache_buf: "
		    "got a NULL reg_buf");
		return;
	}
	if (!hca->avl_init)
		goto error_free;

	cas.len = reg_buf->lrc_len;
	rw_enter(&hca->avl_rw_lock, RW_READER);
	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
	    &cas, &where)) == NULL) {
		rw_exit(&hca->avl_rw_lock);
		goto error_free;
	} else {
		mutex_enter(&rcas->node_lock);
		insque(reg_buf, &rcas->r);
		rcas->elements++;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		if (rib_debug > 1)
			cmn_err(CE_NOTE, "Returning buffer for length %d\n",
			    reg_buf->lrc_len);
	}
	return;

error_free:
	rib_deregistermem_via_hca(hca, reg_buf->lrc_buf, reg_buf->lrc_mhandle);
	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
}
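
/*
 * Illustrative pairing of the two entry points above (assumed
 * server-side caller): a long-reply buffer is checked out by size,
 * used for the transfer, then checked back in still registered so
 * the next reply of the same length can reuse the registration.
 *
 *	rib_lrc_entry_t *lrc;
 *
 *	lrc = rib_get_server_cache_buf(conn, len);
 *	(register lrc->lrc_buf on first use, RDMA the reply)
 *	rib_free_server_cache_buf(conn, lrc);
 */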

#endif	/* SERVER_REG_CACHE */

static rdma_stat
rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
#ifdef IB_FMR_SUP
	ibt_pmr_desc_t	pmr_desc;	/* vaddr, lkey, rkey */
	ibt_ma_hdl_t	ma_hdl = NULL;
#endif
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;

	/*
	 * Note: ALL buffer pools use the same memory type RDMARW.
	 */
	/*
	 * This code will not be activated on the server, so the call to
	 * rib_reg_mem_fmr() could be removed.  It is left in, in case the
	 * FMR bugs get fixed.  The bigger question is whether FMR is needed
	 * at all when the registered buffers come out of a slab cache; this
	 * needs to be evaluated.
	 */
#ifdef IB_FMR_SUP
	status = rib_reg_mem_fmr(hca, buf, adsp, buflen, 0, &mr_hdl, &ma_hdl,
	    &pmr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
		buf_handle->mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
		buf_handle->mrc_lma = (uintptr_t)ma_hdl;
		goto ret_stat;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lma = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
#endif
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
#ifdef IB_FMR_SUP
ret_stat:
#endif
	return (status);
}
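
/*
 * Lifecycle sketch for the two routines above (assumed caller;
 * passing NULL for adsp, i.e. kernel address space, is an
 * assumption):
 *
 *	struct mrc handle;
 *
 *	if (rib_registermem_via_hca(hca, NULL, buf, len, &handle)
 *	    == RDMA_SUCCESS) {
 *		(post work requests using handle.mrc_lmr, the lkey,
 *		and advertise handle.mrc_rmr, the rkey, to the peer)
 *		(void) rib_deregistermem_via_hca(hca, buf, handle);
 *	}
 */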

/* ARGSUSED */
static rdma_stat
rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
{
	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);

	return (RDMA_SUCCESS);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
{
#ifdef IB_FMR_SUP
	ibt_status_t ibt_status;

	if (buf_handle.mrc_lma) {
		ibt_status = ibt_unmap_mem_area(hca->hca_hdl,
		    (ibt_ma_hdl_t)buf_handle.mrc_lma);
		if (ibt_status != IBT_SUCCESS) {
			cmn_err(CE_WARN, "rib_deregistermem_via_hca: "
			    "ibt_unmap_mem_area failed: %d", ibt_status);
			return (RDMA_FAILED);
		}
		ibt_status = ibt_deregister_fmr(hca->hca_hdl,
		    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
		if (ibt_status != IBT_SUCCESS) {
			cmn_err(CE_WARN, "rib_deregistermem_via_hca: "
			    "ibt_deregister_fmr failed: %d", ibt_status);
			return (RDMA_FAILED);
		}
		return (RDMA_SUCCESS);
	}
#endif

	(void) ibt_deregister_mr(hca->hca_hdl,
	    (ibt_mr_hdl_t)buf_handle.mrc_linfo);
	return (RDMA_SUCCESS);
}

#if defined(ASYNC_SERVER_DEREG) || defined(ASYNC_CLIENT_DEREG)
static int
clist_deregister1(CONN *conn, struct clist *cl, bool_t src)
{
	struct clist *c;

	for (c = cl; c; c = c->c_next) {
		if (src) {
			if (c->c_smemhandle.mrc_rmr != 0) {
				(void) RDMA_DEREGMEMSYNC(conn,
				    (caddr_t)(uintptr_t)c->c_saddr,
				    c->c_smemhandle,
#ifdef SERVER_REG_CACHE
				    (void *)(uintptr_t)c->c_ssynchandle,
				    (void *)c->long_reply_buf);
#else
				    (void *)(uintptr_t)c->c_ssynchandle);
#endif
				c->c_smemhandle.mrc_rmr = 0;
				c->c_ssynchandle = NULL;
			}
		} else {
			if (c->c_dmemhandle.mrc_rmr != 0) {
				(void) RDMA_DEREGMEMSYNC(conn,
				    (caddr_t)(uintptr_t)c->c_daddr,
				    c->c_dmemhandle,
#ifdef SERVER_REG_CACHE
				    (void *)(uintptr_t)c->c_dsynchandle,
				    (void *)c->long_reply_buf);
#else
				    (void *)(uintptr_t)c->c_dsynchandle);
#endif
				c->c_dmemhandle.mrc_rmr = 0;
				c->c_dsynchandle = NULL;
			}
		}
	}

	return (RDMA_SUCCESS);
}
#endif


#if defined(ASYNC_CLIENT_DEREG)
/*
 * Worker thread: pull queued clist entries off the global rqueue
 * and deregister their destination buffers outside the caller's
 * context.
 */
static void
async_dereg_thread(caddr_t arg)
{
	ASYNC *r;

	cmn_err(CE_NOTE, "async_dereg_thread initiated\n");
	for (;;) {
		mutex_enter(&at_mutex);
		while ((rqueue.forw == rqueue.back) &&
		    (rqueue.forw == &rqueue))
			cv_wait(&at_cond, &at_mutex);
		r = rqueue.forw;
		remque(rqueue.forw);
		mutex_exit(&at_mutex);
		/* Process the deregistration. */
		(void) clist_deregister1(&r->c_conn, &r->c_clist, FALSE);
		kmem_free(r, sizeof (ASYNC));
	}
}

void
insert_queue(CONN *conn, struct clist *rwc)
{
	ASYNC *r;

	r = kmem_zalloc(sizeof (ASYNC), KM_SLEEP);
	r->c_clist = *rwc;
	r->c_conn = *conn;
	mutex_enter(&at_mutex);
	insque(r, &rqueue);
	cv_broadcast(&at_cond);
	mutex_exit(&at_mutex);
}
#endif
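
/*
 * Sketch of how the deregistration thread would be started (assumed;
 * the actual startup code is outside this excerpt):
 *
 *	(void) thread_create(NULL, 0, async_dereg_thread, NULL, 0,
 *	    &p0, TS_RUN, minclsyspri);
 *
 * insert_queue() is then the producer side: a caller hands off a
 * copy of the clist and connection, and the worker performs the
 * RDMA_DEREGMEMSYNC calls asynchronously.
 */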