Old rpcib.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"@(#)rpcib.c	1.29	06/01/25 SMI"

/*
 * The rpcib plugin. Implements the interface for RDMATF's
 * interaction with IBTF.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/pathname.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/isa_defs.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>

#include <sys/ib/ibtl/ibti.h>
#include <rpc/rpc.h>
#include <rpc/ib.h>

#include <sys/modctl.h>

#include <sys/kstr.h>
#include <sys/sockio.h>
#include <sys/vnode.h>
#include <sys/tiuser.h>
#include <net/if.h>
#include <sys/cred.h>


extern char *inet_ntop(int, const void *, char *, int);


/*
 * Prototype declarations for driver ops
 */

static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
    void *, void **);
static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t);


/* rpcib cb_ops */
static struct cb_ops rpcib_cbops = {
    nulldev,		/* open */
    nulldev,		/* close */
    nodev,		/* strategy */
    nodev,		/* print */
    nodev,		/* dump */
    nodev,		/* read */
    nodev,		/* write */
    nodev,		/* ioctl */
    nodev,		/* devmap */
    nodev,		/* mmap */
    nodev,		/* segmap */
    nochpoll,		/* poll */
    ddi_prop_op,	/* prop_op */
    NULL,		/* stream */
    D_MP,		/* cb_flag */
    CB_REV,		/* rev */
    nodev,		/* int (*cb_aread)() */
    nodev		/* int (*cb_awrite)() */
};

/*
 * Device ops
 */
static struct dev_ops rpcib_ops = {
    DEVO_REV,		/* devo_rev */
    0,			/* refcnt */
    rpcib_getinfo,	/* info */
    nulldev,		/* identify */
    nulldev,		/* probe */
    rpcib_attach,	/* attach */
    rpcib_detach,	/* detach */
    nodev,		/* reset */
    &rpcib_cbops,	/* driver ops - devctl interfaces */
    NULL,		/* bus operations */
    NULL		/* power */
};

/*
 * Module linkage information.
 */

static struct modldrv rib_modldrv = {
    &mod_driverops,			/* Driver module */
    "RPCIB plugin driver, ver 1.29",	/* Driver name and version */
    &rpcib_ops,				/* Driver ops */
};

static struct modlinkage rib_modlinkage = {
    MODREV_1,
    (void *)&rib_modldrv,
    NULL
};

/*
 * rib_stat: private data pointer used when registering
 * with the IBTF. It is returned to the consumer
 * in all callbacks.
 */
static rpcib_state_t *rib_stat = NULL;

#define	RNR_RETRIES	2
#define	MAX_PORTS	2

int preposted_rbufs = 16;
int send_threshold = 1;
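/*
 * Tunable defaults. The name and the n_posted_rbufs accounting in the
 * receive path suggest that preposted_rbufs is the number of receive
 * buffers posted ahead of demand on a queue pair; send_threshold is not
 * referenced in this excerpt. Both are plain module-scope ints, so they
 * should be settable from /etc/system like rib_debug below
 * (e.g. "set rpcib:preposted_rbufs = 32").
 */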

/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests.
 * NO_ACCEPT = not accepting new connections and requests.
 * This should eventually move to the rpcib_state_t structure, since it
 * will tell in which state the plugin is for a particular type of service
 * like NFS, NLM or the v4 callback daemon. The plugin might be in the
 * accept state for one and in the no_accept state for another.
 */
int plugin_state;
kmutex_t plugin_state_lock;


/*
 * RPCIB RDMATF operations
 */
static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
static rdma_stat rib_disconnect(CONN *conn);
static void rib_listen(struct rdma_svc_data *rd);
static void rib_listen_stop(struct rdma_svc_data *rd);
static rdma_stat rib_registermem(CONN *conn, caddr_t buf, uint_t buflen,
    struct mrc *buf_handle);
static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
    struct mrc buf_handle);
static rdma_stat rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen,
    struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle);
static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
    struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle);
static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
    caddr_t buf, int len, int cpu);

static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);

static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);

static void rib_rbuf_free(CONN *conn, int ptype, void *buf);

static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
static rdma_stat rib_conn_release(CONN *conn);
static rdma_stat rib_getinfo(rdma_info_t *info);
static rdma_stat rib_register_ats(rib_hca_t *);
static void rib_deregister_ats(void);
static void rib_stop_services(rib_hca_t *);

/*
 * RPCIB addressing operations
 */
char **get_ip_addrs(int *count);
int get_interfaces(TIUSER *tiptr, int *num);
int find_addrs(TIUSER *tiptr, char **addrs, int num_ifs);
int get_ibd_ipaddr(rpcib_ibd_insts_t *);
rpcib_ats_t *get_ibd_entry(ib_gid_t *, ib_pkey_t, rpcib_ibd_insts_t *);
void rib_get_ibd_insts(rpcib_ibd_insts_t *);


/*
 * RDMA operations the RPCIB module exports
 */
static rdmaops_t rib_ops = {
    rib_reachable,
    rib_conn_get,
    rib_conn_release,
    rib_listen,
    rib_listen_stop,
    rib_registermem,
    rib_deregistermem,
    rib_registermemsync,
    rib_deregistermemsync,
    rib_syncmem,
    rib_reg_buf_alloc,
    rib_reg_buf_free,
    rib_send,
    rib_send_resp,
    rib_post_resp,
    rib_post_recv,
    rib_recv,
    rib_read,
    rib_write,
    rib_getinfo
};

/*
 * RDMATF RPCIB plugin details
 */
static rdma_mod_t rib_mod = {
    "ibtf",		/* api name */
    RDMATF_VERS_1,
    0,
    &rib_ops,		/* rdma op vector for ibtf */
};
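/*
 * Note that rib_mod.rdma_count is left at 0 here; rpcib_attach() fills
 * it in with the HCA count just before handing rib_mod to
 * rdma_register_mod().
 */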

static rdma_stat open_hcas(rpcib_state_t *);
static rdma_stat rib_qp_init(rib_qp_t *, int);
static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
    ibt_mr_hdl_t *, ibt_mr_desc_t *);
static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *);
static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
    rib_qp_t **);
static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
    rib_qp_t **);
static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
static int rib_free_sendwait(struct send_wid *);
static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
static void rdma_done_rem_list(rib_qp_t *);
static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);

static void rib_async_handler(void *,
    ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
static int rib_free_svc_recv(struct svc_recv *);
static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
static void rib_free_wid(struct recv_wid *);
static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
static void rib_detach_hca(rib_hca_t *);
static rdma_stat rib_chk_srv_ats(rib_hca_t *, struct netbuf *, int,
    ibt_path_info_t *);

/*
 * Registration with IBTF as a consumer
 */
static struct ibt_clnt_modinfo_s rib_modinfo = {
    IBTI_V2,
    IBT_GENERIC,
    rib_async_handler,	/* async event handler */
    NULL,		/* Memory Region Handler */
    "nfs/ib"
};

/*
 * Global structure
 */

typedef struct rpcib_s {
    dev_info_t	*rpcib_dip;
    kmutex_t	rpcib_mutex;
} rpcib_t;

rpcib_t rpcib;
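/*
 * rpcib above is the single per-module instance: rpcib_dip records the
 * attached devinfo node and is guarded by rpcib_mutex in the attach,
 * detach and getinfo entry points.
 */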

/*
 * /etc/system controlled variable to control
 * debugging in the rpcib kernel module.
 * Set it to a value greater than 1 for more
 * verbose debug messages.
 */
int rib_debug = 0;

static int ats_running = 0;

int
_init(void)
{
    int error;

    error = mod_install((struct modlinkage *)&rib_modlinkage);
    if (error != 0) {
        /*
         * Could not load module
         */
        return (error);
    }
    mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);

    return (0);
}
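/*
 * Note the asymmetry with _init(): registration with the RDMATF
 * framework (rdma_register_mod) is done later, in rpcib_attach(), but
 * the matching rdma_unregister_mod() happens in _fini() below. If krpc
 * still has the plugin in use, the unregister fails and _fini() returns
 * EBUSY, so the module stays loaded.
 */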

int
_fini(void)
{
    int status;

    if ((status = rdma_unregister_mod(&rib_mod)) != RDMA_SUCCESS) {
        return (EBUSY);
    }

    rib_deregister_ats();

    /*
     * Remove module
     */
    if ((status = mod_remove(&rib_modlinkage)) != 0) {
        (void) rdma_register_mod(&rib_mod);
        return (status);
    }
    mutex_destroy(&plugin_state_lock);
    return (0);
}

int
_info(struct modinfo *modinfop)
{
    return (mod_info(&rib_modlinkage, modinfop));
}


/*
 * rpcib_getinfo()
 * Given the device number, return the devinfo pointer or the
 * instance number.
 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
 */

/*ARGSUSED*/
static int
rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
    int ret = DDI_SUCCESS;

    switch (cmd) {
    case DDI_INFO_DEVT2DEVINFO:
        if (rpcib.rpcib_dip != NULL)
            *result = rpcib.rpcib_dip;
        else {
            *result = NULL;
            ret = DDI_FAILURE;
        }
        break;

    case DDI_INFO_DEVT2INSTANCE:
        *result = NULL;
        break;

    default:
        ret = DDI_FAILURE;
    }
    return (ret);
}

static int
rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
    ibt_status_t ibt_status;
    rdma_stat r_status;

    switch (cmd) {
    case DDI_ATTACH:
        break;
    case DDI_RESUME:
        return (DDI_SUCCESS);
    default:
        return (DDI_FAILURE);
    }

    mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);

    mutex_enter(&rpcib.rpcib_mutex);
    if (rpcib.rpcib_dip != NULL) {
        mutex_exit(&rpcib.rpcib_mutex);
        return (DDI_FAILURE);
    }
    rpcib.rpcib_dip = dip;
    mutex_exit(&rpcib.rpcib_mutex);
    /*
     * Create the "rpcib" minor-node.
     */
    if (ddi_create_minor_node(dip,
        "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
        /* No cmn_err here; those messages would print on the console. */
        return (DDI_FAILURE);
    }

    if (rib_stat == NULL) {
        rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
        mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
    }

    rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
    if (rib_stat->hca_count < 1) {
        mutex_destroy(&rib_stat->open_hca_lock);
        kmem_free(rib_stat, sizeof (*rib_stat));
        rib_stat = NULL;
        return (DDI_FAILURE);
    }

    ibt_status = ibt_attach(&rib_modinfo, dip,
        (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
    if (ibt_status != IBT_SUCCESS) {
        ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
        mutex_destroy(&rib_stat->open_hca_lock);
        kmem_free(rib_stat, sizeof (*rib_stat));
        rib_stat = NULL;
        return (DDI_FAILURE);
    }

    mutex_enter(&rib_stat->open_hca_lock);
    if (open_hcas(rib_stat) != RDMA_SUCCESS) {
        ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
        (void) ibt_detach(rib_stat->ibt_clnt_hdl);
        mutex_exit(&rib_stat->open_hca_lock);
        mutex_destroy(&rib_stat->open_hca_lock);
        kmem_free(rib_stat, sizeof (*rib_stat));
        rib_stat = NULL;
        return (DDI_FAILURE);
    }
    mutex_exit(&rib_stat->open_hca_lock);

    /*
     * Register with rdmatf
     */
    rib_mod.rdma_count = rib_stat->hca_count;
    r_status = rdma_register_mod(&rib_mod);
    if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
        rib_detach_hca(rib_stat->hca);
        ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
        (void) ibt_detach(rib_stat->ibt_clnt_hdl);
        mutex_destroy(&rib_stat->open_hca_lock);
        kmem_free(rib_stat, sizeof (*rib_stat));
        rib_stat = NULL;
        return (DDI_FAILURE);
    }


    return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
    switch (cmd) {

    case DDI_DETACH:
        break;

    case DDI_SUSPEND:
    default:
        return (DDI_FAILURE);
    }

    /*
     * Detach the hca and free resources
     */
    mutex_enter(&plugin_state_lock);
    plugin_state = NO_ACCEPT;
    mutex_exit(&plugin_state_lock);
    rib_detach_hca(rib_stat->hca);
    ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
    (void) ibt_detach(rib_stat->ibt_clnt_hdl);

    mutex_enter(&rpcib.rpcib_mutex);
    rpcib.rpcib_dip = NULL;
    mutex_exit(&rpcib.rpcib_mutex);

    mutex_destroy(&rpcib.rpcib_mutex);
    return (DDI_SUCCESS);
}


static void
rib_deregister_ats(void)
{
    rib_hca_t *hca;
    rib_service_t *srv_list, *to_remove;
    ibt_status_t ibt_status;

    /*
     * Deregister the Address Translation Service.
     */
    hca = rib_stat->hca;
    rw_enter(&hca->service_list_lock, RW_WRITER);
    srv_list = hca->ats_list;
    while (srv_list != NULL) {
        to_remove = srv_list;
        srv_list = to_remove->srv_next;

        ibt_status = ibt_deregister_ar(hca->ibt_clnt_hdl,
            &to_remove->srv_ar);
        if (ibt_status != IBT_SUCCESS) {
#ifdef DEBUG
            if (rib_debug) {
                cmn_err(CE_WARN, "_fini: "
                    "ibt_deregister_ar FAILED"
                    " status: %d", ibt_status);
            }
#endif
        } else {
            mutex_enter(&rib_stat->open_hca_lock);
            ats_running = 0;
            mutex_exit(&rib_stat->open_hca_lock);
#ifdef DEBUG
            if (rib_debug) {
                cmn_err(CE_NOTE, "_fini: "
                    "Successfully unregistered"
                    " ATS service: %s",
                    to_remove->srv_name);
            }
#endif
        }
        kmem_free(to_remove, sizeof (rib_service_t));
    }
    hca->ats_list = NULL;
    rw_exit(&hca->service_list_lock);
}

static void rib_rbufpool_free(rib_hca_t *, int);
static void rib_rbufpool_deregister(rib_hca_t *, int);
static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
static rdma_stat rib_rem_replylist(rib_qp_t *);
static int rib_remreply(rib_qp_t *, struct reply *);
static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);

/*
 * One CQ pair per HCA
 */
static rdma_stat
rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
    rib_cq_t **cqp, rpcib_state_t *ribstat)
{
    rib_cq_t *cq;
    ibt_cq_attr_t cq_attr;
    uint32_t real_size;
    ibt_status_t status;
    rdma_stat error = RDMA_SUCCESS;

    cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
    cq->rib_hca = hca;
    cq_attr.cq_size = cq_size;
    cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
    status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
        &real_size);
    if (status != IBT_SUCCESS) {
        cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
            " status=%d", status);
        error = RDMA_FAILED;
        goto fail;
    }
    ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);

    /*
     * Enable CQ callbacks. CQ callbacks are single shot
     * (i.e. ibt_enable_cq_notify() must be called again
     * after each callback to arm the next one).
     */
    status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
    if (status != IBT_SUCCESS) {
        cmn_err(CE_WARN, "rib_create_cq: "
            "enable_cq_notify failed, status %d", status);
        error = RDMA_FAILED;
        goto fail;
    }
    *cqp = cq;

    return (error);
fail:
    if (cq->rib_cq_hdl)
        (void) ibt_free_cq(cq->rib_cq_hdl);
    if (cq)
        kmem_free(cq, sizeof (rib_cq_t));
    return (error);
}

static rdma_stat
open_hcas(rpcib_state_t *ribstat)
{
    rib_hca_t *hca;
    ibt_status_t ibt_status;
    rdma_stat status;
    ibt_hca_portinfo_t *pinfop;
    ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS;
    uint_t size, cq_size;
    int i;

    ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
    if (ribstat->hcas == NULL)
        ribstat->hcas = kmem_zalloc(ribstat->hca_count *
            sizeof (rib_hca_t), KM_SLEEP);

    /*
     * Open an HCA and set it up for RDMA
     */
    for (i = 0; i < ribstat->hca_count; i++) {
        ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
            ribstat->hca_guids[i],
            &ribstat->hcas[i].hca_hdl);
        if (ibt_status != IBT_SUCCESS) {
            cmn_err(CE_WARN, "open_hcas: ibt_open_hca (%d) "
                "returned %d", i, ibt_status);
            continue;
        }
        ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
        hca = &(ribstat->hcas[i]);
        hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
        hca->state = HCA_INITED;

        /*
         * query HCA info
         */
        ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
        if (ibt_status != IBT_SUCCESS) {
            cmn_err(CE_WARN, "open_hcas: ibt_query_hca "
                "returned %d (hca_guid 0x%llx)",
                ibt_status, (longlong_t)ribstat->hca_guids[i]);
            goto fail1;
        }

        /*
         * One PD (Protection Domain) per HCA.
         * A qp is allowed to access a memory region
         * only when both are in the same PD.
         */
        ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
        if (ibt_status != IBT_SUCCESS) {
            cmn_err(CE_WARN, "open_hcas: ibt_alloc_pd "
                "returned %d (hca_guid 0x%llx)",
                ibt_status, (longlong_t)ribstat->hca_guids[i]);
            goto fail1;
        }

        /*
         * query HCA ports
         */
        ibt_status = ibt_query_hca_ports(hca->hca_hdl,
            0, &pinfop, &hca->hca_nports, &size);
        if (ibt_status != IBT_SUCCESS) {
            cmn_err(CE_WARN, "open_hcas: "
                "ibt_query_hca_ports returned %d "
                "(hca_guid 0x%llx)",
                ibt_status, (longlong_t)hca->hca_guid);
            goto fail2;
        }
        hca->hca_ports = pinfop;
        hca->hca_pinfosz = size;
        pinfop = NULL;

        cq_size = DEF_CQ_SIZE;	/* default cq size */
        /*
         * Create two pairs of CQs (one pair for the client
         * and the other pair for the server) on this HCA.
         * If the number of QPs grows too large, more CQs
         * will be needed.
         */
        status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
            &hca->svc_rcq, ribstat);
        if (status != RDMA_SUCCESS) {
            goto fail3;
        }

        status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
            &hca->svc_scq, ribstat);
        if (status != RDMA_SUCCESS) {
            goto fail3;
        }

        status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
            &hca->clnt_rcq, ribstat);
        if (status != RDMA_SUCCESS) {
            goto fail3;
        }

        status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
            &hca->clnt_scq, ribstat);
        if (status != RDMA_SUCCESS) {
            goto fail3;
        }

        /*
         * Create buffer pools.
         * Note that rib_rbufpool_create() also allocates
         * memory windows.
         */
        hca->recv_pool = rib_rbufpool_create(hca,
            RECV_BUFFER, MAX_BUFS);
        if (hca->recv_pool == NULL) {
            cmn_err(CE_WARN, "open_hcas: recv buf pool failed\n");
            goto fail3;
        }

        hca->send_pool = rib_rbufpool_create(hca,
            SEND_BUFFER, MAX_BUFS);
        if (hca->send_pool == NULL) {
            cmn_err(CE_WARN, "open_hcas: send buf pool failed\n");
            rib_rbufpool_destroy(hca, RECV_BUFFER);
            goto fail3;
        }

        /*
         * Initialize the registered service list and
         * the lock
         */
        hca->service_list = NULL;
        rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);

        mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
        cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
        rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
            hca->iblock);
        rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
            hca->iblock);
        rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
        mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
        hca->inuse = TRUE;
        /*
         * XXX One hca only. Add multi-hca functionality if needed
         * later.
         */
        ribstat->hca = hca;
        ribstat->nhca_inited++;
        ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
        break;

fail3:
        ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
fail2:
        (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
fail1:
        (void) ibt_close_hca(hca->hca_hdl);

    }
    if (ribstat->hca != NULL)
        return (RDMA_SUCCESS);
    else
        return (RDMA_FAILED);
}

/*
 * Callback routines
 */
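/*
 * All four completion handlers below share the same structure: re-arm
 * the CQ notification first (callbacks are single shot, see
 * rib_create_cq() above), then drain the CQ with ibt_poll_cq() until it
 * reports IBT_CQ_EMPTY. Re-arming before polling closes the window
 * where a completion could slip in between the last poll and the next
 * arm and never generate a callback.
 */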

/*
 * SCQ handlers
 */
/* ARGSUSED */
static void
rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
    ibt_status_t ibt_status;
    ibt_wc_t wc;
    int i;

    /*
     * Re-enable cq notify here to avoid missing any
     * completion queue notification.
     */
    (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

    ibt_status = IBT_SUCCESS;
    while (ibt_status != IBT_CQ_EMPTY) {
        bzero(&wc, sizeof (wc));
        ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
        if (ibt_status != IBT_SUCCESS)
            return;

        /*
         * Got a send completion
         */
        if (wc.wc_id != 0) {	/* XXX can it be otherwise? */
            struct send_wid *wd =
                (struct send_wid *)(uintptr_t)wc.wc_id;
            CONN *conn = qptoc(wd->qp);

            mutex_enter(&wd->sendwait_lock);
            switch (wc.wc_status) {
            case IBT_WC_SUCCESS:
                wd->status = RDMA_SUCCESS;
                break;
            case IBT_WC_WR_FLUSHED_ERR:
                wd->status = RDMA_FAILED;
                break;
            default:
                /*
                 * RC Send Q Error Code		Local state	Remote State
                 * ====================		===========	============
                 * IBT_WC_BAD_RESPONSE_ERR	ERROR		None
                 * IBT_WC_LOCAL_LEN_ERR		ERROR		None
                 * IBT_WC_LOCAL_CHAN_OP_ERR	ERROR		None
                 * IBT_WC_LOCAL_PROTECT_ERR	ERROR		None
                 * IBT_WC_MEM_WIN_BIND_ERR	ERROR		None
                 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR		ERROR
                 * IBT_WC_REMOTE_ACCESS_ERR	ERROR		ERROR
                 * IBT_WC_REMOTE_OP_ERR		ERROR		ERROR
                 * IBT_WC_RNR_NAK_TIMEOUT_ERR	ERROR		None
                 * IBT_WC_TRANS_TIMEOUT_ERR	ERROR		None
                 * IBT_WC_WR_FLUSHED_ERR	None		None
                 */
#ifdef DEBUG
                if (rib_debug > 1) {
                    if (wc.wc_status != IBT_WC_SUCCESS) {
                        cmn_err(CE_NOTE, "rib_clnt_scq_handler: "
                            "WR completed in error, wc.wc_status:%d, "
                            "wc_id:%llx\n", wc.wc_status,
                            (longlong_t)wc.wc_id);
                    }
                }
#endif
                /*
                 * Channel in error state. Set connection to
                 * ERROR and cleanup will happen either from
                 * conn_release or from rib_conn_get
                 */
                wd->status = RDMA_FAILED;
                mutex_enter(&conn->c_lock);
                if (conn->c_state != C_DISCONN_PEND)
                    conn->c_state = C_ERROR;
                mutex_exit(&conn->c_lock);
                break;
            }
            if (wd->cv_sig == 1) {
                /*
                 * Notify poster
                 */
                cv_signal(&wd->wait_cv);
                mutex_exit(&wd->sendwait_lock);
            } else {
                /*
                 * Poster not waiting for notification.
                 * Free the send buffers and send_wid
                 */
                for (i = 0; i < wd->nsbufs; i++) {
                    rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
                        (void *)(uintptr_t)wd->sbufaddr[i]);
                }
                mutex_exit(&wd->sendwait_lock);
                (void) rib_free_sendwait(wd);
            }
        }
    }
}

/* ARGSUSED */
static void
rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
    ibt_status_t ibt_status;
    ibt_wc_t wc;
    int i;

    /*
     * Re-enable cq notify here to avoid missing any
     * completion queue notification.
     */
    (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

    ibt_status = IBT_SUCCESS;
    while (ibt_status != IBT_CQ_EMPTY) {
        bzero(&wc, sizeof (wc));
        ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
        if (ibt_status != IBT_SUCCESS)
            return;

        /*
         * Got a send completion
         */
#ifdef DEBUG
        if (rib_debug > 1 && wc.wc_status != IBT_WC_SUCCESS) {
            cmn_err(CE_NOTE, "rib_svc_scq_handler: "
                "WR completed in error "
                "wc.wc_status:%d, wc_id:%llX",
                wc.wc_status, (longlong_t)wc.wc_id);
        }
#endif
        if (wc.wc_id != 0) {	/* XXX zero possible? */
            struct send_wid *wd =
                (struct send_wid *)(uintptr_t)wc.wc_id;

            mutex_enter(&wd->sendwait_lock);
            if (wd->cv_sig == 1) {
                /*
                 * Update completion status and notify poster
                 */
                if (wc.wc_status == IBT_WC_SUCCESS)
                    wd->status = RDMA_SUCCESS;
                else
                    wd->status = RDMA_FAILED;
                cv_signal(&wd->wait_cv);
                mutex_exit(&wd->sendwait_lock);
            } else {
                /*
                 * Poster not waiting for notification.
                 * Free the send buffers and send_wid
                 */
                for (i = 0; i < wd->nsbufs; i++) {
                    rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
                        (void *)(uintptr_t)wd->sbufaddr[i]);
                }
                mutex_exit(&wd->sendwait_lock);
                (void) rib_free_sendwait(wd);
            }
        }
    }
}

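/*
 * Both receive-side handlers below decode the start of an inbound
 * RPC/RDMA message the same way. The prefix layout assumed by that
 * decode (xid read as a raw opaque word, then two xdr_u_int calls) is:
 *
 *	uint32 xid	transaction id, matched against the reply list
 *	uint32 vers	must equal RPCRDMA_VERS
 *	uint32 op	RDMA_MSG, RDMA_NOMSG, RDMA_MSGP, RDMA_DONE, ...
 */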
/*
 * RCQ handler
 */
/* ARGSUSED */
static void
rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
    rib_qp_t *qp;
    ibt_status_t ibt_status;
    ibt_wc_t wc;
    struct recv_wid *rwid;

    /*
     * Re-enable cq notify here to avoid missing any
     * completion queue notification.
     */
    (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

    ibt_status = IBT_SUCCESS;
    while (ibt_status != IBT_CQ_EMPTY) {
        bzero(&wc, sizeof (wc));
        ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
        if (ibt_status != IBT_SUCCESS)
            return;

        rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
        qp = rwid->qp;
        if (wc.wc_status == IBT_WC_SUCCESS) {
            XDR inxdrs, *xdrs;
            uint_t xid, vers, op, find_xid = 0;
            struct reply *r;
            CONN *conn = qptoc(qp);

            xdrs = &inxdrs;
            xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
                wc.wc_bytes_xfer, XDR_DECODE);
            /*
             * Treat xid as opaque (xid is the first field
             * in the rpc rdma message).
             */
            xid = *(uint32_t *)(uintptr_t)rwid->addr;
            /* Skip xid and set the xdr position accordingly. */
            XDR_SETPOS(xdrs, sizeof (uint32_t));
            (void) xdr_u_int(xdrs, &vers);
            (void) xdr_u_int(xdrs, &op);
            XDR_DESTROY(xdrs);
            if (vers != RPCRDMA_VERS) {
                /*
                 * Invalid RPC/RDMA version. Cannot interoperate.
                 * Set connection to ERROR state and bail out.
                 */
                mutex_enter(&conn->c_lock);
                if (conn->c_state != C_DISCONN_PEND)
                    conn->c_state = C_ERROR;
                mutex_exit(&conn->c_lock);
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)rwid->addr);
                rib_free_wid(rwid);
                continue;
            }

            mutex_enter(&qp->replylist_lock);
            for (r = qp->replylist; r != NULL; r = r->next) {
                if (r->xid == xid) {
                    find_xid = 1;
                    switch (op) {
                    case RDMA_MSG:
                    case RDMA_NOMSG:
                    case RDMA_MSGP:
                        r->status = RDMA_SUCCESS;
                        r->vaddr_cq = rwid->addr;
                        r->bytes_xfer = wc.wc_bytes_xfer;
                        cv_signal(&r->wait_cv);
                        break;
                    default:
                        rib_rbuf_free(qptoc(qp), RECV_BUFFER,
                            (void *)(uintptr_t)rwid->addr);
                        break;
                    }
                    break;
                }
            }
            mutex_exit(&qp->replylist_lock);
            if (find_xid == 0) {
                /* RPC caller not waiting for reply */
#ifdef DEBUG
                if (rib_debug) {
                    cmn_err(CE_NOTE, "rib_clnt_rcq_handler: "
                        "NO matching xid %u!\n", xid);
                }
#endif
                rib_rbuf_free(qptoc(qp), RECV_BUFFER,
                    (void *)(uintptr_t)rwid->addr);
            }
        } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
            CONN *conn = qptoc(qp);

            /*
             * Connection being flushed. Just free
             * the posted buffer
             */
            rib_rbuf_free(conn, RECV_BUFFER,
                (void *)(uintptr_t)rwid->addr);
        } else {
            CONN *conn = qptoc(qp);
            /*
             * RC Recv Q Error Code		Local state	Remote State
             * ====================		===========	============
             * IBT_WC_LOCAL_ACCESS_ERR		ERROR		ERROR when NAK recvd
             * IBT_WC_LOCAL_LEN_ERR		ERROR		ERROR when NAK recvd
             * IBT_WC_LOCAL_PROTECT_ERR		ERROR		ERROR when NAK recvd
             * IBT_WC_LOCAL_CHAN_OP_ERR		ERROR		ERROR when NAK recvd
             * IBT_WC_REMOTE_INVALID_REQ_ERR	ERROR		ERROR when NAK recvd
             * IBT_WC_WR_FLUSHED_ERR		None		None
             */
            /*
             * Channel in error state. Set connection
             * in ERROR state.
             */
            mutex_enter(&conn->c_lock);
            if (conn->c_state != C_DISCONN_PEND)
                conn->c_state = C_ERROR;
            mutex_exit(&conn->c_lock);
            rib_rbuf_free(conn, RECV_BUFFER,
                (void *)(uintptr_t)rwid->addr);
        }
        rib_free_wid(rwid);
    }
}

/* Server side */
/* ARGSUSED */
static void
rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
    struct recv_data *rd;
    rib_qp_t *qp;
    ibt_status_t ibt_status;
    ibt_wc_t wc;
    struct svc_recv *s_recvp;
    CONN *conn;
    mblk_t *mp;

    /*
     * Re-enable cq notify here to avoid missing any
     * completion queue notification.
     */
    (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

    ibt_status = IBT_SUCCESS;
    while (ibt_status != IBT_CQ_EMPTY) {
        bzero(&wc, sizeof (wc));
        ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
        if (ibt_status != IBT_SUCCESS)
            return;

        s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
        qp = s_recvp->qp;
        conn = qptoc(qp);
        mutex_enter(&qp->posted_rbufs_lock);
        qp->n_posted_rbufs--;
        if (qp->n_posted_rbufs == 0)
            cv_signal(&qp->posted_rbufs_cv);
        mutex_exit(&qp->posted_rbufs_lock);

        if (wc.wc_status == IBT_WC_SUCCESS) {
            XDR inxdrs, *xdrs;
            uint_t xid, vers, op;

            xdrs = &inxdrs;
            /* s_recvp->vaddr stores data */
            xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
                wc.wc_bytes_xfer, XDR_DECODE);

            /*
             * Treat xid as opaque (xid is the first field
             * in the rpc rdma message).
             */
            xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
            /* Skip xid and set the xdr position accordingly. */
            XDR_SETPOS(xdrs, sizeof (uint32_t));
            if (!xdr_u_int(xdrs, &vers) ||
                !xdr_u_int(xdrs, &op)) {
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)s_recvp->vaddr);
                XDR_DESTROY(xdrs);
#ifdef DEBUG
                cmn_err(CE_NOTE, "rib_svc_rcq_handler: "
                    "xdr_u_int failed for qp %p, wc_id=%llx",
                    (void *)qp, (longlong_t)wc.wc_id);
#endif
                (void) rib_free_svc_recv(s_recvp);
                continue;
            }
            XDR_DESTROY(xdrs);

            if (vers != RPCRDMA_VERS) {
                /*
                 * Invalid RPC/RDMA version.
                 * Drop the rpc rdma message.
                 */
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)s_recvp->vaddr);
                (void) rib_free_svc_recv(s_recvp);
                continue;
            }
            /*
             * Is this for RDMA_DONE?
             */
            if (op == RDMA_DONE) {
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)s_recvp->vaddr);
                /*
                 * Wake up the thread waiting on
                 * a RDMA_DONE for xid
                 */
                mutex_enter(&qp->rdlist_lock);
                rdma_done_notify(qp, xid);
                mutex_exit(&qp->rdlist_lock);
                (void) rib_free_svc_recv(s_recvp);
                continue;
            }

            mutex_enter(&plugin_state_lock);
            if (plugin_state == ACCEPT) {
                while ((mp = allocb(sizeof (*rd), BPRI_LO)) == NULL)
                    (void) strwaitbuf(sizeof (*rd), BPRI_LO);
                /*
                 * The plugin is in the accept state, so the
                 * master transport queue is still accepting
                 * requests; queue this received message with
                 * svc_queuereq().
                 */
                rd = (struct recv_data *)mp->b_rptr;
                rd->conn = conn;
                rd->rpcmsg.addr = (caddr_t)(uintptr_t)s_recvp->vaddr;
                rd->rpcmsg.type = RECV_BUFFER;
                rd->rpcmsg.len = wc.wc_bytes_xfer;
                rd->status = wc.wc_status;
                mutex_enter(&conn->c_lock);
                conn->c_ref++;
                mutex_exit(&conn->c_lock);
                mp->b_wptr += sizeof (*rd);
                svc_queuereq((queue_t *)rib_stat->q, mp);
                mutex_exit(&plugin_state_lock);
            } else {
                /*
                 * The master transport is going away and the
                 * queue is not accepting any more requests for
                 * krpc, so just free the message.
                 */
                mutex_exit(&plugin_state_lock);
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)s_recvp->vaddr);
            }
        } else {
            rib_rbuf_free(conn, RECV_BUFFER,
                (void *)(uintptr_t)s_recvp->vaddr);
        }
        (void) rib_free_svc_recv(s_recvp);
    }
}

/*
 * Handles DR event of IBT_HCA_DETACH_EVENT.
 */
/* ARGSUSED */
static void
rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
    ibt_async_code_t code, ibt_async_event_t *event)
{

    switch (code) {
    case IBT_HCA_ATTACH_EVENT:
        /* ignore */
        break;
    case IBT_HCA_DETACH_EVENT:
    {
        ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
        rib_detach_hca(rib_stat->hca);
#ifdef DEBUG
        cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
#endif
        break;
    }
#ifdef DEBUG
    case IBT_EVENT_PATH_MIGRATED:
        cmn_err(CE_NOTE,
            "rib_async_handler(): IBT_EVENT_PATH_MIGRATED\n");
        break;
    case IBT_EVENT_SQD:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
        break;
    case IBT_EVENT_COM_EST:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
        break;
    case IBT_ERROR_CATASTROPHIC_CHAN:
        cmn_err(CE_NOTE,
            "rib_async_handler(): IBT_ERROR_CATASTROPHIC_CHAN\n");
        break;
    case IBT_ERROR_INVALID_REQUEST_CHAN:
        cmn_err(CE_NOTE, "rib_async_handler(): "
            "IBT_ERROR_INVALID_REQUEST_CHAN\n");
        break;
    case IBT_ERROR_ACCESS_VIOLATION_CHAN:
        cmn_err(CE_NOTE, "rib_async_handler(): "
            "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
        break;
    case IBT_ERROR_PATH_MIGRATE_REQ:
        cmn_err(CE_NOTE,
            "rib_async_handler(): IBT_ERROR_PATH_MIGRATE_REQ\n");
        break;
    case IBT_ERROR_CQ:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
        break;
    case IBT_ERROR_PORT_DOWN:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
        break;
    case IBT_EVENT_PORT_UP:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
        break;
    case IBT_ASYNC_OPAQUE1:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
        break;
    case IBT_ASYNC_OPAQUE2:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
        break;
    case IBT_ASYNC_OPAQUE3:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
        break;
    case IBT_ASYNC_OPAQUE4:
        cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
        break;
#endif
    default:
        break;
    }
}

/*
 * Client's reachable function.
 */
static rdma_stat
rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
{
    rib_hca_t *hca;
    rdma_stat status;

    /*
     * First check if the HCA is still attached
     */
    *handle = NULL;
    rw_enter(&rib_stat->hca->state_lock, RW_READER);
    if (rib_stat->hca->state != HCA_INITED) {
        rw_exit(&rib_stat->hca->state_lock);
        return (RDMA_FAILED);
    }
    status = rib_ping_srv(addr_type, raddr, &hca);
    rw_exit(&rib_stat->hca->state_lock);

    if (status == RDMA_SUCCESS) {
        *handle = (void *)hca;
        /*
         * Register the Address Translation Service.
         */
        mutex_enter(&rib_stat->open_hca_lock);
        if (ats_running == 0) {
            if (rib_register_ats(rib_stat->hca)
                == RDMA_SUCCESS) {
                ats_running = 1;
                mutex_exit(&rib_stat->open_hca_lock);
                return (RDMA_SUCCESS);
            } else {
                mutex_exit(&rib_stat->open_hca_lock);
                return (RDMA_FAILED);
            }
        } else {
            mutex_exit(&rib_stat->open_hca_lock);
            return (RDMA_SUCCESS);
        }
    } else {
        *handle = NULL;
        if (rib_debug > 2)
            cmn_err(CE_WARN, "rib_reachable(): ping_srv failed.\n");
        return (RDMA_FAILED);
    }
}

/* Client side qp creation */
static rdma_stat
rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
{
    rib_qp_t *kqp = NULL;
    CONN *conn;

    ASSERT(qp != NULL);
    *qp = NULL;

    kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
    conn = qptoc(kqp);
    kqp->hca = hca;
    kqp->rdmaconn.c_rdmamod = &rib_mod;
    kqp->rdmaconn.c_private = (caddr_t)kqp;

    kqp->mode = RIB_CLIENT;
    kqp->chan_flags = IBT_BLOCKING;
    conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
    bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
    conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;

    /*
     * Initialize
     */
    cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
    cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
    mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
    mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
    mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
    mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
    cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
    mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);

    *qp = kqp;
    return (RDMA_SUCCESS);
}
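/*
 * Note that rib_clnt_create_chan() above does not allocate an RC
 * channel; for a client qp the channel is created later, in
 * rib_conn_to_srv(), once a path to the server is known. The server
 * side below allocates its channel immediately.
 */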

/* Server side qp creation */
static rdma_stat
rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
{
    rib_qp_t *kqp = NULL;
    ibt_chan_sizes_t chan_sizes;
    ibt_rc_chan_alloc_args_t qp_attr;
    ibt_status_t ibt_status;

    ASSERT(qp != NULL);
    *qp = NULL;

    kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
    kqp->hca = hca;
    kqp->port_num = port;
    kqp->rdmaconn.c_rdmamod = &rib_mod;
    kqp->rdmaconn.c_private = (caddr_t)kqp;

    /*
     * Create the qp handle
     */
    bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
    qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
    qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
    qp_attr.rc_pd = hca->pd_hdl;
    qp_attr.rc_hca_port_num = port;
    qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
    qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
    qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
    qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
    qp_attr.rc_clone_chan = NULL;
    qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
    qp_attr.rc_flags = IBT_WR_SIGNALED;

    rw_enter(&hca->state_lock, RW_READER);
    if (hca->state != HCA_DETACHED) {
        ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
            IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
            &chan_sizes);
    } else {
        rw_exit(&hca->state_lock);
        goto fail;
    }
    rw_exit(&hca->state_lock);

    if (ibt_status != IBT_SUCCESS) {
        cmn_err(CE_WARN, "rib_svc_create_chan: "
            "ibt_alloc_rc_channel failed, ibt_status=%d.",
            ibt_status);
        goto fail;
    }

    kqp->mode = RIB_SERVER;
    kqp->chan_flags = IBT_BLOCKING;
    kqp->q = q;	/* server ONLY */

    cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
    cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
    mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
    mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
    mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
    mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
    cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
    mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
    /*
     * Set the private data area to qp to be used in callbacks
     */
    ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
    kqp->rdmaconn.c_state = C_CONNECTED;
    *qp = kqp;
    return (RDMA_SUCCESS);
fail:
    if (kqp)
        kmem_free(kqp, sizeof (rib_qp_t));

    return (RDMA_FAILED);
}

void
rib_dump_pathrec(ibt_path_info_t *path_rec)
{
    ib_pkey_t pkey;

    if (rib_debug > 1) {
        cmn_err(CE_NOTE, "Path Record:\n");

        cmn_err(CE_NOTE, "Source HCA GUID = %llx\n",
            (longlong_t)path_rec->pi_hca_guid);
        cmn_err(CE_NOTE, "Dest Service ID = %llx\n",
            (longlong_t)path_rec->pi_sid);
        cmn_err(CE_NOTE, "Port Num = %02d\n",
            path_rec->pi_prim_cep_path.cep_hca_port_num);
        cmn_err(CE_NOTE, "P_Key Index = %04d\n",
            path_rec->pi_prim_cep_path.cep_pkey_ix);

        (void) ibt_index2pkey_byguid(path_rec->pi_hca_guid,
            path_rec->pi_prim_cep_path.cep_hca_port_num,
            path_rec->pi_prim_cep_path.cep_pkey_ix, &pkey);
        cmn_err(CE_NOTE, "P_Key = 0x%x\n", pkey);


        cmn_err(CE_NOTE, "SGID: = %llx:%llx\n",
            (longlong_t)
            path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_prefix,
            (longlong_t)
            path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_guid);

        cmn_err(CE_NOTE, "DGID: = %llx:%llx\n",
            (longlong_t)
            path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_prefix,
            (longlong_t)
            path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_guid);

        cmn_err(CE_NOTE, "Path Rate = %02x\n",
            path_rec->pi_prim_cep_path.cep_adds_vect.av_srate);
        cmn_err(CE_NOTE, "SL = %02x\n",
            path_rec->pi_prim_cep_path.cep_adds_vect.av_srvl);
        cmn_err(CE_NOTE, "Prim Packet LT = %02x\n",
            path_rec->pi_prim_pkt_lt);
        cmn_err(CE_NOTE, "Path MTU = %02x\n",
            path_rec->pi_path_mtu);
    }
}

/* ARGSUSED */
ibt_cm_status_t
rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
    ibt_cm_return_args_t *ret_args, void *priv_data,
    ibt_priv_data_len_t len)
{
    rpcib_state_t *ribstat;
    rib_hca_t *hca;

    ribstat = (rpcib_state_t *)clnt_hdl;
    hca = (rib_hca_t *)ribstat->hca;

    switch (event->cm_type) {

    /* got a connection close event */
    case IBT_CM_EVENT_CONN_CLOSED:
    {
        CONN *conn;
        rib_qp_t *qp;

        /* check reason why connection was closed */
        switch (event->cm_event.closed) {
        case IBT_CM_CLOSED_DREP_RCVD:
        case IBT_CM_CLOSED_DREQ_TIMEOUT:
        case IBT_CM_CLOSED_DUP:
        case IBT_CM_CLOSED_ABORT:
        case IBT_CM_CLOSED_ALREADY:
            /*
             * These cases indicate the local end initiated
             * the closing of the channel. Nothing to do here.
             */
            break;
        default:
            /*
             * Reason for CONN_CLOSED event must be one of
             * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
             * or IBT_CM_CLOSED_STALE. These indicate cases where
             * the remote end is closing the channel. In these
             * cases free the channel and transition to the
             * error state.
             */
            qp = ibt_get_chan_private(event->cm_channel);
            conn = qptoc(qp);
            mutex_enter(&conn->c_lock);
            if (conn->c_state == C_DISCONN_PEND) {
                mutex_exit(&conn->c_lock);
                break;
            }

            conn->c_state = C_ERROR;

            /*
             * Free the rc_channel. Channel has already
             * transitioned to ERROR state and WRs have been
             * FLUSHED_ERR already.
             */
            (void) ibt_free_channel(qp->qp_hdl);
            qp->qp_hdl = NULL;

            /*
             * Free the conn if c_ref is down to 0 already
             */
            if (conn->c_ref == 0) {
                /*
                 * Remove from list and free conn
                 */
                conn->c_state = C_DISCONN_PEND;
                mutex_exit(&conn->c_lock);
                (void) rib_disconnect_channel(conn,
                    &hca->cl_conn_list);
            } else {
                mutex_exit(&conn->c_lock);
            }
#ifdef DEBUG
            if (rib_debug)
                cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
                    "(CONN_CLOSED) channel disconnected");
#endif
            break;
        }
        break;
    }
    default:
        break;
    }
    return (IBT_CM_ACCEPT);
}


/* Check if server has done ATS registration */
rdma_stat
rib_chk_srv_ats(rib_hca_t *hca, struct netbuf *raddr,
    int addr_type, ibt_path_info_t *path)
{
    struct sockaddr_in *sin4;
    struct sockaddr_in6 *sin6;
    ibt_path_attr_t path_attr;
    ibt_status_t ibt_status;
    ib_pkey_t pkey;
    ibt_ar_t ar_query, ar_result;
    rib_service_t *ats;
    ib_gid_t sgid;
    ibt_path_info_t paths[MAX_PORTS];
    uint8_t npaths, i;

    (void) bzero(&path_attr, sizeof (ibt_path_attr_t));
    (void) bzero(path, sizeof (ibt_path_info_t));

    /*
     * Construct svc name
     */
    path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
    switch (addr_type) {
    case AF_INET:
        sin4 = (struct sockaddr_in *)raddr->buf;
        (void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
            IB_SVC_NAME_LEN);
        break;

    case AF_INET6:
        sin6 = (struct sockaddr_in6 *)raddr->buf;
        (void) inet_ntop(AF_INET6, &sin6->sin6_addr,
            path_attr.pa_sname, IB_SVC_NAME_LEN);
        break;

    default:
        kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
        return (RDMA_INVAL);
    }
    (void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);

    /*
     * Attempt a path to the server on an ATS-registered port.
     * Try all ATS-registered ports until one succeeds.
     * The first one that succeeds will be used to connect
     * to the server. If none of them succeed, return RDMA_FAILED.
     */
    rw_enter(&hca->state_lock, RW_READER);
    if (hca->state != HCA_DETACHED) {
        rw_enter(&hca->service_list_lock, RW_READER);
        for (ats = hca->ats_list; ats != NULL; ats = ats->srv_next) {
            path_attr.pa_hca_guid = hca->hca_guid;
            path_attr.pa_hca_port_num = ats->srv_port;
            ibt_status = ibt_get_paths(hca->ibt_clnt_hdl,
                IBT_PATH_MULTI_SVC_DEST, &path_attr, 2,
                paths, &npaths);
            if (ibt_status == IBT_SUCCESS ||
                ibt_status == IBT_INSUFF_DATA) {
                for (i = 0; i < npaths; i++) {
                    if (paths[i].pi_hca_guid) {
                        /*
                         * do ibt_query_ar()
                         */
                        sgid = paths[i].pi_prim_cep_path.
                            cep_adds_vect.av_sgid;

                        (void) ibt_index2pkey_byguid(
                            paths[i].pi_hca_guid,
                            paths[i].pi_prim_cep_path.cep_hca_port_num,
                            paths[i].pi_prim_cep_path.cep_pkey_ix,
                            &pkey);

                        bzero(&ar_query, sizeof (ar_query));
                        bzero(&ar_result, sizeof (ar_result));
                        ar_query.ar_gid = paths[i].pi_prim_cep_path.
                            cep_adds_vect.av_dgid;
                        ar_query.ar_pkey = pkey;
                        ibt_status = ibt_query_ar(&sgid, &ar_query,
                            &ar_result);
                        if (ibt_status == IBT_SUCCESS) {
#ifdef DEBUG
                            if (rib_debug > 1)
                                rib_dump_pathrec(&paths[i]);
#endif
                            bcopy(&paths[i], path,
                                sizeof (ibt_path_info_t));
                            rw_exit(&hca->service_list_lock);
                            kmem_free(path_attr.pa_sname,
                                IB_SVC_NAME_LEN);
                            rw_exit(&hca->state_lock);
                            return (RDMA_SUCCESS);
                        }
#ifdef DEBUG
                        if (rib_debug) {
                            cmn_err(CE_NOTE, "rib_chk_srv_ats: "
                                "ibt_query_ar FAILED, return\n");
                        }
#endif
                    }
                }
            }
        }
        rw_exit(&hca->service_list_lock);
    }
    kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
    rw_exit(&hca->state_lock);
    return (RDMA_FAILED);
}


/*
 * Connect to the server.
 */
rdma_stat
rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path)
{
    ibt_chan_open_args_t chan_args;	/* channel args */
    ibt_chan_sizes_t chan_sizes;
    ibt_rc_chan_alloc_args_t qp_attr;
    ibt_status_t ibt_status;
    ibt_rc_returns_t ret_args;		/* conn reject info */
    int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */

    (void) bzero(&chan_args, sizeof (chan_args));
    (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));

    qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
    /* Alloc a RC channel */
    qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
    qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
    qp_attr.rc_pd = hca->pd_hdl;
    qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
    qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
    qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
    qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
    qp_attr.rc_clone_chan = NULL;
    qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
    qp_attr.rc_flags = IBT_WR_SIGNALED;

    chan_args.oc_path = path;
    chan_args.oc_cm_handler = rib_clnt_cm_handler;
    chan_args.oc_cm_clnt_private = (void *)rib_stat;
    chan_args.oc_rdma_ra_out = 1;
    chan_args.oc_rdma_ra_in = 1;
    chan_args.oc_path_retry_cnt = 2;
    chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;

refresh:
    rw_enter(&hca->state_lock, RW_READER);
    if (hca->state != HCA_DETACHED) {
        ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
            IBT_ACHAN_NO_FLAGS, &qp_attr, &qp->qp_hdl,
            &chan_sizes);
    } else {
        rw_exit(&hca->state_lock);
        return (RDMA_FAILED);
    }
    rw_exit(&hca->state_lock);

    if (ibt_status != IBT_SUCCESS) {
#ifdef DEBUG
        cmn_err(CE_WARN, "rib_conn_to_srv: alloc_rc_channel "
            "failed, ibt_status=%d.", ibt_status);
#endif
        return (RDMA_FAILED);
    }

    /* Connect to the Server */
    (void) bzero(&ret_args, sizeof (ret_args));
    mutex_enter(&qp->cb_lock);
    ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
        IBT_BLOCKING, &chan_args, &ret_args);
    if (ibt_status != IBT_SUCCESS) {
#ifdef DEBUG
        if (rib_debug)
            cmn_err(CE_WARN, "rib_conn_to_srv: open_rc_channel"
                " failed for qp %p, status=%d, "
                "ret_args.rc_status=%d\n",
                (void *)qp, ibt_status, ret_args.rc_status);
#endif
        (void) ibt_free_channel(qp->qp_hdl);
        qp->qp_hdl = NULL;
        mutex_exit(&qp->cb_lock);
        if (refresh-- && ibt_status == IBT_CM_FAILURE &&
            ret_args.rc_status == IBT_CM_CONN_STALE) {
            /*
             * Got IBT_CM_CONN_STALE probably because of stale
             * data on the passive end of a channel that existed
             * prior to reboot. Retry establishing a channel
             * REFRESH_ATTEMPTS times, during which time the
             * stale conditions on the server might clear up.
             */
            goto refresh;
        }
        return (RDMA_FAILED);
    }
    mutex_exit(&qp->cb_lock);
    /*
     * Set the private data area to qp to be used in callbacks
     */
    ibt_set_chan_private(qp->qp_hdl, (void *)qp);
    return (RDMA_SUCCESS);
}
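/*
 * A sketch of the presumed client-side connect sequence (the actual
 * caller, e.g. rib_conn_get(), is outside this excerpt):
 *
 *	rib_ping_srv()		- resolve a path/HCA for the server
 *	rib_clnt_create_chan()	- allocate and initialize the rib_qp_t
 *	rib_conn_to_srv()	- allocate the RC channel and open it
 */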
1788
1789 rdma_stat
1790 rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca)
1791 {
1792 struct sockaddr_in *sin4;
1793 struct sockaddr_in6 *sin6;
1794 ibt_path_attr_t path_attr;
1795 ibt_path_info_t path;
1796 ibt_status_t ibt_status;
1797
1798 ASSERT(raddr->buf != NULL);
1799
1800 bzero(&path_attr, sizeof (ibt_path_attr_t));
1801 bzero(&path, sizeof (ibt_path_info_t));
1802
1803 /*
1804 * Conctruct svc name
1805 */
1806 path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
1807 switch (addr_type) {
1808 case AF_INET:
1809 sin4 = (struct sockaddr_in *)raddr->buf;
1810 (void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
1811 IB_SVC_NAME_LEN);
1812 break;
1813
1814 case AF_INET6:
1815 sin6 = (struct sockaddr_in6 *)raddr->buf;
1816 (void) inet_ntop(AF_INET6, &sin6->sin6_addr,
1817 path_attr.pa_sname, IB_SVC_NAME_LEN);
1818 break;
1819
1820 default:
1821 #ifdef DEBUG
1822 if (rib_debug) {
1823 cmn_err(CE_WARN, "rib_ping_srv: Address not recognized\n");
1824 }
1825 #endif
1826 kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
1827 return (RDMA_INVAL);
1828 }
1829 (void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);
1830
1831 ibt_status = ibt_get_paths(rib_stat->ibt_clnt_hdl,
1832 IBT_PATH_NO_FLAGS, &path_attr, 1, &path, NULL);
1833 kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
1834 if (ibt_status != IBT_SUCCESS) {
1835 if (rib_debug > 1) {
1836 cmn_err(CE_WARN, "rib_ping_srv: ibt_get_paths FAILED!"
1837 " status=%d\n", ibt_status);
1838 }
1839 } else if (path.pi_hca_guid) {
1840 ASSERT(path.pi_hca_guid == rib_stat->hca->hca_guid);
1841 *hca = rib_stat->hca;
1842 return (RDMA_SUCCESS);
1843 }
1844 return (RDMA_FAILED);
1845 }
1846
1847 /*
1848 * Close channel, remove from connection list and
1849 * free up resources allocated for that channel.
1850 */
1851 rdma_stat
1852 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
1853 {
1854 rib_qp_t *qp = ctoqp(conn);
1855 rib_hca_t *hca;
1856
1857 /*
1858 * c_ref == 0 and connection is in C_DISCONN_PEND
1859 */
1860 hca = qp->hca;
1861 if (conn_list != NULL)
1862 (void) rib_rm_conn(conn, conn_list);
1863 if (qp->qp_hdl != NULL) {
1864 /*
1865 * If the channel has not been establised,
1866 * ibt_flush_channel is called to flush outstanding WRs
1867 * on the Qs. Otherwise, ibt_close_rc_channel() is
1868 * called. The channel is then freed.
1869 */
1870 if (conn_list != NULL)
1871 (void) ibt_close_rc_channel(qp->qp_hdl,
1872 IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
1873 else
1874 (void) ibt_flush_channel(qp->qp_hdl);
1875
1876 mutex_enter(&qp->posted_rbufs_lock);
1877 while (qp->n_posted_rbufs)
1878 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
1879 mutex_exit(&qp->posted_rbufs_lock);
1880 (void) ibt_free_channel(qp->qp_hdl);
1881 qp->qp_hdl = NULL;
1882 }
1883 ASSERT(qp->rdlist == NULL);
1884 if (qp->replylist != NULL) {
1885 (void) rib_rem_replylist(qp);
1886 }
1887
1888 cv_destroy(&qp->cb_conn_cv);
1889 cv_destroy(&qp->posted_rbufs_cv);
1890 mutex_destroy(&qp->cb_lock);
1891
1892 mutex_destroy(&qp->replylist_lock);
1893 mutex_destroy(&qp->posted_rbufs_lock);
1894 mutex_destroy(&qp->rdlist_lock);
1895
1896 cv_destroy(&conn->c_cv);
1897 mutex_destroy(&conn->c_lock);
1898
1899 if (conn->c_raddr.buf != NULL) {
1900 kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
1901 }
1902 if (conn->c_laddr.buf != NULL) {
1903 kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
1904 }
1905 kmem_free(qp, sizeof (rib_qp_t));
1906
1907 /*
1908 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
1909 * then the hca is no longer being used.
1910 */
1911 if (conn_list != NULL) {
1912 rw_enter(&hca->state_lock, RW_READER);
1913 if (hca->state == HCA_DETACHED) {
1914 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
1915 if (hca->srv_conn_list.conn_hd == NULL) {
1916 rw_enter(&hca->cl_conn_list.conn_lock,
1917 RW_READER);
1918 if (hca->cl_conn_list.conn_hd == NULL) {
1919 mutex_enter(&hca->inuse_lock);
1920 hca->inuse = FALSE;
1921 cv_signal(&hca->cb_cv);
1922 mutex_exit(&hca->inuse_lock);
1923 }
1924 rw_exit(&hca->cl_conn_list.conn_lock);
1925 }
1926 rw_exit(&hca->srv_conn_list.conn_lock);
1927 }
1928 rw_exit(&hca->state_lock);
1929 }
1930 return (RDMA_SUCCESS);
1931 }
1932
1933 /*
1934 * Wait for send completion notification. Only on receiving a
1935 * notification be it a successful or error completion, free the
1936 * send_wid.
1937 */
1938 static rdma_stat
1939 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
1940 {
1941 clock_t timout, cv_wait_ret;
1942 rdma_stat error = RDMA_SUCCESS;
1943 int i;
1944
1945 /*
1946 * Wait for send to complete
1947 */
1948 ASSERT(wd != NULL);
1949 mutex_enter(&wd->sendwait_lock);
1950 if (wd->status == (uint_t)SEND_WAIT) {
1951 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
1952 ddi_get_lbolt();
1953 if (qp->mode == RIB_SERVER) {
1954 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
1955 &wd->sendwait_lock, timout)) > 0 &&
1956 wd->status == (uint_t)SEND_WAIT)
1957 ;
1958 switch (cv_wait_ret) {
1959 case -1: /* timeout */
1960 #ifdef DEBUG
1961 if (rib_debug > 2)
1962 cmn_err(CE_WARN, "rib_sendwait: "
1963 "timed out qp %p\n", (void *)qp);
1964 #endif
1965 wd->cv_sig = 0; /* no signal needed */
1966 error = RDMA_TIMEDOUT;
1967 break;
1968 default: /* got send completion */
1969 break;
1970 }
1971 } else {
1972 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
1973 &wd->sendwait_lock, timout)) > 0 &&
1974 wd->status == (uint_t)SEND_WAIT)
1975 ;
1976 switch (cv_wait_ret) {
1977 case -1: /* timeout */
1978 #ifdef DEBUG
1979 if (rib_debug > 2)
1980 cmn_err(CE_WARN, "rib_sendwait: "
1981 "timed out qp %p\n", (void *)qp);
1982 #endif
1983 wd->cv_sig = 0; /* no signal needed */
1984 error = RDMA_TIMEDOUT;
1985 break;
1986 case 0: /* interrupted */
1987 #ifdef DEBUG
1988 if (rib_debug > 2)
1989 cmn_err(CE_NOTE, "rib_sendwait:"
1990 " interrupted on qp %p\n",
1991 (void *)qp);
1992 #endif
1993 wd->cv_sig = 0; /* no signal needed */
1994 error = RDMA_INTR;
1995 break;
1996 default: /* got send completion */
1997 break;
1998 }
1999 }
2000 }
2001
2002 if (wd->status != (uint_t)SEND_WAIT) {
2003 /* got send completion */
2004 if (wd->status != RDMA_SUCCESS) {
2005 error = wd->status;
2006 if (wd->status != RDMA_CONNLOST)
2007 error = RDMA_FAILED;
2008 }
2009 for (i = 0; i < wd->nsbufs; i++) {
2010 rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2011 (void *)(uintptr_t)wd->sbufaddr[i]);
2012 }
2013 mutex_exit(&wd->sendwait_lock);
2014 (void) rib_free_sendwait(wd);
2015 } else {
2016 mutex_exit(&wd->sendwait_lock);
2017 }
2018
2019 return (error);
2020 }
2021
2022 static struct send_wid *
2023 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2024 {
2025 struct send_wid *wd;
2026
2027 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2028 wd->xid = xid;
2029 wd->cv_sig = cv_sig;
2030 wd->qp = qp;
2031 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2032 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2033 wd->status = (uint_t)SEND_WAIT;
2034
2035 return (wd);
2036 }
2037
2038 static int
2039 rib_free_sendwait(struct send_wid *wdesc)
2040 {
2041 cv_destroy(&wdesc->wait_cv);
2042 mutex_destroy(&wdesc->sendwait_lock);
2043 kmem_free(wdesc, sizeof (*wdesc));
2044
2045 return (0);
2046 }
2047
2048 static rdma_stat
2049 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2050 {
2051 mutex_enter(&qp->replylist_lock);
2052 if (rep != NULL) {
2053 (void) rib_remreply(qp, rep);
2054 mutex_exit(&qp->replylist_lock);
2055 return (RDMA_SUCCESS);
2056 }
2057 mutex_exit(&qp->replylist_lock);
2058 return (RDMA_FAILED);
2059 }
2060
2061 /*
2062 * Send buffers are freed here only if posting on the QP
2063 * fails. If the post succeeded, the send buffers are freed upon
2064 * send completion in rib_sendwait() or in the scq_handler.
2065 */
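/*
 * Illustrative sketch (editorial): caller-side buffer ownership.
 * The rdbuf/len/xid names are placeholders; the SEND_BUFFER is
 * assumed to come from rib_reg_buf_alloc(). Once handed to the
 * plugin, the buffers are freed by the plugin on every path, so
 * the caller must not free them again.
 *
 *	struct clist cl;
 *
 *	bzero(&cl, sizeof (cl));
 *	cl.c_saddr = (uintptr_t)rdbuf.addr;
 *	cl.c_len = len;
 *	cl.c_smemhandle.mrc_lmr = rdbuf.handle.mrc_lmr;
 *	cl.c_next = NULL;
 *	(void) rib_send(conn, &cl, xid);
 */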
2066 rdma_stat
2067 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2068 int send_sig, int cv_sig)
2069 {
2070 struct send_wid *wdesc;
2071 struct clist *clp;
2072 ibt_status_t ibt_status = IBT_SUCCESS;
2073 rdma_stat ret = RDMA_SUCCESS;
2074 ibt_send_wr_t tx_wr;
2075 int i, nds;
2076 ibt_wr_ds_t sgl[DSEG_MAX];
2077 uint_t total_msg_size;
2078 rib_qp_t *qp = ctoqp(conn);
2079
2080 ASSERT(cl != NULL);
2081
2082 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2083
2084 nds = 0;
2085 total_msg_size = 0;
2086 clp = cl;
2087 while (clp != NULL) {
2088 if (nds >= DSEG_MAX) {
2089 cmn_err(CE_WARN, "rib_send_and_wait: DSEG_MAX"
2090 " too small!");
2091 return (RDMA_FAILED);
2092 }
2093 sgl[nds].ds_va = clp->c_saddr;
2094 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2095 sgl[nds].ds_len = clp->c_len;
2096 total_msg_size += clp->c_len;
2097 clp = clp->c_next;
2098 nds++;
2099 }
2100
2101 if (send_sig) {
2102 /* Set SEND_SIGNAL flag. */
2103 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2104 wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2105 } else {
2106 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2107 wdesc = rib_init_sendwait(msgid, 0, qp);
2108 }
2109 wdesc->nsbufs = nds;
2110 for (i = 0; i < nds; i++) {
2111 wdesc->sbufaddr[i] = sgl[i].ds_va;
2112 }
2113
2114 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2115 tx_wr.wr_opcode = IBT_WRC_SEND;
2116 tx_wr.wr_trans = IBT_RC_SRV;
2117 tx_wr.wr_nds = nds;
2118 tx_wr.wr_sgl = sgl;
2119
2120 mutex_enter(&conn->c_lock);
2121 if (conn->c_state & C_CONNECTED) {
2122 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2123 }
2124 if (((conn->c_state & C_CONNECTED) == 0) ||
2125 ibt_status != IBT_SUCCESS) {
2126 mutex_exit(&conn->c_lock);
2127 for (i = 0; i < nds; i++) {
2128 rib_rbuf_free(conn, SEND_BUFFER,
2129 (void *)(uintptr_t)wdesc->sbufaddr[i]);
2130 }
2131 (void) rib_free_sendwait(wdesc);
2132 #ifdef DEBUG
2133 if (rib_debug && ibt_status != IBT_SUCCESS)
2134 cmn_err(CE_WARN, "rib_send_and_wait: ibt_post_send "
2135 "failed! wr_id %llx on qpn %p, status=%d!",
2136 (longlong_t)tx_wr.wr_id, (void *)qp,
2137 ibt_status);
2138 #endif
2139 return (RDMA_FAILED);
2140 }
2141 mutex_exit(&conn->c_lock);
2142
2143 if (send_sig) {
2144 if (cv_sig) {
2145 /*
2146 * cv_wait for send to complete.
2147 * We can fail due to a timeout or signal or
2148 * unsuccessful send.
2149 */
2150 ret = rib_sendwait(qp, wdesc);
2151 #ifdef DEBUG
2152 if (rib_debug > 2 && ret != 0) {
2153 cmn_err(CE_WARN,
2154 "rib_send_and_wait: rib_sendwait "
2155 "FAILED, rdma stat=%d, wr_id %llx, qp %p!",
2156 ret, (longlong_t)tx_wr.wr_id, (void *)qp);
2157 }
2158 #endif
2159 return (ret);
2160 }
2161 }
2162
2163 return (RDMA_SUCCESS);
2164 }
2165
2166 rdma_stat
2167 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2168 {
2169 rdma_stat ret;
2170
2171 /* send-wait & cv_signal */
2172 ret = rib_send_and_wait(conn, cl, msgid, 1, 1);
2173
2174 return (ret);
2175 }
2176
2177 /*
2178 * Server interface (svc_rdma_ksend).
2179 * Send RPC reply and wait for RDMA_DONE.
2180 */
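/*
 * Illustrative sketch (editorial) of the RDMA_DONE handshake this
 * function implements, with error handling elided. rdlist_lock is
 * held across the add/wait/remove triple:
 *
 *	mutex_enter(&qp->rdlist_lock);
 *	rd = rdma_done_add(qp, msgid);
 *	(void) rib_send_and_wait(conn, cl, msgid, 1, 0);
 *	(void) cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock,
 *	    timout);
 *	rdma_done_rm(qp, rd);
 *	mutex_exit(&qp->rdlist_lock);
 */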
2181 rdma_stat
2182 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2183 {
2184 rdma_stat ret = RDMA_SUCCESS;
2185 struct rdma_done_list *rd;
2186 clock_t timout, cv_wait_ret;
2187 rib_qp_t *qp = ctoqp(conn);
2188
2189 mutex_enter(&qp->rdlist_lock);
2190 rd = rdma_done_add(qp, msgid);
2191
2192 /* No cv_signal (whether send-wait or no-send-wait) */
2193 ret = rib_send_and_wait(conn, cl, msgid, 1, 0);
2194 if (ret != RDMA_SUCCESS) {
2195 #ifdef DEBUG
2196 cmn_err(CE_WARN, "rib_send_resp: send_and_wait "
2197 "failed, msgid %u, qp %p", msgid, (void *)qp);
2198 #endif
2199 rdma_done_rm(qp, rd);
2200 goto done;
2201 }
2202
2203 /*
2204 * Wait for RDMA_DONE from remote end
2205 */
2206 timout = drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2207 cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock,
2208 timout);
2209 rdma_done_rm(qp, rd);
2210 if (cv_wait_ret < 0) {
2211 #ifdef DEBUG
2212 if (rib_debug > 1) {
2213 cmn_err(CE_WARN, "rib_send_resp: RDMA_DONE not"
2214 " recv'd for qp %p, xid:%u\n",
2215 (void *)qp, msgid);
2216 }
2217 #endif
2218 ret = RDMA_TIMEDOUT;
2219 goto done;
2220 }
2221
2222 done:
2223 mutex_exit(&qp->rdlist_lock);
2224 return (ret);
2225 }
2226
2227 static struct recv_wid *
2228 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2229 {
2230 struct recv_wid *rwid;
2231
2232 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2233 rwid->xid = msgid;
2234 rwid->addr = sgl->ds_va;
2235 rwid->qp = qp;
2236
2237 return (rwid);
2238 }
2239
2240 static void
2241 rib_free_wid(struct recv_wid *rwid)
2242 {
2243 kmem_free(rwid, sizeof (struct recv_wid));
2244 }
2245
2246 rdma_stat
2247 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2248 {
2249 rib_qp_t *qp = ctoqp(conn);
2250 struct clist *clp = cl;
2251 struct reply *rep;
2252 struct recv_wid *rwid;
2253 int nds;
2254 ibt_wr_ds_t sgl[DSEG_MAX];
2255 ibt_recv_wr_t recv_wr;
2256 rdma_stat ret;
2257 ibt_status_t ibt_status = IBT_SUCCESS;
2258
2259 /*
2260 * rdma_clnt_postrecv uses RECV_BUFFER.
2261 */
2262
2263 nds = 0;
2264 while (cl != NULL) {
2265 if (nds >= DSEG_MAX) {
2266 cmn_err(CE_WARN, "rib_clnt_post: DSEG_MAX too small!");
2267 ret = RDMA_FAILED;
2268 goto done;
2269 }
2270 sgl[nds].ds_va = cl->c_saddr;
2271 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2272 sgl[nds].ds_len = cl->c_len;
2273 cl = cl->c_next;
2274 nds++;
2275 }
2276
2277 if (nds != 1) {
2278 cmn_err(CE_WARN, "rib_clnt_post: nds!=1\n");
2279 ret = RDMA_FAILED;
2280 goto done;
2281 }
2282 bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2283 recv_wr.wr_nds = nds;
2284 recv_wr.wr_sgl = sgl;
2285
2286 rwid = rib_create_wid(qp, &sgl[0], msgid);
2287 if (rwid) {
2288 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2289 } else {
2290 cmn_err(CE_WARN, "rib_clnt_post: out of memory");
2291 ret = RDMA_NORESOURCE;
2292 goto done;
2293 }
2294 rep = rib_addreplylist(qp, msgid);
2295 if (!rep) {
2296 cmn_err(CE_WARN, "rib_clnt_post: out of memory");
2297 rib_free_wid(rwid);
2298 ret = RDMA_NORESOURCE;
2299 goto done;
2300 }
2301
2302 mutex_enter(&conn->c_lock);
2303 if (conn->c_state & C_CONNECTED) {
2304 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2305 }
2306 if (((conn->c_state & C_CONNECTED) == 0) ||
2307 ibt_status != IBT_SUCCESS) {
2308 mutex_exit(&conn->c_lock);
2309 #ifdef DEBUG
2310 cmn_err(CE_WARN, "rib_clnt_post: QPN %p failed in "
2311 "ibt_post_recv(), msgid=%d, status=%d",
2312 (void *)qp, msgid, ibt_status);
2313 #endif
2314 rib_free_wid(rwid);
2315 (void) rib_rem_rep(qp, rep);
2316 ret = RDMA_FAILED;
2317 goto done;
2318 }
2319 mutex_exit(&conn->c_lock);
2320 return (RDMA_SUCCESS);
2321
2322 done:
2323 while (clp != NULL) {
2324 rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)clp->c_saddr);
2325 clp = clp->c_next;
2326 }
2327 return (ret);
2328 }
2329
2330 rdma_stat
2331 rib_svc_post(CONN* conn, struct clist *cl)
2332 {
2333 rib_qp_t *qp = ctoqp(conn);
2334 struct svc_recv *s_recvp;
2335 int nds;
2336 ibt_wr_ds_t sgl[DSEG_MAX];
2337 ibt_recv_wr_t recv_wr;
2338 ibt_status_t ibt_status = IBT_SUCCESS;
2339
2340 nds = 0;
2341 while (cl != NULL) {
2342 if (nds >= DSEG_MAX) {
2343 cmn_err(CE_WARN, "rib_svc_post: DSEG_MAX too small!");
2344 return (RDMA_FAILED);
2345 }
2346 sgl[nds].ds_va = cl->c_saddr;
2347 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2348 sgl[nds].ds_len = cl->c_len;
2349 cl = cl->c_next;
2350 nds++;
2351 }
2352
2353 if (nds != 1) {
2354 cmn_err(CE_WARN, "rib_svc_post: nds!=1\n");
2355 if (nds > 0)	/* sgl[0] is valid only when an SGE was built */
	rib_rbuf_free(conn, RECV_BUFFER, (caddr_t)(uintptr_t)sgl[0].ds_va);
2356 return (RDMA_FAILED);
2357 }
2358 bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2359 recv_wr.wr_nds = nds;
2360 recv_wr.wr_sgl = sgl;
2361
2362 s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2363 /* Use s_recvp's addr as wr id */
2364 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2365 mutex_enter(&conn->c_lock);
2366 if (conn->c_state & C_CONNECTED) {
2367 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2368 }
2369 if (((conn->c_state & C_CONNECTED) == 0) ||
2370 ibt_status != IBT_SUCCESS) {
2371 mutex_exit(&conn->c_lock);
2372 #ifdef DEBUG
2373 cmn_err(CE_WARN, "rib_svc_post: QP %p failed in "
2374 "ibt_post_recv(), status=%d",
2375 (void *)qp, ibt_status);
2376 #endif
2377 rib_rbuf_free(conn, RECV_BUFFER,
2378 (caddr_t)(uintptr_t)sgl[0].ds_va);
2379 (void) rib_free_svc_recv(s_recvp);
2380 return (RDMA_FAILED);
2381 }
2382 mutex_exit(&conn->c_lock);
2383
2384 return (RDMA_SUCCESS);
2385 }
2386
2387 /* Client */
2388 rdma_stat
2389 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2390 {
2391
2392 return (rib_clnt_post(conn, cl, msgid));
2393 }
2394
2395 /* Server */
2396 rdma_stat
2397 rib_post_recv(CONN *conn, struct clist *cl)
2398 {
2399 rib_qp_t *qp = ctoqp(conn);
2400
2401 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2402 mutex_enter(&qp->posted_rbufs_lock);
2403 qp->n_posted_rbufs++;
2404 mutex_exit(&qp->posted_rbufs_lock);
2405 return (RDMA_SUCCESS);
2406 }
2407 return (RDMA_FAILED);
2408 }
2409
2410 /*
2411 * Client-side-only interface to "recv" the rpc reply buf
2412 * posted earlier by rib_post_resp(conn, cl, msgid).
2413 */
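/*
 * Illustrative sketch (editorial): the client-side pairing around
 * an RPC call, keyed by the xid. The call-send step is elided;
 * cl and rcl are placeholder clists.
 *
 *	(void) rib_post_resp(conn, &cl, xid);	post reply buffer
 *	...send the call via rib_send()...
 *	if (rib_recv(conn, &rcl, xid) == RDMA_SUCCESS) {
 *		...decode the reply from rcl...
 *	}
 */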
2414 rdma_stat
2415 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2416 {
2417 struct reply *rep = NULL;
2418 clock_t timout, cv_wait_ret;
2419 rdma_stat ret = RDMA_SUCCESS;
2420 rib_qp_t *qp = ctoqp(conn);
2421
2422 /*
2423 * Find the reply structure for this msgid
2424 */
2425 mutex_enter(&qp->replylist_lock);
2426
2427 for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2428 if (rep->xid == msgid)
2429 break;
2430 }
2431 if (rep != NULL) {
2432 /*
2433 * If message not yet received, wait.
2434 */
2435 if (rep->status == (uint_t)REPLY_WAIT) {
2436 timout = ddi_get_lbolt() +
2437 drv_usectohz(REPLY_WAIT_TIME * 1000000);
2438 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2439 &qp->replylist_lock, timout)) > 0 &&
2440 rep->status == (uint_t)REPLY_WAIT)
	;
2441
2442 switch (cv_wait_ret) {
2443 case -1: /* timeout */
2444 ret = RDMA_TIMEDOUT;
2445 break;
2446 case 0:
2447 ret = RDMA_INTR;
2448 break;
2449 default:
2450 break;
2451 }
2452 }
2453
2454 if (rep->status == RDMA_SUCCESS) {
2455 struct clist *cl = NULL;
2456
2457 /*
2458 * Got message successfully
2459 */
2460 clist_add(&cl, 0, rep->bytes_xfer, NULL,
2461 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2462 *clp = cl;
2463 } else {
2464 if (rep->status != (uint_t)REPLY_WAIT) {
2465 /*
2466 * Got error in reply message. Free
2467 * recv buffer here.
2468 */
2469 ret = rep->status;
2470 rib_rbuf_free(conn, RECV_BUFFER,
2471 (caddr_t)(uintptr_t)rep->vaddr_cq);
2472 }
2473 }
2474 (void) rib_remreply(qp, rep);
2475 } else {
2476 /*
2477 * No matching reply structure found for given msgid on the
2478 * reply wait list.
2479 */
2480 ret = RDMA_INVAL;
2481 #ifdef DEBUG
2482 cmn_err(CE_WARN, "rib_recv: no matching reply for "
2483 "xid %u, qp %p\n", msgid, (void *)qp);
2484 #endif
2485 }
2486
2487 /*
2488 * Done.
2489 */
2490 mutex_exit(&qp->replylist_lock);
2491 return (ret);
2492 }
2493
2494 /*
2495 * RDMA write a buffer to the remote address.
2496 */
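/*
 * Illustrative sketch (editorial): the clist shape rib_write()
 * expects. The head entry's destination fields name the remote
 * buffer; every entry's source fields name local registered
 * memory. remote_va/rkey are assumed to come from the peer's
 * chunk list, local_va/lkey from rib_registermem().
 *
 *	cl.c_daddr = remote_va;
 *	cl.c_dmemhandle.mrc_rmr = rkey;
 *	cl.c_saddr = local_va;
 *	cl.c_smemhandle.mrc_lmr = lkey;
 *	cl.c_len = len;
 *	cl.c_next = NULL;
 *	(void) rib_write(conn, &cl, 1);
 */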
2497 rdma_stat
2498 rib_write(CONN *conn, struct clist *cl, int wait)
2499 {
2500 ibt_send_wr_t tx_wr;
2501 int nds;
2502 int cv_sig;
2503 ibt_wr_ds_t sgl[DSEG_MAX];
2504 struct send_wid *wdesc;
2505 ibt_status_t ibt_status = IBT_SUCCESS;
2506 rdma_stat ret = RDMA_SUCCESS;
2507 rib_qp_t *qp = ctoqp(conn);
2508
2509 if (cl == NULL) {
2510 cmn_err(CE_WARN, "rib_write: NULL clist\n");
2511 return (RDMA_FAILED);
2512 }
2513
2514 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2515 /*
2516 * The remote address is in the head chunk item of the list.
2517 */
2518 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_daddr;
2519 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */
2520
2521 nds = 0;
2522 while (cl != NULL) {
2523 if (nds >= DSEG_MAX) {
2524 cmn_err(CE_WARN, "rib_write: DSEG_MAX too small!");
2525 return (RDMA_FAILED);
2526 }
2527 sgl[nds].ds_va = cl->c_saddr;
2528 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2529 sgl[nds].ds_len = cl->c_len;
2530 cl = cl->c_next;
2531 nds++;
2532 }
2533
2534 if (wait) {
2535 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2536 cv_sig = 1;
2537 } else {
2538 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2539 cv_sig = 0;
2540 }
2541
2542 wdesc = rib_init_sendwait(0, cv_sig, qp);
2543 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2544 tx_wr.wr_opcode = IBT_WRC_RDMAW;
2545 tx_wr.wr_trans = IBT_RC_SRV;
2546 tx_wr.wr_nds = nds;
2547 tx_wr.wr_sgl = sgl;
2548
2549 mutex_enter(&conn->c_lock);
2550 if (conn->c_state & C_CONNECTED) {
2551 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2552 }
2553 if (((conn->c_state & C_CONNECTED) == 0) ||
2554 ibt_status != IBT_SUCCESS) {
2555 mutex_exit(&conn->c_lock);
2556 (void) rib_free_sendwait(wdesc);
2557 return (RDMA_FAILED);
2558 }
2559 mutex_exit(&conn->c_lock);
2560
2561 /*
2562 * Wait for send to complete
2563 */
2564 if (wait) {
2565 ret = rib_sendwait(qp, wdesc);
2566 if (ret != 0) {
2567 return (ret);
2568 }
2569 }
2570 return (RDMA_SUCCESS);
2571 }
2572
2573 /*
2574 * RDMA Read a buffer from the remote address.
2575 */
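/*
 * Editorial note: relative to rib_write() above, the roles are
 * swapped here. The head entry's c_saddr/c_smemhandle.mrc_rmr
 * name the remote source buffer, and each entry's
 * c_daddr/c_dmemhandle.mrc_lmr name the local sink buffers.
 */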
2576 rdma_stat
2577 rib_read(CONN *conn, struct clist *cl, int wait)
2578 {
2579 ibt_send_wr_t rx_wr;
2580 int nds;
2581 int cv_sig;
2582 ibt_wr_ds_t sgl[DSEG_MAX]; /* is 2 sufficient? */
2583 struct send_wid *wdesc;
2584 ibt_status_t ibt_status = IBT_SUCCESS;
2585 rdma_stat ret = RDMA_SUCCESS;
2586 rib_qp_t *qp = ctoqp(conn);
2587
2588 if (cl == NULL) {
2589 cmn_err(CE_WARN, "rib_read: NULL clist\n");
2590 return (RDMA_FAILED);
2591 }
2592
2593 bzero(&rx_wr, sizeof (ibt_send_wr_t));
2594 /*
2595 * The remote address is in the head chunk item of the list.
2596 */
2597 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_saddr;
2598 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; /* rkey */
2599
2600 nds = 0;
2601 while (cl != NULL) {
2602 if (nds >= DSEG_MAX) {
2603 cmn_err(CE_WARN, "rib_read: DSEG_MAX too small!");
2604 return (RDMA_FAILED);
2605 }
2606 sgl[nds].ds_va = cl->c_daddr;
2607 sgl[nds].ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2608 sgl[nds].ds_len = cl->c_len;
2609 cl = cl->c_next;
2610 nds++;
2611 }
2612
2613 if (wait) {
2614 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2615 cv_sig = 1;
2616 } else {
2617 rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2618 cv_sig = 0;
2619 }
2620
2621 wdesc = rib_init_sendwait(0, cv_sig, qp);
2622 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2623 rx_wr.wr_opcode = IBT_WRC_RDMAR;
2624 rx_wr.wr_trans = IBT_RC_SRV;
2625 rx_wr.wr_nds = nds;
2626 rx_wr.wr_sgl = sgl;
2627
2628 mutex_enter(&conn->c_lock);
2629 if (conn->c_state & C_CONNECTED) {
2630 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2631 }
2632 if (((conn->c_state & C_CONNECTED) == 0) ||
2633 ibt_status != IBT_SUCCESS) {
2634 mutex_exit(&conn->c_lock);
2635 #ifdef DEBUG
2636 if (rib_debug && ibt_status != IBT_SUCCESS)
2637 cmn_err(CE_WARN, "rib_read: FAILED post_sending RDMAR"
2638 " wr_id %llx on qp %p, status=%d",
2639 (longlong_t)rx_wr.wr_id, (void *)qp,
2640 ibt_status);
2641 #endif
2642 (void) rib_free_sendwait(wdesc);
2643 return (RDMA_FAILED);
2644 }
2645 mutex_exit(&conn->c_lock);
2646
2647 /*
2648 * Wait for send to complete
2649 */
2650 if (wait) {
2651 ret = rib_sendwait(qp, wdesc);
2652 if (ret != 0) {
2653 return (ret);
2654 }
2655 }
2656
2657 return (RDMA_SUCCESS);
2658 }
2659
2660 int
2661 is_for_ipv4(ibt_ar_t *result)
2662 {
2663 int i, size = sizeof (struct in_addr);
2664 uint8_t zero = 0;
2665
2666 for (i = 0; i < (ATS_AR_DATA_LEN - size); i++)
2667 zero |= result->ar_data[i];
2668 return (zero == 0);
2669 }
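/*
 * Illustrative sketch (editorial) of the assumed ATS ar_data
 * layout that the test above and the CM handler below rely on.
 * An IPv6 address fills the whole ATS_AR_DATA_LEN-byte field; an
 * IPv4 address occupies only the trailing sizeof (struct in_addr)
 * bytes, leaving the leading bytes zero:
 *
 *	IPv6:	[ in6_addr, all ATS_AR_DATA_LEN bytes           ]
 *	IPv4:	[ leading zero bytes          | 4-byte in_addr  ]
 *
 * is_for_ipv4() therefore just ORs the leading bytes together and
 * tests the result for zero.
 */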
2670
2671 /*
2672 * rib_srv_cm_handler()
2673 * Connection Manager callback to handle RC connection requests.
2674 */
2675 /* ARGSUSED */
2676 static ibt_cm_status_t
2677 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2678 ibt_cm_return_args_t *ret_args, void *priv_data,
2679 ibt_priv_data_len_t len)
2680 {
2681 queue_t *q;
2682 rib_qp_t *qp;
2683 rpcib_state_t *ribstat;
2684 rib_hca_t *hca;
2685 rdma_stat status = RDMA_SUCCESS;
2686 int i;
2687 struct clist cl;
2688 rdma_buf_t rdbuf;
2689 void *buf = NULL;
2690 ibt_cm_req_rcv_t cm_req_rcv;
2691 CONN *conn;
2692 ibt_status_t ibt_status;
2693 ibt_ar_t ar_query, ar_result;
2694 ib_gid_t sgid;
2695
2696
2697 ASSERT(any != NULL);
2698 ASSERT(event != NULL);
2699
2700 ribstat = (rpcib_state_t *)any;
2701 hca = (rib_hca_t *)ribstat->hca;
2702 ASSERT(hca != NULL);
2703
2704 /* got a connection request */
2705 switch (event->cm_type) {
2706 case IBT_CM_EVENT_REQ_RCV:
2707 /*
2708 * If the plugin is in the NO_ACCEPT state, bail out.
2709 */
2710 mutex_enter(&plugin_state_lock);
2711 if (plugin_state == NO_ACCEPT) {
2712 mutex_exit(&plugin_state_lock);
2713 return (IBT_CM_REJECT);
2714 }
2715 mutex_exit(&plugin_state_lock);
2716
2717 /*
2718 * Need to send an MRA MAD to the CM so that it does not
2719 * time out on us.
2720 */
2721 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2722 event->cm_event.req.req_timeout * 8, NULL, 0);
2723
2724 mutex_enter(&rib_stat->open_hca_lock);
2725 q = rib_stat->q;
2726 mutex_exit(&rib_stat->open_hca_lock);
2727 status = rib_svc_create_chan(hca, (caddr_t)q,
2728 event->cm_event.req.req_prim_hca_port, &qp);
2729 if (status) {
2730 #ifdef DEBUG
2731 cmn_err(CE_WARN, "rib_srv_cm_handler: "
2732 "create_channel failed %d", status);
2733 #endif
2734 return (IBT_CM_REJECT);
2735 }
2736 cm_req_rcv = event->cm_event.req;
2737
2738 #ifdef DEBUG
2739 if (rib_debug > 2) {
2740 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2741 "server recv'ed IBT_CM_EVENT_REQ_RCV\n");
2742 cmn_err(CE_NOTE, "\t\t SID:%llx\n",
2743 (longlong_t)cm_req_rcv.req_service_id);
2744 cmn_err(CE_NOTE, "\t\t Local Port:%d\n",
2745 cm_req_rcv.req_prim_hca_port);
2746 cmn_err(CE_NOTE,
2747 "\t\t Remote GID:(prefix:%llx,guid:%llx)\n",
2748 (longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_prefix,
2749 (longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_guid);
2750 cmn_err(CE_NOTE, "\t\t Local GID:(prefix:%llx,guid:%llx)\n",
2751 (longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_prefix,
2752 (longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_guid);
2753 cmn_err(CE_NOTE, "\t\t Remote QPN:%u\n",
2754 cm_req_rcv.req_remote_qpn);
2755 cmn_err(CE_NOTE, "\t\t Remote Q_Key:%x\n",
2756 cm_req_rcv.req_remote_qkey);
2757 cmn_err(CE_NOTE, "\t\t Local QP %p (qp_hdl=%p)\n",
2758 (void *)qp, (void *)qp->qp_hdl);
2759 }
2760
2761 if (rib_debug > 2) {
2762 ibt_rc_chan_query_attr_t chan_attrs;
2763
2764 if (ibt_query_rc_channel(qp->qp_hdl, &chan_attrs)
2765 == IBT_SUCCESS) {
2766 cmn_err(CE_NOTE, "rib_srv_cm_handler: qp %p in "
2767 "CEP state %d\n", (void *)qp, chan_attrs.rc_state);
2768 }
2769 }
2770 #endif
2771
2772 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2773 ret_args->cm_ret.rep.cm_rdma_ra_out = 1;
2774 ret_args->cm_ret.rep.cm_rdma_ra_in = 1;
2775 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2776
2777 /*
2778 * Pre-posts RECV buffers
2779 */
2780 conn = qptoc(qp);
2781 for (i = 0; i < preposted_rbufs; i++) {
2782 bzero(&rdbuf, sizeof (rdbuf));
2783 rdbuf.type = RECV_BUFFER;
2784 buf = rib_rbuf_alloc(conn, &rdbuf);
2785 if (buf == NULL) {
2786 cmn_err(CE_WARN, "rib_srv_cm_handler: "
2787 "No RECV_BUFFER buf!\n");
2788 (void) rib_disconnect_channel(conn, NULL);
2789 return (IBT_CM_REJECT);
2790 }
2791
2792 bzero(&cl, sizeof (cl));
2793 cl.c_saddr = (uintptr_t)rdbuf.addr;
2794 cl.c_len = rdbuf.len;
2795 cl.c_smemhandle.mrc_lmr = rdbuf.handle.mrc_lmr; /* lkey */
2796 cl.c_next = NULL;
2797 status = rib_post_recv(conn, &cl);
2798 if (status != RDMA_SUCCESS) {
2799 cmn_err(CE_WARN, "rib_srv_cm_handler: failed "
2800 "posting RPC_REQ buf to qp %p!", (void *)qp);
2801 (void) rib_disconnect_channel(conn, NULL);
2802 return (IBT_CM_REJECT);
2803 }
2804 }
2805 (void) rib_add_connlist(conn, &hca->srv_conn_list);
2806
2807 /*
2808 * Get the address translation service record from ATS
2809 */
2810 rw_enter(&hca->state_lock, RW_READER);
2811 if (hca->state == HCA_DETACHED) {
2812 rw_exit(&hca->state_lock);
2813 return (IBT_CM_REJECT);
2814 }
2815 rw_exit(&hca->state_lock);
2816
2817 for (i = 0; i < hca->hca_nports; i++) {
2818 ibt_status = ibt_get_port_state(hca->hca_hdl, i+1,
2819 &sgid, NULL);
2820 if (ibt_status != IBT_SUCCESS) {
2821 if (rib_debug) {
2822 cmn_err(CE_WARN, "rib_srv_cm_handler: "
2823 "ibt_get_port_state FAILED!"
2824 "status = %d\n", ibt_status);
2825 }
2826 } else {
2827 /*
2828 * do ibt_query_ar()
2829 */
2830 bzero(&ar_query, sizeof (ar_query));
2831 bzero(&ar_result, sizeof (ar_result));
2832 ar_query.ar_gid = cm_req_rcv.req_prim_addr.av_dgid;
2833 ar_query.ar_pkey = event->cm_event.req.req_pkey;
2834 ibt_status = ibt_query_ar(&sgid, &ar_query,
2835 &ar_result);
2836 if (ibt_status != IBT_SUCCESS) {
2837 if (rib_debug) {
2838 cmn_err(CE_WARN, "rib_srv_cm_handler: "
2839 "ibt_query_ar FAILED!"
2840 "status = %d\n", ibt_status);
2841 }
2842 } else {
2843 conn = qptoc(qp);
2844
2845 if (is_for_ipv4(&ar_result)) {
2846 struct sockaddr_in *s;
2847 int sin_size = sizeof (struct sockaddr_in);
2848 int in_size = sizeof (struct in_addr);
2849 uint8_t *start_pos;
2850
2851 conn->c_raddr.maxlen =
2852 conn->c_raddr.len = sin_size;
2853 conn->c_raddr.buf = kmem_zalloc(sin_size,
2854 KM_SLEEP);
2855 s = (struct sockaddr_in *)conn->c_raddr.buf;
2856 s->sin_family = AF_INET;
2857 /*
2858 * For IPv4, the IP addr is stored in
2859 * the last four bytes of ar_data.
2860 */
2861 start_pos = ar_result.ar_data +
2862 ATS_AR_DATA_LEN - in_size;
2863 bcopy(start_pos, &s->sin_addr, in_size);
2864 if (rib_debug > 1) {
2865 char print_addr[INET_ADDRSTRLEN];
2866
2867 bzero(print_addr, INET_ADDRSTRLEN);
2868 (void) inet_ntop(AF_INET, &s->sin_addr,
2869 print_addr, INET_ADDRSTRLEN);
2870 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2871 "remote clnt_addr: %s\n", print_addr);
2872 }
2873 } else {
2874 struct sockaddr_in6 *s6;
2875 int sin6_size = sizeof (struct sockaddr_in6);
2876
2877 conn->c_raddr.maxlen =
2878 conn->c_raddr.len = sin6_size;
2879 conn->c_raddr.buf = kmem_zalloc(sin6_size,
2880 KM_SLEEP);
2881
2882 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
2883 s6->sin6_family = AF_INET6;
2884 /* sin6_addr is stored in ar_data */
2885 bcopy(ar_result.ar_data, &s6->sin6_addr,
2886 sizeof (struct in6_addr));
2887 if (rib_debug > 1) {
2888 char print_addr[INET6_ADDRSTRLEN];
2889
2890 bzero(print_addr, INET6_ADDRSTRLEN);
2891 (void) inet_ntop(AF_INET6, &s6->sin6_addr,
2892 print_addr, INET6_ADDRSTRLEN);
2893 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2894 "remote clnt_addr: %s\n", print_addr);
2895 }
2896 }
2897 return (IBT_CM_ACCEPT);
2898 }
2899 }
2900 }
2901 if (rib_debug > 1) {
2902 cmn_err(CE_WARN, "rib_srv_cm_handler: "
2903 "address record query failed!");
2904 }
2905 break;
2906
2907 case IBT_CM_EVENT_CONN_CLOSED:
2908 {
2909 CONN *conn;
2910 rib_qp_t *qp;
2911
2912 switch (event->cm_event.closed) {
2913 case IBT_CM_CLOSED_DREP_RCVD:
2914 case IBT_CM_CLOSED_DREQ_TIMEOUT:
2915 case IBT_CM_CLOSED_DUP:
2916 case IBT_CM_CLOSED_ABORT:
2917 case IBT_CM_CLOSED_ALREADY:
2918 /*
2919 * These cases indicate the local end initiated
2920 * the closing of the channel. Nothing to do here.
2921 */
2922 break;
2923 default:
2924 /*
2925 * Reason for CONN_CLOSED event must be one of
2926 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
2927 * or IBT_CM_CLOSED_STALE. These indicate cases where
2928 * the remote end is closing the channel. In these
2929 * cases, free the channel and transition to the error
2930 * state.
2931 */
2932 qp = ibt_get_chan_private(event->cm_channel);
2933 conn = qptoc(qp);
2934 mutex_enter(&conn->c_lock);
2935 if (conn->c_state == C_DISCONN_PEND) {
2936 mutex_exit(&conn->c_lock);
2937 break;
2938 }
2939 conn->c_state = C_ERROR;
2940
2941 /*
2942 * Free the rc_channel. The channel has already
2943 * transitioned to the ERROR state, and its WRs have
2944 * already been FLUSHED_ERR.
2945 */
2946 (void) ibt_free_channel(qp->qp_hdl);
2947 qp->qp_hdl = NULL;
2948
2949 /*
2950 * Free the conn if c_ref goes down to 0
2951 */
2952 if (conn->c_ref == 0) {
2953 /*
2954 * Remove from list and free conn
2955 */
2956 conn->c_state = C_DISCONN_PEND;
2957 mutex_exit(&conn->c_lock);
2958 (void) rib_disconnect_channel(conn,
2959 &hca->srv_conn_list);
2960 } else {
2961 mutex_exit(&conn->c_lock);
2962 }
2963 #ifdef DEBUG
2964 if (rib_debug)
2965 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2966 " (CONN_CLOSED) channel disconnected");
2967 #endif
2968 break;
2969 }
2970 break;
2971 }
2972 case IBT_CM_EVENT_CONN_EST:
2973 /*
2974 * RTU received, hence connection established.
2975 */
2976 if (rib_debug > 1)
2977 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2978 "(CONN_EST) channel established");
2979 break;
2980
2981 default:
2982 if (rib_debug > 2) {
2983 /* Let CM handle the following events. */
2984 if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
2985 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2986 "server recv'ed IBT_CM_EVENT_REP_RCV\n");
2987 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
2988 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2989 "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
2990 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
2991 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2992 "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
2993 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
2994 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2995 "server recv'ed IBT_CM_EVENT_APR_RCV\n");
2996 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
2997 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2998 "server recv'ed IBT_CM_EVENT_FAILURE\n");
2999 }
3000 }
3001 return (IBT_CM_REJECT);
3002 }
3003
3004 /* accept all other CM messages (i.e. let the CM handle them) */
3005 return (IBT_CM_ACCEPT);
3006 }
3007
3008 static rdma_stat
3009 rib_register_ats(rib_hca_t *hca)
3010 {
3011 ibt_hca_portinfo_t *port_infop;
3012 uint_t port_size;
3013 uint_t pki, i, num_ports, nbinds;
3014 ibt_status_t ibt_status;
3015 rib_service_t *new_service, *temp_srv;
3016 rpcib_ats_t *atsp;
3017 rpcib_ibd_insts_t ibds;
3018 ib_pkey_t pkey;
3019 ibt_ar_t ar; /* address record */
3020
3021 /*
3022 * Query all ports for the given HCA
3023 */
3024 rw_enter(&hca->state_lock, RW_READER);
3025 if (hca->state != HCA_DETACHED) {
3026 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3027 &num_ports, &port_size);
3028 rw_exit(&hca->state_lock);
3029 } else {
3030 rw_exit(&hca->state_lock);
3031 return (RDMA_FAILED);
3032 }
3033 if (ibt_status != IBT_SUCCESS) {
3034 #ifdef DEBUG
3035 if (rib_debug) {
3036 cmn_err(CE_NOTE, "rib_register_ats: FAILED in "
3037 "ibt_query_hca_ports, status = %d\n", ibt_status);
3038 }
3039 #endif
3040 return (RDMA_FAILED);
3041 }
3042
3043 #ifdef DEBUG
3044 if (rib_debug > 1) {
3045 cmn_err(CE_NOTE, "rib_register_ats: Ports detected "
3046 "%d\n", num_ports);
3047
3048 for (i = 0; i < num_ports; i++) {
3049 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3050 cmn_err(CE_WARN, "rib_register_ats "
3051 "Port #: %d INACTIVE\n", i+1);
3052 } else {
3054 cmn_err(CE_NOTE, "rib_register_ats "
3055 "Port #: %d ACTIVE\n", i+1);
3056 }
3057 }
3058 }
3059 #endif
3060
3061 ibds.rib_ibd_alloc = N_IBD_INSTANCES;
3062 ibds.rib_ibd_cnt = 0;
3063 ibds.rib_ats = (rpcib_ats_t *)kmem_zalloc(ibds.rib_ibd_alloc *
3064 sizeof (rpcib_ats_t), KM_SLEEP);
3065 rib_get_ibd_insts(&ibds);
3066
3067 if (ibds.rib_ibd_cnt == 0) {
3068 kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc *
3069 sizeof (rpcib_ats_t));
3070 ibt_free_portinfo(port_infop, port_size);
3071 return (RDMA_FAILED);
3072 }
3073
3074 /*
3075 * Get the IP addresses of active ports and
3076 * register them with ATS. IPv4 addresses
3077 * have precedence over IPv6 addresses.
3078 */
3079 if (get_ibd_ipaddr(&ibds) != 0) {
3080 #ifdef DEBUG
3081 if (rib_debug > 1) {
3082 cmn_err(CE_WARN, "rib_register_ats: "
3083 "get_ibd_ipaddr failed");
3084 }
3085 #endif
3086 kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc *
3087 sizeof (rpcib_ats_t));
3088 ibt_free_portinfo(port_infop, port_size);
3089 return (RDMA_FAILED);
3090 }
3091
3092 /*
3093 * Start ATS registration for active ports on this HCA.
3094 */
3095 rw_enter(&hca->service_list_lock, RW_WRITER);
3096 nbinds = 0;
3097 new_service = NULL;
3098 for (i = 0; i < num_ports; i++) {
3099 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3100 continue;
3101
3102 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3103 pkey = port_infop[i].p_pkey_tbl[pki];
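/*
 * Editorial note: keep only full-membership pkeys (the IBSRM_HB
 * membership bit set) and skip the invalid pkey; limited-member
 * pkeys are not registered with ATS.
 */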
3104 if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) {
3105 ar.ar_gid = port_infop[i].p_sgid_tbl[0];
3106 ar.ar_pkey = pkey;
3107 atsp = get_ibd_entry(&ar.ar_gid, pkey, &ibds);
3108 if (atsp == NULL)
3109 continue;
3110 /*
3111 * store the sin[6]_addr in ar_data
3112 */
3113 (void) bzero(ar.ar_data, ATS_AR_DATA_LEN);
3114 if (atsp->ras_inet_type == AF_INET) {
3115 uint8_t *start_pos;
3116
3117 /*
3118 * The ipv4 addr goes into the last
3119 * four bytes of ar_data.
3120 */
3121 start_pos = ar.ar_data + ATS_AR_DATA_LEN -
3122 sizeof (struct in_addr);
3123 bcopy(&atsp->ras_sin.sin_addr, start_pos,
3124 sizeof (struct in_addr));
3125 } else if (atsp->ras_inet_type == AF_INET6) {
3126 bcopy(&atsp->ras_sin6.sin6_addr, ar.ar_data,
3127 sizeof (struct in6_addr));
3128 } else
3129 continue;
3130
3131 ibt_status = ibt_register_ar(hca->ibt_clnt_hdl, &ar);
3132 if (ibt_status == IBT_SUCCESS) {
3133 #ifdef DEBUG
3134 if (rib_debug > 1) {
3135 cmn_err(CE_WARN, "rib_register_ats: "
3136 "ibt_register_ar OK on port %d", i+1);
3137 }
3138 #endif
3139 /*
3140 * Allocate and prepare a service entry
3141 */
3142 new_service = kmem_zalloc(sizeof (rib_service_t),
3143 KM_SLEEP);
3144 new_service->srv_port = i + 1;
3145 new_service->srv_ar = ar;
3146 new_service->srv_next = NULL;
3147
3148 /*
3149 * Add to the service list for this HCA
3150 */
3151 new_service->srv_next = hca->ats_list;
3152 hca->ats_list = new_service;
3153 new_service = NULL;
3154 nbinds++;
3155 } else {
3156 #ifdef DEBUG
3157 if (rib_debug > 1) {
3158 cmn_err(CE_WARN, "rib_register_ats: "
3159 "ibt_register_ar FAILED on port %d", i+1);
3160 }
3161 #endif
3162 }
3163 }
3164 }
3165 }
3166
3167 #ifdef DEBUG
3168 if (rib_debug > 1) {
3169 for (temp_srv = hca->ats_list; temp_srv != NULL;
3170 temp_srv = temp_srv->srv_next) {
3171 cmn_err(CE_NOTE, "Service: ATS, active on"
3172 " port: %d\n", temp_srv->srv_port);
3173 }
3174 }
3175 #endif
3176
3177 rw_exit(&hca->service_list_lock);
3178 kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc * sizeof (rpcib_ats_t));
3179 ibt_free_portinfo(port_infop, port_size);
3180
3181 if (nbinds == 0) {
3182 #ifdef DEBUG
3183 if (rib_debug > 1) {
3184 cmn_err(CE_WARN, "rib_register_ats FAILED!\n");
3185 }
3186 #endif
3187 return (RDMA_FAILED);
3188 }
3189 return (RDMA_SUCCESS);
3190 }
3191
3192 static rdma_stat
3193 rib_register_service(rib_hca_t *hca, int service_type)
3194 {
3195 ibt_srv_desc_t sdesc;
3196 ibt_srv_bind_t sbind;
3197 ibt_hca_portinfo_t *port_infop;
3198 ib_svc_id_t srv_id;
3199 ibt_srv_hdl_t srv_hdl;
3200 uint_t port_size;
3201 uint_t pki, i, j, num_ports, nbinds;
3202 ibt_status_t ibt_status;
3203 char **addrs;
3204 int addr_count;
3205 rib_service_t *new_service, *temp_srv;
3206 ib_pkey_t pkey;
3207
3208 /*
3209 * Query all ports for the given HCA
3210 */
3211 rw_enter(&hca->state_lock, RW_READER);
3212 if (hca->state != HCA_DETACHED) {
3213 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3214 &num_ports, &port_size);
3215 rw_exit(&hca->state_lock);
3216 } else {
3217 rw_exit(&hca->state_lock);
3218 return (RDMA_FAILED);
3219 }
3220 if (ibt_status != IBT_SUCCESS) {
3221 #ifdef DEBUG
3222 cmn_err(CE_NOTE, "rib_register_service: FAILED in "
3223 "ibt_query_hca_ports, status = %d\n", ibt_status);
3224 #endif
3225 return (RDMA_FAILED);
3226 }
3227
3228 #ifdef DEBUG
3229 if (rib_debug > 1) {
3230 cmn_err(CE_NOTE, "rib_register_service: Ports detected "
3231 "%d\n", num_ports);
3232
3233 for (i = 0; i < num_ports; i++) {
3234 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3235 cmn_err(CE_WARN, "rib_register_service "
3236 "Port #: %d INACTIVE\n", i+1);
3237 } else {
3239 cmn_err(CE_NOTE, "rib_register_service "
3240 "Port #: %d ACTIVE\n", i+1);
3241 }
3242 }
3243 }
3244 #endif
3245 /*
3246 * Get all the IP addresses on this system to register the
3247 * given "service type" on all DNS recognized IP addrs.
3248 * Each service type, such as NFS, will have all of the
3249 * system's IP addresses as its different names. For now the
3250 * only type of service we support in RPCIB is NFS.
3251 */
3252 addrs = get_ip_addrs(&addr_count);
3253 if (addrs == NULL) {
3254 #ifdef DEBUG
3255 if (rib_debug) {
3256 cmn_err(CE_WARN, "rib_register_service: "
3257 "get_ip_addrs failed\n");
3258 }
3259 #endif
3260 ibt_free_portinfo(port_infop, port_size);
3261 return (RDMA_FAILED);
3262 }
3263
3264 #ifdef DEBUG
3265 if (rib_debug > 1) {
3266 for (i = 0; i < addr_count; i++)
3267 cmn_err(CE_NOTE, "addr %d: %s\n", i, addrs[i]);
3268 }
3269 #endif
3270
3271 rw_enter(&hca->service_list_lock, RW_WRITER);
3272 /*
3273 * Start registering and binding the service on the
3274 * active ports of this HCA.
3275 */
3276 nbinds = 0;
3277 new_service = NULL;
3278
3279 /*
3280 * We use IP addresses as the service names for
3281 * service registration. Register each of them
3282 * with CM to obtain a svc_id and svc_hdl. We do not
3283 * register the service with the machine's loopback address
 * (assumed to be addrs[0]; hence the loop below starts at j = 1).
3284 */
3285 for (j = 1; j < addr_count; j++) {
3286 (void) bzero(&srv_id, sizeof (ib_svc_id_t));
3287 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3288 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3289
3290 sdesc.sd_handler = rib_srv_cm_handler;
3291 sdesc.sd_flags = 0;
3292
3293 ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3294 &sdesc, 0, 1, &srv_hdl, &srv_id);
3295 if (ibt_status != IBT_SUCCESS) {
3296 #ifdef DEBUG
3297 if (rib_debug) {
3298 cmn_err(CE_WARN, "rib_register_service: "
3299 "ibt_register_service FAILED, status "
3300 "= %d\n", ibt_status);
3301 }
3302 #endif
3303 /*
3304 * No need to go on, since we failed to obtain
3305 * a srv_id and srv_hdl. Move on to the next
3306 * IP addr as a service name.
3307 */
3308 continue;
3309 }
3310 for (i = 0; i < num_ports; i++) {
3311 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3312 continue;
3313
3314 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3315 pkey = port_infop[i].p_pkey_tbl[pki];
3316 if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) {
3317
3318 /*
3319 * Allocate and prepare a service entry
3320 */
3321 new_service = kmem_zalloc(1 * sizeof (rib_service_t),
3322 KM_SLEEP);
3323 new_service->srv_type = service_type;
3324 new_service->srv_port = i + 1;
3325 new_service->srv_id = srv_id;
3326 new_service->srv_hdl = srv_hdl;
3327 new_service->srv_sbind_hdl = kmem_zalloc(1 *
3328 sizeof (ibt_sbind_hdl_t), KM_SLEEP);
3329
3330 new_service->srv_name = kmem_zalloc(IB_SVC_NAME_LEN,
3331 KM_SLEEP);
3332 (void) bcopy(addrs[j], new_service->srv_name,
3333 IB_SVC_NAME_LEN);
3334 (void) strlcat(new_service->srv_name, "::NFS",
3335 IB_SVC_NAME_LEN);
3336 new_service->srv_next = NULL;
3337
3338 /*
3339 * Bind the service, specified by the IP address,
3340 * to the port/pkey using the srv_hdl returned
3341 * from ibt_register_service().
3342 */
3343 (void) bzero(&sbind, sizeof (ibt_srv_bind_t));
3344 sbind.sb_pkey = pkey;
3345 sbind.sb_lease = 0xFFFFFFFF;
3346 sbind.sb_key[0] = NFS_SEC_KEY0;
3347 sbind.sb_key[1] = NFS_SEC_KEY1;
3348 sbind.sb_name = new_service->srv_name;
3349
3350 #ifdef DEBUG
3351 if (rib_debug > 1) {
3352 cmn_err(CE_NOTE, "rib_register_service: "
3353 "binding service using name: %s\n",
3354 sbind.sb_name);
3355 }
3356 #endif
3357 ibt_status = ibt_bind_service(srv_hdl,
3358 port_infop[i].p_sgid_tbl[0], &sbind, rib_stat,
3359 new_service->srv_sbind_hdl);
3360 if (ibt_status != IBT_SUCCESS) {
3361 #ifdef DEBUG
3362 if (rib_debug) {
3363 cmn_err(CE_WARN, "rib_register_service: FAILED"
3364 " in ibt_bind_service, status = %d\n",
3365 ibt_status);
3366 }
3367 #endif
3368 kmem_free(new_service->srv_sbind_hdl,
3369 sizeof (ibt_sbind_hdl_t));
3370 kmem_free(new_service->srv_name,
3371 IB_SVC_NAME_LEN);
3372 kmem_free(new_service,
3373 sizeof (rib_service_t));
3374 new_service = NULL;
3375 continue;
3376 }
3377 #ifdef DEBUG
3378 if (rib_debug > 1) {
3379 if (ibt_status == IBT_SUCCESS)
3380 cmn_err(CE_NOTE, "rib_register_service: "
3381 "Serv: %s REGISTERED on port: %d",
3382 sbind.sb_name, i+1);
3383 }
3384 #endif
3385 /*
3386 * Add to the service list for this HCA
3387 */
3388 new_service->srv_next = hca->service_list;
3389 hca->service_list = new_service;
3390 new_service = NULL;
3391 nbinds++;
3392 }
3393 }
3394 }
3395 }
3396 rw_exit(&hca->service_list_lock);
3397
3398 #ifdef DEBUG
3399 if (rib_debug > 1) {
3400 /*
3401 * Change this print to a more generic one, as rpcib
3402 * is supposed to handle multiple service types.
3403 */
3404 for (temp_srv = hca->service_list; temp_srv != NULL;
3405 temp_srv = temp_srv->srv_next) {
3406 cmn_err(CE_NOTE, "NFS-IB, active on port:"
3407 " %d\n"
3408 "Using name: %s", temp_srv->srv_port,
3409 temp_srv->srv_name);
3410 }
3411 }
3412 #endif
3413
3414 ibt_free_portinfo(port_infop, port_size);
3415 for (i = 0; i < addr_count; i++) {
3416 if (addrs[i])
3417 kmem_free(addrs[i], IB_SVC_NAME_LEN);
3418 }
3419 kmem_free(addrs, addr_count * sizeof (char *));
3420
3421 if (nbinds == 0) {
3422 #ifdef DEBUG
3423 if (rib_debug) {
3424 cmn_err(CE_WARN, "rib_register_service: "
3425 "bind_service FAILED!\n");
3426 }
3427 #endif
3428 return (RDMA_FAILED);
3429 } else {
3430 /*
3431 * Put this plugin into the accept state, since at least
3432 * one registration was successful.
3433 */
3434 mutex_enter(&plugin_state_lock);
3435 plugin_state = ACCEPT;
3436 mutex_exit(&plugin_state_lock);
3437 return (RDMA_SUCCESS);
3438 }
3439 }
3440
3441 void
3442 rib_listen(struct rdma_svc_data *rd)
3443 {
3444 rdma_stat status = RDMA_SUCCESS;
3445
3446 rd->active = 0;
3447 rd->err_code = RDMA_FAILED;
3448
3449 /*
3450 * First check if a hca is still attached
3451 */
3452 rw_enter(&rib_stat->hca->state_lock, RW_READER);
3453 if (rib_stat->hca->state != HCA_INITED) {
3454 rw_exit(&rib_stat->hca->state_lock);
3455 return;
3456 }
3457 rw_exit(&rib_stat->hca->state_lock);
3458
3459 rib_stat->q = &rd->q;
3460 /*
3461 * Register the Address translation service
3462 */
3463 mutex_enter(&rib_stat->open_hca_lock);
3464 if (ats_running == 0) {
3465 if (rib_register_ats(rib_stat->hca) != RDMA_SUCCESS) {
3466 #ifdef DEBUG
3467 if (rib_debug) {
3468 cmn_err(CE_WARN,
3469 "rib_listen(): ats registration failed!");
3470 }
3471 #endif
3472 mutex_exit(&rib_stat->open_hca_lock);
3473 return;
3474 } else {
3475 ats_running = 1;
3476 }
3477 }
3478 mutex_exit(&rib_stat->open_hca_lock);
3479
3480 /*
3481 * Right now the only service type is NFS, so hard-code this
3482 * value. Ideally the service type should be communicated by
3483 * passing it down in rdma_svc_data.
3484 */
3485 rib_stat->service_type = NFS;
3486 status = rib_register_service(rib_stat->hca, NFS);
3487 if (status != RDMA_SUCCESS) {
3488 rd->err_code = status;
3489 return;
3490 }
3491 /*
3492 * Service is now active on an HCA; rd->err_code carries the
3493 * final status for callers that need more detail.
3494 */
3495 rd->active = 1;
3496 rd->err_code = status;
3497 }
3498
3499 /* XXXX */
3500 /* ARGSUSED */
3501 static void
3502 rib_listen_stop(struct rdma_svc_data *svcdata)
3503 {
3504 rib_hca_t *hca;
3505
3506 /*
3507 * KRPC called the RDMATF to stop the listeners. This means we
3508 * stop sending incoming or received requests to the KRPC master
3509 * transport handle for RDMA-IB. It also means that the
3510 * master transport handle, responsible for us, is going away.
3511 */
3512 mutex_enter(&plugin_state_lock);
3513 plugin_state = NO_ACCEPT;
3514 if (svcdata != NULL)
3515 svcdata->active = 0;
3516 mutex_exit(&plugin_state_lock);
3517
3518 /*
3519 * First check if a hca is still attached
3520 */
3521 hca = rib_stat->hca;
3522 rw_enter(&hca->state_lock, RW_READER);
3523 if (hca->state != HCA_INITED) {
3524 rw_exit(&hca->state_lock);
3525 return;
3526 }
3527 rib_stop_services(hca);
3528 rw_exit(&hca->state_lock);
3529 }
3530
3531 /*
3532 * Traverse the HCA's service list to unbind and deregister services.
3533 * Instead of unbinding the service for a service handle by
3534 * calling ibt_unbind_service() for each port/pkey, we unbind
3535 * all the services for the service handle by making only one
3536 * call to ibt_unbind_all_services(). Then, we deregister the
3537 * service for the service handle.
3538 *
3539 * When traversing the entries in service_list, we compare the
3540 * srv_hdl of the current entry with that of the next. If they
3541 * are different or if the next entry is NULL, the current entry
3542 * marks the last binding of the service handle. In this case,
3543 * call ibt_unbind_all_services() and deregister the service for
3544 * the service handle. If they are the same, the current and the
3545 * next entries are bound to the same service handle. In this
3546 * case, move on to the next entry.
3547 */
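/*
 * Worked example (editorial): with a service_list of
 *
 *	[hdlA] -> [hdlA] -> [hdlB] -> NULL
 *
 * the loop frees the first hdlA entry without touching the IBT
 * service (its successor shares hdlA), unbinds and deregisters
 * hdlA at the second entry (its successor is hdlB), and unbinds
 * and deregisters hdlB at the last entry (its successor is NULL).
 * Every list entry is freed exactly once.
 */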
3548 static void
3549 rib_stop_services(rib_hca_t *hca)
3550 {
3551 rib_service_t *srv_list, *to_remove;
3552 ibt_status_t ibt_status;
3553
3554 /*
3555 * unbind and deregister the services for this service type.
3556 * Right now there is only one service type. In the future it will
3557 * be passed down to this function.
3558 */
3559 rw_enter(&hca->service_list_lock, RW_WRITER);
3560 srv_list = hca->service_list;
3561 while (srv_list != NULL) {
3562 to_remove = srv_list;
3563 srv_list = to_remove->srv_next;
3564 if (srv_list == NULL ||
3565     to_remove->srv_hdl != srv_list->srv_hdl) {
3566
3567 ibt_status = ibt_unbind_all_services(to_remove->srv_hdl);
3568 if (ibt_status != IBT_SUCCESS) {
3569 cmn_err(CE_WARN, "rib_listen_stop: "
3570 "ibt_unbind_all_services FAILED"
3571 " status: %d\n", ibt_status);
3572 }
3573
3574 ibt_status =
3575 ibt_deregister_service(hca->ibt_clnt_hdl,
3576 to_remove->srv_hdl);
3577 if (ibt_status != IBT_SUCCESS) {
3578 cmn_err(CE_WARN, "rib_listen_stop: "
3579 "ibt_deregister_service FAILED"
3580 " status: %d\n", ibt_status);
3581 }
3582
3583 #ifdef DEBUG
3584 if (rib_debug > 1) {
3585 if (ibt_status == IBT_SUCCESS)
3586 cmn_err(CE_NOTE, "rib_listen_stop: "
3587 "Successfully stopped and"
3588 " UNREGISTERED service: %s\n",
3589 to_remove->srv_name);
3590 }
3591 #endif
3592 }
3593 kmem_free(to_remove->srv_name, IB_SVC_NAME_LEN);
3594 kmem_free(to_remove->srv_sbind_hdl,
3595 sizeof (ibt_sbind_hdl_t));
3596
3597 kmem_free(to_remove, sizeof (rib_service_t));
3598 }
3599 hca->service_list = NULL;
3600 rw_exit(&hca->service_list_lock);
3601 }
3602
3603 static struct svc_recv *
3604 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3605 {
3606 struct svc_recv *recvp;
3607
3608 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3609 recvp->vaddr = sgl->ds_va;
3610 recvp->qp = qp;
3611 recvp->bytes_xfer = 0;
3612 return (recvp);
3613 }
3614
3615 static int
3616 rib_free_svc_recv(struct svc_recv *recvp)
3617 {
3618 kmem_free(recvp, sizeof (*recvp));
3619
3620 return (0);
3621 }
3622
3623 static struct reply *
3624 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3625 {
3626 struct reply *rep;
3627
3628
3629 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3630 if (rep == NULL) {
3631 /* replylist_lock is not yet held here, so it must not be dropped */
3632 cmn_err(CE_WARN, "rib_addreplylist: no memory\n");
3633 return (NULL);
3634 }
3635 rep->xid = msgid;
3636 rep->vaddr_cq = NULL;
3637 rep->bytes_xfer = 0;
3638 rep->status = (uint_t)REPLY_WAIT;
3639 rep->prev = NULL;
3640 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3641
3642 mutex_enter(&qp->replylist_lock);
3643 if (qp->replylist) {
3644 rep->next = qp->replylist;
3645 qp->replylist->prev = rep;
3646 }
3647 qp->rep_list_size++;
3648 if (rib_debug > 1)
3649 cmn_err(CE_NOTE, "rib_addreplylist: qp:%p, rep_list_size:%d\n",
3650 (void *)qp, qp->rep_list_size);
3651 qp->replylist = rep;
3652 mutex_exit(&qp->replylist_lock);
3653
3654 return (rep);
3655 }
3656
3657 static rdma_stat
3658 rib_rem_replylist(rib_qp_t *qp)
3659 {
3660 struct reply *r, *n;
3661
3662 mutex_enter(&qp->replylist_lock);
3663 for (r = qp->replylist; r != NULL; r = n) {
3664 n = r->next;
3665 (void) rib_remreply(qp, r);
3666 }
3667 mutex_exit(&qp->replylist_lock);
3668
3669 return (RDMA_SUCCESS);
3670 }
3671
3672 static int
3673 rib_remreply(rib_qp_t *qp, struct reply *rep)
3674 {
3675
3676 ASSERT(MUTEX_HELD(&qp->replylist_lock));
3677 if (rep->prev) {
3678 rep->prev->next = rep->next;
3679 }
3680 if (rep->next) {
3681 rep->next->prev = rep->prev;
3682 }
3683 if (qp->replylist == rep)
3684 qp->replylist = rep->next;
3685
3686 cv_destroy(&rep->wait_cv);
3687 qp->rep_list_size--;
3688 if (rib_debug > 1)
3689 cmn_err(CE_NOTE, "rib_remreply: qp:%p, rep_list_size:%d\n",
3690 (void *)qp, qp->rep_list_size);
3691
3692 kmem_free(rep, sizeof (*rep));
3693
3694 return (0);
3695 }
3696
3697 rdma_stat
3698 rib_registermem(CONN *conn, caddr_t buf, uint_t buflen,
3699 struct mrc *buf_handle)
3700 {
3701 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
3702 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
3703 rdma_stat status;
3704 rib_hca_t *hca = (ctoqp(conn))->hca;
3705
3706 /*
3707 * Note: ALL buffer pools use the same memory type RDMARW.
3708 */
3709 status = rib_reg_mem(hca, buf, buflen, 0, &mr_hdl, &mr_desc);
3710 if (status == RDMA_SUCCESS) {
3711 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3712 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3713 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3714 } else {
3715 buf_handle->mrc_linfo = NULL;
3716 buf_handle->mrc_lmr = 0;
3717 buf_handle->mrc_rmr = 0;
3718 }
3719 return (status);
3720 }
3721
3722 static rdma_stat
3723 rib_reg_mem(rib_hca_t *hca, caddr_t buf, uint_t size, ibt_mr_flags_t spec,
3724 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3725 {
3726 ibt_mr_attr_t mem_attr;
3727 ibt_status_t ibt_status;
3728
3729 mem_attr.mr_vaddr = (uintptr_t)buf;
3730 mem_attr.mr_len = (ib_msglen_t)size;
3731 mem_attr.mr_as = NULL;
3732 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3733 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3734 IBT_MR_ENABLE_WINDOW_BIND | spec;
3735
3736 rw_enter(&hca->state_lock, RW_READER);
3737 if (hca->state == HCA_INITED) {
3738 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3739 &mem_attr, mr_hdlp, mr_descp);
3740 rw_exit(&hca->state_lock);
3741 } else {
3742 rw_exit(&hca->state_lock);
3743 return (RDMA_FAILED);
3744 }
3745
3746 if (ibt_status != IBT_SUCCESS) {
3747 cmn_err(CE_WARN, "rib_reg_mem: ibt_register_mr "
3748 "(spec:%d) failed for addr %llX, status %d",
3749 spec, (longlong_t)mem_attr.mr_vaddr, ibt_status);
3750 return (RDMA_FAILED);
3751 }
3752 return (RDMA_SUCCESS);
3753 }
3754
3755 rdma_stat
3756 rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen,
3757 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle)
3758 {
3759 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
3760 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
3761 rdma_stat status;
3762 rib_hca_t *hca = (ctoqp(conn))->hca;
3763
3764 /*
3765 * Non-coherent memory registration.
3766 */
3767 status = rib_reg_mem(hca, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl,
3768 &mr_desc);
3769 if (status == RDMA_SUCCESS) {
3770 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3771 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3772 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3773 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3774 } else {
3775 buf_handle->mrc_linfo = NULL;
3776 buf_handle->mrc_lmr = 0;
3777 buf_handle->mrc_rmr = 0;
3778 }
3779 return (status);
3780 }
3781
3782 /* ARGSUSED */
3783 rdma_stat
3784 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3785 {
3786 rib_hca_t *hca = (ctoqp(conn))->hca;
3787
3788 /*
3789 * Allow memory deregistration even if HCA is
3790 * getting detached. Need all outstanding
3791 * memory registrations to be deregistered
3792 * before HCA_DETACH_EVENT can be accepted.
3793 */
3794 (void) ibt_deregister_mr(hca->hca_hdl,
3795 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3796 return (RDMA_SUCCESS);
3797 }
3798
3799 /* ARGSUSED */
3800 rdma_stat
3801 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3802 RIB_SYNCMEM_HANDLE sync_handle)
3803 {
3804 (void) rib_deregistermem(conn, buf, buf_handle);
3805
3806 return (RDMA_SUCCESS);
3807 }
3808
3809 /* ARGSUSED */
3810 rdma_stat
3811 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3812 int len, int cpu)
3813 {
3814 ibt_status_t status;
3815 rib_hca_t *hca = (ctoqp(conn))->hca;
3816 ibt_mr_sync_t mr_segment;
3817
3818 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3819 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3820 mr_segment.ms_len = (ib_memlen_t)len;
3821 if (cpu) {
3822 /* make incoming data visible to memory */
3823 mr_segment.ms_flags = IBT_SYNC_WRITE;
3824 } else {
3825 /* make memory changes visible to IO */
3826 mr_segment.ms_flags = IBT_SYNC_READ;
3827 }
3828 rw_enter(&hca->state_lock, RW_READER);
3829 if (hca->state == HCA_INITED) {
3830 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3831 rw_exit(&hca->state_lock);
3832 } else {
3833 rw_exit(&hca->state_lock);
3834 return (RDMA_FAILED);
3835 }
3836
3837 if (status == IBT_SUCCESS)
3838 return (RDMA_SUCCESS);
3839 else {
3840 #ifdef DEBUG
3841 cmn_err(CE_WARN, "rib_syncmem: ibt_sync_mr failed with %d\n",
3842 status);
3843 #endif
3844 return (RDMA_FAILED);
3845 }
3846 }
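/*
 * Illustrative sketch (editorial): the non-coherent registration
 * and sync lifecycle formed by rib_registermemsync(),
 * rib_syncmem() and rib_deregistermemsync(). buf/len are
 * placeholders; the DMA steps are elided.
 *
 *	(void) rib_registermemsync(conn, buf, len, &mrc, &sh);
 *	...device DMAs into buf...
 *	(void) rib_syncmem(conn, sh, buf, len, 1);	before CPU reads
 *	...CPU writes into buf...
 *	(void) rib_syncmem(conn, sh, buf, len, 0);	before IO reads
 *	(void) rib_deregistermemsync(conn, buf, mrc, sh);
 */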
3847
3848 /*
3849 * XXXX ????
3850 */
3851 static rdma_stat
3852 rib_getinfo(rdma_info_t *info)
3853 {
3854 /*
3855 * XXXX Hack!
3856 */
3857 info->addrlen = 16;
3858 info->mts = 1000000;
3859 info->mtu = 1000000;
3860
3861 return (RDMA_SUCCESS);
3862 }
3863
3864 rib_bufpool_t *
3865 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3866 {
3867 rib_bufpool_t *rbp = NULL;
3868 bufpool_t *bp = NULL;
3869 caddr_t buf;
3870 ibt_mr_attr_t mem_attr;
3871 ibt_status_t ibt_status;
3872 int i, j;
3873
3874 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3875
3876 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3877 num * sizeof (void *), KM_SLEEP);
3878
3879 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3880 bp->numelems = num;
3881
3882 switch (ptype) {
3883 case SEND_BUFFER:
3884 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3885 /* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */
3886 bp->rsize = RPC_MSG_SZ;
3887 break;
3888 case RECV_BUFFER:
3889 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3890 /* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */
3891 bp->rsize = RPC_BUF_SIZE;
3892 break;
3893 default:
3894 goto fail;
3895 }
3896
3897 /*
3898 * Register the pool.
3899 */
3900 bp->bufsize = num * bp->rsize;
3901 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3902 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3903 sizeof (ibt_mr_hdl_t), KM_SLEEP);
3904 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3905 sizeof (ibt_mr_desc_t), KM_SLEEP);
3906
3907 rw_enter(&hca->state_lock, RW_READER);
3908 if (hca->state != HCA_INITED) {
3909 rw_exit(&hca->state_lock);
3910 goto fail;
3911 }
3912 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3913 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3914 mem_attr.mr_vaddr = (uintptr_t)buf;
3915 mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3916 mem_attr.mr_as = NULL;
3917 ibt_status = ibt_register_mr(hca->hca_hdl,
3918 hca->pd_hdl, &mem_attr, &rbp->mr_hdl[i],
3919 &rbp->mr_desc[i]);
3920 if (ibt_status != IBT_SUCCESS) {
3921 for (j = 0; j < i; j++) {
3922 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[j]);
3923 }
3924 rw_exit(&hca->state_lock);
3925 goto fail;
3926 }
3927 }
3928 rw_exit(&hca->state_lock);
3929
3930 buf = (caddr_t)bp->buf;
3931 for (i = 0; i < num; i++, buf += bp->rsize) {
3932 bp->buflist[i] = (void *)buf;
3933 }
3934 bp->buffree = num - 1; /* index of the last free buffer */
3935 rbp->bpool = bp;
3936
3937 return (rbp);
3938 fail:
3939 if (bp) {
3940 if (bp->buf)
3941 kmem_free(bp->buf, bp->bufsize);
3942 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3943 }
3944 if (rbp) {
3945 if (rbp->mr_hdl)
3946 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3947 if (rbp->mr_desc)
3948 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3949 kmem_free(rbp, sizeof (rib_bufpool_t));
3950 }
3951 return (NULL);
3952 }
3953
3954 static void
3955 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3956 {
3957 int i;
3958 rib_bufpool_t *rbp = NULL;
3959 bufpool_t *bp;
3960
3961 /*
3962 * Obtain pool address based on type of pool
3963 */
3964 switch (ptype) {
3965 case SEND_BUFFER:
3966 rbp = hca->send_pool;
3967 break;
3968 case RECV_BUFFER:
3969 rbp = hca->recv_pool;
3970 break;
3971 default:
3972 return;
3973 }
3974 if (rbp == NULL)
3975 return;
3976
3977 bp = rbp->bpool;
3978
3979 /*
3980 * Deregister the pool memory; it is freed in rib_rbufpool_free().
3981 */
3982 for (i = 0; i < bp->numelems; i++) {
3983 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3984 }
3985 }
3986
3987 static void
3988 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3989 {
3990
3991 rib_bufpool_t *rbp = NULL;
3992 bufpool_t *bp;
3993
3994 /*
3995 * Obtain pool address based on type of pool
3996 */
3997 switch (ptype) {
3998 case SEND_BUFFER:
3999 rbp = hca->send_pool;
4000 break;
4001 case RECV_BUFFER:
4002 rbp = hca->recv_pool;
4003 break;
4004 default:
4005 return;
4006 }
4007 if (rbp == NULL)
4008 return;
4009
4010 bp = rbp->bpool;
4011
4012 /*
4013 * Free the pool memory.
4014 */
4015 if (rbp->mr_hdl)
4016 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
4017
4018 if (rbp->mr_desc)
4019 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4020
4021 if (bp->buf)
4022 kmem_free(bp->buf, bp->bufsize);
4023 mutex_destroy(&bp->buflock);
4024 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4025 kmem_free(rbp, sizeof (rib_bufpool_t));
4026 }
4027
4028 void
4029 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4030 {
4031 /*
4032 * Deregister the pool memory and free it.
4033 */
4034 rib_rbufpool_deregister(hca, ptype);
4035 rib_rbufpool_free(hca, ptype);
4036 }
4037
4038 /*
4039 * Fetch a buffer from the pool of type specified in rdbuf->type.
4040 */
4041 static rdma_stat
4042 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4043 {
4044
4045 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4046 if (rdbuf->addr) {
4047 switch (rdbuf->type) {
4048 case SEND_BUFFER:
4049 rdbuf->len = RPC_MSG_SZ; /* 1K */
4050 break;
4051 case RECV_BUFFER:
4052 rdbuf->len = RPC_BUF_SIZE; /* 2K */
4053 break;
4054 default:
4055 rdbuf->len = 0;
4056 }
4057 return (RDMA_SUCCESS);
4058 } else
4059 return (RDMA_FAILED);
4060 }
4061
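/*
 * Editor's note: a usage sketch, not part of the original source,
 * pairing rib_reg_buf_alloc() with rib_reg_buf_free() (defined
 * below). Since the pool size is fixed at creation, RDMA_FAILED
 * here means "no buffer free right now", not an allocation failure.
 */
#if 0
	rdma_buf_t rdbuf;

	bzero(&rdbuf, sizeof (rdbuf));
	rdbuf.type = SEND_BUFFER;
	if (rib_reg_buf_alloc(conn, &rdbuf) == RDMA_SUCCESS) {
		/* rdbuf.addr/rdbuf.len describe a pre-registered buffer */
		/* ... marshal the RPC message into rdbuf.addr ... */
		rib_reg_buf_free(conn, &rdbuf);
	}
#endif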
4062
4063 /*
4064 * Fetch a buffer of specified type.
4065 * Note that rdbuf->handle is mw's rkey.
4066 */
4067 static void *
4068 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4069 {
4070 rib_qp_t *qp = ctoqp(conn);
4071 rib_hca_t *hca = qp->hca;
4072 rdma_btype ptype = rdbuf->type;
4073 void *buf;
4074 rib_bufpool_t *rbp = NULL;
4075 bufpool_t *bp;
4076 int i;
4077
4078 /*
4079 * Obtain pool address based on type of pool
4080 */
4081 switch (ptype) {
4082 case SEND_BUFFER:
4083 rbp = hca->send_pool;
4084 break;
4085 case RECV_BUFFER:
4086 rbp = hca->recv_pool;
4087 break;
4088 default:
4089 return (NULL);
4090 }
4091 if (rbp == NULL)
4092 return (NULL);
4093
4094 bp = rbp->bpool;
4095
4096 mutex_enter(&bp->buflock);
4097 if (bp->buffree < 0) {
4098 cmn_err(CE_WARN, "rib_rbuf_alloc: No free buffers!");
4099 mutex_exit(&bp->buflock);
4100 return (NULL);
4101 }
4102
4103 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4104 buf = bp->buflist[bp->buffree];
4105 rdbuf->addr = buf;
4106 rdbuf->len = bp->rsize;
4107 for (i = bp->numelems - 1; i >= 0; i--) {
4108 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4109 rdbuf->handle.mrc_rmr = (uint32_t)rbp->mr_desc[i].md_rkey;
4110 rdbuf->handle.mrc_linfo = (uintptr_t)rbp->mr_hdl[i];
4111 rdbuf->handle.mrc_lmr = (uint32_t)rbp->mr_desc[i].md_lkey;
4112 bp->buffree--;
4113 if (rib_debug > 1)
4114 cmn_err(CE_NOTE, "rib_rbuf_alloc: %d free bufs "
4115 "(type %d)\n", bp->buffree+1, ptype);
4116
4117 mutex_exit(&bp->buflock);
4118
4119 return (buf);
4120 }
4121 }
4122 cmn_err(CE_WARN, "rib_rbuf_alloc: NO matching buf %p of "
4123 "type %d found!", buf, ptype);
4124 mutex_exit(&bp->buflock);
4125
4126 return (NULL);
4127 }
4128
4129 static void
4130 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4131 {
4132
4133 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4134 }
4135
4136 static void
4137 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4138 {
4139 rib_qp_t *qp = ctoqp(conn);
4140 rib_hca_t *hca = qp->hca;
4141 rib_bufpool_t *rbp = NULL;
4142 bufpool_t *bp;
4143
4144 /*
4145 * Obtain pool address based on type of pool
4146 */
4147 switch (ptype) {
4148 case SEND_BUFFER:
4149 rbp = hca->send_pool;
4150 break;
4151 case RECV_BUFFER:
4152 rbp = hca->recv_pool;
4153 break;
4154 default:
4155 return;
4156 }
4157 if (rbp == NULL)
4158 return;
4159
4160 bp = rbp->bpool;
4161
4162 mutex_enter(&bp->buflock);
4163 if (++bp->buffree >= bp->numelems) {
4164 /*
4165 * Should never happen
4166 */
4167 cmn_err(CE_WARN, "rib_rbuf_free: One (type %d) "
4168 "too many frees!", ptype);
4169 bp->buffree--;
4170 } else {
4171 bp->buflist[bp->buffree] = buf;
4172 if (rib_debug > 1)
4173 cmn_err(CE_NOTE, "rib_rbuf_free: %d free bufs "
4174 "(type %d)\n", bp->buffree+1, ptype);
4175 }
4176 mutex_exit(&bp->buflock);
4177 }
4178
4179 static rdma_stat
4180 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4181 {
4182 rw_enter(&connlist->conn_lock, RW_WRITER);
4183 if (connlist->conn_hd) {
4184 cn->c_next = connlist->conn_hd;
4185 connlist->conn_hd->c_prev = cn;
4186 }
4187 connlist->conn_hd = cn;
4188 rw_exit(&connlist->conn_lock);
4189
4190 return (RDMA_SUCCESS);
4191 }
4192
4193 static rdma_stat
4194 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4195 {
4196 rw_enter(&connlist->conn_lock, RW_WRITER);
4197 if (cn->c_prev) {
4198 cn->c_prev->c_next = cn->c_next;
4199 }
4200 if (cn->c_next) {
4201 cn->c_next->c_prev = cn->c_prev;
4202 }
4203 if (connlist->conn_hd == cn)
4204 connlist->conn_hd = cn->c_next;
4205 rw_exit(&connlist->conn_lock);
4206
4207 return (RDMA_SUCCESS);
4208 }
4209
4210 /*
4211 * Connection management.
4212 * IBTF does not support recycling of channels, so a connection is in
4213 * one of only four states - C_CONN_PEND, C_CONNECTED, C_ERROR, or
4214 * C_DISCONN_PEND; there is no C_IDLE state.
4215 * C_CONN_PEND state: Connection establishment in progress to the server.
4216 * C_CONNECTED state: A fully established connection is in C_CONNECTED state.
4217 * It has an RC channel associated with it. ibt_post_send/recv are allowed
4218 * only in this state.
4219 * C_ERROR state: A connection transitions to this state when WRs on the
4220 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4221 * happens on the channel or an IBT_HCA_DETACH_EVENT occurs on the HCA.
4222 * C_DISCONN_PEND state: When a connection is in C_ERROR state and when
4223 * c_ref drops to 0 (this indicates that RPC has no more references to this
4224 * connection), the connection should be destroyed. A connection transitions
4225 * into this state when it is being destroyed.
4226 */
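/*
 * Editor's summary of the transitions described above (a sketch
 * derived from rib_conn_get() and rib_conn_release() below, not
 * part of the original source):
 *
 *   create --> C_CONN_PEND --(connect ok)--> C_CONNECTED
 *   C_CONN_PEND --(connect fails)--> C_ERROR
 *   C_CONNECTED --(WR error, CONN_CLOSED, HCA_DETACH)--> C_ERROR
 *   C_ERROR --(c_ref drops to 0)--> C_DISCONN_PEND --> destroyed
 */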
4227 static rdma_stat
4228 rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn)
4229 {
4230 CONN *cn;
4231 int status = RDMA_SUCCESS;
4232 rib_hca_t *hca = (rib_hca_t *)handle;
4233 rib_qp_t *qp;
4234 clock_t cv_stat, timout;
4235 ibt_path_info_t path;
4236
4237 again:
4238 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4239 cn = hca->cl_conn_list.conn_hd;
4240 while (cn != NULL) {
4241 /*
4242 * First, clear up any connection in the ERROR state
4243 */
4244 mutex_enter(&cn->c_lock);
4245 if (cn->c_state == C_ERROR) {
4246 if (cn->c_ref == 0) {
4247 /*
4248 * Remove connection from list and destroy it.
4249 */
4250 cn->c_state = C_DISCONN_PEND;
4251 mutex_exit(&cn->c_lock);
4252 rw_exit(&hca->cl_conn_list.conn_lock);
4253 (void) rib_disconnect_channel(cn,
4254 &hca->cl_conn_list);
4255 goto again;
4256 }
4257 mutex_exit(&cn->c_lock);
4258 cn = cn->c_next;
4259 continue;
4260 } else if (cn->c_state == C_DISCONN_PEND) {
4261 mutex_exit(&cn->c_lock);
4262 cn = cn->c_next;
4263 continue;
4264 }
4265 if ((cn->c_raddr.len == svcaddr->len) &&
4266 bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) {
4267 /*
4268 * Our connection. Give up conn list lock
4269 * as we are done traversing the list.
4270 */
4271 rw_exit(&hca->cl_conn_list.conn_lock);
4272 if (cn->c_state == C_CONNECTED) {
4273 cn->c_ref++; /* sharing a conn */
4274 mutex_exit(&cn->c_lock);
4275 *conn = cn;
4276 return (status);
4277 }
4278 if (cn->c_state == C_CONN_PEND) {
4279 /*
4280 * Hold a reference to this conn before
4281 * we give up the lock.
4282 */
4283 cn->c_ref++;
4284 timout = ddi_get_lbolt() +
4285 drv_usectohz(CONN_WAIT_TIME * 1000000);
4286 while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4287 &cn->c_lock, timout)) > 0 &&
4288 cn->c_state == C_CONN_PEND)
4289 ;
4290 if (cv_stat == 0) {
4291 cn->c_ref--;
4292 mutex_exit(&cn->c_lock);
4293 return (RDMA_INTR);
4294 }
4295 if (cv_stat < 0) {
4296 cn->c_ref--;
4297 mutex_exit(&cn->c_lock);
4298 return (RDMA_TIMEDOUT);
4299 }
4300 if (cn->c_state == C_CONNECTED) {
4301 *conn = cn;
4302 mutex_exit(&cn->c_lock);
4303 return (status);
4304 } else {
4305 cn->c_ref--;
4306 mutex_exit(&cn->c_lock);
4307 return (RDMA_TIMEDOUT);
4308 }
4309 }
4310 }
4311 mutex_exit(&cn->c_lock);
4312 cn = cn->c_next;
4313 }
4314 rw_exit(&hca->cl_conn_list.conn_lock);
4315
4316 status = rib_chk_srv_ats(hca, svcaddr, addr_type, &path);
4317 if (status != RDMA_SUCCESS) {
4318 #ifdef DEBUG
4319 if (rib_debug) {
4320 cmn_err(CE_WARN, "rib_conn_get: "
4321 "No server ATS record!");
4322 }
4323 #endif
4324 return (RDMA_FAILED);
4325 }
4326
4327 /*
4328 * Channel to server doesn't exist yet, create one.
4329 */
4330 if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) {
4331 return (RDMA_FAILED);
4332 }
4333 cn = qptoc(qp);
4334 cn->c_state = C_CONN_PEND;
4335 cn->c_ref = 1;
4336
4337 /*
4338 * Add to conn list.
4339 * We gave up the READER lock earlier. In the time since then,
4340 * another thread might have created the connection we are
4341 * attempting here. For now that is quite all right - there
4342 * might be two connections between a pair of hosts instead
4343 * of one. If we really wanted to close that window, we would
4344 * need to re-check the list after acquiring the
4345 * WRITER lock.
4346 */
4347 (void) rib_add_connlist(cn, &hca->cl_conn_list);
4348 status = rib_conn_to_srv(hca, qp, &path);
4349 mutex_enter(&cn->c_lock);
4350 if (status == RDMA_SUCCESS) {
4351 cn->c_state = C_CONNECTED;
4352 *conn = cn;
4353 } else {
4354 cn->c_state = C_ERROR;
4355 cn->c_ref--;
4356 #ifdef DEBUG
4357 if (rib_debug) {
4358 cmn_err(CE_WARN, "rib_conn_get: FAILED creating"
4359 " a channel!");
4360 }
4361 #endif
4362 }
4363 cv_broadcast(&cn->c_cv);
4364 mutex_exit(&cn->c_lock);
4365 return (status);
4366 }
4367
4368 static rdma_stat
4369 rib_conn_release(CONN *conn)
4370 {
4371 rib_qp_t *qp = ctoqp(conn);
4372
4373 mutex_enter(&conn->c_lock);
4374 conn->c_ref--;
4375
4376 /*
4377 * If a conn is C_ERROR, close the channel.
4378 * If it's CONNECTED, keep it that way.
4379 */
4380 if (conn->c_ref == 0 && (conn->c_state & C_ERROR)) {
4381 conn->c_state = C_DISCONN_PEND;
4382 mutex_exit(&conn->c_lock);
4383 if (qp->mode == RIB_SERVER)
4384 (void) rib_disconnect_channel(conn,
4385 &qp->hca->srv_conn_list);
4386 else
4387 (void) rib_disconnect_channel(conn,
4388 &qp->hca->cl_conn_list);
4389 return (RDMA_SUCCESS);
4390 }
4391 mutex_exit(&conn->c_lock);
4392 return (RDMA_SUCCESS);
4393 }
4394
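/*
 * Editor's note: a caller-side sketch, not part of the original
 * source, of the reference protocol implemented by rib_conn_get()
 * and rib_conn_release() above. Every successful get must be
 * balanced by one release; teardown of an errored channel is
 * deferred to the release that drops c_ref to 0.
 */
#if 0
	CONN *conn;

	/* svcaddr is hypothetical; addr_type is an address family */
	if (rib_conn_get(svcaddr, AF_INET, (void *)hca, &conn) ==
	    RDMA_SUCCESS) {
		/* ... post sends/receives on conn ... */
		(void) rib_conn_release(conn);
	}
#endif
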
4395 /*
4396 * Add at front of list
4397 */
4398 static struct rdma_done_list *
4399 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4400 {
4401 struct rdma_done_list *rd;
4402
4403 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4404
4405 rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4406 rd->xid = xid;
4407 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4408
4409 rd->prev = NULL;
4410 rd->next = qp->rdlist;
4411 if (qp->rdlist != NULL)
4412 qp->rdlist->prev = rd;
4413 qp->rdlist = rd;
4414
4415 return (rd);
4416 }
4417
4418 static void
4419 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4420 {
4421 struct rdma_done_list *r;
4422
4423 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4424
4425 r = rd->next;
4426 if (r != NULL) {
4427 r->prev = rd->prev;
4428 }
4429
4430 r = rd->prev;
4431 if (r != NULL) {
4432 r->next = rd->next;
4433 } else {
4434 qp->rdlist = rd->next;
4435 }
4436
4437 cv_destroy(&rd->rdma_done_cv);
4438 kmem_free(rd, sizeof (*rd));
4439 }
4440
4441 static void
4442 rdma_done_rem_list(rib_qp_t *qp)
4443 {
4444 struct rdma_done_list *r, *n;
4445
4446 mutex_enter(&qp->rdlist_lock);
4447 for (r = qp->rdlist; r != NULL; r = n) {
4448 n = r->next;
4449 rdma_done_rm(qp, r);
4450 }
4451 mutex_exit(&qp->rdlist_lock);
4452 }
4453
4454 static void
4455 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4456 {
4457 struct rdma_done_list *r = qp->rdlist;
4458
4459 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4460
4461 while (r) {
4462 if (r->xid == xid) {
4463 cv_signal(&r->rdma_done_cv);
4464 return;
4465 } else {
4466 r = r->next;
4467 }
4468 }
4469 if (rib_debug > 1) {
4470 cmn_err(CE_WARN, "rdma_done_notify: "
4471 "No matching xid for %u, qp %p\n", xid, (void *)qp);
4472 }
4473 }
4474
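/*
 * Editor's note: a sketch, not part of the original source, of the
 * waiting side of the rdma_done list. A sender registers its XID,
 * sleeps on the per-entry CV under rdlist_lock, and is woken by
 * rdma_done_notify() when the peer signals completion for that XID.
 * The timeout value is illustrative only.
 */
#if 0
	struct rdma_done_list *rd;

	mutex_enter(&qp->rdlist_lock);
	rd = rdma_done_add(qp, xid);
	(void) cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock,
	    ddi_get_lbolt() + drv_usectohz(CONN_WAIT_TIME * 1000000));
	rdma_done_rm(qp, rd);
	mutex_exit(&qp->rdlist_lock);
#endif
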
4475 rpcib_ats_t *
4476 get_ibd_entry(ib_gid_t *gid, ib_pkey_t pkey, rpcib_ibd_insts_t *ibds)
4477 {
4478 rpcib_ats_t *atsp;
4479 int i;
4480
4481 for (i = 0, atsp = ibds->rib_ats; i < ibds->rib_ibd_cnt; i++, atsp++) {
4482 if (atsp->ras_port_gid.gid_prefix == gid->gid_prefix &&
4483 atsp->ras_port_gid.gid_guid == gid->gid_guid &&
4484 atsp->ras_pkey == pkey) {
4485 return (atsp);
4486 }
4487 }
4488 return (NULL);
4489 }
4490
4491 int
4492 rib_get_ibd_insts_cb(dev_info_t *dip, void *arg)
4493 {
4494 rpcib_ibd_insts_t *ibds = (rpcib_ibd_insts_t *)arg;
4495 rpcib_ats_t *atsp;
4496 ib_pkey_t pkey;
4497 uint8_t port;
4498 ib_guid_t hca_guid;
4499 ib_gid_t port_gid;
4500
4501 if (i_ddi_devi_attached(dip) &&
4502 (strcmp(ddi_node_name(dip), "ibport") == 0) &&
4503 (strstr(ddi_get_name_addr(dip), "ipib") != NULL)) {
4504
4505 if (ibds->rib_ibd_cnt >= ibds->rib_ibd_alloc) {
4506 rpcib_ats_t *tmp;
4507
4508 tmp = (rpcib_ats_t *)kmem_zalloc((ibds->rib_ibd_alloc +
4509 N_IBD_INSTANCES) * sizeof (rpcib_ats_t), KM_SLEEP);
4510 bcopy(ibds->rib_ats, tmp,
4511 ibds->rib_ibd_alloc * sizeof (rpcib_ats_t));
4512 kmem_free(ibds->rib_ats,
4513 ibds->rib_ibd_alloc * sizeof (rpcib_ats_t));
4514 ibds->rib_ats = tmp;
4515 ibds->rib_ibd_alloc += N_IBD_INSTANCES;
4516 }
4517 if (((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY,
4518 dip, 0, "hca-guid", 0)) == 0) ||
4519 ((port = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4520 0, "port-number", 0)) == 0) ||
4521 (ibt_get_port_state_byguid(hca_guid, port,
4522 &port_gid, NULL) != IBT_SUCCESS) ||
4523 ((pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4524 "port-pkey", IB_PKEY_INVALID_LIMITED)) <=
4525 IB_PKEY_INVALID_FULL)) {
4526 return (DDI_WALK_CONTINUE);
4527 }
4528 atsp = &ibds->rib_ats[ibds->rib_ibd_cnt];
4529 atsp->ras_inst = ddi_get_instance(dip);
4530 atsp->ras_pkey = pkey;
4531 atsp->ras_port_gid = port_gid;
4532 ibds->rib_ibd_cnt++;
4533 }
4534 return (DDI_WALK_CONTINUE);
4535 }
4536
4537 void
4538 rib_get_ibd_insts(rpcib_ibd_insts_t *ibds)
4539 {
4540 ddi_walk_devs(ddi_root_node(), rib_get_ibd_insts_cb, ibds);
4541 }
4542
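/*
 * Editor's note: a sketch, not part of the original source, of how
 * the ibd discovery pieces fit together. The caller seeds the
 * rpcib_ats_t array, walks the device tree for attached ibport
 * nodes, then resolves an IP address for each instance via
 * get_ibd_ipaddr() below.
 */
#if 0
	rpcib_ibd_insts_t ibds;

	ibds.rib_ibd_alloc = N_IBD_INSTANCES;
	ibds.rib_ibd_cnt = 0;
	ibds.rib_ats = kmem_zalloc(ibds.rib_ibd_alloc *
	    sizeof (rpcib_ats_t), KM_SLEEP);
	rib_get_ibd_insts(&ibds);
	if (get_ibd_ipaddr(&ibds) == 0) {
		/* ibds.rib_ats[0 .. rib_ibd_cnt - 1] now carry addresses */
	}
	kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc * sizeof (rpcib_ats_t));
#endif
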
4543 /*
4544 * Fetch the IP address, v4 or v6, of each ibd instance.
4545 */
4546 int
4547 get_ibd_ipaddr(rpcib_ibd_insts_t *ibds)
4548 {
4549 TIUSER *tiptr, *tiptr6;
4550 vnode_t *kvp, *kvp6;
4551 vnode_t *vp = NULL, *vp6 = NULL;
4552 struct strioctl iocb;
4553 struct lifreq lif_req;
4554 int k, ip_cnt;
4555 rpcib_ats_t *atsp;
4556
4557 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP,
4558 &kvp) == 0) {
4559 if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
4560 &tiptr, CRED()) == 0) {
4561 vp = tiptr->fp->f_vnode;
4562 } else {
4563 VN_RELE(kvp);
4564 }
4565 }
4566
4567 if (lookupname("/dev/udp6", UIO_SYSSPACE, FOLLOW, NULLVPP,
4568 &kvp6) == 0) {
4569 if (t_kopen((file_t *)NULL, kvp6->v_rdev, FREAD|FWRITE,
4570 &tiptr6, CRED()) == 0) {
4571 vp6 = tiptr6->fp->f_vnode;
4572 } else {
4573 VN_RELE(kvp6);
4574 }
4575 }
4576
4577 if (vp == NULL && vp6 == NULL)
4578 return (-1);
4579
4580 /* Get ibd ip's */
4581 ip_cnt = 0;
4582 for (k = 0, atsp = ibds->rib_ats; k < ibds->rib_ibd_cnt; k++, atsp++) {
4583 /* IPv4 */
4584 if (vp != NULL) {
4585 (void) bzero((void *)&lif_req, sizeof (struct lifreq));
4586 (void) snprintf(lif_req.lifr_name,
4587 sizeof (lif_req.lifr_name), "%s%d",
4588 IBD_NAME, atsp->ras_inst);
4589
4590 (void) bzero((void *)&iocb, sizeof (struct strioctl));
4591 iocb.ic_cmd = SIOCGLIFADDR;
4592 iocb.ic_timout = 0;
4593 iocb.ic_len = sizeof (struct lifreq);
4594 iocb.ic_dp = (caddr_t)&lif_req;
4595 if (kstr_ioctl(vp, I_STR, (intptr_t)&iocb) == 0) {
4596 atsp->ras_inet_type = AF_INET;
4597 bcopy(&lif_req.lifr_addr, &atsp->ras_sin,
4598 sizeof (struct sockaddr_in));
4599 ip_cnt++;
4600 continue;
4601 }
4602 }
4603 /* Try IPv6 */
4604 if (vp6 != NULL) {
4605 (void) bzero((void *)&lif_req, sizeof (struct lifreq));
4606 (void) snprintf(lif_req.lifr_name,
4607 sizeof (lif_req.lifr_name), "%s%d",
4608 IBD_NAME, atsp->ras_inst);
4609
4610 (void) bzero((void *)&iocb, sizeof (struct strioctl));
4611 iocb.ic_cmd = SIOCGLIFADDR;
4612 iocb.ic_timout = 0;
4613 iocb.ic_len = sizeof (struct lifreq);
4614 iocb.ic_dp = (caddr_t)&lif_req;
4615 if (kstr_ioctl(vp6, I_STR, (intptr_t)&iocb) == 0) {
4616
4617 atsp->ras_inet_type = AF_INET6;
4618 bcopy(&lif_req.lifr_addr, &atsp->ras_sin6,
4619 sizeof (struct sockaddr_in6));
4620 ip_cnt++;
4621 }
4622 }
4623 }
4624
4625 if (vp6 != NULL) {
4626 (void) t_kclose(tiptr6, 0);
4627 VN_RELE(kvp6);
4628 }
4629 if (vp != NULL) {
4630 (void) t_kclose(tiptr, 0);
4631 VN_RELE(kvp);
4632 }
4633
4634 if (ip_cnt == 0)
4635 return (-1);
4636 else
4637 return (0);
4638 }
4639
4640 char **
4641 get_ip_addrs(int *count)
4642 {
4643 TIUSER *tiptr;
4644 vnode_t *kvp;
4645 int num_of_ifs;
4646 char **addresses;
4647 int return_code;
4648
4649 /*
4650 * Open a device for doing downstream kernel ioctls
4651 */
4652 return_code = lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW,
4653 NULLVPP, &kvp);
4654 if (return_code != 0) {
4655 cmn_err(CE_NOTE, "get_ip_addrs: lookupname failed\n");
4656 *count = -1;
4657 return (NULL);
4658 }
4659
4660 return_code = t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
4661 &tiptr, CRED());
4662 if (return_code != 0) {
4663 cmn_err(CE_NOTE, "get_ip_addrs: t_kopen failed\n");
4664 VN_RELE(kvp);
4665 *count = -1;
4666 return (NULL);
4667 }
4668
4669 /*
4670 * Perform the first ioctl to get the number of interfaces
4671 */
4672 return_code = get_interfaces(tiptr, &num_of_ifs);
4673 if (return_code != 0 || num_of_ifs == 0) {
4674 cmn_err(CE_NOTE, "get_ip_addrs: get_interfaces failed\n");
4675 (void) t_kclose(tiptr, 0);
4676 VN_RELE(kvp);
4677 *count = -1;
4678 return (NULL);
4679 }
4680
4681 /*
4682 * Perform the second ioctl to get the address on each interface
4683 * found.
4684 */
4685 addresses = kmem_zalloc(num_of_ifs * sizeof (char *), KM_SLEEP);
4686 return_code = find_addrs(tiptr, addresses, num_of_ifs);
4687 if (return_code <= 0) {
4688 cmn_err(CE_NOTE, "get_ip_addrs: find_addrs failed\n");
4689 (void) t_kclose(tiptr, 0);
4690 kmem_free(addresses, num_of_ifs * sizeof (char *));
4691 VN_RELE(kvp);
4692 *count = -1;
4693 return (NULL);
4694 }
4695
4696 *count = return_code;
4697 VN_RELE(kvp);
4698 (void) t_kclose(tiptr, 0);
4699 return (addresses);
4700 }
4701
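/*
 * Editor's note (not in the original source): get_ip_addrs() hands
 * ownership of the string array to its caller. Each string is an
 * IB_SVC_NAME_LEN-byte allocation. The array itself was sized by
 * the interface count inside get_ip_addrs(), which can exceed the
 * returned address count, so the caller must track that size
 * separately when freeing the array.
 */
#if 0
	char **addrs;
	int count, i;

	addrs = get_ip_addrs(&count);
	if (addrs != NULL) {
		for (i = 0; i < count; i++) {
			/* e.g. build an ATS service name from addrs[i] */
			kmem_free(addrs[i], IB_SVC_NAME_LEN);
		}
	}
#endif
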
4702 int
4703 get_interfaces(TIUSER *tiptr, int *num)
4704 {
4705 struct lifnum if_buf;
4706 struct strioctl iocb;
4707 vnode_t *vp;
4708 int return_code;
4709
4710 /*
4711 * Prep the number of interfaces request buffer for ioctl
4712 */
4713 (void) bzero((void *)&if_buf, sizeof (struct lifnum));
4714 if_buf.lifn_family = AF_UNSPEC;
4715 if_buf.lifn_flags = 0;
4716
4717 /*
4718 * Prep the kernel ioctl buffer and send it downstream
4719 */
4720 (void) bzero((void *)&iocb, sizeof (struct strioctl));
4721 iocb.ic_cmd = SIOCGLIFNUM;
4722 iocb.ic_timout = 0;
4723 iocb.ic_len = sizeof (if_buf);
4724 iocb.ic_dp = (caddr_t)&if_buf;
4725
4726 vp = tiptr->fp->f_vnode;
4727 return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
4728 if (return_code != 0) {
4729 cmn_err(CE_NOTE, "get_interfaces: kstr_ioctl failed\n");
4730 *num = -1;
4731 return (-1);
4732 }
4733
4734 *num = if_buf.lifn_count;
4735 #ifdef DEBUG
4736 if (rib_debug > 1)
4737 cmn_err(CE_NOTE, "Number of interfaces detected: %d\n",
4738 if_buf.lifn_count);
4739 #endif
4740 return (0);
4741 }
4742
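/*
 * Editor's note: SIOCGLIFNUM followed by SIOCGLIFCONF is the usual
 * "count, then fetch" pattern. The count can go stale if interfaces
 * are plumbed between the two ioctls; find_addrs() below therefore
 * sizes its loop from the lifc_len that comes back with the
 * SIOCGLIFCONF reply rather than from the earlier count alone.
 */
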
4743 int
4744 find_addrs(TIUSER *tiptr, char **addrs, int num_ifs)
4745 {
4746 struct lifconf lifc;
4747 struct lifreq *if_data_buf;
4748 struct strioctl iocb;
4749 caddr_t request_buffer;
4750 struct sockaddr_in *sin4;
4751 struct sockaddr_in6 *sin6;
4752 vnode_t *vp;
4753 int i, count, return_code;
4754
4755 /*
4756 * Prep the buffer for requesting all interfaces' info
4757 */
4758 (void) bzero((void *)&lifc, sizeof (struct lifconf));
4759 lifc.lifc_family = AF_UNSPEC;
4760 lifc.lifc_flags = 0;
4761 lifc.lifc_len = num_ifs * sizeof (struct lifreq);
4762
4763 request_buffer = kmem_zalloc(num_ifs * sizeof (struct lifreq),
4764 KM_SLEEP);
4765
4766 lifc.lifc_buf = request_buffer;
4767
4768 /*
4769 * Prep the kernel ioctl buffer and send it down stream
4770 */
4771 (void) bzero((void *)&iocb, sizeof (struct strioctl));
4772 iocb.ic_cmd = SIOCGLIFCONF;
4773 iocb.ic_timout = 0;
4774 iocb.ic_len = sizeof (struct lifconf);
4775 iocb.ic_dp = (caddr_t)&lifc;
4776
4777 vp = tiptr->fp->f_vnode;
4778 return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
4779 if (return_code != 0) {
4780 cmn_err(CE_NOTE, "find_addrs: kstr_ioctl failed\n");
4781 kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
4782 return (-1);
4783 }
4784
4785 /*
4786 * Extract the addresses and fill in the requested array.
4787 * IB_SVC_NAME_LEN is defined to be 64, so it covers both IPv4 and
4788 * IPv6. Here count is the number of IP addresses collected.
4789 */
4790 if_data_buf = lifc.lifc_req;
4791 count = 0;
4792 for (i = lifc.lifc_len / sizeof (struct lifreq); i > 0; i--,
4793 if_data_buf++) {
4794 if (if_data_buf->lifr_addr.ss_family == AF_INET) {
4795 sin4 = (struct sockaddr_in *)&if_data_buf->lifr_addr;
4796 addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
4797 (void) inet_ntop(AF_INET, &sin4->sin_addr,
4798 addrs[count], IB_SVC_NAME_LEN);
4799 count++;
4800 }
4801
4802 if (if_data_buf->lifr_addr.ss_family == AF_INET6) {
4803 sin6 = (struct sockaddr_in6 *)&if_data_buf->lifr_addr;
4804 addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
4805 (void) inet_ntop(AF_INET6, &sin6->sin6_addr,
4806 addrs[count], IB_SVC_NAME_LEN);
4807 count++;
4808 }
4809 }
4810
4811 kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
4812 return (count);
4813 }
4814
4815 /*
4816 * Goes through all connections and closes each channel.
4817 * This will cause all the WRs on those channels to be
4818 * flushed.
4819 */
4820 static void
4821 rib_close_channels(rib_conn_list_t *connlist)
4822 {
4823 CONN *conn;
4824 rib_qp_t *qp;
4825
4826 rw_enter(&connlist->conn_lock, RW_READER);
4827 conn = connlist->conn_hd;
4828 while (conn != NULL) {
4829 mutex_enter(&conn->c_lock);
4830 qp = ctoqp(conn);
4831 if (conn->c_state & C_CONNECTED) {
4832 /*
4833 * Live connection in CONNECTED state.
4834 * Call ibt_close_rc_channel in nonblocking mode
4835 * with no callbacks.
4836 */
4837 conn->c_state = C_ERROR;
4838 (void) ibt_close_rc_channel(qp->qp_hdl,
4839 IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
4840 (void) ibt_free_channel(qp->qp_hdl);
4841 qp->qp_hdl = NULL;
4842 } else {
4843 if (conn->c_state == C_ERROR &&
4844 qp->qp_hdl != NULL) {
4845 /*
4846 * Connection in ERROR state but
4847 * channel is not yet freed.
4848 */
4849 (void) ibt_close_rc_channel(qp->qp_hdl,
4850 IBT_NOCALLBACKS, NULL, 0, NULL,
4851 NULL, 0);
4852 (void) ibt_free_channel(qp->qp_hdl);
4853 qp->qp_hdl = NULL;
4854 }
4855 }
4856 mutex_exit(&conn->c_lock);
4857 conn = conn->c_next;
4858 }
4859 rw_exit(&connlist->conn_lock);
4860 }
4861
4862 /*
4863 * Frees up all connections that are no longer being referenced
4864 */
4865 static void
4866 rib_purge_connlist(rib_conn_list_t *connlist)
4867 {
4868 CONN *conn;
4869
4870 top:
4871 rw_enter(&connlist->conn_lock, RW_READER);
4872 conn = connlist->conn_hd;
4873 while (conn != NULL) {
4874 mutex_enter(&conn->c_lock);
4875
4876 /*
4877 * At this point connection is either in ERROR
4878 * or DISCONN_PEND state. If in DISCONN_PEND state
4879 * then some other thread is culling that connection.
4880 * If not and if c_ref is 0, then destroy the connection.
4881 */
4882 if (conn->c_ref == 0 &&
4883 conn->c_state != C_DISCONN_PEND) {
4884 /*
4885 * Cull the connection
4886 */
4887 conn->c_state = C_DISCONN_PEND;
4888 mutex_exit(&conn->c_lock);
4889 rw_exit(&connlist->conn_lock);
4890 (void) rib_disconnect_channel(conn, connlist);
4891 goto top;
4892 } else {
4893 /*
4894 * conn disconnect already scheduled or will
4895 * happen from conn_release when c_ref drops to 0.
4896 */
4897 mutex_exit(&conn->c_lock);
4898 }
4899 conn = conn->c_next;
4900 }
4901 rw_exit(&connlist->conn_lock);
4902
4903 /*
4904 * At this point, only connections with c_ref != 0 are on the list
4905 */
4906 }
4907
4908 /*
4909 * Cleans and closes up all uses of the HCA
4910 */
4911 static void
4912 rib_detach_hca(rib_hca_t *hca)
4913 {
4914
4915 /*
4916 * Stop all services on the HCA
4917 * Go through cl_conn_list and close all rc_channels
4918 * Go through svr_conn_list and close all rc_channels
4919 * Free connections whose c_ref has dropped to 0
4920 * Destroy all CQs
4921 * Deregister and release all buffer pool memory after all
4922 * connections are destroyed
4923 * Free the protection domain
4924 * ibt_close_hca()
4925 */
4926 rw_enter(&hca->state_lock, RW_WRITER);
4927 if (hca->state == HCA_DETACHED) {
4928 rw_exit(&hca->state_lock);
4929 return;
4930 }
4931
4932 hca->state = HCA_DETACHED;
4933 rib_stat->nhca_inited--;
4934
4935 rib_stop_services(hca);
4936 rib_deregister_ats();
4937 rib_close_channels(&hca->cl_conn_list);
4938 rib_close_channels(&hca->srv_conn_list);
4939 rw_exit(&hca->state_lock);
4940
4941 rib_purge_connlist(&hca->cl_conn_list);
4942 rib_purge_connlist(&hca->srv_conn_list);
4943
4944 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4945 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4946 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4947 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4948 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4949 kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4950 kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4951 kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4952
4953 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4954 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4955 if (hca->srv_conn_list.conn_hd == NULL &&
4956 hca->cl_conn_list.conn_hd == NULL) {
4957 /*
4958 * Both conn lists are empty, so destroy the
4959 * buffers, close the HCA, and be done.
4960 */
4961 rib_rbufpool_destroy(hca, RECV_BUFFER);
4962 rib_rbufpool_destroy(hca, SEND_BUFFER);
4963 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4964 (void) ibt_close_hca(hca->hca_hdl);
4965 hca->hca_hdl = NULL;
4966 }
4967 rw_exit(&hca->cl_conn_list.conn_lock);
4968 rw_exit(&hca->srv_conn_list.conn_lock);
4969
4970 if (hca->hca_hdl != NULL) {
4971 mutex_enter(&hca->inuse_lock);
4972 while (hca->inuse)
4973 cv_wait(&hca->cb_cv, &hca->inuse_lock);
4974 mutex_exit(&hca->inuse_lock);
4975 /*
4976 * The conn lists are now empty, so destroy the
4977 * buffers, close the HCA, and be done.
4978 */
4979 rib_rbufpool_destroy(hca, RECV_BUFFER);
4980 rib_rbufpool_destroy(hca, SEND_BUFFER);
4981 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4982 (void) ibt_close_hca(hca->hca_hdl);
4983 hca->hca_hdl = NULL;
4984 }
4985 }