New rpcib.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27
28 /* Copyright (c) 2006, The Ohio State University. All rights reserved.
29 *
30 * Portions of this source code are developed by the team members of
31 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
32 * headed by Professor Dhabaleswar K. (DK) Panda.
33 *
34 * Acknowledgements to contributions from developers:
35 * Ranjit Noronha: noronha@cse.ohio-state.edu
36 * Lei Chai : chail@cse.ohio-state.edu
37 * Weikuan Yu : yuw@cse.ohio-state.edu
38 *
39 */
40
41 #pragma ident "@(#)rpcib.c 1.29 06/01/25 SMI"
42
43 /*
44 * The rpcib plugin. Implements the interface for RDMATF's
45 * interaction with IBTF.
46 */
47
48 #include <sys/param.h>
49 #include <sys/types.h>
50 #include <sys/user.h>
51 #include <sys/systm.h>
52 #include <sys/sysmacros.h>
53 #include <sys/proc.h>
54 #include <sys/socket.h>
55 #include <sys/file.h>
56 #include <sys/stream.h>
57 #include <sys/strsubr.h>
58 #include <sys/stropts.h>
59 #include <sys/errno.h>
60 #include <sys/kmem.h>
61 #include <sys/debug.h>
62 #include <sys/systm.h>
63 #include <sys/pathname.h>
64 #include <sys/kstat.h>
65 #include <sys/t_lock.h>
66 #include <sys/ddi.h>
67 #include <sys/cmn_err.h>
68 #include <sys/time.h>
69 #include <sys/isa_defs.h>
70 #include <sys/callb.h>
71 #include <sys/sunddi.h>
72 #include <sys/sunndi.h>
73
74 /* #define IB_FMR_SUP */
75 /* #define CLNT_POLL_CQ */
76 #include <sys/ib/ibtl/ibti.h>
77 #include <rpc/rpc.h>
78 #include <rpc/ib.h>
79
80 #include <sys/modctl.h>
81
82 #include <sys/pathname.h>
83 #include <sys/kstr.h>
84 #include <sys/sockio.h>
85 #include <sys/vnode.h>
86 #include <sys/tiuser.h>
87 #include <net/if.h>
88 #include <sys/cred.h>
89 #include <rpc/rpc_rdma.h>
90
91 int num_clients = 0;
92 volatile uint32_t is_server = 0;
93
94 extern char *inet_ntop(int, const void *, char *, int);
95
96
97 /*
98 * Prototype declarations for driver ops
99 */
100
101 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
102 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
103 void *, void **);
104 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
105
106
107 /* rpcib cb_ops */
108 static struct cb_ops rpcib_cbops = {
109 nulldev, /* open */
110 nulldev, /* close */
111 nodev, /* strategy */
112 nodev, /* print */
113 nodev, /* dump */
114 nodev, /* read */
115 nodev, /* write */
116 nodev, /* ioctl */
117 nodev, /* devmap */
118 nodev, /* mmap */
119 nodev, /* segmap */
120 nochpoll, /* poll */
121 ddi_prop_op, /* prop_op */
122 NULL, /* stream */
123 D_MP, /* cb_flag */
124 CB_REV, /* rev */
125 nodev, /* int (*cb_aread)() */
126 nodev /* int (*cb_awrite)() */
127 };
128
129
130
131
132 /*
133 * Device options
134 */
135 static struct dev_ops rpcib_ops = {
136 DEVO_REV, /* devo_rev, */
137 0, /* refcnt */
138 rpcib_getinfo, /* info */
139 nulldev, /* identify */
140 nulldev, /* probe */
141 rpcib_attach, /* attach */
142 rpcib_detach, /* detach */
143 nodev, /* reset */
144 &rpcib_cbops, /* driver ops - devctl interfaces */
145 NULL, /* bus operations */
146 NULL /* power */
147 };
148
149 /*
150 * Module linkage information.
151 */
152
153 static struct modldrv rib_modldrv = {
154 &mod_driverops, /* Driver module */
155 "RPCIB plugin driver, ver 1.29", /* Driver name and version */
156 &rpcib_ops, /* Driver ops */
157 };
158
159 static struct modlinkage rib_modlinkage = {
160 MODREV_1,
161 (void *)&rib_modldrv,
162 NULL
163 };
164
165 #ifdef SERVER_REG_CACHE
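/*
 * Node in the per-HCA AVL tree that implements the server-side
 * registration cache: len is the buffer length this node caches,
 * elements counts the cached entries, r anchors the cached
 * (pre-registered) buffers, and node_lock protects the node.
 */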
166 typedef struct cache_struct {
167 avl_node_t avl_link;
168 rib_lrc_entry_t r;
169 uint32_t len;
170 uint32_t elements;
171 kmutex_t node_lock;
172 } cache_avl_struct_t;
173
174
175 #if 1
176 int rib_total_buffers = 0;
177 #endif
178 #endif
179 /*
180 * rib_stat: private data pointer used when registering
181 * with the IBTF. It is returned to the consumer
182 * in all callbacks.
183 */
184 static rpcib_state_t *rib_stat = NULL;
185
186 #define RNR_RETRIES IBT_RNR_INFINITE_RETRY
187 #define MAX_PORTS 2
188
189 #ifdef IB_FMR_SUP
190 #define IB_FMR_DIRTY_MARK 32
191 #define IB_FMR_MAX_SIZE 1048576
192 /*#define IB_FMR_MAX_SIZE 32768 */
193 #endif
194
195 int preposted_rbufs = RDMA_BUFS_GRANT;
196 int send_threshold = 1;
197
198 /*
199 * State of the plugin.
200 * ACCEPT = accepting new connections and requests.
201 * NO_ACCEPT = not accepting new connection and requests.
202 * This should eventually move into the rpcib_state_t structure, since it
203 * will indicate the state the plugin is in for a particular type of service
204 * like NFS, NLM or the v4 callback daemon. The plugin might be in accept
205 * state for one and in no_accept state for the other.
206 */
207 int plugin_state;
208 kmutex_t plugin_state_lock;
209
210
211 /*
212 * RPCIB RDMATF operations
213 */
214 #if defined(MEASURE_POOL_DEPTH)
215 static void rib_posted_rbufs(uint32_t x) { return;}
216 #endif
217 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
218 static rdma_stat rib_disconnect(CONN *conn);
219 static void rib_listen(struct rdma_svc_data *rd);
220 static void rib_listen_stop(struct rdma_svc_data *rd);
221 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
222 struct mrc *buf_handle);
223 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
224 struct mrc buf_handle);
225 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
226 caddr_t buf, uint_t buflen, struct mrc *buf_handle);
227 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
228 struct mrc buf_handle);
229 #ifdef SERVER_REG_CACHE
230 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
231 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc);
232 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
233 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
234 #else
235 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
236 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle);
237 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
238 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle);
239
240 #endif
241 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
242 caddr_t buf, int len, int cpu);
243
244 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
245
246 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
247 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
248
249 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
250
251 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
252 #if defined (CLNT_INTERRUPT_COAL)
253 static void rib_scq_free(caddr_t);
254 static rdma_stat rib_send_bl(CONN *conn, struct clist *cl, uint32_t msgid);
255 #endif
256 #if defined(ASYNC_SERVER_DEREG)
257 static rdma_stat rib_send_nw(CONN *conn, struct clist *cl, uint32_t msgid, caddr_t, caddr_t, int, caddr_t, int, int, int);
258 #endif
259 #if defined(ASYNC_CLIENT_DEREG)
260 static void insert_queue(CONN *conn, struct clist *rwc);
261 #endif
262 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
263 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
264 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
265 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
266 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
267 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
268 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
269 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
270 static rdma_stat rib_conn_release(CONN *conn);
271 static rdma_stat rib_getinfo(rdma_info_t *info);
272 #ifdef DYNAMIC_CREDIT_CONTROL
273 void rib_get_resource_info(CONN *, int *, int *);
274 #endif
275
276 #ifdef SERVER_REG_CACHE
277 static rib_lrc_entry_t *rib_get_server_cache_buf(CONN *conn, uint32_t len);
278 static void rib_free_server_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
279 static void rib_destroy_cache(rib_hca_t *hca);
280 static void
281 rib_server_side_cache_reclaim(void *argp);
282 static int avl_compare(const void *t1, const void *t2);
283 #endif
284
285 static rdma_stat rib_register_ats(rib_hca_t *);
286 static void rib_deregister_ats();
287 static void rib_stop_services(rib_hca_t *);
288
289 /*
290 * RPCIB addressing operations
291 */
292 char ** get_ip_addrs(int *count);
293 int get_interfaces(TIUSER *tiptr, int *num);
294 int find_addrs(TIUSER *tiptr, char **addrs, int num_ifs);
295 int get_ibd_ipaddr(rpcib_ibd_insts_t *);
296 rpcib_ats_t *get_ibd_entry(ib_gid_t *, ib_pkey_t, rpcib_ibd_insts_t *);
297 void rib_get_ibd_insts(rpcib_ibd_insts_t *);
298 #if defined(ASYNC_SERVER_DEREG)||defined(ASYNC_CLIENT_DEREG)
299 static int clist_deregister1(CONN *, struct clist *, bool_t );
300 #endif
301
302 #if defined(ASYNC_CLIENT_DEREG)
303 typedef struct async_dereg {
304 struct async_dereg *forw;
305 struct async_dereg *back;
306 CONN c_conn;
307 struct clist c_clist;
308 } ASYNC;
309 static void async_dereg_thread(caddr_t arg);
310 extern pri_t minclsyspri; /* priority for taskq */
311 static ASYNC rqueue;
312 static kmutex_t at_mutex;
313 static kcondvar_t at_cond;
314 #endif
315 /*
316 * RDMA operations the RPCIB module exports
317 */
318 static rdmaops_t rib_ops = {
319 rib_reachable,
320 rib_conn_get,
321 rib_conn_release,
322 rib_listen,
323 rib_listen_stop,
324 rib_registermem,
325 rib_deregistermem,
326 rib_registermemsync,
327 rib_deregistermemsync,
328 rib_syncmem,
329 rib_reg_buf_alloc,
330 rib_reg_buf_free,
331 rib_send,
332 #if defined (CLNT_INTERRUPT_COAL)
333 rib_send_bl,
334 #endif
335 #if defined(ASYNC_SERVER_DEREG)
336 rib_send_nw,
337 #endif
338 rib_send_resp,
339 rib_post_resp,
340 rib_post_recv,
341 rib_recv,
342 rib_read,
343 rib_write,
344 rib_getinfo,
345 #ifdef SERVER_REG_CACHE
346 rib_get_server_cache_buf,
347 rib_free_server_cache_buf,
348 #endif
349 #ifdef DYNAMIC_CREDIT_CONTROL
350 rib_get_resource_info,
351 #endif
352 #if defined(ASYNC_CLIENT_DEREG)
353 insert_queue,
354 #endif
355 };
356
357 /*
358 * RDMATF RPCIB plugin details
359 */
360 static rdma_mod_t rib_mod = {
361 "ibtf", /* api name */
362 RDMATF_VERS_1,
363 0,
364 &rib_ops, /* rdma op vector for ibtf */
365 };
366
367 static rdma_stat open_hcas(rpcib_state_t *);
368 static rdma_stat rib_qp_init(rib_qp_t *, int);
369 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
370 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
371 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
372 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
373 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
374 #ifdef IB_FMR_SUP
375 static rdma_stat rib_reg_mem_fmr(rib_hca_t *, caddr_t adsp,caddr_t, uint_t, ibt_mr_flags_t,
376 ibt_mr_hdl_t *, ibt_ma_hdl_t *, ibt_pmr_desc_t *);
377 #endif
378 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t, ibt_mr_flags_t,
379 ibt_mr_hdl_t *, ibt_mr_desc_t *);
380 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
381 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
382 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *);
383 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
384 rib_qp_t **);
385 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
386 rib_qp_t **);
387 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
388 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
389 static int rib_free_sendwait(struct send_wid *);
390 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
391 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
392 static void rdma_done_rem_list(rib_qp_t *);
393 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
394
395 static void rib_async_handler(void *,
396 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
397 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
398 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
399 static int rib_free_svc_recv(struct svc_recv *);
400 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
401 static void rib_free_wid(struct recv_wid *);
402 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
403 static void rib_detach_hca(rib_hca_t *);
404 static rdma_stat rib_chk_srv_ats(rib_hca_t *, struct netbuf *, int,
405 ibt_path_info_t *);
406
407 /*
408 * Registration with IBTF as a consumer
409 */
410 static struct ibt_clnt_modinfo_s rib_modinfo = {
411 IBTI_V2,
412 IBT_GENERIC,
413 rib_async_handler, /* async event handler */
414 NULL, /* Memory Region Handler */
415 "nfs/ib"
416 };
417
418 /*
419 * Global structure
420 */
421
422 typedef struct rpcib_s {
423 dev_info_t *rpcib_dip;
424 kmutex_t rpcib_mutex;
425 } rpcib_t;
426
427 rpcib_t rpcib;
428
429 /*
430 * /etc/system controlled variable to control
431 * debugging in rpcib kernel module.
432 * Set it to values greater than 1 to control
433 * the amount of debugging messages printed.
434 */
435 int rib_debug = 0;
436 #if defined(CLNT_POLL_CQ)
437 int max_poll_count = 500;
438 #endif
439 static int ats_running = 0;
440
441
442 int
443 _init(void)
444 {
445 int error;
446
447 error = mod_install((struct modlinkage *)&rib_modlinkage);
448 if (error != 0) {
449 /*
450 * Could not load module
451 */
452 return (error);
453 }
454 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
455
456 return (0);
457 }
458
459 int
460 _fini()
461 {
462 int status;
463
464 if ((status = rdma_unregister_mod(&rib_mod)) != RDMA_SUCCESS) {
465 return (EBUSY);
466 }
467
468 rib_deregister_ats();
469
470 /*
471 * Remove module
472 */
473 if ((status = mod_remove(&rib_modlinkage)) != 0) {
474 (void) rdma_register_mod(&rib_mod);
475 return (status);
476 }
477 mutex_destroy(&plugin_state_lock);
478 return (0);
479 }
480
481 int
482 _info(struct modinfo *modinfop)
483 {
484 return (mod_info(&rib_modlinkage, modinfop));
485 }
486
487
488 /*
489 * rpcib_getinfo()
490 * Given the device number, return the devinfo pointer or the
491 * instance number.
492 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
493 */
494
495 /*ARGSUSED*/
496 static int
497 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
498 {
499 int ret = DDI_SUCCESS;
500
501 switch (cmd) {
502 case DDI_INFO_DEVT2DEVINFO:
503 if (rpcib.rpcib_dip != NULL)
504 *result = rpcib.rpcib_dip;
505 else {
506 *result = NULL;
507 ret = DDI_FAILURE;
508 }
509 break;
510
511 case DDI_INFO_DEVT2INSTANCE:
512 *result = NULL;
513 break;
514
515 default:
516 ret = DDI_FAILURE;
517 }
518 return (ret);
519 }
520
521 static int
522 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
523 {
524 ibt_status_t ibt_status;
525 rdma_stat r_status;
526
527 switch (cmd) {
528 case DDI_ATTACH:
529 break;
530 case DDI_RESUME:
531 return (DDI_SUCCESS);
532 default:
533 return (DDI_FAILURE);
534 }
535
536 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
537
538 mutex_enter(&rpcib.rpcib_mutex);
539 if (rpcib.rpcib_dip != NULL) {
540 mutex_exit(&rpcib.rpcib_mutex);
541 return (DDI_FAILURE);
542 }
543 rpcib.rpcib_dip = dip;
544 mutex_exit(&rpcib.rpcib_mutex);
545 /*
546 * Create the "rpcib" minor-node.
547 */
548 if (ddi_create_minor_node(dip,
549 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
550 /* No cmn_err message here; it would print on the console */
551 return (DDI_FAILURE);
552 }
553
554 if (rib_stat == NULL) {
555 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
556 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
557 }
558
559 rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
560 if (rib_stat->hca_count < 1) {
561 mutex_destroy(&rib_stat->open_hca_lock);
562 kmem_free(rib_stat, sizeof (*rib_stat));
563 rib_stat = NULL;
564 return (DDI_FAILURE);
565 }
566
567 ibt_status = ibt_attach(&rib_modinfo, dip,
568 (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
569 if (ibt_status != IBT_SUCCESS) {
570 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
571 mutex_destroy(&rib_stat->open_hca_lock);
572 kmem_free(rib_stat, sizeof (*rib_stat));
573 rib_stat = NULL;
574 return (DDI_FAILURE);
575 }
576
577 mutex_enter(&rib_stat->open_hca_lock);
578 if (open_hcas(rib_stat) != RDMA_SUCCESS) {
579 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
580 (void) ibt_detach(rib_stat->ibt_clnt_hdl);
581 mutex_exit(&rib_stat->open_hca_lock);
582 mutex_destroy(&rib_stat->open_hca_lock);
583 kmem_free(rib_stat, sizeof (*rib_stat));
584 rib_stat = NULL;
585 return (DDI_FAILURE);
586 }
587 mutex_exit(&rib_stat->open_hca_lock);
588
589 /*
590 * Register with rdmatf
591 */
592 rib_mod.rdma_count = rib_stat->hca_count;
593 r_status = rdma_register_mod(&rib_mod);
594 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
595 rib_detach_hca(rib_stat->hca);
596 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
597 (void) ibt_detach(rib_stat->ibt_clnt_hdl);
598 mutex_destroy(&rib_stat->open_hca_lock);
599 kmem_free(rib_stat, sizeof (*rib_stat));
600 rib_stat = NULL;
601 return (DDI_FAILURE);
602 }
603
604
605 return (DDI_SUCCESS);
606 }
607
608 /*ARGSUSED*/
609 static int
610 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
611 {
612 switch (cmd) {
613
614 case DDI_DETACH:
615 break;
616
617 case DDI_SUSPEND:
618 default:
619 return (DDI_FAILURE);
620 }
621
622 /*
623 * Detach the hca and free resources
624 */
625 mutex_enter(&plugin_state_lock);
626 plugin_state = NO_ACCEPT;
627 mutex_exit(&plugin_state_lock);
628 rib_detach_hca(rib_stat->hca);
629 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
630 (void) ibt_detach(rib_stat->ibt_clnt_hdl);
631
632 mutex_enter(&rpcib.rpcib_mutex);
633 rpcib.rpcib_dip = NULL;
634 mutex_exit(&rpcib.rpcib_mutex);
635
636 mutex_destroy(&rpcib.rpcib_mutex);
637 return (DDI_SUCCESS);
638 }
639
640
641 static void
642 rib_deregister_ats()
643 {
644 rib_hca_t *hca;
645 rib_service_t *srv_list, *to_remove;
646 ibt_status_t ibt_status;
647
648 /*
649 * deregister the Address Translation Service.
650 */
651 hca = rib_stat->hca;
652 rw_enter(&hca->service_list_lock, RW_WRITER);
653 srv_list = hca->ats_list;
654 while (srv_list != NULL) {
655 to_remove = srv_list;
656 srv_list = to_remove->srv_next;
657
658 ibt_status = ibt_deregister_ar(hca->ibt_clnt_hdl,
659 &to_remove->srv_ar);
660 if (ibt_status != IBT_SUCCESS) {
661 #ifdef DEBUG
662 if (rib_debug) {
663 cmn_err(CE_WARN, "_fini: "
664 "ibt_deregister_ar FAILED"
665 " status: %d", ibt_status);
666 }
667 #endif
668 } else {
669 mutex_enter(&rib_stat->open_hca_lock);
670 ats_running = 0;
671 mutex_exit(&rib_stat->open_hca_lock);
672 #ifdef DEBUG
673 if (rib_debug) {
674
675 cmn_err(CE_NOTE, "_fini: "
676 "Successfully unregistered"
677 " ATS service: %s",
678 to_remove->srv_name);
679 }
680 #endif
681 }
682 kmem_free(to_remove, sizeof (rib_service_t));
683 }
684 hca->ats_list = NULL;
685 rw_exit(&hca->service_list_lock);
686 }
687
688 static void rib_rbufpool_free(rib_hca_t *, int);
689 static void rib_rbufpool_deregister(rib_hca_t *, int);
690 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
691 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
692 static rdma_stat rib_rem_replylist(rib_qp_t *);
693 static int rib_remreply(rib_qp_t *, struct reply *);
694 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
695 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
696
697
698 /*
699 * One CQ pair per HCA
700 */
701 static rdma_stat
702 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
703 rib_cq_t **cqp, rpcib_state_t *ribstat)
704 {
705 rib_cq_t *cq;
706 ibt_cq_attr_t cq_attr;
707 uint32_t real_size;
708 ibt_status_t status;
709 rdma_stat error = RDMA_SUCCESS;
710
711 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
712 cq->rib_hca = hca;
713 cq_attr.cq_size = cq_size;
714 cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
715 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
716 &real_size);
717 if (status != IBT_SUCCESS) {
718 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
719 " status=%d", status);
720 error = RDMA_FAILED;
721 goto fail;
722 }
723 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);
724
725 /*
726 * Enable CQ callbacks. CQ callbacks are single shot
727 * (i.e. ibt_enable_cq_notify() must be called
728 * after each callback to arm the next one).
729 */
730 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
731 if (status != IBT_SUCCESS) {
732 cmn_err(CE_WARN, "rib_create_cq: "
733 "enable_cq_notify failed, status %d", status);
734 error = RDMA_FAILED;
735 goto fail;
736 }
737 *cqp = cq;
738
739 return (error);
740 fail:
741 if (cq->rib_cq_hdl)
742 (void) ibt_free_cq(cq->rib_cq_hdl);
743 if (cq)
744 kmem_free(cq, sizeof (rib_cq_t));
745 return (error);
746 }
747
748 static rdma_stat
749 open_hcas(rpcib_state_t *ribstat)
750 {
751 rib_hca_t *hca;
752 ibt_status_t ibt_status;
753 rdma_stat status;
754 ibt_hca_portinfo_t *pinfop;
755 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS;
756 uint_t size, cq_size;
757 int i;
758 #ifdef IB_FMR_SUP
759 ibt_fmr_pool_attr_t fmr_attr;
760 uint_t h_page_sz;
761 #endif
762 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
763 if (ribstat->hcas == NULL)
764 ribstat->hcas = kmem_zalloc(ribstat->hca_count *
765 sizeof (rib_hca_t), KM_SLEEP);
766
767 /*
768 * Open a hca and setup for RDMA
769 */
770 for (i = 0; i < ribstat->hca_count; i++) {
771 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
772 ribstat->hca_guids[i],
773 &ribstat->hcas[i].hca_hdl);
774 if (ibt_status != IBT_SUCCESS) {
775 cmn_err(CE_WARN, "open_hcas: ibt_open_hca (%d) "
776 "returned %d", i, ibt_status);
777 continue;
778 }
779 ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
780 hca = &(ribstat->hcas[i]);
781 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
782 hca->state = HCA_INITED;
783
784 /*
785 * query HCA info
786 */
787 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
788 if (ibt_status != IBT_SUCCESS) {
789 cmn_err(CE_WARN, "open_hcas: ibt_query_hca "
790 "returned %d (hca_guid 0x%llx)",
791 ibt_status, (longlong_t)ribstat->hca_guids[i]);
792 goto fail1;
793 }
794
795 /*
796 * One PD (Protection Domain) per HCA.
797 * A qp is allowed to access a memory region
798 * only when it's in the same PD as that of
799 * the memory region.
800 */
801 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
802 if (ibt_status != IBT_SUCCESS) {
803 cmn_err(CE_WARN, "open_hcas: ibt_alloc_pd "
804 "returned %d (hca_guid 0x%llx)",
805 ibt_status, (longlong_t)ribstat->hca_guids[i]);
806 goto fail1;
807 }
808
809 /*
810 * query HCA ports
811 */
812 ibt_status = ibt_query_hca_ports(hca->hca_hdl,
813 0, &pinfop, &hca->hca_nports, &size);
814 if (ibt_status != IBT_SUCCESS) {
815 cmn_err(CE_WARN, "open_hcas: "
816 "ibt_query_hca_ports returned %d "
817 "(hca_guid 0x%llx)",
818 ibt_status, (longlong_t)hca->hca_guid);
819 goto fail2;
820 }
821 hca->hca_ports = pinfop;
822 hca->hca_pinfosz = size;
823 pinfop = NULL;
824
825 cq_size = DEF_CQ_SIZE; /* default cq size */
826 /*
827 * Create 2 pairs of cq's (1 pair for client
828 * and the other pair for server) on this hca.
829 * If number of qp's gets too large, then several
830 * cq's will be needed.
831 */
832 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
833 &hca->svc_rcq, ribstat);
834 if (status != RDMA_SUCCESS) {
835 goto fail3;
836 }
837
838 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
839 &hca->svc_scq, ribstat);
840 if (status != RDMA_SUCCESS) {
841 goto fail3;
842 }
843
844 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
845 &hca->clnt_rcq, ribstat);
846 if (status != RDMA_SUCCESS) {
847 goto fail3;
848 }
849
850 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
851 &hca->clnt_scq, ribstat);
852 if (status != RDMA_SUCCESS) {
853 goto fail3;
854 }
855
856 /*
857 * Create buffer pools.
858 * Note rib_rbuf_create also allocates memory windows.
859 */
860 hca->recv_pool = rib_rbufpool_create(hca,
861 RECV_BUFFER, MAX_BUFS);
862 if (hca->recv_pool == NULL) {
863 cmn_err(CE_WARN, "open_hcas: recv buf pool failed\n");
864 goto fail3;
865 }
866
867 hca->send_pool = rib_rbufpool_create(hca,
868 SEND_BUFFER, MAX_BUFS);
869 if (hca->send_pool == NULL) {
870 cmn_err(CE_WARN, "open_hcas: send buf pool failed\n");
871 rib_rbufpool_destroy(hca, RECV_BUFFER);
872 goto fail3;
873 }
874 #ifdef IB_FMR_SUP
875 /* Global FMR POOL */
876 bzero(&fmr_attr, sizeof (ibt_fmr_pool_attr_t));
877
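/* hca_page_sz is reported by IBTF in Kbytes; convert to bytes for the FMR pool attributes. */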
878 h_page_sz = hca->hca_attrs.hca_page_sz * 1024;
879
880 fmr_attr.fmr_max_pages_per_fmr =
881 (IB_FMR_MAX_SIZE / h_page_sz) + 2;
882 fmr_attr.fmr_pool_size = MAX_BUFS * 2;
883 fmr_attr.fmr_dirty_watermark = IB_FMR_DIRTY_MARK;
884 fmr_attr.fmr_page_sz = h_page_sz;
885 fmr_attr.fmr_cache = B_FALSE;
886 fmr_attr.fmr_flags = IBT_MR_SLEEP |
887 IBT_MR_ENABLE_LOCAL_WRITE |
888 IBT_MR_ENABLE_REMOTE_READ |
889 IBT_MR_ENABLE_REMOTE_WRITE;
890 fmr_attr.fmr_func_hdlr = NULL;
891
892 if (rib_debug > 1) {
893 cmn_err(CE_NOTE, "open_hcas: ibt_create_fmr_pool:");
894 cmn_err(CE_NOTE, "fmr_page_sz %d, fmr_pool_sz %d, "
895 "max_pages_per_fmr %d", fmr_attr.fmr_page_sz,
896 fmr_attr.fmr_pool_size,
897 fmr_attr.fmr_max_pages_per_fmr);
898 }
899
900 ibt_status = ibt_create_fmr_pool(hca->hca_hdl, hca->pd_hdl,
901 &fmr_attr, &hca->fmr_pool);
902 if (ibt_status != IBT_SUCCESS) {
903 cmn_err(CE_WARN, "open_hcas: Global FMR pool creation "
904 "failed: %d\n", ibt_status);
905 rib_rbufpool_destroy(hca, RECV_BUFFER);
906 rib_rbufpool_destroy(hca, SEND_BUFFER);
907 goto fail3;
908 }
909 #endif
910 #ifdef SERVER_REG_CACHE
911 cmn_err(CE_NOTE,"Registration Cache enabled\n");
912 {
913 cache_avl_struct_t my_avl_node;
914 hca->server_side_cache =
915 kmem_cache_create("rib_server_side_cache",
916 sizeof (cache_avl_struct_t), 0,
917 NULL,
918 NULL,
919 rib_server_side_cache_reclaim,
920 hca, NULL, 0);
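/*
 * AVL tree of cache nodes, ordered by avl_compare(); the last
 * argument to avl_create() below is the offset of avl_link
 * within cache_avl_struct_t.
 */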
921 avl_create(&hca->avl_tree,
922 avl_compare,
923 sizeof (cache_avl_struct_t),
924 (uintptr_t)&my_avl_node.avl_link - (uintptr_t)&my_avl_node);
925 /* mutex_init(&hca->avl_lock, NULL, MUTEX_DEFAULT, NULL);*/
926 rw_init(&hca->avl_rw_lock, NULL, RW_DRIVER, hca->iblock);
927 hca->avl_init = TRUE;
928
929 }
930 #endif
931
932 #if defined(ASYNC_CLIENT_DEREG)
933 rqueue.forw = rqueue.back = &rqueue;
934 mutex_init(&at_mutex, NULL, MUTEX_DEFAULT, NULL);
935 cv_init(&at_cond, NULL, CV_DEFAULT, NULL);
936 (void) thread_create(NULL, 0, async_dereg_thread, NULL, 0, &p0,
937 TS_RUN, minclsyspri);
938 #endif
939 /*
940 * Initialize the registered service list and
941 * the lock
942 */
943 hca->service_list = NULL;
944 rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);
945
946 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
947 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
948 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
949 hca->iblock);
950 rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
951 hca->iblock);
952 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
953 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
954 hca->inuse = TRUE;
955 /*
956 * XXX One hca only. Add multi-hca functionality if needed
957 * later.
958 */
959 ribstat->hca = hca;
960 ribstat->nhca_inited++;
961 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
962 break;
963
964 fail3:
965 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
966 fail2:
967 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
968 fail1:
969 (void) ibt_close_hca(hca->hca_hdl);
970
971 }
972 if (ribstat->hca != NULL)
973 return (RDMA_SUCCESS);
974 else
975 return (RDMA_FAILED);
976 }
977
978 /*
979 * Callback routines
980 */
981
982 /*
983 * SCQ handlers
984 */
985 /* ARGSUSED */
986 static void
987 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
988 {
989 ibt_status_t ibt_status;
990 ibt_wc_t wc;
991 int i;
992
993 /*
994 * Re-enable cq notify here to avoid missing any
995 * completion queue notification.
996 */
997 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
998
999 ibt_status = IBT_SUCCESS;
1000 while (ibt_status != IBT_CQ_EMPTY) {
1001 bzero(&wc, sizeof (wc));
1002 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1003 if (ibt_status != IBT_SUCCESS)
1004 return;
1005
1006 /*
1007 * Got a send completion
1008 */
1009 if (wc.wc_id != NULL) { /* XXX can it be otherwise ???? */
1010 struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1011 CONN *conn = qptoc(wd->qp);
1012
1013 mutex_enter(&wd->sendwait_lock);
1014 switch (wc.wc_status) {
1015 case IBT_WC_SUCCESS:
1016 wd->status = RDMA_SUCCESS;
1017 break;
1018 case IBT_WC_WR_FLUSHED_ERR:
1019 wd->status = RDMA_FAILED;
1020 break;
1021 default:
1022 /*
1023 * RC Send Q Error Code Local state Remote State
1024 * ==================== =========== ============
1025 * IBT_WC_BAD_RESPONSE_ERR ERROR None
1026 * IBT_WC_LOCAL_LEN_ERR ERROR None
1027 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None
1028 * IBT_WC_LOCAL_PROTECT_ERR ERROR None
1029 * IBT_WC_MEM_WIN_BIND_ERR ERROR None
1030 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR
1031 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR
1032 * IBT_WC_REMOTE_OP_ERR ERROR ERROR
1033 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None
1034 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None
1035 * IBT_WC_WR_FLUSHED_ERR None None
1036 */
1037 #ifdef DEBUG
1038 if (rib_debug > 1) {
1039 if (wc.wc_status != IBT_WC_SUCCESS) {
1040 cmn_err(CE_NOTE, "rib_clnt_scq_handler: "
1041 "WR completed in error, wc.wc_status:%d, "
1042 "wc_id:%llx\n", wc.wc_status, (longlong_t)wc.wc_id);
1043 }
1044 }
1045 #endif
1046 /*
1047 * Channel in error state. Set connection to
1048 * ERROR and cleanup will happen either from
1049 * conn_release or from rib_conn_get
1050 */
1051 wd->status = RDMA_FAILED;
1052 mutex_enter(&conn->c_lock);
1053 if (conn->c_state != C_DISCONN_PEND)
1054 conn->c_state = C_ERROR;
1055 mutex_exit(&conn->c_lock);
1056 break;
1057 }
1058 if (wd->cv_sig == 1) {
1059 /*
1060 * Notify poster
1061 */
1062 cv_signal(&wd->wait_cv);
1063 mutex_exit(&wd->sendwait_lock);
1064 } else {
1065 /*
1066 * Poster not waiting for notification.
1067 * Free the send buffers and send_wid
1068 */
1069 for (i = 0; i < wd->nsbufs; i++) {
1070 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
1071 (void *)(uintptr_t)wd->sbufaddr[i]);
1072 }
1073 mutex_exit(&wd->sendwait_lock);
1074 (void) rib_free_sendwait(wd);
1075 }
1076 }
1077 }
1078 }
1079
1080 #if defined (CLNT_INTERRUPT_COAL)
1081 static void
1082 rib_scq_free(caddr_t widd)
1083 {
1084 struct send_wid *wd = (struct send_wid *)widd;
1085 ibt_status_t ibt_status;
1086 ibt_wc_t wc;
1087 int i;
1088 CONN *conn = qptoc(wd->qp);
1089
1090 wc.wc_status = RDMA_SUCCESS;
1091 mutex_enter(&wd->sendwait_lock);
1092 switch (wc.wc_status) {
1093 case IBT_WC_SUCCESS:
1094 wd->status = RDMA_SUCCESS;
1095 break;
1096 case IBT_WC_WR_FLUSHED_ERR:
1097 wd->status = RDMA_FAILED;
1098 break;
1099 default:
1100 /*
1101 * RC Send Q Error Code Local state Remote State
1102 * ==================== =========== ============
1103 * IBT_WC_BAD_RESPONSE_ERR ERROR None
1104 * IBT_WC_LOCAL_LEN_ERR ERROR None
1105 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None
1106 * IBT_WC_LOCAL_PROTECT_ERR ERROR None
1107 * IBT_WC_MEM_WIN_BIND_ERR ERROR None
1108 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR
1109 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR
1110 * IBT_WC_REMOTE_OP_ERR ERROR ERROR
1111 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None
1112 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None
1113 * IBT_WC_WR_FLUSHED_ERR None None
1114 */
1115 #ifdef DEBUG
1116 if (rib_debug > 1) {
1117 if (wc.wc_status != IBT_WC_SUCCESS) {
1118 cmn_err(CE_NOTE, "rib_clnt_scq_handler: "
1119 "WR completed in error, wc.wc_status:%d, "
1120 "wc_id:%llx\n", wc.wc_status, (longlong_t)wc.wc_id);
1121 }
1122 }
1123 #endif
1124 /*
1125 * Channel in error state. Set connection to
1126 * ERROR and cleanup will happen either from
1127 * conn_release or from rib_conn_get
1128 */
1129 wd->status = RDMA_FAILED;
1130 mutex_enter(&conn->c_lock);
1131 if (conn->c_state != C_DISCONN_PEND)
1132 conn->c_state = C_ERROR;
1133 mutex_exit(&conn->c_lock);
1134 break;
1135 }
1136 if (wd->cv_sig == 1) {
1137 /*
1138 * Notify poster
1139 */
1140 cmn_err(CE_NOTE,"Some error \n");
1141 cv_signal(&wd->wait_cv);
1142 mutex_exit(&wd->sendwait_lock);
1143 } else {
1144 /*
1145 * Poster not waiting for notification.
1146 * Free the send buffers and send_wid
1147 */
1148 for (i = 0; i < wd->nsbufs; i++) {
1149 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
1150 (void *)(uintptr_t)wd->sbufaddr[i]);
1151 }
1152 mutex_exit(&wd->sendwait_lock);
1153 (void) rib_free_sendwait(wd);
1154 }
1155 }
1156 #endif
1157
1158 /* ARGSUSED */
1159 static void
1160 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1161 {
1162 ibt_status_t ibt_status;
1163 ibt_wc_t wc;
1164 int i;
1165
1166 /*
1167 * Re-enable cq notify here to avoid missing any
1168 * completion queue notification.
1169 */
1170 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1171
1172 ibt_status = IBT_SUCCESS;
1173 while (ibt_status != IBT_CQ_EMPTY) {
1174 bzero(&wc, sizeof (wc));
1175 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1176 if (ibt_status != IBT_SUCCESS)
1177 return;
1178
1179 /*
1180 * Got a send completion
1181 */
1182 #ifdef DEBUG
1183 if (rib_debug > 1 && wc.wc_status != IBT_WC_SUCCESS) {
1184 cmn_err(CE_NOTE, "rib_svc_scq_handler: WR completed in error "
1185 "wc.wc_status:%d, wc_id:%llX",
1186 wc.wc_status, (longlong_t)wc.wc_id);
1187 }
1188 #endif
1189 if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
1190 struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1191 #ifdef ASYNC_SERVER_DEREG
1192 if (wd->c1) {
1193 (void) clist_deregister1((CONN *)wd->c, (struct clist *)wd->c1, TRUE);
1194 #ifdef SERVER_REG_CACHE
1195 RDMA_FREE_SERVER_CACHE_BUF((CONN *)wd->c, (rib_lrc_entry_t *)(((struct clist *)wd->c1)->long_reply_buf));
1196 #else
1197 if (wd->c1 && wd->l1)
1198 kmem_free((void *)(wd->c1)->c_saddr, wd->l1);
1199 #endif
1200 kmem_free((void *)(wd->c1), wd->wl * sizeof (struct clist));
1201 }
1202 if (wd->c2) {
1203 (void) clist_deregister1((CONN *)wd->c, (struct clist *)wd->c2, TRUE);
1204 #ifdef SERVER_REG_CACHE
1205 RDMA_FREE_SERVER_CACHE_BUF((CONN *)wd->c, (rib_lrc_entry_t *)(((struct clist *)wd->c2)->long_reply_buf));
1206 #else
1207 if (wd->l2)
1208 kmem_free((void *)(wd->c2)->c_saddr, wd->l2);
1209 #endif
1210 kmem_free((void *)(wd->c2), wd->rl * sizeof (struct clist));
1211 }
1212 #endif
1213 mutex_enter(&wd->sendwait_lock);
1214 if (wd->cv_sig == 1) {
1215 /*
1216 * Update completion status and notify poster
1217 */
1218 if (wc.wc_status == IBT_WC_SUCCESS)
1219 wd->status = RDMA_SUCCESS;
1220 else
1221 wd->status = RDMA_FAILED;
1222 cv_signal(&wd->wait_cv);
1223 mutex_exit(&wd->sendwait_lock);
1224 } else {
1225 /*
1226 * Poster not waiting for notification.
1227 * Free the send buffers and send_wid
1228 */
1229 for (i = 0; i < wd->nsbufs; i++) {
1230 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
1231 (void *)(uintptr_t)wd->sbufaddr[i]);
1232 }
1233 mutex_exit(&wd->sendwait_lock);
1234 (void) rib_free_sendwait(wd);
1235 }
1236 }
1237 }
1238 }
1239
1240 /*
1241 * RCQ handler
1242 */
1243 /* ARGSUSED */
1244 static void
1245 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1246 {
1247 rib_qp_t *qp;
1248 ibt_status_t ibt_status;
1249 ibt_wc_t wc;
1250 struct recv_wid *rwid;
1251 #if defined(CLNT_POLL_CQ)
1252 uint32_t count = 0;
1253 #endif
1254
1255 /*
1256 * Re-enable cq notify here to avoid missing any
1257 * completion queue notification.
1258 */
1259 #if !defined(CLNT_POLL_CQ)
1260 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1261 #endif
1262
1263 ibt_status = IBT_SUCCESS;
1264 while (ibt_status != IBT_CQ_EMPTY) {
1265 #if defined(CLNT_POLL_CQ)
1266 poll_cq_again:
1267 #endif
1268 bzero(&wc, sizeof (wc));
1269 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1270 #if defined(CLNT_POLL_CQ)
1271 if (ibt_status == IBT_CQ_EMPTY) {
1272 count++;
1273 if (count == max_poll_count) {
1274 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1275 return;
1276 }
1277 goto poll_cq_again;
1278 }
1279 #endif
1280 if (ibt_status != IBT_SUCCESS)
1281 #if defined(CLNT_POLL_CQ)
1282 {
1283 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1284 #endif
1285 return;
1286 #if defined(CLNT_POLL_CQ)
1287 }
1288 count = 0;
1289 #endif
1290 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1291 qp = rwid->qp;
1292 if (wc.wc_status == IBT_WC_SUCCESS) {
1293 XDR inxdrs, *xdrs;
1294 uint_t xid, vers, op, find_xid = 0;
1295 struct reply *r;
1296 CONN *conn = qptoc(qp);
1297 uint32_t rdma_credit = 0;
1298
1299 xdrs = &inxdrs;
1300 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1301 wc.wc_bytes_xfer, XDR_DECODE);
1302 /*
1303 * Treat xid as opaque (xid is the first entity
1304 * in the rpc rdma message).
1305 */
1306 xid = *(uint32_t *)(uintptr_t)rwid->addr;
1307 /* Skip xid and set the xdr position accordingly. */
1308 XDR_SETPOS(xdrs, sizeof (uint32_t));
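/*
 * The next three 32-bit XDR words of the rpc rdma header are the
 * protocol version, the advertised credit count and the message
 * type (RDMA_MSG, RDMA_NOMSG, RDMA_MSGP, ...).
 */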
1309 (void) xdr_u_int(xdrs, &vers);
1310 (void) xdr_u_int(xdrs, &rdma_credit);
1311 (void) xdr_u_int(xdrs, &op);
1312 XDR_DESTROY(xdrs);
1313 if (vers != RPCRDMA_VERS) {
1314 /*
1315 * Invalid RPC/RDMA version. Cannot interoperate.
1316 * Set connection to ERROR state and bail out.
1317 */
1318 mutex_enter(&conn->c_lock);
1319 if (conn->c_state != C_DISCONN_PEND)
1320 conn->c_state = C_ERROR;
1321 mutex_exit(&conn->c_lock);
1322 rib_rbuf_free(conn, RECV_BUFFER,
1323 (void *)(uintptr_t)rwid->addr);
1324 rib_free_wid(rwid);
1325 continue;
1326 }
1327
1328 mutex_enter(&qp->replylist_lock);
1329 for (r = qp->replylist; r != NULL; r = r->next) {
1330 if (r->xid == xid) {
1331 find_xid = 1;
1332 switch (op) {
1333 case RDMA_MSG:
1334 case RDMA_NOMSG:
1335 case RDMA_MSGP:
1336 r->status = RDMA_SUCCESS;
1337 r->vaddr_cq = rwid->addr;
1338 r->bytes_xfer = wc.wc_bytes_xfer;
1339 cv_signal(&r->wait_cv);
1340 break;
1341 default:
1342 rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1343 (void *)(uintptr_t)rwid->addr);
1344 break;
1345 }
1346 break;
1347 }
1348 }
1349 mutex_exit(&qp->replylist_lock);
1350 if (find_xid == 0) {
1351 /* RPC caller not waiting for reply */
1352 #ifdef DEBUG
1353 if (rib_debug) {
1354 cmn_err(CE_NOTE, "rib_clnt_rcq_handler: "
1355 "NO matching xid %u!\n", xid);
1356 }
1357 #endif
1358 rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1359 (void *)(uintptr_t)rwid->addr);
1360 }
1361 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1362 CONN *conn = qptoc(qp);
1363
1364 /*
1365 * Connection being flushed. Just free
1366 * the posted buffer
1367 */
1368 rib_rbuf_free(conn, RECV_BUFFER,
1369 (void *)(uintptr_t)rwid->addr);
1370 } else {
1371 CONN *conn = qptoc(qp);
1372 /*
1373 * RC Recv Q Error Code Local state Remote State
1374 * ==================== =========== ============
1375 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd
1376 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd
1377 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd
1378 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd
1379 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd
1380 * IBT_WC_WR_FLUSHED_ERR None None
1381 */
1382 /*
1383 * Channel in error state. Set connection
1384 * in ERROR state.
1385 */
1386 mutex_enter(&conn->c_lock);
1387 if (conn->c_state != C_DISCONN_PEND)
1388 conn->c_state = C_ERROR;
1389 mutex_exit(&conn->c_lock);
1390 rib_rbuf_free(conn, RECV_BUFFER,
1391 (void *)(uintptr_t)rwid->addr);
1392 }
1393 rib_free_wid(rwid);
1394 }
1395 }
1396
1397 /* Server side */
1398 /* ARGSUSED */
1399 static void
1400 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1401 {
1402 struct recv_data *rd;
1403 rib_qp_t *qp;
1404 ibt_status_t ibt_status;
1405 ibt_wc_t wc;
1406 struct svc_recv *s_recvp;
1407 CONN *conn;
1408 mblk_t *mp;
1409
1410 /*
1411 * Re-enable cq notify here to avoid missing any
1412 * completion queue notification.
1413 */
1414 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1415
1416 ibt_status = IBT_SUCCESS;
1417 while (ibt_status != IBT_CQ_EMPTY) {
1418 bzero(&wc, sizeof (wc));
1419 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1420 if (ibt_status != IBT_SUCCESS)
1421 return;
1422
1423 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1424 qp = s_recvp->qp;
1425 conn = qptoc(qp);
1426 mutex_enter(&qp->posted_rbufs_lock);
1427 qp->n_posted_rbufs--;
1428 #if defined(MEASURE_POOL_DEPTH)
1429 rib_posted_rbufs(preposted_rbufs - qp->n_posted_rbufs);
1430 #endif
1431 if (qp->n_posted_rbufs == 0)
1432 cv_signal(&qp->posted_rbufs_cv);
1433 mutex_exit(&qp->posted_rbufs_lock);
1434
1435 if (wc.wc_status == IBT_WC_SUCCESS) {
1436 XDR inxdrs, *xdrs;
1437 uint_t xid, vers, op;
1438 uint32_t rdma_credit;
1439
1440 xdrs = &inxdrs;
1441 /* s_recvp->vaddr stores data */
1442 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1443 wc.wc_bytes_xfer, XDR_DECODE);
1444
1445 /*
1446 * Treat xid as opaque (xid is the first entity
1447 * in the rpc rdma message).
1448 */
1449 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1450 /* Skip xid and set the xdr position accordingly. */
1451 XDR_SETPOS(xdrs, sizeof (uint32_t));
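/*
 * Decode version, credit count and message type; drop the
 * message if the header cannot be decoded.
 */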
1452 if (!xdr_u_int(xdrs, &vers) ||
1453 !xdr_u_int(xdrs, &rdma_credit) ||
1454 !xdr_u_int(xdrs, &op)) {
1455 rib_rbuf_free(conn, RECV_BUFFER,
1456 (void *)(uintptr_t)s_recvp->vaddr);
1457 XDR_DESTROY(xdrs);
1458 #ifdef DEBUG
1459 cmn_err(CE_NOTE, "rib_svc_rcq_handler: "
1460 "xdr_u_int failed for qp %p, wc_id=%llx",
1461 (void *)qp, (longlong_t)wc.wc_id);
1462 #endif
1463 (void) rib_free_svc_recv(s_recvp);
1464 continue;
1465 }
1466 XDR_DESTROY(xdrs);
1467
1468 if (vers != RPCRDMA_VERS) {
1469 /*
1470 * Invalid RPC/RDMA version. Drop rpc rdma message.
1471 */
1472 rib_rbuf_free(conn, RECV_BUFFER,
1473 (void *)(uintptr_t)s_recvp->vaddr);
1474 (void) rib_free_svc_recv(s_recvp);
1475 continue;
1476 }
1477 /*
1478 * Is this for RDMA_DONE?
1479 */
1480 if (op == RDMA_DONE) {
1481 rib_rbuf_free(conn, RECV_BUFFER,
1482 (void *)(uintptr_t)s_recvp->vaddr);
1483 /*
1484 * Wake up the thread waiting on
1485 * a RDMA_DONE for xid
1486 */
1487 mutex_enter(&qp->rdlist_lock);
1488 rdma_done_notify(qp, xid);
1489 mutex_exit(&qp->rdlist_lock);
1490 (void) rib_free_svc_recv(s_recvp);
1491 continue;
1492 }
1493
1494 mutex_enter(&plugin_state_lock);
1495 if (plugin_state == ACCEPT) {
1496 while ((mp = allocb(sizeof (*rd), BPRI_LO)) == NULL)
1497 (void) strwaitbuf(sizeof (*rd), BPRI_LO);
1498 /*
1499 * Plugin is in accept state, hence the master
1500 * transport queue for this is still accepting
1501 * requests, so we can call svc_queuereq to
1502 * queue this received msg.
1503 */
1504 rd = (struct recv_data *)mp->b_rptr;
1505 rd->conn = conn;
1506 rd->rpcmsg.addr = (caddr_t)(uintptr_t)s_recvp->vaddr;
1507 rd->rpcmsg.type = RECV_BUFFER;
1508 rd->rpcmsg.len = wc.wc_bytes_xfer;
1509 rd->status = wc.wc_status;
1510 mutex_enter(&conn->c_lock);
1511 conn->c_ref++;
1512 mutex_exit(&conn->c_lock);
1513 mp->b_wptr += sizeof (*rd);
1514 svc_queuereq((queue_t *)rib_stat->q, mp);
1515 mutex_exit(&plugin_state_lock);
1516 } else {
1517 /*
1518 * The master transport for this is going
1519 * away and the queue is no longer accepting
1520 * requests for krpc, so don't do anything, just
1521 * free the msg.
1522 */
1523 mutex_exit(&plugin_state_lock);
1524 rib_rbuf_free(conn, RECV_BUFFER,
1525 (void *)(uintptr_t)s_recvp->vaddr);
1526 }
1527 } else {
1528 rib_rbuf_free(conn, RECV_BUFFER,
1529 (void *)(uintptr_t)s_recvp->vaddr);
1530 }
1531 (void) rib_free_svc_recv(s_recvp);
1532 }
1533 }
1534
1535 /*
1536 * Handles DR event of IBT_HCA_DETACH_EVENT.
1537 */
1538 /* ARGSUSED */
1539 static void
1540 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1541 ibt_async_code_t code, ibt_async_event_t *event)
1542 {
1543
1544 switch (code) {
1545 case IBT_HCA_ATTACH_EVENT:
1546 /* ignore */
1547 break;
1548 case IBT_HCA_DETACH_EVENT:
1549 {
1550 ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
1551 rib_detach_hca(rib_stat->hca);
1552 #ifdef DEBUG
1553 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1554 #endif
1555 break;
1556 }
1557 #ifdef DEBUG
1558 case IBT_EVENT_PATH_MIGRATED:
1559 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PATH_MIGRATED\n");
1560 break;
1561 case IBT_EVENT_SQD:
1562 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1563 break;
1564 case IBT_EVENT_COM_EST:
1565 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1566 break;
1567 case IBT_ERROR_CATASTROPHIC_CHAN:
1568 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CATASTROPHIC_CHAN\n");
1569 break;
1570 case IBT_ERROR_INVALID_REQUEST_CHAN:
1571 cmn_err(CE_NOTE, "rib_async_handler(): "
1572 "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1573 break;
1574 case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1575 cmn_err(CE_NOTE, "rib_async_handler(): "
1576 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1577 break;
1578 case IBT_ERROR_PATH_MIGRATE_REQ:
1579 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PATH_MIGRATE_REQ\n");
1580 break;
1581 case IBT_ERROR_CQ:
1582 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1583 break;
1584 case IBT_ERROR_PORT_DOWN:
1585 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1586 break;
1587 case IBT_EVENT_PORT_UP:
1588 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1589 break;
1590 case IBT_ASYNC_OPAQUE1:
1591 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1592 break;
1593 case IBT_ASYNC_OPAQUE2:
1594 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1595 break;
1596 case IBT_ASYNC_OPAQUE3:
1597 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1598 break;
1599 case IBT_ASYNC_OPAQUE4:
1600 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1601 break;
1602 #endif
1603 default:
1604 break;
1605 }
1606 }
1607
1608 /*
1609 * Client's reachable function.
1610 */
1611 static rdma_stat
1612 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1613 {
1614 rib_hca_t *hca;
1615 rdma_stat status;
1616
1617 /*
1618 * First check if a hca is still attached
1619 */
1620 *handle = NULL;
1621 rw_enter(&rib_stat->hca->state_lock, RW_READER);
1622 if (rib_stat->hca->state != HCA_INITED) {
1623 rw_exit(&rib_stat->hca->state_lock);
1624 return (RDMA_FAILED);
1625 }
1626 status = rib_ping_srv(addr_type, raddr, &hca);
1627 rw_exit(&rib_stat->hca->state_lock);
1628
1629 if (status == RDMA_SUCCESS) {
1630 *handle = (void *)hca;
1631 /*
1632 * Register the Address translation service
1633 */
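/*
 * ATS publishes this host's IP-to-GID address record with the
 * subnet administrator so peers can resolve our IB address;
 * register it only once, tracked by ats_running.
 */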
1634 mutex_enter(&rib_stat->open_hca_lock);
1635 if (ats_running == 0) {
1636 if (rib_register_ats(rib_stat->hca)
1637 == RDMA_SUCCESS) {
1638 ats_running = 1;
1639 mutex_exit(&rib_stat->open_hca_lock);
1640 return (RDMA_SUCCESS);
1641 } else {
1642 mutex_exit(&rib_stat->open_hca_lock);
1643 return (RDMA_FAILED);
1644 }
1645 } else {
1646 mutex_exit(&rib_stat->open_hca_lock);
1647 return (RDMA_SUCCESS);
1648 }
1649 } else {
1650 *handle = NULL;
1651 if (rib_debug > 2)
1652 cmn_err(CE_WARN, "rib_reachable(): ping_srv failed.\n");
1653 return (RDMA_FAILED);
1654 }
1655 }
1656
1657 /* Client side qp creation */
1658 static rdma_stat
1659 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1660 {
1661 rib_qp_t *kqp = NULL;
1662 CONN *conn;
1663 rdma_clnt_cred_ctrl_t *cc_info;
1664
1665 ASSERT(qp != NULL);
1666 *qp = NULL;
1667
1668 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1669 conn = qptoc(kqp);
1670 kqp->hca = hca;
1671 kqp->rdmaconn.c_rdmamod = &rib_mod;
1672 kqp->rdmaconn.c_private = (caddr_t)kqp;
1673
1674 kqp->mode = RIB_CLIENT;
1675 kqp->chan_flags = IBT_BLOCKING;
1676 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1677 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1678 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1679
1680 /*
1681 * Initialize
1682 */
1683 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1684 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1685 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1686 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1687 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1688 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1689 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1690 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1691 #if defined (CLNT_INTERRUPT_COAL)
1692 kqp->rdmaconn.c_count = 0;
1693 conn->c_count = 0;
1694 bzero(&kqp->wd, sizeof(struct send_wid));
1695 kqp->wd.forw = kqp->wd.back = &kqp->wd;
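/* Circular list head for the interrupt-coalescing send wids; initialized empty (points to itself). */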
1696 #endif
1697 /*
1698 * Initialize the client credit control
1699 * portion of the rdmaconn struct.
1700 */
1701 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1702 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1703 cc_info->clnt_cc_granted_ops = 0;
1704 cc_info->clnt_cc_in_flight_ops = 0;
1705 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1706
1707 *qp = kqp;
1708 return (RDMA_SUCCESS);
1709 }
1710
1711 /* Server side qp creation */
1712 static rdma_stat
1713 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1714 {
1715 rib_qp_t *kqp = NULL;
1716 ibt_chan_sizes_t chan_sizes;
1717 ibt_rc_chan_alloc_args_t qp_attr;
1718 ibt_status_t ibt_status;
1719 rdma_srv_cred_ctrl_t *cc_info;
1720
1721 ASSERT(qp != NULL);
1722 *qp = NULL;
1723
1724 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1725 kqp->hca = hca;
1726 kqp->port_num = port;
1727 kqp->rdmaconn.c_rdmamod = &rib_mod;
1728 kqp->rdmaconn.c_private = (caddr_t)kqp;
1729
1730 /*
1731 * Create the qp handle
1732 */
1733 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1734 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1735 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1736 qp_attr.rc_pd = hca->pd_hdl;
1737 qp_attr.rc_hca_port_num = port;
1738 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1739 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1740 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1741 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1742 qp_attr.rc_clone_chan = NULL;
1743 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1744 qp_attr.rc_flags = IBT_WR_SIGNALED;
1745
1746 rw_enter(&hca->state_lock, RW_READER);
1747 if (hca->state != HCA_DETACHED) {
1748 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1749 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1750 &chan_sizes);
1751 } else {
1752 rw_exit(&hca->state_lock);
1753 goto fail;
1754 }
1755 rw_exit(&hca->state_lock);
1756
1757 if (ibt_status != IBT_SUCCESS) {
1758 cmn_err(CE_WARN, "rib_svc_create_chan: "
1759 "ibt_alloc_rc_channel failed, ibt_status=%d.",
1760 ibt_status);
1761 goto fail;
1762 }
1763
1764 kqp->mode = RIB_SERVER;
1765 kqp->chan_flags = IBT_BLOCKING;
1766 kqp->q = q; /* server ONLY */
1767
1768 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1769 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1770 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1771 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1772 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1773 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1774 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1775 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1776 /*
1777 * Set the private data area to qp to be used in callbacks
1778 */
1779 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1780 kqp->rdmaconn.c_state = C_CONNECTED;
1781
1782 /*
1783 * Initialize the server credit control
1784 * portion of the rdmaconn struct.
1785 */
1786 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1787 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1788 cc_info->srv_cc_buffers_granted = preposted_rbufs;
1789 cc_info->srv_cc_cur_buffers_used = 0;
1790 cc_info->srv_cc_posted = preposted_rbufs;
1791
1792 *qp = kqp;
1793
1794 num_clients++;
1795 return (RDMA_SUCCESS);
1796 fail:
1797 if (kqp)
1798 kmem_free(kqp, sizeof (rib_qp_t));
1799
1800 return (RDMA_FAILED);
1801 }
1802
1803 void
1804 rib_dump_pathrec(ibt_path_info_t *path_rec)
1805 {
1806 ib_pkey_t pkey;
1807
1808 if (rib_debug > 1) {
1809 cmn_err(CE_NOTE, "Path Record:\n");
1810
1811 cmn_err(CE_NOTE, "Source HCA GUID = %llx\n",
1812 (longlong_t)path_rec->pi_hca_guid);
1813 cmn_err(CE_NOTE, "Dest Service ID = %llx\n",
1814 (longlong_t)path_rec->pi_sid);
1815 cmn_err(CE_NOTE, "Port Num = %02d\n",
1816 path_rec->pi_prim_cep_path.cep_hca_port_num);
1817 cmn_err(CE_NOTE, "P_Key Index = %04d\n",
1818 path_rec->pi_prim_cep_path.cep_pkey_ix);
1819
1820 (void) ibt_index2pkey_byguid(path_rec->pi_hca_guid,
1821 path_rec->pi_prim_cep_path.cep_hca_port_num,
1822 path_rec->pi_prim_cep_path.cep_pkey_ix, &pkey);
1823 cmn_err(CE_NOTE, "P_Key = 0x%x\n", pkey);
1824
1825
1826 cmn_err(CE_NOTE, "SGID: = %llx:%llx\n",
1827 (longlong_t)
1828 path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_prefix,
1829 (longlong_t)
1830 path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_guid);
1831
1832 cmn_err(CE_NOTE, "DGID: = %llx:%llx\n",
1833 (longlong_t)
1834 path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_prefix,
1835 (longlong_t)
1836 path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_guid);
1837
1838 cmn_err(CE_NOTE, "Path Rate = %02x\n",
1839 path_rec->pi_prim_cep_path.cep_adds_vect.av_srate);
1840 cmn_err(CE_NOTE, "SL = %02x\n",
1841 path_rec->pi_prim_cep_path.cep_adds_vect.av_srvl);
1842 cmn_err(CE_NOTE, "Prim Packet LT = %02x\n",
1843 path_rec->pi_prim_pkt_lt);
1844 cmn_err(CE_NOTE, "Path MTU = %02x\n",
1845 path_rec->pi_path_mtu);
1846 }
1847 }
1848
1849 /* ARGSUSED */
1850 ibt_cm_status_t
1851 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1852 ibt_cm_return_args_t *ret_args, void *priv_data,
1853 ibt_priv_data_len_t len)
1854 {
1855 rpcib_state_t *ribstat;
1856 rib_hca_t *hca;
1857
1858 ribstat = (rpcib_state_t *)clnt_hdl;
1859 hca = (rib_hca_t *)ribstat->hca;
1860
1861 switch (event->cm_type) {
1862
1863 /* got a connection close event */
1864 case IBT_CM_EVENT_CONN_CLOSED:
1865 {
1866 CONN *conn;
1867 rib_qp_t *qp;
1868
1869 /* check reason why connection was closed */
1870 switch (event->cm_event.closed) {
1871 case IBT_CM_CLOSED_DREP_RCVD:
1872 case IBT_CM_CLOSED_DREQ_TIMEOUT:
1873 case IBT_CM_CLOSED_DUP:
1874 case IBT_CM_CLOSED_ABORT:
1875 case IBT_CM_CLOSED_ALREADY:
1876 /*
1877 * These cases indicate the local end initiated
1878 * the closing of the channel. Nothing to do here.
1879 */
1880 break;
1881 default:
1882 /*
1883 * Reason for CONN_CLOSED event must be one of
1884 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1885 * or IBT_CM_CLOSED_STALE. These indicate cases where
1886 * the remote end is closing the channel. In these
1887 * cases, free the channel and transition to the error
1888 * state.
1889 */
1890 qp = ibt_get_chan_private(event->cm_channel);
1891 conn = qptoc(qp);
1892 mutex_enter(&conn->c_lock);
1893 if (conn->c_state == C_DISCONN_PEND) {
1894 mutex_exit(&conn->c_lock);
1895 break;
1896 }
1897
1898 conn->c_state = C_ERROR;
1899
1900 /*
1901 * Free the rc_channel. Channel has already
1902 * transitioned to ERROR state and WRs have been
1903 * FLUSHED_ERR already.
1904 */
1905 (void) ibt_free_channel(qp->qp_hdl);
1906 qp->qp_hdl = NULL;
1907
1908 /*
1909 * Free the conn if c_ref is down to 0 already
1910 */
1911 if (conn->c_ref == 0) {
1912 /*
1913 * Remove from list and free conn
1914 */
1915 conn->c_state = C_DISCONN_PEND;
1916 mutex_exit(&conn->c_lock);
1917 (void) rib_disconnect_channel(conn,
1918 &hca->cl_conn_list);
1919 } else {
1920 mutex_exit(&conn->c_lock);
1921 }
1922 #ifdef DEBUG
1923 if (rib_debug)
1924 cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1925 "(CONN_CLOSED) channel disconnected");
1926 #endif
1927 break;
1928 }
1929 break;
1930 }
1931 default:
1932 break;
1933 }
1934 return (IBT_CM_ACCEPT);
1935 }
1936
1937
1938 /* Check if server has done ATS registration */
1939 rdma_stat
1940 rib_chk_srv_ats(rib_hca_t *hca, struct netbuf *raddr,
1941 int addr_type, ibt_path_info_t *path)
1942 {
1943 struct sockaddr_in *sin4;
1944 struct sockaddr_in6 *sin6;
1945 ibt_path_attr_t path_attr;
1946 ibt_status_t ibt_status;
1947 ib_pkey_t pkey;
1948 ibt_ar_t ar_query, ar_result;
1949 rib_service_t *ats;
1950 ib_gid_t sgid;
1951 ibt_path_info_t paths[MAX_PORTS];
1952 uint8_t npaths, i;
1953
1954 (void) bzero(&path_attr, sizeof (ibt_path_attr_t));
1955 (void) bzero(path, sizeof (ibt_path_info_t));
1956
1957 /*
1958 * Construct svc name
1959 */
1960 path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
1961 switch (addr_type) {
1962 case AF_INET:
1963 sin4 = (struct sockaddr_in *)raddr->buf;
1964 (void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
1965 IB_SVC_NAME_LEN);
1966 break;
1967
1968 case AF_INET6:
1969 sin6 = (struct sockaddr_in6 *)raddr->buf;
1970 (void) inet_ntop(AF_INET6, &sin6->sin6_addr,
1971 path_attr.pa_sname, IB_SVC_NAME_LEN);
1972 break;
1973
1974 default:
1975 kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
1976 return (RDMA_INVAL);
1977 }
1978 (void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);
1979
1980 /*
1981 * Attempt a path to the server on an ATS-registered port.
1982 * Try all ATS-registered ports until one succeeds.
1983 * The first one that succeeds will be used to connect
1984 * to the server. If none of them succeed, return RDMA_FAILED.
1985 */
1986 rw_enter(&hca->state_lock, RW_READER);
1987 if (hca->state != HCA_DETACHED) {
1988 rw_enter(&hca->service_list_lock, RW_READER);
1989 for (ats = hca->ats_list; ats != NULL; ats = ats->srv_next) {
1990 path_attr.pa_hca_guid = hca->hca_guid;
1991 path_attr.pa_hca_port_num = ats->srv_port;
1992 ibt_status = ibt_get_paths(hca->ibt_clnt_hdl,
1993 IBT_PATH_MULTI_SVC_DEST, &path_attr, 2, paths, &npaths);
1994 if (ibt_status == IBT_SUCCESS ||
1995 ibt_status == IBT_INSUFF_DATA) {
1996 for (i = 0; i < npaths; i++) {
1997 if (paths[i].pi_hca_guid) {
1998 /*
1999 * do ibt_query_ar()
2000 */
2001 sgid =
2002 paths[i].pi_prim_cep_path.cep_adds_vect.av_sgid;
2003
2004 (void) ibt_index2pkey_byguid(paths[i].pi_hca_guid,
2005 paths[i].pi_prim_cep_path.cep_hca_port_num,
2006 paths[i].pi_prim_cep_path.cep_pkey_ix, &pkey);
2007
2008 bzero(&ar_query, sizeof (ar_query));
2009 bzero(&ar_result, sizeof (ar_result));
2010 ar_query.ar_gid =
2011 paths[i].pi_prim_cep_path.cep_adds_vect.av_dgid;
2012 ar_query.ar_pkey = pkey;
2013 ibt_status = ibt_query_ar(&sgid, &ar_query,
2014 &ar_result);
2015 if (ibt_status == IBT_SUCCESS) {
2016 #ifdef DEBUG
2017 if (rib_debug > 1)
2018 rib_dump_pathrec(&paths[i]);
2019 #endif
2020 bcopy(&paths[i], path,
2021 sizeof (ibt_path_info_t));
2022 rw_exit(&hca->service_list_lock);
2023 kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
2024 rw_exit(&hca->state_lock);
2025 return (RDMA_SUCCESS);
2026 }
2027 #ifdef DEBUG
2028 if (rib_debug) {
2029 cmn_err(CE_NOTE, "rib_chk_srv_ats: "
2030 "ibt_query_ar FAILED, return\n");
2031 }
2032 #endif
2033 }
2034 }
2035 }
2036 }
2037 rw_exit(&hca->service_list_lock);
2038 }
2039 kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
2040 rw_exit(&hca->state_lock);
2041 return (RDMA_FAILED);
2042 }
2043
2044
2045 /*
2046 * Connect to the server.
2047 */
2048 rdma_stat
2049 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path)
2050 {
2051 ibt_chan_open_args_t chan_args; /* channel args */
2052 ibt_chan_sizes_t chan_sizes;
2053 ibt_rc_chan_alloc_args_t qp_attr;
2054 ibt_status_t ibt_status;
2055 ibt_rc_returns_t ret_args; /* conn reject info */
2056 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */
2057
2058 (void) bzero(&chan_args, sizeof (chan_args));
2059 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
2060
2061 qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
2062 /* Alloc a RC channel */
2063 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
2064 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
2065 qp_attr.rc_pd = hca->pd_hdl;
2066 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
2067 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
2068 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
2069 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
2070 qp_attr.rc_clone_chan = NULL;
2071 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
2072 qp_attr.rc_flags = IBT_WR_SIGNALED;
2073
2074 chan_args.oc_path = path;
2075 chan_args.oc_cm_handler = rib_clnt_cm_handler;
2076 chan_args.oc_cm_clnt_private = (void *)rib_stat;
2077 chan_args.oc_rdma_ra_out = 4;
2078 chan_args.oc_rdma_ra_in = 4;
2079 chan_args.oc_path_retry_cnt = 2;
2080 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
2081
2082 refresh:
2083 rw_enter(&hca->state_lock, RW_READER);
2084 if (hca->state != HCA_DETACHED) {
2085 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
2086 IBT_ACHAN_NO_FLAGS, &qp_attr, &qp->qp_hdl,
2087 &chan_sizes);
2088 } else {
2089 rw_exit(&hca->state_lock);
2090 return (RDMA_FAILED);
2091 }
2092 rw_exit(&hca->state_lock);
2093
2094 if (ibt_status != IBT_SUCCESS) {
2095 #ifdef DEBUG
2096 cmn_err(CE_WARN, "rib_conn_to_srv: alloc_rc_channel "
2097 "failed, ibt_status=%d.", ibt_status);
2098 #endif
2099 return (RDMA_FAILED);
2100 }
2101
2102 /* Connect to the Server */
2103 (void) bzero(&ret_args, sizeof (ret_args));
2104 mutex_enter(&qp->cb_lock);
2105 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
2106 IBT_BLOCKING, &chan_args, &ret_args);
2107 if (ibt_status != IBT_SUCCESS) {
2108 #ifdef DEBUG
2109 if (rib_debug)
2110 cmn_err(CE_WARN, "rib_conn_to_srv: open_rc_channel"
2111 " failed for qp %p, status=%d, "
2112 "ret_args.rc_status=%d\n",
2113 (void *)qp, ibt_status, ret_args.rc_status);
2114 #endif
2115 (void) ibt_free_channel(qp->qp_hdl);
2116 qp->qp_hdl = NULL;
2117 mutex_exit(&qp->cb_lock);
2118 if (refresh-- && ibt_status == IBT_CM_FAILURE &&
2119 ret_args.rc_status == IBT_CM_CONN_STALE) {
2120 /*
2121 * Got IBT_CM_CONN_STALE probably because of stale
2122 * data on the passive end of a channel that existed
2123 * prior to reboot. Retry establishing a channel
2124 * REFRESH_ATTEMPTS times, during which time the
2125 * stale conditions on the server might clear up.
2126 */
2127 goto refresh;
2128 }
2129 return (RDMA_FAILED);
2130 }
2131 mutex_exit(&qp->cb_lock);
2132 /*
2133	 * Set the channel's private data to qp so it is available in callbacks
2134 */
2135 ibt_set_chan_private(qp->qp_hdl, (void *)qp);
2136 return (RDMA_SUCCESS);
2137 }
2138
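/*
 * rib_ping_srv: check whether the server is reachable over IB by
 * building the service name "<IP-addr>::NFS" from the remote address
 * and doing a path lookup. On success, the HCA to use for the
 * connection is returned in *hca.
 */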
2139 rdma_stat
2140 rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca)
2141 {
2142 struct sockaddr_in *sin4;
2143 struct sockaddr_in6 *sin6;
2144 ibt_path_attr_t path_attr;
2145 ibt_path_info_t path;
2146 ibt_status_t ibt_status;
2147
2148 ASSERT(raddr->buf != NULL);
2149
2150 bzero(&path_attr, sizeof (ibt_path_attr_t));
2151 bzero(&path, sizeof (ibt_path_info_t));
2152
2153 /*
2154	 * Construct svc name
2155 */
2156 path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
2157 switch (addr_type) {
2158 case AF_INET:
2159 sin4 = (struct sockaddr_in *)raddr->buf;
2160 (void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
2161 IB_SVC_NAME_LEN);
2162 break;
2163
2164 case AF_INET6:
2165 sin6 = (struct sockaddr_in6 *)raddr->buf;
2166 (void) inet_ntop(AF_INET6, &sin6->sin6_addr,
2167 path_attr.pa_sname, IB_SVC_NAME_LEN);
2168 break;
2169
2170 default:
2171 #ifdef DEBUG
2172 if (rib_debug) {
2173 cmn_err(CE_WARN, "rib_ping_srv: Address not recognized\n");
2174 }
2175 #endif
2176 kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
2177 return (RDMA_INVAL);
2178 }
2179 (void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);
2180
2181 ibt_status = ibt_get_paths(rib_stat->ibt_clnt_hdl,
2182 IBT_PATH_NO_FLAGS, &path_attr, 1, &path, NULL);
2183 kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
2184 if (ibt_status != IBT_SUCCESS) {
2185 if (rib_debug > 1) {
2186 cmn_err(CE_WARN, "rib_ping_srv: ibt_get_paths FAILED!"
2187 " status=%d\n", ibt_status);
2188 }
2189 } else if (path.pi_hca_guid) {
2190 ASSERT(path.pi_hca_guid == rib_stat->hca->hca_guid);
2191 *hca = rib_stat->hca;
2192 return (RDMA_SUCCESS);
2193 }
2194 return (RDMA_FAILED);
2195 }
2196
2197 /*
2198 * Close channel, remove from connection list and
2199 * free up resources allocated for that channel.
2200 */
2201 rdma_stat
2202 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2203 {
2204 rib_qp_t *qp = ctoqp(conn);
2205 rib_hca_t *hca;
2206
2207 /*
2208 * c_ref == 0 and connection is in C_DISCONN_PEND
2209 */
2210 hca = qp->hca;
2211 if (conn_list != NULL)
2212 (void) rib_rm_conn(conn, conn_list);
2213 if (qp->qp_hdl != NULL) {
2214 /*
2215	 * If the channel has not been established,
2216 * ibt_flush_channel is called to flush outstanding WRs
2217 * on the Qs. Otherwise, ibt_close_rc_channel() is
2218 * called. The channel is then freed.
2219 */
2220 if (conn_list != NULL)
2221 (void) ibt_close_rc_channel(qp->qp_hdl,
2222 IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
2223 else
2224 (void) ibt_flush_channel(qp->qp_hdl);
2225
2226 mutex_enter(&qp->posted_rbufs_lock);
2227 while (qp->n_posted_rbufs)
2228 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2229 mutex_exit(&qp->posted_rbufs_lock);
2230 (void) ibt_free_channel(qp->qp_hdl);
2231 qp->qp_hdl = NULL;
2232 }
2233 ASSERT(qp->rdlist == NULL);
2234 if (qp->replylist != NULL) {
2235 (void) rib_rem_replylist(qp);
2236 }
2237
2238 cv_destroy(&qp->cb_conn_cv);
2239 cv_destroy(&qp->posted_rbufs_cv);
2240 mutex_destroy(&qp->cb_lock);
2241
2242 mutex_destroy(&qp->replylist_lock);
2243 mutex_destroy(&qp->posted_rbufs_lock);
2244 mutex_destroy(&qp->rdlist_lock);
2245
2246 cv_destroy(&conn->c_cv);
2247 mutex_destroy(&conn->c_lock);
2248
2249 if (conn->c_raddr.buf != NULL) {
2250 kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2251 }
2252 if (conn->c_laddr.buf != NULL) {
2253 kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2254 }
2255
2256 /*
2257 * Credit control cleanup.
2258 */
2259 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2260 rdma_clnt_cred_ctrl_t *cc_info;
2261 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2262 cv_destroy(&cc_info->clnt_cc_cv);
2263 }
2264
2265 kmem_free(qp, sizeof (rib_qp_t));
2266
2267 /*
2268	 * If the HCA has been DETACHED and both the srv and clnt conn
2269	 * lists are empty, then the HCA is no longer in use.
2270 */
2271 if (conn_list != NULL) {
2272 rw_enter(&hca->state_lock, RW_READER);
2273 if (hca->state == HCA_DETACHED) {
2274 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2275 if (hca->srv_conn_list.conn_hd == NULL) {
2276 rw_enter(&hca->cl_conn_list.conn_lock,
2277 RW_READER);
2278 if (hca->cl_conn_list.conn_hd == NULL) {
2279 mutex_enter(&hca->inuse_lock);
2280 hca->inuse = FALSE;
2281 cv_signal(&hca->cb_cv);
2282 mutex_exit(&hca->inuse_lock);
2283 }
2284 rw_exit(&hca->cl_conn_list.conn_lock);
2285 }
2286 rw_exit(&hca->srv_conn_list.conn_lock);
2287 }
2288 rw_exit(&hca->state_lock);
2289 }
2290
2291 num_clients--;
2292 return (RDMA_SUCCESS);
2293 }
2294
2295 #ifdef DYNAMIC_CREDIT_CONTROL
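/*
 * Return the current client count and the number of free receive
 * buffers in the HCA's recv pool; intended as input for dynamic
 * credit control on the server.
 */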
2296 void rib_get_resource_info(CONN *conn, int *current_clients, int *avail_bufs)
2297 {
2298 rib_qp_t *qp = ctoqp(conn);
2299 rib_hca_t *hca = qp->hca;
2300 rib_bufpool_t *rbp = NULL;
2301 bufpool_t *bp;
2302
2303 is_server = 1;
2304 rbp = hca->recv_pool;
2305
2306 if (rbp == NULL)
2307 *avail_bufs = 0;
2308 else {
2309 bp = rbp->bpool;
2310 *avail_bufs = bp->buffree;
2311 }
2312
2313 *current_clients = num_clients;
2314 }
2315 #endif
2316
2317 /*
2318  * Wait for a send completion notification. The send_wid is freed
2319  * only after a completion, successful or in error, has been
2320  * received.
2321 */
2322 static rdma_stat
2323 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2324 {
2325 clock_t timout, cv_wait_ret;
2326 rdma_stat error = RDMA_SUCCESS;
2327 int i;
2328
2329 /*
2330 * Wait for send to complete
2331 */
2332 ASSERT(wd != NULL);
2333 mutex_enter(&wd->sendwait_lock);
2334 if (wd->status == (uint_t)SEND_WAIT) {
2335 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2336 ddi_get_lbolt();
2337 if (qp->mode == RIB_SERVER) {
2338 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2339 &wd->sendwait_lock, timout)) > 0 &&
2340 wd->status == (uint_t)SEND_WAIT)
2341 ;
2342 switch (cv_wait_ret) {
2343 case -1: /* timeout */
2344 #ifdef DEBUG
2345 if (rib_debug > 2)
2346 cmn_err(CE_WARN, "rib_sendwait: "
2347 "timed out qp %p\n", (void *)qp);
2348 #endif
2349 wd->cv_sig = 0; /* no signal needed */
2350 error = RDMA_TIMEDOUT;
2351 break;
2352 default: /* got send completion */
2353 break;
2354 }
2355 } else {
2356 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2357 &wd->sendwait_lock, timout)) > 0 &&
2358 wd->status == (uint_t)SEND_WAIT)
2359 ;
2360 switch (cv_wait_ret) {
2361 case -1: /* timeout */
2362 #ifdef DEBUG
2363 if (rib_debug > 2)
2364 cmn_err(CE_WARN, "rib_sendwait: "
2365 "timed out qp %p\n", (void *)qp);
2366 #endif
2367 wd->cv_sig = 0; /* no signal needed */
2368 error = RDMA_TIMEDOUT;
2369 break;
2370 case 0: /* interrupted */
2371 #ifdef DEBUG
2372 if (rib_debug > 2)
2373 cmn_err(CE_NOTE, "rib_sendwait:"
2374 " interrupted on qp %p\n",
2375 (void *)qp);
2376 #endif
2377 wd->cv_sig = 0; /* no signal needed */
2378 error = RDMA_INTR;
2379 break;
2380 default: /* got send completion */
2381 break;
2382 }
2383 }
2384 }
2385
2386 if (wd->status != (uint_t)SEND_WAIT) {
2387 /* got send completion */
2388 if (wd->status != RDMA_SUCCESS) {
2389 error = wd->status;
2390 if (wd->status != RDMA_CONNLOST)
2391 error = RDMA_FAILED;
2392 }
2393 for (i = 0; i < wd->nsbufs; i++) {
2394 rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2395 (void *)(uintptr_t)wd->sbufaddr[i]);
2396 }
2397 mutex_exit(&wd->sendwait_lock);
2398 (void) rib_free_sendwait(wd);
2399 } else {
2400 mutex_exit(&wd->sendwait_lock);
2401 }
2402
2403 return (error);
2404 }
2405
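/*
 * Allocate and initialize a send work descriptor (send_wid). Its
 * address is used as the work request id; it is freed once the send
 * completes.
 */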
2406 static struct send_wid *
2407 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2408 {
2409 struct send_wid *wd;
2410
2411 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2412 wd->xid = xid;
2413 wd->cv_sig = cv_sig;
2414 wd->qp = qp;
2415 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2416 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2417 wd->status = (uint_t)SEND_WAIT;
2418
2419 return (wd);
2420 }
2421
2422 static int
2423 rib_free_sendwait(struct send_wid *wdesc)
2424 {
2425 cv_destroy(&wdesc->wait_cv);
2426 mutex_destroy(&wdesc->sendwait_lock);
2427 kmem_free(wdesc, sizeof (*wdesc));
2428
2429 return (0);
2430 }
2431
2432 static rdma_stat
2433 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2434 {
2435 mutex_enter(&qp->replylist_lock);
2436 if (rep != NULL) {
2437 (void) rib_remreply(qp, rep);
2438 mutex_exit(&qp->replylist_lock);
2439 return (RDMA_SUCCESS);
2440 }
2441 mutex_exit(&qp->replylist_lock);
2442 return (RDMA_FAILED);
2443 }
2444
2445 /*
2446 * Send buffers are freed here only in case of error in posting
2447 * on QP. If the post succeeded, the send buffers are freed upon
2448 * send completion in rib_sendwait() or in the scq_handler.
2449 */
2450 rdma_stat
2451 #if defined(ASYNC_SERVER_DEREG)
2452 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2453 	int send_sig, int cv_sig, caddr_t c, caddr_t c1, int l1, caddr_t c2, int l2, int l3, int l4, caddr_t *swid)
2454 #else
2455 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2456 int send_sig, int cv_sig, caddr_t *swid)
2457 #endif
2458 {
2459 struct send_wid *wdesc;
2460 struct clist *clp;
2461 ibt_status_t ibt_status = IBT_SUCCESS;
2462 rdma_stat ret = RDMA_SUCCESS;
2463 ibt_send_wr_t tx_wr;
2464 int i, nds;
2465 ibt_wr_ds_t sgl[DSEG_MAX];
2466 uint_t total_msg_size;
2467 rib_qp_t *qp = ctoqp(conn);
2468
2469 ASSERT(cl != NULL);
2470
2471 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2472
2473 nds = 0;
2474 total_msg_size = 0;
2475 clp = cl;
2476 while (clp != NULL) {
2477 if (nds >= DSEG_MAX) {
2478 cmn_err(CE_WARN, "rib_send_and_wait: DSEG_MAX"
2479 " too small!");
2480 return (RDMA_FAILED);
2481 }
2482 sgl[nds].ds_va = clp->c_saddr;
2483 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2484 sgl[nds].ds_len = clp->c_len;
2485 total_msg_size += clp->c_len;
2486 clp = clp->c_next;
2487 nds++;
2488 }
2489
2490 if (send_sig) {
2491 /* Set SEND_SIGNAL flag. */
2492 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2493 wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2494 *swid = (caddr_t)wdesc;
2495 } else {
2496 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2497 wdesc = rib_init_sendwait(msgid, 0, qp);
2498 *swid = (caddr_t)wdesc;
2499 }
2500 wdesc->nsbufs = nds;
2501 #if defined(ASYNC_SERVER_DEREG)
2502 wdesc->c = c;
2503 wdesc->c1 = c1;
2504 wdesc->c2 = c2;
2505 wdesc->l1 = l1;
2506 wdesc->l2 = l2;
2507 wdesc->wl = l3;
2508 wdesc->rl = l4;
2509 #endif
2510 for (i = 0; i < nds; i++) {
2511 wdesc->sbufaddr[i] = sgl[i].ds_va;
2512 }
2513
2514 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2515 tx_wr.wr_opcode = IBT_WRC_SEND;
2516 tx_wr.wr_trans = IBT_RC_SRV;
2517 tx_wr.wr_nds = nds;
2518 tx_wr.wr_sgl = sgl;
2519
2520 mutex_enter(&conn->c_lock);
2521 if (conn->c_state & C_CONNECTED) {
2522 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2523 }
2524 if (((conn->c_state & C_CONNECTED) == 0) ||
2525 ibt_status != IBT_SUCCESS) {
2526 mutex_exit(&conn->c_lock);
2527 for (i = 0; i < nds; i++) {
2528 rib_rbuf_free(conn, SEND_BUFFER,
2529 (void *)(uintptr_t)wdesc->sbufaddr[i]);
2530 }
2531 (void) rib_free_sendwait(wdesc);
2532 #ifdef DEBUG
2533 if (rib_debug && ibt_status != IBT_SUCCESS)
2534 cmn_err(CE_WARN, "rib_send_and_wait: ibt_post_send "
2535 "failed! wr_id %llx on qpn %p, status=%d!",
2536 (longlong_t)tx_wr.wr_id, (void *)qp,
2537 ibt_status);
2538 #endif
2539 return (RDMA_FAILED);
2540 }
2541 mutex_exit(&conn->c_lock);
2542
2543 if (send_sig) {
2544 if (cv_sig) {
2545 /*
2546 * cv_wait for send to complete.
2547 * We can fail due to a timeout or signal or
2548 * unsuccessful send.
2549 */
2550 ret = rib_sendwait(qp, wdesc);
2551 #ifdef DEBUG
2552 if (rib_debug > 2)
2553 if (ret != 0) {
2554 cmn_err(CE_WARN, "rib_send_and_wait: rib_sendwait "
2555 "FAILED, rdma stat=%d, wr_id %llx, qp %p!",
2556 ret, (longlong_t)tx_wr.wr_id, (void *)qp);
2557 }
2558 #endif
2559 return (ret);
2560 }
2561 }
2562
2563 return (RDMA_SUCCESS);
2564 }
2565
2566 #if defined (CLNT_INTERRUPT_COAL)
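/*
 * Client-side interrupt coalescing: only every (preposted_rbufs/2)th
 * send is posted signaled. Descriptors of the intervening unsignaled
 * sends are queued on qp->wd and released via rib_scq_free() after
 * the next signaled send completes.
 */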
2567 rdma_stat
2568 rib_send_bl(CONN *conn, struct clist *cl, uint32_t msgid)
2569 {
2570 rdma_stat ret;
2571 struct send_wid *sd, dlist;
2572 rib_qp_t *qp = ctoqp(conn);
2573 caddr_t wd;
2574 mutex_enter(&conn->c_lock);
2575 	if ((conn->c_count + 1) >= (preposted_rbufs / 2)) {
2576 conn->c_count = 0;
2577 dlist.forw = dlist.back = &dlist;
2578 		while (qp->wd.forw != &qp->wd) {
2579 sd = qp->wd.forw;
2580 remque(sd);
2581 			insque(sd, &dlist);
2582 }
2583 mutex_exit(&conn->c_lock);
2584 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2585 		while (dlist.forw != &dlist) {
2586 sd = dlist.forw;
2587 remque(dlist.forw);
2588 rib_scq_free((caddr_t)sd);
2589 }
2590 	} else {
2591 mutex_exit(&conn->c_lock);
2592 wd = 0;
2593 ret = rib_send_and_wait(conn, cl, msgid, 0, 0, &wd);
2594 mutex_enter(&conn->c_lock);
2595 		conn->c_count++;
2596 insque(wd, &qp->wd);
2597 mutex_exit(&conn->c_lock);
2598 }
2599 return (ret);
2600 }
2601 #endif
2602
2603 rdma_stat
2604 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2605 {
2606 	rdma_stat	ret;
	caddr_t		wd;
2607 	/* send-wait & cv_signal */
2608 #if defined(ASYNC_SERVER_DEREG)
2609 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, 0, 0, 0, 0, 0, 0, 0, &wd);
2610 #else
2611 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2612 #endif
2613 return (ret);
2614 }
2615
2616 #if defined(ASYNC_SERVER_DEREG)
2617 rdma_stat
2618 rib_send_nw(CONN *conn, struct clist *cl, uint32_t msgid, caddr_t c, caddr_t c1, int c2, caddr_t c3, int c4, int c5, int c6)
2619 {
2620 rdma_stat ret;
2621 	caddr_t		wid;
2622 	/* send-wait & cv_signal */
2623 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, c, c1, c2, c3, c4, c5, c6, &wid);
2624
2625 return (ret);
2626 }
2627 #endif
2628 /*
2629 * Server interface (svc_rdma_ksend).
2630 * Send RPC reply and wait for RDMA_DONE.
2631 */
2632 rdma_stat
2633 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2634 {
2635 rdma_stat ret = RDMA_SUCCESS;
2636 struct rdma_done_list *rd;
2637 clock_t timout, cv_wait_ret;
2638 	caddr_t		wid;
2639 rib_qp_t *qp = ctoqp(conn);
2640
2641 mutex_enter(&qp->rdlist_lock);
2642 rd = rdma_done_add(qp, msgid);
2643
2644 /* No cv_signal (whether send-wait or no-send-wait) */
2645 #if defined(ASYNC_SERVER_DEREG)
2646 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, 0, 0, 0, 0, 0, 0, 0, &wid);
2647 #else
2648 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, &wid);
2649 #endif
2650 if (ret != RDMA_SUCCESS) {
2651 #ifdef DEBUG
2652 cmn_err(CE_WARN, "rib_send_resp: send_and_wait "
2653 "failed, msgid %u, qp %p", msgid, (void *)qp);
2654 #endif
2655 rdma_done_rm(qp, rd);
2656 goto done;
2657 }
2658
2659 /*
2660 * Wait for RDMA_DONE from remote end
2661 */
2662 timout = drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2663 cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock,
2664 timout);
2665 rdma_done_rm(qp, rd);
2666 if (cv_wait_ret < 0) {
2667 #ifdef DEBUG
2668 if (rib_debug > 1) {
2669 cmn_err(CE_WARN, "rib_send_resp: RDMA_DONE not"
2670 " recv'd for qp %p, xid:%u\n",
2671 (void *)qp, msgid);
2672 }
2673 #endif
2674 ret = RDMA_TIMEDOUT;
2675 goto done;
2676 }
2677
2678 done:
2679 mutex_exit(&qp->rdlist_lock);
2680 return (ret);
2681 }
2682
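/*
 * Allocate a receive work descriptor (recv_wid) recording the buffer
 * address and RPC xid; its address is used as the work request id for
 * a client-side receive.
 */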
2683 static struct recv_wid *
2684 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2685 {
2686 struct recv_wid *rwid;
2687
2688 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2689 rwid->xid = msgid;
2690 rwid->addr = sgl->ds_va;
2691 rwid->qp = qp;
2692
2693 return (rwid);
2694 }
2695
2696 static void
2697 rib_free_wid(struct recv_wid *rwid)
2698 {
2699 kmem_free(rwid, sizeof (struct recv_wid));
2700 }
2701
2702 rdma_stat
2703 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2704 {
2705 rib_qp_t *qp = ctoqp(conn);
2706 struct clist *clp = cl;
2707 struct reply *rep;
2708 struct recv_wid *rwid;
2709 int nds;
2710 ibt_wr_ds_t sgl[DSEG_MAX];
2711 ibt_recv_wr_t recv_wr;
2712 rdma_stat ret;
2713 ibt_status_t ibt_status;
2714
2715 /*
2716 * rdma_clnt_postrecv uses RECV_BUFFER.
2717 */
2718
2719 nds = 0;
2720 while (cl != NULL) {
2721 if (nds >= DSEG_MAX) {
2722 cmn_err(CE_WARN, "rib_clnt_post: DSEG_MAX too small!");
2723 ret = RDMA_FAILED;
2724 goto done;
2725 }
2726 sgl[nds].ds_va = cl->c_saddr;
2727 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2728 sgl[nds].ds_len = cl->c_len;
2729 cl = cl->c_next;
2730 nds++;
2731 }
2732
2733 if (nds != 1) {
2734 cmn_err(CE_WARN, "rib_clnt_post: nds!=1\n");
2735 ret = RDMA_FAILED;
2736 goto done;
2737 }
2738 bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2739 recv_wr.wr_nds = nds;
2740 recv_wr.wr_sgl = sgl;
2741
2742 rwid = rib_create_wid(qp, &sgl[0], msgid);
2743 if (rwid) {
2744 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2745 } else {
2746 cmn_err(CE_WARN, "rib_clnt_post: out of memory");
2747 ret = RDMA_NORESOURCE;
2748 goto done;
2749 }
2750 rep = rib_addreplylist(qp, msgid);
2751 if (!rep) {
2752 cmn_err(CE_WARN, "rib_clnt_post: out of memory");
2753 rib_free_wid(rwid);
2754 ret = RDMA_NORESOURCE;
2755 goto done;
2756 }
2757
2758 mutex_enter(&conn->c_lock);
2759 if (conn->c_state & C_CONNECTED) {
2760 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2761 }
2762 if (((conn->c_state & C_CONNECTED) == 0) ||
2763 ibt_status != IBT_SUCCESS) {
2764 mutex_exit(&conn->c_lock);
2765 #ifdef DEBUG
2766 cmn_err(CE_WARN, "rib_clnt_post: QPN %p failed in "
2767 "ibt_post_recv(), msgid=%d, status=%d",
2768 (void *)qp, msgid, ibt_status);
2769 #endif
2770 rib_free_wid(rwid);
2771 (void) rib_rem_rep(qp, rep);
2772 ret = RDMA_FAILED;
2773 goto done;
2774 }
2775 mutex_exit(&conn->c_lock);
2776 return (RDMA_SUCCESS);
2777
2778 done:
2779 while (clp != NULL) {
2780 rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)clp->c_saddr);
2781 clp = clp->c_next;
2782 }
2783 return (ret);
2784 }
2785
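/*
 * Server side: post a single RECV_BUFFER on the QP; the svc_recv
 * descriptor's address is used as the work request id.
 */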
2786 rdma_stat
2787 rib_svc_post(CONN* conn, struct clist *cl)
2788 {
2789 rib_qp_t *qp = ctoqp(conn);
2790 struct svc_recv *s_recvp;
2791 int nds;
2792 ibt_wr_ds_t sgl[DSEG_MAX];
2793 ibt_recv_wr_t recv_wr;
2794 ibt_status_t ibt_status;
2795
2796 nds = 0;
2797 while (cl != NULL) {
2798 if (nds >= DSEG_MAX) {
2799 cmn_err(CE_WARN, "rib_svc_post: DSEG_MAX too small!");
2800 return (RDMA_FAILED);
2801 }
2802 sgl[nds].ds_va = cl->c_saddr;
2803 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2804 sgl[nds].ds_len = cl->c_len;
2805 cl = cl->c_next;
2806 nds++;
2807 }
2808
2809 if (nds != 1) {
2810 cmn_err(CE_WARN, "rib_svc_post: nds!=1\n");
2811 rib_rbuf_free(conn, RECV_BUFFER, (caddr_t)(uintptr_t)sgl[0].ds_va);
2812 return (RDMA_FAILED);
2813 }
2814 bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2815 recv_wr.wr_nds = nds;
2816 recv_wr.wr_sgl = sgl;
2817
2818 s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2819 /* Use s_recvp's addr as wr id */
2820 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2821 mutex_enter(&conn->c_lock);
2822 if (conn->c_state & C_CONNECTED) {
2823 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2824 }
2825 if (((conn->c_state & C_CONNECTED) == 0) ||
2826 ibt_status != IBT_SUCCESS) {
2827 mutex_exit(&conn->c_lock);
2828 #ifdef DEBUG
2829 cmn_err(CE_WARN, "rib_svc_post: QP %p failed in "
2830 "ibt_post_recv(), status=%d",
2831 (void *)qp, ibt_status);
2832 #endif
2833 rib_rbuf_free(conn, RECV_BUFFER,
2834 (caddr_t)(uintptr_t)sgl[0].ds_va);
2835 (void) rib_free_svc_recv(s_recvp);
2836 return (RDMA_FAILED);
2837 }
2838 mutex_exit(&conn->c_lock);
2839
2840 return (RDMA_SUCCESS);
2841 }
2842
2843 /* Client */
2844 rdma_stat
2845 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2846 {
2847
2848 return (rib_clnt_post(conn, cl, msgid));
2849 }
2850
2851 /* Server */
2852 rdma_stat
2853 rib_post_recv(CONN *conn, struct clist *cl)
2854 {
2855 rib_qp_t *qp = ctoqp(conn);
2856
2857 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2858 mutex_enter(&qp->posted_rbufs_lock);
2859 qp->n_posted_rbufs++;
2860 mutex_exit(&qp->posted_rbufs_lock);
2861 return (RDMA_SUCCESS);
2862 }
2863 return (RDMA_FAILED);
2864 }
2865
2866 /*
2867 * Client side only interface to "recv" the rpc reply buf
2868 * posted earlier by rib_post_resp(conn, cl, msgid).
2869 */
2870 rdma_stat
2871 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2872 {
2873 struct reply *rep = NULL;
2874 clock_t timout, cv_wait_ret;
2875 rdma_stat ret = RDMA_SUCCESS;
2876 rib_qp_t *qp = ctoqp(conn);
2877
2878 /*
2879 * Find the reply structure for this msgid
2880 */
2881 mutex_enter(&qp->replylist_lock);
2882
2883 for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2884 if (rep->xid == msgid)
2885 break;
2886 }
2887 if (rep != NULL) {
2888 /*
2889 * If message not yet received, wait.
2890 */
2891 if (rep->status == (uint_t)REPLY_WAIT) {
2892 timout = ddi_get_lbolt() +
2893 drv_usectohz(REPLY_WAIT_TIME * 1000000);
2894 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2895 &qp->replylist_lock, timout)) > 0 &&
2896 rep->status == (uint_t)REPLY_WAIT);
2897
2898 switch (cv_wait_ret) {
2899 case -1: /* timeout */
2900 ret = RDMA_TIMEDOUT;
2901 break;
2902 case 0:
2903 ret = RDMA_INTR;
2904 break;
2905 default:
2906 break;
2907 }
2908 }
2909
2910 if (rep->status == RDMA_SUCCESS) {
2911 struct clist *cl = NULL;
2912
2913 /*
2914 * Got message successfully
2915 */
2916 clist_add(&cl, 0, rep->bytes_xfer, NULL,
2917 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2918 *clp = cl;
2919 } else {
2920 if (rep->status != (uint_t)REPLY_WAIT) {
2921 /*
2922 * Got error in reply message. Free
2923 * recv buffer here.
2924 */
2925 ret = rep->status;
2926 rib_rbuf_free(conn, RECV_BUFFER,
2927 (caddr_t)(uintptr_t)rep->vaddr_cq);
2928 }
2929 }
2930 (void) rib_remreply(qp, rep);
2931 } else {
2932 /*
2933 * No matching reply structure found for given msgid on the
2934 * reply wait list.
2935 */
2936 ret = RDMA_INVAL;
2937 #ifdef DEBUG
2938 cmn_err(CE_WARN, "rib_recv: no matching reply for "
2939 "xid %u, qp %p\n", msgid, (void *)qp);
2940 #endif
2941 }
2942
2943 /*
2944 * Done.
2945 */
2946 mutex_exit(&qp->replylist_lock);
2947 return (ret);
2948 }
2949
2950 /*
2951 * RDMA write a buffer to the remote address.
2952 */
2953 rdma_stat
2954 rib_write(CONN *conn, struct clist *cl, int wait)
2955 {
2956 ibt_send_wr_t tx_wr;
2957 int cv_sig;
2958 ibt_wr_ds_t sgl[DSEG_MAX];
2959 struct send_wid *wdesc;
2960 ibt_status_t ibt_status;
2961 rdma_stat ret = RDMA_SUCCESS;
2962 rib_qp_t *qp = ctoqp(conn);
2963
2964 if (cl == NULL) {
2965 cmn_err(CE_WARN, "rib_write: NULL clist\n");
2966 return (RDMA_FAILED);
2967 }
2968
2969
2970 	while (cl != NULL) {
2971 		if (cl->c_len > 0) {
2972 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2973 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_daddr;
2974 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */
2975 sgl[0].ds_va = cl->c_saddr;
2976 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2977 sgl[0].ds_len = cl->c_len;
2978
2979 if (wait) {
2980 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2981 cv_sig = 1;
2982 } else {
2983 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2984 cv_sig = 0;
2985 }
2986
2987 wdesc = rib_init_sendwait(0, cv_sig, qp);
2988 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2989 tx_wr.wr_opcode = IBT_WRC_RDMAW;
2990 tx_wr.wr_trans = IBT_RC_SRV;
2991 tx_wr.wr_nds = 1;
2992 tx_wr.wr_sgl = sgl;
2993
2994 mutex_enter(&conn->c_lock);
2995 if (conn->c_state & C_CONNECTED) {
2996 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2997 }
2998 if (((conn->c_state & C_CONNECTED) == 0) ||
2999 ibt_status != IBT_SUCCESS) {
3000 mutex_exit(&conn->c_lock);
3001 (void) rib_free_sendwait(wdesc);
3002 return (RDMA_FAILED);
3003 }
3004 mutex_exit(&conn->c_lock);
3005
3006 /*
3007 * Wait for send to complete
3008 */
3009 if (wait) {
3010 ret = rib_sendwait(qp, wdesc);
3011 if (ret != 0) {
3012 return (ret);
3013 }
3014 }
3015 }
3016 cl = cl->c_next;
3017 }
3018 return (RDMA_SUCCESS);
3019 }
3020
3021 /*
3022 * RDMA Read a buffer from the remote address.
3023 */
3024 rdma_stat
3025 rib_read(CONN *conn, struct clist *cl, int wait)
3026 {
3027 ibt_send_wr_t rx_wr;
3028 int nds;
3029 int cv_sig;
3030 ibt_wr_ds_t sgl[DSEG_MAX]; /* is 2 sufficient? */
3031 struct send_wid *wdesc;
3032 ibt_status_t ibt_status = IBT_SUCCESS;
3033 rdma_stat ret = RDMA_SUCCESS;
3034 rib_qp_t *qp = ctoqp(conn);
3035
3036 if (cl == NULL) {
3037 cmn_err(CE_WARN, "rib_read: NULL clist\n");
3038 return (RDMA_FAILED);
3039 }
3040
3041 bzero(&rx_wr, sizeof (ibt_send_wr_t));
3042 /*
3043 * Remote address is at the head chunk item in list.
3044 */
3045 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_saddr;
3046 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; /* rkey */
3047
3048 nds = 0;
3049 while (cl != NULL) {
3050 if (nds >= DSEG_MAX) {
3051 cmn_err(CE_WARN, "rib_read: DSEG_MAX too small!");
3052 return (RDMA_FAILED);
3053 }
3054 sgl[nds].ds_va = cl->c_daddr;
3055 sgl[nds].ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
3056 sgl[nds].ds_len = cl->c_len;
3057 cl = cl->c_next;
3058 nds++;
3059 }
3060
3061 if (wait) {
3062 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
3063 cv_sig = 1;
3064 } else {
3065 rx_wr.wr_flags = IBT_WR_NO_FLAGS;
3066 cv_sig = 0;
3067 }
3068
3069 wdesc = rib_init_sendwait(0, cv_sig, qp);
3070 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
3071 rx_wr.wr_opcode = IBT_WRC_RDMAR;
3072 rx_wr.wr_trans = IBT_RC_SRV;
3073 rx_wr.wr_nds = nds;
3074 rx_wr.wr_sgl = sgl;
3075
3076 mutex_enter(&conn->c_lock);
3077 if (conn->c_state & C_CONNECTED) {
3078 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
3079 }
3080 if (((conn->c_state & C_CONNECTED) == 0) ||
3081 ibt_status != IBT_SUCCESS) {
3082 mutex_exit(&conn->c_lock);
3083 #ifdef DEBUG
3084 if (rib_debug && ibt_status != IBT_SUCCESS)
3085 cmn_err(CE_WARN, "rib_read: FAILED post_sending RDMAR"
3086 " wr_id %llx on qp %p, status=%d",
3087 (longlong_t)rx_wr.wr_id, (void *)qp,
3088 ibt_status);
3089 #endif
3090 (void) rib_free_sendwait(wdesc);
3091 return (RDMA_FAILED);
3092 }
3093 mutex_exit(&conn->c_lock);
3094
3095 /*
3096 * Wait for send to complete
3097 */
3098 if (wait) {
3099 ret = rib_sendwait(qp, wdesc);
3100 if (ret != 0) {
3101 return (ret);
3102 }
3103 }
3104
3105 return (RDMA_SUCCESS);
3106 }
3107
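/*
 * ATS address records carry an IPv4 address in the last four bytes of
 * ar_data with the leading bytes zeroed (see rib_register_ats), while
 * IPv6 addresses fill the whole field. A record whose leading bytes
 * are all zero is therefore treated as IPv4.
 */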
3108 int
3109 is_for_ipv4(ibt_ar_t *result)
3110 {
3111 int i, size = sizeof (struct in_addr);
3112 uint8_t zero = 0;
3113
3114 for (i = 0; i < (ATS_AR_DATA_LEN - size); i++)
3115 zero |= result->ar_data[i];
3116 return (zero == 0);
3117 }
3118
3119 /*
3120 * rib_srv_cm_handler()
3121 * Connection Manager callback to handle RC connection requests.
3122 */
3123 /* ARGSUSED */
3124 static ibt_cm_status_t
3125 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
3126 ibt_cm_return_args_t *ret_args, void *priv_data,
3127 ibt_priv_data_len_t len)
3128 {
3129 queue_t *q;
3130 rib_qp_t *qp;
3131 rpcib_state_t *ribstat;
3132 rib_hca_t *hca;
3133 rdma_stat status = RDMA_SUCCESS;
3134 int i;
3135 struct clist cl;
3136 rdma_buf_t rdbuf = {0};
3137 void *buf = NULL;
3138 ibt_cm_req_rcv_t cm_req_rcv;
3139 CONN *conn;
3140 ibt_status_t ibt_status;
3141 ibt_ar_t ar_query, ar_result;
3142 ib_gid_t sgid;
3143
3144
3145 ASSERT(any != NULL);
3146 ASSERT(event != NULL);
3147
3148 ribstat = (rpcib_state_t *)any;
3149 hca = (rib_hca_t *)ribstat->hca;
3150 ASSERT(hca != NULL);
3151
3152 /* got a connection request */
3153 switch (event->cm_type) {
3154 case IBT_CM_EVENT_REQ_RCV:
3155 /*
3156 * If the plugin is in the NO_ACCEPT state, bail out.
3157 */
3158 mutex_enter(&plugin_state_lock);
3159 if (plugin_state == NO_ACCEPT) {
3160 mutex_exit(&plugin_state_lock);
3161 return (IBT_CM_REJECT);
3162 }
3163 mutex_exit(&plugin_state_lock);
3164
3165 /*
3166 * Need to send a MRA MAD to CM so that it does not
3167		 * time out on us.
3168 */
3169 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
3170 event->cm_event.req.req_timeout * 8, NULL, 0);
3171
3172 mutex_enter(&rib_stat->open_hca_lock);
3173 q = rib_stat->q;
3174 mutex_exit(&rib_stat->open_hca_lock);
3175 status = rib_svc_create_chan(hca, (caddr_t)q,
3176 event->cm_event.req.req_prim_hca_port, &qp);
3177 if (status) {
3178 #ifdef DEBUG
3179 cmn_err(CE_WARN, "rib_srv_cm_handler: "
3180 "create_channel failed %d", status);
3181 #endif
3182 return (IBT_CM_REJECT);
3183 }
3184 cm_req_rcv = event->cm_event.req;
3185
3186 #ifdef DEBUG
3187 if (rib_debug > 2) {
3188 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3189 "server recv'ed IBT_CM_EVENT_REQ_RCV\n");
3190 cmn_err(CE_NOTE, "\t\t SID:%llx\n",
3191 (longlong_t)cm_req_rcv.req_service_id);
3192 cmn_err(CE_NOTE, "\t\t Local Port:%d\n",
3193 cm_req_rcv.req_prim_hca_port);
3194 cmn_err(CE_NOTE,
3195 "\t\t Remote GID:(prefix:%llx,guid:%llx)\n",
3196 (longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_prefix,
3197 (longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_guid);
3198 cmn_err(CE_NOTE, "\t\t Local GID:(prefix:%llx,guid:%llx)\n",
3199 (longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_prefix,
3200 (longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_guid);
3201 cmn_err(CE_NOTE, "\t\t Remote QPN:%u\n",
3202 cm_req_rcv.req_remote_qpn);
3203 cmn_err(CE_NOTE, "\t\t Remote Q_Key:%x\n",
3204 cm_req_rcv.req_remote_qkey);
3205 cmn_err(CE_NOTE, "\t\t Local QP %p (qp_hdl=%p)\n",
3206 (void *)qp, (void *)qp->qp_hdl);
3207 }
3208
3209 if (rib_debug > 2) {
3210 ibt_rc_chan_query_attr_t chan_attrs;
3211
3212 if (ibt_query_rc_channel(qp->qp_hdl, &chan_attrs)
3213 == IBT_SUCCESS) {
3214 cmn_err(CE_NOTE, "rib_svc_cm_handler: qp %p in "
3215 "CEP state %d\n", (void *)qp, chan_attrs.rc_state);
3216 }
3217 }
3218 #endif
3219
3220 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
3221 ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
3222 ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
3223 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
3224
3225 /*
3226	 * Pre-post RECV buffers
3227 */
3228 conn = qptoc(qp);
3229 for (i = 0; i < preposted_rbufs; i++) {
3230 bzero(&rdbuf, sizeof (rdbuf));
3231 rdbuf.type = RECV_BUFFER;
3232 buf = rib_rbuf_alloc(conn, &rdbuf);
3233 if (buf == NULL) {
3234 cmn_err(CE_WARN, "rib_svc_cm_handler: "
3235 "No RECV_BUFFER buf!\n");
3236 (void) rib_disconnect_channel(conn, NULL);
3237 return (IBT_CM_REJECT);
3238 }
3239
3240 bzero(&cl, sizeof (cl));
3241 cl.c_saddr = (uintptr_t)rdbuf.addr;
3242 cl.c_len = rdbuf.len;
3243 cl.c_smemhandle.mrc_lmr = rdbuf.handle.mrc_lmr; /* lkey */
3244 cl.c_next = NULL;
3245 status = rib_post_recv(conn, &cl);
3246 if (status != RDMA_SUCCESS) {
3247 cmn_err(CE_WARN, "rib_srv_cm_handler: failed "
3248 "posting RPC_REQ buf to qp %p!", (void *)qp);
3249 (void) rib_disconnect_channel(conn, NULL);
3250 return (IBT_CM_REJECT);
3251 }
3252 }
3253 (void) rib_add_connlist(conn, &hca->srv_conn_list);
3254
3255 /*
3256 * Get the address translation service record from ATS
3257 */
3258 rw_enter(&hca->state_lock, RW_READER);
3259 if (hca->state == HCA_DETACHED) {
3260 rw_exit(&hca->state_lock);
3261 return (IBT_CM_REJECT);
3262 }
3263 rw_exit(&hca->state_lock);
3264
3265 for (i = 0; i < hca->hca_nports; i++) {
3266 ibt_status = ibt_get_port_state(hca->hca_hdl, i+1,
3267 &sgid, NULL);
3268 if (ibt_status != IBT_SUCCESS) {
3269 if (rib_debug) {
3270 cmn_err(CE_WARN, "rib_srv_cm_handler: "
3271 "ibt_get_port_state FAILED!"
3272 "status = %d\n", ibt_status);
3273 }
3274 } else {
3275 /*
3276 * do ibt_query_ar()
3277 */
3278 bzero(&ar_query, sizeof (ar_query));
3279 bzero(&ar_result, sizeof (ar_result));
3280 ar_query.ar_gid = cm_req_rcv.req_prim_addr.av_dgid;
3281 ar_query.ar_pkey = event->cm_event.req.req_pkey;
3282 ibt_status = ibt_query_ar(&sgid, &ar_query,
3283 &ar_result);
3284 if (ibt_status != IBT_SUCCESS) {
3285 if (rib_debug) {
3286 cmn_err(CE_WARN, "rib_srv_cm_handler: "
3287 "ibt_query_ar FAILED!"
3288 "status = %d\n", ibt_status);
3289 }
3290 } else {
3291 conn = qptoc(qp);
3292
3293 if (is_for_ipv4(&ar_result)) {
3294 struct sockaddr_in *s;
3295 int sin_size = sizeof (struct sockaddr_in);
3296 int in_size = sizeof (struct in_addr);
3297 uint8_t *start_pos;
3298
3299 conn->c_raddr.maxlen =
3300 conn->c_raddr.len = sin_size;
3301 conn->c_raddr.buf = kmem_zalloc(sin_size,
3302 KM_SLEEP);
3303 s = (struct sockaddr_in *)conn->c_raddr.buf;
3304 s->sin_family = AF_INET;
3305 /*
3306 * For IPv4, the IP addr is stored in
3307 * the last four bytes of ar_data.
3308 */
3309 start_pos = ar_result.ar_data +
3310 ATS_AR_DATA_LEN - in_size;
3311 bcopy(start_pos, &s->sin_addr, in_size);
3312 if (rib_debug > 1) {
3313 char print_addr[INET_ADDRSTRLEN];
3314
3315 bzero(print_addr, INET_ADDRSTRLEN);
3316 (void) inet_ntop(AF_INET, &s->sin_addr,
3317 print_addr, INET_ADDRSTRLEN);
3318 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3319 "remote clnt_addr: %s\n", print_addr);
3320 }
3321 } else {
3322 struct sockaddr_in6 *s6;
3323 int sin6_size = sizeof (struct sockaddr_in6);
3324
3325 conn->c_raddr.maxlen =
3326 conn->c_raddr.len = sin6_size;
3327 conn->c_raddr.buf = kmem_zalloc(sin6_size,
3328 KM_SLEEP);
3329
3330 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
3331 s6->sin6_family = AF_INET6;
3332 /* sin6_addr is stored in ar_data */
3333 bcopy(ar_result.ar_data, &s6->sin6_addr,
3334 sizeof (struct in6_addr));
3335 if (rib_debug > 1) {
3336 char print_addr[INET6_ADDRSTRLEN];
3337
3338 bzero(print_addr, INET6_ADDRSTRLEN);
3339 (void) inet_ntop(AF_INET6, &s6->sin6_addr,
3340 print_addr, INET6_ADDRSTRLEN);
3341 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3342 "remote clnt_addr: %s\n", print_addr);
3343 }
3344 }
3345 return (IBT_CM_ACCEPT);
3346 }
3347 }
3348 }
3349 if (rib_debug > 1) {
3350 cmn_err(CE_WARN, "rib_srv_cm_handler: "
3351 "address record query failed!");
3352 }
3353 break;
3354
3355 case IBT_CM_EVENT_CONN_CLOSED:
3356 {
3357 CONN *conn;
3358 rib_qp_t *qp;
3359
3360 switch (event->cm_event.closed) {
3361 case IBT_CM_CLOSED_DREP_RCVD:
3362 case IBT_CM_CLOSED_DREQ_TIMEOUT:
3363 case IBT_CM_CLOSED_DUP:
3364 case IBT_CM_CLOSED_ABORT:
3365 case IBT_CM_CLOSED_ALREADY:
3366 /*
3367 * These cases indicate the local end initiated
3368 * the closing of the channel. Nothing to do here.
3369 */
3370 break;
3371 default:
3372 /*
3373 * Reason for CONN_CLOSED event must be one of
3374 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
3375			 * or IBT_CM_CLOSED_STALE. These indicate cases where
3376 * the remote end is closing the channel. In these
3377 * cases free the channel and transition to error
3378 * state
3379 */
3380 qp = ibt_get_chan_private(event->cm_channel);
3381 conn = qptoc(qp);
3382 mutex_enter(&conn->c_lock);
3383 if (conn->c_state == C_DISCONN_PEND) {
3384 mutex_exit(&conn->c_lock);
3385 break;
3386 }
3387 conn->c_state = C_ERROR;
3388
3389 /*
3390 * Free the rc_channel. Channel has already
3391 * transitioned to ERROR state and WRs have been
3392 * FLUSHED_ERR already.
3393 */
3394 (void) ibt_free_channel(qp->qp_hdl);
3395 qp->qp_hdl = NULL;
3396
3397 /*
3398 * Free the conn if c_ref goes down to 0
3399 */
3400 if (conn->c_ref == 0) {
3401 /*
3402 * Remove from list and free conn
3403 */
3404 conn->c_state = C_DISCONN_PEND;
3405 mutex_exit(&conn->c_lock);
3406 (void) rib_disconnect_channel(conn,
3407 &hca->srv_conn_list);
3408 } else {
3409 mutex_exit(&conn->c_lock);
3410 }
3411 #ifdef DEBUG
3412 if (rib_debug)
3413 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3414 " (CONN_CLOSED) channel disconnected");
3415 #endif
3416 break;
3417 }
3418 break;
3419 }
3420 case IBT_CM_EVENT_CONN_EST:
3421 /*
3422 * RTU received, hence connection established.
3423 */
3424 if (rib_debug > 1)
3425 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3426 "(CONN_EST) channel established");
3427 break;
3428
3429 default:
3430 if (rib_debug > 2) {
3431 /* Let CM handle the following events. */
3432 if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3433 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3434 "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3435 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3436 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3437 "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3438 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3439 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3440 "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3441 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3442 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3443 "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3444 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3445 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3446 "server recv'ed IBT_CM_EVENT_FAILURE\n");
3447 }
3448 }
3449 return (IBT_CM_REJECT);
3450 }
3451
3452 /* accept all other CM messages (i.e. let the CM handle them) */
3453 return (IBT_CM_ACCEPT);
3454 }
3455
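/*
 * Register an ATS address record (GID/PKey -> IP address) for each
 * active port on this HCA, so that peers can map the port back to an
 * IP address with ibt_query_ar().
 */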
3456 static rdma_stat
3457 rib_register_ats(rib_hca_t *hca)
3458 {
3459 ibt_hca_portinfo_t *port_infop;
3460 uint_t port_size;
3461 uint_t pki, i, num_ports, nbinds;
3462 ibt_status_t ibt_status;
3463 rib_service_t *new_service, *temp_srv;
3464 rpcib_ats_t *atsp;
3465 rpcib_ibd_insts_t ibds;
3466 ib_pkey_t pkey;
3467 ibt_ar_t ar; /* address record */
3468
3469 /*
3470 * Query all ports for the given HCA
3471 */
3472 rw_enter(&hca->state_lock, RW_READER);
3473 if (hca->state != HCA_DETACHED) {
3474 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3475 &num_ports, &port_size);
3476 rw_exit(&hca->state_lock);
3477 } else {
3478 rw_exit(&hca->state_lock);
3479 return (RDMA_FAILED);
3480 }
3481 if (ibt_status != IBT_SUCCESS) {
3482 #ifdef DEBUG
3483 if (rib_debug) {
3484 cmn_err(CE_NOTE, "rib_register_ats: FAILED in "
3485 "ibt_query_hca_ports, status = %d\n", ibt_status);
3486 }
3487 #endif
3488 return (RDMA_FAILED);
3489 }
3490
3491 #ifdef DEBUG
3492 if (rib_debug > 1) {
3493 cmn_err(CE_NOTE, "rib_register_ats: Ports detected "
3494 "%d\n", num_ports);
3495
3496 for (i = 0; i < num_ports; i++) {
3497 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3498 cmn_err(CE_WARN, "rib_register_ats "
3499 "Port #: %d INACTIVE\n", i+1);
3500 } else if (port_infop[i].p_linkstate ==
3501 IBT_PORT_ACTIVE) {
3502 cmn_err(CE_NOTE, "rib_register_ats "
3503 "Port #: %d ACTIVE\n", i+1);
3504 }
3505 }
3506 }
3507 #endif
3508
3509 ibds.rib_ibd_alloc = N_IBD_INSTANCES;
3510 ibds.rib_ibd_cnt = 0;
3511 ibds.rib_ats = (rpcib_ats_t *)kmem_zalloc(ibds.rib_ibd_alloc *
3512 sizeof (rpcib_ats_t), KM_SLEEP);
3513 rib_get_ibd_insts(&ibds);
3514
3515 if (ibds.rib_ibd_cnt == 0) {
3516 kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc *
3517 sizeof (rpcib_ats_t));
3518 ibt_free_portinfo(port_infop, port_size);
3519 return (RDMA_FAILED);
3520 }
3521
3522 /*
3523 * Get the IP addresses of active ports and
3524 * register them with ATS. IPv4 addresses
3525 * have precedence over IPv6 addresses.
3526 */
3527 if (get_ibd_ipaddr(&ibds) != 0) {
3528 #ifdef DEBUG
3529 if (rib_debug > 1) {
3530 cmn_err(CE_WARN, "rib_register_ats: "
3531 "get_ibd_ipaddr failed");
3532 }
3533 #endif
3534 kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc *
3535 sizeof (rpcib_ats_t));
3536 ibt_free_portinfo(port_infop, port_size);
3537 return (RDMA_FAILED);
3538 }
3539
3540 /*
3541 * Start ATS registration for active ports on this HCA.
3542 */
3543 rw_enter(&hca->service_list_lock, RW_WRITER);
3544 nbinds = 0;
3545 new_service = NULL;
3546 for (i = 0; i < num_ports; i++) {
3547 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3548 continue;
3549
3550 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3551 pkey = port_infop[i].p_pkey_tbl[pki];
3552 if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) {
3553 ar.ar_gid = port_infop[i].p_sgid_tbl[0];
3554 ar.ar_pkey = pkey;
3555 atsp = get_ibd_entry(&ar.ar_gid, pkey, &ibds);
3556 if (atsp == NULL)
3557 continue;
3558 /*
3559 * store the sin[6]_addr in ar_data
3560 */
3561 (void) bzero(ar.ar_data, ATS_AR_DATA_LEN);
3562 if (atsp->ras_inet_type == AF_INET) {
3563 uint8_t *start_pos;
3564
3565 /*
3566 * The ipv4 addr goes into the last
3567 * four bytes of ar_data.
3568 */
3569 start_pos = ar.ar_data + ATS_AR_DATA_LEN -
3570 sizeof (struct in_addr);
3571 bcopy(&atsp->ras_sin.sin_addr, start_pos,
3572 sizeof (struct in_addr));
3573 } else if (atsp->ras_inet_type == AF_INET6) {
3574 bcopy(&atsp->ras_sin6.sin6_addr, ar.ar_data,
3575 sizeof (struct in6_addr));
3576 } else
3577 continue;
3578
3579 ibt_status = ibt_register_ar(hca->ibt_clnt_hdl, &ar);
3580 if (ibt_status == IBT_SUCCESS) {
3581 #ifdef DEBUG
3582 if (rib_debug > 1) {
3583 cmn_err(CE_WARN, "rib_register_ats: "
3584 "ibt_register_ar OK on port %d", i+1);
3585 }
3586 #endif
3587 /*
3588 * Allocate and prepare a service entry
3589 */
3590 new_service = kmem_zalloc(sizeof (rib_service_t),
3591 KM_SLEEP);
3592 new_service->srv_port = i + 1;
3593 new_service->srv_ar = ar;
3594 new_service->srv_next = NULL;
3595
3596 /*
3597 * Add to the service list for this HCA
3598 */
3599 new_service->srv_next = hca->ats_list;
3600 hca->ats_list = new_service;
3601 new_service = NULL;
3602					nbinds++;
3603 } else {
3604 #ifdef DEBUG
3605 if (rib_debug > 1) {
3606 cmn_err(CE_WARN, "rib_register_ats: "
3607 "ibt_register_ar FAILED on port %d", i+1);
3608 }
3609 #endif
3610 }
3611 }
3612 }
3613 }
3614
3615 #ifdef DEBUG
3616 if (rib_debug > 1) {
3617 for (temp_srv = hca->ats_list; temp_srv != NULL;
3618 temp_srv = temp_srv->srv_next) {
3619 cmn_err(CE_NOTE, "Service: ATS, active on"
3620 " port: %d\n", temp_srv->srv_port);
3621 }
3622 }
3623 #endif
3624
3625 rw_exit(&hca->service_list_lock);
3626 kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc * sizeof (rpcib_ats_t));
3627 ibt_free_portinfo(port_infop, port_size);
3628
3629 if (nbinds == 0) {
3630 #ifdef DEBUG
3631 if (rib_debug > 1) {
3632 cmn_err(CE_WARN, "rib_register_ats FAILED!\n");
3633 }
3634 #endif
3635 return (RDMA_FAILED);
3636 }
3637 return (RDMA_SUCCESS);
3638 }
3639
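/*
 * Register the given service type (currently only NFS) with the IBT
 * CM and bind it to every active port/pkey on this HCA, using each of
 * the system's IP addresses (except loopback) as the service name
 * "<IP-addr>::NFS".
 */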
3640 static rdma_stat
3641 rib_register_service(rib_hca_t *hca, int service_type)
3642 {
3643 ibt_srv_desc_t sdesc;
3644 ibt_srv_bind_t sbind;
3645 ibt_hca_portinfo_t *port_infop;
3646 ib_svc_id_t srv_id;
3647 ibt_srv_hdl_t srv_hdl;
3648 uint_t port_size;
3649 uint_t pki, i, j, num_ports, nbinds;
3650 ibt_status_t ibt_status;
3651 char **addrs;
3652 int addr_count;
3653 rib_service_t *new_service, *temp_srv;
3654 ib_pkey_t pkey;
3655
3656 /*
3657 * Query all ports for the given HCA
3658 */
3659 rw_enter(&hca->state_lock, RW_READER);
3660 if (hca->state != HCA_DETACHED) {
3661 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3662 &num_ports, &port_size);
3663 rw_exit(&hca->state_lock);
3664 } else {
3665 rw_exit(&hca->state_lock);
3666 return (RDMA_FAILED);
3667 }
3668 if (ibt_status != IBT_SUCCESS) {
3669 #ifdef DEBUG
3670 cmn_err(CE_NOTE, "rib_register_service: FAILED in "
3671 "ibt_query_hca_ports, status = %d\n", ibt_status);
3672 #endif
3673 return (RDMA_FAILED);
3674 }
3675
3676 #ifdef DEBUG
3677 if (rib_debug > 1) {
3678 cmn_err(CE_NOTE, "rib_register_service: Ports detected "
3679 "%d\n", num_ports);
3680
3681 for (i = 0; i < num_ports; i++) {
3682 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3683 cmn_err(CE_WARN, "rib_register_service "
3684 "Port #: %d INACTIVE\n", i+1);
3685 } else if (port_infop[i].p_linkstate ==
3686 IBT_PORT_ACTIVE) {
3687 cmn_err(CE_NOTE, "rib_register_service "
3688 "Port #: %d ACTIVE\n", i+1);
3689 }
3690 }
3691 }
3692 #endif
3693 /*
3694 * Get all the IP addresses on this system to register the
3695 * given "service type" on all DNS recognized IP addrs.
3696  * Each service type such as NFS will have all the system's
3697 * IP addresses as its different names. For now the only
3698 * type of service we support in RPCIB is NFS.
3699 */
3700 addrs = get_ip_addrs(&addr_count);
3701 if (addrs == NULL) {
3702 #ifdef DEBUG
3703 if (rib_debug) {
3704 cmn_err(CE_WARN, "rib_register_service: "
3705 "get_ip_addrs failed\n");
3706 }
3707 #endif
3708 ibt_free_portinfo(port_infop, port_size);
3709 return (RDMA_FAILED);
3710 }
3711
3712 #ifdef DEBUG
3713 if (rib_debug > 1) {
3714 for (i = 0; i < addr_count; i++)
3715 cmn_err(CE_NOTE, "addr %d: %s\n", i, addrs[i]);
3716 }
3717 #endif
3718
3719 rw_enter(&hca->service_list_lock, RW_WRITER);
3720 /*
3721	 * Start registering the service and binding it to the
3722	 * active ports on this HCA.
3723 */
3724 nbinds = 0;
3725 new_service = NULL;
3726
3727 /*
3728 * We use IP addresses as the service names for
3729 * service registration. Register each of them
3730 * with CM to obtain a svc_id and svc_hdl. We do not
3731  * register the service with the machine's loopback address.
3732 */
3733 for (j = 1; j < addr_count; j++) {
3734 (void) bzero(&srv_id, sizeof (ib_svc_id_t));
3735 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3736 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3737
3738 sdesc.sd_handler = rib_srv_cm_handler;
3739 sdesc.sd_flags = 0;
3740
3741 ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3742 &sdesc, 0, 1, &srv_hdl, &srv_id);
3743 if (ibt_status != IBT_SUCCESS) {
3744 #ifdef DEBUG
3745 if (rib_debug) {
3746 cmn_err(CE_WARN, "rib_register_service: "
3747 "ibt_register_service FAILED, status "
3748 "= %d\n", ibt_status);
3749 }
3750 #endif
3751 /*
3752 * No need to go on, since we failed to obtain
3753 * a srv_id and srv_hdl. Move on to the next
3754 * IP addr as a service name.
3755 */
3756 continue;
3757 }
3758 for (i = 0; i < num_ports; i++) {
3759 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3760 continue;
3761
3762 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3763 pkey = port_infop[i].p_pkey_tbl[pki];
3764 if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) {
3765
3766 /*
3767 * Allocate and prepare a service entry
3768 */
3769 new_service = kmem_zalloc(1 * sizeof (rib_service_t),
3770 KM_SLEEP);
3771 new_service->srv_type = service_type;
3772 new_service->srv_port = i + 1;
3773 new_service->srv_id = srv_id;
3774 new_service->srv_hdl = srv_hdl;
3775 new_service->srv_sbind_hdl = kmem_zalloc(1 *
3776 sizeof (ibt_sbind_hdl_t), KM_SLEEP);
3777
3778 new_service->srv_name = kmem_zalloc(IB_SVC_NAME_LEN,
3779 KM_SLEEP);
3780 (void) bcopy(addrs[j], new_service->srv_name,
3781 IB_SVC_NAME_LEN);
3782 (void) strlcat(new_service->srv_name, "::NFS",
3783 IB_SVC_NAME_LEN);
3784 new_service->srv_next = NULL;
3785
3786 /*
3787 * Bind the service, specified by the IP address,
3788 * to the port/pkey using the srv_hdl returned
3789 * from ibt_register_service().
3790 */
3791 (void) bzero(&sbind, sizeof (ibt_srv_bind_t));
3792 sbind.sb_pkey = pkey;
3793 sbind.sb_lease = 0xFFFFFFFF;
3794 sbind.sb_key[0] = NFS_SEC_KEY0;
3795 sbind.sb_key[1] = NFS_SEC_KEY1;
3796 sbind.sb_name = new_service->srv_name;
3797
3798 #ifdef DEBUG
3799 if (rib_debug > 1) {
3800 cmn_err(CE_NOTE, "rib_register_service: "
3801 "binding service using name: %s\n",
3802 sbind.sb_name);
3803 }
3804 #endif
3805 ibt_status = ibt_bind_service(srv_hdl,
3806 port_infop[i].p_sgid_tbl[0], &sbind, rib_stat,
3807 new_service->srv_sbind_hdl);
3808 if (ibt_status != IBT_SUCCESS) {
3809 #ifdef DEBUG
3810 if (rib_debug) {
3811 cmn_err(CE_WARN, "rib_register_service: FAILED"
3812 " in ibt_bind_service, status = %d\n",
3813 ibt_status);
3814 }
3815 #endif
3816 kmem_free(new_service->srv_sbind_hdl,
3817 sizeof (ibt_sbind_hdl_t));
3818 kmem_free(new_service->srv_name,
3819 IB_SVC_NAME_LEN);
3820 kmem_free(new_service,
3821 sizeof (rib_service_t));
3822 new_service = NULL;
3823 continue;
3824 }
3825 #ifdef DEBUG
3826 if (rib_debug > 1) {
3827 if (ibt_status == IBT_SUCCESS)
3828 cmn_err(CE_NOTE, "rib_regstr_service: "
3829 "Serv: %s REGISTERED on port: %d",
3830 sbind.sb_name, i+1);
3831 }
3832 #endif
3833 /*
3834 * Add to the service list for this HCA
3835 */
3836 new_service->srv_next = hca->service_list;
3837 hca->service_list = new_service;
3838 new_service = NULL;
3839					nbinds++;
3840 }
3841 }
3842 }
3843 }
3844 rw_exit(&hca->service_list_lock);
3845
3846 #ifdef DEBUG
3847 if (rib_debug > 1) {
3848 /*
3849 * Change this print to a more generic one, as rpcib
3850 * is supposed to handle multiple service types.
3851 */
3852 for (temp_srv = hca->service_list; temp_srv != NULL;
3853 temp_srv = temp_srv->srv_next) {
3854 cmn_err(CE_NOTE, "NFS-IB, active on port:"
3855 " %d\n"
3856 "Using name: %s", temp_srv->srv_port,
3857 temp_srv->srv_name);
3858 }
3859 }
3860 #endif
3861
3862 ibt_free_portinfo(port_infop, port_size);
3863 for (i = 0; i < addr_count; i++) {
3864 if (addrs[i])
3865 kmem_free(addrs[i], IB_SVC_NAME_LEN);
3866 }
3867 kmem_free(addrs, addr_count * sizeof (char *));
3868
3869 if (nbinds == 0) {
3870 #ifdef DEBUG
3871 if (rib_debug) {
3872 cmn_err(CE_WARN, "rib_register_service: "
3873 "bind_service FAILED!\n");
3874 }
3875 #endif
3876 return (RDMA_FAILED);
3877 } else {
3878 /*
3879		 * Put this plugin into accept state, since at least
3880 * one registration was successful.
3881 */
3882 mutex_enter(&plugin_state_lock);
3883 plugin_state = ACCEPT;
3884 mutex_exit(&plugin_state_lock);
3885 return (RDMA_SUCCESS);
3886 }
3887 }
3888
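/*
 * rib_listen: start the RDMA-IB listener for KRPC. Registers with ATS
 * (once per HCA) and then registers and binds the NFS service; on
 * success rd->active is set and rd->err_code holds the status.
 */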
3889 void
3890 rib_listen(struct rdma_svc_data *rd)
3891 {
3892 rdma_stat status = RDMA_SUCCESS;
3893
3894 rd->active = 0;
3895 rd->err_code = RDMA_FAILED;
3896
3897 /*
3898 * First check if a hca is still attached
3899 */
3900 rw_enter(&rib_stat->hca->state_lock, RW_READER);
3901 if (rib_stat->hca->state != HCA_INITED) {
3902 rw_exit(&rib_stat->hca->state_lock);
3903 return;
3904 }
3905 rw_exit(&rib_stat->hca->state_lock);
3906
3907 rib_stat->q = &rd->q;
3908 /*
3909 * Register the Address translation service
3910 */
3911 mutex_enter(&rib_stat->open_hca_lock);
3912 if (ats_running == 0) {
3913 if (rib_register_ats(rib_stat->hca) != RDMA_SUCCESS) {
3914 #ifdef DEBUG
3915 if (rib_debug) {
3916 cmn_err(CE_WARN,
3917 "rib_listen(): ats registration failed!");
3918 }
3919 #endif
3920 mutex_exit(&rib_stat->open_hca_lock);
3921 return;
3922 } else {
3923 ats_running = 1;
3924 }
3925 }
3926 mutex_exit(&rib_stat->open_hca_lock);
3927
3928 /*
3929	 * Right now the only service type is NFS, so that value is
3930	 * hard-coded here. Ideally the service type should be passed
3931	 * down in rdma_svc_data.
3932 */
3933 rib_stat->service_type = NFS;
3934 status = rib_register_service(rib_stat->hca, NFS);
3935 if (status != RDMA_SUCCESS) {
3936 rd->err_code = status;
3937 return;
3938 }
3939 /*
3940	 * Service is now active on an HCA; rd->err_code carries the
3941	 * detailed status.
3942 */
3943 rd->active = 1;
3944 rd->err_code = status;
3945 }
3946
3947 /* XXXX */
3948 /* ARGSUSED */
3949 static void
3950 rib_listen_stop(struct rdma_svc_data *svcdata)
3951 {
3952 rib_hca_t *hca;
3953
3954 /*
3955	 * KRPC called the RDMATF to stop the listeners. This means we
3956	 * stop passing incoming (received) requests to the KRPC master
3957	 * transport handle for RDMA-IB. It also means that the master
3958	 * transport handle responsible for us is going away.
3959 */
3960 mutex_enter(&plugin_state_lock);
3961 plugin_state = NO_ACCEPT;
3962 if (svcdata != NULL)
3963 svcdata->active = 0;
3964 mutex_exit(&plugin_state_lock);
3965
3966 /*
3967 * First check if a hca is still attached
3968 */
3969 hca = rib_stat->hca;
3970 rw_enter(&hca->state_lock, RW_READER);
3971 if (hca->state != HCA_INITED) {
3972 rw_exit(&hca->state_lock);
3973 return;
3974 }
3975 rib_stop_services(hca);
3976 rw_exit(&hca->state_lock);
3977 }
3978
3979 /*
3980 * Traverse the HCA's service list to unbind and deregister services.
3981 * Instead of unbinding the service for a service handle by
3982 * calling ibt_unbind_service() for each port/pkey, we unbind
3983 * all the services for the service handle by making only one
3984 * call to ibt_unbind_all_services(). Then, we deregister the
3985 * service for the service handle.
3986 *
3987 * When traversing the entries in service_list, we compare the
3988 * srv_hdl of the current entry with that of the next. If they
3989 * are different or if the next entry is NULL, the current entry
3990 * marks the last binding of the service handle. In this case,
3991 * call ibt_unbind_all_services() and deregister the service for
3992 * the service handle. If they are the same, the current and the
3993 * next entries are bound to the same service handle. In this
3994 * case, move on to the next entry.
3995 */
3996 static void
3997 rib_stop_services(rib_hca_t *hca)
3998 {
3999 rib_service_t *srv_list, *to_remove;
4000 ibt_status_t ibt_status;
4001
4002 /*
4003 * unbind and deregister the services for this service type.
4004 * Right now there is only one service type. In future it will
4005 * be passed down to this function.
4006 */
4007 rw_enter(&hca->service_list_lock, RW_WRITER);
4008 srv_list = hca->service_list;
4009 while (srv_list != NULL) {
4010 to_remove = srv_list;
4011 srv_list = to_remove->srv_next;
4012 if (srv_list == NULL || bcmp(to_remove->srv_hdl,
4013 srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) {
4014
4015 ibt_status = ibt_unbind_all_services(to_remove->srv_hdl);
4016 if (ibt_status != IBT_SUCCESS) {
4017 cmn_err(CE_WARN, "rib_listen_stop: "
4018 "ibt_unbind_all_services FAILED"
4019 " status: %d\n", ibt_status);
4020 }
4021
4022 ibt_status =
4023 ibt_deregister_service(hca->ibt_clnt_hdl,
4024 to_remove->srv_hdl);
4025 if (ibt_status != IBT_SUCCESS) {
4026 cmn_err(CE_WARN, "rib_listen_stop: "
4027 "ibt_deregister_service FAILED"
4028 " status: %d\n", ibt_status);
4029 }
4030
4031 #ifdef DEBUG
4032 if (rib_debug > 1) {
4033 if (ibt_status == IBT_SUCCESS)
4034 cmn_err(CE_NOTE, "rib_listen_stop: "
4035 "Successfully stopped and"
4036 " UNREGISTERED service: %s\n",
4037 to_remove->srv_name);
4038 }
4039 #endif
4040 }
4041 kmem_free(to_remove->srv_name, IB_SVC_NAME_LEN);
4042 kmem_free(to_remove->srv_sbind_hdl,
4043 sizeof (ibt_sbind_hdl_t));
4044
4045 kmem_free(to_remove, sizeof (rib_service_t));
4046 }
4047 hca->service_list = NULL;
4048 rw_exit(&hca->service_list_lock);
4049 }
4050
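/*
 * Allocate and initialize a svc_recv tracking structure for a
 * receive buffer posted on a server-side QP.
 */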
4051 static struct svc_recv *
4052 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
4053 {
4054 struct svc_recv *recvp;
4055
4056 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
4057 recvp->vaddr = sgl->ds_va;
4058 recvp->qp = qp;
4059 recvp->bytes_xfer = 0;
4060 return (recvp);
4061 }
4062
4063 static int
4064 rib_free_svc_recv(struct svc_recv *recvp)
4065 {
4066 kmem_free(recvp, sizeof (*recvp));
4067
4068 return (0);
4069 }
4070
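/*
 * Allocate a reply tracking entry for the given XID and insert it at
 * the head of the QP's reply list. The sender waits on rep->wait_cv
 * until the matching reply arrives.
 */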
4071 static struct reply *
4072 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
4073 {
4074 struct reply *rep;
4075
4076
4077 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
4078 if (rep == NULL) {
4080 cmn_err(CE_WARN, "rib_addreplylist: no memory\n");
4081 return (NULL);
4082 }
4083 rep->xid = msgid;
4084 rep->vaddr_cq = NULL;
4085 rep->bytes_xfer = 0;
4086 rep->status = (uint_t)REPLY_WAIT;
4087 rep->prev = NULL;
4088 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
4089
4090 mutex_enter(&qp->replylist_lock);
4091 if (qp->replylist) {
4092 rep->next = qp->replylist;
4093 qp->replylist->prev = rep;
4094 }
4095 qp->rep_list_size++;
4096 if (rib_debug > 1)
4097 cmn_err(CE_NOTE, "rib_addreplylist: qp:%p, rep_list_size:%d\n",
4098 (void *)qp, qp->rep_list_size);
4099 qp->replylist = rep;
4100 mutex_exit(&qp->replylist_lock);
4101
4102 return (rep);
4103 }
4104
4105 static rdma_stat
4106 rib_rem_replylist(rib_qp_t *qp)
4107 {
4108 struct reply *r, *n;
4109
4110 mutex_enter(&qp->replylist_lock);
4111 for (r = qp->replylist; r != NULL; r = n) {
4112 n = r->next;
4113 (void) rib_remreply(qp, r);
4114 }
4115 mutex_exit(&qp->replylist_lock);
4116
4117 return (RDMA_SUCCESS);
4118 }
4119
4120 static int
4121 rib_remreply(rib_qp_t *qp, struct reply *rep)
4122 {
4123
4124 ASSERT(MUTEX_HELD(&qp->replylist_lock));
4125 if (rep->prev) {
4126 rep->prev->next = rep->next;
4127 }
4128 if (rep->next) {
4129 rep->next->prev = rep->prev;
4130 }
4131 if (qp->replylist == rep)
4132 qp->replylist = rep->next;
4133
4134 cv_destroy(&rep->wait_cv);
4135 qp->rep_list_size--;
4136 if (rib_debug > 1)
4137 cmn_err(CE_NOTE, "rib_remreply: qp:%p, rep_list_size:%d\n",
4138 (void *)qp, qp->rep_list_size);
4139
4140 kmem_free(rep, sizeof (*rep));
4141
4142 return (0);
4143 }
4144
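/*
 * Register a caller-supplied buffer with this connection's HCA and
 * return the local/remote keys (and the FMR map handle when
 * IB_FMR_SUP is built in) through buf_handle.
 */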
4145 rdma_stat
4146 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
4147 struct mrc *buf_handle)
4148 {
4149 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
4150 #ifdef IB_FMR_SUP
4151 ibt_pmr_desc_t pmr_desc; /* vaddr, lkey, rkey */
4152 ibt_ma_hdl_t ma_hdl = NULL;
4153 #endif
4154 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
4155 rdma_stat status;
4156 rib_hca_t *hca = (ctoqp(conn))->hca;
4157
4158 /*
4159 * Note: ALL buffer pools use the same memory type RDMARW.
4160 */
4161 #ifdef IB_FMR_SUP
4162 status = rib_reg_mem_fmr(hca, adsp, buf, buflen, 0, &mr_hdl, &ma_hdl,
4163 &pmr_desc);
4164 if (status == RDMA_SUCCESS) {
4165 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
4166 buf_handle->mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
4167 buf_handle->mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
4168 buf_handle->mrc_lma = (uintptr_t)ma_hdl;
4169 goto ret_stat;
4170 } else {
4171 buf_handle->mrc_linfo = NULL;
4172 buf_handle->mrc_lma = NULL;
4173 buf_handle->mrc_lmr = 0;
4174 buf_handle->mrc_rmr = 0;
4175 }
4176 #endif
4177 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
4178 if (status == RDMA_SUCCESS) {
4179 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
4180 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
4181 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
4182 } else {
4183 buf_handle->mrc_linfo = NULL;
4184 buf_handle->mrc_lmr = 0;
4185 buf_handle->mrc_rmr = 0;
4186 }
4187 ret_stat:
4188 return (status);
4189 }
4190
4191 #ifdef IB_FMR_SUP
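/*
 * Register a buffer through the HCA's FMR pool: build the physical
 * buffer list with ibt_map_mem_area() and register it with
 * ibt_register_physical_fmr(). Only kernel mappings are supported;
 * a non-NULL address space pointer fails the registration.
 */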
4192 static rdma_stat
4193 rib_reg_mem_fmr(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, ibt_mr_flags_t spec,
4194 ibt_mr_hdl_t *mr_hdlp, ibt_ma_hdl_t *ma_hdlp, ibt_pmr_desc_t *pmr_descp)
4195 {
4196 ibt_va_attr_t va_attr;
4197 ibt_phys_buf_t *paddr_list;
4198 uint_t paddr_list_len, num_paddr;
4199 size_t buf_sz = 0;
4200 ibt_pmr_attr_t pmr_attr;
4201 ib_memlen_t paddr_offset;
4202 ibt_status_t ibt_status;
4203 uint_t h_page_sz;
4204	if (adsp)
4205		return (RDMA_FAILED);
4206 bzero(&va_attr, sizeof (ibt_va_attr_t));
4207 va_attr.va_vaddr = (ib_vaddr_t)buf;
4208 va_attr.va_len = size;
4209 va_attr.va_as = (struct as *)(caddr_t)adsp;
4210 va_attr.va_flags = IBT_VA_FMR | IBT_VA_SLEEP;
4211 if (spec == IBT_MR_NONCOHERENT)
4212 va_attr.va_flags |= IBT_VA_NONCOHERENT;
4213 va_attr.va_phys_buf_min = va_attr.va_phys_buf_max = 0;
4214
4215 h_page_sz = hca->hca_attrs.hca_page_sz * 1024;
4216 paddr_list_len = (size / h_page_sz) + 2;
4217 paddr_list = (ibt_phys_buf_t *)kmem_zalloc(sizeof (ibt_phys_buf_t) *
4218 paddr_list_len, KM_NOSLEEP);
4219
4220 if (rib_debug > 0) {
4221 cmn_err(CE_NOTE, "fmr: vaddr %p, size %d paddr_list_len %d \n",
4222 buf, size, paddr_list_len);
4223 }
4224
4225 ibt_status = ibt_map_mem_area(hca->hca_hdl, &va_attr, paddr_list_len,
4226 paddr_list, &num_paddr, &buf_sz, &paddr_offset, ma_hdlp);
4227 if (ibt_status != IBT_SUCCESS) {
4228 cmn_err(CE_WARN, "rib_reg_mem_fmr: ibt_map_mem_area failed: "
4229 "status %d", ibt_status);
4230 kmem_free(paddr_list, sizeof (ibt_phys_buf_t) * paddr_list_len);
4231 return (RDMA_FAILED);
4232 }
4233
4234 if (rib_debug > 0) {
4235		cmn_err(CE_NOTE, "fmr: p_laddr %p, p_size %d, buf_sz %d, "
4236		    "p_offset %llX\n", paddr_list[0].p_laddr,
4237		    paddr_list[0].p_size, buf_sz, paddr_offset);
4238		cmn_err(CE_NOTE, "fmr: ibt_map_mem_area: ret %d, num_paddr %d, spec %d\n",
4239 ibt_status, num_paddr, spec);
4240 }
4241
4242 bzero(&pmr_attr, sizeof (ibt_pmr_attr_t));
4243 pmr_attr.pmr_iova = (ib_vaddr_t)buf;
4244 pmr_attr.pmr_len = size;
4245 pmr_attr.pmr_num_buf = num_paddr;
4246 pmr_attr.pmr_buf_sz = buf_sz;
4247 pmr_attr.pmr_buf_list = paddr_list;
4248 pmr_attr.pmr_offset = paddr_offset;
4249 pmr_attr.pmr_flags = spec;
4250 pmr_attr.pmr_ma = *ma_hdlp;
4251
4252 ibt_status = ibt_register_physical_fmr(hca->hca_hdl, hca->fmr_pool,
4253 &pmr_attr, mr_hdlp, pmr_descp);
4254 if (ibt_status != IBT_SUCCESS) {
4255 cmn_err(CE_WARN, "rib_reg_mem_fmr: ibt_register_physical_fmr "
4256 "failed: status %d", ibt_status);
4257 (void) ibt_unmap_mem_area(hca->hca_hdl, *ma_hdlp);
4258		*ma_hdlp = NULL;
4259 kmem_free(paddr_list, sizeof (ibt_phys_buf_t) * paddr_list_len);
4260 return (RDMA_FAILED);
4261 }
4262
4263 if (rib_debug > 0) {
4264		cmn_err(CE_NOTE, "fmr: rkey: 0x%lX lkey: 0x%lX, iova: %p, fmr_hdl %p\n",
4265 pmr_descp->pmd_rkey, pmr_descp->pmd_lkey,
4266 pmr_descp->pmd_iova, *mr_hdlp);
4267 }
4268
4269 kmem_free(paddr_list, sizeof (ibt_phys_buf_t) * paddr_list_len);
4270
4271 return (RDMA_SUCCESS);
4272
4273 }
4274
4275 #endif
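/*
 * Register a buffer as a regular memory region with local and remote
 * read/write (and window bind) access enabled, provided the HCA is
 * still in the HCA_INITED state.
 */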
4276 static rdma_stat
4277 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, ibt_mr_flags_t spec,
4278 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
4279 {
4280 ibt_mr_attr_t mem_attr;
4281 ibt_status_t ibt_status;
4282 mem_attr.mr_vaddr = (uintptr_t)buf;
4283 mem_attr.mr_len = (ib_msglen_t)size;
4284 mem_attr.mr_as = (struct as *)(caddr_t)adsp;
4285 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
4286 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
4287 IBT_MR_ENABLE_WINDOW_BIND | spec;
4288
4289 rw_enter(&hca->state_lock, RW_READER);
4290 if (hca->state == HCA_INITED) {
4291 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
4292 &mem_attr, mr_hdlp, mr_descp);
4293 rw_exit(&hca->state_lock);
4294 } else {
4295 rw_exit(&hca->state_lock);
4296 return (RDMA_FAILED);
4297 }
4298
4299 if (ibt_status != IBT_SUCCESS) {
4300 cmn_err(CE_WARN, "rib_reg_mem: ibt_register_mr "
4301 "(spec:%d) failed for addr %llX, status %d",
4302 spec, (longlong_t)mem_attr.mr_vaddr, ibt_status);
4303 return (RDMA_FAILED);
4304 }
4305 return (RDMA_SUCCESS);
4306 }
4307
4308 rdma_stat
4309 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
4310 #ifdef SERVER_REG_CACHE
4311 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
4312 #else
4313 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle)
4314 #endif
4315 {
4316 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
4317 #ifdef IB_FMR_SUP
4318 ibt_pmr_desc_t pmr_desc; /* vaddr, lkey, rkey */
4319 ibt_ma_hdl_t ma_hdl = NULL;
4320 #endif
4321 #ifdef SERVER_REG_CACHE
4322 rib_lrc_entry_t *l;
4323 #endif
4324 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
4325 rdma_stat status;
4326 rib_hca_t *hca = (ctoqp(conn))->hca;
4327
4328 /*
4329 * Non-coherent memory registration.
4330 */
4331 #ifdef SERVER_REG_CACHE
4332 l = (rib_lrc_entry_t *)lrc;
4333	if (l) {
4334		if (l->registered) {
4335			buf_handle->mrc_linfo = (uintptr_t)l->lrc_mhandle.mrc_linfo;
4336			buf_handle->mrc_lmr = (uint32_t)l->lrc_mhandle.mrc_lmr;
4337			buf_handle->mrc_rmr = (uint32_t)l->lrc_mhandle.mrc_rmr;
4338	#ifdef IB_FMR_SUP
4339			buf_handle->mrc_lma = (uintptr_t)l->lrc_mhandle.mrc_lma;
4340	#endif
4341			*sync_handle = (RIB_SYNCMEM_HANDLE)l->lrc_mhandle.mrc_linfo;
4342			return (RDMA_SUCCESS);
4343 } else {
4344 /* Always register the whole buffer */
4345 buf = (caddr_t)l->lrc_buf;
4346 buflen = l->lrc_len;
4347 /*cmn_err(CE_NOTE,"Register %p of length %d\n",buf,buflen);*/
4348 }
4349 }
4350 #endif
4351 #ifdef IB_FMR_SUP
4352 status = rib_reg_mem_fmr(hca, adsp, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl,
4353 &ma_hdl, &pmr_desc);
4354 if (status == RDMA_SUCCESS) {
4355 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
4356 buf_handle->mrc_lma = (uintptr_t)ma_hdl;
4357 buf_handle->mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
4358 buf_handle->mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
4359 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
4360 #ifdef SERVER_REG_CACHE
4361		if (l) {
4362			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
4363			l->lrc_mhandle.mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
4364			l->lrc_mhandle.mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
4365 l->registered = TRUE;
4366 l->lrc_mhandle.mrc_lma = (uintptr_t)ma_hdl;
4367 }
4368 #endif
4369 goto ret_stat;
4370
4371 } else {
4372 if (rib_debug > 1)
4373			cmn_err(CE_WARN, "fmr reg failed for buffer %p of length %d\n", buf, buflen);
4374 buf_handle->mrc_linfo = NULL;
4375 buf_handle->mrc_lma = NULL;
4376 buf_handle->mrc_lmr = 0;
4377 buf_handle->mrc_rmr = 0;
4378 }
4379 #endif
4380 status = rib_reg_mem(hca, adsp, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl,
4381 &mr_desc);
4382 if (status == RDMA_SUCCESS) {
4383 #ifdef SERVER_REG_CACHE
4384		if (l) {
4385 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
4386 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey;
4387 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey;
4388 l->registered = TRUE;
4389 }
4390 #endif
4391 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
4392 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
4393 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
4394 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
4395 } else {
4396 buf_handle->mrc_linfo = NULL;
4397 buf_handle->mrc_lmr = 0;
4398 buf_handle->mrc_rmr = 0;
4399 }
4400 ret_stat:
4401 return (status);
4402 }
4403
4404 /* ARGSUSED */
4405 rdma_stat
4406 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
4407 {
4409 #ifdef IB_FMR_SUP
4410 ibt_status_t ibt_status;
4411 #endif
4412 rib_hca_t *hca = (ctoqp(conn))->hca;
4413 /*
4414 * Allow memory deregistration even if HCA is
4415 * getting detached. Need all outstanding
4416 * memory registrations to be deregistered
4417 * before HCA_DETACH_EVENT can be accepted.
4418 */
4419 #ifdef IB_FMR_SUP
4420	if (buf_handle.mrc_lma) {
4421		ibt_status = ibt_unmap_mem_area(hca->hca_hdl,
4422		    (ibt_ma_hdl_t)buf_handle.mrc_lma);
4423		if (ibt_status != IBT_SUCCESS) {
4424			cmn_err(CE_WARN, "rib_deregistermem: ibt_unmap_mem_area failed: %d",
4425 ibt_status);
4426 return (RDMA_FAILED);
4427 }
4428
4429 ibt_status = ibt_deregister_fmr(hca->hca_hdl,
4430 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
4431 if (ibt_status != IBT_SUCCESS)
4432 return (RDMA_FAILED);
4433 return (RDMA_SUCCESS);
4434 }
4435 #endif
4436 (void) ibt_deregister_mr(hca->hca_hdl,
4437 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
4438 return (RDMA_SUCCESS);
4439 }
4440
4441 /* ARGSUSED */
4442 rdma_stat
4443 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
4444 #ifdef SERVER_REG_CACHE
4445 RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
4446 #else
4447 RIB_SYNCMEM_HANDLE sync_handle)
4448 #endif
4449 {
4450 #ifdef SERVER_REG_CACHE
4451 rib_lrc_entry_t *l;
4452 l = (rib_lrc_entry_t *)lrc;
4453	if (l)
4454		if (l->registered)
4455			return (RDMA_SUCCESS);
4456 #endif
4457
4458
4459 (void) rib_deregistermem(conn, buf, buf_handle);
4460
4461 return (RDMA_SUCCESS);
4462 }
4463
4464 /* ARGSUSED */
4465 rdma_stat
4466 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
4467 int len, int cpu)
4468 {
4469 ibt_status_t status;
4470 rib_hca_t *hca = (ctoqp(conn))->hca;
4471 ibt_mr_sync_t mr_segment;
4472
4473 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
4474 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
4475 mr_segment.ms_len = (ib_memlen_t)len;
4476 if (cpu) {
4477 /* make incoming data visible to memory */
4478 mr_segment.ms_flags = IBT_SYNC_WRITE;
4479 } else {
4480 /* make memory changes visible to IO */
4481 mr_segment.ms_flags = IBT_SYNC_READ;
4482 }
4483 rw_enter(&hca->state_lock, RW_READER);
4484 if (hca->state == HCA_INITED) {
4485 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
4486 rw_exit(&hca->state_lock);
4487 } else {
4488 rw_exit(&hca->state_lock);
4489 return (RDMA_FAILED);
4490 }
4491
4492 if (status == IBT_SUCCESS)
4493 return (RDMA_SUCCESS);
4494 else {
4495 #ifdef DEBUG
4496 cmn_err(CE_WARN, "rib_syncmem: ibt_sync_mr failed with %d\n",
4497 status);
4498 #endif
4499 return (RDMA_FAILED);
4500 }
4501 }
4502
4503 /*
4504 * XXXX ????
4505 */
4506 static rdma_stat
4507 rib_getinfo(rdma_info_t *info)
4508 {
4509 /*
4510 * XXXX Hack!
4511 */
4512 info->addrlen = 16;
4513 info->mts = 1000000;
4514 info->mtu = 1000000;
4515
4516 return (RDMA_SUCCESS);
4517 }
4518
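/*
 * Create a pool of num pre-registered buffers of the given type
 * (SEND_BUFFER or RECV_BUFFER). Each buffer in the pool is
 * registered individually with the HCA.
 */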
4519 rib_bufpool_t *
4520 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
4521 {
4522 rib_bufpool_t *rbp = NULL;
4523 bufpool_t *bp = NULL;
4524 caddr_t buf;
4525 ibt_mr_attr_t mem_attr;
4526 ibt_status_t ibt_status;
4527 int i, j;
4528
4529 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
4530
4531 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
4532 num * sizeof (void *), KM_SLEEP);
4533
4534 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
4535 bp->numelems = num;
4536
4537
4538 switch (ptype) {
4539 case SEND_BUFFER:
4540 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
4541 bp->rsize = RPC_MSG_SZ;
4542 break;
4543 case RECV_BUFFER:
4544 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
4545 bp->rsize = RPC_BUF_SIZE;
4546 break;
4547 default:
4548 goto fail;
4549 }
4550
4551 /*
4552 * Register the pool.
4553 */
4554 bp->bufsize = num * bp->rsize;
4555 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
4556 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
4557 sizeof (ibt_mr_hdl_t), KM_SLEEP);
4558 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
4559 sizeof (ibt_mr_desc_t), KM_SLEEP);
4560 rw_enter(&hca->state_lock, RW_READER);
4561 if (hca->state != HCA_INITED) {
4562 rw_exit(&hca->state_lock);
4563		cmn_err(CE_WARN, "rib_rbufpool_create: hca->state != HCA_INITED");
4564 goto fail;
4565 }
4566 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
4567 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
4568 mem_attr.mr_vaddr = (uintptr_t)buf;
4569 mem_attr.mr_len = (ib_msglen_t)bp->rsize;
4570 mem_attr.mr_as = NULL;
4571 ibt_status = ibt_register_mr(hca->hca_hdl,
4572 hca->pd_hdl, &mem_attr, &rbp->mr_hdl[i],
4573 &rbp->mr_desc[i]);
4574 if (ibt_status != IBT_SUCCESS) {
4575 for (j = 0; j < i; j++) {
4576 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[j]);
4577 }
4578 rw_exit(&hca->state_lock);
4579 goto fail;
4580 }
4581 }
4582 rw_exit(&hca->state_lock);
4583 buf = (caddr_t)bp->buf;
4584 for (i = 0; i < num; i++, buf += bp->rsize) {
4585 bp->buflist[i] = (void *)buf;
4586 }
4587 bp->buffree = num - 1; /* no. of free buffers */
4588 rbp->bpool = bp;
4589
4590 return (rbp);
4591 fail:
4592 if (bp) {
4593 if (bp->buf)
4594 kmem_free(bp->buf, bp->bufsize);
4595 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
4596 }
4597 if (rbp) {
4598 if (rbp->mr_hdl)
4599 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
4600 if (rbp->mr_desc)
4601 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
4602 kmem_free(rbp, sizeof (rib_bufpool_t));
4603 }
4604 return (NULL);
4605 }
4606
4607 static void
4608 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
4609 {
4610 int i;
4611 rib_bufpool_t *rbp = NULL;
4612 bufpool_t *bp;
4613
4614 /*
4615 * Obtain pool address based on type of pool
4616 */
4617 switch (ptype) {
4618 case SEND_BUFFER:
4619 rbp = hca->send_pool;
4620 break;
4621 case RECV_BUFFER:
4622 rbp = hca->recv_pool;
4623 break;
4624 default:
4625 return;
4626 }
4627 if (rbp == NULL)
4628 return;
4629
4630 bp = rbp->bpool;
4631
4632 /*
4633 * Deregister the pool memory and free it.
4634 */
4635 for (i = 0; i < bp->numelems; i++) {
4636 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
4637 }
4638 }
4639
4640 static void
4641 rib_rbufpool_free(rib_hca_t *hca, int ptype)
4642 {
4643
4644 rib_bufpool_t *rbp = NULL;
4645 bufpool_t *bp;
4646
4647 /*
4648 * Obtain pool address based on type of pool
4649 */
4650 switch (ptype) {
4651 case SEND_BUFFER:
4652 rbp = hca->send_pool;
4653 break;
4654 case RECV_BUFFER:
4655 rbp = hca->recv_pool;
4656 break;
4657 default:
4658 return;
4659 }
4660 if (rbp == NULL)
4661 return;
4662
4663 bp = rbp->bpool;
4664
4665 /*
4666 * Free the pool memory.
4667 */
4668 if (rbp->mr_hdl)
4669 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
4670
4671 if (rbp->mr_desc)
4672 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4673 if (bp->buf)
4674 kmem_free(bp->buf, bp->bufsize);
4675 mutex_destroy(&bp->buflock);
4676 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4677 kmem_free(rbp, sizeof (rib_bufpool_t));
4678 }
4679
4680 void
4681 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4682 {
4683 /*
4684 * Deregister the pool memory and free it.
4685 */
4686 rib_rbufpool_deregister(hca, ptype);
4687 rib_rbufpool_free(hca, ptype);
4688 }
4689
4690 /*
4691 * Fetch a buffer from the pool of type specified in rdbuf->type.
4692 */
4693 static rdma_stat
4694 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4695 {
4696
4697 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4698 if (rdbuf->addr) {
4699 switch (rdbuf->type) {
4700 case SEND_BUFFER:
4701 rdbuf->len = RPC_MSG_SZ; /* 1K */
4702 break;
4703 case RECV_BUFFER:
4704 rdbuf->len = RPC_BUF_SIZE; /* 2K */
4705 break;
4706 default:
4707 rdbuf->len = 0;
4708 }
4709 return (RDMA_SUCCESS);
4710 } else
4711 return (RDMA_FAILED);
4712 }
4713
4714 #if defined(MEASURE_POOL_DEPTH)
4715 static void rib_recv_bufs(uint32_t x) {
4716 return;
4717 }
4718 static void rib_send_bufs(uint32_t x) {
4719 return;
4720 }
4721 #endif
4722
4723 /*
4724 * Fetch a buffer of specified type.
4725 * Note that rdbuf->handle is mw's rkey.
4726 */
4727 static void *
4728 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4729 {
4730 rib_qp_t *qp = ctoqp(conn);
4731 rib_hca_t *hca = qp->hca;
4732 rdma_btype ptype = rdbuf->type;
4733 void *buf;
4734 rib_bufpool_t *rbp = NULL;
4735 bufpool_t *bp;
4736 int i;
4737
4738 /*
4739 * Obtain pool address based on type of pool
4740 */
4741 switch (ptype) {
4742 case SEND_BUFFER:
4743 rbp = hca->send_pool;
4744 break;
4745 case RECV_BUFFER:
4746 rbp = hca->recv_pool;
4747 break;
4748 default:
4749 return (NULL);
4750 }
4751 if (rbp == NULL)
4752 return (NULL);
4753
4754 bp = rbp->bpool;
4755
4756 mutex_enter(&bp->buflock);
4757 if (bp->buffree < 0) {
4758 cmn_err(CE_WARN, "rib_rbuf_alloc: No free buffers!");
4759 mutex_exit(&bp->buflock);
4760 return (NULL);
4761 }
4762
4763 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4764 buf = bp->buflist[bp->buffree];
4765 rdbuf->addr = buf;
4766 rdbuf->len = bp->rsize;
4767 for (i = bp->numelems - 1; i >= 0; i--) {
4768 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4769 rdbuf->handle.mrc_rmr = (uint32_t)rbp->mr_desc[i].md_rkey;
4770 rdbuf->handle.mrc_linfo = (uintptr_t)rbp->mr_hdl[i];
4771 rdbuf->handle.mrc_lmr = (uint32_t)rbp->mr_desc[i].md_lkey;
4772 #if defined(MEASURE_POOL_DEPTH)
4773			if (ptype == SEND_BUFFER)
4774				rib_send_bufs(MAX_BUFS - (bp->buffree+1));
4775			if (ptype == RECV_BUFFER)
4776 rib_recv_bufs(MAX_BUFS - (bp->buffree+1));
4777 #endif
4778 bp->buffree--;
4779 if (rib_debug > 1)
4780 cmn_err(CE_NOTE, "rib_rbuf_alloc: %d free bufs "
4781 "(type %d)\n", bp->buffree+1, ptype);
4782
4783 mutex_exit(&bp->buflock);
4784
4785 return (buf);
4786 }
4787 }
4788 cmn_err(CE_WARN, "rib_rbuf_alloc: NO matching buf %p of "
4789 "type %d found!", buf, ptype);
4790 mutex_exit(&bp->buflock);
4791
4792 return (NULL);
4793 }
4794
4795 static void
4796 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4797 {
4798
4799 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4800 }
4801
4802 static void
4803 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4804 {
4805 rib_qp_t *qp = ctoqp(conn);
4806 rib_hca_t *hca = qp->hca;
4807 rib_bufpool_t *rbp = NULL;
4808 bufpool_t *bp;
4809
4810 /*
4811 * Obtain pool address based on type of pool
4812 */
4813 switch (ptype) {
4814 case SEND_BUFFER:
4815 rbp = hca->send_pool;
4816 break;
4817 case RECV_BUFFER:
4818 rbp = hca->recv_pool;
4819 break;
4820 default:
4821 return;
4822 }
4823 if (rbp == NULL)
4824 return;
4825
4826 bp = rbp->bpool;
4827
4828 mutex_enter(&bp->buflock);
4829 if (++bp->buffree >= bp->numelems) {
4830 /*
4831 * Should never happen
4832 */
4833 cmn_err(CE_WARN, "rib_rbuf_free: One (type %d) "
4834 "too many frees!", ptype);
4835 bp->buffree--;
4836 } else {
4837 bp->buflist[bp->buffree] = buf;
4838 if (rib_debug > 1)
4839 cmn_err(CE_NOTE, "rib_rbuf_free: %d free bufs "
4840 "(type %d)\n", bp->buffree+1, ptype);
4841 }
4842 mutex_exit(&bp->buflock);
4843 }
4844
4845 static rdma_stat
4846 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4847 {
4848 rw_enter(&connlist->conn_lock, RW_WRITER);
4849 if (connlist->conn_hd) {
4850 cn->c_next = connlist->conn_hd;
4851 connlist->conn_hd->c_prev = cn;
4852 }
4853 connlist->conn_hd = cn;
4854 rw_exit(&connlist->conn_lock);
4855
4856 return (RDMA_SUCCESS);
4857 }
4858
4859 static rdma_stat
4860 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4861 {
4862 rw_enter(&connlist->conn_lock, RW_WRITER);
4863 if (cn->c_prev) {
4864 cn->c_prev->c_next = cn->c_next;
4865 }
4866 if (cn->c_next) {
4867 cn->c_next->c_prev = cn->c_prev;
4868 }
4869 if (connlist->conn_hd == cn)
4870 connlist->conn_hd = cn->c_next;
4871 rw_exit(&connlist->conn_lock);
4872
4873 return (RDMA_SUCCESS);
4874 }
4875
4876 /*
4877 * Connection management.
4878 * IBTF does not support recycling of channels. So connections are only
4879 * in one of four states: C_CONN_PEND, C_CONNECTED, C_ERROR, or
4880 * C_DISCONN_PEND. There is no C_IDLE state.
4881 * C_CONN_PEND state: Connection establishment in progress to the server.
4882 * C_CONNECTED state: A connection when created is in C_CONNECTED state.
4883 * It has an RC channel associated with it. ibt_post_send/recv are allowed
4884 * only in this state.
4885 * C_ERROR state: A connection transitions to this state when WRs on the
4886 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4887 * happens on the channel or an IBT_HCA_DETACH_EVENT occurs on the HCA.
4888 * C_DISCONN_PEND state: When a connection is in C_ERROR state and when
4889 * c_ref drops to 0 (this indicates that RPC has no more references to this
4890 * connection), the connection should be destroyed. A connection transitions
4891 * into this state when it is being destroyed.
4892 */
4893 static rdma_stat
4894 rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn)
4895 {
4896 CONN *cn;
4897 int status = RDMA_SUCCESS;
4898 rib_hca_t *hca = (rib_hca_t *)handle;
4899 rib_qp_t *qp;
4900 clock_t cv_stat, timout;
4901 ibt_path_info_t path;
4902
4903 again:
4904 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4905 cn = hca->cl_conn_list.conn_hd;
4906 while (cn != NULL) {
4907 /*
4908 * First, clear up any connection in the ERROR state
4909 */
4910 mutex_enter(&cn->c_lock);
4911 if (cn->c_state == C_ERROR) {
4912 if (cn->c_ref == 0) {
4913 /*
4914 * Remove connection from list and destroy it.
4915 */
4916 cn->c_state = C_DISCONN_PEND;
4917 mutex_exit(&cn->c_lock);
4918 rw_exit(&hca->cl_conn_list.conn_lock);
4919 (void) rib_disconnect_channel(cn,
4920 &hca->cl_conn_list);
4921 goto again;
4922 }
4923 mutex_exit(&cn->c_lock);
4924 cn = cn->c_next;
4925 continue;
4926 } else if (cn->c_state == C_DISCONN_PEND) {
4927 mutex_exit(&cn->c_lock);
4928 cn = cn->c_next;
4929 continue;
4930 }
4931 if ((cn->c_raddr.len == svcaddr->len) &&
4932 bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) {
4933 /*
4934 * Our connection. Give up conn list lock
4935 * as we are done traversing the list.
4936 */
4937 rw_exit(&hca->cl_conn_list.conn_lock);
4938 if (cn->c_state == C_CONNECTED) {
4939 cn->c_ref++; /* sharing a conn */
4940 mutex_exit(&cn->c_lock);
4941 *conn = cn;
4942 return (status);
4943 }
4944 if (cn->c_state == C_CONN_PEND) {
4945 /*
4946 * Hold a reference to this conn before
4947 * we give up the lock.
4948 */
4949 cn->c_ref++;
4950 timout = ddi_get_lbolt() +
4951 drv_usectohz(CONN_WAIT_TIME * 1000000);
4952 while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4953 &cn->c_lock, timout)) > 0 &&
4954 cn->c_state == C_CONN_PEND)
4955 ;
4956 if (cv_stat == 0) {
4957 cn->c_ref--;
4958 mutex_exit(&cn->c_lock);
4959 return (RDMA_INTR);
4960 }
4961 if (cv_stat < 0) {
4962 cn->c_ref--;
4963 mutex_exit(&cn->c_lock);
4964 return (RDMA_TIMEDOUT);
4965 }
4966 if (cn->c_state == C_CONNECTED) {
4967 *conn = cn;
4968 mutex_exit(&cn->c_lock);
4969 return (status);
4970 } else {
4971 cn->c_ref--;
4972 mutex_exit(&cn->c_lock);
4973 return (RDMA_TIMEDOUT);
4974 }
4975 }
4976 }
4977 mutex_exit(&cn->c_lock);
4978 cn = cn->c_next;
4979 }
4980 rw_exit(&hca->cl_conn_list.conn_lock);
4981
4982 status = rib_chk_srv_ats(hca, svcaddr, addr_type, &path);
4983 if (status != RDMA_SUCCESS) {
4984 #ifdef DEBUG
4985 if (rib_debug) {
4986 cmn_err(CE_WARN, "rib_conn_get: "
4987 "No server ATS record!");
4988 }
4989 #endif
4990 return (RDMA_FAILED);
4991 }
4992
4993 /*
4994 * Channel to server doesn't exist yet, create one.
4995 */
4996 if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) {
4997 return (RDMA_FAILED);
4998 }
4999 cn = qptoc(qp);
5000 cn->c_state = C_CONN_PEND;
5001 cn->c_ref = 1;
5002
5003 /*
5004 * Add to conn list.
5005 * We had given up the READER lock. In the time since then,
5006 * another thread might have created the connection we are
5007	 * trying here. But for now, that is quite all right; there
5008	 * might be two connections between a pair of hosts instead
5009	 * of one. If we really want to close that window, then we
5010	 * need to check the list after acquiring the
5011 * WRITER lock.
5012 */
5013 (void) rib_add_connlist(cn, &hca->cl_conn_list);
5014 status = rib_conn_to_srv(hca, qp, &path);
5015 mutex_enter(&cn->c_lock);
5016 if (status == RDMA_SUCCESS) {
5017 cn->c_state = C_CONNECTED;
5018 *conn = cn;
5019 } else {
5020 cn->c_state = C_ERROR;
5021 cn->c_ref--;
5022 #ifdef DEBUG
5023 if (rib_debug) {
5024 cmn_err(CE_WARN, "rib_conn_get: FAILED creating"
5025 " a channel!");
5026 }
5027 #endif
5028 }
5029 cv_broadcast(&cn->c_cv);
5030 mutex_exit(&cn->c_lock);
5031 return (status);
5032 }
5033
5034 static rdma_stat
5035 rib_conn_release(CONN *conn)
5036 {
5037 rib_qp_t *qp = ctoqp(conn);
5038
5039 mutex_enter(&conn->c_lock);
5040 conn->c_ref--;
5041
5042 /*
5043 * If a conn is C_ERROR, close the channel.
5044 * If it's CONNECTED, keep it that way.
5045 */
5046 if (conn->c_ref == 0 && (conn->c_state & C_ERROR)) {
5047 conn->c_state = C_DISCONN_PEND;
5048 mutex_exit(&conn->c_lock);
5049 if (qp->mode == RIB_SERVER)
5050 (void) rib_disconnect_channel(conn,
5051 &qp->hca->srv_conn_list);
5052 else
5053 (void) rib_disconnect_channel(conn,
5054 &qp->hca->cl_conn_list);
5055 return (RDMA_SUCCESS);
5056 }
5057 mutex_exit(&conn->c_lock);
5058 return (RDMA_SUCCESS);
5059 }
5060
5061 /*
5062 * Add at front of list
5063 */
5064 static struct rdma_done_list *
5065 rdma_done_add(rib_qp_t *qp, uint32_t xid)
5066 {
5067 struct rdma_done_list *rd;
5068
5069 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
5070
5071 rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
5072 rd->xid = xid;
5073 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
5074
5075 rd->prev = NULL;
5076 rd->next = qp->rdlist;
5077 if (qp->rdlist != NULL)
5078 qp->rdlist->prev = rd;
5079 qp->rdlist = rd;
5080
5081 return (rd);
5082 }
5083
5084 static void
5085 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
5086 {
5087 struct rdma_done_list *r;
5088
5089 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
5090
5091 r = rd->next;
5092 if (r != NULL) {
5093 r->prev = rd->prev;
5094 }
5095
5096 r = rd->prev;
5097 if (r != NULL) {
5098 r->next = rd->next;
5099 } else {
5100 qp->rdlist = rd->next;
5101 }
5102
5103 cv_destroy(&rd->rdma_done_cv);
5104 kmem_free(rd, sizeof (*rd));
5105 }
5106
5107 static void
5108 rdma_done_rem_list(rib_qp_t *qp)
5109 {
5110 struct rdma_done_list *r, *n;
5111
5112 mutex_enter(&qp->rdlist_lock);
5113 for (r = qp->rdlist; r != NULL; r = n) {
5114 n = r->next;
5115 rdma_done_rm(qp, r);
5116 }
5117 mutex_exit(&qp->rdlist_lock);
5118 }
5119
5120 static void
5121 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
5122 {
5123 struct rdma_done_list *r = qp->rdlist;
5124
5125 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
5126
5127 while (r) {
5128 if (r->xid == xid) {
5129 cv_signal(&r->rdma_done_cv);
5130 return;
5131 } else {
5132 r = r->next;
5133 }
5134 }
5135 if (rib_debug > 1) {
5136 cmn_err(CE_WARN, "rdma_done_notify: "
5137 "No matching xid for %u, qp %p\n", xid, (void *)qp);
5138 }
5139 }
5140
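/*
 * Find the ATS entry whose port GID and pkey match the given values.
 */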
5141 rpcib_ats_t *
5142 get_ibd_entry(ib_gid_t *gid, ib_pkey_t pkey, rpcib_ibd_insts_t *ibds)
5143 {
5144 rpcib_ats_t *atsp;
5145 int i;
5146
5147 for (i = 0, atsp = ibds->rib_ats; i < ibds->rib_ibd_cnt; i++, atsp++) {
5148 if (atsp->ras_port_gid.gid_prefix == gid->gid_prefix &&
5149 atsp->ras_port_gid.gid_guid == gid->gid_guid &&
5150 atsp->ras_pkey == pkey) {
5151 return (atsp);
5152 }
5153 }
5154 return (NULL);
5155 }
5156
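/*
 * ddi_walk_devs() callback: for each attached ibport node carrying
 * IP-over-IB, record its instance number, pkey and port GID in the
 * ibds array, growing the array in chunks of N_IBD_INSTANCES.
 */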
5157 int
5158 rib_get_ibd_insts_cb(dev_info_t *dip, void *arg)
5159 {
5160 rpcib_ibd_insts_t *ibds = (rpcib_ibd_insts_t *)arg;
5161 rpcib_ats_t *atsp;
5162 ib_pkey_t pkey;
5163 uint8_t port;
5164 ib_guid_t hca_guid;
5165 ib_gid_t port_gid;
5166
5167 if (i_ddi_devi_attached(dip) &&
5168 (strcmp(ddi_node_name(dip), "ibport") == 0) &&
5169 (strstr(ddi_get_name_addr(dip), "ipib") != NULL)) {
5170
5171 if (ibds->rib_ibd_cnt >= ibds->rib_ibd_alloc) {
5172 rpcib_ats_t *tmp;
5173
5174 tmp = (rpcib_ats_t *)kmem_zalloc((ibds->rib_ibd_alloc +
5175 N_IBD_INSTANCES) * sizeof (rpcib_ats_t), KM_SLEEP);
5176 bcopy(ibds->rib_ats, tmp,
5177 ibds->rib_ibd_alloc * sizeof (rpcib_ats_t));
5178 kmem_free(ibds->rib_ats,
5179 ibds->rib_ibd_alloc * sizeof (rpcib_ats_t));
5180 ibds->rib_ats = tmp;
5181 ibds->rib_ibd_alloc += N_IBD_INSTANCES;
5182 }
5183 if (((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY,
5184 dip, 0, "hca-guid", 0)) == 0) ||
5185 ((port = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
5186 0, "port-number", 0)) == 0) ||
5187 (ibt_get_port_state_byguid(hca_guid, port,
5188 &port_gid, NULL) != IBT_SUCCESS) ||
5189 ((pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
5190 "port-pkey", IB_PKEY_INVALID_LIMITED)) <=
5191 IB_PKEY_INVALID_FULL)) {
5192 return (DDI_WALK_CONTINUE);
5193 }
5194 atsp = &ibds->rib_ats[ibds->rib_ibd_cnt];
5195 atsp->ras_inst = ddi_get_instance(dip);
5196 atsp->ras_pkey = pkey;
5197 atsp->ras_port_gid = port_gid;
5198 ibds->rib_ibd_cnt++;
5199 }
5200 return (DDI_WALK_CONTINUE);
5201 }
5202
5203 void
5204 rib_get_ibd_insts(rpcib_ibd_insts_t *ibds)
5205 {
5206 ddi_walk_devs(ddi_root_node(), rib_get_ibd_insts_cb, ibds);
5207 }
5208
5209 /*
5210 * Look up the IP address (IPv4 or IPv6) configured on each ibd
5210 * instance recorded in ibds.
5211 */
5212 int
5213 get_ibd_ipaddr(rpcib_ibd_insts_t *ibds)
5214 {
5215 TIUSER *tiptr, *tiptr6;
5216 vnode_t *kvp, *kvp6;
5217 vnode_t *vp = NULL, *vp6 = NULL;
5218 struct strioctl iocb;
5219 struct lifreq lif_req;
5220 int k, ip_cnt;
5221 rpcib_ats_t *atsp;
5222
5223 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP,
5224 &kvp) == 0) {
5225 if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
5226 &tiptr, CRED()) == 0) {
5227 vp = tiptr->fp->f_vnode;
5228 } else {
5229 VN_RELE(kvp);
5230 }
5231 }
5232
5233 if (lookupname("/dev/udp6", UIO_SYSSPACE, FOLLOW, NULLVPP,
5234 &kvp6) == 0) {
5235 if (t_kopen((file_t *)NULL, kvp6->v_rdev, FREAD|FWRITE,
5236 &tiptr6, CRED()) == 0) {
5237 vp6 = tiptr6->fp->f_vnode;
5238 } else {
5239 VN_RELE(kvp6);
5240 }
5241 }
5242
5243 if (vp == NULL && vp6 == NULL)
5244 return (-1);
5245
5246 /* Get ibd ip's */
5247 ip_cnt = 0;
5248 for (k = 0, atsp = ibds->rib_ats; k < ibds->rib_ibd_cnt; k++, atsp++) {
5249 /* IPv4 */
5250 if (vp != NULL) {
5251 (void) bzero((void *)&lif_req, sizeof (struct lifreq));
5252 (void) snprintf(lif_req.lifr_name,
5253 sizeof (lif_req.lifr_name), "%s%d",
5254 IBD_NAME, atsp->ras_inst);
5255
5256 (void) bzero((void *)&iocb, sizeof (struct strioctl));
5257 iocb.ic_cmd = SIOCGLIFADDR;
5258 iocb.ic_timout = 0;
5259 iocb.ic_len = sizeof (struct lifreq);
5260 iocb.ic_dp = (caddr_t)&lif_req;
5261 if (kstr_ioctl(vp, I_STR, (intptr_t)&iocb) == 0) {
5262 atsp->ras_inet_type = AF_INET;
5263 bcopy(&lif_req.lifr_addr, &atsp->ras_sin,
5264 sizeof (struct sockaddr_in));
5265 ip_cnt++;
5266 continue;
5267 }
5268 }
5269 /* Try IPv6 */
5270 if (vp6 != NULL) {
5271 (void) bzero((void *)&lif_req, sizeof (struct lifreq));
5272 (void) snprintf(lif_req.lifr_name,
5273 sizeof (lif_req.lifr_name), "%s%d",
5274 IBD_NAME, atsp->ras_inst);
5275
5276 (void) bzero((void *)&iocb, sizeof (struct strioctl));
5277 iocb.ic_cmd = SIOCGLIFADDR;
5278 iocb.ic_timout = 0;
5279 iocb.ic_len = sizeof (struct lifreq);
5280 iocb.ic_dp = (caddr_t)&lif_req;
5281 if (kstr_ioctl(vp6, I_STR, (intptr_t)&iocb) == 0) {
5282
5283 atsp->ras_inet_type = AF_INET6;
5284 bcopy(&lif_req.lifr_addr, &atsp->ras_sin6,
5285 sizeof (struct sockaddr_in6));
5286 ip_cnt++;
5287 }
5288 }
5289 }
5290
5291 if (vp6 != NULL) {
5292 (void) t_kclose(tiptr6, 0);
5293 VN_RELE(kvp6);
5294 }
5295 if (vp != NULL) {
5296 (void) t_kclose(tiptr, 0);
5297 VN_RELE(kvp);
5298 }
5299
5300 if (ip_cnt == 0)
5301 return (-1);
5302 else
5303 return (0);
5304 }
5305
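/*
 * Return a kmem-allocated array of printable IP addresses (IPv4 and
 * IPv6) for all configured interfaces. *count is set to the number of
 * addresses found, or -1 on failure. The caller frees each
 * IB_SVC_NAME_LEN string and then the array itself.
 */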
5306 char **
5307 get_ip_addrs(int *count)
5308 {
5309 TIUSER *tiptr;
5310 vnode_t *kvp;
5311 int num_of_ifs;
5312 char **addresses;
5313 int return_code;
5314
5315 /*
5316 * Open a device for doing down stream kernel ioctls
5317 */
5318 return_code = lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW,
5319 NULLVPP, &kvp);
5320 if (return_code != 0) {
5321		cmn_err(CE_NOTE, "get_ip_addrs: lookupname failed\n");
5322 *count = -1;
5323 return (NULL);
5324 }
5325
5326 return_code = t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
5327 &tiptr, CRED());
5328 if (return_code != 0) {
5329		cmn_err(CE_NOTE, "get_ip_addrs: t_kopen failed\n");
5330 VN_RELE(kvp);
5331 *count = -1;
5332 return (NULL);
5333 }
5334
5335 /*
5336 * Perform the first ioctl to get the number of interfaces
5337 */
5338 return_code = get_interfaces(tiptr, &num_of_ifs);
5339 if (return_code != 0 || num_of_ifs == 0) {
5340		cmn_err(CE_NOTE, "get_ip_addrs: get_interfaces failed\n");
5341 (void) t_kclose(tiptr, 0);
5342 VN_RELE(kvp);
5343 *count = -1;
5344 return (NULL);
5345 }
5346
5347 /*
5348 * Perform the second ioctl to get the address on each interface
5349 * found.
5350 */
5351 addresses = kmem_zalloc(num_of_ifs * sizeof (char *), KM_SLEEP);
5352 return_code = find_addrs(tiptr, addresses, num_of_ifs);
5353 if (return_code <= 0) {
5354		cmn_err(CE_NOTE, "get_ip_addrs: find_addrs failed\n");
5355 (void) t_kclose(tiptr, 0);
5356 kmem_free(addresses, num_of_ifs * sizeof (char *));
5357 VN_RELE(kvp);
5358 *count = -1;
5359 return (NULL);
5360 }
5361
5362 *count = return_code;
5363 VN_RELE(kvp);
5364 (void) t_kclose(tiptr, 0);
5365 return (addresses);
5366 }
5367
5368 int
5369 get_interfaces(TIUSER *tiptr, int *num)
5370 {
5371 struct lifnum if_buf;
5372 struct strioctl iocb;
5373 vnode_t *vp;
5374 int return_code;
5375
5376 /*
5377 * Prep the number of interfaces request buffer for ioctl
5378 */
5379 (void) bzero((void *)&if_buf, sizeof (struct lifnum));
5380 if_buf.lifn_family = AF_UNSPEC;
5381 if_buf.lifn_flags = 0;
5382
5383 /*
5384 * Prep the kernel ioctl buffer and send it down stream
5385 */
5386 (void) bzero((void *)&iocb, sizeof (struct strioctl));
5387 iocb.ic_cmd = SIOCGLIFNUM;
5388 iocb.ic_timout = 0;
5389 iocb.ic_len = sizeof (if_buf);
5390 iocb.ic_dp = (caddr_t)&if_buf;
5391
5392 vp = tiptr->fp->f_vnode;
5393 return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5394 if (return_code != 0) {
5395 cmn_err(CE_NOTE, "get_interfaces: kstr_ioctl failed\n");
5396 *num = -1;
5397 return (-1);
5398 }
5399
5400 *num = if_buf.lifn_count;
5401 #ifdef DEBUG
5402 if (rib_debug > 1)
5403 cmn_err(CE_NOTE, "Number of interfaces detected: %d\n",
5404 if_buf.lifn_count);
5405 #endif
5406 return (0);
5407 }
5408
5409 int
5410 find_addrs(TIUSER *tiptr, char **addrs, int num_ifs)
5411 {
5412 struct lifconf lifc;
5413 struct lifreq *if_data_buf;
5414 struct strioctl iocb;
5415 caddr_t request_buffer;
5416 struct sockaddr_in *sin4;
5417 struct sockaddr_in6 *sin6;
5418 vnode_t *vp;
5419 int i, count, return_code;
5420
5421 /*
5422 * Prep the buffer for requesting all interface's info
5423 */
5424 (void) bzero((void *)&lifc, sizeof (struct lifconf));
5425 lifc.lifc_family = AF_UNSPEC;
5426 lifc.lifc_flags = 0;
5427 lifc.lifc_len = num_ifs * sizeof (struct lifreq);
5428
5429 request_buffer = kmem_zalloc(num_ifs * sizeof (struct lifreq),
5430 KM_SLEEP);
5431
5432 lifc.lifc_buf = request_buffer;
5433
5434 /*
5435 * Prep the kernel ioctl buffer and send it down stream
5436 */
5437 (void) bzero((void *)&iocb, sizeof (struct strioctl));
5438 iocb.ic_cmd = SIOCGLIFCONF;
5439 iocb.ic_timout = 0;
5440 iocb.ic_len = sizeof (struct lifconf);
5441 iocb.ic_dp = (caddr_t)&lifc;
5442
5443 vp = tiptr->fp->f_vnode;
5444 return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5445 if (return_code != 0) {
5446 cmn_err(CE_NOTE, "find_addrs: kstr_ioctl failed\n");
5447 kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
5448 return (-1);
5449 }
5450
5451 /*
5452	 * Extract the addresses and fill them into the requested array.
5453	 * IB_SVC_NAME_LEN is defined to be 64, so it covers both IPv4 and
5454	 * IPv6. Here count is the number of IP addresses collected.
5455 */
5456 if_data_buf = lifc.lifc_req;
5457 count = 0;
5458 for (i = lifc.lifc_len / sizeof (struct lifreq); i > 0; i--,
5459 if_data_buf++) {
5460 if (if_data_buf->lifr_addr.ss_family == AF_INET) {
5461 sin4 = (struct sockaddr_in *)&if_data_buf->lifr_addr;
5462 addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
5463 (void) inet_ntop(AF_INET, &sin4->sin_addr,
5464 addrs[count], IB_SVC_NAME_LEN);
5465			count++;
5466 }
5467
5468 if (if_data_buf->lifr_addr.ss_family == AF_INET6) {
5469 sin6 = (struct sockaddr_in6 *)&if_data_buf->lifr_addr;
5470 addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
5471 (void) inet_ntop(AF_INET6, &sin6->sin6_addr,
5472 addrs[count], IB_SVC_NAME_LEN);
5473			count++;
5474 }
5475 }
5476
5477 kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
5478 return (count);
5479 }
5480
5481 /*
5482 * Goes through all connections and closes the channel
5483 * This will cause all the WRs on those channels to be
5484 * flushed.
5485 */
5486 static void
5487 rib_close_channels(rib_conn_list_t *connlist)
5488 {
5489 CONN *conn;
5490 rib_qp_t *qp;
5491
5492 rw_enter(&connlist->conn_lock, RW_READER);
5493 conn = connlist->conn_hd;
5494 while (conn != NULL) {
5495 mutex_enter(&conn->c_lock);
5496 qp = ctoqp(conn);
5497 if (conn->c_state & C_CONNECTED) {
5498 /*
5499 * Live connection in CONNECTED state.
5500 * Call ibt_close_rc_channel in nonblocking mode
5501 * with no callbacks.
5502 */
5503 conn->c_state = C_ERROR;
5504 (void) ibt_close_rc_channel(qp->qp_hdl,
5505 IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
5506 (void) ibt_free_channel(qp->qp_hdl);
5507 qp->qp_hdl = NULL;
5508 } else {
5509 if (conn->c_state == C_ERROR &&
5510 qp->qp_hdl != NULL) {
5511 /*
5512 * Connection in ERROR state but
5513 * channel is not yet freed.
5514 */
5515 (void) ibt_close_rc_channel(qp->qp_hdl,
5516 IBT_NOCALLBACKS, NULL, 0, NULL,
5517 NULL, 0);
5518 (void) ibt_free_channel(qp->qp_hdl);
5519 qp->qp_hdl = NULL;
5520 }
5521 }
5522 mutex_exit(&conn->c_lock);
5523 conn = conn->c_next;
5524 }
5525 rw_exit(&connlist->conn_lock);
5526 }
5527
5528 /*
5529 * Frees up all connections that are no longer being referenced
5530 */
5531 static void
5532 rib_purge_connlist(rib_conn_list_t *connlist)
5533 {
5534 CONN *conn;
5535
5536 top:
5537 rw_enter(&connlist->conn_lock, RW_READER);
5538 conn = connlist->conn_hd;
5539 while (conn != NULL) {
5540 mutex_enter(&conn->c_lock);
5541
5542 /*
5543 * At this point connection is either in ERROR
5544 * or DISCONN_PEND state. If in DISCONN_PEND state
5545 * then some other thread is culling that connection.
5546 * If not and if c_ref is 0, then destroy the connection.
5547 */
5548 if (conn->c_ref == 0 &&
5549 conn->c_state != C_DISCONN_PEND) {
5550 /*
5551 * Cull the connection
5552 */
5553 conn->c_state = C_DISCONN_PEND;
5554 mutex_exit(&conn->c_lock);
5555 rw_exit(&connlist->conn_lock);
5556 (void) rib_disconnect_channel(conn, connlist);
5557 goto top;
5558 } else {
5559 /*
5560 * conn disconnect already scheduled or will
5561 * happen from conn_release when c_ref drops to 0.
5562 */
5563 mutex_exit(&conn->c_lock);
5564 }
5565 conn = conn->c_next;
5566 }
5567 rw_exit(&connlist->conn_lock);
5568
5569 /*
5570 * At this point, only connections with c_ref != 0 are on the list
5571 */
5572 }
5573
5574 /*
5575 * Cleans and closes up all uses of the HCA
5576 */
5577 static void
5578 rib_detach_hca(rib_hca_t *hca)
5579 {
5580
5581 /*
5582 * Stop all services on the HCA
5583 * Go through cl_conn_list and close all rc_channels
5584 * Go through svr_conn_list and close all rc_channels
5585 * Free connections whose c_ref has dropped to 0
5586 * Destroy all CQs
5587 * Deregister and released all buffer pool memory after all
5588 * connections are destroyed
5589 * Free the protection domain
5590 * ibt_close_hca()
5591 */
5592 rw_enter(&hca->state_lock, RW_WRITER);
5593 if (hca->state == HCA_DETACHED) {
5594 rw_exit(&hca->state_lock);
5595 return;
5596 }
5597
5598 hca->state = HCA_DETACHED;
5599 rib_stat->nhca_inited--;
5600
5601 rib_stop_services(hca);
5602 rib_deregister_ats();
5603 rib_close_channels(&hca->cl_conn_list);
5604 rib_close_channels(&hca->srv_conn_list);
5605 rw_exit(&hca->state_lock);
5606
5607 rib_purge_connlist(&hca->cl_conn_list);
5608 rib_purge_connlist(&hca->srv_conn_list);
5609
5610 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
5611 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
5612 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
5613 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
5614 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
5615 kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
5616 kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
5617 kmem_free(hca->svc_scq, sizeof (rib_cq_t));
5618
5619 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
5620 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
5621 if (hca->srv_conn_list.conn_hd == NULL &&
5622 hca->cl_conn_list.conn_hd == NULL) {
5623 /*
5624 * conn_lists are NULL, so destroy
5625 * buffers, close hca and be done.
5626 */
5627 rib_rbufpool_destroy(hca, RECV_BUFFER);
5628 rib_rbufpool_destroy(hca, SEND_BUFFER);
5629 #ifdef SERVER_REG_CACHE
5630 rib_destroy_cache(hca);
5631 #endif
5632 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
5633 (void) ibt_close_hca(hca->hca_hdl);
5634 hca->hca_hdl = NULL;
5635 }
5636 rw_exit(&hca->cl_conn_list.conn_lock);
5637 rw_exit(&hca->srv_conn_list.conn_lock);
5638
5639 if (hca->hca_hdl != NULL) {
5640 mutex_enter(&hca->inuse_lock);
5641 while (hca->inuse)
5642 cv_wait(&hca->cb_cv, &hca->inuse_lock);
5643 mutex_exit(&hca->inuse_lock);
5644 /*
5645 * conn_lists are now NULL, so destroy
5646 * buffers, close hca and be done.
5647 */
5648 rib_rbufpool_destroy(hca, RECV_BUFFER);
5649 rib_rbufpool_destroy(hca, SEND_BUFFER);
5650 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
5651 (void) ibt_close_hca(hca->hca_hdl);
5652 hca->hca_hdl = NULL;
5653 }
5654 }
5655
5656 #ifdef SERVER_REG_CACHE
5657
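/*
 * Reclaim callback for the server-side registered buffer cache:
 * empty the AVL cache of pre-registered reply buffers, deregistering
 * and freeing every buffer queued on every node.
 */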
5658 static void
5659 rib_server_side_cache_reclaim(void *argp)
5660 {
5661 cache_avl_struct_t *rcas;
5662 rib_lrc_entry_t *rb;
5663 rib_hca_t *hca = (rib_hca_t *)argp;
5664
5665	rw_enter(&hca->avl_rw_lock, RW_WRITER);
5666	rcas = avl_first(&hca->avl_tree);
5667	if (rcas != NULL)
5668		avl_remove(&hca->avl_tree, rcas);
5669	while (rcas != NULL) {
5670		while (rcas->r.forw != &rcas->r) {
5671			rcas->elements--;
5672			rb = rcas->r.forw;
5673			remque(rb);
5674			(void) rib_deregistermem_via_hca(hca, rb->lrc_buf, rb->lrc_mhandle);
5675			kmem_free(rb->lrc_buf, rb->lrc_len);
5676			kmem_free(rb, sizeof (rib_lrc_entry_t));
5677		}
5678		mutex_destroy(&rcas->node_lock);
5679		kmem_cache_free(hca->server_side_cache, rcas);
5680		rcas = avl_first(&hca->avl_tree);
5681		if (rcas != NULL)
5682			avl_remove(&hca->avl_tree, rcas);
5683	}
5684	rw_exit(&hca->avl_rw_lock);
5685 }
5686
5687 static int
5688 avl_compare(const void *t1, const void *t2)
5689 {
5690	if (rib_debug > 1)
5691		cmn_err(CE_NOTE, "Comparing %d and %d\n",
5692		    ((cache_avl_struct_t *)t1)->len, ((cache_avl_struct_t *)t2)->len);
5693	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
5694		return (0);
5695	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
5696		return (-1);
5697	return (1);
5698 }
5700
5701 static void
5702 rib_destroy_cache(rib_hca_t *hca)
5703 {
5704
5705 hca->avl_init = FALSE;
5706 kmem_cache_destroy(hca->server_side_cache);
5707 avl_destroy(&hca->avl_tree);
5708 rw_destroy(&hca->avl_rw_lock);
5709
5710 }
5711
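/*
 * Fetch a pre-registered reply buffer of the requested length from
 * the per-HCA AVL cache, creating the AVL node and/or allocating a
 * fresh buffer when nothing suitable is cached.
 */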
5712 static rib_lrc_entry_t *
5713 rib_get_server_cache_buf(CONN *conn, uint32_t len)
5714 {
5715	cache_avl_struct_t cas, *rcas;
5716	rib_hca_t *hca = (ctoqp(conn))->hca;
5717	rib_lrc_entry_t *reply_buf;
5718	avl_index_t where = NULL;
5719	struct rib_lrc_entry *forw = NULL;
5720	if (!hca->avl_init)
5721		goto error_alloc;
5722	cas.len = len;
5723	rw_enter(&hca->avl_rw_lock, RW_READER);
5724	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5725		rw_exit(&hca->avl_rw_lock);
5726		rw_enter(&hca->avl_rw_lock, RW_WRITER);
5727		/* Recheck to make sure no other thread added the entry in */
5728		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5729			/* Allocate an avl tree entry */
5730			if (rib_debug > 1)
5731				cmn_err(CE_NOTE, "Allocating an avl entry for length %d\n", len);
5732			rcas = (cache_avl_struct_t *)kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
5733			bzero(rcas, sizeof (cache_avl_struct_t));
5734			rcas->elements = 0;
5735			rcas->r.forw = &rcas->r;
5737			rcas->r.back = &rcas->r;
5739			rcas->len = len;
5740			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5741			avl_insert(&hca->avl_tree, rcas, where);
5742		}
5743	}
5744	if (rcas->elements > 0) {
5745		mutex_enter(&rcas->node_lock);
5746		reply_buf = rcas->r.forw;
5747		remque(reply_buf);
5748		rcas->elements--;
5749		mutex_exit(&rcas->node_lock);
5750		rw_exit(&hca->avl_rw_lock);
5751		if (rib_debug > 1)
5752			cmn_err(CE_NOTE, "Allocating a pre-alloced buffer for length %d\n", len);
5753	} else {
5754		rw_exit(&hca->avl_rw_lock);
5755		rib_total_buffers++;
5756		if (rib_debug > 1)
5757			cmn_err(CE_NOTE, "Allocating a new buffer for length %d\n", len);
5758		/* Allocate a reply_buf entry */
5759		reply_buf = (rib_lrc_entry_t *)kmem_alloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5760		bzero(reply_buf, sizeof (rib_lrc_entry_t));
5761		reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5762		reply_buf->lrc_len = len;
5763		reply_buf->registered = FALSE;
5764		reply_buf->avl_node = (void *)rcas;
5765	}
5766
5767	return (reply_buf);
5768 error_alloc:
5769	reply_buf = (rib_lrc_entry_t *)kmem_alloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5770	bzero(reply_buf, sizeof (rib_lrc_entry_t));
5771	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5772	reply_buf->lrc_len = len;
5773	reply_buf->registered = FALSE;
5774	reply_buf->avl_node = NULL;
5775	return (reply_buf);
5776 }
5777
5778 /*
5779 * Return a pre-registered buffer back to the cache (without
5780 * unregistering it).
5781 */
5782
5783 static void
5784 rib_free_server_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5785 {
5786	cache_avl_struct_t cas, *rcas;
5787	avl_index_t where = NULL;
5788	rib_hca_t *hca = (ctoqp(conn))->hca;
5789	if (!reg_buf) {
5790		cmn_err(CE_WARN, "rib_free_server_cache_buf: NULL reg_buf\n");
5791		return;
5792	}
5793	if (!hca->avl_init)
5794		goto error_free;
5795	cas.len = reg_buf->lrc_len;
5796	rw_enter(&hca->avl_rw_lock, RW_READER);
5797	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5798		rw_exit(&hca->avl_rw_lock);
5799		goto error_free;
5800	} else {
5801		mutex_enter(&rcas->node_lock);
5802		insque(reg_buf, &rcas->r);
5803		rcas->elements++;
5804		mutex_exit(&rcas->node_lock);
5805		rw_exit(&hca->avl_rw_lock);
5806		if (rib_debug > 1)
5807			cmn_err(CE_NOTE, "Returning buffer for length %d\n", reg_buf->lrc_len);
5808	}
5809	return;
5810 error_free:
5811	(void) rib_deregistermem_via_hca(hca, reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5812	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5813	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5814 }
5815
5816 #endif
5817
5818 static rdma_stat
5819 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5820 uint_t buflen, struct mrc *buf_handle)
5821 {
5822 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
5823 #ifdef IB_FMR_SUP
5824 ibt_pmr_desc_t pmr_desc; /* vaddr, lkey, rkey */
5825 ibt_ma_hdl_t ma_hdl = NULL;
5826 #endif
5827 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
5828 rdma_stat status;
5829
5830
5831 /*
5832 * Note: ALL buffer pools use the same memory type RDMARW.
5833 */
5834	/*
5835	 * This code will not be activated on the server. We could remove the
5836	 * call to rib_reg_mem_fmr, but leave it in, in case the FMR bugs get
5837	 * fixed. The bigger question is whether we need FMR when the registered
5838	 * buffers are coming out of a slab cache. This needs to be evaluated.
5839	 */
5840 #ifdef IB_FMR_SUP
5841	status = rib_reg_mem_fmr(hca, adsp, buf, buflen, 0, &mr_hdl, &ma_hdl,
5842 &pmr_desc);
5843 if (status == RDMA_SUCCESS) {
5844 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
5845 buf_handle->mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
5846 buf_handle->mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
5847 buf_handle->mrc_lma = (uintptr_t)ma_hdl;
5848 goto ret_stat;
5849 } else {
5850 buf_handle->mrc_linfo = NULL;
5851 buf_handle->mrc_lma = NULL;
5852 buf_handle->mrc_lmr = 0;
5853 buf_handle->mrc_rmr = 0;
5854 }
5855 #endif
5856 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5857 if (status == RDMA_SUCCESS) {
5858 buf_handle->mrc_linfo = (uint64_t)mr_hdl;
5859 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5860 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5861 } else {
5862 buf_handle->mrc_linfo = NULL;
5863 buf_handle->mrc_lmr = 0;
5864 buf_handle->mrc_rmr = 0;
5865 }
5866 ret_stat:
5867 return (status);
5868 }
5869
5870 /* ARGSUSED */
5871 static rdma_stat
5872 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
5873 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
5874 {
5875
5876 (void) rib_deregistermem_via_hca(hca, buf, buf_handle);
5877
5878 return (RDMA_SUCCESS);
5879 }
5880
5881 /* ARGSUSED */
5882 static rdma_stat
5883 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5884 {
5885 #ifdef IB_FMR_SUP
5886 ibt_status_t ibt_status;
5887	if (buf_handle.mrc_lma) {
5888		ibt_status = ibt_unmap_mem_area(hca->hca_hdl,
5889		    (ibt_ma_hdl_t)buf_handle.mrc_lma);
5890		if (ibt_status != IBT_SUCCESS) {
5891			cmn_err(CE_WARN, "rib_deregistermem_via_hca: ibt_unmap_mem_area failed: %d",
5892			    ibt_status);
5893			return (RDMA_FAILED);
5894		}
5895		ibt_status = ibt_deregister_fmr(hca->hca_hdl,
5896		    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5897		if (ibt_status != IBT_SUCCESS) {
5898			cmn_err(CE_WARN, "rib_deregistermem_via_hca: ibt_deregister_fmr failed: %d",
5899			    ibt_status);
5900 return (RDMA_FAILED);
5901 }
5902 return (RDMA_SUCCESS);
5903 }
5904 #endif
5905
5906 (void) ibt_deregister_mr(hca->hca_hdl,
5907 (ibt_mr_hdl_t)buf_handle.mrc_linfo);
5908 return (RDMA_SUCCESS);
5909 }
5910
5911 #if defined(ASYNC_SERVER_DEREG)||defined(ASYNC_CLIENT_DEREG)
5912 static int
5913 clist_deregister1(CONN *conn, struct clist *cl, bool_t src)
5914 {
5915 struct clist *c;
5916
5917 for (c = cl; c; c = c->c_next) {
5918 if (src) {
5919 if (c->c_smemhandle.mrc_rmr != 0) {
5920 (void) RDMA_DEREGMEMSYNC(conn,
5921 (caddr_t)(uintptr_t)c->c_saddr,
5922 c->c_smemhandle,
5923 #ifdef SERVER_REG_CACHE
5924 (void *)(uintptr_t)c->c_ssynchandle, (void *)c->long_reply_buf);
5925 #else
5926 (void *)(uintptr_t)c->c_ssynchandle);
5927 #endif
5928 c->c_smemhandle.mrc_rmr = 0;
5929 c->c_ssynchandle = NULL;
5930 }
5931 } else {
5932 if (c->c_dmemhandle.mrc_rmr != 0) {
5933 (void) RDMA_DEREGMEMSYNC(conn,
5934 (caddr_t)(uintptr_t)c->c_daddr,
5935 c->c_dmemhandle,
5936 #ifdef SERVER_REG_CACHE
5937 (void *)(uintptr_t)c->c_dsynchandle, (void *)c->long_reply_buf);
5938 #else
5939 (void *)(uintptr_t)c->c_dsynchandle);
5940 #endif
5941 c->c_dmemhandle.mrc_rmr = 0;
5942 c->c_dsynchandle = NULL;
5943 }
5944 }
5945 }
5946
5947 return (RDMA_SUCCESS);
5948 }
5949 #endif
5950
5951
5952
5953 #if defined(ASYNC_CLIENT_DEREG)
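/*
 * Worker thread for asynchronous client-side deregistration: block on
 * at_cond until insert_queue() adds an entry, then deregister the
 * entry's destination chunk list and free it.
 */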
5954 static void
5955 async_dereg_thread(caddr_t arg){
5956 ASYNC *r;
5957	cmn_err(CE_WARN, "async_dereg_thread initiated\n");
5958 fetch_another_entry:
5959	mutex_enter(&at_mutex);
5960	while ((rqueue.forw == rqueue.back) && (rqueue.forw == &rqueue))
5961		cv_wait(&at_cond, &at_mutex);
5962	r = rqueue.forw;
5963	remque(rqueue.forw);
5964	mutex_exit(&at_mutex);
5965	/* Process deregistration */
5966	(void) clist_deregister1(&r->c_conn, &r->c_clist, FALSE);
5967	kmem_free(r, sizeof (ASYNC));
5968 goto fetch_another_entry;
5969
5970 }
5971 void insert_queue(CONN *conn, struct clist *rwc){
5972 ASYNC *r;
5973	r = kmem_zalloc(sizeof (ASYNC), KM_SLEEP);
5974	r->c_clist = *rwc;
5975	r->c_conn = *conn;
5976	mutex_enter(&at_mutex);
5977	insque(r, &rqueue);
5978 cv_broadcast(&at_cond);
5979 mutex_exit(&at_mutex);
5980 }
5981 #endif