1 /*
  2  * CDDL HEADER START
  3  *
  4  * The contents of this file are subject to the terms of the
  5  * Common Development and Distribution License, Version 1.0 only
  6  * (the "License").  You may not use this file except in compliance
  7  * with the License.
  8  *
  9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 10  * or http://www.opensolaris.org/os/licensing.
 11  * See the License for the specific language governing permissions
 12  * and limitations under the License.
 13  *
 14  * When distributing Covered Code, include this CDDL HEADER in each
 15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 16  * If applicable, add the following below this CDDL HEADER, with the
 17  * fields enclosed by brackets "[]" replaced with your own identifying
 18  * information: Portions Copyright [yyyy] [name of copyright owner]
 19  *
 20  * CDDL HEADER END
 21  */
 22 /*
 23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 24  * Use is subject to license terms.
 25  */
 26 














 27 #pragma ident   "@(#)rpcib.c    1.29    06/01/25 SMI"
 28 
 29 /*
 30  * The rpcib plugin. Implements the interface for RDMATF's
 31  * interaction with IBTF.
 32  */
 33 
 34 #include <sys/param.h>
 35 #include <sys/types.h>
 36 #include <sys/user.h>
 37 #include <sys/systm.h>
 38 #include <sys/sysmacros.h>
 39 #include <sys/proc.h>
 40 #include <sys/socket.h>
 41 #include <sys/file.h>
 42 #include <sys/stream.h>
 43 #include <sys/strsubr.h>
 44 #include <sys/stropts.h>
 45 #include <sys/errno.h>
 46 #include <sys/kmem.h>
 47 #include <sys/debug.h>
 48 #include <sys/systm.h>
 49 #include <sys/pathname.h>
 50 #include <sys/kstat.h>
 51 #include <sys/t_lock.h>
 52 #include <sys/ddi.h>
 53 #include <sys/cmn_err.h>
 54 #include <sys/time.h>
 55 #include <sys/isa_defs.h>
 56 #include <sys/callb.h>
 57 #include <sys/sunddi.h>
 58 #include <sys/sunndi.h>
 59 


 60 #include <sys/ib/ibtl/ibti.h>
 61 #include <rpc/rpc.h>
 62 #include <rpc/ib.h>
 63 
 64 #include <sys/modctl.h>
 65 
 66 #include <sys/pathname.h>
 67 #include <sys/kstr.h>
 68 #include <sys/sockio.h>
 69 #include <sys/vnode.h>
 70 #include <sys/tiuser.h>
 71 #include <net/if.h>
 72 #include <sys/cred.h>

 73 


 74 
 75 extern char *inet_ntop(int, const void *, char *, int);
 76 
 77 
 78 /*
 79  * Prototype declarations for driver ops
 80  */
 81 
 82 static int      rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
 83 static int      rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
 84                             void *, void **);
 85 static int      rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
 86 
 87 
 88 /* rpcib cb_ops */
 89 static struct cb_ops rpcib_cbops = {
 90         nulldev,                /* open */
 91         nulldev,                /* close */
 92         nodev,                  /* strategy */
 93         nodev,                  /* print */
 94         nodev,                  /* dump */
 95         nodev,                  /* read */
 96         nodev,                  /* write */
 97         nodev,                  /* ioctl */
 98         nodev,                  /* devmap */
 99         nodev,                  /* mmap */
100         nodev,                  /* segmap */
101         nochpoll,               /* poll */
102         ddi_prop_op,            /* prop_op */
103         NULL,                   /* stream */
104         D_MP,                   /* cb_flag */
105         CB_REV,                 /* rev */
106         nodev,                  /* int (*cb_aread)() */
107         nodev                   /* int (*cb_awrite)() */
108 };
109 



110 /*
111  * Device options
112  */
113 static struct dev_ops rpcib_ops = {
114         DEVO_REV,               /* devo_rev, */
115         0,                      /* refcnt  */
116         rpcib_getinfo,          /* info */
117         nulldev,                /* identify */
118         nulldev,                /* probe */
119         rpcib_attach,           /* attach */
120         rpcib_detach,           /* detach */
121         nodev,                  /* reset */
122         &rpcib_cbops,                   /* driver ops - devctl interfaces */
123         NULL,                   /* bus operations */
124         NULL                    /* power */
125 };
126 
127 /*
128  * Module linkage information.
129  */
130 
131 static struct modldrv rib_modldrv = {
132         &mod_driverops,                         /* Driver module */
133         "RPCIB plugin driver, ver 1.29", /* Driver name and version */
134         &rpcib_ops,             /* Driver ops */
135 };
136 
/* Linkage passed to mod_install()/mod_remove() in _init()/_fini(). */
static struct modlinkage rib_modlinkage = {
        MODREV_1,
        (void *)&rib_modldrv,
        NULL
};
142 














143 /*
144  * rib_stat: private data pointer used when registering
145  *      with the IBTF.  It is returned to the consumer
146  *      in all callbacks.
147  */
148 static rpcib_state_t *rib_stat = NULL;
149 
150 #define RNR_RETRIES     2 
151 #define MAX_PORTS       2
152 
153 int preposted_rbufs = 16; 






154 int send_threshold = 1;
155 
156 /*
157  * State of the plugin.
158  * ACCEPT = accepting new connections and requests.
159  * NO_ACCEPT = not accepting new connection and requests.
160  * This should eventually move to rpcib_state_t structure, since this
161  * will tell in which state the plugin is for a particular type of service
162  * like NFS, NLM or v4 Callback deamon. The plugin might be in accept
163  * state for one and in no_accept state for the other.
164  */
165 int             plugin_state;
166 kmutex_t        plugin_state_lock;
167 
168 
169 /*
170  * RPCIB RDMATF operations
171  */



172 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
173 static rdma_stat rib_disconnect(CONN *conn);
174 static void rib_listen(struct rdma_svc_data *rd);
175 static void rib_listen_stop(struct rdma_svc_data *rd);
176 static rdma_stat rib_registermem(CONN *conn, caddr_t buf, uint_t buflen, 
177         struct mrc *buf_handle);
178 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
179         struct mrc buf_handle);
180 static rdma_stat rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen, 










181         struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle);
182 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
183         struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle);


184 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
185         caddr_t buf, int len, int cpu);
186 
187 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
188 
189 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
190 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
191 
192 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
193 
194 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);










195 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
196 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
197 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
198 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
199 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
200 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
201 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
202 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
203 static rdma_stat rib_conn_release(CONN *conn);
204 static rdma_stat rib_getinfo(rdma_info_t *info);













205 static rdma_stat rib_register_ats(rib_hca_t *);
206 static void rib_deregister_ats();
207 static void rib_stop_services(rib_hca_t *);
208 
209 /*
210  * RPCIB addressing operations
211  */
212 char ** get_ip_addrs(int *count);
213 int get_interfaces(TIUSER *tiptr, int *num);
214 int find_addrs(TIUSER *tiptr, char **addrs, int num_ifs);
215 int get_ibd_ipaddr(rpcib_ibd_insts_t *);
216 rpcib_ats_t *get_ibd_entry(ib_gid_t *, ib_pkey_t, rpcib_ibd_insts_t *);
217 void rib_get_ibd_insts(rpcib_ibd_insts_t *);



218 
219  












220 /*
221  * RDMA operations the RPCIB module exports
222  */
223 static rdmaops_t rib_ops = {
224         rib_reachable,
225         rib_conn_get,
226         rib_conn_release,
227         rib_listen,
228         rib_listen_stop,
229         rib_registermem,
230         rib_deregistermem,
231         rib_registermemsync,
232         rib_deregistermemsync,
233         rib_syncmem,
234         rib_reg_buf_alloc,
235         rib_reg_buf_free,
236         rib_send,






237         rib_send_resp,
238         rib_post_resp,
239         rib_post_recv,
240         rib_recv,
241         rib_read,
242         rib_write,
243         rib_getinfo 










244 };
245 
246 /*
247  * RDMATF RPCIB plugin details
248  */
249 static rdma_mod_t rib_mod = {
250         "ibtf",         /* api name */
251         RDMATF_VERS_1,
252         0,
253         &rib_ops,   /* rdma op vector for ibtf */
254 };
255 
256 static rdma_stat open_hcas(rpcib_state_t *);
257 static rdma_stat rib_qp_init(rib_qp_t *, int);
258 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
259 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
260 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
261 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
262 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
263 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t, 




264         ibt_mr_hdl_t *, ibt_mr_desc_t *);


265 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *);
266 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
267         rib_qp_t **);
268 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
269         rib_qp_t **);
270 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
271 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
272 static int rib_free_sendwait(struct send_wid *);
273 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
274 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
275 static void rdma_done_rem_list(rib_qp_t *);
276 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
277 
278 static void rib_async_handler(void *,
279         ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
280 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
281 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
282 static int rib_free_svc_recv(struct svc_recv *);
283 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
284 static void rib_free_wid(struct recv_wid *);
285 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
286 static void rib_detach_hca(rib_hca_t *);
287 static rdma_stat rib_chk_srv_ats(rib_hca_t *, struct netbuf *, int,
288         ibt_path_info_t *);
289 
290 /*
291  * Registration with IBTF as a consumer
292  */
293 static struct ibt_clnt_modinfo_s rib_modinfo = {
294         IBTI_V2,
295         IBT_GENERIC,
296         rib_async_handler,      /* async event handler */
297         NULL,                   /* Memory Region Handler */
298         "nfs/ib"
299 };
300 
301 /*
302  * Global strucuture
303  */
304 
305 typedef struct rpcib_s {
306         dev_info_t      *rpcib_dip;
307         kmutex_t        rpcib_mutex;
308 } rpcib_t;
309 
310 rpcib_t rpcib;
311 
312 /*
313  * /etc/system controlled variable to control
314  * debugging in rpcib kernel module.
315  * Set it to values greater that 1 to control
316  * the amount of debugging messages required.
317  */
318 int rib_debug = 0;
319  


320 static int ats_running = 0;


321 int
322 _init(void)
323 {
324         int             error;
325 
326         error = mod_install((struct modlinkage *)&rib_modlinkage);
327         if (error != 0) {
328                 /*
329                  * Could not load module
330                  */
331                 return (error);
332         }
333         mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
334 
335         return (0);
336 }
337 
338 int
339 _fini()
340 {
341         int status;
342 
343         if ((status = rdma_unregister_mod(&rib_mod)) != RDMA_SUCCESS) {
344                 return (EBUSY);
345         }
346 
347         rib_deregister_ats();
348 
349         /*
350          * Remove module
351          */
352         if ((status = mod_remove(&rib_modlinkage)) != 0) {
353                 (void) rdma_register_mod(&rib_mod);
354                 return (status);
355         }
356         mutex_destroy(&plugin_state_lock);
357         return (0);
358 }
359 
360 int
361 _info(struct modinfo *modinfop)
362 {
363         return (mod_info(&rib_modlinkage, modinfop));
364 }
365 
366 
367 /*
368  * rpcib_getinfo()
369  * Given the device number, return the devinfo pointer or the
370  * instance number.
371  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
372  */
373 
374 /*ARGSUSED*/
375 static int
376 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
377 {
378         int ret = DDI_SUCCESS;
379 
380         switch (cmd) {
381         case DDI_INFO_DEVT2DEVINFO:
382                 if (rpcib.rpcib_dip != NULL)
383                         *result = rpcib.rpcib_dip;
384                 else {
385                         *result = NULL;
386                         ret = DDI_FAILURE;
387                 }
388                 break;
389 
390         case DDI_INFO_DEVT2INSTANCE:
391                 *result = NULL;
392                 break;
393 
394         default:
395                 ret = DDI_FAILURE;
396         }
397         return (ret);
398 }
399 
400 static int
401 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
402 {
403         ibt_status_t    ibt_status;
404         rdma_stat       r_status;
405 
406         switch (cmd) {
407         case DDI_ATTACH:
408                 break;
409         case DDI_RESUME:
410                 return (DDI_SUCCESS);
411         default:
412                 return (DDI_FAILURE);
413         }
414 
415         mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
416 
417         mutex_enter(&rpcib.rpcib_mutex);
418         if (rpcib.rpcib_dip != NULL) {
419                 mutex_exit(&rpcib.rpcib_mutex);
420                 return (DDI_FAILURE);
421         }
422         rpcib.rpcib_dip = dip;
423         mutex_exit(&rpcib.rpcib_mutex);
424         /*
425          * Create the "rpcib" minor-node.
426          */
427         if (ddi_create_minor_node(dip,
428             "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
429                 /* Error message, no cmn_err as they print on console */
430                 return (DDI_FAILURE);
431         }
432 
433         if (rib_stat == NULL) {
434                 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
435                 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
436         }
437 
438         rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
439         if (rib_stat->hca_count < 1) {
440                 mutex_destroy(&rib_stat->open_hca_lock);
441                 kmem_free(rib_stat, sizeof (*rib_stat));
442                 rib_stat = NULL;
443                 return (DDI_FAILURE);
444         }
445 
446         ibt_status = ibt_attach(&rib_modinfo, dip,
447                         (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
448         if (ibt_status != IBT_SUCCESS) {
449                 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
450                 mutex_destroy(&rib_stat->open_hca_lock);
451                 kmem_free(rib_stat, sizeof (*rib_stat));
452                 rib_stat = NULL;
453                 return (DDI_FAILURE);
454         }
455 
456         mutex_enter(&rib_stat->open_hca_lock);
457         if (open_hcas(rib_stat) != RDMA_SUCCESS) {
458                 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
459                 (void) ibt_detach(rib_stat->ibt_clnt_hdl);
460                 mutex_exit(&rib_stat->open_hca_lock);
461                 mutex_destroy(&rib_stat->open_hca_lock);
462                 kmem_free(rib_stat, sizeof (*rib_stat));
463                 rib_stat = NULL;
464                 return (DDI_FAILURE);
465         }
466         mutex_exit(&rib_stat->open_hca_lock);
467 
468         /*
469          * Register with rdmatf
470          */
471         rib_mod.rdma_count = rib_stat->hca_count;
472         r_status = rdma_register_mod(&rib_mod);
473         if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
474                 rib_detach_hca(rib_stat->hca);
475                 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
476                 (void) ibt_detach(rib_stat->ibt_clnt_hdl);
477                 mutex_destroy(&rib_stat->open_hca_lock);
478                 kmem_free(rib_stat, sizeof (*rib_stat));
479                 rib_stat = NULL;
480                 return (DDI_FAILURE);
481         }
482 
483 
484         return (DDI_SUCCESS);
485 }
486 
487 /*ARGSUSED*/
488 static int
489 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
490 {
491         switch (cmd) {
492 
493         case DDI_DETACH:
494                 break;
495 
496         case DDI_SUSPEND:
497         default:
498                 return (DDI_FAILURE);
499         }
500 
501         /*
502          * Detach the hca and free resources
503          */
504         mutex_enter(&plugin_state_lock);
505         plugin_state = NO_ACCEPT;
506         mutex_exit(&plugin_state_lock);
507         rib_detach_hca(rib_stat->hca);
508         ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
509         (void) ibt_detach(rib_stat->ibt_clnt_hdl);
510 
511         mutex_enter(&rpcib.rpcib_mutex);
512         rpcib.rpcib_dip = NULL;
513         mutex_exit(&rpcib.rpcib_mutex);
514 
515         mutex_destroy(&rpcib.rpcib_mutex);
516         return (DDI_SUCCESS);
517 }
518 
519 
520 static void
521 rib_deregister_ats()
522 {
523         rib_hca_t               *hca;
524         rib_service_t           *srv_list, *to_remove;
525         ibt_status_t            ibt_status;
526 
527         /*
528          * deregister the Address Translation Service.
529          */
530         hca = rib_stat->hca;
531         rw_enter(&hca->service_list_lock, RW_WRITER);
532         srv_list = hca->ats_list;
533         while (srv_list != NULL) {
534                 to_remove = srv_list;
535                 srv_list = to_remove->srv_next;
536 
537                 ibt_status = ibt_deregister_ar(hca->ibt_clnt_hdl,
538                                 &to_remove->srv_ar);
539                 if (ibt_status != IBT_SUCCESS) {
540 #ifdef DEBUG
541                     if (rib_debug) {
542                         cmn_err(CE_WARN, "_fini: "
543                             "ibt_deregister_ar FAILED"
544                                 " status: %d", ibt_status);
545                     }
546 #endif
547                 } else {
548                     mutex_enter(&rib_stat->open_hca_lock);
549                     ats_running = 0;
550                     mutex_exit(&rib_stat->open_hca_lock);
551 #ifdef DEBUG
552                     if (rib_debug) {
553 
554                         cmn_err(CE_NOTE, "_fini: "
555                             "Successfully unregistered"
556                             " ATS service: %s",
557                             to_remove->srv_name);
558                     }
559 #endif
560                 }
561                 kmem_free(to_remove, sizeof (rib_service_t));
562         }
563         hca->ats_list = NULL;
564         rw_exit(&hca->service_list_lock);
565 }
566 
567 static void rib_rbufpool_free(rib_hca_t *, int);
568 static void rib_rbufpool_deregister(rib_hca_t *, int);
569 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
570 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
571 static rdma_stat rib_rem_replylist(rib_qp_t *);
572 static int rib_remreply(rib_qp_t *, struct reply *);
573 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
574 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
575 

576 /*
577  * One CQ pair per HCA
578  */
579 static rdma_stat
580 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
581         rib_cq_t **cqp, rpcib_state_t *ribstat)
582 {
583         rib_cq_t        *cq;
584         ibt_cq_attr_t   cq_attr;
585         uint32_t        real_size;
586         ibt_status_t    status;
587         rdma_stat       error = RDMA_SUCCESS;
588 
589         cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
590         cq->rib_hca = hca;
591         cq_attr.cq_size = cq_size;
592         cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
593         status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
594             &real_size);
595         if (status != IBT_SUCCESS) {
596                 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
597                                 " status=%d", status);
598                 error = RDMA_FAILED;
599                 goto fail;
600         }
601         ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);
602 
603         /*
604          * Enable CQ callbacks. CQ Callbacks are single shot
605          * (e.g. you have to call ibt_enable_cq_notify()
606          * after each callback to get another one).
607          */
608         status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
609         if (status != IBT_SUCCESS) {
610                 cmn_err(CE_WARN, "rib_create_cq: "
611                         "enable_cq_notify failed, status %d", status);
612                 error = RDMA_FAILED;
613                 goto fail;
614         }
615         *cqp = cq;
616 
617         return (error);
618 fail:
619         if (cq->rib_cq_hdl)
620                 (void) ibt_free_cq(cq->rib_cq_hdl);
621         if (cq)
622                 kmem_free(cq, sizeof (rib_cq_t));
623         return (error);
624 }
625 
/*
 * open_hcas: open each HCA reported by ibt_get_hca_list() and set it up
 * for RDMA: query attributes, allocate a protection domain, query the
 * ports, create the four completion queues (server/client x send/recv),
 * create the send/receive registered-buffer pools and initialize the
 * per-HCA locks and connection lists.
 *
 * Caller must hold ribstat->open_hca_lock (asserted below).  Despite the
 * loop, only the FIRST successfully opened HCA is used (see the XXX
 * below); the loop breaks as soon as one HCA is fully initialized.
 *
 * Returns RDMA_SUCCESS if at least one HCA was set up, else RDMA_FAILED.
 *
 * NOTE(review): on the fail3 path any CQs already created for this HCA
 * iteration are not freed -- a leak if a later setup step fails; confirm
 * and fix separately.
 */
static rdma_stat
open_hcas(rpcib_state_t *ribstat)
{
        rib_hca_t               *hca;
        ibt_status_t            ibt_status;
        rdma_stat               status;
        ibt_hca_portinfo_t      *pinfop;
        ibt_pd_flags_t          pd_flags = IBT_PD_NO_FLAGS;
        uint_t                  size, cq_size;
        int                     i;
 
        ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
        /* Lazily allocate the per-HCA state array on first call. */
        if (ribstat->hcas == NULL)
                ribstat->hcas = kmem_zalloc(ribstat->hca_count *
                                    sizeof (rib_hca_t), KM_SLEEP);

        /*
         * Open a hca and setup for RDMA
         */
        for (i = 0; i < ribstat->hca_count; i++) {
                ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
                                ribstat->hca_guids[i],
                                &ribstat->hcas[i].hca_hdl);
                if (ibt_status != IBT_SUCCESS) {
                        /* Skip this HCA and try the next one. */
                        cmn_err(CE_WARN, "open_hcas: ibt_open_hca (%d) "
                                "returned %d", i, ibt_status);
                        continue;
                }
                ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
                hca = &(ribstat->hcas[i]);
                hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
                hca->state = HCA_INITED;

                /*
                 * query HCA info
                 */
                ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
                if (ibt_status != IBT_SUCCESS) {
                        cmn_err(CE_WARN, "open_hcas: ibt_query_hca "
                            "returned %d (hca_guid 0x%llx)",
                            ibt_status, (longlong_t)ribstat->hca_guids[i]);
                        goto fail1;
                }

                /*
                 * One PD (Protection Domain) per HCA.
                 * A qp is allowed to access a memory region
                 * only when it's in the same PD as that of
                 * the memory region.
                 */
                ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
                if (ibt_status != IBT_SUCCESS) {
                        cmn_err(CE_WARN, "open_hcas: ibt_alloc_pd "
                                "returned %d (hca_guid 0x%llx)",
                                ibt_status, (longlong_t)ribstat->hca_guids[i]);
                        goto fail1;
                }

                /*
                 * query HCA ports
                 */
                ibt_status = ibt_query_hca_ports(hca->hca_hdl,
                                0, &pinfop, &hca->hca_nports, &size);
                if (ibt_status != IBT_SUCCESS) {
                        cmn_err(CE_WARN, "open_hcas: "
                                "ibt_query_hca_ports returned %d "
                                "(hca_guid 0x%llx)",
                                ibt_status, (longlong_t)hca->hca_guid);
                        goto fail2;
                }
                hca->hca_ports = pinfop;
                hca->hca_pinfosz = size;
                pinfop = NULL;

                cq_size = DEF_CQ_SIZE; /* default cq size */
                /*
                 * Create 2 pairs of cq's (1 pair for client
                 * and the other pair for server) on this hca.
                 * If number of qp's gets too large, then several
                 * cq's will be needed.
                 */
                status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
                                &hca->svc_rcq, ribstat);
                if (status != RDMA_SUCCESS) {
                        goto fail3;
                }

                status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
                                &hca->svc_scq, ribstat);
                if (status != RDMA_SUCCESS) {
                        goto fail3;
                }

                status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
                                &hca->clnt_rcq, ribstat);
                if (status != RDMA_SUCCESS) {
                        goto fail3;
                }

                status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
                                &hca->clnt_scq, ribstat);
                if (status != RDMA_SUCCESS) {
                        goto fail3;
                }

                /*
                 * Create buffer pools.
                 * Note rib_rbuf_create also allocates memory windows.
                 */
                hca->recv_pool = rib_rbufpool_create(hca,
                                        RECV_BUFFER, MAX_BUFS);
                if (hca->recv_pool == NULL) {
                        cmn_err(CE_WARN, "open_hcas: recv buf pool failed\n");
                        goto fail3;
                }

                hca->send_pool = rib_rbufpool_create(hca,
                                        SEND_BUFFER, MAX_BUFS);
                if (hca->send_pool == NULL) {
                        cmn_err(CE_WARN, "open_hcas: send buf pool failed\n");
                        rib_rbufpool_destroy(hca, RECV_BUFFER);
                        goto fail3;
                }

                /*
                 * Initialize the registered service list and
                 * the lock
                 */
                hca->service_list = NULL;
                rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);

                mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
                cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
                rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
                        hca->iblock);
                rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
                        hca->iblock);
                rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
                mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
                hca->inuse = TRUE;
                /*
                 * XXX One hca only. Add multi-hca functionality if needed
                 * later.
                 */
                ribstat->hca = hca;
                ribstat->nhca_inited++;
                ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
                /* First HCA fully set up: done (see XXX above). */
                break;

		/* Per-iteration unwind; falls through in reverse order. */
fail3:
                ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
fail2:
                (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
fail1:
                (void) ibt_close_hca(hca->hca_hdl);

        }
        if (ribstat->hca != NULL)
                return (RDMA_SUCCESS);
        else
                return (RDMA_FAILED);
}
788 
789 /*
790  * Callback routines
791  */
792 
793 /*
794  * SCQ handlers
795  */
796 /* ARGSUSED */
797 static void
798 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
799 {
800         ibt_status_t    ibt_status;
801         ibt_wc_t        wc;
802         int             i;
803 
804         /*
805          * Re-enable cq notify here to avoid missing any
806          * completion queue notification.
807          */
808         (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
809 
810         ibt_status = IBT_SUCCESS;
811         while (ibt_status != IBT_CQ_EMPTY) {
812             bzero(&wc, sizeof (wc));
813             ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
814             if (ibt_status != IBT_SUCCESS)
815                 return;
816 
817         /*
818          * Got a send completion
819          */
820             if (wc.wc_id != NULL) {     /* XXX can it be otherwise ???? */
821                 struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
822                 CONN    *conn = qptoc(wd->qp);
823 
824                 mutex_enter(&wd->sendwait_lock);
825                 switch (wc.wc_status) {
826                 case IBT_WC_SUCCESS:
827                         wd->status = RDMA_SUCCESS;
828                         break;
829                 case IBT_WC_WR_FLUSHED_ERR:
830                         wd->status = RDMA_FAILED;
831                         break;
832                 default:
833 /*
834  *    RC Send Q Error Code              Local state     Remote State
835  *    ====================              ===========     ============
836  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
837  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
838  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
839  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
840  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
841  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
842  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
843  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
844  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
845  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
846  *    IBT_WC_WR_FLUSHED_ERR               None            None
847  */
848 #ifdef DEBUG
849         if (rib_debug > 1) {
850             if (wc.wc_status != IBT_WC_SUCCESS) {
851                     cmn_err(CE_NOTE, "rib_clnt_scq_handler: "
852                         "WR completed in error, wc.wc_status:%d, "
853                         "wc_id:%llx\n", wc.wc_status, (longlong_t)wc.wc_id);
854             }
855         }
856 #endif
857                         /*
858                          * Channel in error state. Set connection to
859                          * ERROR and cleanup will happen either from
860                          * conn_release  or from rib_conn_get
861                          */
862                         wd->status = RDMA_FAILED;
863                         mutex_enter(&conn->c_lock);
864                         if (conn->c_state != C_DISCONN_PEND)
865                                 conn->c_state = C_ERROR;
866                         mutex_exit(&conn->c_lock);
867                         break;
868                 }
869                 if (wd->cv_sig == 1) {
870                         /*
871                          * Notify poster
872                          */
873                         cv_signal(&wd->wait_cv);
874                         mutex_exit(&wd->sendwait_lock);
875                 } else {
876                         /*
877                          * Poster not waiting for notification.
878                          * Free the send buffers and send_wid
879                          */
880                         for (i = 0; i < wd->nsbufs; i++) {
881                                 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
882                                         (void *)(uintptr_t)wd->sbufaddr[i]);
883                         }
884                         mutex_exit(&wd->sendwait_lock);
885                         (void) rib_free_sendwait(wd);
886                 }
887             }
888         }
889 }
890 














































































/*
 * Server-side send completion queue (SCQ) handler.
 *
 * Re-arms CQ notification first, then drains the CQ one work completion
 * at a time.  For each completed send WR: if the poster is blocked
 * waiting on the send_wid (wd->cv_sig == 1), the completion status is
 * recorded in wd->status and the poster is signalled; otherwise the
 * send buffers and the send_wid itself are reclaimed here.
 */
/* ARGSUSED */
static void
rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
        ibt_status_t    ibt_status;
        ibt_wc_t        wc;
        int             i;

        /*
         * Re-enable cq notify here to avoid missing any
         * completion queue notification.
         */
        (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

        ibt_status = IBT_SUCCESS;
        while (ibt_status != IBT_CQ_EMPTY) {
            bzero(&wc, sizeof (wc));
            ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
            /* Return on IBT_CQ_EMPTY as well as on any poll failure. */
            if (ibt_status != IBT_SUCCESS)
                return;

        /*
         * Got a send completion
         */
#ifdef DEBUG
            if (rib_debug > 1 && wc.wc_status != IBT_WC_SUCCESS) {
                cmn_err(CE_NOTE, "rib_svc_scq_handler: WR completed in error "
                        "wc.wc_status:%d, wc_id:%llX",
                        wc.wc_status, (longlong_t)wc.wc_id);
            }
#endif
            if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
                /* The WR id carries the send_wid posted with this send. */
                struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;

                mutex_enter(&wd->sendwait_lock);
                if (wd->cv_sig == 1) {
                        /*
                         * Update completion status and notify poster
                         */
                        if (wc.wc_status == IBT_WC_SUCCESS)
                                wd->status = RDMA_SUCCESS;
                        else
                                wd->status = RDMA_FAILED;
                        cv_signal(&wd->wait_cv);
                        mutex_exit(&wd->sendwait_lock);
                } else {
                        /*
                         * Poster not waiting for notification.
                         * Free the send buffers and send_wid
                         */
                        for (i = 0; i < wd->nsbufs; i++) {
                                rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
                                        (void *)(uintptr_t)wd->sbufaddr[i]);
                        }
                        /* Drop the lock before wd itself is freed. */
                        mutex_exit(&wd->sendwait_lock);
                        (void) rib_free_sendwait(wd);
                }
            }
        }
}
951 
/*
 * RCQ handler
 *
 * Client-side receive completion queue handler.  Drains the CQ; for
 * each successfully received message the RPC/RDMA header is decoded
 * (xid, version, operation) and, if an RPC caller is waiting on the
 * matching xid in qp->replylist, the reply buffer is handed to it and
 * the waiter signalled.  Receive buffers with no matching waiter,
 * flushed WRs, and completions in error are freed here; non-flush
 * errors also move the connection to C_ERROR.
 */
/* ARGSUSED */
static void
rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
        rib_qp_t        *qp;
        ibt_status_t    ibt_status;
        ibt_wc_t        wc;
        struct recv_wid *rwid;

        /*
         * Re-enable cq notify here to avoid missing any
         * completion queue notification.
         */
        (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

        ibt_status = IBT_SUCCESS;
        while (ibt_status != IBT_CQ_EMPTY) {
                bzero(&wc, sizeof (wc));
                ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
                /* Stop on IBT_CQ_EMPTY as well as on any poll failure. */
                if (ibt_status != IBT_SUCCESS)
                    return;

                /*
                 * The WR id carries the recv_wid posted with this buffer.
                 * NOTE(review): unlike the scq handlers, wc_id is used
                 * here without a NULL check -- confirm it is always valid
                 * for receive completions.
                 */
                rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
                qp = rwid->qp;
                if (wc.wc_status == IBT_WC_SUCCESS) {
                    XDR                 inxdrs, *xdrs;
                    uint_t              xid, vers, op, find_xid = 0;
                    struct reply        *r;
                    CONN *conn = qptoc(qp);

                    xdrs = &inxdrs;
                    xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
                        wc.wc_bytes_xfer, XDR_DECODE);
                /*
                 * Treat xid as opaque (xid is the first entity
                 * in the rpc rdma message).
                 */
                    xid = *(uint32_t *)(uintptr_t)rwid->addr;
                /* Skip xid and set the xdr position accordingly. */
                    XDR_SETPOS(xdrs, sizeof (uint32_t));
                    (void) xdr_u_int(xdrs, &vers);
                    (void) xdr_u_int(xdrs, &op);
                    XDR_DESTROY(xdrs);
                    if (vers != RPCRDMA_VERS) {
                        /*
                         * Invalid RPC/RDMA version. Cannot interoperate.
                         * Set connection to ERROR state and bail out.
                         */
                        mutex_enter(&conn->c_lock);
                        if (conn->c_state != C_DISCONN_PEND)
                                conn->c_state = C_ERROR;
                        mutex_exit(&conn->c_lock);
                        rib_rbuf_free(conn, RECV_BUFFER,
                                (void *)(uintptr_t)rwid->addr);
                        rib_free_wid(rwid);
                        continue;
                    }

                    /*
                     * Look for the RPC caller waiting on this xid; on a
                     * match hand over the buffer and wake the caller.
                     */
                    mutex_enter(&qp->replylist_lock);
                    for (r = qp->replylist; r != NULL; r = r->next) {
                        if (r->xid == xid) {
                            find_xid = 1;
                            switch (op) {
                            case RDMA_MSG:
                            case RDMA_NOMSG:
                            case RDMA_MSGP:
                                /*
                                 * Buffer ownership passes to the waiter
                                 * via r->vaddr_cq.
                                 */
                                r->status = RDMA_SUCCESS;
                                r->vaddr_cq = rwid->addr;
                                r->bytes_xfer = wc.wc_bytes_xfer;
                                cv_signal(&r->wait_cv);
                                break;
                            default:
                                /* Unknown operation: drop the message. */
                                rib_rbuf_free(qptoc(qp), RECV_BUFFER,
                                                (void *)(uintptr_t)rwid->addr);
                                break;
                            }
                            break;
                        }
                    }
                    mutex_exit(&qp->replylist_lock);
                    if (find_xid == 0) {
                        /* RPC caller not waiting for reply */
#ifdef DEBUG
                            if (rib_debug) {
                        cmn_err(CE_NOTE, "rib_clnt_rcq_handler: "
                            "NO matching xid %u!\n", xid);
                            }
#endif
                        rib_rbuf_free(qptoc(qp), RECV_BUFFER,
                                (void *)(uintptr_t)rwid->addr);
                    }
                } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
                        CONN *conn = qptoc(qp);

                        /*
                         * Connection being flushed. Just free
                         * the posted buffer
                         */
                        rib_rbuf_free(conn, RECV_BUFFER,
                                (void *)(uintptr_t)rwid->addr);
                } else {
                        CONN *conn = qptoc(qp);
/*
 *  RC Recv Q Error Code                Local state     Remote State
 *  ====================                ===========     ============
 *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
 *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
 *  IBT_WC_WR_FLUSHED_ERR               None            None
 */
                        /*
                         * Channel in error state. Set connection
                         * in ERROR state.
                         */
                        mutex_enter(&conn->c_lock);
                        if (conn->c_state != C_DISCONN_PEND)
                                conn->c_state = C_ERROR;
                        mutex_exit(&conn->c_lock);
                        rib_rbuf_free(conn, RECV_BUFFER,
                                (void *)(uintptr_t)rwid->addr);
                }
                /* The recv_wid tracking struct is always reclaimed here. */
                rib_free_wid(rwid);
        }
}
1081 
/* Server side */
/*
 * Server-side receive completion queue handler.
 *
 * Drains the CQ.  For every completion the posted-receive-buffer count
 * is decremented (signalling posted_rbufs_cv when it hits zero).  On a
 * successful receive the RPC/RDMA header is decoded: messages with a
 * bad header or version are dropped; RDMA_DONE wakes the thread waiting
 * on that xid via rdma_done_notify(); any other operation is wrapped in
 * an mblk and queued to krpc with svc_queuereq() -- but only while the
 * plugin is in ACCEPT state, otherwise the buffer is freed.
 */
/* ARGSUSED */
static void
rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
        struct recv_data *rd;
        rib_qp_t        *qp;
        ibt_status_t    ibt_status;
        ibt_wc_t        wc;
        struct svc_recv *s_recvp;
        CONN            *conn;
        mblk_t          *mp;

        /*
         * Re-enable cq notify here to avoid missing any
         * completion queue notification.
         */
        (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

        ibt_status = IBT_SUCCESS;
        while (ibt_status != IBT_CQ_EMPTY) {
                bzero(&wc, sizeof (wc));
                ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
                /* Stop on IBT_CQ_EMPTY as well as on any poll failure. */
                if (ibt_status != IBT_SUCCESS)
                    return;

                /* The WR id carries the svc_recv posted with this buffer. */
                s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
                qp = s_recvp->qp;
                conn = qptoc(qp);
                /* One fewer receive buffer outstanding on this qp. */
                mutex_enter(&qp->posted_rbufs_lock);
                qp->n_posted_rbufs--;
                if (qp->n_posted_rbufs == 0)
                        cv_signal(&qp->posted_rbufs_cv);
                mutex_exit(&qp->posted_rbufs_lock);

                if (wc.wc_status == IBT_WC_SUCCESS) {
                    XDR         inxdrs, *xdrs;
                    uint_t      xid, vers, op;

                    xdrs = &inxdrs;
                    /* s_recvp->vaddr stores data */
                    xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
                        wc.wc_bytes_xfer, XDR_DECODE);

                /*
                 * Treat xid as opaque (xid is the first entity
                 * in the rpc rdma message).
                 */
                    xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
                /* Skip xid and set the xdr position accordingly. */
                    XDR_SETPOS(xdrs, sizeof (uint32_t));
                    if (!xdr_u_int(xdrs, &vers) ||
                        !xdr_u_int(xdrs, &op)) {
                        /* Undecodable header: drop the message. */
                        rib_rbuf_free(conn, RECV_BUFFER,
                                (void *)(uintptr_t)s_recvp->vaddr);
                        XDR_DESTROY(xdrs);
#ifdef DEBUG
                        cmn_err(CE_NOTE, "rib_svc_rcq_handler: "
                            "xdr_u_int failed for qp %p, wc_id=%llx",
                            (void *)qp, (longlong_t)wc.wc_id);
#endif
                        (void) rib_free_svc_recv(s_recvp);
                        continue;
                    }
                    XDR_DESTROY(xdrs);

                    if (vers != RPCRDMA_VERS) {
                        /*
                         * Invalid RPC/RDMA version. Drop rpc rdma message.
                         */
                        rib_rbuf_free(conn, RECV_BUFFER,
                                (void *)(uintptr_t)s_recvp->vaddr);
                        (void) rib_free_svc_recv(s_recvp);
                        continue;
                    }
                        /*
                         * Is this for RDMA_DONE?
                         */
                    if (op == RDMA_DONE) {
                        rib_rbuf_free(conn, RECV_BUFFER,
                                (void *)(uintptr_t)s_recvp->vaddr);
                        /*
                         * Wake up the thread waiting on
                         * a RDMA_DONE for xid
                         */
                        mutex_enter(&qp->rdlist_lock);
                        rdma_done_notify(qp, xid);
                        mutex_exit(&qp->rdlist_lock);
                        (void) rib_free_svc_recv(s_recvp);
                        continue;
                    }

                    mutex_enter(&plugin_state_lock);
                    if (plugin_state == ACCEPT) {
                        /* allocb cannot fail: block until memory arrives. */
                        while ((mp = allocb(sizeof (*rd), BPRI_LO)) == NULL)
                            (void) strwaitbuf(sizeof (*rd), BPRI_LO);
                        /*
                         * Plugin is in accept state, hence the master
                         * transport queue for this is still accepting
                         * requests. Hence we can call svc_queuereq to
                         * queue this received msg.  The receive buffer's
                         * ownership passes to krpc with the mblk.
                         */
                        rd = (struct recv_data *)mp->b_rptr;
                        rd->conn = conn;
                        rd->rpcmsg.addr = (caddr_t)(uintptr_t)s_recvp->vaddr;
                        rd->rpcmsg.type = RECV_BUFFER;
                        rd->rpcmsg.len = wc.wc_bytes_xfer;
                        rd->status = wc.wc_status;
                        /* Hold a connection reference for the queued msg. */
                        mutex_enter(&conn->c_lock);
                        conn->c_ref++;
                        mutex_exit(&conn->c_lock);
                        mp->b_wptr += sizeof (*rd);
                        svc_queuereq((queue_t *)rib_stat->q, mp);
                        mutex_exit(&plugin_state_lock);
                    } else {
                        /*
                         * The master transport for this is going
                         * away and the queue is not accepting anymore
                         * requests for krpc, so don't do anything, just
                         * free the msg.
                         */
                        mutex_exit(&plugin_state_lock);
                        rib_rbuf_free(conn, RECV_BUFFER,
                        (void *)(uintptr_t)s_recvp->vaddr);
                    }
                } else {
                        /* Completion in error: reclaim the buffer. */
                        rib_rbuf_free(conn, RECV_BUFFER,
                                (void *)(uintptr_t)s_recvp->vaddr);
                }
                /* The svc_recv tracking struct is always reclaimed here. */
                (void) rib_free_svc_recv(s_recvp);
        }
}
1214 
1215 /*
1216  * Handles DR event of IBT_HCA_DETACH_EVENT.
1217  */
1218 /* ARGSUSED */
1219 static void
1220 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1221         ibt_async_code_t code, ibt_async_event_t *event)
1222 {
1223 
1224         switch (code) {
1225         case IBT_HCA_ATTACH_EVENT:
1226                 /* ignore */
1227                 break;
1228         case IBT_HCA_DETACH_EVENT:
1229         {
1230                 ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
1231                 rib_detach_hca(rib_stat->hca);
1232 #ifdef DEBUG
1233         cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1234 #endif
1235                 break;
1236         }
1237 #ifdef DEBUG
1238         case IBT_EVENT_PATH_MIGRATED:
1239         cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PATH_MIGRATED\n");
1240                 break;
1241         case IBT_EVENT_SQD:
1242         cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1243                 break;
1244         case IBT_EVENT_COM_EST:
1245         cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1246                 break;
1247         case IBT_ERROR_CATASTROPHIC_CHAN:
1248         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CATASTROPHIC_CHAN\n");
1249                 break;
1250         case IBT_ERROR_INVALID_REQUEST_CHAN:
1251         cmn_err(CE_NOTE, "rib_async_handler(): "
1252                 "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1253                 break;
1254         case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1255         cmn_err(CE_NOTE, "rib_async_handler(): "
1256                 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1257                 break;
1258         case IBT_ERROR_PATH_MIGRATE_REQ:
1259         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PATH_MIGRATE_REQ\n");
1260                 break;
1261         case IBT_ERROR_CQ:
1262         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1263                 break;
1264         case IBT_ERROR_PORT_DOWN:
1265         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1266                 break;
1267         case IBT_EVENT_PORT_UP:
1268         cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1269                 break;
1270         case IBT_ASYNC_OPAQUE1:
1271         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1272                 break;
1273         case IBT_ASYNC_OPAQUE2:
1274         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1275                 break;
1276         case IBT_ASYNC_OPAQUE3:
1277         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1278                 break;
1279         case IBT_ASYNC_OPAQUE4:
1280         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1281                 break;
1282 #endif
1283         default:
1284                 break;
1285         }
1286 }
1287 
1288 /*
1289  * Client's reachable function.
1290  */
1291 static rdma_stat
1292 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1293 {
1294         rib_hca_t       *hca;
1295         rdma_stat       status;
1296 
1297         /*
1298          * First check if a hca is still attached
1299          */
1300         *handle = NULL;
1301         rw_enter(&rib_stat->hca->state_lock, RW_READER);
1302         if (rib_stat->hca->state != HCA_INITED) {
1303                 rw_exit(&rib_stat->hca->state_lock);
1304                 return (RDMA_FAILED);
1305         }
1306         status = rib_ping_srv(addr_type, raddr, &hca);
1307         rw_exit(&rib_stat->hca->state_lock);
1308 
1309         if (status == RDMA_SUCCESS) {
1310                 *handle = (void *)hca;
1311                 /*
1312                  * Register the Address translation service
1313                  */
1314                 mutex_enter(&rib_stat->open_hca_lock);
1315                 if (ats_running == 0) {
1316                         if (rib_register_ats(rib_stat->hca)
1317                             == RDMA_SUCCESS) {
1318                                 ats_running = 1;
1319                                 mutex_exit(&rib_stat->open_hca_lock);
1320                                 return (RDMA_SUCCESS);
1321                         } else {
1322                                 mutex_exit(&rib_stat->open_hca_lock);
1323                                 return (RDMA_FAILED);
1324                         }
1325                 } else {
1326                         mutex_exit(&rib_stat->open_hca_lock);
1327                         return (RDMA_SUCCESS);
1328                 }
1329         } else {
1330                 *handle = NULL;
1331                 if (rib_debug > 2)
1332                     cmn_err(CE_WARN, "rib_reachable(): ping_srv failed.\n");
1333                 return (RDMA_FAILED);
1334         }
1335 }
1336 
1337 /* Client side qp creation */
1338 static rdma_stat
1339 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1340 {
1341         rib_qp_t        *kqp = NULL;
1342         CONN            *conn;

1343 
1344         ASSERT(qp != NULL);
1345         *qp = NULL;
1346 
1347         kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1348         conn = qptoc(kqp);
1349         kqp->hca = hca;
1350         kqp->rdmaconn.c_rdmamod = &rib_mod;
1351         kqp->rdmaconn.c_private = (caddr_t)kqp;
1352 
1353         kqp->mode = RIB_CLIENT;
1354         kqp->chan_flags = IBT_BLOCKING;
1355         conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1356         bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1357         conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1358 
1359         /*
1360          * Initialize
1361          */
1362         cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1363         cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1364         mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1365         mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1366         mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1367         mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1368         cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1369         mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);















1370 
1371         *qp = kqp;
1372         return (RDMA_SUCCESS);
1373 }
1374 
1375 /* Server side qp creation */
1376 static rdma_stat
1377 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1378 {
1379         rib_qp_t        *kqp = NULL;
1380         ibt_chan_sizes_t        chan_sizes;
1381         ibt_rc_chan_alloc_args_t        qp_attr;
1382         ibt_status_t            ibt_status;

1383 
1384         ASSERT(qp != NULL);
1385         *qp = NULL;
1386 
1387         kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1388         kqp->hca = hca;
1389         kqp->port_num = port;
1390         kqp->rdmaconn.c_rdmamod = &rib_mod;
1391         kqp->rdmaconn.c_private = (caddr_t)kqp;
1392 
1393         /*
1394          * Create the qp handle
1395          */
1396         bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1397         qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1398         qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1399         qp_attr.rc_pd = hca->pd_hdl;
1400         qp_attr.rc_hca_port_num = port;
1401         qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1402         qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1403         qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1404         qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1405         qp_attr.rc_clone_chan = NULL;
1406         qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1407         qp_attr.rc_flags = IBT_WR_SIGNALED;
1408 
1409         rw_enter(&hca->state_lock, RW_READER);
1410         if (hca->state != HCA_DETACHED) {
1411                 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1412                         IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1413                         &chan_sizes);
1414         } else {
1415                 rw_exit(&hca->state_lock);
1416                 goto fail;
1417         }
1418         rw_exit(&hca->state_lock);
1419 
1420         if (ibt_status != IBT_SUCCESS) {
1421                 cmn_err(CE_WARN, "rib_svc_create_chan: "
1422                         "ibt_alloc_rc_channel failed, ibt_status=%d.",
1423                         ibt_status);
1424                 goto fail;
1425         }
1426 
1427         kqp->mode = RIB_SERVER;
1428         kqp->chan_flags = IBT_BLOCKING;
1429         kqp->q = q;  /* server ONLY */
1430 
1431         cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1432         cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1433         mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1434         mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1435         mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1436         mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1437         cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1438         mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1439         /*
1440          * Set the private data area to qp to be used in callbacks
1441          */
1442         ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1443         kqp->rdmaconn.c_state = C_CONNECTED;











1444         *qp = kqp;


1445         return (RDMA_SUCCESS);
1446 fail:
1447         if (kqp)
1448                 kmem_free(kqp, sizeof (rib_qp_t));
1449 
1450         return (RDMA_FAILED);
1451 }
1452 
1453 void
1454 rib_dump_pathrec(ibt_path_info_t *path_rec)
1455 {
1456         ib_pkey_t       pkey;
1457 
1458         if (rib_debug > 1) {
1459             cmn_err(CE_NOTE, "Path Record:\n");
1460 
1461             cmn_err(CE_NOTE, "Source HCA GUID = %llx\n",
1462                 (longlong_t)path_rec->pi_hca_guid);
1463             cmn_err(CE_NOTE, "Dest Service ID = %llx\n",
1464                 (longlong_t)path_rec->pi_sid);
1465             cmn_err(CE_NOTE, "Port Num        = %02d\n",
1466                 path_rec->pi_prim_cep_path.cep_hca_port_num);
1467             cmn_err(CE_NOTE, "P_Key Index     = %04d\n",
1468                 path_rec->pi_prim_cep_path.cep_pkey_ix);
1469 
1470             (void) ibt_index2pkey_byguid(path_rec->pi_hca_guid,
1471                         path_rec->pi_prim_cep_path.cep_hca_port_num,
1472                         path_rec->pi_prim_cep_path.cep_pkey_ix, &pkey);
1473             cmn_err(CE_NOTE, "P_Key             = 0x%x\n", pkey);
1474 
1475 
1476             cmn_err(CE_NOTE, "SGID:           = %llx:%llx\n",
1477                 (longlong_t)
1478                 path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_prefix,
1479                 (longlong_t)
1480                 path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_guid);
1481 
1482             cmn_err(CE_NOTE, "DGID:           = %llx:%llx\n",
1483                 (longlong_t)
1484                 path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_prefix,
1485                 (longlong_t)
1486                 path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_guid);
1487 
1488             cmn_err(CE_NOTE, "Path Rate       = %02x\n",
1489                 path_rec->pi_prim_cep_path.cep_adds_vect.av_srate);
1490             cmn_err(CE_NOTE, "SL              = %02x\n",
1491                 path_rec->pi_prim_cep_path.cep_adds_vect.av_srvl);
1492             cmn_err(CE_NOTE, "Prim Packet LT  = %02x\n",
1493                 path_rec->pi_prim_pkt_lt);
1494             cmn_err(CE_NOTE, "Path MTU        = %02x\n",
1495                 path_rec->pi_path_mtu);
1496         }
1497 }
1498 
/*
 * Client-side connection manager (CM) event handler.
 *
 * Only IBT_CM_EVENT_CONN_CLOSED is acted upon.  Closes initiated
 * locally are ignored; when the remote end closes the channel, the
 * connection is moved to C_ERROR, the rc_channel is freed, and -- if
 * no references remain -- the connection is torn down via
 * rib_disconnect_channel().  Always returns IBT_CM_ACCEPT.
 */
/* ARGSUSED */
ibt_cm_status_t
rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
    ibt_cm_return_args_t *ret_args, void *priv_data,
    ibt_priv_data_len_t len)
{
        rpcib_state_t   *ribstat;
        rib_hca_t       *hca;

        ribstat = (rpcib_state_t *)clnt_hdl;
        hca = (rib_hca_t *)ribstat->hca;

        switch (event->cm_type) {

        /* got a connection close event */
        case IBT_CM_EVENT_CONN_CLOSED:
        {
                CONN    *conn;
                rib_qp_t *qp;

                /* check reason why connection was closed */
                switch (event->cm_event.closed) {
                case IBT_CM_CLOSED_DREP_RCVD:
                case IBT_CM_CLOSED_DREQ_TIMEOUT:
                case IBT_CM_CLOSED_DUP:
                case IBT_CM_CLOSED_ABORT:
                case IBT_CM_CLOSED_ALREADY:
                        /*
                         * These cases indicate the local end initiated
                         * the closing of the channel. Nothing to do here.
                         */
                        break;
                default:
                        /*
                         * Reason for CONN_CLOSED event must be one of
                         * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
                         * or IBT_CM_CLOSED_STALE. These indicate cases were
                         * the remote end is closing the channel. In these
                         * cases free the channel and transition to error
                         * state
                         */
                        qp = ibt_get_chan_private(event->cm_channel);
                        conn = qptoc(qp);
                        mutex_enter(&conn->c_lock);
                        /* Teardown already in progress: nothing to do. */
                        if (conn->c_state == C_DISCONN_PEND) {
                                mutex_exit(&conn->c_lock);
                                break;
                        }

                        conn->c_state = C_ERROR;

                        /*
                         * Free the rc_channel. Channel has already
                         * transitioned to ERROR state and WRs have been
                         * FLUSHED_ERR already.
                         */
                        (void) ibt_free_channel(qp->qp_hdl);
                        qp->qp_hdl = NULL;

                        /*
                         * Free the conn if c_ref is down to 0 already
                         */
                        if (conn->c_ref == 0) {
                                /*
                                 * Remove from list and free conn
                                 */
                                conn->c_state = C_DISCONN_PEND;
                                mutex_exit(&conn->c_lock);
                                (void) rib_disconnect_channel(conn,
                                        &hca->cl_conn_list);
                        } else {
                                /*
                                 * References remain; cleanup is deferred
                                 * to conn_release/rib_conn_get.
                                 */
                                mutex_exit(&conn->c_lock);
                        }
#ifdef DEBUG
                        if (rib_debug)
                                cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
                                        "(CONN_CLOSED) channel disconnected");
#endif
                        break;
                }
                break;
        }
        default:
                break;
        }
        return (IBT_CM_ACCEPT);
}
1586 
1587 
/*
 * Check if server has done ATS (Address Translation Service)
 * registration.
 *
 * Builds the service name "<ip-address>::NFS" from raddr/addr_type
 * and, for each port on the HCA's ATS service list, asks IBTF for
 * paths to that service.  Each candidate path is verified with
 * ibt_query_ar(); the first path that passes is copied out through
 * 'path' and is the one the caller should use to connect.
 *
 * Returns:
 *	RDMA_SUCCESS - a verified path was found and copied into *path
 *	RDMA_INVAL   - addr_type is neither AF_INET nor AF_INET6
 *	RDMA_FAILED  - HCA detached, or no path passed ibt_query_ar()
 */
rdma_stat
rib_chk_srv_ats(rib_hca_t *hca, struct netbuf *raddr,
	int addr_type, ibt_path_info_t *path)
{
	struct sockaddr_in	*sin4;
	struct sockaddr_in6	*sin6;
	ibt_path_attr_t		path_attr;
	ibt_status_t		ibt_status;
	ib_pkey_t		pkey;
	ibt_ar_t		ar_query, ar_result;
	rib_service_t		*ats;
	ib_gid_t		sgid;
	ibt_path_info_t		paths[MAX_PORTS];
	uint8_t			npaths, i;

	(void) bzero(&path_attr, sizeof (ibt_path_attr_t));
	(void) bzero(path, sizeof (ibt_path_info_t));

	/*
	 * Construct svc name: the server's IP address in string form,
	 * with "::NFS" appended below.
	 */
	path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
	switch (addr_type) {
	case AF_INET:
		sin4 = (struct sockaddr_in *)raddr->buf;
		(void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
		    IB_SVC_NAME_LEN);
		break;

	case AF_INET6:
		sin6 = (struct sockaddr_in6 *)raddr->buf;
		(void) inet_ntop(AF_INET6, &sin6->sin6_addr,
		    path_attr.pa_sname, IB_SVC_NAME_LEN);
		break;

	default:
		kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
		return (RDMA_INVAL);
	}
	(void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);

	/*
	 * Attempt a path to the server on an ATS-registered port.
	 * Try all ATS-registered ports until one succeeds.
	 * The first one that succeeds will be used to connect
	 * to the server.  If none of them succeed, return RDMA_FAILED.
	 *
	 * state_lock (reader) guards against HCA detach for the whole
	 * search; service_list_lock (reader) protects the ats_list walk.
	 */
	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
	    rw_enter(&hca->service_list_lock, RW_READER);
	    for (ats = hca->ats_list; ats != NULL; ats = ats->srv_next) {
		path_attr.pa_hca_guid = hca->hca_guid;
		path_attr.pa_hca_port_num = ats->srv_port;
		ibt_status = ibt_get_paths(hca->ibt_clnt_hdl,
			IBT_PATH_MULTI_SVC_DEST, &path_attr, 2, paths, &npaths);
		/* IBT_INSUFF_DATA still returns some usable paths */
		if (ibt_status == IBT_SUCCESS ||
			ibt_status == IBT_INSUFF_DATA) {
		    for (i = 0; i < npaths; i++) {
			if (paths[i].pi_hca_guid) {
			/*
			 * do ibt_query_ar() to confirm that the
			 * destination GID/pkey has an ATS address
			 * record (i.e. the server registered with ATS).
			 */
			    sgid =
				paths[i].pi_prim_cep_path.cep_adds_vect.av_sgid;

			    (void) ibt_index2pkey_byguid(paths[i].pi_hca_guid,
				paths[i].pi_prim_cep_path.cep_hca_port_num,
				paths[i].pi_prim_cep_path.cep_pkey_ix, &pkey);

			    bzero(&ar_query, sizeof (ar_query));
			    bzero(&ar_result, sizeof (ar_result));
			    ar_query.ar_gid =
				paths[i].pi_prim_cep_path.cep_adds_vect.av_dgid;
			    ar_query.ar_pkey = pkey;
			    ibt_status = ibt_query_ar(&sgid, &ar_query,
					&ar_result);
			    if (ibt_status == IBT_SUCCESS) {
#ifdef DEBUG
				if (rib_debug > 1)
				    rib_dump_pathrec(&paths[i]);
#endif
				/* Verified path: hand it back and unwind */
				bcopy(&paths[i], path,
					sizeof (ibt_path_info_t));
				rw_exit(&hca->service_list_lock);
				kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
				rw_exit(&hca->state_lock);
				return (RDMA_SUCCESS);
			    }
#ifdef DEBUG
			    if (rib_debug) {
				cmn_err(CE_NOTE, "rib_chk_srv_ats: "
				    "ibt_query_ar FAILED, return\n");
			    }
#endif
			}
		    }
		}
	    }
	    rw_exit(&hca->service_list_lock);
	}
	kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
	rw_exit(&hca->state_lock);
	return (RDMA_FAILED);
}
1693 
1694 
/*
 * Connect to the server.
 *
 * Allocates an RC channel on 'hca' using the supplied path and opens
 * it to the server in blocking mode.  rib_clnt_cm_handler is installed
 * as the CM callback with the global rib_stat as its private argument.
 * If the open fails with IBT_CM_CONN_STALE, the whole alloc/open
 * sequence is retried up to REFRESH_ATTEMPTS times.  On success the
 * channel's private data is set to 'qp' for use by callbacks.
 *
 * Returns RDMA_SUCCESS or RDMA_FAILED.
 */
rdma_stat
rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path)
{
	ibt_chan_open_args_t	chan_args;	/* channel args */
	ibt_chan_sizes_t	chan_sizes;
	ibt_rc_chan_alloc_args_t	qp_attr;
	ibt_status_t		ibt_status;
	ibt_rc_returns_t	ret_args;	/* conn reject info */
	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */

	(void) bzero(&chan_args, sizeof (chan_args));
	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));

	qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
	/* Alloc a RC channel: client send/recv CQs, signaled completions */
	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
	qp_attr.rc_pd = hca->pd_hdl;
	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
	qp_attr.rc_clone_chan = NULL;
	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
	qp_attr.rc_flags = IBT_WR_SIGNALED;

	chan_args.oc_path = path;
	chan_args.oc_cm_handler = rib_clnt_cm_handler;
	chan_args.oc_cm_clnt_private = (void *)rib_stat;
	chan_args.oc_rdma_ra_out = 1;
	chan_args.oc_rdma_ra_in = 1;
	chan_args.oc_path_retry_cnt = 2;
	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;

refresh:
	/* Allocate the channel only while the HCA is still attached */
	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
			IBT_ACHAN_NO_FLAGS, &qp_attr, &qp->qp_hdl,
			&chan_sizes);
	} else {
		rw_exit(&hca->state_lock);
		return (RDMA_FAILED);
	}
	rw_exit(&hca->state_lock);

	if (ibt_status != IBT_SUCCESS) {
#ifdef DEBUG
		cmn_err(CE_WARN, "rib_conn_to_srv: alloc_rc_channel "
		"failed, ibt_status=%d.", ibt_status);
#endif
		return (RDMA_FAILED);
	}

	/* Connect to the Server */
	(void) bzero(&ret_args, sizeof (ret_args));
	mutex_enter(&qp->cb_lock);
	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
			IBT_BLOCKING, &chan_args, &ret_args);
	if (ibt_status != IBT_SUCCESS) {
#ifdef DEBUG
		if (rib_debug)
			cmn_err(CE_WARN, "rib_conn_to_srv: open_rc_channel"
				" failed for qp %p, status=%d, "
				"ret_args.rc_status=%d\n",
				(void *)qp, ibt_status, ret_args.rc_status);
#endif
		/* Failed open: discard this channel before any retry */
		(void) ibt_free_channel(qp->qp_hdl);
		qp->qp_hdl = NULL;
		mutex_exit(&qp->cb_lock);
		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
			ret_args.rc_status == IBT_CM_CONN_STALE) {
			/*
			 * Got IBT_CM_CONN_STALE probably because of stale
			 * data on the passive end of a channel that existed
			 * prior to reboot. Retry establishing a channel
			 * REFRESH_ATTEMPTS times, during which time the
			 * stale conditions on the server might clear up.
			 */
			goto refresh;
		}
		return (RDMA_FAILED);
	}
	mutex_exit(&qp->cb_lock);
	/*
	 * Set the private data area to qp to be used in callbacks
	 */
	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
	return (RDMA_SUCCESS);
}
1788 
1789 rdma_stat
1790 rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca)
1791 {
1792         struct sockaddr_in      *sin4;
1793         struct sockaddr_in6     *sin6;
1794         ibt_path_attr_t         path_attr;
1795         ibt_path_info_t         path;
1796         ibt_status_t            ibt_status;
1797 
1798         ASSERT(raddr->buf != NULL);
1799 
1800         bzero(&path_attr, sizeof (ibt_path_attr_t));
1801         bzero(&path, sizeof (ibt_path_info_t));
1802 
1803         /*
1804          * Conctruct svc name
1805          */
1806         path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
1807         switch (addr_type) {
1808         case AF_INET:
1809                 sin4 = (struct sockaddr_in *)raddr->buf;
1810                 (void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
1811                     IB_SVC_NAME_LEN);
1812                 break;
1813 
1814         case AF_INET6:
1815                 sin6 = (struct sockaddr_in6 *)raddr->buf;
1816                 (void) inet_ntop(AF_INET6, &sin6->sin6_addr,
1817                     path_attr.pa_sname, IB_SVC_NAME_LEN);
1818                 break;
1819 
1820         default:
1821 #ifdef  DEBUG
1822             if (rib_debug) {
1823                 cmn_err(CE_WARN, "rib_ping_srv: Address not recognized\n");
1824             }
1825 #endif
1826                 kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
1827                 return (RDMA_INVAL);
1828         }
1829         (void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);
1830 
1831         ibt_status = ibt_get_paths(rib_stat->ibt_clnt_hdl,
1832                 IBT_PATH_NO_FLAGS, &path_attr, 1, &path, NULL);
1833         kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
1834         if (ibt_status != IBT_SUCCESS) {
1835             if (rib_debug > 1) {
1836                 cmn_err(CE_WARN, "rib_ping_srv: ibt_get_paths FAILED!"
1837                         " status=%d\n", ibt_status);
1838             }
1839         } else if (path.pi_hca_guid) {
1840                 ASSERT(path.pi_hca_guid == rib_stat->hca->hca_guid);
1841                 *hca = rib_stat->hca;
1842                 return (RDMA_SUCCESS);
1843         }
1844         return (RDMA_FAILED);
1845 }
1846 
/*
 * Close channel, remove from connection list and
 * free up resources allocated for that channel.
 *
 * Caller guarantees c_ref == 0 and the connection is in
 * C_DISCONN_PEND state.  A non-NULL conn_list means the channel was
 * fully established: it is unlinked from the list and closed with
 * ibt_close_rc_channel().  A NULL conn_list means the channel never
 * came up, so only ibt_flush_channel() is used to flush outstanding
 * WRs.  All qp/conn synchronization objects and buffers are then
 * destroyed and the qp (which embeds the CONN) is freed.
 *
 * Always returns RDMA_SUCCESS.
 */
rdma_stat
rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
{
	rib_qp_t	*qp = ctoqp(conn);
	rib_hca_t	*hca;

	/*
	 * c_ref == 0 and connection is in C_DISCONN_PEND
	 */
	hca = qp->hca;
	if (conn_list != NULL)
		(void) rib_rm_conn(conn, conn_list);
	if (qp->qp_hdl != NULL) {
		/*
		 * If the channel has not been establised,
		 * ibt_flush_channel is called to flush outstanding WRs
		 * on the Qs.  Otherwise, ibt_close_rc_channel() is
		 * called.  The channel is then freed.
		 */
		if (conn_list != NULL)
		    (void) ibt_close_rc_channel(qp->qp_hdl,
			IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
		else
		    (void) ibt_flush_channel(qp->qp_hdl);

		/*
		 * Wait until every posted receive buffer has been
		 * returned (flushed) before freeing the channel.
		 */
		mutex_enter(&qp->posted_rbufs_lock);
		while (qp->n_posted_rbufs)
			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
		mutex_exit(&qp->posted_rbufs_lock);
		(void) ibt_free_channel(qp->qp_hdl);
		qp->qp_hdl = NULL;
	}
	ASSERT(qp->rdlist == NULL);
	if (qp->replylist != NULL) {
		(void) rib_rem_replylist(qp);
	}

	/* Destroy all qp synchronization objects */
	cv_destroy(&qp->cb_conn_cv);
	cv_destroy(&qp->posted_rbufs_cv);
	mutex_destroy(&qp->cb_lock);

	mutex_destroy(&qp->replylist_lock);
	mutex_destroy(&qp->posted_rbufs_lock);
	mutex_destroy(&qp->rdlist_lock);

	cv_destroy(&conn->c_cv);
	mutex_destroy(&conn->c_lock);

	/* Free the address buffers, then the qp (which contains conn) */
	if (conn->c_raddr.buf != NULL) {
		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
	}
	if (conn->c_laddr.buf != NULL) {
		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
	}
	kmem_free(qp, sizeof (rib_qp_t));

	/*
	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
	 * then the hca is no longer being used.
	 */
	if (conn_list != NULL) {
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
			if (hca->srv_conn_list.conn_hd == NULL) {
				rw_enter(&hca->cl_conn_list.conn_lock,
					RW_READER);
				if (hca->cl_conn_list.conn_hd == NULL) {
					/* Wake the thread waiting on cb_cv */
					mutex_enter(&hca->inuse_lock);
					hca->inuse = FALSE;
					cv_signal(&hca->cb_cv);
					mutex_exit(&hca->inuse_lock);
				}
				rw_exit(&hca->cl_conn_list.conn_lock);
			}
			rw_exit(&hca->srv_conn_list.conn_lock);
		}
		rw_exit(&hca->state_lock);
	}
	return (RDMA_SUCCESS);
}
1932 






















/*
 * Wait for send completion notification. Only on receiving a
 * notification be it a successful or error completion, free the
 * send_wid.
 *
 * Server-side QPs wait uninterruptibly (cv_timedwait); client-side
 * QPs wait interruptibly (cv_timedwait_sig) so a signal breaks the
 * wait with RDMA_INTR.  On timeout/interrupt, wd->cv_sig is cleared
 * so the completion handler will not signal a departed waiter, and
 * wd is left for the handler to free.  If the completion has already
 * arrived (wd->status != SEND_WAIT) the send buffers and wd are
 * freed here.
 *
 * Returns RDMA_SUCCESS, RDMA_TIMEDOUT, RDMA_INTR, RDMA_CONNLOST or
 * RDMA_FAILED.
 */
static rdma_stat
rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
{
	clock_t timout, cv_wait_ret;
	rdma_stat error = RDMA_SUCCESS;
	int	i;

	/*
	 * Wait for send to complete
	 */
	ASSERT(wd != NULL);
	mutex_enter(&wd->sendwait_lock);
	if (wd->status == (uint_t)SEND_WAIT) {
		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
		    ddi_get_lbolt();
		if (qp->mode == RIB_SERVER) {
			/* loop to guard against spurious wakeups */
			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
				    &wd->sendwait_lock, timout)) > 0 &&
			    wd->status == (uint_t)SEND_WAIT)
				;
			switch (cv_wait_ret) {
			case -1:	/* timeout */
#ifdef DEBUG
				if (rib_debug > 2)
					cmn_err(CE_WARN, "rib_sendwait: "
					    "timed out qp %p\n", (void *)qp);
#endif
				wd->cv_sig = 0;		/* no signal needed */
				error = RDMA_TIMEDOUT;
				break;
			default:	/* got send completion */
				break;
			}
		} else {
			/* client side: interruptible wait */
			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
				    &wd->sendwait_lock, timout)) > 0 &&
			    wd->status == (uint_t)SEND_WAIT)
				;
			switch (cv_wait_ret) {
			case -1:	/* timeout */
#ifdef DEBUG
				if (rib_debug > 2)
					cmn_err(CE_WARN, "rib_sendwait: "
					    "timed out qp %p\n", (void *)qp);
#endif
				wd->cv_sig = 0;		/* no signal needed */
				error = RDMA_TIMEDOUT;
				break;
			case 0:		/* interrupted */
#ifdef DEBUG
				if (rib_debug > 2)
					cmn_err(CE_NOTE, "rib_sendwait:"
					    " interrupted on qp %p\n",
					    (void *)qp);
#endif
				wd->cv_sig = 0;		/* no signal needed */
				error = RDMA_INTR;
				break;
			default:	/* got send completion */
				break;
			}
		}
	}

	if (wd->status != (uint_t)SEND_WAIT) {
		/* got send completion */
		if (wd->status != RDMA_SUCCESS) {
		    error = wd->status;
		    /* any error other than connection loss maps to FAILED */
		    if (wd->status != RDMA_CONNLOST)
			error = RDMA_FAILED;
		}
		/* completion arrived: free send buffers and descriptor */
		for (i = 0; i < wd->nsbufs; i++) {
			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
				(void *)(uintptr_t)wd->sbufaddr[i]);
		}
		mutex_exit(&wd->sendwait_lock);
		(void) rib_free_sendwait(wd);
	} else {
		mutex_exit(&wd->sendwait_lock);
	}

	return (error);
}
2021 
2022 static struct send_wid *
2023 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2024 {
2025         struct send_wid *wd;
2026 
2027         wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2028         wd->xid = xid;
2029         wd->cv_sig = cv_sig;
2030         wd->qp = qp;
2031         cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2032         mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2033         wd->status = (uint_t)SEND_WAIT;
2034 
2035         return (wd);
2036 }
2037 
2038 static int
2039 rib_free_sendwait(struct send_wid *wdesc)
2040 {
2041         cv_destroy(&wdesc->wait_cv);
2042         mutex_destroy(&wdesc->sendwait_lock);
2043         kmem_free(wdesc, sizeof (*wdesc));
2044 
2045         return (0);
2046 }
2047 
2048 static rdma_stat
2049 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2050 {
2051         mutex_enter(&qp->replylist_lock);
2052         if (rep != NULL) {
2053             (void) rib_remreply(qp, rep);
2054             mutex_exit(&qp->replylist_lock);
2055             return (RDMA_SUCCESS);
2056         }
2057         mutex_exit(&qp->replylist_lock);
2058         return (RDMA_FAILED);
2059 }
2060 
/*
 * Send buffers are freed here only in case of error in posting
 * on QP. If the post succeeded, the send buffers are freed upon
 * send completion in rib_sendwait() or in the scq_handler.
 *
 * Builds a gather list from 'cl' (at most DSEG_MAX segments) and
 * posts one SEND WR on conn's QP.  send_sig requests a signaled
 * completion; cv_sig additionally blocks this thread in
 * rib_sendwait() until the completion arrives.
 *
 * Returns RDMA_SUCCESS, RDMA_FAILED, or whatever rib_sendwait()
 * reports (RDMA_TIMEDOUT/RDMA_INTR/RDMA_CONNLOST/...).
 */
rdma_stat
rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
	int send_sig, int cv_sig)
{
	struct send_wid	*wdesc;
	struct clist	*clp;
	ibt_status_t	ibt_status = IBT_SUCCESS;
	rdma_stat	ret = RDMA_SUCCESS;
	ibt_send_wr_t	tx_wr;
	int		i, nds;
	ibt_wr_ds_t	sgl[DSEG_MAX];
	uint_t		total_msg_size;
	rib_qp_t	*qp = ctoqp(conn);

	ASSERT(cl != NULL);

	bzero(&tx_wr, sizeof (ibt_send_wr_t));

	/* Translate the clist chain into an SGL */
	nds = 0;
	/* total_msg_size is accumulated but not otherwise used here */
	total_msg_size = 0;
	clp = cl;
	while (clp != NULL) {
		if (nds >= DSEG_MAX) {
			cmn_err(CE_WARN, "rib_send_and_wait: DSEG_MAX"
			    " too small!");
			return (RDMA_FAILED);
		}
		sgl[nds].ds_va = clp->c_saddr;
		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
		sgl[nds].ds_len = clp->c_len;
		total_msg_size += clp->c_len;
		clp = clp->c_next;
		nds++;
	}

	if (send_sig) {
		/* Set SEND_SIGNAL flag. */
		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
	} else {
		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
		wdesc = rib_init_sendwait(msgid, 0, qp);
	}
	/* Record buffer addresses so the completion path can free them */
	wdesc->nsbufs = nds;
	for (i = 0; i < nds; i++) {
		wdesc->sbufaddr[i] = sgl[i].ds_va;
	}

	tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
	tx_wr.wr_opcode = IBT_WRC_SEND;
	tx_wr.wr_trans = IBT_RC_SRV;
	tx_wr.wr_nds = nds;
	tx_wr.wr_sgl = sgl;

	/* Post only while the connection is still C_CONNECTED */
	mutex_enter(&conn->c_lock);
	if (conn->c_state & C_CONNECTED) {
		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
	}
	if (((conn->c_state & C_CONNECTED) == 0) ||
		ibt_status != IBT_SUCCESS) {
		/* Post failed or never attempted: free buffers here */
		mutex_exit(&conn->c_lock);
		for (i = 0; i < nds; i++) {
			rib_rbuf_free(conn, SEND_BUFFER,
				(void *)(uintptr_t)wdesc->sbufaddr[i]);
		}
		(void) rib_free_sendwait(wdesc);
#ifdef DEBUG
		if (rib_debug && ibt_status != IBT_SUCCESS)
			cmn_err(CE_WARN, "rib_send_and_wait: ibt_post_send "
				"failed! wr_id %llx on qpn %p, status=%d!",
				(longlong_t)tx_wr.wr_id, (void *)qp,
				ibt_status);
#endif
		return (RDMA_FAILED);
	}
	mutex_exit(&conn->c_lock);

	if (send_sig) {
	    if (cv_sig) {
		/*
		 * cv_wait for send to complete.
		 * We can fail due to a timeout or signal or
		 * unsuccessful send.
		 */
		ret = rib_sendwait(qp, wdesc);
#ifdef DEBUG
	    if (rib_debug > 2)
		if (ret != 0) {
		    cmn_err(CE_WARN, "rib_send_and_wait: rib_sendwait "
			"FAILED, rdma stat=%d, wr_id %llx, qp %p!",
			ret, (longlong_t)tx_wr.wr_id, (void *)qp);
		}
#endif
		return (ret);
	    }
	}

	return (RDMA_SUCCESS);
}
2165 

2166 rdma_stat




































2167 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2168 {
2169         rdma_stat       ret;








2170 






2171         /* send-wait & cv_signal */
2172         ret = rib_send_and_wait(conn, cl, msgid, 1, 1); 
2173 
2174         return (ret);
2175 }
2176  
/*
 * Server interface (svc_rdma_ksend).
 * Send RPC reply and wait for RDMA_DONE.
 *
 * An rdma_done_list entry for msgid is registered BEFORE the reply is
 * sent so the client's RDMA_DONE cannot be missed.  qp->rdlist_lock
 * is held across the send and the subsequent cv_timedwait() on the
 * entry's rdma_done_cv (the cv drops it while sleeping).
 *
 * Returns RDMA_SUCCESS, RDMA_TIMEDOUT (no RDMA_DONE within
 * REPLY_WAIT_TIME), or the failure status from rib_send_and_wait().
 */
rdma_stat
rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
{
	rdma_stat ret = RDMA_SUCCESS;
	struct rdma_done_list *rd;
	clock_t timout, cv_wait_ret;
	rib_qp_t *qp = ctoqp(conn);

	mutex_enter(&qp->rdlist_lock);
	rd = rdma_done_add(qp, msgid);

	/* No cv_signal (whether send-wait or no-send-wait) */
	ret = rib_send_and_wait(conn, cl, msgid, 1, 0);
	if (ret != RDMA_SUCCESS) {
#ifdef DEBUG
	    cmn_err(CE_WARN, "rib_send_resp: send_and_wait "
		"failed, msgid %u, qp %p", msgid, (void *)qp);
#endif
	    rdma_done_rm(qp, rd);
	    goto done;
	}

	/*
	 * Wait for RDMA_DONE from remote end
	 */
	timout = drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
	cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock,
	    timout);
	rdma_done_rm(qp, rd);
	if (cv_wait_ret < 0) {
		/* cv_timedwait() returned -1: timed out */
#ifdef DEBUG
		if (rib_debug > 1) {
			cmn_err(CE_WARN, "rib_send_resp: RDMA_DONE not"
			    " recv'd for qp %p, xid:%u\n",
			    (void *)qp, msgid);
		}
#endif
		ret = RDMA_TIMEDOUT;
		goto done;
	}

done:
	mutex_exit(&qp->rdlist_lock);
	return (ret);
}
2226 
2227 static struct recv_wid *
2228 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2229 {
2230         struct recv_wid *rwid;
2231 
2232         rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2233         rwid->xid = msgid;
2234         rwid->addr = sgl->ds_va;
2235         rwid->qp = qp;
2236 
2237         return (rwid);
2238 }
2239 
/*
 * Free a receive work descriptor allocated by rib_create_wid().
 */
static void
rib_free_wid(struct recv_wid *rwid)
{
	kmem_free(rwid, sizeof (struct recv_wid));
}
2245 
2246 rdma_stat
2247 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2248 {
2249         rib_qp_t        *qp = ctoqp(conn);
2250         struct clist    *clp = cl;
2251         struct reply    *rep;
2252         struct recv_wid *rwid;
2253         int             nds;
2254         ibt_wr_ds_t     sgl[DSEG_MAX];
2255         ibt_recv_wr_t   recv_wr;
2256         rdma_stat       ret;
2257         ibt_status_t    ibt_status;
2258 
2259         /*
2260          * rdma_clnt_postrecv uses RECV_BUFFER.
2261          */
2262 
2263         nds = 0;
2264         while (cl != NULL) {
2265                 if (nds >= DSEG_MAX) {
2266                     cmn_err(CE_WARN, "rib_clnt_post: DSEG_MAX too small!");
2267                     ret = RDMA_FAILED;
2268                     goto done;
2269                 }
2270                 sgl[nds].ds_va = cl->c_saddr;
2271                 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2272                 sgl[nds].ds_len = cl->c_len;
2273                 cl = cl->c_next;
2274                 nds++;
2275         }
2276 
2277         if (nds != 1) {
2278             cmn_err(CE_WARN, "rib_clnt_post: nds!=1\n");
2279             ret = RDMA_FAILED;
2280             goto done;
2281         }
2282         bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2283         recv_wr.wr_nds = nds;
2284         recv_wr.wr_sgl = sgl;
2285 
2286         rwid = rib_create_wid(qp, &sgl[0], msgid);
2287         if (rwid) {
2288             recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2289         } else {
2290                 cmn_err(CE_WARN, "rib_clnt_post: out of memory");
2291                 ret = RDMA_NORESOURCE;
2292                 goto done;
2293         }
2294         rep = rib_addreplylist(qp, msgid);
2295         if (!rep) {
2296                 cmn_err(CE_WARN, "rib_clnt_post: out of memory");
2297                 rib_free_wid(rwid);
2298                 ret = RDMA_NORESOURCE;
2299                 goto done;
2300         }
2301 
2302         mutex_enter(&conn->c_lock);
2303         if (conn->c_state & C_CONNECTED) {
2304                 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2305         }
2306         if (((conn->c_state & C_CONNECTED) == 0) ||
2307                 ibt_status != IBT_SUCCESS) {
2308                 mutex_exit(&conn->c_lock);
2309 #ifdef DEBUG
2310                 cmn_err(CE_WARN, "rib_clnt_post: QPN %p failed in "
2311                     "ibt_post_recv(), msgid=%d, status=%d",
2312                     (void *)qp,  msgid, ibt_status);
2313 #endif
2314                 rib_free_wid(rwid);
2315                 (void) rib_rem_rep(qp, rep);
2316                 ret = RDMA_FAILED;
2317                 goto done;
2318         }
2319         mutex_exit(&conn->c_lock);
2320         return (RDMA_SUCCESS);
2321 
2322 done:
2323         while (clp != NULL) {
2324             rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)clp->c_saddr);
2325             clp = clp->c_next;
2326         }
2327         return (ret);
2328 }
2329 
2330 rdma_stat
2331 rib_svc_post(CONN* conn, struct clist *cl)
2332 {
2333         rib_qp_t        *qp = ctoqp(conn);
2334         struct svc_recv *s_recvp;
2335         int             nds;
2336         ibt_wr_ds_t     sgl[DSEG_MAX];
2337         ibt_recv_wr_t   recv_wr;
2338         ibt_status_t    ibt_status;
2339 
2340         nds = 0;
2341         while (cl != NULL) {
2342                 if (nds >= DSEG_MAX) {
2343                     cmn_err(CE_WARN, "rib_svc_post: DSEG_MAX too small!");
2344                     return (RDMA_FAILED);
2345                 }
2346                 sgl[nds].ds_va = cl->c_saddr;
2347                 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2348                 sgl[nds].ds_len = cl->c_len;
2349                 cl = cl->c_next;
2350                 nds++;
2351         }
2352 
2353         if (nds != 1) {
2354             cmn_err(CE_WARN, "rib_svc_post: nds!=1\n");
2355             rib_rbuf_free(conn, RECV_BUFFER, (caddr_t)(uintptr_t)sgl[0].ds_va);
2356             return (RDMA_FAILED);
2357         }
2358         bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2359         recv_wr.wr_nds = nds;
2360         recv_wr.wr_sgl = sgl;
2361 
2362         s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2363         /* Use s_recvp's addr as wr id */
2364         recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2365         mutex_enter(&conn->c_lock);
2366         if (conn->c_state & C_CONNECTED) {
2367                 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2368         }
2369         if (((conn->c_state & C_CONNECTED) == 0) ||
2370                 ibt_status != IBT_SUCCESS) {
2371                 mutex_exit(&conn->c_lock);
2372 #ifdef DEBUG
2373                 cmn_err(CE_WARN, "rib_svc_post: QP %p failed in "
2374                     "ibt_post_recv(), status=%d",
2375                     (void *)qp, ibt_status);
2376 #endif
2377                 rib_rbuf_free(conn, RECV_BUFFER,
2378                         (caddr_t)(uintptr_t)sgl[0].ds_va);
2379                 (void) rib_free_svc_recv(s_recvp);
2380                 return (RDMA_FAILED);
2381         }
2382         mutex_exit(&conn->c_lock);
2383 
2384         return (RDMA_SUCCESS);
2385 }
2386 
/* Client */
/*
 * rib_post_resp()
 *    Post a client-side RECV buffer intended to catch the RPC reply
 *    for 'msgid' (later collected by rib_recv()).  Thin wrapper that
 *    delegates directly to rib_clnt_post().
 */
rdma_stat
rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
{

        return (rib_clnt_post(conn, cl, msgid));
}
2394 
2395 /* Server */
2396 rdma_stat
2397 rib_post_recv(CONN *conn, struct clist *cl)
2398 {
2399         rib_qp_t        *qp = ctoqp(conn);
2400 
2401         if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2402                 mutex_enter(&qp->posted_rbufs_lock);
2403                 qp->n_posted_rbufs++;
2404                 mutex_exit(&qp->posted_rbufs_lock);
2405                 return (RDMA_SUCCESS);
2406         }
2407         return (RDMA_FAILED);
2408 }
2409 
/*
 * Client side only interface to "recv" the rpc reply buf
 * posted earlier by rib_post_resp(conn, cl, msgid).
 *
 * Looks up the reply entry for 'msgid' on the qp's replylist, waits
 * (interruptibly, bounded by REPLY_WAIT_TIME seconds) for the reply
 * to arrive, and on success hands the received buffer back to the
 * caller via *clp.  Returns RDMA_SUCCESS, RDMA_TIMEDOUT, RDMA_INTR,
 * RDMA_INVAL (no matching entry), or the error status carried in the
 * reply entry.
 */
rdma_stat
rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
{
        struct reply *rep = NULL;
        clock_t timout, cv_wait_ret;
        rdma_stat ret = RDMA_SUCCESS;
        rib_qp_t *qp = ctoqp(conn);

        /*
         * Find the reply structure for this msgid
         */
        mutex_enter(&qp->replylist_lock);

        for (rep = qp->replylist; rep != NULL; rep = rep->next) {
            if (rep->xid == msgid)
                break;
        }
        if (rep != NULL) {
                /*
                 * If message not yet received, wait.
                 * Loop guards against spurious wakeups: keep waiting
                 * while time remains and the entry is still REPLY_WAIT.
                 */
                if (rep->status == (uint_t)REPLY_WAIT) {
                        timout = ddi_get_lbolt() +
                            drv_usectohz(REPLY_WAIT_TIME * 1000000);
                        while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
                                    &qp->replylist_lock, timout)) > 0 &&
                            rep->status == (uint_t)REPLY_WAIT);

                        switch (cv_wait_ret) {
                        case -1:        /* timeout */
                                ret = RDMA_TIMEDOUT;
                                break;
                        case 0:
                                /* interrupted by a signal */
                                ret = RDMA_INTR;
                                break;
                        default:
                                break;
                        }
                }

                if (rep->status == RDMA_SUCCESS) {
                        struct clist *cl = NULL;

                        /*
                         * Got message successfully
                         */
                        clist_add(&cl, 0, rep->bytes_xfer, NULL,
                            (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
                        *clp = cl;
                } else {
                        if (rep->status != (uint_t)REPLY_WAIT) {
                                /*
                                 * Got error in reply message. Free
                                 * recv buffer here.
                                 */
                                ret = rep->status;
                                rib_rbuf_free(conn, RECV_BUFFER,
                                        (caddr_t)(uintptr_t)rep->vaddr_cq);
                        }
                        /*
                         * NOTE(review): if status is still REPLY_WAIT
                         * (timeout/interrupt), the recv buffer is NOT
                         * freed here — presumably reclaimed by the CQ
                         * handler when the reply eventually arrives;
                         * confirm against the completion path.
                         */
                }
                /* Entry consumed either way: unlink and free it. */
                (void) rib_remreply(qp, rep);
        } else {
                /*
                 * No matching reply structure found for given msgid on the
                 * reply wait list.
                 */
                ret = RDMA_INVAL;
#ifdef DEBUG
                cmn_err(CE_WARN, "rib_recv: no matching reply for "
                    "xid %u, qp %p\n", msgid, (void *)qp);
#endif
        }

        /*
         * Done.
         */
        mutex_exit(&qp->replylist_lock);
        return (ret);
}
2493 
2494 /*
2495  * RDMA write a buffer to the remote address.
2496  */
2497 rdma_stat
2498 rib_write(CONN *conn, struct clist *cl, int wait)
2499 {
2500         ibt_send_wr_t   tx_wr;
2501         int             nds; 
2502         int             cv_sig;
2503         ibt_wr_ds_t     sgl[DSEG_MAX];
2504         struct send_wid *wdesc;
2505         ibt_status_t    ibt_status;
2506         rdma_stat       ret = RDMA_SUCCESS;
2507         rib_qp_t        *qp = ctoqp(conn);
2508 
2509         if (cl == NULL) {
2510                 cmn_err(CE_WARN, "rib_write: NULL clist\n");
2511                 return (RDMA_FAILED);
2512         }
2513 



2514         bzero(&tx_wr, sizeof (ibt_send_wr_t));
2515         /* 
2516          * Remote address is at the head chunk item in list. 
2517          */ 
2518         tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_daddr;
2519         tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */



2520 
2521         nds = 0; 
2522         while (cl != NULL) { 
2523                 if (nds >= DSEG_MAX) { 
2524                         cmn_err(CE_WARN, "rib_write: DSEG_MAX too small!"); 
2525                         return (RDMA_FAILED); 
2526                 } 
2527                 sgl[nds].ds_va = cl->c_saddr; 
2528                 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 
2529                 sgl[nds].ds_len = cl->c_len; 
2530                 cl = cl->c_next; 
2531                 nds++; 
2532         } 
2533  
2534         if (wait) {
2535                 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2536                 cv_sig = 1;
2537         } else {
2538                 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2539                 cv_sig = 0;
2540         }
2541 
2542         wdesc = rib_init_sendwait(0, cv_sig, qp);
2543         tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2544         tx_wr.wr_opcode = IBT_WRC_RDMAW;
2545         tx_wr.wr_trans = IBT_RC_SRV;
2546         tx_wr.wr_nds = nds; 
2547         tx_wr.wr_sgl = sgl;
2548 
2549         mutex_enter(&conn->c_lock);
2550         if (conn->c_state & C_CONNECTED) {
2551                 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2552         }
2553         if (((conn->c_state & C_CONNECTED) == 0) ||
2554                 ibt_status != IBT_SUCCESS) {
2555                 mutex_exit(&conn->c_lock);
2556                 (void) rib_free_sendwait(wdesc);
2557                 return (RDMA_FAILED);
2558         }
2559         mutex_exit(&conn->c_lock);
2560 
2561         /*
2562          * Wait for send to complete
2563          */
2564         if (wait) {
2565                 ret = rib_sendwait(qp, wdesc);
2566                 if (ret != 0) {
2567                         return (ret);
2568                 }
2569         }



2570         return (RDMA_SUCCESS);
2571 }
2572 
2573 /*
2574  * RDMA Read a buffer from the remote address.
2575  */
2576 rdma_stat
2577 rib_read(CONN *conn, struct clist *cl, int wait)
2578 {
2579         ibt_send_wr_t   rx_wr;
2580         int             nds;
2581         int             cv_sig;
2582         ibt_wr_ds_t     sgl[DSEG_MAX];  /* is 2 sufficient? */
2583         struct send_wid *wdesc;
2584         ibt_status_t    ibt_status = IBT_SUCCESS;
2585         rdma_stat       ret = RDMA_SUCCESS;
2586         rib_qp_t        *qp = ctoqp(conn);
2587 
2588         if (cl == NULL) {
2589                 cmn_err(CE_WARN, "rib_read: NULL clist\n");
2590                 return (RDMA_FAILED);
2591         }
2592 
2593         bzero(&rx_wr, sizeof (ibt_send_wr_t));
2594         /*
2595          * Remote address is at the head chunk item in list.
2596          */
2597         rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_saddr;
2598         rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; /* rkey */
2599 
2600         nds = 0;
2601         while (cl != NULL) {
2602                 if (nds >= DSEG_MAX) {
2603                         cmn_err(CE_WARN, "rib_read: DSEG_MAX too small!");
2604                         return (RDMA_FAILED);
2605                 }
2606                 sgl[nds].ds_va = cl->c_daddr;
2607                 sgl[nds].ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2608                 sgl[nds].ds_len = cl->c_len;
2609                 cl = cl->c_next;
2610                 nds++;
2611         }
2612 
2613         if (wait) {
2614                 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2615                 cv_sig = 1;
2616         } else {
2617                 rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2618                 cv_sig = 0;
2619         }
2620 
2621         wdesc = rib_init_sendwait(0, cv_sig, qp);
2622         rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2623         rx_wr.wr_opcode = IBT_WRC_RDMAR;
2624         rx_wr.wr_trans = IBT_RC_SRV;
2625         rx_wr.wr_nds = nds;
2626         rx_wr.wr_sgl = sgl;
2627 
2628         mutex_enter(&conn->c_lock);
2629         if (conn->c_state & C_CONNECTED) {
2630                 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2631         }
2632         if (((conn->c_state & C_CONNECTED) == 0) ||
2633                 ibt_status != IBT_SUCCESS) {
2634                 mutex_exit(&conn->c_lock);
2635 #ifdef DEBUG
2636                 if (rib_debug && ibt_status != IBT_SUCCESS)
2637                         cmn_err(CE_WARN, "rib_read: FAILED post_sending RDMAR"
2638                                 " wr_id %llx on qp %p, status=%d",
2639                                 (longlong_t)rx_wr.wr_id, (void *)qp,
2640                                 ibt_status);
2641 #endif
2642                 (void) rib_free_sendwait(wdesc);
2643                 return (RDMA_FAILED);
2644         }
2645         mutex_exit(&conn->c_lock);
2646 
2647         /*
2648          * Wait for send to complete
2649          */
2650         if (wait) {
2651                 ret = rib_sendwait(qp, wdesc);
2652                 if (ret != 0) {
2653                         return (ret);
2654                 }
2655         }
2656 
2657         return (RDMA_SUCCESS);
2658 }
2659 
2660 int
2661 is_for_ipv4(ibt_ar_t *result)
2662 {
2663         int     i, size = sizeof (struct in_addr);
2664         uint8_t zero = 0;
2665 
2666         for (i = 0; i < (ATS_AR_DATA_LEN - size); i++)
2667                 zero |= result->ar_data[i];
2668         return (zero == 0);
2669 }
2670 
/*
 * rib_srv_cm_handler()
 *    Connection Manager callback to handle RC connection requests.
 *
 *    IBT_CM_EVENT_REQ_RCV: unless the plugin is in NO_ACCEPT state,
 *    delay the CM, create a server channel, pre-post RECV buffers,
 *    record the connection on hca->srv_conn_list, and resolve the
 *    peer's IP address (IPv4 or IPv6) from the ATS address records
 *    into conn->c_raddr; ACCEPT on success.
 *    IBT_CM_EVENT_CONN_CLOSED: tear down the channel when the remote
 *    end closed it.  IBT_CM_EVENT_CONN_EST: informational only.
 *    All other CM event types are REJECTed back to the CM.
 */
/* ARGSUSED */
static ibt_cm_status_t
rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
	ibt_cm_return_args_t *ret_args, void *priv_data,
	ibt_priv_data_len_t len)
{
	queue_t		*q;
	rib_qp_t	*qp;
	rpcib_state_t	*ribstat;
	rib_hca_t	*hca;
	rdma_stat	status = RDMA_SUCCESS;
	int		i;
	struct clist	cl;
	rdma_buf_t	rdbuf;
	void		*buf = NULL;
	ibt_cm_req_rcv_t	cm_req_rcv;
	CONN		*conn;
	ibt_status_t ibt_status;
	ibt_ar_t	ar_query, ar_result;
	ib_gid_t	sgid;


	ASSERT(any != NULL);
	ASSERT(event != NULL);

	ribstat = (rpcib_state_t *)any;
	hca = (rib_hca_t *)ribstat->hca;
	ASSERT(hca != NULL);

	/* got a connection request */
	switch (event->cm_type) {
	case IBT_CM_EVENT_REQ_RCV:
		/*
		 * If the plugin is in the NO_ACCEPT state, bail out.
		 */
		mutex_enter(&plugin_state_lock);
		if (plugin_state == NO_ACCEPT) {
			mutex_exit(&plugin_state_lock);
			return (IBT_CM_REJECT);
		}
		mutex_exit(&plugin_state_lock);

		/*
		 * Need to send a MRA MAD to CM so that it does not
		 * timeout on us.
		 */
		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
			    event->cm_event.req.req_timeout * 8, NULL, 0);

		mutex_enter(&rib_stat->open_hca_lock);
		q = rib_stat->q;
		mutex_exit(&rib_stat->open_hca_lock);
		/* Create the server-side RC channel on the requested port. */
		status = rib_svc_create_chan(hca, (caddr_t)q,
			event->cm_event.req.req_prim_hca_port, &qp);
		if (status) {
#ifdef DEBUG
			cmn_err(CE_WARN, "rib_srv_cm_handler: "
			    "create_channel failed %d", status);
#endif
			return (IBT_CM_REJECT);
		}
		/* Snapshot the REQ; used later for the ATS lookup. */
		cm_req_rcv = event->cm_event.req;

#ifdef DEBUG
		if (rib_debug > 2) {
		    cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			"server recv'ed IBT_CM_EVENT_REQ_RCV\n");
		    cmn_err(CE_NOTE, "\t\t SID:%llx\n",
				(longlong_t)cm_req_rcv.req_service_id);
		    cmn_err(CE_NOTE, "\t\t Local Port:%d\n",
				cm_req_rcv.req_prim_hca_port);
		    cmn_err(CE_NOTE,
			"\t\t Remote GID:(prefix:%llx,guid:%llx)\n",
			(longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_prefix,
			(longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_guid);
		    cmn_err(CE_NOTE, "\t\t Local GID:(prefix:%llx,guid:%llx)\n",
			(longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_prefix,
			(longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_guid);
		    cmn_err(CE_NOTE, "\t\t Remote QPN:%u\n",
			cm_req_rcv.req_remote_qpn);
		    cmn_err(CE_NOTE, "\t\t Remote Q_Key:%x\n",
			cm_req_rcv.req_remote_qkey);
		    cmn_err(CE_NOTE, "\t\t Local QP %p (qp_hdl=%p)\n",
			(void *)qp, (void *)qp->qp_hdl);
		}

		if (rib_debug > 2) {
		    ibt_rc_chan_query_attr_t	chan_attrs;

		    if (ibt_query_rc_channel(qp->qp_hdl, &chan_attrs)
			== IBT_SUCCESS) {
			cmn_err(CE_NOTE, "rib_svc_cm_handler: qp %p in "
			    "CEP state %d\n", (void *)qp, chan_attrs.rc_state);
		    }
		}
#endif

		/* Fill in the REP arguments handed back to the CM. */
		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
		ret_args->cm_ret.rep.cm_rdma_ra_out = 1;
		ret_args->cm_ret.rep.cm_rdma_ra_in = 1;
		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;

		/*
		 * Pre-posts RECV buffers
		 */
		conn = qptoc(qp);
		for (i = 0; i < preposted_rbufs; i++) {
		    bzero(&rdbuf, sizeof (rdbuf));
		    rdbuf.type = RECV_BUFFER;
		    buf = rib_rbuf_alloc(conn, &rdbuf);
		    if (buf == NULL) {
			cmn_err(CE_WARN, "rib_svc_cm_handler: "
			    "No RECV_BUFFER buf!\n");
			(void) rib_disconnect_channel(conn, NULL);
			return (IBT_CM_REJECT);
		    }

		    bzero(&cl, sizeof (cl));
		    cl.c_saddr = (uintptr_t)rdbuf.addr;
		    cl.c_len = rdbuf.len;
		    cl.c_smemhandle.mrc_lmr = rdbuf.handle.mrc_lmr; /* lkey */
		    cl.c_next = NULL;
		    status = rib_post_recv(conn, &cl);
		    if (status != RDMA_SUCCESS) {
			cmn_err(CE_WARN, "rib_srv_cm_handler: failed "
			    "posting RPC_REQ buf to qp %p!", (void *)qp);
			(void) rib_disconnect_channel(conn, NULL);
			return (IBT_CM_REJECT);
		    }
		}
		(void) rib_add_connlist(conn, &hca->srv_conn_list);

		/*
		 * Get the address translation service record from ATS
		 */
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
		    rw_exit(&hca->state_lock);
		    return (IBT_CM_REJECT);
		}
		rw_exit(&hca->state_lock);

		/*
		 * Try each HCA port until one yields an address record
		 * for the peer's GID/pkey.
		 */
		for (i = 0; i < hca->hca_nports; i++) {
		    ibt_status = ibt_get_port_state(hca->hca_hdl, i+1,
					&sgid, NULL);
		    if (ibt_status != IBT_SUCCESS) {
			if (rib_debug) {
			    cmn_err(CE_WARN, "rib_srv_cm_handler: "
				"ibt_get_port_state FAILED!"
				"status = %d\n", ibt_status);
			}
		    } else {
			/*
			 * do ibt_query_ar()
			 */
			bzero(&ar_query, sizeof (ar_query));
			bzero(&ar_result, sizeof (ar_result));
			ar_query.ar_gid = cm_req_rcv.req_prim_addr.av_dgid;
			ar_query.ar_pkey = event->cm_event.req.req_pkey;
			ibt_status = ibt_query_ar(&sgid, &ar_query,
							&ar_result);
			if (ibt_status != IBT_SUCCESS) {
			    if (rib_debug) {
				cmn_err(CE_WARN, "rib_srv_cm_handler: "
				    "ibt_query_ar FAILED!"
				    "status = %d\n", ibt_status);
			    }
			} else {
			    conn = qptoc(qp);

			    /* Decode peer address: IPv4 or IPv6. */
			    if (is_for_ipv4(&ar_result)) {
				struct sockaddr_in *s;
				int sin_size = sizeof (struct sockaddr_in);
				int in_size = sizeof (struct in_addr);
				uint8_t *start_pos;

				conn->c_raddr.maxlen =
					conn->c_raddr.len = sin_size;
				conn->c_raddr.buf = kmem_zalloc(sin_size,
						KM_SLEEP);
				s = (struct sockaddr_in *)conn->c_raddr.buf;
				s->sin_family = AF_INET;
				/*
				 * For IPv4,  the IP addr is stored in
				 * the last four bytes of ar_data.
				 */
				start_pos = ar_result.ar_data +
					ATS_AR_DATA_LEN - in_size;
				bcopy(start_pos, &s->sin_addr, in_size);
				if (rib_debug > 1) {
				    char print_addr[INET_ADDRSTRLEN];

				    bzero(print_addr, INET_ADDRSTRLEN);
				    (void) inet_ntop(AF_INET, &s->sin_addr,
						print_addr, INET_ADDRSTRLEN);
				    cmn_err(CE_NOTE, "rib_srv_cm_handler: "
					"remote clnt_addr: %s\n", print_addr);
				}
			    } else {
				struct sockaddr_in6 *s6;
				int sin6_size = sizeof (struct sockaddr_in6);

				conn->c_raddr.maxlen =
					conn->c_raddr.len = sin6_size;
				conn->c_raddr.buf = kmem_zalloc(sin6_size,
					KM_SLEEP);

				s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
				s6->sin6_family = AF_INET6;
				/* sin6_addr is stored in ar_data */
				bcopy(ar_result.ar_data, &s6->sin6_addr,
					sizeof (struct in6_addr));
				if (rib_debug > 1) {
				    char print_addr[INET6_ADDRSTRLEN];

				    bzero(print_addr, INET6_ADDRSTRLEN);
				    (void) inet_ntop(AF_INET6, &s6->sin6_addr,
						print_addr, INET6_ADDRSTRLEN);
				    cmn_err(CE_NOTE, "rib_srv_cm_handler: "
					"remote clnt_addr: %s\n", print_addr);
				}
			    }
			    return (IBT_CM_ACCEPT);
			}
		    }
		}
		/*
		 * NOTE(review): if the AR lookup fails on every port we
		 * fall through this break and still return IBT_CM_ACCEPT
		 * below, with conn->c_raddr left unset — presumably
		 * intentional (accept without a resolved peer address);
		 * confirm.
		 */
		if (rib_debug > 1) {
		    cmn_err(CE_WARN, "rib_srv_cm_handler: "
				"address record query failed!");
		}
		break;

	case IBT_CM_EVENT_CONN_CLOSED:
	{
		/* These shadow the outer 'conn'/'qp' declarations. */
		CONN		*conn;
		rib_qp_t	*qp;

		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * Reason for CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases were
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to error
			 * state
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			if (conn->c_state == C_DISCONN_PEND) {
				mutex_exit(&conn->c_lock);
				break;
			}
			conn->c_state = C_ERROR;

			/*
			 * Free the rc_channel. Channel has already
			 * transitioned to ERROR state and WRs have been
			 * FLUSHED_ERR already.
			 */
			(void) ibt_free_channel(qp->qp_hdl);
			qp->qp_hdl = NULL;

			/*
			 * Free the conn if c_ref goes down to 0
			 */
			if (conn->c_ref == 0) {
				/*
				 * Remove from list and free conn
				 */
				conn->c_state = C_DISCONN_PEND;
				mutex_exit(&conn->c_lock);
				(void) rib_disconnect_channel(conn,
					&hca->srv_conn_list);
			} else {
				mutex_exit(&conn->c_lock);
			}
#ifdef DEBUG
			if (rib_debug)
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
					" (CONN_CLOSED) channel disconnected");
#endif
			break;
		}
		break;
	}
	case IBT_CM_EVENT_CONN_EST:
	/*
	 * RTU received, hence connection established.
	 */
		if (rib_debug > 1)
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				"(CONN_EST) channel established");
		break;

	default:
	    if (rib_debug > 2) {
		/* Let CM handle the following events. */
		if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
		} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
		} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
		} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
		} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "server recv'ed IBT_CM_EVENT_FAILURE\n");
		}
	    }
	    return (IBT_CM_REJECT);
	}

	/* accept all other CM messages (i.e. let the CM handle them) */
	return (IBT_CM_ACCEPT);
}
3007 
3008 static rdma_stat
3009 rib_register_ats(rib_hca_t *hca)
3010 {
3011         ibt_hca_portinfo_t      *port_infop;
3012         uint_t                  port_size;
3013         uint_t                  pki, i, num_ports, nbinds;
3014         ibt_status_t            ibt_status;
3015         rib_service_t           *new_service, *temp_srv;
3016         rpcib_ats_t             *atsp;
3017         rpcib_ibd_insts_t       ibds;
3018         ib_pkey_t               pkey;
3019         ibt_ar_t                ar;     /* address record */
3020 
3021         /*
3022          * Query all ports for the given HCA
3023          */
3024         rw_enter(&hca->state_lock, RW_READER);
3025         if (hca->state != HCA_DETACHED) {
3026                 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3027                     &num_ports, &port_size);
3028                 rw_exit(&hca->state_lock);
3029         } else {
3030                 rw_exit(&hca->state_lock);
3031                 return (RDMA_FAILED);
3032         }
3033         if (ibt_status != IBT_SUCCESS) {
3034 #ifdef DEBUG
3035             if (rib_debug) {
3036                 cmn_err(CE_NOTE, "rib_register_ats: FAILED in "
3037                     "ibt_query_hca_ports, status = %d\n", ibt_status);
3038             }
3039 #endif
3040                 return (RDMA_FAILED);
3041         }
3042 
3043 #ifdef  DEBUG
3044         if (rib_debug > 1) {
3045                 cmn_err(CE_NOTE, "rib_register_ats: Ports detected "
3046                     "%d\n", num_ports);
3047 
3048                 for (i = 0; i < num_ports; i++) {
3049                         if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3050                                 cmn_err(CE_WARN, "rib_register_ats "
3051                                     "Port #: %d INACTIVE\n", i+1);
3052                         } else if (port_infop[i].p_linkstate ==
3053                             IBT_PORT_ACTIVE) {
3054                                 cmn_err(CE_NOTE, "rib_register_ats "
3055                                     "Port #: %d ACTIVE\n", i+1);
3056                         }
3057                 }
3058         }
3059 #endif
3060 
3061         ibds.rib_ibd_alloc = N_IBD_INSTANCES;
3062         ibds.rib_ibd_cnt = 0;
3063         ibds.rib_ats = (rpcib_ats_t *)kmem_zalloc(ibds.rib_ibd_alloc *
3064                         sizeof (rpcib_ats_t), KM_SLEEP);
3065         rib_get_ibd_insts(&ibds);
3066 
3067         if (ibds.rib_ibd_cnt == 0) {
3068             kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc *
3069                                 sizeof (rpcib_ats_t));
3070             ibt_free_portinfo(port_infop, port_size);
3071             return (RDMA_FAILED);
3072         }
3073 
3074         /*
3075          * Get the IP addresses of active ports and
3076          * register them with ATS.  IPv4 addresses
3077          * have precedence over IPv6 addresses.
3078          */
3079         if (get_ibd_ipaddr(&ibds) != 0) {
3080 #ifdef  DEBUG
3081             if (rib_debug > 1) {
3082                 cmn_err(CE_WARN, "rib_register_ats: "
3083                     "get_ibd_ipaddr failed");
3084             }
3085 #endif
3086             kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc *
3087                                 sizeof (rpcib_ats_t));
3088             ibt_free_portinfo(port_infop, port_size);
3089             return (RDMA_FAILED);
3090         }
3091 
3092         /*
3093          * Start ATS registration for active ports on this HCA.
3094          */
3095         rw_enter(&hca->service_list_lock, RW_WRITER);
3096         nbinds = 0;
3097         new_service = NULL;
3098         for (i = 0; i < num_ports; i++) {
3099                 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3100                         continue;
3101 
3102             for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3103                 pkey = port_infop[i].p_pkey_tbl[pki];
3104                 if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) {
3105                     ar.ar_gid = port_infop[i].p_sgid_tbl[0];
3106                     ar.ar_pkey = pkey;
3107                     atsp = get_ibd_entry(&ar.ar_gid, pkey, &ibds);
3108                     if (atsp == NULL)
3109                         continue;
3110                 /*
3111                  * store the sin[6]_addr in ar_data
3112                  */
3113                     (void) bzero(ar.ar_data, ATS_AR_DATA_LEN);
3114                     if (atsp->ras_inet_type == AF_INET) {
3115                         uint8_t *start_pos;
3116 
3117                         /*
3118                          * The ipv4 addr goes into the last
3119                          * four bytes of ar_data.
3120                          */
3121                         start_pos = ar.ar_data + ATS_AR_DATA_LEN -
3122                                 sizeof (struct in_addr);
3123                         bcopy(&atsp->ras_sin.sin_addr, start_pos,
3124                                 sizeof (struct in_addr));
3125                     } else if (atsp->ras_inet_type == AF_INET6) {
3126                         bcopy(&atsp->ras_sin6.sin6_addr, ar.ar_data,
3127                                 sizeof (struct in6_addr));
3128                     } else
3129                         continue;
3130 
3131                     ibt_status = ibt_register_ar(hca->ibt_clnt_hdl, &ar);
3132                     if (ibt_status == IBT_SUCCESS) {
3133 #ifdef  DEBUG
3134                         if (rib_debug > 1) {
3135                                 cmn_err(CE_WARN, "rib_register_ats: "
3136                                     "ibt_register_ar OK on port %d", i+1);
3137                         }
3138 #endif
3139                         /*
3140                          * Allocate and prepare a service entry
3141                          */
3142                         new_service = kmem_zalloc(sizeof (rib_service_t),
3143                                 KM_SLEEP);
3144                         new_service->srv_port = i + 1;
3145                         new_service->srv_ar = ar;
3146                         new_service->srv_next = NULL;
3147 
3148                         /*
3149                          * Add to the service list for this HCA
3150                          */
3151                         new_service->srv_next = hca->ats_list;
3152                         hca->ats_list = new_service;
3153                         new_service = NULL;
3154                         nbinds ++;
3155                     } else {
3156 #ifdef  DEBUG
3157                         if (rib_debug > 1) {
3158                             cmn_err(CE_WARN, "rib_register_ats: "
3159                             "ibt_register_ar FAILED on port %d", i+1);
3160                         }
3161 #endif
3162                     }
3163                 }
3164             }
3165         }
3166 
3167 #ifdef  DEBUG
3168         if (rib_debug > 1) {
3169                 for (temp_srv = hca->ats_list; temp_srv != NULL;
3170                         temp_srv = temp_srv->srv_next) {
3171                                 cmn_err(CE_NOTE, "Service: ATS, active on"
3172                                         " port: %d\n", temp_srv->srv_port);
3173                 }
3174         }
3175 #endif
3176 
3177         rw_exit(&hca->service_list_lock);
3178         kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc * sizeof (rpcib_ats_t));
3179         ibt_free_portinfo(port_infop, port_size);
3180 
3181         if (nbinds == 0) {
3182 #ifdef  DEBUG
3183         if (rib_debug > 1) {
3184                 cmn_err(CE_WARN, "rib_register_ats FAILED!\n");
3185         }
3186 #endif
3187                 return (RDMA_FAILED);
3188         }
3189         return (RDMA_SUCCESS);
3190 }
3191 
/*
 * Register "service_type" (currently only NFS) with IBTF CM and bind it
 * on every active port/pkey combination of this HCA, once for each of
 * the machine's non-loopback IP addresses (the service names).  Each
 * successful bind is recorded as a rib_service_t on hca->service_list.
 * Returns RDMA_SUCCESS (and the caller-visible plugin_state becomes
 * ACCEPT) if at least one bind succeeded; RDMA_FAILED otherwise.
 */
static rdma_stat
rib_register_service(rib_hca_t *hca, int service_type)
{
        ibt_srv_desc_t          sdesc;
        ibt_srv_bind_t          sbind;
        ibt_hca_portinfo_t      *port_infop;
        ib_svc_id_t             srv_id;
        ibt_srv_hdl_t           srv_hdl;
        uint_t                  port_size;
        uint_t                  pki, i, j, num_ports, nbinds;
        ibt_status_t            ibt_status;
        char                    **addrs;
        int                     addr_count;
        rib_service_t           *new_service, *temp_srv;
        ib_pkey_t               pkey;

        /*
         * Query all ports for the given HCA
         */
        rw_enter(&hca->state_lock, RW_READER);
        if (hca->state != HCA_DETACHED) {
                ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
                    &num_ports, &port_size);
                rw_exit(&hca->state_lock);
        } else {
                rw_exit(&hca->state_lock);
                return (RDMA_FAILED);
        }
        if (ibt_status != IBT_SUCCESS) {
#ifdef DEBUG
                cmn_err(CE_NOTE, "rib_register_service: FAILED in "
                    "ibt_query_hca_ports, status = %d\n", ibt_status);
#endif
                return (RDMA_FAILED);
        }

#ifdef  DEBUG
        if (rib_debug > 1) {
                cmn_err(CE_NOTE, "rib_register_service: Ports detected "
                    "%d\n", num_ports);

                for (i = 0; i < num_ports; i++) {
                        if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
                                cmn_err(CE_WARN, "rib_register_service "
                                    "Port #: %d INACTIVE\n", i+1);
                        } else if (port_infop[i].p_linkstate ==
                            IBT_PORT_ACTIVE) {
                                cmn_err(CE_NOTE, "rib_register_service "
                                    "Port #: %d ACTIVE\n", i+1);
                        }
                }
        }
#endif
        /*
         * Get all the IP addresses on this system to register the
         * given "service type" on all DNS recognized IP addrs.
         * Each service type such as NFS will have all the systems
         * IP addresses as its different names. For now the only
         * type of service we support in RPCIB is NFS.
         */
        addrs = get_ip_addrs(&addr_count);
        if (addrs == NULL) {
#ifdef DEBUG
                if (rib_debug) {
                    cmn_err(CE_WARN, "rib_register_service: "
                        "get_ip_addrs failed\n");
                }
#endif
                ibt_free_portinfo(port_infop, port_size);
                return (RDMA_FAILED);
        }

#ifdef  DEBUG
        if (rib_debug > 1) {
                for (i = 0; i < addr_count; i++)
                        cmn_err(CE_NOTE, "addr %d: %s\n", i, addrs[i]);
        }
#endif

        rw_enter(&hca->service_list_lock, RW_WRITER);
        /*
         * Start registering and binding service to active
         * on active ports on this HCA.
         */
        nbinds = 0;
        new_service = NULL;

        /*
         * We use IP addresses as the service names for
         * service registration.  Register each of them
         * with CM to obtain a svc_id and svc_hdl.  We do not
         * register the service with machine's loopback address.
         * (j starts at 1: addrs[0] is assumed to be the loopback
         * address -- it is still freed below with the others.)
         */
        for (j = 1; j < addr_count; j++) {
            (void) bzero(&srv_id, sizeof (ib_svc_id_t));
            (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
            (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));

            /* All CM events for this service go to rib_srv_cm_handler. */
            sdesc.sd_handler = rib_srv_cm_handler;
            sdesc.sd_flags = 0;

            ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
                            &sdesc, 0, 1, &srv_hdl, &srv_id);
            if (ibt_status != IBT_SUCCESS) {
#ifdef DEBUG
                if (rib_debug) {
                    cmn_err(CE_WARN, "rib_register_service: "
                        "ibt_register_service FAILED, status "
                        "= %d\n", ibt_status);
                }
#endif
                /*
                 * No need to go on, since we failed to obtain
                 * a srv_id and srv_hdl. Move on to the next
                 * IP addr as a service name.
                 */
                continue;
            }
            /*
             * NOTE(review): if every bind attempt below fails for this
             * srv_hdl, the handle is never deregistered here -- looks
             * like a leaked CM service registration; confirm.
             */
            for (i = 0; i < num_ports; i++) {
                if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
                        continue;

                /* One bind per full-membership pkey on this active port. */
                for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
                    pkey = port_infop[i].p_pkey_tbl[pki];
                    if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) {

                        /*
                         * Allocate and prepare a service entry
                         */
                        new_service = kmem_zalloc(1 * sizeof (rib_service_t),
                            KM_SLEEP);
                        new_service->srv_type = service_type;
                        new_service->srv_port = i + 1;
                        new_service->srv_id = srv_id;
                        new_service->srv_hdl = srv_hdl;
                        new_service->srv_sbind_hdl = kmem_zalloc(1 *
                            sizeof (ibt_sbind_hdl_t), KM_SLEEP);

                        /* Service name has the form "<ip-addr>::NFS". */
                        new_service->srv_name = kmem_zalloc(IB_SVC_NAME_LEN,
                            KM_SLEEP);
                        (void) bcopy(addrs[j], new_service->srv_name,
                            IB_SVC_NAME_LEN);
                        (void) strlcat(new_service->srv_name, "::NFS",
                                IB_SVC_NAME_LEN);
                        new_service->srv_next = NULL;

                        /*
                         * Bind the service, specified by the IP address,
                         * to the port/pkey using the srv_hdl returned
                         * from ibt_register_service().
                         */
                        (void) bzero(&sbind, sizeof (ibt_srv_bind_t));
                        sbind.sb_pkey = pkey;
                        /* presumably 0xFFFFFFFF = infinite lease; confirm */
                        sbind.sb_lease = 0xFFFFFFFF;
                        sbind.sb_key[0] = NFS_SEC_KEY0;
                        sbind.sb_key[1] = NFS_SEC_KEY1;
                        sbind.sb_name = new_service->srv_name;

#ifdef  DEBUG
                        if (rib_debug > 1) {
                                cmn_err(CE_NOTE, "rib_register_service: "
                                    "binding service using name: %s\n",
                                    sbind.sb_name);
                        }
#endif
                        ibt_status = ibt_bind_service(srv_hdl,
                            port_infop[i].p_sgid_tbl[0], &sbind, rib_stat,
                            new_service->srv_sbind_hdl);
                        if (ibt_status != IBT_SUCCESS) {
#ifdef  DEBUG
                            if (rib_debug) {
                                cmn_err(CE_WARN, "rib_register_service: FAILED"
                                    " in ibt_bind_service, status = %d\n",
                                    ibt_status);
                            }
#endif
                                kmem_free(new_service->srv_sbind_hdl,
                                    sizeof (ibt_sbind_hdl_t));
                                kmem_free(new_service->srv_name,
                                    IB_SVC_NAME_LEN);
                                kmem_free(new_service,
                                    sizeof (rib_service_t));
                                new_service = NULL;
                                continue;
                        }
#ifdef  DEBUG
                        if (rib_debug > 1) {
                                if (ibt_status == IBT_SUCCESS)
                                        cmn_err(CE_NOTE, "rib_regstr_service: "
                                            "Serv: %s REGISTERED on port: %d",
                                            sbind.sb_name, i+1);
                        }
#endif
                        /*
                         * Add to the service list for this HCA
                         */
                        new_service->srv_next = hca->service_list;
                        hca->service_list = new_service;
                        new_service = NULL;
                        nbinds ++;
                    }
                }
            }
        }
        rw_exit(&hca->service_list_lock);

#ifdef  DEBUG
        if (rib_debug > 1) {
                /*
                 * Change this print to a more generic one, as rpcib
                 * is supposed to handle multiple service types.
                 */
                for (temp_srv = hca->service_list; temp_srv != NULL;
                        temp_srv = temp_srv->srv_next) {
                                cmn_err(CE_NOTE, "NFS-IB, active on port:"
                                        " %d\n"
                                        "Using name: %s", temp_srv->srv_port,
                                        temp_srv->srv_name);
                }
        }
#endif

        /* Done with the port info and the address strings. */
        ibt_free_portinfo(port_infop, port_size);
        for (i = 0; i < addr_count; i++) {
                if (addrs[i])
                        kmem_free(addrs[i], IB_SVC_NAME_LEN);
        }
        kmem_free(addrs, addr_count * sizeof (char *));

        if (nbinds == 0) {
#ifdef  DEBUG
            if (rib_debug) {
                cmn_err(CE_WARN, "rib_register_service: "
                    "bind_service FAILED!\n");
            }
#endif
                return (RDMA_FAILED);
        } else {
                /*
                 * Put this plugin into accept state, since atleast
                 * one registration was successful.
                 */
                mutex_enter(&plugin_state_lock);
                plugin_state = ACCEPT;
                mutex_exit(&plugin_state_lock);
                return (RDMA_SUCCESS);
        }
}
3440 
3441 void
3442 rib_listen(struct rdma_svc_data *rd)
3443 {
3444         rdma_stat status = RDMA_SUCCESS;
3445 
3446         rd->active = 0;
3447         rd->err_code = RDMA_FAILED;
3448 
3449         /*
3450          * First check if a hca is still attached
3451          */
3452         rw_enter(&rib_stat->hca->state_lock, RW_READER);
3453         if (rib_stat->hca->state != HCA_INITED) {
3454                 rw_exit(&rib_stat->hca->state_lock);
3455                 return;
3456         }
3457         rw_exit(&rib_stat->hca->state_lock);
3458 
3459         rib_stat->q = &rd->q;
3460         /*
3461          * Register the Address translation service
3462          */
3463         mutex_enter(&rib_stat->open_hca_lock);
3464         if (ats_running == 0) {
3465                 if (rib_register_ats(rib_stat->hca) != RDMA_SUCCESS) {
3466 #ifdef  DEBUG
3467                     if (rib_debug) {
3468                         cmn_err(CE_WARN,
3469                             "rib_listen(): ats registration failed!");
3470                     }
3471 #endif
3472                     mutex_exit(&rib_stat->open_hca_lock);
3473                     return;
3474                 } else {
3475                         ats_running = 1;
3476                 }
3477         }
3478         mutex_exit(&rib_stat->open_hca_lock);
3479 
3480         /*
3481          * Right now the only service type is NFS. Hence force feed this
3482          * value. Ideally to communicate the service type it should be
3483          * passed down in rdma_svc_data.
3484          */
3485         rib_stat->service_type = NFS;
3486         status = rib_register_service(rib_stat->hca, NFS);
3487         if (status != RDMA_SUCCESS) {
3488                 rd->err_code = status;
3489                 return;
3490         }
3491         /*
3492          * Service active on an HCA, check rd->err_code for more
3493          * explainable errors.
3494          */
3495         rd->active = 1;
3496         rd->err_code = status;
3497 }
3498 
3499 /* XXXX */
3500 /* ARGSUSED */
3501 static void
3502 rib_listen_stop(struct rdma_svc_data *svcdata)
3503 {
3504         rib_hca_t               *hca;
3505 
3506         /*
3507          * KRPC called the RDMATF to stop the listeners, this means
3508          * stop sending incomming or recieved requests to KRPC master
3509          * transport handle for RDMA-IB. This is also means that the
3510          * master transport handle, responsible for us, is going away.
3511          */
3512         mutex_enter(&plugin_state_lock);
3513         plugin_state = NO_ACCEPT;
3514         if (svcdata != NULL)
3515                 svcdata->active = 0;
3516         mutex_exit(&plugin_state_lock);
3517 
3518         /*
3519          * First check if a hca is still attached
3520          */
3521         hca = rib_stat->hca;
3522         rw_enter(&hca->state_lock, RW_READER);
3523         if (hca->state != HCA_INITED) {
3524                 rw_exit(&hca->state_lock);
3525                 return;
3526         }
3527         rib_stop_services(hca);
3528         rw_exit(&hca->state_lock);
3529 }
3530 
3531 /*
3532  * Traverse the HCA's service list to unbind and deregister services.
3533  * Instead of unbinding the service for a service handle by
3534  * calling ibt_unbind_service() for each port/pkey, we unbind
3535  * all the services for the service handle by making only one
3536  * call to ibt_unbind_all_services().  Then, we deregister the
3537  * service for the service handle.
3538  *
3539  * When traversing the entries in service_list, we compare the
3540  * srv_hdl of the current entry with that of the next.  If they
3541  * are different or if the next entry is NULL, the current entry
3542  * marks the last binding of the service handle.  In this case,
3543  * call ibt_unbind_all_services() and deregister the service for
3544  * the service handle.  If they are the same, the current and the
3545  * next entries are bound to the same service handle.  In this
3546  * case, move on to the next entry.
3547  */
3548 static void
3549 rib_stop_services(rib_hca_t *hca)
3550 {
3551         rib_service_t           *srv_list, *to_remove;
3552         ibt_status_t            ibt_status;
3553 
3554         /*
3555          * unbind and deregister the services for this service type.
3556          * Right now there is only one service type. In future it will
3557          * be passed down to this function.
3558          */
3559         rw_enter(&hca->service_list_lock, RW_WRITER);
3560         srv_list = hca->service_list;
3561         while (srv_list != NULL) {
3562                 to_remove = srv_list;
3563                 srv_list = to_remove->srv_next;
3564                 if (srv_list == NULL || bcmp(to_remove->srv_hdl,
3565                     srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) {
3566 
3567                     ibt_status = ibt_unbind_all_services(to_remove->srv_hdl);
3568                     if (ibt_status != IBT_SUCCESS) {
3569                         cmn_err(CE_WARN, "rib_listen_stop: "
3570                             "ibt_unbind_all_services FAILED"
3571                                 " status: %d\n", ibt_status);
3572                     }
3573 
3574                     ibt_status =
3575                         ibt_deregister_service(hca->ibt_clnt_hdl,
3576                                 to_remove->srv_hdl);
3577                     if (ibt_status != IBT_SUCCESS) {
3578                         cmn_err(CE_WARN, "rib_listen_stop: "
3579                             "ibt_deregister_service FAILED"
3580                                 " status: %d\n", ibt_status);
3581                     }
3582 
3583 #ifdef  DEBUG
3584                     if (rib_debug > 1) {
3585                         if (ibt_status == IBT_SUCCESS)
3586                                 cmn_err(CE_NOTE, "rib_listen_stop: "
3587                                     "Successfully stopped and"
3588                                     " UNREGISTERED service: %s\n",
3589                                     to_remove->srv_name);
3590                     }
3591 #endif
3592                 }
3593                 kmem_free(to_remove->srv_name, IB_SVC_NAME_LEN);
3594                 kmem_free(to_remove->srv_sbind_hdl,
3595                         sizeof (ibt_sbind_hdl_t));
3596 
3597                 kmem_free(to_remove, sizeof (rib_service_t));
3598         }
3599         hca->service_list = NULL;
3600         rw_exit(&hca->service_list_lock);
3601 }
3602 
3603 static struct svc_recv *
3604 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3605 {
3606         struct svc_recv *recvp;
3607 
3608         recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3609         recvp->vaddr = sgl->ds_va;
3610         recvp->qp = qp;
3611         recvp->bytes_xfer = 0;
3612         return (recvp);
3613 }
3614 
3615 static int
3616 rib_free_svc_recv(struct svc_recv *recvp)
3617 {
3618         kmem_free(recvp, sizeof (*recvp));
3619 
3620         return (0);
3621 }
3622 
3623 static struct reply *
3624 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3625 {
3626         struct reply    *rep;
3627 
3628 
3629         rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3630         if (rep == NULL) {
3631                 mutex_exit(&qp->replylist_lock);
3632                 cmn_err(CE_WARN, "rib_addreplylist: no memory\n");
3633                 return (NULL);
3634         }
3635         rep->xid = msgid;
3636         rep->vaddr_cq = NULL;
3637         rep->bytes_xfer = 0;
3638         rep->status = (uint_t)REPLY_WAIT;
3639         rep->prev = NULL;
3640         cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3641 
3642         mutex_enter(&qp->replylist_lock);
3643         if (qp->replylist) {
3644                 rep->next = qp->replylist;
3645                 qp->replylist->prev = rep;
3646         }
3647         qp->rep_list_size++;
3648         if (rib_debug > 1)
3649             cmn_err(CE_NOTE, "rib_addreplylist: qp:%p, rep_list_size:%d\n",
3650                 (void *)qp, qp->rep_list_size);
3651         qp->replylist = rep;
3652         mutex_exit(&qp->replylist_lock);
3653 
3654         return (rep);
3655 }
3656 
3657 static rdma_stat
3658 rib_rem_replylist(rib_qp_t *qp)
3659 {
3660         struct reply    *r, *n;
3661 
3662         mutex_enter(&qp->replylist_lock);
3663         for (r = qp->replylist; r != NULL; r = n) {
3664                 n = r->next;
3665                 (void) rib_remreply(qp, r);
3666         }
3667         mutex_exit(&qp->replylist_lock);
3668 
3669         return (RDMA_SUCCESS);
3670 }
3671 
3672 static int
3673 rib_remreply(rib_qp_t *qp, struct reply *rep)
3674 {
3675 
3676         ASSERT(MUTEX_HELD(&qp->replylist_lock));
3677         if (rep->prev) {
3678                 rep->prev->next = rep->next;
3679         }
3680         if (rep->next) {
3681                 rep->next->prev = rep->prev;
3682         }
3683         if (qp->replylist == rep)
3684                 qp->replylist = rep->next;
3685 
3686         cv_destroy(&rep->wait_cv);
3687         qp->rep_list_size--;
3688         if (rib_debug > 1)
3689             cmn_err(CE_NOTE, "rib_remreply: qp:%p, rep_list_size:%d\n",
3690                 (void *)qp, qp->rep_list_size);
3691 
3692         kmem_free(rep, sizeof (*rep));
3693 
3694         return (0);
3695 }
3696 
3697 rdma_stat
3698 rib_registermem(CONN *conn, caddr_t buf, uint_t buflen, 
3699         struct mrc *buf_handle)
3700 {
3701         ibt_mr_hdl_t    mr_hdl = NULL;  /* memory region handle */




3702         ibt_mr_desc_t   mr_desc;        /* vaddr, lkey, rkey */
3703         rdma_stat       status;
3704         rib_hca_t       *hca = (ctoqp(conn))->hca;
3705 
3706         /*
3707          * Note: ALL buffer pools use the same memory type RDMARW.
3708          */
3709         status = rib_reg_mem(hca, buf, buflen, 0, &mr_hdl, &mr_desc); 


3710         if (status == RDMA_SUCCESS) {
3711                 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;














3712                 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3713                 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3714         } else {
3715                 buf_handle->mrc_linfo = NULL;
3716                 buf_handle->mrc_lmr = 0;
3717                 buf_handle->mrc_rmr = 0;
3718         }

3719         return (status);
3720 }
3721 

3722 static rdma_stat
3723 rib_reg_mem(rib_hca_t *hca, caddr_t buf, uint_t size, ibt_mr_flags_t spec, 




















































































3724         ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3725 {
3726         ibt_mr_attr_t   mem_attr;
3727         ibt_status_t    ibt_status;
3728  
3729         mem_attr.mr_vaddr = (uintptr_t)buf;
3730         mem_attr.mr_len = (ib_msglen_t)size;
3731         mem_attr.mr_as = NULL; 
3732         mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3733             IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3734             IBT_MR_ENABLE_WINDOW_BIND | spec;
3735 
3736         rw_enter(&hca->state_lock, RW_READER);
3737         if (hca->state == HCA_INITED) {
3738                 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3739                                         &mem_attr, mr_hdlp, mr_descp);
3740                 rw_exit(&hca->state_lock);
3741         } else {
3742                 rw_exit(&hca->state_lock);
3743                 return (RDMA_FAILED);
3744         }
3745 
3746         if (ibt_status != IBT_SUCCESS) {
3747                 cmn_err(CE_WARN, "rib_reg_mem: ibt_register_mr "
3748                         "(spec:%d) failed for addr %llX, status %d",
3749                         spec, (longlong_t)mem_attr.mr_vaddr, ibt_status);
3750                 return (RDMA_FAILED);
3751         }
3752         return (RDMA_SUCCESS);
3753 }
3754 
3755 rdma_stat
3756 rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen, 



3757         struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle)

3758 {
3759         ibt_mr_hdl_t    mr_hdl = NULL;  /* memory region handle */







3760         ibt_mr_desc_t   mr_desc;        /* vaddr, lkey, rkey */
3761         rdma_stat       status;
3762         rib_hca_t       *hca = (ctoqp(conn))->hca;
3763 
3764         /*
3765          * Non-coherent memory registration.
3766          */
3767         status = rib_reg_mem(hca, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl, 

















































3768                         &mr_desc);
3769         if (status == RDMA_SUCCESS) {








3770                 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3771                 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3772                 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3773                 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3774         } else {
3775                 buf_handle->mrc_linfo = NULL;
3776                 buf_handle->mrc_lmr = 0;
3777                 buf_handle->mrc_rmr = 0;
3778         }

3779         return (status);
3780 }
3781 
3782 /* ARGSUSED */
3783 rdma_stat
3784 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3785 {




3786         rib_hca_t *hca = (ctoqp(conn))->hca;
3787  
3788         /*
3789          * Allow memory deregistration even if HCA is
3790          * getting detached. Need all outstanding
3791          * memory registrations to be deregistered
3792          * before HCA_DETACH_EVENT can be accepted.
3793          */

















3794         (void) ibt_deregister_mr(hca->hca_hdl,
3795                         (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3796         return (RDMA_SUCCESS);
3797 }
3798 
3799 /* ARGSUSED */
3800 rdma_stat
3801 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,



3802                 RIB_SYNCMEM_HANDLE sync_handle)

3803 {









3804         (void) rib_deregistermem(conn, buf, buf_handle);
3805 
3806         return (RDMA_SUCCESS);
3807 }
3808 
3809 /* ARGSUSED */
3810 rdma_stat
3811 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3812                 int len, int cpu)
3813 {
3814         ibt_status_t    status;
3815         rib_hca_t *hca = (ctoqp(conn))->hca;
3816         ibt_mr_sync_t   mr_segment;
3817 
3818         mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3819         mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3820         mr_segment.ms_len = (ib_memlen_t)len;
3821         if (cpu) {
3822                 /* make incoming data visible to memory */
3823                 mr_segment.ms_flags = IBT_SYNC_WRITE;
3824         } else {
3825                 /* make memory changes visible to IO */
3826                 mr_segment.ms_flags = IBT_SYNC_READ;
3827         }
3828         rw_enter(&hca->state_lock, RW_READER);
3829         if (hca->state == HCA_INITED) {
3830                 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3831                 rw_exit(&hca->state_lock);
3832         } else {
3833                 rw_exit(&hca->state_lock);
3834                 return (RDMA_FAILED);
3835         }
3836 
3837         if (status == IBT_SUCCESS)
3838                 return (RDMA_SUCCESS);
3839         else {
3840 #ifdef DEBUG
3841                 cmn_err(CE_WARN, "rib_syncmem: ibt_sync_mr failed with %d\n",
3842                         status);
3843 #endif
3844                 return (RDMA_FAILED);
3845         }
3846 }
3847 
3848 /*
3849  * XXXX ????
3850  */
3851 static rdma_stat
3852 rib_getinfo(rdma_info_t *info)
3853 {
3854         /*
3855          * XXXX Hack!
3856          */
3857         info->addrlen = 16;
3858         info->mts = 1000000;
3859         info->mtu = 1000000;
3860 
3861         return (RDMA_SUCCESS);
3862 }
3863 
3864 rib_bufpool_t *
3865 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3866 {
3867         rib_bufpool_t   *rbp = NULL;
3868         bufpool_t       *bp = NULL;
3869         caddr_t         buf;
3870         ibt_mr_attr_t   mem_attr;
3871         ibt_status_t    ibt_status;
3872         int             i, j;
3873 
3874         rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3875 
3876         bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3877                         num * sizeof (void *), KM_SLEEP);
3878 
3879         mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3880         bp->numelems = num;
3881 

3882         switch (ptype) {
3883             case SEND_BUFFER:
3884                 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3885                 /* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */ 
3886                 bp->rsize = RPC_MSG_SZ;
3887                 break;
3888             case RECV_BUFFER:
3889                 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3890                 /* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */ 
3891                 bp->rsize = RPC_BUF_SIZE;
3892                 break;
3893             default:
3894                 goto fail;
3895         }
3896 
3897         /*
3898          * Register the pool.
3899          */
3900         bp->bufsize = num * bp->rsize;
3901         bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3902         rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3903                         sizeof (ibt_mr_hdl_t), KM_SLEEP);
3904         rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3905                         sizeof (ibt_mr_desc_t), KM_SLEEP);
3906  
3907         rw_enter(&hca->state_lock, RW_READER);
3908         if (hca->state != HCA_INITED) {
3909                 rw_exit(&hca->state_lock);

3910                 goto fail;
3911         }
3912         for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3913                 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3914                 mem_attr.mr_vaddr = (uintptr_t)buf;
3915                 mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3916                 mem_attr.mr_as = NULL;
3917                 ibt_status = ibt_register_mr(hca->hca_hdl,
3918                         hca->pd_hdl, &mem_attr, &rbp->mr_hdl[i],
3919                         &rbp->mr_desc[i]);
3920                 if (ibt_status != IBT_SUCCESS) {
3921                     for (j = 0; j < i; j++) {
3922                         (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[j]);
3923                     }
3924                     rw_exit(&hca->state_lock);
3925                     goto fail;
3926                 }
3927         }
3928         rw_exit(&hca->state_lock);
3929  
3930         buf = (caddr_t)bp->buf;
3931         for (i = 0; i < num; i++, buf += bp->rsize) {
3932                 bp->buflist[i] = (void *)buf;
3933         }
3934         bp->buffree = num - 1;       /* no. of free buffers */
3935         rbp->bpool = bp;
3936 
3937         return (rbp);
3938 fail:
3939         if (bp) {
3940             if (bp->buf)
3941                 kmem_free(bp->buf, bp->bufsize);
3942             kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3943         }
3944         if (rbp) {
3945             if (rbp->mr_hdl)
3946                 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3947             if (rbp->mr_desc)
3948                 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3949             kmem_free(rbp, sizeof (rib_bufpool_t));
3950         }
3951         return (NULL);
3952 }
3953 
3954 static void
3955 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3956 {
3957         int i;
3958         rib_bufpool_t *rbp = NULL;
3959         bufpool_t *bp;
3960 
3961         /*
3962          * Obtain pool address based on type of pool
3963          */
3964         switch (ptype) {
3965                 case SEND_BUFFER:
3966                         rbp = hca->send_pool;
3967                         break;
3968                 case RECV_BUFFER:
3969                         rbp = hca->recv_pool;
3970                         break;
3971                 default:
3972                         return;
3973         }
3974         if (rbp == NULL)
3975                 return;
3976 
3977         bp = rbp->bpool;
3978 
3979         /*
3980          * Deregister the pool memory and free it.
3981          */
3982         for (i = 0; i < bp->numelems; i++) {
3983                 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3984         }
3985 }
3986 
3987 static void
3988 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3989 {
3990 
3991         rib_bufpool_t *rbp = NULL;
3992         bufpool_t *bp;
3993 
3994         /*
3995          * Obtain pool address based on type of pool
3996          */
3997         switch (ptype) {
3998                 case SEND_BUFFER:
3999                         rbp = hca->send_pool;
4000                         break;
4001                 case RECV_BUFFER:
4002                         rbp = hca->recv_pool;
4003                         break;
4004                 default:
4005                         return;
4006         }
4007         if (rbp == NULL)
4008                 return;
4009 
4010         bp = rbp->bpool;
4011 
4012         /*
4013          * Free the pool memory.
4014          */
4015         if (rbp->mr_hdl)
4016                 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
4017 
4018         if (rbp->mr_desc)
4019                 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4020  
4021         if (bp->buf)
4022                 kmem_free(bp->buf, bp->bufsize);
4023         mutex_destroy(&bp->buflock);
4024         kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4025         kmem_free(rbp, sizeof (rib_bufpool_t));
4026 }
4027 
4028 void
4029 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4030 {
4031         /*
4032          * Deregister the pool memory and free it.
4033          */
4034         rib_rbufpool_deregister(hca, ptype);
4035         rib_rbufpool_free(hca, ptype);
4036 }
4037 
4038 /*
4039  * Fetch a buffer from the pool of type specified in rdbuf->type.
4040  */
4041 static rdma_stat
4042 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4043 {
4044 
4045         rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4046         if (rdbuf->addr) {
4047                 switch (rdbuf->type) {
4048                 case SEND_BUFFER:
4049                         rdbuf->len = RPC_MSG_SZ;     /* 1K */
4050                         break;
4051                 case RECV_BUFFER:
4052                         rdbuf->len = RPC_BUF_SIZE; /* 2K */
4053                         break;
4054                 default:
4055                         rdbuf->len = 0;
4056                 }
4057                 return (RDMA_SUCCESS);
4058         } else
4059                 return (RDMA_FAILED);
4060 }
4061 








4062 
/*
 * Fetch a buffer of specified type.
 * Note that rdbuf->handle is mw's rkey.
 *
 * Pops the top free buffer from the pool, locates the MR registered
 * for it (linear scan over mr_desc by virtual address) and fills the
 * buffer's rkey/lkey/MR handle into rdbuf->handle. Returns the buffer
 * address, or NULL if the pool is missing, empty, or no MR matches.
 */
static void *
rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
{
        rib_qp_t        *qp = ctoqp(conn);
        rib_hca_t       *hca = qp->hca;
        rdma_btype      ptype = rdbuf->type;
        void            *buf;
        rib_bufpool_t   *rbp = NULL;
        bufpool_t       *bp;
        int             i;

        /*
         * Obtain pool address based on type of pool
         */
        switch (ptype) {
                case SEND_BUFFER:
                        rbp = hca->send_pool;
                        break;
                case RECV_BUFFER:
                        rbp = hca->recv_pool;
                        break;
                default:
                        return (NULL);
        }
        if (rbp == NULL)
                return (NULL);

        bp = rbp->bpool;

        mutex_enter(&bp->buflock);
        /* buffree indexes the last free slot; -1 means pool exhausted. */
        if (bp->buffree < 0) {
                cmn_err(CE_WARN, "rib_rbuf_alloc: No free buffers!");
                mutex_exit(&bp->buflock);
                return (NULL);
        }

        /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
        buf = bp->buflist[bp->buffree];
        rdbuf->addr = buf;
        rdbuf->len = bp->rsize;
        /* Find the MR registered at this buffer's virtual address. */
        for (i = bp->numelems - 1; i >= 0; i--) {
            if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
                rdbuf->handle.mrc_rmr = (uint32_t)rbp->mr_desc[i].md_rkey;
                rdbuf->handle.mrc_linfo = (uintptr_t)rbp->mr_hdl[i];
                rdbuf->handle.mrc_lmr = (uint32_t)rbp->mr_desc[i].md_lkey;
                /* Only consume the buffer once a matching MR is found. */
                bp->buffree--;
                if (rib_debug > 1)
                    cmn_err(CE_NOTE, "rib_rbuf_alloc: %d free bufs "
                        "(type %d)\n", bp->buffree+1, ptype);

                mutex_exit(&bp->buflock);

                return (buf);
            }
        }
        cmn_err(CE_WARN, "rib_rbuf_alloc: NO matching buf %p of "
                "type %d found!", buf, ptype);
        mutex_exit(&bp->buflock);

        return (NULL);
}
4128 
4129 static void
4130 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4131 {
4132 
4133         rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4134 }
4135 
4136 static void
4137 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4138 {
4139         rib_qp_t *qp = ctoqp(conn);
4140         rib_hca_t *hca = qp->hca;
4141         rib_bufpool_t *rbp = NULL;
4142         bufpool_t *bp;
4143 
4144         /*
4145          * Obtain pool address based on type of pool
4146          */
4147         switch (ptype) {
4148                 case SEND_BUFFER:
4149                         rbp = hca->send_pool;
4150                         break;
4151                 case RECV_BUFFER:
4152                         rbp = hca->recv_pool;
4153                         break;
4154                 default:
4155                         return;
4156         }
4157         if (rbp == NULL)
4158                 return;
4159 
4160         bp = rbp->bpool;
4161 
4162         mutex_enter(&bp->buflock);
4163         if (++bp->buffree >= bp->numelems) {
4164                 /*
4165                  * Should never happen
4166                  */
4167                 cmn_err(CE_WARN, "rib_rbuf_free: One (type %d) "
4168                         "too many frees!", ptype);
4169                 bp->buffree--;
4170         } else {
4171                 bp->buflist[bp->buffree] = buf;
4172                 if (rib_debug > 1)
4173                     cmn_err(CE_NOTE, "rib_rbuf_free: %d free bufs "
4174                         "(type %d)\n", bp->buffree+1, ptype);
4175         }
4176         mutex_exit(&bp->buflock);
4177 }
4178 
4179 static rdma_stat
4180 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4181 {
4182         rw_enter(&connlist->conn_lock, RW_WRITER);
4183         if (connlist->conn_hd) {
4184                 cn->c_next = connlist->conn_hd;
4185                 connlist->conn_hd->c_prev = cn;
4186         }
4187         connlist->conn_hd = cn;
4188         rw_exit(&connlist->conn_lock);
4189 
4190         return (RDMA_SUCCESS);
4191 }
4192 
4193 static rdma_stat
4194 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4195 {
4196         rw_enter(&connlist->conn_lock, RW_WRITER);
4197         if (cn->c_prev) {
4198                 cn->c_prev->c_next = cn->c_next;
4199         }
4200         if (cn->c_next) {
4201                 cn->c_next->c_prev = cn->c_prev;
4202         }
4203         if (connlist->conn_hd == cn)
4204                 connlist->conn_hd = cn->c_next;
4205         rw_exit(&connlist->conn_lock);
4206 
4207         return (RDMA_SUCCESS);
4208 }
4209 
/*
 * Connection management.
 * IBTF does not support recycling of channels. So connections are only
 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR or
 * C_DISCONN_PEND state. No C_IDLE state.
 * C_CONN_PEND state: Connection establishment in progress to the server.
 * C_CONNECTED state: A connection when created is in C_CONNECTED state.
 * It has an RC channel associated with it. ibt_post_send/recv are allowed
 * only in this state.
 * C_ERROR state: A connection transitions to this state when WRs on the
 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
 * C_DISCONN_PEND state: When a connection is in C_ERROR state and when
 * c_ref drops to 0 (this indicates that RPC has no more references to this
 * connection), the connection should be destroyed. A connection transitions
 * into this state when it is being destroyed.
 *
 * rib_conn_get() returns (in *conn) an established connection to the
 * server at svcaddr, reusing an existing one from the client connection
 * list when possible, waiting on one that is C_CONN_PEND, or creating
 * and connecting a new channel otherwise. Unreferenced C_ERROR
 * connections encountered during the scan are torn down along the way.
 */
static rdma_stat
rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn)
{
        CONN *cn;
        int status = RDMA_SUCCESS;
        rib_hca_t *hca = (rib_hca_t *)handle;
        rib_qp_t *qp;
        clock_t cv_stat, timout;
        ibt_path_info_t path;

again:
        rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
        cn = hca->cl_conn_list.conn_hd;
        while (cn != NULL) {
                /*
                 * First, clear up any connection in the ERROR state
                 */
                mutex_enter(&cn->c_lock);
                if (cn->c_state == C_ERROR) {
                        if (cn->c_ref == 0) {
                                /*
                                 * Remove connection from list and destroy it.
                                 */
                                cn->c_state = C_DISCONN_PEND;
                                mutex_exit(&cn->c_lock);
                                rw_exit(&hca->cl_conn_list.conn_lock);
                                (void) rib_disconnect_channel(cn,
                                    &hca->cl_conn_list);
                                /*
                                 * The list may have changed while the
                                 * lock was dropped; restart the scan.
                                 */
                                goto again;
                        }
                        mutex_exit(&cn->c_lock);
                        cn = cn->c_next;
                        continue;
                } else if (cn->c_state == C_DISCONN_PEND) {
                        mutex_exit(&cn->c_lock);
                        cn = cn->c_next;
                        continue;
                }
                if ((cn->c_raddr.len == svcaddr->len) &&
                    bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) {
                        /*
                         * Our connection. Give up conn list lock
                         * as we are done traversing the list.
                         */
                        rw_exit(&hca->cl_conn_list.conn_lock);
                        if (cn->c_state == C_CONNECTED) {
                                cn->c_ref++; /* sharing a conn */
                                mutex_exit(&cn->c_lock);
                                *conn = cn;
                                return (status);
                        }
                        if (cn->c_state == C_CONN_PEND) {
                                /*
                                 * Hold a reference to this conn before
                                 * we give up the lock.
                                 */
                                cn->c_ref++;
                                timout =  ddi_get_lbolt() +
                                    drv_usectohz(CONN_WAIT_TIME * 1000000);
                                /*
                                 * Wait (interruptibly, with a timeout)
                                 * for the in-progress connect to finish.
                                 */
                                while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
                                        &cn->c_lock, timout)) > 0 &&
                                        cn->c_state == C_CONN_PEND)
                                        ;
                                if (cv_stat == 0) {
                                        /* Interrupted by a signal. */
                                        cn->c_ref--;
                                        mutex_exit(&cn->c_lock);
                                        return (RDMA_INTR);
                                }
                                if (cv_stat < 0) {
                                        /* Timed out waiting. */
                                        cn->c_ref--;
                                        mutex_exit(&cn->c_lock);
                                        return (RDMA_TIMEDOUT);
                                }
                                if (cn->c_state == C_CONNECTED) {
                                        *conn = cn;
                                        mutex_exit(&cn->c_lock);
                                        return (status);
                                } else {
                                        cn->c_ref--;
                                        mutex_exit(&cn->c_lock);
                                        return (RDMA_TIMEDOUT);
                                }
                        }
                        /*
                         * NOTE(review): if the address matched but the
                         * state is neither C_CONNECTED nor C_CONN_PEND,
                         * control falls through to the traversal below
                         * even though conn_lock was released above -
                         * confirm this path is unreachable.
                         */
                }
                mutex_exit(&cn->c_lock);
                cn = cn->c_next;
        }
        rw_exit(&hca->cl_conn_list.conn_lock);

        /*
         * No usable connection found: verify the server has an ATS
         * record and obtain a path to it before creating a channel.
         */
        status = rib_chk_srv_ats(hca, svcaddr, addr_type, &path);
        if (status != RDMA_SUCCESS) {
#ifdef DEBUG
                if (rib_debug) {
                        cmn_err(CE_WARN, "rib_conn_get: "
                                "No server ATS record!");
                }
#endif
                return (RDMA_FAILED);
        }

        /*
         * Channel to server doesn't exist yet, create one.
         */
        if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) {
                return (RDMA_FAILED);
        }
        cn = qptoc(qp);
        cn->c_state = C_CONN_PEND;
        cn->c_ref = 1;

        /*
         * Add to conn list.
         * We had given up the READER lock. In the time since then,
         * another thread might have created the connection we are
         * trying here. But for now, that is quite alright - there
         * might be two connections between a pair of hosts instead
         * of one. If we really want to close that window,
         * then need to check the list after acquiring the
         * WRITER lock.
         */
        (void) rib_add_connlist(cn, &hca->cl_conn_list);
        status = rib_conn_to_srv(hca, qp, &path);
        mutex_enter(&cn->c_lock);
        if (status == RDMA_SUCCESS) {
                cn->c_state = C_CONNECTED;
                *conn = cn;
        } else {
                cn->c_state = C_ERROR;
                cn->c_ref--;
#ifdef DEBUG
                if (rib_debug) {
                        cmn_err(CE_WARN, "rib_conn_get: FAILED creating"
                            " a channel!");
                }
#endif
        }
        /* Wake any threads blocked in C_CONN_PEND on this connection. */
        cv_broadcast(&cn->c_cv);
        mutex_exit(&cn->c_lock);
        return (status);
}
4367 
/*
 * Drop one reference on a connection. When the last reference is
 * released and the connection is in the error state, transition it to
 * C_DISCONN_PEND and tear down its channel (choosing the server or
 * client connection list based on the QP's mode).
 */
static rdma_stat
rib_conn_release(CONN *conn)
{
        rib_qp_t        *qp = ctoqp(conn);

        mutex_enter(&conn->c_lock);
        conn->c_ref--;

        /*
         * If a conn is C_ERROR, close the channel.
         * If it's CONNECTED, keep it that way.
         * NOTE(review): the test below uses bitwise '&' rather than
         * '==' - this is only correct if C_ERROR's bit pattern is not
         * shared with the other c_state values; confirm against the
         * state definitions.
         */
        if (conn->c_ref == 0 && (conn->c_state &  C_ERROR)) {
                conn->c_state = C_DISCONN_PEND;
                mutex_exit(&conn->c_lock);
                if (qp->mode == RIB_SERVER)
                        (void) rib_disconnect_channel(conn,
                            &qp->hca->srv_conn_list);
                else
                        (void) rib_disconnect_channel(conn,
                            &qp->hca->cl_conn_list);
                return (RDMA_SUCCESS);
        }
        mutex_exit(&conn->c_lock);
        return (RDMA_SUCCESS);
}
4394 
4395 /*
4396  * Add at front of list
4397  */
4398 static struct rdma_done_list *
4399 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4400 {
4401         struct rdma_done_list *rd;
4402 
4403         ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4404 
4405         rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4406         rd->xid = xid;
4407         cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4408 
4409         rd->prev = NULL;
4410         rd->next = qp->rdlist;
4411         if (qp->rdlist != NULL)
4412                 qp->rdlist->prev = rd;
4413         qp->rdlist = rd;
4414 
4415         return (rd);
4416 }
4417 
4418 static void
4419 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4420 {
4421         struct rdma_done_list *r;
4422 
4423         ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4424 
4425         r = rd->next;
4426         if (r != NULL) {
4427                 r->prev = rd->prev;
4428         }
4429 
4430         r = rd->prev;
4431         if (r != NULL) {
4432                 r->next = rd->next;
4433         } else {
4434                 qp->rdlist = rd->next;
4435         }
4436 
4437         cv_destroy(&rd->rdma_done_cv);
4438         kmem_free(rd, sizeof (*rd));
4439 }
4440 
4441 static void
4442 rdma_done_rem_list(rib_qp_t *qp)
4443 {
4444         struct rdma_done_list   *r, *n;
4445 
4446         mutex_enter(&qp->rdlist_lock);
4447         for (r = qp->rdlist; r != NULL; r = n) {
4448                 n = r->next;
4449                 rdma_done_rm(qp, r);
4450         }
4451         mutex_exit(&qp->rdlist_lock);
4452 }
4453 
4454 static void
4455 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4456 {
4457         struct rdma_done_list *r = qp->rdlist;
4458 
4459         ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4460 
4461         while (r) {
4462                 if (r->xid == xid) {
4463                         cv_signal(&r->rdma_done_cv);
4464                         return;
4465                 } else {
4466                         r = r->next;
4467                 }
4468         }
4469         if (rib_debug > 1) {
4470             cmn_err(CE_WARN, "rdma_done_notify: "
4471                 "No matching xid for %u, qp %p\n", xid, (void *)qp);
4472         }
4473 }
4474 
4475 rpcib_ats_t *
4476 get_ibd_entry(ib_gid_t *gid, ib_pkey_t pkey, rpcib_ibd_insts_t *ibds)
4477 {
4478         rpcib_ats_t             *atsp;
4479         int                     i;
4480 
4481         for (i = 0, atsp = ibds->rib_ats; i < ibds->rib_ibd_cnt; i++, atsp++) {
4482                 if (atsp->ras_port_gid.gid_prefix == gid->gid_prefix &&
4483                     atsp->ras_port_gid.gid_guid == gid->gid_guid &&
4484                     atsp->ras_pkey == pkey) {
4485                         return (atsp);
4486                 }
4487         }
4488         return (NULL);
4489 }
4490 
/*
 * ddi_walk_devs() callback: record every attached "ibport" node whose
 * name-addr contains "ipib" (an IPoIB port) into the rpcib_ibd_insts_t
 * passed in `arg'. Always continues the walk.
 */
int
rib_get_ibd_insts_cb(dev_info_t *dip, void *arg)
{
        rpcib_ibd_insts_t *ibds = (rpcib_ibd_insts_t *)arg;
        rpcib_ats_t     *atsp;
        ib_pkey_t       pkey;
        uint8_t         port;
        ib_guid_t       hca_guid;
        ib_gid_t        port_gid;

        if (i_ddi_devi_attached(dip) &&
            (strcmp(ddi_node_name(dip), "ibport") == 0) &&
            (strstr(ddi_get_name_addr(dip), "ipib") != NULL)) {

                /* Grow the ats array in N_IBD_INSTANCES increments. */
                if (ibds->rib_ibd_cnt >= ibds->rib_ibd_alloc) {
                    rpcib_ats_t *tmp;

                    tmp = (rpcib_ats_t *)kmem_zalloc((ibds->rib_ibd_alloc +
                        N_IBD_INSTANCES) * sizeof (rpcib_ats_t), KM_SLEEP);
                    bcopy(ibds->rib_ats, tmp,
                        ibds->rib_ibd_alloc * sizeof (rpcib_ats_t));
                    kmem_free(ibds->rib_ats,
                        ibds->rib_ibd_alloc * sizeof (rpcib_ats_t));
                    ibds->rib_ats = tmp;
                    ibds->rib_ibd_alloc += N_IBD_INSTANCES;
                }
                /*
                 * Skip the node when the hca-guid, port-number or
                 * port-pkey property is missing/invalid, or the port
                 * GID cannot be looked up.
                 */
                if (((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY,
                        dip, 0, "hca-guid", 0)) == 0) ||
                    ((port = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
                        0, "port-number", 0)) == 0) ||
                    (ibt_get_port_state_byguid(hca_guid, port,
                        &port_gid, NULL) != IBT_SUCCESS) ||
                    ((pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
                        "port-pkey", IB_PKEY_INVALID_LIMITED)) <=
                        IB_PKEY_INVALID_FULL)) {
                    return (DDI_WALK_CONTINUE);
                }
                /* Record this ibd instance. */
                atsp = &ibds->rib_ats[ibds->rib_ibd_cnt];
                atsp->ras_inst = ddi_get_instance(dip);
                atsp->ras_pkey = pkey;
                atsp->ras_port_gid = port_gid;
                ibds->rib_ibd_cnt++;
        }
        return (DDI_WALK_CONTINUE);
}
4536 
/*
 * Walk the whole device tree and collect all attached ibd (IPoIB)
 * port instances into `ibds' via rib_get_ibd_insts_cb().
 */
void
rib_get_ibd_insts(rpcib_ibd_insts_t *ibds)
{
        ddi_walk_devs(ddi_root_node(), rib_get_ibd_insts_cb, ibds);
}
4542 
/*
 * Look up the IP address configured on each recorded ibd instance,
 * trying IPv4 (via /dev/udp) first and falling back to IPv6 (via
 * /dev/udp6), using the SIOCGLIFADDR ioctl. Returns 0 if at least
 * one address was found, -1 otherwise.
 */
int
get_ibd_ipaddr(rpcib_ibd_insts_t *ibds)
{
        TIUSER                  *tiptr, *tiptr6;
        vnode_t                 *kvp, *kvp6;
        vnode_t                 *vp = NULL, *vp6 = NULL;
        struct strioctl         iocb;
        struct lifreq           lif_req;
        int                     k, ip_cnt;
        rpcib_ats_t             *atsp;

        /* Open the IPv4 UDP stream; vp stays NULL on failure. */
        if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP,
                &kvp) == 0) {
            if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
                &tiptr, CRED()) == 0) {
                vp = tiptr->fp->f_vnode;
            } else {
                VN_RELE(kvp);
            }
        }

        /* Likewise for the IPv6 UDP stream. */
        if (lookupname("/dev/udp6", UIO_SYSSPACE, FOLLOW, NULLVPP,
                &kvp6) == 0) {
            if (t_kopen((file_t *)NULL, kvp6->v_rdev, FREAD|FWRITE,
                &tiptr6, CRED()) == 0) {
                vp6 = tiptr6->fp->f_vnode;
            } else {
                VN_RELE(kvp6);
            }
        }

        if (vp == NULL && vp6 == NULL)
                return (-1);

        /* Get ibd ip's */
        ip_cnt = 0;
        for (k = 0, atsp = ibds->rib_ats; k < ibds->rib_ibd_cnt; k++, atsp++) {
                /* IPv4 */
            if (vp != NULL) {
                (void) bzero((void *)&lif_req, sizeof (struct lifreq));
                (void) snprintf(lif_req.lifr_name,
                        sizeof (lif_req.lifr_name), "%s%d",
                        IBD_NAME, atsp->ras_inst);

                (void) bzero((void *)&iocb, sizeof (struct strioctl));
                iocb.ic_cmd = SIOCGLIFADDR;
                iocb.ic_timout = 0;
                iocb.ic_len = sizeof (struct lifreq);
                iocb.ic_dp = (caddr_t)&lif_req;
                if (kstr_ioctl(vp, I_STR, (intptr_t)&iocb) == 0) {
                    atsp->ras_inet_type = AF_INET;
                    bcopy(&lif_req.lifr_addr, &atsp->ras_sin,
                        sizeof (struct sockaddr_in));
                    ip_cnt++;
                    continue;
                }
            }
                /* Try IPv6 */
            if (vp6 != NULL) {
                (void) bzero((void *)&lif_req, sizeof (struct lifreq));
                (void) snprintf(lif_req.lifr_name,
                        sizeof (lif_req.lifr_name), "%s%d",
                        IBD_NAME, atsp->ras_inst);

                (void) bzero((void *)&iocb, sizeof (struct strioctl));
                iocb.ic_cmd = SIOCGLIFADDR;
                iocb.ic_timout = 0;
                iocb.ic_len = sizeof (struct lifreq);
                iocb.ic_dp = (caddr_t)&lif_req;
                if (kstr_ioctl(vp6, I_STR, (intptr_t)&iocb) == 0) {

                    atsp->ras_inet_type = AF_INET6;
                    bcopy(&lif_req.lifr_addr, &atsp->ras_sin6,
                            sizeof (struct sockaddr_in6));
                    ip_cnt++;
                }
            }
        }

        /* Close whichever streams were successfully opened. */
        if (vp6 != NULL) {
            (void) t_kclose(tiptr6, 0);
            VN_RELE(kvp6);
        }
        if (vp != NULL) {
            (void) t_kclose(tiptr, 0);
            VN_RELE(kvp);
        }

        if (ip_cnt == 0)
            return (-1);
        else
            return (0);
}
4639 
4640 char **
4641 get_ip_addrs(int *count)
4642 {
4643         TIUSER                  *tiptr;
4644         vnode_t                 *kvp;
4645         int                     num_of_ifs;
4646         char                    **addresses;
4647         int                     return_code;
4648 
4649         /*
4650          * Open a device for doing down stream kernel ioctls
4651          */
4652         return_code = lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW,
4653             NULLVPP, &kvp);
4654         if (return_code != 0) {
4655                 cmn_err(CE_NOTE, "get_Ip_addrs: lookupname failed\n");
4656                 *count = -1;
4657                 return (NULL);
4658         }
4659 
4660         return_code = t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
4661             &tiptr, CRED());
4662         if (return_code != 0) {
4663                 cmn_err(CE_NOTE, "get_Ip_addrs: t_kopen failed\n");
4664                 VN_RELE(kvp);
4665                 *count = -1;
4666                 return (NULL);
4667         }
4668 
4669         /*
4670          * Perform the first ioctl to get the number of interfaces
4671          */
4672         return_code = get_interfaces(tiptr, &num_of_ifs);
4673         if (return_code != 0 || num_of_ifs == 0) {
4674                 cmn_err(CE_NOTE, "get_Ip_addrs: get_interfaces failed\n");
4675                 (void) t_kclose(tiptr, 0);
4676                 VN_RELE(kvp);
4677                 *count = -1;
4678                 return (NULL);
4679         }
4680 
4681         /*
4682          * Perform the second ioctl to get the address on each interface
4683          * found.
4684          */
4685         addresses = kmem_zalloc(num_of_ifs * sizeof (char *), KM_SLEEP);
4686         return_code = find_addrs(tiptr, addresses, num_of_ifs);
4687         if (return_code <= 0) {
4688                 cmn_err(CE_NOTE, "get_Ip_addrs: find_addrs failed\n");
4689                 (void) t_kclose(tiptr, 0);
4690                 kmem_free(addresses, num_of_ifs * sizeof (char *));
4691                 VN_RELE(kvp);
4692                 *count = -1;
4693                 return (NULL);
4694         }
4695 
4696         *count = return_code;
4697         VN_RELE(kvp);
4698         (void) t_kclose(tiptr, 0);
4699         return (addresses);
4700 }
4701 
4702 int
4703 get_interfaces(TIUSER *tiptr, int *num)
4704 {
4705         struct lifnum           if_buf;
4706         struct strioctl         iocb;
4707         vnode_t                 *vp;
4708         int                     return_code;
4709 
4710         /*
4711          * Prep the number of interfaces request buffer for ioctl
4712          */
4713         (void) bzero((void *)&if_buf, sizeof (struct lifnum));
4714         if_buf.lifn_family = AF_UNSPEC;
4715         if_buf.lifn_flags = 0;
4716 
4717         /*
4718          * Prep the kernel ioctl buffer and send it down stream
4719          */
4720         (void) bzero((void *)&iocb, sizeof (struct strioctl));
4721         iocb.ic_cmd = SIOCGLIFNUM;
4722         iocb.ic_timout = 0;
4723         iocb.ic_len = sizeof (if_buf);
4724         iocb.ic_dp = (caddr_t)&if_buf;
4725 
4726         vp = tiptr->fp->f_vnode;
4727         return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
4728         if (return_code != 0) {
4729                 cmn_err(CE_NOTE, "get_interfaces: kstr_ioctl failed\n");
4730                 *num = -1;
4731                 return (-1);
4732         }
4733 
4734         *num = if_buf.lifn_count;
4735 #ifdef  DEBUG
4736         if (rib_debug > 1)
4737                 cmn_err(CE_NOTE, "Number of interfaces detected: %d\n",
4738                     if_buf.lifn_count);
4739 #endif
4740         return (0);
4741 }
4742 
4743 int
4744 find_addrs(TIUSER *tiptr, char **addrs, int num_ifs)
4745 {
4746         struct lifconf          lifc;
4747         struct lifreq           *if_data_buf;
4748         struct strioctl         iocb;
4749         caddr_t                 request_buffer;
4750         struct sockaddr_in      *sin4;
4751         struct sockaddr_in6     *sin6;
4752         vnode_t                 *vp;
4753         int                     i, count, return_code;
4754 
4755         /*
4756          * Prep the buffer for requesting all interface's info
4757          */
4758         (void) bzero((void *)&lifc, sizeof (struct lifconf));
4759         lifc.lifc_family = AF_UNSPEC;
4760         lifc.lifc_flags = 0;
4761         lifc.lifc_len = num_ifs * sizeof (struct lifreq);
4762 
4763         request_buffer = kmem_zalloc(num_ifs * sizeof (struct lifreq),
4764             KM_SLEEP);
4765 
4766         lifc.lifc_buf = request_buffer;
4767 
4768         /*
4769          * Prep the kernel ioctl buffer and send it down stream
4770          */
4771         (void) bzero((void *)&iocb, sizeof (struct strioctl));
4772         iocb.ic_cmd = SIOCGLIFCONF;
4773         iocb.ic_timout = 0;
4774         iocb.ic_len = sizeof (struct lifconf);
4775         iocb.ic_dp = (caddr_t)&lifc;
4776 
4777         vp = tiptr->fp->f_vnode;
4778         return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
4779         if (return_code != 0) {
4780                 cmn_err(CE_NOTE, "find_addrs: kstr_ioctl failed\n");
4781                 kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
4782                 return (-1);
4783         }
4784 
4785         /*
4786          * Extract addresses and fill them in the requested array
4787          * IB_SVC_NAME_LEN is defined to be 64 so it  covers both IPv4 &
4788          * IPv6. Here count is the number of IP addresses collected.
4789          */
4790         if_data_buf = lifc.lifc_req;
4791         count = 0;
4792         for (i = lifc.lifc_len / sizeof (struct lifreq); i > 0; i--,
4793         if_data_buf++) {
4794                 if (if_data_buf->lifr_addr.ss_family == AF_INET) {
4795                         sin4 = (struct sockaddr_in *)&if_data_buf->lifr_addr;
4796                         addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
4797                         (void) inet_ntop(AF_INET, &sin4->sin_addr,
4798                             addrs[count], IB_SVC_NAME_LEN);
4799                         count ++;
4800                 }
4801 
4802                 if (if_data_buf->lifr_addr.ss_family == AF_INET6) {
4803                         sin6 = (struct sockaddr_in6 *)&if_data_buf->lifr_addr;
4804                         addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
4805                         (void) inet_ntop(AF_INET6, &sin6->sin6_addr,
4806                             addrs[count], IB_SVC_NAME_LEN);
4807                         count ++;
4808                 }
4809         }
4810 
4811         kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
4812         return (count);
4813 }
4814 
/*
 * Goes through all connections and closes the channel
 * This will cause all the WRs on those channels to be
 * flushed.
 */
static void
rib_close_channels(rib_conn_list_t *connlist)
{
        CONN            *conn;
        rib_qp_t        *qp;

        /*
         * Walk the list holding the reader lock; each connection's
         * state and qp handle are examined and modified only under
         * its own c_lock.
         */
        rw_enter(&connlist->conn_lock, RW_READER);
        conn = connlist->conn_hd;
        while (conn != NULL) {
                mutex_enter(&conn->c_lock);
                qp = ctoqp(conn);
                if (conn->c_state & C_CONNECTED) {
                        /*
                         * Live connection in CONNECTED state.
                         * Call ibt_close_rc_channel in nonblocking mode
                         * with no callbacks.  Marking the connection
                         * C_ERROR first prevents further use while the
                         * channel tears down.
                         */
                        conn->c_state = C_ERROR;
                        (void) ibt_close_rc_channel(qp->qp_hdl,
                                IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
                        (void) ibt_free_channel(qp->qp_hdl);
                        qp->qp_hdl = NULL;
                } else {
                        if (conn->c_state == C_ERROR &&
                                qp->qp_hdl != NULL) {
                                /*
                                 * Connection in ERROR state but
                                 * channel is not yet freed.  Close and
                                 * free it here; qp_hdl is NULLed so a
                                 * later pass does not double-free.
                                 */
                                (void) ibt_close_rc_channel(qp->qp_hdl,
                                        IBT_NOCALLBACKS, NULL, 0, NULL,
                                        NULL, 0);
                                (void) ibt_free_channel(qp->qp_hdl);
                                qp->qp_hdl = NULL;
                        }
                }
                mutex_exit(&conn->c_lock);
                conn = conn->c_next;
        }
        rw_exit(&connlist->conn_lock);
}
4861 
/*
 * Frees up all connections that are no longer being referenced
 */
static void
rib_purge_connlist(rib_conn_list_t *connlist)
{
        CONN            *conn;

top:
        /*
         * The list is walked under the reader lock.  Whenever a
         * connection is culled, both the per-connection mutex and the
         * list lock must be dropped before calling
         * rib_disconnect_channel() (which takes the list's write
         * lock); the walk then restarts from the head since the list
         * may have changed while unlocked.
         */
        rw_enter(&connlist->conn_lock, RW_READER);
        conn = connlist->conn_hd;
        while (conn != NULL) {
                mutex_enter(&conn->c_lock);

                /*
                 * At this point connection is either in ERROR
                 * or DISCONN_PEND state. If in DISCONN_PEND state
                 * then some other thread is culling that connection.
                 * If not and if c_ref is 0, then destroy the connection.
                 */
                if (conn->c_ref == 0 &&
                        conn->c_state != C_DISCONN_PEND) {
                        /*
                         * Cull the connection.  Setting C_DISCONN_PEND
                         * before dropping c_lock claims the connection
                         * so no other thread culls it concurrently.
                         */
                        conn->c_state = C_DISCONN_PEND;
                        mutex_exit(&conn->c_lock);
                        rw_exit(&connlist->conn_lock);
                        (void) rib_disconnect_channel(conn, connlist);
                        goto top;
                } else {
                        /*
                         * conn disconnect already scheduled or will
                         * happen from conn_release when c_ref drops to 0.
                         */
                        mutex_exit(&conn->c_lock);
                }
                conn = conn->c_next;
        }
        rw_exit(&connlist->conn_lock);

        /*
         * At this point, only connections with c_ref != 0 are on the list
         */
}
4907 
/*
 * Cleans and closes up all uses of the HCA
 */
static void
rib_detach_hca(rib_hca_t *hca)
{

        /*
         * Stop all services on the HCA
         * Go through cl_conn_list and close all rc_channels
         * Go through svr_conn_list and close all rc_channels
         * Free connections whose c_ref has dropped to 0
         * Destroy all CQs
         * Deregister and released all buffer pool memory after all
         * connections are destroyed
         * Free the protection domain
         * ibt_close_hca()
         */
        rw_enter(&hca->state_lock, RW_WRITER);
        if (hca->state == HCA_DETACHED) {
                /* Already detached by another thread; nothing to do. */
                rw_exit(&hca->state_lock);
                return;
        }

        /*
         * Mark the HCA detached under the write lock so no new
         * connections or services start on it, then tear down the
         * existing ones.
         */
        hca->state = HCA_DETACHED;
        rib_stat->nhca_inited--;

        rib_stop_services(hca);
        rib_deregister_ats();
        rib_close_channels(&hca->cl_conn_list);
        rib_close_channels(&hca->srv_conn_list);
        rw_exit(&hca->state_lock);

        /* Free connections whose reference counts have dropped to 0. */
        rib_purge_connlist(&hca->cl_conn_list);
        rib_purge_connlist(&hca->srv_conn_list);

        /* Destroy all four completion queues and their wrappers. */
        (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
        (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
        (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
        (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
        kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
        kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
        kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
        kmem_free(hca->svc_scq, sizeof (rib_cq_t));

        rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
        rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
        if (hca->srv_conn_list.conn_hd == NULL &&
                hca->cl_conn_list.conn_hd == NULL) {
                /*
                 * conn_lists are NULL, so destroy
                 * buffers, close hca and be done.
                 */
                rib_rbufpool_destroy(hca, RECV_BUFFER);
                rib_rbufpool_destroy(hca, SEND_BUFFER);



                (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
                (void) ibt_close_hca(hca->hca_hdl);
                hca->hca_hdl = NULL;
        }
        rw_exit(&hca->cl_conn_list.conn_lock);
        rw_exit(&hca->srv_conn_list.conn_lock);

        if (hca->hca_hdl != NULL) {
                /*
                 * Connections remained on a list above; wait for any
                 * in-flight HCA callbacks to drain before freeing the
                 * buffer pools and closing the HCA.
                 */
                mutex_enter(&hca->inuse_lock);
                while (hca->inuse)
                        cv_wait(&hca->cb_cv, &hca->inuse_lock);
                mutex_exit(&hca->inuse_lock);
                /*
                 * conn_lists are now NULL, so destroy
                 * buffers, close hca and be done.
                 * NOTE(review): the cv_wait above only covers hca->inuse;
                 * it is not obvious from this function alone that both
                 * conn lists are guaranteed empty here — confirm against
                 * conn_release/rib_disconnect_channel.
                 */
                rib_rbufpool_destroy(hca, RECV_BUFFER);
                rib_rbufpool_destroy(hca, SEND_BUFFER);
                (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
                (void) ibt_close_hca(hca->hca_hdl);
                hca->hca_hdl = NULL;
        }
}







































































































































































































































































































































--- EOF ---